Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
11b848da
"...composable_kernel.git" did not exist on "f6934e0bf4460c7ad97c57d5f4a645e426048b1d"
Commit
11b848da
authored
Jun 10, 2019
by
Jing Zhang
Browse files
fixed stride_division_ceil
parent
06810ad4
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
68 additions
and
72 deletions
+68
-72
driver/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp
...er/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp
+2
-1
driver/driver.hip.cpp
driver/driver.hip.cpp
+24
-27
src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp
...implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp
+42
-44
No files found.
driver/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp
View file @
11b848da
...
@@ -49,7 +49,8 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc,
...
@@ -49,7 +49,8 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc,
constexpr
index_t
N1
=
2
;
constexpr
index_t
N1
=
2
;
constexpr
index_t
N2
=
4
;
constexpr
index_t
N2
=
4
;
constexpr
index_t
B
=
(
N
*
(
Ho
/
Strides
::
Get
(
I0
))
*
(
Wo
/
Strides
::
Get
(
I1
)))
/
(
N1
*
N2
);
constexpr
index_t
B
=
N
*
mod_conv
::
integer_divide_ceil
(
Ho
,
Strides
::
Get
(
I0
))
*
mod_conv
::
integer_divide_ceil
(
Wo
,
Strides
::
Get
(
I1
))
/
(
N1
*
N2
);
#if 1
#if 1
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
BlockSize
=
256
;
...
...
driver/driver.hip.cpp
View file @
11b848da
...
@@ -25,8 +25,6 @@ struct GeneratorTensor_0
...
@@ -25,8 +25,6 @@ struct GeneratorTensor_0
}
}
};
};
struct
GeneratorTensor_1
struct
GeneratorTensor_1
{
{
template
<
class
...
Is
>
template
<
class
...
Is
>
...
@@ -122,12 +120,12 @@ template <class TIn,
...
@@ -122,12 +120,12 @@ template <class TIn,
class
Strides
,
class
Strides
,
class
Dilations
>
class
Dilations
>
void
host_direct_convolution_forw
(
const
Tensor
<
TIn
>&
in_nchw
,
void
host_direct_convolution_forw
(
const
Tensor
<
TIn
>&
in_nchw
,
const
Tensor
<
TWei
>&
wei_kcyx
,
const
Tensor
<
TWei
>&
wei_kcyx
,
Tensor
<
TOut
>&
out_nkhw
,
Tensor
<
TOut
>&
out_nkhw
,
LowerPads
,
LowerPads
,
UpperPads
,
UpperPads
,
Strides
,
Strides
,
Dilations
)
Dilations
)
{
{
index_t
h_pad_low
=
LowerPads
{}.
Get
(
Number
<
0
>
{});
index_t
h_pad_low
=
LowerPads
{}.
Get
(
Number
<
0
>
{});
index_t
w_pad_low
=
LowerPads
{}.
Get
(
Number
<
1
>
{});
index_t
w_pad_low
=
LowerPads
{}.
Get
(
Number
<
1
>
{});
...
@@ -179,13 +177,12 @@ template <class TIn,
...
@@ -179,13 +177,12 @@ template <class TIn,
class
Strides
,
class
Strides
,
class
Dilations
>
class
Dilations
>
void
host_direct_convolution_back
(
Tensor
<
TOut
>&
in_nchw
,
void
host_direct_convolution_back
(
Tensor
<
TOut
>&
in_nchw
,
const
Tensor
<
TWei
>&
wei_kcyx
,
const
Tensor
<
TWei
>&
wei_kcyx
,
const
Tensor
<
TIn
>&
out_nkhw
,
const
Tensor
<
TIn
>&
out_nkhw
,
LowerPads
,
LowerPads
,
UpperPads
,
UpperPads
,
Strides
,
Strides
,
Dilations
Dilations
)
)
{
{
index_t
h_pad_low
=
LowerPads
{}.
Get
(
Number
<
0
>
{});
index_t
h_pad_low
=
LowerPads
{}.
Get
(
Number
<
0
>
{});
index_t
w_pad_low
=
LowerPads
{}.
Get
(
Number
<
1
>
{});
index_t
w_pad_low
=
LowerPads
{}.
Get
(
Number
<
1
>
{});
...
@@ -199,22 +196,23 @@ void host_direct_convolution_back(Tensor<TOut>& in_nchw,
...
@@ -199,22 +196,23 @@ void host_direct_convolution_back(Tensor<TOut>& in_nchw,
index_t
dilation_h
=
Dilations
{}.
Get
(
Number
<
0
>
{});
index_t
dilation_h
=
Dilations
{}.
Get
(
Number
<
0
>
{});
index_t
dilation_w
=
Dilations
{}.
Get
(
Number
<
1
>
{});
index_t
dilation_w
=
Dilations
{}.
Get
(
Number
<
1
>
{});
//loop n,c,hi,wi
//
loop n,c,hi,wi
auto
f
=
[
&
](
auto
n
,
auto
c
,
auto
hi
,
auto
wi
)
{
auto
f
=
[
&
](
auto
n
,
auto
c
,
auto
hi
,
auto
wi
)
{
double
v
=
0
;
double
v
=
0
;
//loop k,y,x
//
loop k,y,x
for
(
int
k
=
0
;
k
<
wei_kcyx
.
mDesc
.
GetLengths
()[
0
];
++
k
)
for
(
int
k
=
0
;
k
<
wei_kcyx
.
mDesc
.
GetLengths
()[
0
];
++
k
)
{
{
for
(
int
y
=
0
;
y
<
wei_kcyx
.
mDesc
.
GetLengths
()[
2
];
++
y
)
for
(
int
y
=
0
;
y
<
wei_kcyx
.
mDesc
.
GetLengths
()[
2
];
++
y
)
{
{
int
ho_
=
(
hi
-
y
*
dilation_h
+
h_pad_low
);
int
ho_
=
(
hi
-
y
*
dilation_h
+
h_pad_low
);
int
ho
=
ho_
/
stride_h
;
int
ho
=
ho_
/
stride_h
;
for
(
int
x
=
0
;
x
<
wei_kcyx
.
mDesc
.
GetLengths
()[
3
];
++
x
)
for
(
int
x
=
0
;
x
<
wei_kcyx
.
mDesc
.
GetLengths
()[
3
];
++
x
)
{
{
int
wo_
=
(
wi
-
x
*
dilation_w
+
w_pad_low
);
int
wo_
=
(
wi
-
x
*
dilation_w
+
w_pad_low
);
int
wo
=
wo_
/
stride_w
;
int
wo
=
wo_
/
stride_w
;
if
(
ho
>=
0
&&
ho
<
out_nkhw
.
mDesc
.
GetLengths
()[
2
]
&&
wo
>=
0
&&
if
(
ho
>=
0
&&
ho
<
out_nkhw
.
mDesc
.
GetLengths
()[
2
]
&&
wo
>=
0
&&
wo
<
out_nkhw
.
mDesc
.
GetLengths
()[
3
]
&&
ho_
%
stride_h
==
0
&&
wo_
%
stride_w
==
0
)
wo
<
out_nkhw
.
mDesc
.
GetLengths
()[
3
]
&&
ho_
%
stride_h
==
0
&&
wo_
%
stride_w
==
0
)
{
{
v
+=
double
(
out_nkhw
(
n
,
k
,
ho
,
wo
))
*
double
(
wei_kcyx
(
k
,
c
,
y
,
x
));
v
+=
double
(
out_nkhw
(
n
,
k
,
ho
,
wo
))
*
double
(
wei_kcyx
(
k
,
c
,
y
,
x
));
}
}
...
@@ -501,7 +499,7 @@ int main(int argc, char* argv[])
...
@@ -501,7 +499,7 @@ int main(int argc, char* argv[])
constexpr
index_t
HDilation
=
1
;
constexpr
index_t
HDilation
=
1
;
constexpr
index_t
WDilation
=
1
;
constexpr
index_t
WDilation
=
1
;
constexpr
index_t
Direction
=
2
;
//1: Forward; 2:Backward
constexpr
index_t
Direction
=
1
;
//
1: Forward; 2:Backward
#if 0
#if 0
constexpr index_t N = 32;
constexpr index_t N = 32;
constexpr index_t C = 128;
constexpr index_t C = 128;
...
@@ -553,8 +551,8 @@ int main(int argc, char* argv[])
...
@@ -553,8 +551,8 @@ int main(int argc, char* argv[])
// 1x1 filter, 28x28 image
// 1x1 filter, 28x28 image
constexpr
index_t
N
=
128
;
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
128
;
constexpr
index_t
C
=
128
;
constexpr
index_t
HI
=
28
;
constexpr
index_t
HI
=
7
;
constexpr
index_t
WI
=
28
;
constexpr
index_t
WI
=
7
;
constexpr
index_t
K
=
128
;
constexpr
index_t
K
=
128
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
constexpr
index_t
X
=
1
;
...
@@ -716,8 +714,8 @@ int main(int argc, char* argv[])
...
@@ -716,8 +714,8 @@ int main(int argc, char* argv[])
#elif
0
#elif
0
in_nchw
.
GenerateTensorValue
(
GeneratorTensor_0
{},
num_thread
);
in_nchw
.
GenerateTensorValue
(
GeneratorTensor_0
{},
num_thread
);
wei_kcyx
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
wei_kcyx
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
//out_nkhw.GenerateTensorValue(GeneratorTensor_Checkboard{}, num_thread);
//
out_nkhw.GenerateTensorValue(GeneratorTensor_Checkboard{}, num_thread);
//out_nkhw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
//
out_nkhw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
out_nkhw
.
GenerateTensorValue
(
GeneratorTensor_4
{},
num_thread
);
out_nkhw
.
GenerateTensorValue
(
GeneratorTensor_4
{},
num_thread
);
#elif 0
#elif 0
in_nchw
.
GenerateTensorValue
(
GeneratorTensor_3
{},
num_thread
);
in_nchw
.
GenerateTensorValue
(
GeneratorTensor_3
{},
num_thread
);
...
@@ -764,8 +762,7 @@ int main(int argc, char* argv[])
...
@@ -764,8 +762,7 @@ int main(int argc, char* argv[])
strides
,
strides
,
dilations
,
dilations
,
in_nchw_device
,
in_nchw_device
,
nrepeat
nrepeat
);
);
#elif 1
#elif 1
device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded
(
in_nchw_desc
,
device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded
(
in_nchw_desc
,
...
...
src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp
View file @
11b848da
...
@@ -112,27 +112,28 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
...
@@ -112,27 +112,28 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
// input tensor
// input tensor
// tensor descriptor in device memory [N0, N1, N2, Ho, Wo]
// tensor descriptor in device memory [N0, N1, N2, Ho, Wo]
constexpr
auto
in_n0_n1_n2_h_w_global_desc
=
in_n_c_h_w_global_desc
.
Slice
(
I2
,
constexpr
auto
in_n0_n1_n2_h_w_global_desc
=
Number
<
Ho
/
Strides
::
Get
(
I0
)
>
{})
in_n_c_h_w_global_desc
.
Slice
(
I3
,
Number
<
Wo
/
Strides
::
Get
(
I1
)
>
{})
.
Slice
(
I2
,
Number
<
mod_conv
::
integer_divide_ceil
(
Ho
,
Strides
::
Get
(
I0
))
>
{})
.
Fold
(
I0
,
Number
<
N1
>
{},
Number
<
N2
>
{})
.
Slice
(
I3
,
Number
<
mod_conv
::
integer_divide_ceil
(
Wo
,
Strides
::
Get
(
I1
))
>
{})
.
Extract
(
Sequence
<
0
,
1
,
2
,
4
,
5
>
{});
.
Fold
(
I0
,
Number
<
N1
>
{},
Number
<
N2
>
{})
.
Extract
(
Sequence
<
0
,
1
,
2
,
4
,
5
>
{});
//constexpr auto in_n0_n1_n2_h_w_global_desc =
//in_n_c_h_w_global_desc.Fold(I0, Number<N1>{}, Number<N2>{})
// constexpr auto in_n0_n1_n2_h_w_global_desc =
//.Extract(Sequence<0, 1, 2, 4, 5>{});
// in_n_c_h_w_global_desc.Fold(I0, Number<N1>{}, Number<N2>{})
//.Extract(Sequence<0, 1, 2, 4, 5>{});
//constexpr auto in_lengths_new = Sequence<N0, N1, N2, Ho, Wo>{};
// constexpr auto in_lengths_new = Sequence<N0, N1, N2, Ho, Wo>{};
//constexpr auto in_strides_new =
//Sequence<in_n0_n1_n2_h_w_global_desc.GetStride(I0),
// constexpr auto in_strides_new =
//in_n0_n1_n2_h_w_global_desc.GetStride(I1),
// Sequence<in_n0_n1_n2_h_w_global_desc.GetStride(I0),
//in_n0_n1_n2_h_w_global_desc.GetStride(I2),
// in_n0_n1_n2_h_w_global_desc.GetStride(I1),
//in_n0_n1_n2_h_w_global_desc.GetStride(I3),
// in_n0_n1_n2_h_w_global_desc.GetStride(I2),
//in_n0_n1_n2_h_w_global_desc.GetStride(I4)>{};
// in_n0_n1_n2_h_w_global_desc.GetStride(I3),
// in_n0_n1_n2_h_w_global_desc.GetStride(I4)>{};
//constexpr auto in_n0_n1_n2_h_w_new_global_desc =
//make_ConstantTensorDescriptor(in_lengths_new, in_strides_new);
// constexpr auto in_n0_n1_n2_h_w_new_global_desc =
// make_ConstantTensorDescriptor(in_lengths_new, in_strides_new);
constexpr
auto
in_n0_n1_n2_h_w_new_global_desc
=
in_n0_n1_n2_h_w_global_desc
;
constexpr
auto
in_n0_n1_n2_h_w_new_global_desc
=
in_n0_n1_n2_h_w_global_desc
;
// batch descritpor for device memory
// batch descritpor for device memory
...
@@ -140,17 +141,17 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
...
@@ -140,17 +141,17 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
constexpr
auto
in_c_y_x_global_desc
=
in_n_c_h_w_global_desc
.
Slice
(
I2
,
Number
<
Y
>
{})
constexpr
auto
in_c_y_x_global_desc
=
in_n_c_h_w_global_desc
.
Slice
(
I2
,
Number
<
Y
>
{})
.
Slice
(
I3
,
Number
<
X
>
{})
.
Slice
(
I3
,
Number
<
X
>
{})
.
Extract
(
Sequence
<
1
,
2
,
3
>
{});
.
Extract
(
Sequence
<
1
,
2
,
3
>
{});
//constexpr auto in_win_lengths_new = Sequence<in_c_y_x_global_desc.GetLength(I0),
//
constexpr auto in_win_lengths_new = Sequence<in_c_y_x_global_desc.GetLength(I0),
//in_c_y_x_global_desc.GetLength(I1),
//
in_c_y_x_global_desc.GetLength(I1),
//in_c_y_x_global_desc.GetLength(I2)>{};
//
in_c_y_x_global_desc.GetLength(I2)>{};
//constexpr auto in_win_strides_new =
//
constexpr auto in_win_strides_new =
//Sequence<in_c_y_x_global_desc.GetStride(I0),
//
Sequence<in_c_y_x_global_desc.GetStride(I0),
//in_c_y_x_global_desc.GetStride(I1),
//
in_c_y_x_global_desc.GetStride(I1),
//in_c_y_x_global_desc.GetStride(I2)>{};
//
in_c_y_x_global_desc.GetStride(I2)>{};
//constexpr auto in_c_y_x_new_global_desc =
//
constexpr auto in_c_y_x_new_global_desc =
//make_ConstantTensorDescriptor(in_win_lengths_new, in_win_strides_new);
//
make_ConstantTensorDescriptor(in_win_lengths_new, in_win_strides_new);
// merged tensor descriptor in device memory [E, N1, B, N2], src of blockwise copy
// merged tensor descriptor in device memory [E, N1, B, N2], src of blockwise copy
constexpr
auto
in_e_n1_b_n2_global_merged_desc
=
make_ConstantMergedTensorDescriptor
(
constexpr
auto
in_e_n1_b_n2_global_merged_desc
=
make_ConstantMergedTensorDescriptor
(
...
@@ -189,16 +190,14 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
...
@@ -189,16 +190,14 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
InBlockCopyDstDataPerWrite_N2
>
(
InBlockCopyDstDataPerWrite_N2
>
(
{
0
,
0
,
b_block_data_on_global
,
0
},
{
0
,
0
,
0
,
0
});
{
0
,
0
,
b_block_data_on_global
,
0
},
{
0
,
0
,
0
,
0
});
// weight tensor
// weight tensor
// tensor descriptor in device memory, src of blockwise copy
// tensor descriptor in device memory, src of blockwise copy
#if 0
#if 0
constexpr auto wei_e_k_global_desc =
constexpr auto wei_e_k_global_desc =
wei_k_c_y_x_global_desc.Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{});
wei_k_c_y_x_global_desc.Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{});
#else
#else
constexpr
auto
wei_e_k_global_desc
=
constexpr
auto
wei_e_k_global_desc
=
make_ConstantMergedTensorDescriptor
(
make_ConstantMergedTensorDescriptor
(
wei_k_c_y_x_global_desc
,
wei_k_c_y_x_global_desc
,
Sequence
<
1
,
2
,
3
>
{},
Sequence
<
0
>
{});
Sequence
<
1
,
2
,
3
>
{},
Sequence
<
0
>
{});
#endif
#endif
// tensor descriptor in LDS, dst of blockwise copy
// tensor descriptor in LDS, dst of blockwise copy
...
@@ -426,9 +425,10 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
...
@@ -426,9 +425,10 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc
.
GetLength
(
I3
),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc
.
GetLength
(
I3
),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc
.
GetLength
(
I4
),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc
.
GetLength
(
I4
),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc
.
GetLength
(
I5
),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc
.
GetLength
(
I5
),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc
.
GetLength
(
I6
)
/
Strides
{}.
Get
(
I0
),
mod_conv
::
integer_divide_ceil
(
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc
.
GetLength
(
I7
)
/
Strides
{}.
Get
(
I1
)
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc
.
GetLength
(
I6
),
Strides
{}.
Get
(
I0
)),
>
{};
mod_conv
::
integer_divide_ceil
(
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc
.
GetLength
(
I7
),
Strides
{}.
Get
(
I1
))
>
{};
constexpr
auto
out_strides_new
=
Sequence
<
constexpr
auto
out_strides_new
=
Sequence
<
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc
.
GetStride
(
I0
),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc
.
GetStride
(
I0
),
...
@@ -438,12 +438,10 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
...
@@ -438,12 +438,10 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc
.
GetStride
(
I4
),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc
.
GetStride
(
I4
),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc
.
GetStride
(
I5
),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc
.
GetStride
(
I5
),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc
.
GetStride
(
I6
)
*
Strides
{}.
Get
(
I0
),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc
.
GetStride
(
I6
)
*
Strides
{}.
Get
(
I0
),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc
.
GetStride
(
I7
)
*
Strides
{}.
Get
(
I1
)
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc
.
GetStride
(
I7
)
*
Strides
{}.
Get
(
I1
)
>
{};
>
{};
constexpr
auto
out_n0_n1_n2_k0_k1_k2_h_w_new_global_mem_desc
=
make_ConstantTensorDescriptor
(
constexpr
auto
out_n0_n1_n2_k0_k1_k2_h_w_new_global_mem_desc
=
out_lengths_new
,
out_strides_new
make_ConstantTensorDescriptor
(
out_lengths_new
,
out_strides_new
);
);
// calculate origin of thread output tensor on global memory
// calculate origin of thread output tensor on global memory
// blockwise GEMM c matrix starting index
// blockwise GEMM c matrix starting index
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment