yangql / composable_kernel / Commits

Commit 1f705244
authored Sep 12, 2019 by Chao Liu
parent 724e984b

    padding for chwn is functional
Showing 9 changed files with 118 additions and 272 deletions
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp   +69 −219
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp   +2 −2
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp   +2 −2
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp   +4 −5
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp   +4 −5
composable_kernel/include/tensor_description/multi_index_transform.hpp   +6 −0
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp   +9 −14
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp   +5 −6
driver/src/driver.cpp   +17 −19
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp
@@ -47,27 +47,18 @@ template <index_t GridSize,
           index_t OutThreadCopyDataPerAccess_N>
 struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
 {
-    static constexpr auto I0  = Number<0>{};
-    static constexpr auto I1  = Number<1>{};
-    static constexpr auto I2  = Number<2>{};
-    static constexpr auto I3  = Number<3>{};
-    static constexpr auto I4  = Number<4>{};
-    static constexpr auto I5  = Number<5>{};
-    static constexpr auto I6  = Number<6>{};
-    static constexpr auto I7  = Number<7>{};
-    static constexpr auto I8  = Number<8>{};
-    static constexpr auto I9  = Number<9>{};
-    static constexpr auto I10 = Number<10>{};
-    static constexpr auto I11 = Number<11>{};
-
-    static constexpr auto True  = integral_constant<bool, true>{};
-    static constexpr auto False = integral_constant<bool, false>{};
-
-#if 1
     __device__ void Run(const Float* const __restrict__ p_in_global,
                         const Float* const __restrict__ p_wei_global,
                         Float* const __restrict__ p_out_global) const
     {
+        static constexpr auto I0 = Number<0>{};
+        static constexpr auto I1 = Number<1>{};
+        static constexpr auto I2 = Number<2>{};
+        static constexpr auto I3 = Number<3>{};
+
+        static constexpr auto True  = integral_constant<bool, true>{};
+        static constexpr auto False = integral_constant<bool, false>{};
+
         // be careful of this assertion
         static_assert(
             NPerBlock % NPerThread == 0 &&
@@ -122,8 +113,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
         const index_t wo_block_data_begin = block_work_multi_id[2] * WoPerBlock;
         const index_t n_block_data_begin  = block_work_multi_id[3] * NPerBlock;

-        const index_t hi_block_data_begin = ho_block_data_begin - LeftPads{}[0];
-        const index_t wi_block_data_begin = wo_block_data_begin - LeftPads{}[1];
+        const index_t hp_block_data_begin = ho_block_data_begin;
+        const index_t wp_block_data_begin = wo_block_data_begin;

         // input global tensor view
         constexpr auto in_c_hp_wp_n_global_desc = transform_tensor_descriptor(
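Note: the hp/wp switch above is the heart of the commit. The block no longer subtracts LeftPads by hand; it addresses the padded view, and the Pad transform inside in_c_hp_wp_n_global_desc performs the hp -> hi lowering. A standalone host-side sketch of that index relation (plain C++, illustrative names, not the library's API):

    #include <cassert>

    // hp/wp index the padded input view; hi/wi index the real input tensor.
    // The Pad transform subtracts the left pad when lowering, so the kernel
    // no longer does this arithmetic itself.
    int main()
    {
        const int left_pad = 1;

        // stride-1 "same" convolution: output row ho starts at padded row hp = ho
        const int ho = 0;
        const int hp = ho;            // new code: hp_block_data_begin = ho_block_data_begin
        const int hi = hp - left_pad; // old code: hi_block_data_begin = ho_... - LeftPads{}[0]

        assert(hi == -1); // the first output row reads one row into the padding area
        return 0;
    }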
@@ -133,12 +124,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
             make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}),
             make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));

-        // global tensor view
-        constexpr auto wei_c_k_global_desc_old = wei_c_y_x_k_global_desc_old.Extract(I0, I3);
-
-        constexpr auto wei_c_k_global_desc = make_native_tensor_descriptor(
-            wei_c_k_global_desc_old.GetLengths(), wei_c_k_global_desc_old.GetStrides());
-
         // LDS tensor view
         // be careful of alignment
         constexpr index_t max_align = math::lcm(InBlockCopyDataPerAccess_N,
@@ -158,15 +143,15 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
         static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
                       "GemmDataPerReadB alignment requirement is not meet");

-        constexpr auto wei_c_k_block_desc_old = make_ConstantTensorDescriptor_aligned(
-            Sequence<CPerBlock, KPerBlock>{}, Number<max_align>{});
+        constexpr auto wei_c_1_1_k_block_desc_old = make_ConstantTensorDescriptor_aligned(
+            Sequence<CPerBlock, 1, 1, KPerBlock>{}, Number<max_align>{});

-        constexpr auto wei_c_k_block_desc = make_native_tensor_descriptor(
-            wei_c_k_block_desc_old.GetLengths(), wei_c_k_block_desc_old.GetStrides());
+        constexpr auto wei_c_1_1_k_block_desc = make_native_tensor_descriptor(
+            wei_c_1_1_k_block_desc_old.GetLengths(), wei_c_1_1_k_block_desc_old.GetStrides());

         // LDS: be careful of alignment
         constexpr index_t in_block_space  = in_c_h_w_n_block_desc_old.GetElementSpace();
-        constexpr index_t wei_block_space = wei_c_k_block_desc_old.GetElementSpace();
+        constexpr index_t wei_block_space = wei_c_1_1_k_block_desc_old.GetElementSpace();

         __shared__ Float p_in_block[in_block_space];
         __shared__ Float p_wei_block[wei_block_space];
@@ -181,46 +166,45 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
         // blockwise input copy
         // format is [C, Hi, Wi, N]
         auto blockwise_in_copy =
-#if 0
-            BlockwiseGenericTensorSliceCopy_v2
-#else
-            BlockwiseGenericTensorSliceCopy_v4
-#endif
-            <BlockSize,
-             decltype(in_c_hp_wp_n_global_desc),
-             decltype(in_c_h_w_n_block_desc),
-             decltype(in_c_h_w_n_block_desc.GetLengths()),
-             InBlockCopySubLengths_CHWN,
-             InBlockCopyClusterLengths_CHWN,
-             Sequence<0, 1, 2, 3>,
-             Sequence<0, 1, 2, 3>,
-             Sequence<0, 1, 2, 3>,
-             3,
-             3,
-             InBlockCopyDataPerAccess_N,
-             InBlockCopyDataPerAccess_N>({0, 0, 0, 0}, {0, 0, 0, 0});
+            BlockwiseGenericTensorSliceCopy_v4<BlockSize,
+                                               decltype(in_c_hp_wp_n_global_desc),
+                                               decltype(in_c_h_w_n_block_desc),
+                                               decltype(in_c_h_w_n_block_desc.GetLengths()),
+                                               InBlockCopySubLengths_CHWN,
+                                               InBlockCopyClusterLengths_CHWN,
+                                               Sequence<0, 1, 2, 3>,
+                                               Sequence<0, 1, 2, 3>,
+                                               Sequence<0, 1, 2, 3>,
+                                               3,
+                                               3,
+                                               InBlockCopyDataPerAccess_N,
+                                               InBlockCopyDataPerAccess_N>(
+                {0, hp_block_data_begin, wp_block_data_begin, n_block_data_begin}, {0, 0, 0, 0});

         // blockwise wei copy
         // format is [CPerBlock, KPerBlock]
-        const auto blockwise_wei_copy =
-#if 0
-            BlockwiseGenericTensorSliceCopy_v2
-#else
-            BlockwiseGenericTensorSliceCopy_v4
-#endif
-            <BlockSize,
-             decltype(wei_c_k_global_desc),
-             decltype(wei_c_k_block_desc),
-             decltype(wei_c_k_block_desc.GetLengths()),
-             WeiBlockCopySubLengths_CK,
-             WeiBlockCopyClusterLengths_CK,
-             Sequence<0, 1>,
-             Sequence<0, 1>,
-             Sequence<0, 1>,
-             1,
-             1,
-             WeiBlockCopyDataPerAccess_K,
-             WeiBlockCopyDataPerAccess_K>({0, 0}, {0, 0});
+        using WeiBlockCopySubLengths_CYXK =
+            Sequence<WeiBlockCopySubLengths_CK::At(0), 1, 1, WeiBlockCopySubLengths_CK::At(1)>;
+        using WeiBlockCopyClusterLengths_CYXK =
+            Sequence<WeiBlockCopyClusterLengths_CK::At(0), 1, 1, WeiBlockCopyClusterLengths_CK::At(1)>;
+
+        auto blockwise_wei_copy =
+            BlockwiseGenericTensorSliceCopy_v4<BlockSize,
+                                               decltype(wei_c_y_x_k_global_desc),
+                                               decltype(wei_c_1_1_k_block_desc),
+                                               decltype(wei_c_1_1_k_block_desc.GetLengths()),
+                                               WeiBlockCopySubLengths_CYXK,
+                                               WeiBlockCopyClusterLengths_CYXK,
+                                               Sequence<0, 1, 2, 3>,
+                                               Sequence<0, 1, 2, 3>,
+                                               Sequence<0, 1, 2, 3>,
+                                               3,
+                                               3,
+                                               WeiBlockCopyDataPerAccess_K,
+                                               WeiBlockCopyDataPerAccess_K>(
+                {0, 0, 0, k_block_data_begin}, {0, 0, 0, 0});

         // a series of blockwise batched GEMM
         // C_matrix += transpose(A_matrix) * B_matrix
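Note: both copies are now plain BlockwiseGenericTensorSliceCopy_v4 instances whose source window starts at the block's own origin in the padded global view instead of at {0, 0, 0, 0}. A toy single-axis model of such a window copy, including the zero-fill that padding implies (illustrative only, not the real class):

    #include <cstdio>

    // Toy model of a slice copy with a source window origin. The real
    // BlockwiseGenericTensorSliceCopy_v4 tracks a multi-index origin per
    // dimension; one padded axis is enough to show the idea.
    struct ToySliceCopy
    {
        int src_origin; // counterpart of {0, hp_block_data_begin, ...}

        explicit ToySliceCopy(int origin) : src_origin(origin) {}

        // Copy 'len' elements starting at the window origin, zero-filling
        // the out-of-bounds (padding) region instead of reading it.
        void Run(const float* src, int src_len, int left_pad, float* dst, int len) const
        {
            for(int i = 0; i < len; ++i)
            {
                const int hp = src_origin + i; // padded coordinate
                const int hi = hp - left_pad;  // lowered, unpadded coordinate
                dst[i] = (hi >= 0 && hi < src_len) ? src[hi] : 0.0f;
            }
        }
    };

    int main()
    {
        const float src[4] = {1, 2, 3, 4};
        float dst[4];
        ToySliceCopy copy(0); // window starts at padded row 0
        copy.Run(src, 4, /*left_pad=*/1, dst, 4);
        std::printf("%g %g %g %g\n", dst[0], dst[1], dst[2], dst[3]); // 0 1 2 3
        return 0;
    }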
@@ -228,8 +212,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
         // A_matrix[C,K] is a sub-matrix of wei_block[C,K]
         // B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
         // C_matrix[K,Wo*N] is a sub-matrix of out_block[K,Ho,Wo,N]
-        constexpr auto a_c_k_block_mtx_desc = make_ConstantMatrixDescriptor(
-            Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_c_k_block_desc.GetStride(I0)>{});
+        constexpr auto a_c_k_block_mtx_desc =
+            make_ConstantMatrixDescriptor(Number<CPerBlock>{},
+                                          Number<KPerBlock>{},
+                                          Number<wei_c_1_1_k_block_desc.GetStride(I0)>{});

         constexpr auto b_c_wn_block_mtx_desc =
             make_ConstantMatrixDescriptor(Number<CPerBlock>{},
@@ -270,39 +256,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
         // set threadwise output tensor to 0
        threadwise_matrix_set_zero(c_k_wn_thread_mtx_desc, p_out_thread);

-#if 1
-        for(index_t y = 0; y < Y; ++y)
-        {
-            for(index_t x = 0; x < X; ++x)
-            {
-                const Float* p_in_global_block_offset =
-                    p_in_global + in_c_h_w_n_global_desc.CalculateOffset(
-                        {0, hi_block_data_begin + y, wi_block_data_begin + x, n_block_data_begin});
-
-                const Float* p_wei_global_block_offset =
-                    p_wei_global +
-                    wei_c_y_x_k_global_desc.CalculateOffset({0, y, x, k_block_data_begin});
-
-                for(index_t c_block_data_begin = 0; c_block_data_begin < C;
-                    c_block_data_begin += CPerBlock,
-                    p_in_global_block_offset +=
-                        CPerBlock * in_c_h_w_n_global_desc.GetStride(I0),
-                    p_wei_global_block_offset +=
-                        CPerBlock * wei_c_y_x_k_global_desc.GetStride(I0))
-                {
-                    blockwise_in_copy.Run(p_in_global_block_offset, p_in_block);
-                    blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block);
-
-                    __syncthreads();
-
-                    blockwise_batch_gemm.Run(p_wei_block, p_in_block, p_out_thread);
-
-                    __syncthreads();
-                }
-            }
-        }
-#else
         for(index_t y = 0; y < Y; ++y)
         {
             for(index_t x = 0; x < X; ++x)
@@ -310,8 +263,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
                 for(index_t c_block_data_begin = 0; c_block_data_begin < C;
                     c_block_data_begin += CPerBlock)
                 {
-                    blockwise_in_copy.Run();
-                    blockwise_wei_copy.Run();
+                    blockwise_in_copy.Run(p_in_global, p_in_block);
+                    blockwise_wei_copy.Run(p_wei_global, p_wei_block);

                     __syncthreads();
@@ -320,28 +273,29 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
                     __syncthreads();

                     // move along C
-                    blockwise_in_copy.MoveSrcSliceWindow(Sequence<CPerBlock, 0, 0, 0>{}, True);
-                    blockwise_wei_copy.MoveSrcSliceWindow(Sequence<CPerBlock, 0, 0, 0>{}, True);
+                    blockwise_in_copy.MoveSrcSliceWindow(make_multi_index(CPerBlock, 0, 0, 0),
+                                                         True);
+                    blockwise_wei_copy.MoveSrcSliceWindow(make_multi_index(CPerBlock, 0, 0, 0),
+                                                          True);
                 }

                 // reset C
-                blockwise_in_copy.MoveSrcSliceWindow(Sequence<C, 0, 0, 0>{}, False);
-                blockwise_wei_copy.MoveSrcSliceWindow(Sequence<C, 0, 0, 0>{}, False);
+                blockwise_in_copy.MoveSrcSliceWindow(make_multi_index(C, 0, 0, 0), False);
+                blockwise_wei_copy.MoveSrcSliceWindow(make_multi_index(C, 0, 0, 0), False);

                 // move aling X
-                blockwise_in_copy.MoveSrcSliceWindow(Sequence<0, 0, 1, 0>{}, True);
-                blockwise_wei_copy.MoveSrcSliceWindow(Sequence<0, 0, 1, 0>{}, True);
+                blockwise_in_copy.MoveSrcSliceWindow(make_multi_index(0, 0, 1, 0), True);
+                blockwise_wei_copy.MoveSrcSliceWindow(make_multi_index(0, 0, 1, 0), True);
             }

             // reset X
-            blockwise_in_copy.MoveSrcSliceWindow(Sequence<0, 0, X, 0>{}, False);
-            blockwise_wei_copy.MoveSrcSliceWindow(Sequence<0, 0, X, 0>{}, False);
+            blockwise_in_copy.MoveSrcSliceWindow(make_multi_index(0, 0, X, 0), False);
+            blockwise_wei_copy.MoveSrcSliceWindow(make_multi_index(0, 0, X, 0), False);

             // move along Y
-            blockwise_in_copy.MoveSrcSliceWindow(Sequence<0, 1, 0, 0>{}, False);
-            blockwise_wei_copy.MoveSrcSliceWindow(Sequence<0, 1, 0, 0>{}, False);
+            blockwise_in_copy.MoveSrcSliceWindow(make_multi_index(0, 1, 0, 0), True);
+            blockwise_wei_copy.MoveSrcSliceWindow(make_multi_index(0, 1, 0, 0), True);
         }
-#endif

         // output: register to global mem
         const auto c_thread_mtx_begin =
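Note: the slice-window bookkeeping above walks the source window along C, resets C, steps X, resets X, then steps Y — and the Y step's direction flag is corrected from False to True, so the window now actually advances down the filter rows. A trivial host-side walk that mirrors the arithmetic and checks it nets out (standalone sketch):

    #include <cassert>

    // After a full (y, x, c) sweep the window should be displaced by exactly
    // one row in Y per y-iteration and back at the origin in C and X.
    int main()
    {
        const int C = 8, CPerBlock = 2, Y = 3, X = 3;
        int c = 0, x = 0, y = 0; // window origin along each axis

        for(int iy = 0; iy < Y; ++iy)
        {
            for(int ix = 0; ix < X; ++ix)
            {
                for(int ic = 0; ic < C; ic += CPerBlock)
                    c += CPerBlock;      // "move along C"
                c -= C;                  // "reset C" (negative direction)
                x += 1;                  // "move along X"
            }
            x -= X;                      // "reset X"
            y += 1;                      // "move along Y" -- now True (positive)
        }

        assert(c == 0 && x == 0 && y == Y);
        return 0;
    }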
@@ -454,110 +408,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
                 .Run(p_out_thread, p_out_thread_on_global);
         });
     }
-#elif 0
-    __device__ void Run(const Float* const __restrict__ p_in_global,
-                        const Float* const __restrict__ p_wei_global,
-                        Float* const __restrict__ p_out_global) const
-    {
-        // create a native tensor descriptor
-        constexpr auto in_c_h_w_n_global_desc =
-            make_native_tensor_descriptor(InGlobalDesc::GetLengths(), InGlobalDesc::GetStrides());
-
-        constexpr index_t C  = in_c_h_w_n_global_desc.GetLength(I0);
-        constexpr index_t Hi = in_c_h_w_n_global_desc.GetLength(I1);
-        constexpr index_t Wi = in_c_h_w_n_global_desc.GetLength(I2);
-        constexpr index_t N  = in_c_h_w_n_global_desc.GetLength(I3);
-
-        // transformation: {c, h, w, n} --> {n, c, hp, wp}
-        // {h, w} --> {hp, wp}, {c} --> {c}, {n} --> {n}
-        constexpr auto in_n_c_hp_wp_global_desc = transform_tensor_descriptor(
-            in_c_h_w_n_global_desc,
-            make_tuple(
-                Pad<Sequence<Hi, Wi>, LeftPads, RightPads>{}, PassThrough<C>{}, PassThrough<N>{}),
-            make_tuple(Sequence<1, 2>{}, Sequence<0>{}, Sequence<3>{}),
-            make_tuple(Sequence<2, 3>{}, Sequence<1>{}, Sequence<0>{}));
-
-        // transformation: {n, c, hp, wp} --> {c, b}
-        // {n, hp, wp} --> {b}, {c} --> {c}
-        constexpr auto in_c_b_global_desc = transform_tensor_descriptor(
-            in_n_c_hp_wp_global_desc,
-            make_tuple(Merge<decltype(in_n_c_hp_wp_global_desc.GetLengths(I0, I2, I3))>{},
-                       PassThrough<in_n_c_hp_wp_global_desc.GetLength(I1)>{}),
-            make_tuple(Sequence<0, 2, 3>{}, Sequence<1>{}),
-            make_tuple(Sequence<1>{}, Sequence<0>{}));
-
-        if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
-        {
-            // 0
-            print_tensor_descriptor("in_c_h_w_n_global_desc", in_c_h_w_n_global_desc);
-            // 1
-            print_tensor_descriptor("in_n_c_hp_wp_global_desc", in_n_c_hp_wp_global_desc);
-            // 2
-            print_tensor_descriptor("in_c_b_global_desc", in_c_b_global_desc);
-
-            constexpr auto idx2 = MultiIndex<2>{1, 4 * (16 * 16) + 5 * 16 + 6};
-
-            auto idx1 = in_c_b_global_desc.CalculateLowerIndex(idx2);
-            auto idx0 = in_c_b_global_desc.GetLowerTensorDescriptor().CalculateLowerIndex(idx1);
-
-            print_array("idx2: ", idx2);
-            print_array("idx1: ", idx1);
-            print_array("idx0: ", idx0);
-
-            printf("in_c_b_global_desc offset: %lu\n", in_c_b_global_desc.CalculateOffset(idx2));
-        }
-    }
-#else
-    __device__ void Run(const Float* const __restrict__ p_in_global,
-                        const Float* const __restrict__ p_wei_global,
-                        Float* const __restrict__ p_out_global) const
-    {
-        // create a native tensor descriptor
-        constexpr auto in_c_h_w_n_global_desc =
-            make_native_tensor_descriptor(InGlobalDesc::GetLengths(), InGlobalDesc::GetStrides());
-
-        constexpr index_t C  = in_c_h_w_n_global_desc.GetLength(I0);
-        constexpr index_t Hi = in_c_h_w_n_global_desc.GetLength(I1);
-        constexpr index_t Wi = in_c_h_w_n_global_desc.GetLength(I2);
-        constexpr index_t N  = in_c_h_w_n_global_desc.GetLength(I3);
-
-        // transformation: {c, h, w, n} --> {n, c, hp, wp}
-        // {h, w} --> {hp, wp}, {c} --> {c}, {n} --> {n}
-        constexpr auto in_c_hp_wp_n_global_desc = transform_tensor_descriptor(
-            in_c_h_w_n_global_desc,
-            make_tuple(
-                PassThrough<C>{}, Pad<Sequence<Hi, Wi>, LeftPads, RightPads>{}, PassThrough<N>{}),
-            make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}),
-            make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
-
-        if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
-        {
-            // 0
-            print_tensor_descriptor("in_c_h_w_n_global_desc", in_c_h_w_n_global_desc);
-            // 1
-            print_tensor_descriptor("in_c_hp_wp_n_global_desc", in_c_hp_wp_n_global_desc);
-
-            constexpr auto idx1 = MultiIndex<4>{1, 2, 3, 4};
-
-            auto idx0 = in_c_hp_wp_n_global_desc.CalculateLowerIndex(idx1);
-
-            print_array("idx1: ", idx1);
-            print_array("idx0: ", idx0);
-
-            auto coord1 = make_tensor_coordinate_v2(in_c_hp_wp_n_global_desc, idx1);
-
-            print_array("1: ", coord1.GetIndex());
-            print_array("0: ", coord1.GetLowerCoordinate().GetIndex());
-
-            printf("in_c_hp_wp_n_global_desc is_in_pad: %d\n",
-                   coord1.IsAnyLevelIndexInPaddingArea());
-
-            printf("in_c_hp_wp_n_global_desc offset: %lu\n",
-                   in_c_hp_wp_n_global_desc.CalculateOffset(idx1));
-        }
-    }
-#endif
 };

 } // namespace ck
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp

@@ -304,8 +304,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
         blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
         blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
 #else
-        blockwise_in_copy.MoveSrcSlicingWindow({EPerBlock, 0, 0, 0}, true);
-        blockwise_wei_copy.MoveSrcSlicingWindow({EPerBlock, 0}, true);
+        blockwise_in_copy.MoveSrcSliceWindow({EPerBlock, 0, 0, 0}, true);
+        blockwise_wei_copy.MoveSrcSliceWindow({EPerBlock, 0}, true);
 #endif
     }
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp

@@ -303,7 +303,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
             Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];

-            blockwise_in_copy.MoveSrcSlicingWindow(Sequence<EPerBlock, 0, 0, 0>{}, True);
+            blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0, 0, 0>{}, True);
             p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);

             __syncthreads();

@@ -328,7 +328,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
             Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];

-            blockwise_in_copy.MoveSrcSlicingWindow(Sequence<EPerBlock, 0, 0, 0>{}, True);
+            blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0, 0, 0>{}, True);
             p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);

             __syncthreads();
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp

@@ -240,8 +240,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
             __syncthreads();

-            blockwise_in_copy.MoveSrcSlicingWindow(Sequence<EPerBlock, 0>{}, True);
-            blockwise_wei_copy.MoveSrcSlicingWindow(Sequence<EPerBlock, 0>{}, True);
+            blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);
+            blockwise_wei_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);
         }

         // copy output: register to global memory

@@ -297,9 +297,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
         {
             threadwise_out_copy.Run(p_out_thread, p_out_global);

-            threadwise_out_copy.MoveSrcSlicingWindow(Sequence<0, 0, GemmNPerThreadSubC>{},
-                                                     True);
-            threadwise_out_copy.MoveDstSlicingWindow(Sequence<0, 0, B1>{}, True);
+            threadwise_out_copy.MoveSrcSliceWindow(Sequence<0, 0, GemmNPerThreadSubC>{}, True);
+            threadwise_out_copy.MoveDstSliceWindow(Sequence<0, 0, B1>{}, True);
         }
     }
 }
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp

@@ -269,7 +269,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
             Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
             Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];

-            blockwise_in_copy.MoveSrcSlicingWindow(Sequence<EPerBlock, 0>{}, True);
+            blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);
             p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStrides()[0];

             __syncthreads();

@@ -294,7 +294,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
             Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];

             // even iteration
-            blockwise_in_copy.MoveSrcSlicingWindow(Sequence<EPerBlock, 0>{}, True);
+            blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);
             p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStrides()[0];

             __syncthreads();

@@ -379,9 +379,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
         {
             threadwise_out_copy.Run(p_out_thread, p_out_global);

-            threadwise_out_copy.MoveSrcSlicingWindow(Sequence<0, 0, GemmNPerThreadSubC>{},
-                                                     True);
-            threadwise_out_copy.MoveDstSlicingWindow(Sequence<0, 0, B1>{}, True);
+            threadwise_out_copy.MoveSrcSliceWindow(Sequence<0, 0, GemmNPerThreadSubC>{}, True);
+            threadwise_out_copy.MoveDstSliceWindow(Sequence<0, 0, B1>{}, True);
         }
     }
 }
composable_kernel/include/tensor_description/multi_index_transform.hpp

@@ -8,6 +8,12 @@ namespace ck {
 template <index_t N>
 using MultiIndex = Array<index_t, N>;

+template <typename... Xs>
+__host__ __device__ constexpr auto make_multi_index(Xs... xs)
+{
+    return MultiIndex<sizeof...(Xs)>(xs...);
+}
+
 template <index_t Length>
 struct PassThrough
 {
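Note: make_multi_index deduces the rank from its argument count, which is what lets the padded kernel write make_multi_index(CPerBlock, 0, 0, 0) instead of spelling out a Sequence or MultiIndex type. A minimal standalone equivalent, with std::array standing in for ck::Array (an assumption for the sketch):

    #include <array>
    #include <cstddef>

    using index_t = int;

    template <std::size_t N>
    using MultiIndex = std::array<index_t, N>;

    // Rank is deduced from the number of arguments, as in the new helper.
    template <typename... Xs>
    constexpr auto make_multi_index(Xs... xs)
    {
        return MultiIndex<sizeof...(Xs)>{static_cast<index_t>(xs)...};
    }

    static_assert(make_multi_index(2, 0, 0, 0).size() == 4,
                  "rank deduced from arguments");

    int main() { return make_multi_index(1, 2)[0] == 1 ? 0 : 1; }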
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp

@@ -408,8 +408,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
     template <class T, bool PositiveDirection>
     __device__ void
-    MoveSrcSlicingWindow(T step_sizes,
-                         integral_constant<bool, PositiveDirection> positive_direction)
+    MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
     {
         static_for<0, nDim, 1>{}([&](auto idim) {
             if(step_sizes[idim] != 0)

@@ -506,18 +505,16 @@ struct BlockwiseGenericTensorSliceCopy_v2
     template <class T, bool PositiveDirection>
     __device__ void
-    MoveSrcSlicingWindow(T step_sizes,
-                         integral_constant<bool, PositiveDirection> positive_direction)
+    MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
     {
-        mThreadwiseLoad.MoveSrcSlicingWindow(step_sizes, positive_direction);
+        mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction);
     }

     template <class T, bool PositiveDirection>
     __device__ void
-    MoveDstSlicingWindow(T step_sizes,
-                         integral_constant<bool, PositiveDirection> positive_direction)
+    MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
     {
-        mThreadwiseStore.MoveDstSlicingWindow(step_sizes, positive_direction);
+        mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction);
     }

     private:

@@ -753,18 +750,16 @@ struct BlockwiseGenericTensorSliceCopy_v4
     template <class T, bool PositiveDirection>
     __device__ void
-    MoveSrcSlicingWindow(T step_sizes,
-                         integral_constant<bool, PositiveDirection> positive_direction)
+    MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
     {
-        mThreadwiseLoad.MoveSrcSlicingWindow(step_sizes, positive_direction);
+        mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction);
     }

     template <class T, bool PositiveDirection>
     __device__ void
-    MoveDstSlicingWindow(T step_sizes,
-                         integral_constant<bool, PositiveDirection> positive_direction)
+    MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
     {
-        mThreadwiseStore.MoveDstSlicingWindow(step_sizes, positive_direction);
+        mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction);
     }

     private:
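Note: the Slicing -> Slice rename is mechanical across all copy classes, and the contract is unchanged: add the step to the stored slice origin when the compile-time direction flag is true, subtract otherwise (matching the `mSrcSliceOrigin += step_sizes` visible in the threadwise file below). A toy model of that contract, with std::integral_constant in place of ck's type (assumed semantics, illustrative only):

    #include <cassert>
    #include <type_traits>

    struct ToyThreadwiseCopy
    {
        int mSrcSliceOrigin = 0;

        // The bool template parameter is deduced from the tag argument,
        // as with integral_constant<bool, PositiveDirection> in the real code.
        template <bool PositiveDirection>
        void MoveSrcSliceWindow(int step, std::integral_constant<bool, PositiveDirection>)
        {
            if(PositiveDirection)
                mSrcSliceOrigin += step;
            else
                mSrcSliceOrigin -= step;
        }
    };

    int main()
    {
        ToyThreadwiseCopy copy;
        copy.MoveSrcSliceWindow(4, std::true_type{});  // forward, like (step, True)
        copy.MoveSrcSliceWindow(4, std::false_type{}); // backward, like (step, False)
        assert(copy.mSrcSliceOrigin == 0);
        return 0;
    }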
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp

@@ -757,7 +757,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
     // T can be Sequence or Array
     template <class T, bool PositiveDirection>
-    __device__ void MoveSrcSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
+    __device__ void MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
     {
         static_if<PositiveDirection>{}([&](auto) {
             mSrcSliceOrigin += step_sizes;

@@ -765,7 +765,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
     }

     template <class T, bool PositiveDirection>
-    __device__ void MoveDstSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
+    __device__ void MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
     {
         static_if<PositiveDirection>{}([&](auto) {
             mDstSliceOrigin += step_sizes;

@@ -1045,8 +1045,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                     // TODO: still kind of messy
                     if(!src_coord.IsAnyLevelIndexInPaddingArea())
                     {
-                        const index_t src_offset =
-                            (mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id)).GetOffset();
+                        const index_t src_offset = src_coord.GetOffset();

                         const index_t buffer_offset = i * src_data_per_access;

@@ -1073,7 +1072,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
     }

     template <class T, bool PositiveDirection>
-    __device__ void MoveSrcSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
+    __device__ void MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
     {
         static_if<PositiveDirection>{}([&](auto) {
             mSrcSliceOrigin += step_sizes;

@@ -1081,7 +1080,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
     }

     template <class T, bool PositiveDirection>
-    __device__ void MoveDstSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
+    __device__ void MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
     {
         static_if<PositiveDirection>{}([&](auto) {
             mDstSliceOrigin += step_sizes;
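Note: the v4r2 hunk is the functional piece on the threadwise side: the offset now comes from the already-lowered src_coord, and the IsAnyLevelIndexInPaddingArea() guard skips the load entirely, leaving the zero-initialized buffer entry in place — which is how the pad region materializes as zeros. A one-dimensional sketch of that guarded-load pattern (illustrative types, not the real coordinate class):

    #include <cstdio>

    // A coordinate that falls in the padding area is never dereferenced;
    // the destination keeps its zero-initialized value.
    struct ToyCoord
    {
        int hp;       // padded coordinate
        int left_pad;
        int length;   // unpadded extent

        bool IsInPaddingArea() const
        {
            const int hi = hp - left_pad;
            return hi < 0 || hi >= length;
        }
        int GetOffset() const { return hp - left_pad; }
    };

    int main()
    {
        const float src[3] = {7, 8, 9};
        float buffer[5]    = {0, 0, 0, 0, 0}; // zero-initialized, like LDS staging

        for(int hp = 0; hp < 5; ++hp)
        {
            ToyCoord coord{hp, /*left_pad=*/1, /*length=*/3};
            if(!coord.IsInPaddingArea()) // mirrors IsAnyLevelIndexInPaddingArea()
                buffer[hp] = src[coord.GetOffset()];
        }
        std::printf("%g %g %g %g %g\n",
                    buffer[0], buffer[1], buffer[2], buffer[3], buffer[4]); // 0 7 8 9 0
        return 0;
    }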
driver/src/driver.cpp

@@ -72,20 +72,20 @@ int main(int argc, char* argv[])
 {
     using namespace ck;

-#if 1
+#if 0
     constexpr index_t N = 32;
     constexpr index_t C = 8;
-    constexpr index_t HI = 2;
-    constexpr index_t WI = 2;
+    constexpr index_t HI = 1;
+    constexpr index_t WI = 1;
     constexpr index_t K = 128;
-    constexpr index_t Y = 3;
-    constexpr index_t X = 3;
+    constexpr index_t Y = 1;
+    constexpr index_t X = 1;

     using ConvStrides   = Sequence<1, 1>;
     using ConvDilations = Sequence<1, 1>;

-    constexpr index_t HPad = 1;
-    constexpr index_t WPad = 1;
+    using LeftPads  = Sequence<1, 1>;
+    using RightPads = Sequence<0, 0>;
 #elif 1
     // 3x3, 34x34
     constexpr index_t N = 64;

@@ -99,8 +99,8 @@ int main(int argc, char* argv[])
     using ConvStrides   = Sequence<1, 1>;
     using ConvDilations = Sequence<1, 1>;

-    constexpr index_t HPad = 1;
-    constexpr index_t WPad = 1;
+    using LeftPads  = Sequence<1, 1>;
+    using RightPads = Sequence<1, 1>;
 #elif 0
     // 1x1 filter, 8x8 image
     // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%

@@ -311,13 +311,10 @@ int main(int argc, char* argv[])
     constexpr index_t WPad = 0;
 #endif

-    auto lower_pads = Sequence<HPad, WPad>{};
-    auto upper_pads = Sequence<HPad, WPad>{};
-
     auto in_nchw_desc  = make_ConstantTensorDescriptor_packed(Sequence<N, C, HI, WI>{});
     auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
     auto out_nkhw_desc = get_convolution_with_padding_output_default_4d_tensor_descriptor(
-        in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, lower_pads, upper_pads);
+        in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{});

     ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
     ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");

@@ -378,8 +375,8 @@ int main(int argc, char* argv[])
         wei_kcyx,
         out_nkhw_desc,
         out_nkhw_device,
-        lower_pads,
-        upper_pads,
+        LeftPads{},
+        RightPads{},
         nrepeat);
 #elif 0
     device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(

@@ -434,11 +431,12 @@ int main(int argc, char* argv[])
     if(do_verification)
     {
-#if 0
+#if 1
         if(Y == 3 && X == 3 && ConvStrides{}[0] == 1 && ConvStrides{}[1] == 1 &&
            ConvDilations{}[0] == 1 && ConvDilations{}[1] == 1)
         {
-            host_winograd_3x3_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads);
+            host_winograd_3x3_convolution(
+                in_nchw, wei_kcyx, out_nkhw_host, LeftPads{}, RightPads{});
         }
         else
 #endif

@@ -448,8 +446,8 @@ int main(int argc, char* argv[])
             out_nkhw_host,
             ConvStrides{},
             ConvDilations{},
-            lower_pads,
-            upper_pads);
+            LeftPads{},
+            RightPads{});
     }

     check_error(out_nkhw_host, out_nkhw_device);
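Note: with LeftPads/RightPads now carried as Sequence types end to end (instead of symmetric HPad/WPad scalars), the driver's output descriptor is computed from the padded extents, and left and right pads may differ, as in the small `#if 0` config. A quick check of the arithmetic presumably behind get_convolution_with_padding_output_default_4d_tensor_descriptor — the standard convolution output formula, an assumption here rather than something read from that header:

    #include <cassert>

    // Ho = (Hi + left_pad + right_pad - ((Y - 1) * dilation + 1)) / stride + 1
    int out_length(int in, int left_pad, int right_pad, int filter, int dilation, int stride)
    {
        const int eff_filter = (filter - 1) * dilation + 1;
        return (in + left_pad + right_pad - eff_filter) / stride + 1;
    }

    int main()
    {
        // The "3x3, 34x34" configuration with LeftPads = RightPads = Sequence<1, 1>:
        // padding restores the input extent, i.e. a 'same' convolution.
        assert(out_length(34, 1, 1, 3, 1, 1) == 34);

        // The small #if 0 config: HI = 1, Y = 1, LeftPads = {1, 1}, RightPads = {0, 0}.
        assert(out_length(1, 1, 0, 1, 1, 1) == 2);
        return 0;
    }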