Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
yangql
composable_kernel-1
Commits
1f705244
Commit
1f705244
authored
Sep 12, 2019
by
Chao Liu
Browse files
padding for chwn is functional
parent
724e984b
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
118 additions
and
272 deletions
+118
-272
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp
..._convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp
+69
-219
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
...ridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
+2
-2
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
...n_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
+2
-2
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
...ridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
+4
-5
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp
...n_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp
+4
-5
composable_kernel/include/tensor_description/multi_index_transform.hpp
...rnel/include/tensor_description/multi_index_transform.hpp
+6
-0
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
.../tensor_operation/blockwise_generic_tensor_slice_copy.hpp
+9
-14
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
...tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+5
-6
driver/src/driver.cpp
driver/src/driver.cpp
+17
-19
No files found.
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp
View file @
1f705244
...
@@ -47,27 +47,18 @@ template <index_t GridSize,
...
@@ -47,27 +47,18 @@ template <index_t GridSize,
index_t
OutThreadCopyDataPerAccess_N
>
index_t
OutThreadCopyDataPerAccess_N
>
struct
GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
struct
GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
{
{
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
static
constexpr
auto
I2
=
Number
<
2
>
{};
static
constexpr
auto
I3
=
Number
<
3
>
{};
static
constexpr
auto
I4
=
Number
<
4
>
{};
static
constexpr
auto
I5
=
Number
<
5
>
{};
static
constexpr
auto
I6
=
Number
<
6
>
{};
static
constexpr
auto
I7
=
Number
<
7
>
{};
static
constexpr
auto
I8
=
Number
<
8
>
{};
static
constexpr
auto
I9
=
Number
<
9
>
{};
static
constexpr
auto
I10
=
Number
<
10
>
{};
static
constexpr
auto
I11
=
Number
<
11
>
{};
static
constexpr
auto
True
=
integral_constant
<
bool
,
true
>
{};
static
constexpr
auto
False
=
integral_constant
<
bool
,
false
>
{};
#if 1
__device__
void
Run
(
const
Float
*
const
__restrict__
p_in_global
,
__device__
void
Run
(
const
Float
*
const
__restrict__
p_in_global
,
const
Float
*
const
__restrict__
p_wei_global
,
const
Float
*
const
__restrict__
p_wei_global
,
Float
*
const
__restrict__
p_out_global
)
const
Float
*
const
__restrict__
p_out_global
)
const
{
{
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
static
constexpr
auto
I2
=
Number
<
2
>
{};
static
constexpr
auto
I3
=
Number
<
3
>
{};
static
constexpr
auto
True
=
integral_constant
<
bool
,
true
>
{};
static
constexpr
auto
False
=
integral_constant
<
bool
,
false
>
{};
// be careful of this assertion
// be careful of this assertion
static_assert
(
static_assert
(
NPerBlock
%
NPerThread
==
0
&&
NPerBlock
%
NPerThread
==
0
&&
...
@@ -122,8 +113,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
...
@@ -122,8 +113,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
const
index_t
wo_block_data_begin
=
block_work_multi_id
[
2
]
*
WoPerBlock
;
const
index_t
wo_block_data_begin
=
block_work_multi_id
[
2
]
*
WoPerBlock
;
const
index_t
n_block_data_begin
=
block_work_multi_id
[
3
]
*
NPerBlock
;
const
index_t
n_block_data_begin
=
block_work_multi_id
[
3
]
*
NPerBlock
;
const
index_t
h
i
_block_data_begin
=
ho_block_data_begin
-
LeftPads
{}[
0
]
;
const
index_t
h
p
_block_data_begin
=
ho_block_data_begin
;
const
index_t
w
i
_block_data_begin
=
wo_block_data_begin
-
LeftPads
{}[
1
]
;
const
index_t
w
p
_block_data_begin
=
wo_block_data_begin
;
// input global tensor view
// input global tensor view
constexpr
auto
in_c_hp_wp_n_global_desc
=
transform_tensor_descriptor
(
constexpr
auto
in_c_hp_wp_n_global_desc
=
transform_tensor_descriptor
(
...
@@ -133,12 +124,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
...
@@ -133,12 +124,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
2
>
{},
Sequence
<
3
>
{}));
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
2
>
{},
Sequence
<
3
>
{}));
// global tensor view
constexpr
auto
wei_c_k_global_desc_old
=
wei_c_y_x_k_global_desc_old
.
Extract
(
I0
,
I3
);
constexpr
auto
wei_c_k_global_desc
=
make_native_tensor_descriptor
(
wei_c_k_global_desc_old
.
GetLengths
(),
wei_c_k_global_desc_old
.
GetStrides
());
// LDS tensor view
// LDS tensor view
// be careful of alignment
// be careful of alignment
constexpr
index_t
max_align
=
math
::
lcm
(
InBlockCopyDataPerAccess_N
,
constexpr
index_t
max_align
=
math
::
lcm
(
InBlockCopyDataPerAccess_N
,
...
@@ -158,15 +143,15 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
...
@@ -158,15 +143,15 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
static_assert
(
in_c_h_w_n_block_desc
.
GetStride
(
I1
)
%
GemmDataPerReadB
==
0
,
static_assert
(
in_c_h_w_n_block_desc
.
GetStride
(
I1
)
%
GemmDataPerReadB
==
0
,
"GemmDataPerReadB alignment requirement is not meet"
);
"GemmDataPerReadB alignment requirement is not meet"
);
constexpr
auto
wei_c_k_block_desc_old
=
make_ConstantTensorDescriptor_aligned
(
constexpr
auto
wei_c_
1_1_
k_block_desc_old
=
make_ConstantTensorDescriptor_aligned
(
Sequence
<
CPerBlock
,
KPerBlock
>
{},
Number
<
max_align
>
{});
Sequence
<
CPerBlock
,
1
,
1
,
KPerBlock
>
{},
Number
<
max_align
>
{});
constexpr
auto
wei_c_k_block_desc
=
make_native_tensor_descriptor
(
constexpr
auto
wei_c_
1_1_
k_block_desc
=
make_native_tensor_descriptor
(
wei_c_k_block_desc_old
.
GetLengths
(),
wei_c_k_block_desc_old
.
GetStrides
());
wei_c_
1_1_
k_block_desc_old
.
GetLengths
(),
wei_c_
1_1_
k_block_desc_old
.
GetStrides
());
// LDS: be careful of alignment
// LDS: be careful of alignment
constexpr
index_t
in_block_space
=
in_c_h_w_n_block_desc_old
.
GetElementSpace
();
constexpr
index_t
in_block_space
=
in_c_h_w_n_block_desc_old
.
GetElementSpace
();
constexpr
index_t
wei_block_space
=
wei_c_k_block_desc_old
.
GetElementSpace
();
constexpr
index_t
wei_block_space
=
wei_c_
1_1_
k_block_desc_old
.
GetElementSpace
();
__shared__
Float
p_in_block
[
in_block_space
];
__shared__
Float
p_in_block
[
in_block_space
];
__shared__
Float
p_wei_block
[
wei_block_space
];
__shared__
Float
p_wei_block
[
wei_block_space
];
...
@@ -181,46 +166,45 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
...
@@ -181,46 +166,45 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
// blockwise input copy
// blockwise input copy
// format is [C, Hi, Wi, N]
// format is [C, Hi, Wi, N]
auto
blockwise_in_copy
=
auto
blockwise_in_copy
=
#if 0
BlockwiseGenericTensorSliceCopy_v4
<
BlockSize
,
BlockwiseGenericTensorSliceCopy_v2
decltype
(
in_c_hp_wp_n_global_desc
),
#else
decltype
(
in_c_h_w_n_block_desc
),
BlockwiseGenericTensorSliceCopy_v4
decltype
(
in_c_h_w_n_block_desc
.
GetLengths
()),
#endif
InBlockCopySubLengths_CHWN
,
<
BlockSize
,
InBlockCopyClusterLengths_CHWN
,
decltype
(
in_c_hp_wp_n_global_desc
),
Sequence
<
0
,
1
,
2
,
3
>
,
decltype
(
in_c_h_w_n_block_desc
),
Sequence
<
0
,
1
,
2
,
3
>
,
decltype
(
in_c_h_w_n_block_desc
.
GetLengths
()),
Sequence
<
0
,
1
,
2
,
3
>
,
InBlockCopySubLengths_CHWN
,
3
,
InBlockCopyClusterLengths_CHWN
,
3
,
Sequence
<
0
,
1
,
2
,
3
>
,
InBlockCopyDataPerAccess_N
,
Sequence
<
0
,
1
,
2
,
3
>
,
InBlockCopyDataPerAccess_N
>
(
Sequence
<
0
,
1
,
2
,
3
>
,
{
0
,
hp_block_data_begin
,
wp_block_data_begin
,
n_block_data_begin
},
{
0
,
0
,
0
,
0
});
3
,
3
,
InBlockCopyDataPerAccess_N
,
InBlockCopyDataPerAccess_N
>
({
0
,
0
,
0
,
0
},
{
0
,
0
,
0
,
0
});
// blockwise wei copy
// blockwise wei copy
// format is [CPerBlock, KPerBlock]
// format is [CPerBlock, KPerBlock]
const
auto
blockwise_wei_copy
=
using
WeiBlockCopySubLengths_CYXK
=
#if 0
Sequence
<
WeiBlockCopySubLengths_CK
::
At
(
0
),
1
,
1
,
WeiBlockCopySubLengths_CK
::
At
(
1
)
>
;
BlockwiseGenericTensorSliceCopy_v2
using
WeiBlockCopyClusterLengths_CYXK
=
Sequence
<
WeiBlockCopyClusterLengths_CK
::
At
(
0
),
#else
1
,
BlockwiseGenericTensorSliceCopy_v4
1
,
#endif
WeiBlockCopyClusterLengths_CK
::
At
(
1
)
>
;
<
BlockSize
,
decltype
(
wei_c_k_global_desc
),
auto
blockwise_wei_copy
=
decltype
(
wei_c_k_block_desc
),
BlockwiseGenericTensorSliceCopy_v4
<
BlockSize
,
decltype
(
wei_c_k_block_desc
.
GetLengths
()),
decltype
(
wei_c_y_x_k_global_desc
),
WeiBlockCopySubLengths_CK
,
decltype
(
wei_c_1_1_k_block_desc
),
WeiBlockCopyClusterLengths_CK
,
decltype
(
wei_c_1_1_k_block_desc
.
GetLengths
()),
Sequence
<
0
,
1
>
,
WeiBlockCopySubLengths_CYXK
,
Sequence
<
0
,
1
>
,
WeiBlockCopyClusterLengths_CYXK
,
Sequence
<
0
,
1
>
,
Sequence
<
0
,
1
,
2
,
3
>
,
1
,
Sequence
<
0
,
1
,
2
,
3
>
,
1
,
Sequence
<
0
,
1
,
2
,
3
>
,
WeiBlockCopyDataPerAccess_K
,
3
,
WeiBlockCopyDataPerAccess_K
>
({
0
,
0
},
{
0
,
0
});
3
,
WeiBlockCopyDataPerAccess_K
,
WeiBlockCopyDataPerAccess_K
>
(
{
0
,
0
,
0
,
k_block_data_begin
},
{
0
,
0
,
0
,
0
});
// a series of blockwise batched GEMM
// a series of blockwise batched GEMM
// C_matrix += transpose(A_matrix) * B_matrix
// C_matrix += transpose(A_matrix) * B_matrix
...
@@ -228,8 +212,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
...
@@ -228,8 +212,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
// A_matrix[C,K] is a sub-matrix of wei_block[C,K]
// A_matrix[C,K] is a sub-matrix of wei_block[C,K]
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
// C_matrix[K,Wo*N] is a sub-matrix of out_block[K,Ho,Wo,N]
// C_matrix[K,Wo*N] is a sub-matrix of out_block[K,Ho,Wo,N]
constexpr
auto
a_c_k_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
constexpr
auto
a_c_k_block_mtx_desc
=
Number
<
CPerBlock
>
{},
Number
<
KPerBlock
>
{},
Number
<
wei_c_k_block_desc
.
GetStride
(
I0
)
>
{});
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
KPerBlock
>
{},
Number
<
wei_c_1_1_k_block_desc
.
GetStride
(
I0
)
>
{});
constexpr
auto
b_c_wn_block_mtx_desc
=
constexpr
auto
b_c_wn_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
...
@@ -270,39 +256,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
...
@@ -270,39 +256,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
// set threadwise output tensor to 0
// set threadwise output tensor to 0
threadwise_matrix_set_zero
(
c_k_wn_thread_mtx_desc
,
p_out_thread
);
threadwise_matrix_set_zero
(
c_k_wn_thread_mtx_desc
,
p_out_thread
);
#if 1
for
(
index_t
y
=
0
;
y
<
Y
;
++
y
)
{
for
(
index_t
x
=
0
;
x
<
X
;
++
x
)
{
const
Float
*
p_in_global_block_offset
=
p_in_global
+
in_c_h_w_n_global_desc
.
CalculateOffset
(
{
0
,
hi_block_data_begin
+
y
,
wi_block_data_begin
+
x
,
n_block_data_begin
});
const
Float
*
p_wei_global_block_offset
=
p_wei_global
+
wei_c_y_x_k_global_desc
.
CalculateOffset
({
0
,
y
,
x
,
k_block_data_begin
});
for
(
index_t
c_block_data_begin
=
0
;
c_block_data_begin
<
C
;
c_block_data_begin
+=
CPerBlock
,
p_in_global_block_offset
+=
CPerBlock
*
in_c_h_w_n_global_desc
.
GetStride
(
I0
),
p_wei_global_block_offset
+=
CPerBlock
*
wei_c_y_x_k_global_desc
.
GetStride
(
I0
))
{
blockwise_in_copy
.
Run
(
p_in_global_block_offset
,
p_in_block
);
blockwise_wei_copy
.
Run
(
p_wei_global_block_offset
,
p_wei_block
);
__syncthreads
();
blockwise_batch_gemm
.
Run
(
p_wei_block
,
p_in_block
,
p_out_thread
);
__syncthreads
();
}
}
}
#else
for
(
index_t
y
=
0
;
y
<
Y
;
++
y
)
for
(
index_t
y
=
0
;
y
<
Y
;
++
y
)
{
{
for
(
index_t
x
=
0
;
x
<
X
;
++
x
)
for
(
index_t
x
=
0
;
x
<
X
;
++
x
)
...
@@ -310,8 +263,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
...
@@ -310,8 +263,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
for
(
index_t
c_block_data_begin
=
0
;
c_block_data_begin
<
C
;
for
(
index_t
c_block_data_begin
=
0
;
c_block_data_begin
<
C
;
c_block_data_begin
+=
CPerBlock
)
c_block_data_begin
+=
CPerBlock
)
{
{
blockwise_in_copy
.
Run
();
blockwise_in_copy
.
Run
(
p_in_global
,
p_in_block
);
blockwise_wei_copy
.
Run
();
blockwise_wei_copy
.
Run
(
p_wei_global
,
p_wei_block
);
__syncthreads
();
__syncthreads
();
...
@@ -320,28 +273,29 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
...
@@ -320,28 +273,29 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
__syncthreads
();
__syncthreads
();
// move along C
// move along C
blockwise_in_copy
.
MoveSrcSliceWindow
(
Sequence
<
CPerBlock
,
0
,
0
,
0
>
{},
True
);
blockwise_in_copy
.
MoveSrcSliceWindow
(
make_multi_index
(
CPerBlock
,
0
,
0
,
0
),
blockwise_wei_copy
.
MoveSrcSliceWindow
(
Sequence
<
CPerBlock
,
0
,
0
,
0
>
{},
True
);
True
);
blockwise_wei_copy
.
MoveSrcSliceWindow
(
make_multi_index
(
CPerBlock
,
0
,
0
,
0
),
True
);
}
}
// reset C
// reset C
blockwise_in_copy
.
MoveSrcSliceWindow
(
Sequence
<
C
,
0
,
0
,
0
>
{}
,
False
);
blockwise_in_copy
.
MoveSrcSliceWindow
(
make_multi_index
(
C
,
0
,
0
,
0
)
,
False
);
blockwise_wei_copy
.
MoveSrcSliceWindow
(
Sequence
<
C
,
0
,
0
,
0
>
{}
,
False
);
blockwise_wei_copy
.
MoveSrcSliceWindow
(
make_multi_index
(
C
,
0
,
0
,
0
)
,
False
);
// move aling X
// move aling X
blockwise_in_copy
.
MoveSrcSliceWindow
(
Sequence
<
0
,
0
,
1
,
0
>
{}
,
True
);
blockwise_in_copy
.
MoveSrcSliceWindow
(
make_multi_index
(
0
,
0
,
1
,
0
)
,
True
);
blockwise_wei_copy
.
MoveSrcSliceWindow
(
Sequence
<
0
,
0
,
1
,
0
>
{}
,
True
);
blockwise_wei_copy
.
MoveSrcSliceWindow
(
make_multi_index
(
0
,
0
,
1
,
0
)
,
True
);
}
}
// reset X
// reset X
blockwise_in_copy
.
MoveSrcSliceWindow
(
Sequence
<
0
,
0
,
X
,
0
>
{}
,
False
);
blockwise_in_copy
.
MoveSrcSliceWindow
(
make_multi_index
(
0
,
0
,
X
,
0
)
,
False
);
blockwise_wei_copy
.
MoveSrcSliceWindow
(
Sequence
<
0
,
0
,
X
,
0
>
{}
,
False
);
blockwise_wei_copy
.
MoveSrcSliceWindow
(
make_multi_index
(
0
,
0
,
X
,
0
)
,
False
);
// move along Y
// move along Y
blockwise_in_copy
.
MoveSrcSliceWindow
(
Sequence
<
0
,
1
,
0
,
0
>
{},
Fals
e
);
blockwise_in_copy
.
MoveSrcSliceWindow
(
make_multi_index
(
0
,
1
,
0
,
0
),
Tru
e
);
blockwise_wei_copy
.
MoveSrcSliceWindow
(
Sequence
<
0
,
1
,
0
,
0
>
{},
Fals
e
);
blockwise_wei_copy
.
MoveSrcSliceWindow
(
make_multi_index
(
0
,
1
,
0
,
0
),
Tru
e
);
}
}
#endif
// output: register to global mem
// output: register to global mem
const
auto
c_thread_mtx_begin
=
const
auto
c_thread_mtx_begin
=
...
@@ -454,110 +408,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
...
@@ -454,110 +408,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
.
Run
(
p_out_thread
,
p_out_thread_on_global
);
.
Run
(
p_out_thread
,
p_out_thread_on_global
);
});
});
}
}
#elif 0
__device__
void
Run
(
const
Float
*
const
__restrict__
p_in_global
,
const
Float
*
const
__restrict__
p_wei_global
,
Float
*
const
__restrict__
p_out_global
)
const
{
// create a native tensor descriptor
constexpr
auto
in_c_h_w_n_global_desc
=
make_native_tensor_descriptor
(
InGlobalDesc
::
GetLengths
(),
InGlobalDesc
::
GetStrides
());
constexpr
index_t
C
=
in_c_h_w_n_global_desc
.
GetLength
(
I0
);
constexpr
index_t
Hi
=
in_c_h_w_n_global_desc
.
GetLength
(
I1
);
constexpr
index_t
Wi
=
in_c_h_w_n_global_desc
.
GetLength
(
I2
);
constexpr
index_t
N
=
in_c_h_w_n_global_desc
.
GetLength
(
I3
);
// transformation: {c, h, w, n} --> {n, c, hp, wp}
// {h, w} --> {hp, wp}, {c} --> {c}, {n} --> {n}
constexpr
auto
in_n_c_hp_wp_global_desc
=
transform_tensor_descriptor
(
in_c_h_w_n_global_desc
,
make_tuple
(
Pad
<
Sequence
<
Hi
,
Wi
>
,
LeftPads
,
RightPads
>
{},
PassThrough
<
C
>
{},
PassThrough
<
N
>
{}),
make_tuple
(
Sequence
<
1
,
2
>
{},
Sequence
<
0
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
2
,
3
>
{},
Sequence
<
1
>
{},
Sequence
<
0
>
{}));
// transformation: {n, c, hp, wp} --> {c, b}
// {n, hp, wp} --> {b}, {c} --> {c}
constexpr
auto
in_c_b_global_desc
=
transform_tensor_descriptor
(
in_n_c_hp_wp_global_desc
,
make_tuple
(
Merge
<
decltype
(
in_n_c_hp_wp_global_desc
.
GetLengths
(
I0
,
I2
,
I3
))
>
{},
PassThrough
<
in_n_c_hp_wp_global_desc
.
GetLength
(
I1
)
>
{}),
make_tuple
(
Sequence
<
0
,
2
,
3
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
1
>
{},
Sequence
<
0
>
{}));
if
(
get_thread_local_1d_id
()
==
0
&&
get_block_1d_id
()
==
0
)
{
// 0
print_tensor_descriptor
(
"in_c_h_w_n_global_desc"
,
in_c_h_w_n_global_desc
);
// 1
print_tensor_descriptor
(
"in_n_c_hp_wp_global_desc"
,
in_n_c_hp_wp_global_desc
);
// 2
print_tensor_descriptor
(
"in_c_b_global_desc"
,
in_c_b_global_desc
);
constexpr
auto
idx2
=
MultiIndex
<
2
>
{
1
,
4
*
(
16
*
16
)
+
5
*
16
+
6
};
auto
idx1
=
in_c_b_global_desc
.
CalculateLowerIndex
(
idx2
);
auto
idx0
=
in_c_b_global_desc
.
GetLowerTensorDescriptor
().
CalculateLowerIndex
(
idx1
);
print_array
(
"idx2: "
,
idx2
);
print_array
(
"idx1: "
,
idx1
);
print_array
(
"idx0: "
,
idx0
);
printf
(
"in_c_b_global_desc offset: %lu
\n
"
,
in_c_b_global_desc
.
CalculateOffset
(
idx2
));
}
}
#else
__device__
void
Run
(
const
Float
*
const
__restrict__
p_in_global
,
const
Float
*
const
__restrict__
p_wei_global
,
Float
*
const
__restrict__
p_out_global
)
const
{
// create a native tensor descriptor
constexpr
auto
in_c_h_w_n_global_desc
=
make_native_tensor_descriptor
(
InGlobalDesc
::
GetLengths
(),
InGlobalDesc
::
GetStrides
());
constexpr
index_t
C
=
in_c_h_w_n_global_desc
.
GetLength
(
I0
);
constexpr
index_t
Hi
=
in_c_h_w_n_global_desc
.
GetLength
(
I1
);
constexpr
index_t
Wi
=
in_c_h_w_n_global_desc
.
GetLength
(
I2
);
constexpr
index_t
N
=
in_c_h_w_n_global_desc
.
GetLength
(
I3
);
// transformation: {c, h, w, n} --> {n, c, hp, wp}
// {h, w} --> {hp, wp}, {c} --> {c}, {n} --> {n}
constexpr
auto
in_c_hp_wp_n_global_desc
=
transform_tensor_descriptor
(
in_c_h_w_n_global_desc
,
make_tuple
(
PassThrough
<
C
>
{},
Pad
<
Sequence
<
Hi
,
Wi
>
,
LeftPads
,
RightPads
>
{},
PassThrough
<
N
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
2
>
{},
Sequence
<
3
>
{}));
if
(
get_thread_local_1d_id
()
==
0
&&
get_block_1d_id
()
==
0
)
{
// 0
print_tensor_descriptor
(
"in_c_h_w_n_global_desc"
,
in_c_h_w_n_global_desc
);
// 1
print_tensor_descriptor
(
"in_c_hp_wp_n_global_desc"
,
in_c_hp_wp_n_global_desc
);
constexpr
auto
idx1
=
MultiIndex
<
4
>
{
1
,
2
,
3
,
4
};
auto
idx0
=
in_c_hp_wp_n_global_desc
.
CalculateLowerIndex
(
idx1
);
print_array
(
"idx1: "
,
idx1
);
print_array
(
"idx0: "
,
idx0
);
auto
coord1
=
make_tensor_coordinate_v2
(
in_c_hp_wp_n_global_desc
,
idx1
);
print_array
(
"1: "
,
coord1
.
GetIndex
());
print_array
(
"0: "
,
coord1
.
GetLowerCoordinate
().
GetIndex
());
printf
(
"in_c_hp_wp_n_global_desc is_in_pad: %d
\n
"
,
coord1
.
IsAnyLevelIndexInPaddingArea
());
printf
(
"in_c_hp_wp_n_global_desc offset: %lu
\n
"
,
in_c_hp_wp_n_global_desc
.
CalculateOffset
(
idx1
));
}
}
#endif
};
};
}
// namespace ck
}
// namespace ck
...
...
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
View file @
1f705244
...
@@ -304,8 +304,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
...
@@ -304,8 +304,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
#else
#else
blockwise_in_copy
.
MoveSrcSlic
ing
Window
({
EPerBlock
,
0
,
0
,
0
},
true
);
blockwise_in_copy
.
MoveSrcSlic
e
Window
({
EPerBlock
,
0
,
0
,
0
},
true
);
blockwise_wei_copy
.
MoveSrcSlic
ing
Window
({
EPerBlock
,
0
},
true
);
blockwise_wei_copy
.
MoveSrcSlic
e
Window
({
EPerBlock
,
0
},
true
);
#endif
#endif
}
}
...
...
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
View file @
1f705244
...
@@ -303,7 +303,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
...
@@ -303,7 +303,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
Float
p_in_register_buffer
[
blockwise_in_copy
.
GetRegisterBufferSize
()];
Float
p_in_register_buffer
[
blockwise_in_copy
.
GetRegisterBufferSize
()];
Float
p_wei_register_buffer
[
blockwise_wei_copy
.
GetRegisterBufferSize
()];
Float
p_wei_register_buffer
[
blockwise_wei_copy
.
GetRegisterBufferSize
()];
blockwise_in_copy
.
MoveSrcSlic
ing
Window
(
Sequence
<
EPerBlock
,
0
,
0
,
0
>
{},
True
);
blockwise_in_copy
.
MoveSrcSlic
e
Window
(
Sequence
<
EPerBlock
,
0
,
0
,
0
>
{},
True
);
p_wei_block_on_global
+=
EPerBlock
*
wei_e_k_global_desc
.
GetStride
(
I0
);
p_wei_block_on_global
+=
EPerBlock
*
wei_e_k_global_desc
.
GetStride
(
I0
);
__syncthreads
();
__syncthreads
();
...
@@ -328,7 +328,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
...
@@ -328,7 +328,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
Float
p_in_register_buffer
[
blockwise_in_copy
.
GetRegisterBufferSize
()];
Float
p_in_register_buffer
[
blockwise_in_copy
.
GetRegisterBufferSize
()];
Float
p_wei_register_buffer
[
blockwise_wei_copy
.
GetRegisterBufferSize
()];
Float
p_wei_register_buffer
[
blockwise_wei_copy
.
GetRegisterBufferSize
()];
blockwise_in_copy
.
MoveSrcSlic
ing
Window
(
Sequence
<
EPerBlock
,
0
,
0
,
0
>
{},
True
);
blockwise_in_copy
.
MoveSrcSlic
e
Window
(
Sequence
<
EPerBlock
,
0
,
0
,
0
>
{},
True
);
p_wei_block_on_global
+=
EPerBlock
*
wei_e_k_global_desc
.
GetStride
(
I0
);
p_wei_block_on_global
+=
EPerBlock
*
wei_e_k_global_desc
.
GetStride
(
I0
);
__syncthreads
();
__syncthreads
();
...
...
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
View file @
1f705244
...
@@ -240,8 +240,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
...
@@ -240,8 +240,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
__syncthreads
();
__syncthreads
();
blockwise_in_copy
.
MoveSrcSlic
ing
Window
(
Sequence
<
EPerBlock
,
0
>
{},
True
);
blockwise_in_copy
.
MoveSrcSlic
e
Window
(
Sequence
<
EPerBlock
,
0
>
{},
True
);
blockwise_wei_copy
.
MoveSrcSlic
ing
Window
(
Sequence
<
EPerBlock
,
0
>
{},
True
);
blockwise_wei_copy
.
MoveSrcSlic
e
Window
(
Sequence
<
EPerBlock
,
0
>
{},
True
);
}
}
// copy output: register to global memory
// copy output: register to global memory
...
@@ -297,9 +297,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
...
@@ -297,9 +297,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
{
{
threadwise_out_copy
.
Run
(
p_out_thread
,
p_out_global
);
threadwise_out_copy
.
Run
(
p_out_thread
,
p_out_global
);
threadwise_out_copy
.
MoveSrcSlicingWindow
(
Sequence
<
0
,
0
,
GemmNPerThreadSubC
>
{},
threadwise_out_copy
.
MoveSrcSliceWindow
(
Sequence
<
0
,
0
,
GemmNPerThreadSubC
>
{},
True
);
True
);
threadwise_out_copy
.
MoveDstSliceWindow
(
Sequence
<
0
,
0
,
B1
>
{},
True
);
threadwise_out_copy
.
MoveDstSlicingWindow
(
Sequence
<
0
,
0
,
B1
>
{},
True
);
}
}
}
}
}
}
...
...
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp
View file @
1f705244
...
@@ -269,7 +269,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
...
@@ -269,7 +269,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
Float
p_in_register_buffer
[
blockwise_in_copy
.
GetRegisterBufferSize
()];
Float
p_in_register_buffer
[
blockwise_in_copy
.
GetRegisterBufferSize
()];
Float
p_wei_register_buffer
[
blockwise_wei_copy
.
GetRegisterBufferSize
()];
Float
p_wei_register_buffer
[
blockwise_wei_copy
.
GetRegisterBufferSize
()];
blockwise_in_copy
.
MoveSrcSlic
ing
Window
(
Sequence
<
EPerBlock
,
0
>
{},
True
);
blockwise_in_copy
.
MoveSrcSlic
e
Window
(
Sequence
<
EPerBlock
,
0
>
{},
True
);
p_wei_block_on_global
+=
EPerBlock
*
wei_e_k_global_desc
.
GetStrides
()[
0
];
p_wei_block_on_global
+=
EPerBlock
*
wei_e_k_global_desc
.
GetStrides
()[
0
];
__syncthreads
();
__syncthreads
();
...
@@ -294,7 +294,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
...
@@ -294,7 +294,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
Float
p_wei_register_buffer
[
blockwise_wei_copy
.
GetRegisterBufferSize
()];
Float
p_wei_register_buffer
[
blockwise_wei_copy
.
GetRegisterBufferSize
()];
// even iteration
// even iteration
blockwise_in_copy
.
MoveSrcSlic
ing
Window
(
Sequence
<
EPerBlock
,
0
>
{},
True
);
blockwise_in_copy
.
MoveSrcSlic
e
Window
(
Sequence
<
EPerBlock
,
0
>
{},
True
);
p_wei_block_on_global
+=
EPerBlock
*
wei_e_k_global_desc
.
GetStrides
()[
0
];
p_wei_block_on_global
+=
EPerBlock
*
wei_e_k_global_desc
.
GetStrides
()[
0
];
__syncthreads
();
__syncthreads
();
...
@@ -379,9 +379,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
...
@@ -379,9 +379,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
{
{
threadwise_out_copy
.
Run
(
p_out_thread
,
p_out_global
);
threadwise_out_copy
.
Run
(
p_out_thread
,
p_out_global
);
threadwise_out_copy
.
MoveSrcSlicingWindow
(
Sequence
<
0
,
0
,
GemmNPerThreadSubC
>
{},
threadwise_out_copy
.
MoveSrcSliceWindow
(
Sequence
<
0
,
0
,
GemmNPerThreadSubC
>
{},
True
);
True
);
threadwise_out_copy
.
MoveDstSliceWindow
(
Sequence
<
0
,
0
,
B1
>
{},
True
);
threadwise_out_copy
.
MoveDstSlicingWindow
(
Sequence
<
0
,
0
,
B1
>
{},
True
);
}
}
}
}
}
}
...
...
composable_kernel/include/tensor_description/multi_index_transform.hpp
View file @
1f705244
...
@@ -8,6 +8,12 @@ namespace ck {
...
@@ -8,6 +8,12 @@ namespace ck {
template
<
index_t
N
>
template
<
index_t
N
>
using
MultiIndex
=
Array
<
index_t
,
N
>
;
using
MultiIndex
=
Array
<
index_t
,
N
>
;
template
<
typename
...
Xs
>
__host__
__device__
constexpr
auto
make_multi_index
(
Xs
...
xs
)
{
return
MultiIndex
<
sizeof
...(
Xs
)
>
(
xs
...);
}
template
<
index_t
Length
>
template
<
index_t
Length
>
struct
PassThrough
struct
PassThrough
{
{
...
...
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
View file @
1f705244
...
@@ -408,8 +408,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
...
@@ -408,8 +408,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
template
<
class
T
,
bool
PositiveDirection
>
template
<
class
T
,
bool
PositiveDirection
>
__device__
void
__device__
void
MoveSrcSlicingWindow
(
T
step_sizes
,
MoveSrcSliceWindow
(
T
step_sizes
,
integral_constant
<
bool
,
PositiveDirection
>
positive_direction
)
integral_constant
<
bool
,
PositiveDirection
>
positive_direction
)
{
{
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
idim
)
{
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
idim
)
{
if
(
step_sizes
[
idim
]
!=
0
)
if
(
step_sizes
[
idim
]
!=
0
)
...
@@ -506,18 +505,16 @@ struct BlockwiseGenericTensorSliceCopy_v2
...
@@ -506,18 +505,16 @@ struct BlockwiseGenericTensorSliceCopy_v2
template
<
class
T
,
bool
PositiveDirection
>
template
<
class
T
,
bool
PositiveDirection
>
__device__
void
__device__
void
MoveSrcSlicingWindow
(
T
step_sizes
,
MoveSrcSliceWindow
(
T
step_sizes
,
integral_constant
<
bool
,
PositiveDirection
>
positive_direction
)
integral_constant
<
bool
,
PositiveDirection
>
positive_direction
)
{
{
mThreadwiseLoad
.
MoveSrcSlic
ing
Window
(
step_sizes
,
positive_direction
);
mThreadwiseLoad
.
MoveSrcSlic
e
Window
(
step_sizes
,
positive_direction
);
}
}
template
<
class
T
,
bool
PositiveDirection
>
template
<
class
T
,
bool
PositiveDirection
>
__device__
void
__device__
void
MoveDstSlicingWindow
(
T
step_sizes
,
MoveDstSliceWindow
(
T
step_sizes
,
integral_constant
<
bool
,
PositiveDirection
>
positive_direction
)
integral_constant
<
bool
,
PositiveDirection
>
positive_direction
)
{
{
mThreadwiseStore
.
MoveDstSlic
ing
Window
(
step_sizes
,
positive_direction
);
mThreadwiseStore
.
MoveDstSlic
e
Window
(
step_sizes
,
positive_direction
);
}
}
private:
private:
...
@@ -753,18 +750,16 @@ struct BlockwiseGenericTensorSliceCopy_v4
...
@@ -753,18 +750,16 @@ struct BlockwiseGenericTensorSliceCopy_v4
template
<
class
T
,
bool
PositiveDirection
>
template
<
class
T
,
bool
PositiveDirection
>
__device__
void
__device__
void
MoveSrcSlicingWindow
(
T
step_sizes
,
MoveSrcSliceWindow
(
T
step_sizes
,
integral_constant
<
bool
,
PositiveDirection
>
positive_direction
)
integral_constant
<
bool
,
PositiveDirection
>
positive_direction
)
{
{
mThreadwiseLoad
.
MoveSrcSlic
ing
Window
(
step_sizes
,
positive_direction
);
mThreadwiseLoad
.
MoveSrcSlic
e
Window
(
step_sizes
,
positive_direction
);
}
}
template
<
class
T
,
bool
PositiveDirection
>
template
<
class
T
,
bool
PositiveDirection
>
__device__
void
__device__
void
MoveDstSlicingWindow
(
T
step_sizes
,
MoveDstSliceWindow
(
T
step_sizes
,
integral_constant
<
bool
,
PositiveDirection
>
positive_direction
)
integral_constant
<
bool
,
PositiveDirection
>
positive_direction
)
{
{
mThreadwiseStore
.
MoveDstSlic
ing
Window
(
step_sizes
,
positive_direction
);
mThreadwiseStore
.
MoveDstSlic
e
Window
(
step_sizes
,
positive_direction
);
}
}
private:
private:
...
...
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
View file @
1f705244
...
@@ -757,7 +757,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
...
@@ -757,7 +757,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
// T can be Sequence or Array
// T can be Sequence or Array
template
<
class
T
,
bool
PositiveDirection
>
template
<
class
T
,
bool
PositiveDirection
>
__device__
void
MoveSrcSlic
ing
Window
(
T
step_sizes
,
integral_constant
<
bool
,
PositiveDirection
>
)
__device__
void
MoveSrcSlic
e
Window
(
T
step_sizes
,
integral_constant
<
bool
,
PositiveDirection
>
)
{
{
static_if
<
PositiveDirection
>
{}([
&
](
auto
)
{
static_if
<
PositiveDirection
>
{}([
&
](
auto
)
{
mSrcSliceOrigin
+=
step_sizes
;
mSrcSliceOrigin
+=
step_sizes
;
...
@@ -765,7 +765,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
...
@@ -765,7 +765,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
}
}
template
<
class
T
,
bool
PositiveDirection
>
template
<
class
T
,
bool
PositiveDirection
>
__device__
void
MoveDstSlic
ing
Window
(
T
step_sizes
,
integral_constant
<
bool
,
PositiveDirection
>
)
__device__
void
MoveDstSlic
e
Window
(
T
step_sizes
,
integral_constant
<
bool
,
PositiveDirection
>
)
{
{
static_if
<
PositiveDirection
>
{}([
&
](
auto
)
{
static_if
<
PositiveDirection
>
{}([
&
](
auto
)
{
mDstSliceOrigin
+=
step_sizes
;
mDstSliceOrigin
+=
step_sizes
;
...
@@ -1045,8 +1045,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
...
@@ -1045,8 +1045,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// TODO: still kind of messy
// TODO: still kind of messy
if
(
!
src_coord
.
IsAnyLevelIndexInPaddingArea
())
if
(
!
src_coord
.
IsAnyLevelIndexInPaddingArea
())
{
{
const
index_t
src_offset
=
const
index_t
src_offset
=
src_coord
.
GetOffset
();
(
mSrcSliceOrigin
+
(
long_vector_data_begin_id
+
scalar_id
)).
GetOffset
();
const
index_t
buffer_offset
=
i
*
src_data_per_access
;
const
index_t
buffer_offset
=
i
*
src_data_per_access
;
...
@@ -1073,7 +1072,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
...
@@ -1073,7 +1072,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
}
}
template
<
class
T
,
bool
PositiveDirection
>
template
<
class
T
,
bool
PositiveDirection
>
__device__
void
MoveSrcSlic
ing
Window
(
T
step_sizes
,
integral_constant
<
bool
,
PositiveDirection
>
)
__device__
void
MoveSrcSlic
e
Window
(
T
step_sizes
,
integral_constant
<
bool
,
PositiveDirection
>
)
{
{
static_if
<
PositiveDirection
>
{}([
&
](
auto
)
{
static_if
<
PositiveDirection
>
{}([
&
](
auto
)
{
mSrcSliceOrigin
+=
step_sizes
;
mSrcSliceOrigin
+=
step_sizes
;
...
@@ -1081,7 +1080,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
...
@@ -1081,7 +1080,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
}
}
template
<
class
T
,
bool
PositiveDirection
>
template
<
class
T
,
bool
PositiveDirection
>
__device__
void
MoveDstSlic
ing
Window
(
T
step_sizes
,
integral_constant
<
bool
,
PositiveDirection
>
)
__device__
void
MoveDstSlic
e
Window
(
T
step_sizes
,
integral_constant
<
bool
,
PositiveDirection
>
)
{
{
static_if
<
PositiveDirection
>
{}([
&
](
auto
)
{
static_if
<
PositiveDirection
>
{}([
&
](
auto
)
{
mDstSliceOrigin
+=
step_sizes
;
mDstSliceOrigin
+=
step_sizes
;
...
...
driver/src/driver.cpp
View file @
1f705244
...
@@ -72,20 +72,20 @@ int main(int argc, char* argv[])
...
@@ -72,20 +72,20 @@ int main(int argc, char* argv[])
{
{
using
namespace
ck
;
using
namespace
ck
;
#if
1
#if
0
constexpr index_t N = 32;
constexpr index_t N = 32;
constexpr index_t C = 8;
constexpr index_t C = 8;
constexpr
index_t
HI
=
2
;
constexpr index_t HI =
1
;
constexpr
index_t
WI
=
2
;
constexpr index_t WI =
1
;
constexpr index_t K = 128;
constexpr index_t K = 128;
constexpr
index_t
Y
=
3
;
constexpr index_t Y =
1
;
constexpr
index_t
X
=
3
;
constexpr index_t X =
1
;
using ConvStrides = Sequence<1, 1>;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
constexpr
index_t
H
Pad
=
1
;
using Left
Pad
s
=
Sequence<1, 1>
;
constexpr
index_t
W
Pad
=
1
;
using Right
Pad
s
=
Sequence<0, 0>
;
#elif
1
#elif
1
// 3x3, 34x34
// 3x3, 34x34
constexpr
index_t
N
=
64
;
constexpr
index_t
N
=
64
;
...
@@ -99,8 +99,8 @@ int main(int argc, char* argv[])
...
@@ -99,8 +99,8 @@ int main(int argc, char* argv[])
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
constexpr
index_t
H
Pad
=
1
;
using
Left
Pad
s
=
Sequence
<
1
,
1
>
;
constexpr
index_t
W
Pad
=
1
;
using
Right
Pad
s
=
Sequence
<
1
,
1
>
;
#elif 0
#elif 0
// 1x1 filter, 8x8 image
// 1x1 filter, 8x8 image
// cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
// cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
...
@@ -311,13 +311,10 @@ int main(int argc, char* argv[])
...
@@ -311,13 +311,10 @@ int main(int argc, char* argv[])
constexpr
index_t
WPad
=
0
;
constexpr
index_t
WPad
=
0
;
#endif
#endif
auto
lower_pads
=
Sequence
<
HPad
,
WPad
>
{};
auto
upper_pads
=
Sequence
<
HPad
,
WPad
>
{};
auto
in_nchw_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
N
,
C
,
HI
,
WI
>
{});
auto
in_nchw_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
N
,
C
,
HI
,
WI
>
{});
auto
wei_kcyx_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
K
,
C
,
Y
,
X
>
{});
auto
wei_kcyx_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
K
,
C
,
Y
,
X
>
{});
auto
out_nkhw_desc
=
get_convolution_with_padding_output_default_4d_tensor_descriptor
(
auto
out_nkhw_desc
=
get_convolution_with_padding_output_default_4d_tensor_descriptor
(
in_nchw_desc
,
wei_kcyx_desc
,
ConvStrides
{},
ConvDilations
{},
lower_pads
,
upper_p
ads
);
in_nchw_desc
,
wei_kcyx_desc
,
ConvStrides
{},
ConvDilations
{},
LeftPads
{},
RightP
ads
{}
);
ostream_ConstantTensorDescriptor
(
in_nchw_desc
,
std
::
cout
<<
"in_nchw_desc: "
);
ostream_ConstantTensorDescriptor
(
in_nchw_desc
,
std
::
cout
<<
"in_nchw_desc: "
);
ostream_ConstantTensorDescriptor
(
wei_kcyx_desc
,
std
::
cout
<<
"wei_kcyx_desc: "
);
ostream_ConstantTensorDescriptor
(
wei_kcyx_desc
,
std
::
cout
<<
"wei_kcyx_desc: "
);
...
@@ -378,8 +375,8 @@ int main(int argc, char* argv[])
...
@@ -378,8 +375,8 @@ int main(int argc, char* argv[])
wei_kcyx
,
wei_kcyx
,
out_nkhw_desc
,
out_nkhw_desc
,
out_nkhw_device
,
out_nkhw_device
,
lower_p
ads
,
LeftP
ads
{}
,
upper_p
ads
,
RightP
ads
{}
,
nrepeat
);
nrepeat
);
#elif 0
#elif 0
device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
(
device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
(
...
@@ -434,11 +431,12 @@ int main(int argc, char* argv[])
...
@@ -434,11 +431,12 @@ int main(int argc, char* argv[])
if
(
do_verification
)
if
(
do_verification
)
{
{
#if
0
#if
1
if
(
Y
==
3
&&
X
==
3
&&
ConvStrides
{}[
0
]
==
1
&&
ConvStrides
{}[
1
]
==
1
&&
if
(
Y
==
3
&&
X
==
3
&&
ConvStrides
{}[
0
]
==
1
&&
ConvStrides
{}[
1
]
==
1
&&
ConvDilations
{}[
0
]
==
1
&&
ConvDilations
{}[
1
]
==
1
)
ConvDilations
{}[
0
]
==
1
&&
ConvDilations
{}[
1
]
==
1
)
{
{
host_winograd_3x3_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads);
host_winograd_3x3_convolution
(
in_nchw
,
wei_kcyx
,
out_nkhw_host
,
LeftPads
{},
RightPads
{});
}
}
else
else
#endif
#endif
...
@@ -448,8 +446,8 @@ int main(int argc, char* argv[])
...
@@ -448,8 +446,8 @@ int main(int argc, char* argv[])
out_nkhw_host
,
out_nkhw_host
,
ConvStrides
{},
ConvStrides
{},
ConvDilations
{},
ConvDilations
{},
lower_p
ads
,
LeftP
ads
{}
,
upper_p
ads
);
RightP
ads
{}
);
}
}
check_error
(
out_nkhw_host
,
out_nkhw_device
);
check_error
(
out_nkhw_host
,
out_nkhw_device
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment