gaoqiong / composable_kernel / Commits

Commit b12bbcee, authored Sep 26, 2019 by Chao Liu
Parent: 51a9fa1d

    clean up

Showing 16 changed files with 2372 additions and 2299 deletions (+2372 -2299)
Files changed:

composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp  (+32 -19)
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp  (+35 -22)
composable_kernel/include/tensor_description/tensor_coordinate.hpp  (+116 -246)
composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp  (+346 -0)
composable_kernel/include/tensor_description/tensor_coordinate_helper.hpp  (+1 -1)
composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp  (+0 -215)
composable_kernel/include/tensor_description/tensor_view.hpp  (+2 -2)
composable_kernel/include/tensor_description/tensor_visit.hpp  (+2 -2)
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp  (+1 -670)
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp  (+692 -0)
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp  (+7 -1113)
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp  (+1129 -0)
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp  (+1 -1)
driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp  (+2 -2)
driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp  (+1 -1)
driver/src/driver.cpp  (+5 -5)
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp

@@ -5,9 +5,9 @@
 #include "ConstantTensorDescriptor.hpp"
 #include "ConstantMergedTensorDescriptor.hpp"
 #include "ConstantMatrixDescriptor.hpp"
-#include "blockwise_generic_tensor_slice_copy.hpp"
+#include "blockwise_generic_tensor_slice_copy_deprecated.hpp"
 #include "blockwise_gemm.hpp"
-#include "threadwise_generic_tensor_slice_copy.hpp"
+#include "threadwise_generic_tensor_slice_copy_deprecated.hpp"
 
 namespace ck {
 
@@ -265,8 +265,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
         // LDS double buffer: preload data into LDS
         {
-            blockwise_in_copy.Run(p_in_global, p_in_block_double);
-            blockwise_wei_copy.Run(p_wei_global, p_wei_block_double);
+            blockwise_in_copy.template Run<Float, address_space_t::global>(p_in_global,
+                                                                           p_in_block_double);
+            blockwise_wei_copy.template Run<Float, address_space_t::global>(p_wei_global,
+                                                                            p_wei_block_double);
         }
 
         // LDS double buffer: main body
@@ -288,8 +290,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                 Float* p_wei_block_next =
                     even_loop ? p_wei_block_double + wei_block_space : p_wei_block_double;
 
-                Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
-                Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
+                Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()];
+                Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()];
 
                 blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0, 0, 0>{}, True);
                 blockwise_wei_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);
@@ -297,23 +299,25 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                 __syncthreads();
 
                 // LDS doubel buffer: load next data from device mem
-                blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer);
-                blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global, p_wei_register_buffer);
+                blockwise_in_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
+                    p_in_global, p_in_thread_buffer);
+                blockwise_wei_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
+                    p_wei_global, p_wei_thread_buffer);
 
                 // LDS double buffer: GEMM on current data
                 blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);
 
                 // LDS double buffer: store next data to LDS
-                blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, p_in_block_next);
-                blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, p_wei_block_next);
+                blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer, p_in_block_next);
+                blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer, p_wei_block_next);
             }
         }
 
         // LDS double buffer: tail
         {
             // even iteration
-            Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
-            Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
+            Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()];
+            Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()];
 
             blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0, 0, 0>{}, True);
             blockwise_wei_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);
@@ -321,17 +325,19 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             __syncthreads();
 
             // LDS doubel buffer: load next data from device mem
-            blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer);
-            blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global, p_wei_register_buffer);
+            blockwise_in_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
+                p_in_global, p_in_thread_buffer);
+            blockwise_wei_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
+                p_wei_global, p_wei_thread_buffer);
 
             // LDS double buffer: GEMM on current data
             blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread);
 
             // LDS double buffer: store next data to LDS
-            blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer,
-                                                     p_in_block_double + in_block_space);
-            blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer,
-                                                      p_wei_block_double + wei_block_space);
+            blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer,
+                                                   p_in_block_double + in_block_space);
+            blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer,
+                                                    p_wei_block_double + wei_block_space);
 
             // odd iteration
             __syncthreads();
@@ -390,7 +396,14 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                        0,
                        b_thread_data_on_global,
                        0})
-            .template Run_amd_experiment<Float, 0, 2>(p_out_thread, p_out_global);
+#if 0
+            .Run
+#else
+            // tweaking
+            .template Run_optimized_address_calculation<Float,
+                                                        address_space_t::generic,
+                                                        address_space_t::global>
+#endif
+            (p_out_thread, p_out_global);
         }
     }
 };
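Note: the control flow changed above follows the usual LDS double-buffering pattern described by the kernel's own comments: preload tile 0, then in each main-loop iteration load tile i+1 into a per-thread buffer, run the GEMM on tile i, and store tile i+1 into the other half of LDS, with a tail that drains the last tiles. The snippet below is a minimal host-side C++ analogue of that ping-pong schedule, not composable_kernel code; `load_tile` and `compute` are invented stand-ins for the global-memory copy and the blockwise GEMM.

```cpp
#include <array>
#include <cstdio>
#include <vector>

// Toy stand-in for "copy one tile from global memory into a thread buffer".
static std::array<int, 4> load_tile(const std::vector<int>& src, int tile)
{
    std::array<int, 4> buf{};
    for(int i = 0; i < 4; ++i) buf[i] = src[tile * 4 + i];
    return buf;
}

// Toy stand-in for "GEMM on the tile currently resident in LDS".
static void compute(const std::array<int, 4>& tile, long& acc)
{
    for(int v : tile) acc += v;
}

int main()
{
    constexpr int num_tiles = 8;
    std::vector<int> global(num_tiles * 4, 1);   // 8 tiles of 4 elements
    std::array<std::array<int, 4>, 2> lds{};     // double buffer ("ping" and "pong")
    long acc = 0;

    lds[0] = load_tile(global, 0);               // preload tile 0

    for(int t = 0; t + 1 < num_tiles; ++t)
    {
        auto next = load_tile(global, t + 1);    // load next tile into "registers"
        compute(lds[t % 2], acc);                // compute on the current buffer
        lds[(t + 1) % 2] = next;                 // store next tile into the other buffer
    }
    compute(lds[(num_tiles - 1) % 2], acc);      // tail: last tile

    std::printf("acc = %ld\n", acc);             // prints acc = 32
    return 0;
}
```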
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp

@@ -5,9 +5,9 @@
 #include "ConstantTensorDescriptor.hpp"
 #include "ConstantMergedTensorDescriptor.hpp"
 #include "ConstantMatrixDescriptor.hpp"
-#include "blockwise_generic_tensor_slice_copy.hpp"
+#include "blockwise_generic_tensor_slice_copy_deprecated.hpp"
 #include "blockwise_gemm.hpp"
-#include "threadwise_generic_tensor_slice_copy.hpp"
+#include "threadwise_generic_tensor_slice_copy_deprecated.hpp"
 
 namespace ck {
 
@@ -251,8 +251,10 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
         // LDS double buffer: preload data into LDS
         {
-            blockwise_in_copy.Run(p_in_global, p_in_block_double);
-            blockwise_wei_copy.Run(p_wei_global, p_wei_block_double);
+            blockwise_in_copy.template Run<Float, address_space_t::global>(p_in_global,
+                                                                           p_in_block_double);
+            blockwise_wei_copy.template Run<Float, address_space_t::global>(p_wei_global,
+                                                                            p_wei_block_double);
        }
 
         // LDS double buffer: main body
@@ -274,51 +276,54 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
                 Float* p_wei_block_next =
                     even_loop ? p_wei_block_double + wei_block_space : p_wei_block_double;
 
-                Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
-                Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
+                Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()];
+                Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()];
 
                 blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);
-                p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStrides()[0];
+                blockwise_wei_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);
 
                 __syncthreads();
 
                 // LDS doubel buffer: load next data from device mem
-                blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer);
-                blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global,
-                                                         p_wei_register_buffer);
+                blockwise_in_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
+                    p_in_global, p_in_thread_buffer);
+                blockwise_wei_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
+                    p_wei_global, p_wei_thread_buffer);
 
                 // LDS double buffer: GEMM on current data
                 blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);
 
                 // LDS double buffer: store next data to LDS
-                blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, p_in_block_next);
-                blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, p_wei_block_next);
+                blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer, p_in_block_next);
+                blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer, p_wei_block_next);
             }
         }
 
         // LDS double buffer: tail
         {
-            Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
-            Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
+            Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()];
+            Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()];
 
             // even iteration
             blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);
-            p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStrides()[0];
+            blockwise_wei_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);
 
             __syncthreads();
 
             // LDS doubel buffer: load next data from device mem
-            blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer);
-            blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global, p_wei_register_buffer);
+            blockwise_in_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
+                p_in_global, p_in_thread_buffer);
+            blockwise_wei_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
+                p_wei_global, p_wei_thread_buffer);
 
             // LDS double buffer: GEMM on current data
             blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread);
 
             // LDS double buffer: store next data to LDS
-            blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer,
-                                                     p_in_block_double + in_block_space);
-            blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer,
-                                                      p_wei_block_double + wei_block_space);
+            blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer,
+                                                   p_in_block_double + in_block_space);
+            blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer,
+                                                    p_wei_block_double + wei_block_space);
 
             // odd iteration
             __syncthreads();
@@ -385,7 +390,15 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
         for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat)
         {
-            threadwise_out_copy.Run(p_out_thread, p_out_global);
+            threadwise_out_copy
+#if 1
+                .Run
+#else // tweaking
+                .template Run_optimized_address_calculation<Float,
+                                                            address_space_t::generic,
+                                                            address_space_t::global>
+#endif
+                (p_out_thread, p_out_global);
 
             threadwise_out_copy.MoveSrcSliceWindow(Sequence<0, 0, GemmNPerThreadSubC>{}, True);
             threadwise_out_copy.MoveDstSliceWindow(Sequence<0, 0, B1>{}, True);
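Note: in both gridwise kernels the copy helpers are now driven through the renamed thread-buffer interface (GetThreadBufferSize / RunLoadThreadBuffer / RunStoreThreadBuffer), with the source address space passed as a template argument. The sketch below only illustrates the call-site shape; BlockwiseCopy, its enum and its method bodies are invented stand-ins, not the real deprecated blockwise copy classes.

```cpp
#include <cstddef>
#include <cstdio>

enum class address_space_t { generic, global };

// Hypothetical stand-in for the blockwise copy object used in the kernels above.
struct BlockwiseCopy
{
    static constexpr std::size_t GetThreadBufferSize() { return 4; }

    template <typename Float, address_space_t AddrSpace>
    void RunLoadThreadBuffer(const Float* p_src, Float* p_buf) const
    {
        for(std::size_t i = 0; i < GetThreadBufferSize(); ++i) p_buf[i] = p_src[i];
    }

    template <typename Float>
    void RunStoreThreadBuffer(const Float* p_buf, Float* p_dst) const
    {
        for(std::size_t i = 0; i < GetThreadBufferSize(); ++i) p_dst[i] = p_buf[i];
    }
};

int main()
{
    float p_global[4] = {1, 2, 3, 4};
    float p_lds[4]    = {};

    BlockwiseCopy blockwise_in_copy;

    // Same shape as the kernel call sites: a thread buffer sized by the copy object,
    // a load tagged with the source address space, then a store into LDS.
    // (Inside the kernels the copy object's type is template-dependent, hence the
    // ".template" spelling at those call sites.)
    float p_in_thread_buffer[BlockwiseCopy::GetThreadBufferSize()];

    blockwise_in_copy.RunLoadThreadBuffer<float, address_space_t::global>(p_global,
                                                                          p_in_thread_buffer);
    blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer, p_lds);

    std::printf("%g %g %g %g\n", p_lds[0], p_lds[1], p_lds[2], p_lds[3]);
    return 0;
}
```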
composable_kernel/include/tensor_description/tensor_coordinate.hpp

-#ifndef CK_TENSOR_COORDINATE_HPP
-#define CK_TENSOR_COORDINATE_HPP
+#ifndef CK_TENSOR_COORDINATE_V2_HPP
+#define CK_TENSOR_COORDINATE_V2_HPP
 
 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
-#include "ConstantMergedTensorDescriptor.hpp"
+#include "dimension.hpp"
+#include "multi_index_transform.hpp"
+#include "tensor_descriptor.hpp"
 
 namespace ck {
 
-// TensorDesc is ConstantTensorDescriptor
-template <class TensorDesc>
-struct NormalTensorCoordinate
+template <typename TensorDesc>
+struct TensorCoordinate;
+
+template <typename NativeTensorDesc>
+struct NativeTensorCoordinate
 {
-    using type             = NormalTensorCoordinate;
-    using tensor_desc_type = TensorDesc;
+    using type             = NativeTensorCoordinate;
+    using tensor_desc_type = NativeTensorDesc;
 
     static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();
 
-    __host__ __device__ constexpr NormalTensorCoordinate(Array<index_t, nDim> tensor_index)
-        : mOffset{tensor_desc_type::GetOffsetFromMultiIndex(tensor_index)}
+    using Index = MultiIndex<nDim>;
+
+    __host__ __device__ constexpr NativeTensorCoordinate(Index idx)
+        : mIndex(idx), mOffset(tensor_desc_type::CalculateOffset(idx))
     {
     }
 ...

(The remainder of the diff removes the old NormalTensorCoordinate / MergedTensorCoordinate implementation and the old TensorCoordinate dispatcher over ConstantTensorDescriptor / ConstantMergedTensorDescriptor; that code is preserved verbatim, under *_deprecated names, in the new tensor_coordinate_deprecated.hpp below. In its place the file now holds the NativeTensorCoordinate / TransformedTensorCoordinate implementation previously in tensor_coordinate_v2.hpp, deleted below, with TensorCoordinate_v2 renamed to TensorCoordinate. The dispatcher changes as follows.)

-template <class TensorDesc>
+template <typename TensorDesc>
 struct TensorCoordinate
 {
     private:
-    template <class... Ts>
+    template <typename... Ts>
     __host__ __device__ static constexpr auto
-    MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
+    MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
     {
-        return NormalTensorCoordinate<ConstantTensorDescriptor<Ts...>>();
+        return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
+            make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
     }
 
-    template <class... Ts>
+    template <typename... Ts>
     __host__ __device__ static constexpr auto
-    MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
+    MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
     {
-        return MergedTensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>();
+        return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
+            make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
     }
 
     public:
 ...
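Note: the NativeTensorCoordinate that now lives in this file pairs a multi-index with a running offset and updates both through operator+= / operator-=, so a window move only needs the offset difference (CalculateOffsetDiff) rather than a full offset recomputation. The following self-contained sketch mirrors that idea over a fixed 2-D row-major layout; NativeCoord2D and its strides are invented for illustration, while the real class delegates the offset math to its NativeTensorDescriptor.

```cpp
#include <array>
#include <cstdio>

// Illustrative 2-D coordinate with compile-time strides (8 columns, row-major).
struct NativeCoord2D
{
    static constexpr std::array<int, 2> strides{8, 1};

    std::array<int, 2> index{};
    int offset = 0;

    explicit NativeCoord2D(std::array<int, 2> idx)
        : index(idx), offset(idx[0] * strides[0] + idx[1] * strides[1])
    {
    }

    // Analogue of operator+=(const Index&): update the index and the offset by the diff.
    NativeCoord2D& operator+=(std::array<int, 2> diff)
    {
        index[0] += diff[0];
        index[1] += diff[1];
        offset += diff[0] * strides[0] + diff[1] * strides[1];
        return *this;
    }
};

int main()
{
    NativeCoord2D coord({2, 3});               // offset 2*8 + 3 = 19
    coord += std::array<int, 2>{1, 0};         // one row down: offset 27
    coord += std::array<int, 2>{0, 2};         // two columns right: offset 29
    std::printf("index (%d,%d), offset %d\n", coord.index[0], coord.index[1], coord.offset);
    return 0;
}
```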
composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp (new file, 0 → 100644)

#ifndef CK_TENSOR_COORDINATE_DEPRECATED_HPP
#define CK_TENSOR_COORDINATE_DEPRECATED_HPP

#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"

namespace ck {

// TensorDesc is ConstantTensorDescriptor
template <class TensorDesc>
struct NormalTensorCoordinate_deprecated
{
    using type             = NormalTensorCoordinate_deprecated;
    using tensor_desc_type = TensorDesc;

    static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();

    __host__ __device__ constexpr NormalTensorCoordinate_deprecated(
        Array<index_t, nDim> tensor_index)
        : mOffset{tensor_desc_type::GetOffsetFromMultiIndex(tensor_index)}
    {
    }

    template <class... Xs>
    __host__ __device__ constexpr NormalTensorCoordinate_deprecated(Xs... xs)
        : NormalTensorCoordinate_deprecated(Array<index_t, nDim>{xs...})
    {
    }

    template <index_t... Xs>
    __host__ __device__ constexpr NormalTensorCoordinate_deprecated(Sequence<Xs...>)
        : NormalTensorCoordinate_deprecated(Array<index_t, nDim>{Xs...})
    {
    }

    __host__ __device__ constexpr index_t GetOffset() const { return mOffset; }

    // T is Array or Sequence
    template <class T>
    __host__ __device__ type operator+=(T step_sizes)
    {
        static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");

        mOffset += tensor_desc_type::GetOffsetFromMultiIndex(step_sizes);

        return *this;
    }

    template <class T>
    __host__ __device__ type operator-=(T step_sizes)
    {
        static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");

        mOffset -= tensor_desc_type::GetOffsetFromMultiIndex(step_sizes);

        return *this;
    }

    template <class T>
    __host__ __device__ constexpr type operator+(T step_sizes) const
    {
        type coord = *this;
        coord += step_sizes;
        return coord;
    }

    template <class T>
    __host__ __device__ constexpr type operator-(T step_sizes) const
    {
        type coord = *this;
        coord -= step_sizes;
        return coord;
    }

    // reposition point of origin, and return compensated offset.
    // This is a hack to reduce index calculation during looping over
    // a tensor whose origin is this TensorCoordinate. It does so, by spitting
    // out the run-time offset to the pointer (to the tensor data) held by this
    // TensorCoordiante, so the caller can add the offset into the run-time pointer of
    // the data, so only 1 run-time variable (update pointer) is needed, instead
    // of 2 run-time variables (old pointer and this offset)
    // TODO: after introducing the concept of "run-time tensor view", which contains the
    // run-time pointer to the data, always keep track of the pointer, instead of both
    // offset and the pointer. This also bring additional benefit that we don't need to
    // worry the offset might underflow (because offset is unsigned integer) when updating it.
    __host__ __device__ constexpr index_t RepositionOrigin()
    {
        index_t offset_diff = mOffset;
        mOffset             = 0;
        return offset_diff;
    }

    private:
    index_t mOffset;
};

// TensorDesc is ConstantMergedTensorDescriptor
template <class TensorDesc>
struct MergedTensorCoordinate
{
    using type             = MergedTensorCoordinate;
    using tensor_desc_type = TensorDesc;

    static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();
    static constexpr index_t nOriginalDim =
        tensor_desc_type::GetOriginalTensorDescriptor().GetNumOfDimension();

    __host__ __device__ constexpr MergedTensorCoordinate(Array<index_t, nDim> tensor_index)
        : mOriginalIndex{tensor_desc_type::GetOriginalMultiIndexFromMultiIndex(tensor_index)}
    {
        // partial offset on each dimension
        static_for<0, nDim, 1>{}([&](auto idim) {
            constexpr auto partial_original_dims =
                tensor_desc_type::GetContainedOriginalDimensions(idim);

            constexpr auto partial_original_desc =
                tensor_desc_type::GetOriginalTensorDescriptor().Extract(partial_original_dims);

            mPartialOffsets(idim) = partial_original_desc.GetOffsetFromMultiIndex(
                extract_array(mOriginalIndex, partial_original_dims));
        });

        // complete offset
        mOffset =
            accumulate_on_array(mPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
    }

    template <class... Xs>
    __host__ __device__ constexpr MergedTensorCoordinate(Xs... xs)
        : MergedTensorCoordinate(Array<index_t, nDim>{xs...})
    {
    }

    __host__ __device__ constexpr index_t GetOffset() const { return mOffset; }

    template <class IDim, class T, bool PositiveDirection>
    __host__ __device__ void
    MoveOnDimension(IDim idim_, T step_size, integral_constant<bool, PositiveDirection>)
    {
        constexpr auto idim = idim_;

        // if step_size is known at compile time
        static_if<is_static<T>::value>{}(
            [&](auto) { static_if<T{} == 0>{}([&](auto) { return; }); });

        // update original index
        static_if<tensor_desc_type::ContainMultipleOriginalDimensions(idim)>{}([&](auto) {
            constexpr auto partial_original_dims =
                tensor_desc_type::GetContainedOriginalDimensions(idim);

            constexpr index_t ndim_partial_original = partial_original_dims.GetSize();

            constexpr auto partial_original_desc =
                tensor_desc_type::GetOriginalTensorDescriptor().Extract(partial_original_dims);

            const auto partial_original_step_sizes =
                partial_original_desc.GetMultiIndexFrom1dIndex(step_size);

            // update partial original multi-id
            auto partial_original_id = extract_array(mOriginalIndex, partial_original_dims);

            static_if<PositiveDirection>{}([&](auto) {
                partial_original_id += partial_original_step_sizes;

                bool carry = false;

                // do carry check in reversed order, starting from lowest dimension
                // don't check the highest dimension
                static_for<0, ndim_partial_original - 1, 1>{}([&](auto IReverse) {
                    constexpr index_t i = ndim_partial_original - 1 - IReverse;

                    if(carry)
                    {
                        ++partial_original_id(i);
                    }

                    carry = false;

                    if(partial_original_id[i] >= partial_original_desc.GetLength(i))
                    {
                        partial_original_id(i) -= partial_original_desc.GetLength(i);
                        carry = true;
                    }
                });

                // highest dimension
                if(carry)
                {
                    ++partial_original_id(0);
                }
            }).Else([&](auto) {
                // shift up multi-id to avoid unsigned integer underflow during intermediate
                // calculations. After the shift, should have new_multi_id[...] >= 1
                partial_original_id +=
                    partial_original_desc.GetLengths() - partial_original_step_sizes;

                bool borrow = false;

                // do borrow check in reversed order, starting from lowest dimension
                // don't check the highest dimension
                static_for<0, ndim_partial_original - 1, 1>{}([&](auto IReverse) {
                    constexpr index_t i = ndim_partial_original - 1 - IReverse;

                    if(borrow)
                    {
                        --partial_original_id(i);
                    }

                    borrow = false;

                    if(partial_original_id[i] < partial_original_desc.GetLength(i))
                    {
                        partial_original_id(i) += partial_original_desc.GetLength(i);
                        borrow = true;
                    }
                });

                // highest dimension
                if(borrow)
                {
                    --partial_original_id(0);
                }

                // shift back down multi-id
                // here, should have new_multi_id[...] >= GetLengths()
                partial_original_id = partial_original_id - partial_original_desc.GetLengths();
            });

            // update "mOriginalIndex"
            static_for<0, ndim_partial_original, 1>{}([&](auto I) {
                constexpr auto idim_original = partial_original_dims[I];

                mOriginalIndex(idim_original) = partial_original_id[I];
            });

            // calculate new partial offset on this merged dimension
            const index_t old_partial_offset = mPartialOffsets[idim];

            mPartialOffsets(idim) =
                partial_original_desc.GetOffsetFromMultiIndex(partial_original_id);

            // update "mThreadSrcOffset", do "+" before "-" to avoid underflow
            mOffset = (mOffset + mPartialOffsets[idim]) - old_partial_offset;
        }).Else([&](auto fwd) {
            static_if<PositiveDirection>{}([&](auto) {
                mOffset += step_size * fwd(tensor_desc_type{}).GetStride(idim);
            }).Else([&](auto) { mOffset -= step_size * fwd(tensor_desc_type{}).GetStride(idim); });
        });
    }

    // T is Array or Sequence
    template <class T>
    __host__ __device__ type operator+=(T step_sizes)
    {
        static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");

        static_for<0, nDim, 1>{}([&](auto idim) {
            // compiler should remove dead code path, because step_sizes is known at
            // compile time
            if(step_sizes[idim] != 0)
            {
                this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, true>{});
            }
        });

        return *this;
    }

    template <class T>
    __host__ __device__ type operator-=(T step_sizes)
    {
        static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");

        static_for<0, nDim, 1>{}([&](auto idim) {
            // compiler should remove dead code path, because step_sizes is known at
            // compile time
            if(step_sizes[idim] != 0)
            {
                this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, false>{});
            }
        });

        return *this;
    }

    template <class T>
    __host__ __device__ constexpr type operator+(T step_sizes) const
    {
        type coord = *this;
        coord += step_sizes;
        return coord;
    }

    template <class T>
    __host__ __device__ constexpr type operator-(T step_sizes) const
    {
        type coord = *this;
        coord -= step_sizes;
        return coord;
    }

    __host__ __device__ static constexpr index_t RepositionOrigin() { return 0; }

    private:
    // Allocate register memory for all merged dimensions and normal dimensions.
    // However, only those merged dimensions, whose index will be involved in arithmetic
    // after the construction of this TensorCoordinate (e.g. when user move a slicing
    // window on the merged dimension), will use these register memory.
    // Let's hope compiler will optimize away those register memory allocated for normal
    // dimensions, and those merged dimensions, that would never be involved in index
    // arithmetic after construction of TensorCoordinate.
    // TODO: refactor TensorCoordinate, after introducing the concept of "dimensions"
    // and simplify implementation of ConstantMergedTensorDescriptor, so we don't need to
    // count on compiler to optimize away those register memory for us
    Array<index_t, nOriginalDim> mOriginalIndex;
    Array<index_t, nDim> mPartialOffsets;

    // complete offset
    index_t mOffset;
};

template <class TensorDesc>
struct TensorCoordinate_deprecated
{
    private:
    template <class... Ts>
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
    {
        return NormalTensorCoordinate_deprecated<ConstantTensorDescriptor<Ts...>>();
    }

    template <class... Ts>
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
    {
        return MergedTensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>();
    }

    public:
    using type = decltype(MakeDummyTensorCoordinate(TensorDesc{}));
};

} // namespace ck
#endif
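Note: MergedTensorCoordinate::MoveOnDimension above updates the original multi-index behind a merged dimension by adding the decomposed step and then rippling a carry (or, in the negative direction, a borrow) from the lowest dimension upward, exactly like addition in a mixed-radix number system. The standalone function below is not part of composable_kernel; it simply mirrors the positive-direction carry pass under assumed lengths so the update can be checked by hand.

```cpp
#include <array>
#include <cstdio>

// Mixed-radix increment: add `step` (already decomposed per dimension) to `id`,
// propagating carries from the lowest dimension upward, as in the
// PositiveDirection branch of MergedTensorCoordinate::MoveOnDimension.
template <std::size_t N>
std::array<int, N> add_with_carry(std::array<int, N> id,
                                  const std::array<int, N>& step,
                                  const std::array<int, N>& lengths)
{
    for(std::size_t i = 0; i < N; ++i) id[i] += step[i];

    bool carry = false;
    for(std::size_t r = 0; r + 1 < N; ++r) // lowest dimensions first, skip dimension 0
    {
        const std::size_t i = N - 1 - r;
        if(carry) ++id[i];
        carry = false;
        if(id[i] >= lengths[i])
        {
            id[i] -= lengths[i];
            carry = true;
        }
    }
    if(carry) ++id[0]; // highest dimension
    return id;
}

int main()
{
    // A merged dimension covering original lengths {4, 3, 5}; move {1, 2, 3} by {0, 2, 4}.
    auto id = add_with_carry<3>({1, 2, 3}, {0, 2, 4}, {4, 3, 5});
    std::printf("%d %d %d\n", id[0], id[1], id[2]); // prints 2 2 2 (linear 28 + 14 = 42)
    return 0;
}
```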
composable_kernel/include/tensor_description/tensor_coordinate_helper.hpp

@@ -9,7 +9,7 @@ template <typename TensorDesc>
 __host__ __device__ constexpr auto
 make_tensor_coordinate_v2(TensorDesc, MultiIndex<TensorDesc::GetNumOfDimension()> idx)
 {
-    return typename TensorCoordinate_v2<TensorDesc>::type(idx);
+    return typename TensorCoordinate<TensorDesc>::type(idx);
 }
 
 } // namespace ck
composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp (deleted, 100644 → 0; previous content at 51a9fa1d)

#ifndef CK_TENSOR_COORDINATE_V2_HPP
#define CK_TENSOR_COORDINATE_V2_HPP

#include "common_header.hpp"
#include "dimension.hpp"
#include "multi_index_transform.hpp"
#include "tensor_descriptor.hpp"

namespace ck {

template <typename TensorDesc>
struct TensorCoordinate_v2;

template <typename NativeTensorDesc>
struct NativeTensorCoordinate
{
    using type             = NativeTensorCoordinate;
    using tensor_desc_type = NativeTensorDesc;

    static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();

    using Index = MultiIndex<nDim>;

    __host__ __device__ constexpr NativeTensorCoordinate(Index idx)
        : mIndex(idx), mOffset(tensor_desc_type::CalculateOffset(idx))
    {
    }

    template <typename... Xs>
    __host__ __device__ constexpr NativeTensorCoordinate(Xs... xs)
        : NativeTensorCoordinate(Index{xs...})
    {
    }

    template <index_t... Xs>
    __host__ __device__ constexpr NativeTensorCoordinate(Sequence<Xs...>)
        : NativeTensorCoordinate(Index{Xs...})
    {
    }

    __host__ __device__ static constexpr auto GetTensorDescriptor() { return tensor_desc_type{}; }

    __host__ __device__ constexpr const Index& GetIndex() const { return mIndex; }

    __host__ __device__ constexpr const index_t& GetOffset() const { return mOffset; }

    __host__ __device__ constexpr type operator+=(const Index& idx_diff)
    {
        // mIndex is updated here, but some (or all) of its entries may never be used
        // compiler should remove those entries as dead code
        mIndex += idx_diff;

        mOffset += tensor_desc_type::CalculateOffsetDiff(idx_diff);

        return *this;
    }

    __host__ __device__ constexpr type operator-=(const Index& idx_diff)
    {
        // mIndex is updated here, but some (or all) of its entries may never be used
        // compiler should remove those entries as dead code
        mIndex -= idx_diff;

        mOffset -= tensor_desc_type::CalculateOffsetDiff(idx_diff);

        return *this;
    }

    __host__ __device__ constexpr type operator+(const Index& idx_diff) const
    {
        type coord = *this;
        coord += idx_diff;
        return coord;
    }

    __host__ __device__ constexpr type operator-(const Index& idx_diff) const
    {
        type coord = *this;
        coord -= idx_diff;
        return coord;
    }

    __host__ __device__ static constexpr bool IsUpperIndexMappedToValidOffset() { return true; }

    private:
    // mIndex may be saved and updated, however, the value of some (or all) of its entries may
    // never be used. Compiler should be able to remove these entries as well as its calculation
    // as dead code.
    // TODO: make sure compiler indeed remove these dead code
    Index mIndex;
    index_t mOffset;
};

template <typename TransformedTensorDesc>
struct TransformedTensorCoordinate
{
    using tensor_desc_type = TransformedTensorDesc;

    using LowerCoord = typename TensorCoordinate_v2<decltype(
        tensor_desc_type::GetLowerTensorDescriptor())>::type;

    using UpperCoord = TransformedTensorCoordinate;

    static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();

    using UpperIndex = MultiIndex<nDim>;

    __host__ __device__ constexpr TransformedTensorCoordinate(UpperIndex idx)
        : mIndexUp{idx}, mCoordLow{tensor_desc_type::CalculateLowerIndex(idx)}
    {
    }

    template <typename... Xs>
    __host__ __device__ constexpr TransformedTensorCoordinate(Xs... xs)
        : TransformedTensorCoordinate(UpperIndex{xs...})
    {
    }

    template <index_t... Xs>
    __host__ __device__ constexpr TransformedTensorCoordinate(Sequence<Xs...>)
        : TransformedTensorCoordinate(UpperIndex{Xs...})
    {
    }

    __host__ __device__ static constexpr auto GetTensorDescriptor() { return tensor_desc_type{}; }

    __host__ __device__ constexpr const LowerCoord& GetLowerCoordinate() const
    {
        return mCoordLow;
    }

    __host__ __device__ constexpr const UpperIndex& GetUpperIndex() const { return mIndexUp; }

    __host__ __device__ constexpr const UpperIndex& GetIndex() const { return GetUpperIndex(); }

    __host__ __device__ constexpr const index_t& GetOffset() const
    {
        return GetLowerCoordinate().GetOffset();
    }

    __host__ __device__ constexpr UpperCoord operator+=(const UpperIndex& idx_up_diff)
    {
        // For transformation of multi-index difference, not all transformation functions need to
        // know the old lower-index or the old upper-index. We pass both of them to the
        // transformation function. The transformation function itself decides to use them or not.
        mCoordLow += tensor_desc_type::CalculateLowerIndexDiff(
            idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());

        // mIndexUp is updated here, but some (or all) of its entries may never be used
        // compiler should remove those entries as dead code
        mIndexUp += idx_up_diff;

        return *this;
    }

    __host__ __device__ constexpr UpperCoord operator-=(const UpperIndex& idx_up_diff)
    {
        mCoordLow -= tensor_desc_type::CalculateLowerIndexDiff(
            idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());

        // mIndex is updated here, but some (or all) of its entries may never be used
        // compiler should remove those entries as dead code
        mIndexUp -= idx_up_diff;

        return *this;
    }

    __host__ __device__ constexpr UpperCoord operator+(const UpperIndex& idx_up_diff) const
    {
        UpperCoord coord_up = *this;
        coord_up += idx_up_diff;
        return coord_up;
    }

    __host__ __device__ constexpr UpperCoord operator-(const UpperIndex& idx_up_diff) const
    {
        UpperCoord coord_up = *this;
        coord_up -= idx_up_diff;
        return coord_up;
    }

    // this function should be inexpensive, because there is no upper-to-lower index transformation
    __host__ __device__ constexpr bool IsUpperIndexMappedToValidOffset() const
    {
        return tensor_desc_type::IsUpperIndexMappedToValidLowerIndex(GetIndex()) &&
               mCoordLow.IsUpperIndexMappedToValidOffset();
    }

    private:
    // mIndexUp may be calculated and updated, however, the value of some (or all) of its entries
    // may never be used. Compiler should be able to remove these entries as well as its
    // calculation as dead code.
    // TODO: make sure compiler indeed remove these dead code
    UpperIndex mIndexUp;
    LowerCoord mCoordLow;
};

template <typename TensorDesc>
struct TensorCoordinate_v2
{
    private:
    template <typename... Ts>
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
    {
        return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
            make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
    }

    template <typename... Ts>
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
    {
        return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
            make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
    }

    public:
    using type = decltype(MakeDummyTensorCoordinate(TensorDesc{}));
};

} // namespace ck
#endif
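Note: TransformedTensorCoordinate keeps the upper index plus a lower coordinate and pushes index differences down through CalculateLowerIndexDiff; the actual offset lives in whatever native coordinate sits at the bottom of the transform chain (GetOffset simply forwards to GetLowerCoordinate().GetOffset()). The sketch below shows that layering for a single assumed merge transform, one upper dimension of length 24 unfolding onto lower lengths {4, 6}; both classes here are invented for illustration and are not the library's types.

```cpp
#include <array>
#include <cstdio>

// Lower coordinate: 2-D row-major with 6 columns (stand-in for a native coordinate).
struct LowerCoord
{
    std::array<int, 2> idx{};
    int offset = 0;

    explicit LowerCoord(std::array<int, 2> i) : idx(i), offset(i[0] * 6 + i[1]) {}

    LowerCoord& operator+=(std::array<int, 2> diff)
    {
        idx[0] += diff[0];
        idx[1] += diff[1];
        offset += diff[0] * 6 + diff[1];
        return *this;
    }
};

// Upper coordinate for an assumed merge transform: it stores the upper index and a lower
// coordinate, and converts index *differences* rather than recomputing from scratch.
struct MergedCoord1D
{
    int idx_up;
    LowerCoord low;

    explicit MergedCoord1D(int i) : idx_up(i), low({i / 6, i % 6}) {}

    MergedCoord1D& operator+=(int diff_up)
    {
        const int new_up = idx_up + diff_up;
        // Analogue of CalculateLowerIndexDiff: new lower index minus old lower index.
        std::array<int, 2> low_diff{new_up / 6 - low.idx[0], new_up % 6 - low.idx[1]};
        low += low_diff;
        idx_up = new_up;
        return *this;
    }

    int GetOffset() const { return low.offset; } // offset lives in the lowest coordinate
};

int main()
{
    MergedCoord1D coord(7); // lower index {1, 1}, offset 7
    coord += 10;            // upper 17 -> lower {2, 5}, offset 17
    std::printf("upper %d, lower (%d,%d), offset %d\n",
                coord.idx_up, coord.low.idx[0], coord.low.idx[1], coord.GetOffset());
    return 0;
}
```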
composable_kernel/include/tensor_description/tensor_view.hpp

@@ -4,7 +4,7 @@
 #include "common_header.hpp"
 #include "ConstantTensorDescriptor.hpp"
 #include "ConstantMergedTensorDescriptor.hpp"
-#include "tensor_coordinate.hpp"
+#include "tensor_coordinate_deprecated.hpp"
 
 namespace ck {
 
@@ -14,7 +14,7 @@ struct NormalTensorView
 {
     using type             = NormalTensorView;
     using tensor_desc_type = TensorDesc;
-    using coordinate_type  = typename NormalTensorCoordinate<TensorDesc>::type;
+    using coordinate_type  = typename NormalTensorCoordinate_deprecated<TensorDesc>::type;
     using data_type        = TData;
 
     static constexpr auto nDim = TensorDesc::GetNumOfDimension();
composable_kernel/include/tensor_description/tensor_visit.hpp

@@ -5,7 +5,7 @@
 #include "dimension.hpp"
 #include "dimension_transform.hpp"
 #include "tensor_descriptor.hpp"
-#include "tensor_coordinate_v2.hpp"
+#include "tensor_coordinate.hpp"
 
 namespace ck {
 
@@ -13,7 +13,7 @@ template <class TensorDescriptor>
 struct TensorVisit
 {
     using Index = typename TensorDescriptor::Index;
-    using Coordinate = typename TensorCoordinate_v2<TensorDescriptor>::type;
+    using Coordinate = typename TensorCoordinate<TensorDescriptor>::type;
 
     __host__ __device__ static void Run_v1(Index idx_begin)
     {
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp (large diff, collapsed)
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp (new file, 0 → 100644; large diff, collapsed)
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp (large diff, collapsed)
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp (new file, 0 → 100644; large diff, collapsed)
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp

@@ -3,7 +3,7 @@
 #include "device.hpp"
 #include "tensor.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
-#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
+// #include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
 #include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp"
 
 template <class T,
driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp

@@ -3,7 +3,7 @@
 #include "device.hpp"
 #include "tensor.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
-#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
+// #include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
 #include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp"
 
 using namespace ck;
 
@@ -164,7 +164,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
 
     constexpr auto gridwise_conv =
-#if 1
+#if 0
         GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
 #else
         GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp

@@ -51,7 +51,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(InDesc,
     wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
     out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
 
-#if 0
+#if 1
     constexpr index_t BlockSize = 256;
 
     constexpr index_t BPerBlock = 128;
driver/src/driver.cpp

@@ -14,11 +14,11 @@
 //#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
 //#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
 //#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
-//#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
+#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp"
 //#include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp"
 //#include "device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp"
-//#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
+#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp"
 
 struct GeneratorTensor_1
 
@@ -103,7 +103,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;
-#elif 1
+#elif 0
     // 1x1 filter, 8x8 image
     // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
     constexpr index_t N = 64;
 
@@ -295,7 +295,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;
-#elif 0
+#elif 1
     // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
     // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
     constexpr index_t N = 128;
 
@@ -341,7 +341,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<3, 0>;
     using RightPads = Sequence<3, 0>;
-#elif 1
+#elif 0
     // 1x7 filter, 0x3 pad, 17x17 input
     constexpr index_t N = 128;
     constexpr index_t C = 128;