gaoqiong / composable_kernel

Commit 6fc49f91, authored May 30, 2020 by Chao Liu

    remove deprecated tensor descriptor

Parent: 506a823a
Changes: 26 files; showing 20 changed files with 35 additions and 2377 deletions (+35 / -2377)
composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp                        +0   -13
composable_kernel/include/tensor_description/ConstantMergedTensorDescriptor_deprecated.hpp       +0   -210
composable_kernel/include/tensor_description/ConstantTensorDescriptor_deprecated.hpp             +0   -612
composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp                    +0   -348
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp    +0   -613
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp   +0   -495
composable_kernel/include/utility/math.hpp                                                       +1   -0
driver/CMakeLists.txt                                                                            +1   -3
driver/include/conv_common.hpp                                                                   +0   -48
driver/include/device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp            +1   -1
driver/include/device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw.hpp            +1   -1
driver/include/device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp            +1   -1
driver/include/device_convolution_backward_data_implicit_gemm_v3r1_nchw_kcyx_nkhw.hpp            +1   -1
driver/include/device_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp            +1   -1
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp                          +1   -1
driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp                          +1   -1
driver/include/device_tensor.hpp                                                                 +13  -15
driver/include/host_conv.hpp                                                                     +1   -1
driver/include/host_conv_bwd_data.hpp                                                            +1   -1
driver/include/host_tensor.hpp                                                                   +11  -11
composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp

@@ -2,7 +2,6 @@
 #define CK_CONSTANT_MATRIX_DESCRIPTOR_HPP

 #include "common_header.hpp"
-#include "ConstantTensorDescriptor_deprecated.hpp"
 #include "tensor_descriptor.hpp"

 namespace ck {

@@ -58,18 +57,6 @@ __host__ __device__ constexpr auto
     return ConstantMatrixDescriptor<NRow, NCol, RowStride>{};
 }

-template <typename... Ts>
-__host__ __device__ constexpr auto
-make_ConstantMatrixDescriptor(ConstantTensorDescriptor_deprecated<Ts...>)
-{
-    using TDesc = ConstantTensorDescriptor_deprecated<Ts...>;
-
-    static_assert(TDesc::GetNumOfDimension() == 2, "wrong");
-    static_assert(TDesc::GetStrides()[1] == 1, "wrong");
-
-    return ConstantMatrixDescriptor<TDesc::GetLengths()[0],
-                                    TDesc::GetLengths()[1],
-                                    TDesc::GetStrides()[0]>{};
-}
-
 template <typename... Ts>
 __host__ __device__ constexpr auto make_ConstantMatrixDescriptor(NativeTensorDescriptor<Ts...>)
 {
 ...
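The overload removed above adapted a rank-2 ConstantTensorDescriptor_deprecated (lengths {NRow, NCol}, strides {RowStride, 1}) into a ConstantMatrixDescriptor. The following is a minimal standalone sketch of that lengths/strides-to-matrix mapping using plain std types instead of ck's Sequence/Number machinery; the MatrixDesc and make_matrix_desc names are illustrative, not part of the library.

#include <array>
#include <cassert>
#include <cstddef>

// Illustrative stand-in for ConstantMatrixDescriptor<NRow, NCol, RowStride>.
struct MatrixDesc
{
    std::size_t nrow, ncol, row_stride;
    std::size_t Offset(std::size_t i, std::size_t j) const { return i * row_stride + j; }
};

// Adapt a rank-2 descriptor (lengths, strides) into a matrix descriptor, mirroring the
// static_asserts of the removed overload: rank must be 2 and the inner stride must be 1.
MatrixDesc make_matrix_desc(std::array<std::size_t, 2> lengths, std::array<std::size_t, 2> strides)
{
    assert(strides[1] == 1);
    return MatrixDesc{lengths[0], lengths[1], strides[0]};
}

int main()
{
    // an 8x5 matrix padded to a row stride of 8
    auto desc = make_matrix_desc({8, 5}, {8, 1});
    assert(desc.Offset(2, 3) == 2 * 8 + 3);
}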
composable_kernel/include/tensor_description/ConstantMergedTensorDescriptor_deprecated.hpp    deleted 100644 → 0
#ifndef CK_CONSTANT_MERGED_TENSOR_DESCRIPTOR_DEPRECATED_HPP
#define CK_CONSTANT_MERGED_TENSOR_DESCRIPTOR_DEPRECATED_HPP

#include "common_header.hpp"
#include "ConstantTensorDescriptor_deprecated.hpp"

namespace ck {

// OriginalTensorDesc : ConstantTensorDescriptor_deprecated<...>
//     it's the tensor whose dimensions are to be merged
// OriginalDimMergeSeqs : Sequence<...>...
//     each is a sequence of original dimensions (of OriginalTensorDesc) to be merged
template <class OriginalTensorDesc, class... OriginalDimMergeSeqs>
struct ConstantMergedTensorDescriptor_deprecated
{
    using Type = ConstantMergedTensorDescriptor_deprecated;

    static constexpr auto mOriginalDimMergeSeqs = std::tuple<OriginalDimMergeSeqs...>{};

    static constexpr index_t nDim         = sizeof...(OriginalDimMergeSeqs);
    static constexpr index_t nOriginalDim = OriginalTensorDesc::GetNumOfDimension();

    __host__ __device__ constexpr ConstantMergedTensorDescriptor_deprecated()
    {
        static_assert(nDim <= nOriginalDim, "wrong!");

        // TODO: check each of OriginalDimMergeSeqs contains at least 1, and at most
        //   OriginalTensorDesc::nDim number of dimensions
        // TODO: check OriginalDimMergeSeqs contains all original dimensions
        // TODO: check there is no duplication in OriginalDimMergeSeqs
    }

    __host__ __device__ static constexpr auto GetOriginalTensorDescriptor()
    {
        return OriginalTensorDesc{};
    }

    __host__ __device__ static constexpr auto GetNumOfDimension() { return Number<nDim>{}; }

    template <index_t IDim>
    __host__ __device__ static constexpr auto GetContainedOriginalDimensions(Number<IDim>)
    {
        return std::get<IDim>(mOriginalDimMergeSeqs);
    }

    template <index_t IDim>
    __host__ __device__ static constexpr bool ContainMultipleOriginalDimensions(Number<IDim>)
    {
        return (std::get<IDim>(mOriginalDimMergeSeqs).GetSize() > 1);
    }

    template <index_t IDim>
    __host__ __device__ static constexpr auto GetLength(Number<IDim>)
    {
        constexpr auto original_dims_partial = std::get<IDim>(mOriginalDimMergeSeqs);

        return OriginalTensorDesc::Extract(original_dims_partial).GetElementSize();
    }

    template <index_t IDim>
    __host__ __device__ static constexpr auto GetStride(Number<IDim>)
    {
        static_assert(!ContainMultipleOriginalDimensions(Number<IDim>{}),
                      "wrong! stride of a merged dimension is undefined");

        constexpr auto idim_original = std::get<IDim>(mOriginalDimMergeSeqs).Back();

        return OriginalTensorDesc::GetStride(Number<idim_original>{});
    }

    // this is a hack to return the stride of the last original dimension of a merged dimension
    // TODO: refactor this once the concept of "dimension" is used
    template <index_t IDim>
    __host__ __device__ static constexpr auto GetLastOriginalDimensionStride(Number<IDim>)
    {
        constexpr auto idim_last_original = std::get<IDim>(mOriginalDimMergeSeqs).Back();

        return OriginalTensorDesc::GetStride(Number<idim_last_original>{});
    }

    __host__ __device__ static constexpr auto GetLengths()
    {
        return Sequence<OriginalTensorDesc::Extract(OriginalDimMergeSeqs{}).GetElementSize()...>{};
    }

    __host__ __device__ static constexpr auto GetElementSize()
    {
        return OriginalTensorDesc::GetElementSize();
    }

    template <class OriginalDimsPartial>
    struct lambda_1_GetOriginalMultiIndexFromMultiIndex
    {
        const Array<index_t, OriginalDimsPartial::GetSize()>& original_multi_id_partial;
        Array<index_t, nOriginalDim>& original_multi_id;

        __host__ __device__ constexpr lambda_1_GetOriginalMultiIndexFromMultiIndex(
            const Array<index_t, OriginalDimsPartial::GetSize()>& original_multi_id_partial_,
            Array<index_t, nOriginalDim>& original_multi_id_)
            : original_multi_id_partial(original_multi_id_partial_),
              original_multi_id(original_multi_id_)
        {
        }

        template <index_t I>
        __host__ __device__ constexpr void operator()(Number<I>) const
        {
            constexpr index_t idim_original = OriginalDimsPartial::Get(Number<I>{});

            index_t itmp = original_multi_id_partial[I];

            original_multi_id(idim_original) = itmp;
        }
    };

    struct lambda_0_GetOriginalMultiIndexFromMultiIndex
    {
        const Array<index_t, nDim>& multi_id;
        Array<index_t, nOriginalDim>& original_multi_id;

        __host__ __device__ constexpr lambda_0_GetOriginalMultiIndexFromMultiIndex(
            const Array<index_t, nDim>& multi_id_, Array<index_t, nOriginalDim>& original_multi_id_)
            : multi_id(multi_id_), original_multi_id(original_multi_id_)
        {
        }

        template <index_t IDim>
        __host__ __device__ constexpr void operator()(Number<IDim>) const
        {
            constexpr auto original_dims_partial = std::get<IDim>(Type::mOriginalDimMergeSeqs);

            // get partial original-multi-id corresponding to this merged dimension
            const auto original_multi_id_partial =
                OriginalTensorDesc::Extract(original_dims_partial)
                    .GetMultiIndexFrom1dIndex(multi_id[IDim]);

            static_for<0, original_dims_partial.GetSize(), 1>{}(
                lambda_1_GetOriginalMultiIndexFromMultiIndex<decltype(original_dims_partial)>(
                    original_multi_id_partial, original_multi_id));
        }
    };

    // return type is Array<...>
    __host__ __device__ static constexpr auto
    GetOriginalMultiIndexFromMultiIndex(Array<index_t, nDim> multi_id)
    {
        Array<index_t, nOriginalDim> original_multi_id;

        static_for<0, nDim, 1>{}(
            lambda_0_GetOriginalMultiIndexFromMultiIndex(multi_id, original_multi_id));

        return original_multi_id;
    }

    template <index_t... Is>
    __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(Sequence<Is...>)
    {
        constexpr auto multi_id = sequence2array(Sequence<Is...>{});

        constexpr auto original_multi_id = GetOriginalMultiIndexFromMultiIndex(multi_id);

        return OriginalTensorDesc::GetOffsetFromMultiIndex(original_multi_id);
    }

    __host__ __device__ static constexpr index_t
    GetOffsetFromMultiIndex(Array<index_t, nDim> multi_id)
    {
        auto original_multi_id = GetOriginalMultiIndexFromMultiIndex(multi_id);

        return OriginalTensorDesc::GetOffsetFromMultiIndex(original_multi_id);
    }

    template <class... Is>
    __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(Is... is)
    {
        return GetOffsetFromMultiIndex(Array<index_t, nDim>{is...});
    }

    __host__ __device__ static constexpr Array<index_t, nDim> GetMultiIndexFrom1dIndex(index_t id)
    {
        constexpr auto packed_desc = make_ConstantTensorDescriptor_packed(GetLengths());

        return packed_desc.GetMultiIndexFrom1dIndex(id);
    }

    __host__ __device__ static constexpr auto Pack()
    {
        constexpr auto lengths = GetLengths();
        constexpr auto strides = calculate_tensor_strides_packed(lengths);
        return ConstantTensorDescriptor_deprecated<decltype(lengths), decltype(strides)>{};
    }
};

template <class OriginalTensorDesc, class... OriginalDimMergeSeqs>
__host__ __device__ constexpr auto make_ConstantMergedTensorDescriptor(OriginalTensorDesc,
                                                                       OriginalDimMergeSeqs...)
{
    return ConstantMergedTensorDescriptor_deprecated<OriginalTensorDesc, OriginalDimMergeSeqs...>{};
}

template <class TDesc>
__host__ __device__ void print_ConstantMergedTensorDescriptor(const char* s, TDesc)
{
    print_ConstantTensorDescriptor(s, TDesc::GetOriginalTensorDescriptor());
}

} // namespace ck
#endif
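The descriptor above maps an index on a merged dimension back to the indices of the original dimensions it covers, using packed (row-major) strides over the merged lengths. Below is a minimal runtime sketch of that unpacking with plain std::vector instead of ck's compile-time Sequence/Array types; the unmerge name is illustrative only.

#include <cassert>
#include <cstddef>
#include <vector>

// Unpack a 1-d index on a merged dimension into indices of the original dimensions it
// covers, given those dimensions' lengths in row-major order. This mirrors
// GetMultiIndexFrom1dIndex over a packed descriptor.
std::vector<std::size_t> unmerge(std::size_t id, const std::vector<std::size_t>& lengths)
{
    // packed strides: stride[i] = product of lengths[i+1..]
    std::vector<std::size_t> strides(lengths.size(), 1);
    for(int i = static_cast<int>(lengths.size()) - 2; i >= 0; --i)
        strides[i] = strides[i + 1] * lengths[i + 1];

    std::vector<std::size_t> multi_id(lengths.size());
    for(std::size_t i = 0; i < lengths.size(); ++i)
    {
        multi_id[i] = id / strides[i];
        id -= multi_id[i] * strides[i];
    }
    return multi_id;
}

int main()
{
    // merging original dimensions of lengths {4, 3}: merged index 7 -> {2, 1}
    auto m = unmerge(7, {4, 3});
    assert(m[0] == 2 && m[1] == 1);
}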
composable_kernel/include/tensor_description/ConstantTensorDescriptor_deprecated.hpp    deleted 100644 → 0
#ifndef CK_CONSTANT_TENSOR_DESCRIPTOR_DEPRECATED_HPP
#define CK_CONSTANT_TENSOR_DESCRIPTOR_DEPRECATED_HPP

#include "common_header.hpp"

namespace ck {

template <class Lengths>
__host__ __device__ constexpr auto calculate_tensor_strides_packed_deprecated(Lengths)
{
    return reverse_inclusive_scan_sequence(
               Lengths{}.PopFront(), math::multiplies<index_t>{}, Number<1>{})
        .PushBack(Number<1>{});
}

template <class Lengths, index_t Align>
__host__ __device__ constexpr auto calculate_tensor_strides_aligned_old(Lengths, Number<Align>)
{
    constexpr index_t L_back_align =
        Align * math::integer_divide_ceiler<index_t>{}(Lengths{}.Back(), Align);

    return calculate_tensor_strides_packed_deprecated(
        Lengths{}.Modify(Number<Lengths{}.GetSize() - 1>{}, Number<L_back_align>{}));
}

template <class Lengths, class Strides>
struct ConstantTensorDescriptor_deprecated
{
    using Type = ConstantTensorDescriptor_deprecated;

    static constexpr index_t nDim = Lengths::GetSize();

    __host__ __device__ constexpr ConstantTensorDescriptor_deprecated()
    {
        static_assert(Lengths::GetSize() == Strides::GetSize(), "nDim not consistent");
    }

    __host__ __device__ static constexpr auto GetOriginalTensorDescriptor() { return Type{}; }

    template <index_t IDim>
    __host__ __device__ static constexpr auto GetContainedOriginalDimensions(Number<IDim>)
    {
        return Sequence<IDim>{};
    }

    __host__ __device__ static constexpr auto GetNumOfDimension() { return Number<nDim>{}; }

    __host__ __device__ static constexpr auto GetLengths() { return Lengths{}; }

    __host__ __device__ static constexpr auto GetStrides() { return Strides{}; }

    __host__ __device__ static constexpr auto GetLength(index_t IDim) { return Lengths{}[IDim]; }

    __host__ __device__ static constexpr auto GetStride(index_t IDim) { return Strides{}[IDim]; }

    struct lambda_AreDimensionsContinuous
    {
        bool& is_continuous;

        __host__ __device__ constexpr lambda_AreDimensionsContinuous(bool& is_continuous_)
            : is_continuous(is_continuous_)
        {
        }

        template <index_t IDim_>
        __host__ __device__ constexpr void operator()(Number<IDim_>) const
        {
            constexpr auto IDim    = Number<IDim_>{};
            constexpr auto IDim_p1 = Number<IDim_ + 1>{};

            is_continuous =
                is_continuous && (GetStride(IDim) >= GetStride(IDim_p1) &&
                                  GetStride(IDim) == GetStride(IDim_p1) * GetLength(IDim_p1));
        }
    };

    __host__ __device__ static constexpr bool AreDimensionsContinuous()
    {
        bool is_continuous = true;

        static_for<0, nDim - 1, 1>{}(lambda_AreDimensionsContinuous(is_continuous));

        return is_continuous;
    }

    __host__ __device__ static constexpr bool IsPackedTensor()
    {
        return AreDimensionsContinuous() && GetStride(Number<nDim - 1>{}) == 1;
    }

    template <class T>
    __host__ __device__ static constexpr bool ContainMultipleOriginalDimensions(T)
    {
        return false;
    }

    __host__ __device__ static constexpr auto GetElementSize()
    {
        return Number<reduce_on_sequence(Lengths{}, math::multiplies<index_t>{}, Number<1>{})>{};
    }

    __host__ __device__ static constexpr auto GetElementSpace()
    {
        constexpr index_t element_space_unaligned = reduce_on_sequence(
            (GetLengths() - Number<1>{}) * GetStrides(), math::plus<index_t>{}, Number<1>{});

        return Number<element_space_unaligned>{};
    }

    // emulate constexpr lambda
    template <index_t NSize>
    struct lambda_GetOffsetFromMultiIndex
    {
        Array<index_t, NSize>& multi_id;
        index_t& offset;

        __host__ __device__ constexpr lambda_GetOffsetFromMultiIndex(
            Array<index_t, NSize>& multi_id_, index_t& offset_)
            : multi_id(multi_id_), offset(offset_)
        {
        }

        template <class X>
        __host__ __device__ constexpr void operator()(X IDim) const
        {
            offset += multi_id[IDim] * Type::GetStride(IDim);
        }
    };

    template <index_t NSize>
    __host__ __device__ static constexpr index_t
    GetOffsetFromMultiIndex(Array<index_t, NSize> multi_id)
    {
        static_assert(NSize == nDim, "wrong! Dimension not consistent");

        index_t offset = 0;

        static_for<0, nDim, 1>{}(lambda_GetOffsetFromMultiIndex<NSize>(multi_id, offset));

        return offset;
    }

    template <class... Is>
    __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(Is... is)
    {
        return GetOffsetFromMultiIndex(Array<index_t, sizeof...(Is)>{is...});
    }

    template <index_t... Is>
    __host__ __device__ static constexpr auto GetOffsetFromMultiIndex(Sequence<Is...>)
    {
        static_assert(sizeof...(Is) == nDim, "wrong! Dimension not consistent");

        constexpr auto multi_id = Sequence<Is...>{};

        return Number<reduce_on_sequence(
            multi_id * GetStrides(), math::plus<index_t>{}, Number<0>{})>{};
    }

    // emulate constexpr lambda
    template <class PackedStrides>
    struct lambda_GetMultiIndexFrom1dIndex
    {
        index_t& id;
        Array<index_t, nDim>& multi_id;

        __host__ __device__ constexpr lambda_GetMultiIndexFrom1dIndex(
            index_t& id_, Array<index_t, nDim>& multi_id_)
            : id(id_), multi_id(multi_id_)
        {
        }

        template <class IDim_>
        __host__ __device__ constexpr void operator()(IDim_) const
        {
            constexpr auto IDim      = IDim_{};
            constexpr index_t stride = PackedStrides::Get(IDim);
            multi_id(IDim)           = id / stride;
            id -= multi_id[IDim] * stride;
        }
    };

    __host__ __device__ static constexpr Array<index_t, nDim> GetMultiIndexFrom1dIndex(index_t id)
    {
        Array<index_t, nDim> multi_id;

        using PackedStrides = decltype(calculate_tensor_strides_packed_deprecated(GetLengths()));

        // calculate index in each of the dimensions in the order of their dimension
        static_for<0, nDim - 1, 1>{}(lambda_GetMultiIndexFrom1dIndex<PackedStrides>(id, multi_id));

        multi_id(Number<nDim - 1>{}) = id / PackedStrides::Get(Number<nDim - 1>{});

        return multi_id;
    }

    __host__ __device__ static constexpr auto
    GetOriginalMultiIndexFromMultiIndex(Array<index_t, nDim> multi_id)
    {
        return multi_id;
    }

    // This function doesn't do a carry check on the highest dimension for positive stepping (or
    // a borrow check on the highest dimension for negative stepping), for performance reasons.
    // It is the user's responsibility to make sure the result "new_multi_id" is not out-of-bound
    // on the highest dimension for positive stepping (or on the lowest dimension for negative
    // stepping)
    template <bool PositiveDirection>
    __host__ __device__ static Array<index_t, nDim>
    UpdateMultiIndexGivenStepSizeOf1dIndex(Array<index_t, nDim> old_multi_id,
                                           index_t step_size_of_1d_index,
                                           integral_constant<bool, PositiveDirection>)
    {
        Array<index_t, nDim> new_multi_id;

        const auto step_sizes = GetMultiIndexFrom1dIndex(step_size_of_1d_index);

        static_if<PositiveDirection>{}([&](auto) {
            new_multi_id = old_multi_id + step_sizes;

            bool carry = false;

            // do carry check in reversed order, starting from lowest dimension
            // don't check the highest dimension
            static_for<0, nDim, 1>{}([&](auto IDimReverse) {
                constexpr index_t idim = nDim - 1 - IDimReverse;
                constexpr auto IDim    = Number<idim>{};

                if(carry)
                {
                    ++new_multi_id(idim);
                }

                carry = false;

                if(new_multi_id[idim] >= GetLength(IDim))
                {
                    new_multi_id(idim) -= GetLength(IDim);
                    carry = true;
                }
            });
        }).Else([&](auto) {
            // shift up multi-id to avoid unsigned integer underflow during intermediate
            // calculations. After the shift, should have new_multi_id[...] >= 1
            new_multi_id = old_multi_id + (GetLengths() - step_sizes);

            bool borrow = false;

            // do borrow check in reversed order, starting from lowest dimension
            // don't check the highest dimension
            static_for<0, nDim, 1>{}([&](auto IDimReverse) {
                constexpr index_t idim = nDim - 1 - IDimReverse;
                constexpr auto IDim    = Number<idim>{};

                if(borrow)
                {
                    --new_multi_id(idim);
                }

                borrow = false;

                if(new_multi_id[idim] < GetLength(IDim))
                {
                    new_multi_id(idim) += GetLength(IDim);
                    borrow = true;
                }
            });

            // shift back down multi-id
            // here, should have new_multi_id[...] >= GetLengths()
            new_multi_id = new_multi_id - GetLengths();
        });

        return new_multi_id;
    }

    template <index_t... IDims>
    __host__ __device__ static constexpr auto Extract(Number<IDims>... extract_dims)
    {
        static_assert(sizeof...(IDims) <= GetNumOfDimension(),
                      "wrong! too many number of dimensions to be extracted");

        using extract_lengths = decltype(Lengths::Extract(extract_dims...));
        using extract_strides = decltype(Strides::Extract(extract_dims...));

        return ConstantTensorDescriptor_deprecated<extract_lengths, extract_strides>{};
    }

    template <index_t... IDims>
    __host__ __device__ static constexpr auto Extract(Sequence<IDims...>)
    {
        return Extract(Number<IDims>{}...);
    }

    template <class... Ts>
    __host__ __device__ static constexpr auto Embed(ConstantTensorDescriptor_deprecated<Ts...>)
    {
        using leaf_tensor = ConstantTensorDescriptor_deprecated<Ts...>;

        return ConstantTensorDescriptor_deprecated<
            decltype(GetLengths().PushBack(leaf_tensor::GetLengths())),
            decltype(GetStrides().PushBack(leaf_tensor::GetStrides()))>{};
    }

    template <index_t IDimVector, index_t DataPerVector>
    struct lambda_IsVectorizationAllowed
    {
        bool& is_allowed;

        __host__ __device__ constexpr lambda_IsVectorizationAllowed(bool& is_allowed_)
            : is_allowed(is_allowed_)
        {
        }

        template <index_t IDim_>
        __host__ __device__ constexpr void operator()(Number<IDim_>) const
        {
            constexpr auto IDim = Number<IDim_>{};

            if(IDimVector != IDim && Strides::Get(IDim) % DataPerVector != 0)
            {
                is_allowed = false;
            }
        }
    };

    template <index_t IDimVector, index_t DataPerVector>
    __host__ __device__ static constexpr bool IsVectorizationAllowed(Number<IDimVector>,
                                                                     Number<DataPerVector>)
    {
        bool is_allowed = (Strides{}[IDimVector] == 1 || DataPerVector == 1) &&
                          Lengths{}[IDimVector] % DataPerVector == 0;

        static_for<0, nDim, 1>{}(
            lambda_IsVectorizationAllowed<IDimVector, DataPerVector>{is_allowed});

        return is_allowed;
    }

    template <index_t IDim, index_t DataPerVector>
    __host__ __device__ static constexpr auto Vectorize(Number<IDim>, Number<DataPerVector>)
    {
        constexpr auto idim            = Number<IDim>{};
        constexpr auto data_per_vector = Number<DataPerVector>{};

        static_assert(IsVectorizationAllowed(idim, data_per_vector), "wrong!");

        using vectorized_lengths =
            decltype(Lengths::Modify(Number<IDim>{}, Number<Lengths{}[IDim] / DataPerVector>{}));
        using vectorized_strides =
            decltype((Strides{} / Number<DataPerVector>{}).Modify(Number<IDim>{}, Number<1>{}));

        return ConstantTensorDescriptor_deprecated<vectorized_lengths, vectorized_strides>{};
    }

    template <index_t IDim, index_t SliceLen>
    __host__ __device__ static constexpr auto Slice(Number<IDim>, Number<SliceLen>)
    {
        using slice_lengths = decltype(Lengths::Modify(Number<IDim>{}, Number<SliceLen>{}));

        return ConstantTensorDescriptor_deprecated<slice_lengths, Strides>{};
    }

    template <index_t... Is>
    __host__ __device__ static constexpr auto Slice(Sequence<Is...> slice_lengths)
    {
        static_assert(slice_lengths.GetSize() == nDim, "wrong!");

        return ConstantTensorDescriptor_deprecated<decltype(slice_lengths), Strides>{};
    }

    template <index_t IDim, index_t SliceLength, index_t SliceStride>
    __host__ __device__ static constexpr auto
    StridedSlice(Number<IDim>, Number<SliceLength>, Number<SliceStride>)
    {
        constexpr index_t new_stride = Strides::Get(Number<IDim>{}) * SliceStride;

        using new_lengths = decltype(Lengths::Modify(Number<IDim>{}, Number<SliceLength>{}));
        using new_strides = decltype(Strides::Modify(Number<IDim>{}, Number<new_stride>{}));

        return ConstantTensorDescriptor_deprecated<new_lengths, new_strides>{};
    }

    template <index_t IDim, index_t... FoldIntervals>
    __host__ __device__ static constexpr auto Fold(Number<IDim>, Number<FoldIntervals>...)
    {
        constexpr auto fold_intervals = Sequence<FoldIntervals...>{};

        constexpr index_t fold_intervals_product =
            reduce_on_sequence(fold_intervals, math::multiplies<index_t>{}, Number<1>{});

        constexpr auto unfold_length = GetLength(Number<IDim>{});
        constexpr auto unfold_stride = GetStride(Number<IDim>{});

        // length of the dimension to be folded needs to be divisible by fold_intervals_product,
        // otherwise folding is invalid
        static_assert(unfold_length % fold_intervals_product == 0,
                      "wrong! length on the dimension to be folded cannot be evenly divided!");

        // folded lengths
        constexpr auto fold_lengths =
            Sequence<unfold_length / fold_intervals_product>{}.PushBack(fold_intervals);

        // folded strides
        constexpr auto fold_strides =
            Number<unfold_stride>{} *
            reverse_inclusive_scan_sequence(
                fold_intervals.PushBack(Number<1>{}), math::multiplies<index_t>{}, Number<1>{});

        // left and right
        constexpr auto left = typename arithmetic_sequence_gen<0, IDim, 1>::type{};
        constexpr auto right =
            typename arithmetic_sequence_gen<IDim + 1, GetNumOfDimension(), 1>::type{};

        constexpr auto new_lengths = GetLengths().Extract(left).PushBack(fold_lengths).PushBack(
            GetLengths().Extract(right));
        constexpr auto new_strides = GetStrides().Extract(left).PushBack(fold_strides).PushBack(
            GetStrides().Extract(right));

        return ConstantTensorDescriptor_deprecated<decltype(new_lengths), decltype(new_strides)>{};
    }

    template <index_t IDim, index_t... FoldIntervals>
    __host__ __device__ static constexpr auto Fold(Number<IDim>, Sequence<FoldIntervals...>)
    {
        return Fold(Number<IDim>{}, Number<FoldIntervals>{}...);
    }

    // this function unfolds dimensions [FirstUnfoldDim, ..., LastUnfoldDim] into 1 dimension
    template <index_t FirstUnfoldDim, index_t LastUnfoldDim>
    __host__ __device__ static constexpr auto Unfold(Number<FirstUnfoldDim>, Number<LastUnfoldDim>)
    {
        static_assert(FirstUnfoldDim >= 0 && LastUnfoldDim < nDim &&
                          FirstUnfoldDim <= LastUnfoldDim,
                      "wrong! should have FirstUnfoldDim <= LastUnfoldDim!");

        // left and right
        constexpr auto left = typename arithmetic_sequence_gen<0, FirstUnfoldDim, 1>::type{};
        constexpr auto middle =
            typename arithmetic_sequence_gen<FirstUnfoldDim, LastUnfoldDim + 1, 1>::type{};
        constexpr auto right =
            typename arithmetic_sequence_gen<LastUnfoldDim + 1, GetNumOfDimension(), 1>::type{};

        // dimensions to be unfolded need to be continuous
        static_assert(Type::Extract(middle).AreDimensionsContinuous(), "wrong! not unfoldable");

        // unfolded length, stride
        constexpr index_t unfold_length = reduce_on_sequence(
            GetLengths().Extract(middle), math::multiplies<index_t>{}, Number<1>{});

        constexpr index_t unfold_stride = GetStride(Number<LastUnfoldDim>{});

        // new lengths, strides
        constexpr auto new_lengths = GetLengths()
                                         .Extract(left)
                                         .PushBack(Number<unfold_length>{})
                                         .PushBack(GetLengths().Extract(right));

        constexpr auto new_strides = GetStrides()
                                         .Extract(left)
                                         .PushBack(Number<unfold_stride>{})
                                         .PushBack(GetStrides().Extract(right));

        return ConstantTensorDescriptor_deprecated<decltype(new_lengths), decltype(new_strides)>{};
    }

    __host__ __device__ static constexpr auto Pack()
    {
        using packed_strides = decltype(calculate_tensor_strides_packed_deprecated(Lengths{}));
        return ConstantTensorDescriptor_deprecated<Lengths, packed_strides>{};
    }

    template <class MapNew2Old>
    __host__ __device__ static constexpr auto ReorderGivenNew2Old(MapNew2Old)
    {
        return ConstantTensorDescriptor_deprecated<
            decltype(Lengths::ReorderGivenNew2Old(MapNew2Old{})),
            decltype(Strides::ReorderGivenNew2Old(MapNew2Old{}))>{};
    }

    template <class MapOld2New>
    __host__ __device__ static constexpr auto ReorderGivenOld2New(MapOld2New)
    {
        return ConstantTensorDescriptor_deprecated<
            decltype(Lengths::ReorderGivenOld2New(MapOld2New{})),
            decltype(Strides::ReorderGivenOld2New(MapOld2New{}))>{};
    }
};

template <class Lengths>
__host__ __device__ constexpr auto make_ConstantTensorDescriptor_packed(Lengths)
{
    using Strides = decltype(calculate_tensor_strides_packed_deprecated(Lengths{}));
    return ConstantTensorDescriptor_deprecated<Lengths, Strides>{};
}

template <class Lengths, class Strides>
__host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Strides)
{
    return ConstantTensorDescriptor_deprecated<Lengths, Strides>{};
}

template <class Lengths, index_t Align>
__host__ __device__ constexpr auto make_ConstantTensorDescriptor_aligned(Lengths, Number<Align>)
{
    using Strides = decltype(calculate_tensor_strides_aligned_old(Lengths{}, Number<Align>{}));
    return ConstantTensorDescriptor_deprecated<Lengths, Strides>{};
}

template <index_t... Lengths, index_t... Strides>
__host__ __device__ void
print_ConstantTensorDescriptor(const char* s,
                               ConstantTensorDescriptor_deprecated<Sequence<Lengths...>,
                                                                   Sequence<Strides...>>)
{
    constexpr index_t ndim = sizeof...(Lengths);

    static_assert(ndim > 0 && ndim <= 12, "wrong!");

    // one static_if branch per supported rank; each prints lengths then strides
    static_if<ndim == 1>{}([&](auto) {
        printf("%s dim %u, lengths {%u}, strides {%u}\n", s, ndim, Lengths..., Strides...);
    });

    static_if<ndim == 2>{}([&](auto) {
        printf("%s dim %u, lengths {%u %u}, strides {%u %u}\n", s, ndim, Lengths..., Strides...);
    });

    static_if<ndim == 3>{}([&](auto) {
        printf("%s dim %u, lengths {%u %u %u}, strides {%u %u %u}\n",
               s, ndim, Lengths..., Strides...);
    });

    static_if<ndim == 4>{}([&](auto) {
        printf("%s dim %u, lengths {%u %u %u %u}, strides {%u %u %u %u}\n",
               s, ndim, Lengths..., Strides...);
    });

    static_if<ndim == 5>{}([&](auto) {
        printf("%s dim %u, lengths {%u %u %u %u %u}, strides {%u %u %u %u %u}\n",
               s, ndim, Lengths..., Strides...);
    });

    static_if<ndim == 6>{}([&](auto) {
        printf("%s dim %u, lengths {%u %u %u %u %u %u}, strides {%u %u %u %u %u %u}\n",
               s, ndim, Lengths..., Strides...);
    });

    static_if<ndim == 7>{}([&](auto) {
        printf("%s dim %u, lengths {%u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u}\n",
               s, ndim, Lengths..., Strides...);
    });

    static_if<ndim == 8>{}([&](auto) {
        printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u}\n",
               s, ndim, Lengths..., Strides...);
    });

    static_if<ndim == 9>{}([&](auto) {
        printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u}, "
               "strides {%u %u %u %u %u %u %u %u %u}\n",
               s, ndim, Lengths..., Strides...);
    });

    static_if<ndim == 10>{}([&](auto) {
        printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u}, "
               "strides {%u %u %u %u %u %u %u %u %u %u}\n",
               s, ndim, Lengths..., Strides...);
    });

    static_if<ndim == 11>{}([&](auto) {
        printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u}, "
               "strides {%u %u %u %u %u %u %u %u %u %u %u}\n",
               s, ndim, Lengths..., Strides...);
    });

    static_if<ndim == 12>{}([&](auto) {
        printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u %u}, "
               "strides {%u %u %u %u %u %u %u %u %u %u %u %u}\n",
               s, ndim, Lengths..., Strides...);
    });
}

} // namespace ck
#endif
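UpdateMultiIndexGivenStepSizeOf1dIndex above advances a multi-index by a 1-d step using per-dimension carry propagation from the innermost dimension outward, and (as its comment states) relies on the caller to keep the highest dimension in bounds. The following is a small standalone sketch of the positive-direction case as that comment describes it, with plain vectors and runtime loops; the step_multi_index name is illustrative only.

#include <cassert>
#include <cstddef>
#include <vector>

// Advance multi_id by step (both expressed per dimension), propagating carries from the
// lowest (innermost) dimension upward. No carry check is done on dimension 0; the caller
// must keep the result in bounds there.
std::vector<std::size_t> step_multi_index(std::vector<std::size_t> multi_id,
                                          const std::vector<std::size_t>& step,
                                          const std::vector<std::size_t>& lengths)
{
    bool carry = false;
    for(int i = static_cast<int>(lengths.size()) - 1; i >= 0; --i)
    {
        multi_id[i] += step[i] + (carry ? 1 : 0);
        carry = false;
        if(i > 0 && multi_id[i] >= lengths[i]) // skip the check on the highest dimension
        {
            multi_id[i] -= lengths[i];
            carry = true;
        }
    }
    return multi_id;
}

int main()
{
    // lengths {2, 3}: {0, 2} + {0, 2} wraps the inner dimension and carries into the outer one
    auto r = step_multi_index({0, 2}, {0, 2}, {2, 3});
    assert(r[0] == 1 && r[1] == 1);
}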
composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp    deleted 100644 → 0
#ifndef CK_TENSOR_COORDINATE_DEPRECATED_HPP
#define CK_TENSOR_COORDINATE_DEPRECATED_HPP

#include "common_header.hpp"
#include "ConstantTensorDescriptor_deprecated.hpp"
#include "ConstantMergedTensorDescriptor_deprecated.hpp"

namespace ck {

// TensorDesc is ConstantTensorDescriptor_deprecated
template <class TensorDesc>
struct NormalTensorCoordinate_deprecated
{
    using type             = NormalTensorCoordinate_deprecated;
    using tensor_desc_type = TensorDesc;

    static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();

    __host__ __device__ constexpr NormalTensorCoordinate_deprecated(
        Array<index_t, nDim> tensor_index)
        : mOffset{tensor_desc_type::GetOffsetFromMultiIndex(tensor_index)}
    {
    }

    template <class... Xs>
    __host__ __device__ constexpr NormalTensorCoordinate_deprecated(Xs... xs)
        : NormalTensorCoordinate_deprecated(Array<index_t, nDim>{xs...})
    {
    }

    template <index_t... Xs>
    __host__ __device__ constexpr NormalTensorCoordinate_deprecated(Sequence<Xs...>)
        : NormalTensorCoordinate_deprecated(Array<index_t, nDim>{Xs...})
    {
    }

    __host__ __device__ constexpr index_t GetOffset() const { return mOffset; }

    // T is Array or Sequence
    template <class T>
    __host__ __device__ type operator+=(T step_sizes)
    {
        static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");

        mOffset += tensor_desc_type::GetOffsetFromMultiIndex(step_sizes);

        return *this;
    }

    template <class T>
    __host__ __device__ type operator-=(T step_sizes)
    {
        static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");

        mOffset -= tensor_desc_type::GetOffsetFromMultiIndex(step_sizes);

        return *this;
    }

    template <class T>
    __host__ __device__ constexpr type operator+(T step_sizes) const
    {
        type coord = *this;
        coord += step_sizes;
        return coord;
    }

    template <class T>
    __host__ __device__ constexpr type operator-(T step_sizes) const
    {
        type coord = *this;
        coord -= step_sizes;
        return coord;
    }

    // Reposition the point of origin, and return the compensated offset.
    // This is a hack to reduce index calculation while looping over a tensor whose origin is
    // this TensorCoordinate. It does so by spitting out the run-time offset to the pointer
    // (to the tensor data) held by this TensorCoordinate, so the caller can add the offset
    // into the run-time pointer of the data; then only 1 run-time variable (the updated
    // pointer) is needed, instead of 2 run-time variables (the old pointer and this offset).
    // TODO: after introducing the concept of a "run-time tensor view", which contains the
    // run-time pointer to the data, always keep track of the pointer instead of both the
    // offset and the pointer. This also brings the additional benefit that we don't need to
    // worry that the offset might underflow (because the offset is an unsigned integer) when
    // updating it.
    __host__ __device__ constexpr index_t RepositionOrigin()
    {
        index_t offset_diff = mOffset;
        mOffset             = 0;
        return offset_diff;
    }

    private:
    index_t mOffset;
};

// TensorDesc is ConstantMergedTensorDescriptor_deprecated
template <class TensorDesc>
struct MergedTensorCoordinate_deprecated
{
    using type             = MergedTensorCoordinate_deprecated;
    using tensor_desc_type = TensorDesc;

    static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();
    static constexpr index_t nOriginalDim =
        tensor_desc_type::GetOriginalTensorDescriptor().GetNumOfDimension();

    __host__ __device__ constexpr MergedTensorCoordinate_deprecated(
        Array<index_t, nDim> tensor_index)
        : mOriginalIndex{tensor_desc_type::GetOriginalMultiIndexFromMultiIndex(tensor_index)}
    {
        // partial offset on each dimension
        static_for<0, nDim, 1>{}([&](auto idim) {
            constexpr auto partial_original_dims =
                tensor_desc_type::GetContainedOriginalDimensions(idim);

            constexpr auto partial_original_desc =
                tensor_desc_type::GetOriginalTensorDescriptor().Extract(partial_original_dims);

            mPartialOffsets(idim) = partial_original_desc.GetOffsetFromMultiIndex(
                extract_array(mOriginalIndex, partial_original_dims));
        });

        // complete offset
        mOffset =
            accumulate_on_array(mPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
    }

    template <class... Xs>
    __host__ __device__ constexpr MergedTensorCoordinate_deprecated(Xs... xs)
        : MergedTensorCoordinate_deprecated(Array<index_t, nDim>{xs...})
    {
    }

    __host__ __device__ constexpr index_t GetOffset() const { return mOffset; }

    template <class IDim, class T, bool PositiveDirection>
    __host__ __device__ void
    MoveOnDimension(IDim idim_, T step_size, integral_constant<bool, PositiveDirection>)
    {
        constexpr auto idim = idim_;

        // if step_size is known at compile time
        static_if<is_static<T>::value>{}(
            [&](auto) { static_if<T{} == 0>{}([&](auto) { return; }); });

        // update original index
        static_if<tensor_desc_type::ContainMultipleOriginalDimensions(idim)>{}([&](auto) {
            constexpr auto partial_original_dims =
                tensor_desc_type::GetContainedOriginalDimensions(idim);

            constexpr index_t ndim_partial_original = partial_original_dims.GetSize();

            constexpr auto partial_original_desc =
                tensor_desc_type::GetOriginalTensorDescriptor().Extract(partial_original_dims);

            const auto partial_original_step_sizes =
                partial_original_desc.GetMultiIndexFrom1dIndex(step_size);

            // update partial original multi-id
            auto partial_original_id = extract_array(mOriginalIndex, partial_original_dims);

            static_if<PositiveDirection>{}([&](auto) {
                partial_original_id += partial_original_step_sizes;

                bool carry = false;

                // do carry check in reversed order, starting from lowest dimension
                // don't check the highest dimension
                static_for<0, ndim_partial_original - 1, 1>{}([&](auto IReverse) {
                    constexpr index_t i = ndim_partial_original - 1 - IReverse;

                    if(carry)
                    {
                        ++partial_original_id(i);
                    }

                    carry = false;

                    if(partial_original_id[i] >= partial_original_desc.GetLength(i))
                    {
                        partial_original_id(i) -= partial_original_desc.GetLength(i);
                        carry = true;
                    }
                });

                // highest dimension
                if(carry)
                {
                    ++partial_original_id(0);
                }
            }).Else([&](auto) {
                // shift up multi-id to avoid unsigned integer underflow during intermediate
                // calculations. After the shift, should have new_multi_id[...] >= 1
                partial_original_id +=
                    partial_original_desc.GetLengths() - partial_original_step_sizes;

                bool borrow = false;

                // do borrow check in reversed order, starting from lowest dimension
                // don't check the highest dimension
                static_for<0, ndim_partial_original - 1, 1>{}([&](auto IReverse) {
                    constexpr index_t i = ndim_partial_original - 1 - IReverse;

                    if(borrow)
                    {
                        --partial_original_id(i);
                    }

                    borrow = false;

                    if(partial_original_id[i] < partial_original_desc.GetLength(i))
                    {
                        partial_original_id(i) += partial_original_desc.GetLength(i);
                        borrow = true;
                    }
                });

                // highest dimension
                if(borrow)
                {
                    --partial_original_id(0);
                }

                // shift back down multi-id
                // here, should have new_multi_id[...] >= GetLengths()
                partial_original_id = partial_original_id - partial_original_desc.GetLengths();
            });

            // update "mOriginalIndex"
            static_for<0, ndim_partial_original, 1>{}([&](auto I) {
                constexpr auto idim_original = partial_original_dims[I];

                mOriginalIndex(idim_original) = partial_original_id[I];
            });

            // calculate new partial offset on this merged dimension
            const index_t old_partial_offset = mPartialOffsets[idim];

            mPartialOffsets(idim) =
                partial_original_desc.GetOffsetFromMultiIndex(partial_original_id);

            // update "mThreadSrcOffset", do "+" before "-" to avoid underflow
            mOffset = (mOffset + mPartialOffsets[idim]) - old_partial_offset;
        }).Else([&](auto fwd) {
            static_if<PositiveDirection>{}([&](auto) {
                mOffset += step_size * fwd(tensor_desc_type{}).GetStride(idim);
            }).Else([&](auto) {
                mOffset -= step_size * fwd(tensor_desc_type{}).GetStride(idim);
            });
        });
    }

    // T is Array or Sequence
    template <class T>
    __host__ __device__ type operator+=(T step_sizes)
    {
        static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");

        static_for<0, nDim, 1>{}([&](auto idim) {
            // compiler should remove dead code path, because step_sizes is known at compile time
            if(step_sizes[idim] != 0)
            {
                this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, true>{});
            }
        });

        return *this;
    }

    template <class T>
    __host__ __device__ type operator-=(T step_sizes)
    {
        static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");

        static_for<0, nDim, 1>{}([&](auto idim) {
            // compiler should remove dead code path, because step_sizes is known at compile time
            if(step_sizes[idim] != 0)
            {
                this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, false>{});
            }
        });

        return *this;
    }

    template <class T>
    __host__ __device__ constexpr type operator+(T step_sizes) const
    {
        type coord = *this;
        coord += step_sizes;
        return coord;
    }

    template <class T>
    __host__ __device__ constexpr type operator-(T step_sizes) const
    {
        type coord = *this;
        coord -= step_sizes;
        return coord;
    }

    __host__ __device__ static constexpr index_t RepositionOrigin() { return 0; }

    private:
    // Allocate register memory for all merged dimensions and normal dimensions. However, only
    // those merged dimensions whose index will be involved in arithmetic after the construction
    // of this TensorCoordinate (e.g. when the user moves a slicing window on the merged
    // dimension) will use this register memory. Let's hope the compiler will optimize away the
    // register memory allocated for normal dimensions, and for those merged dimensions that are
    // never involved in index arithmetic after construction of the TensorCoordinate.
    // TODO: refactor TensorCoordinate after introducing the concept of "dimensions", and
    // simplify the implementation of ConstantMergedTensorDescriptor_deprecated, so we don't need
    // to count on the compiler to optimize away that register memory for us
    Array<index_t, nOriginalDim> mOriginalIndex;
    Array<index_t, nDim> mPartialOffsets;

    // complete offset
    index_t mOffset;
};

template <class TensorDesc>
struct TensorCoordinate_deprecated
{
    private:
    template <class... Ts>
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(ConstantTensorDescriptor_deprecated<Ts...>)
    {
        return NormalTensorCoordinate_deprecated<ConstantTensorDescriptor_deprecated<Ts...>>();
    }

    template <class... Ts>
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor_deprecated<Ts...>)
    {
        return MergedTensorCoordinate_deprecated<
            ConstantMergedTensorDescriptor_deprecated<Ts...>>();
    }

    public:
    using type = decltype(MakeDummyTensorCoordinate(TensorDesc{}));
};

} // namespace ck
#endif
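RepositionOrigin above folds the coordinate's accumulated offset into the caller's data pointer so that only one run-time value needs to be tracked. A tiny sketch of that pattern outside the library (the Coord name and plain unsigned offset are illustrative assumptions):

#include <cassert>

// Toy coordinate: tracks only a linear offset into some tensor's data.
struct Coord
{
    unsigned offset = 0;

    // Hand the accumulated offset to the caller and reset to zero, so the caller can bake it
    // into its data pointer and keep a single run-time variable (the shifted pointer) instead
    // of pointer + offset.
    unsigned RepositionOrigin()
    {
        unsigned diff = offset;
        offset        = 0;
        return diff;
    }
};

int main()
{
    float data[16] = {};
    data[5]        = 3.0f;

    Coord coord;
    coord.offset = 5;

    const float* p = data + coord.RepositionOrigin(); // pointer absorbs the offset
    assert(coord.offset == 0 && p[0] == 3.0f);
}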
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp    deleted 100644 → 0
#ifndef CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP
#define CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor_deprecated.hpp"
#include "ConstantMergedTensorDescriptor_deprecated.hpp"
#include "tensor_coordinate_deprecated.hpp"
#include "threadwise_generic_tensor_slice_copy_deprecated.hpp"
namespace
ck
{
// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
// memory layout (ordering of dimensions) can be different between src and dst.
// This functions assume each thread is reading and writing a normal (not merged) tensor,
// to simplify index calculations. To satisfy this assumption, the user need to make sure
// that, on a merged dimension that constains multiple original dimensions, the length of
// the last original dimension need to be evenly dividable by its sub-lengths. Also, the
// repeat-length on the merged dimension need to be 1. These sanity checks are performed
// in constructor of BlockwiseGenericTensorSliceCopy_v1_deprecated
template
<
index_t
BlockSize
,
typename
SrcDesc
,
typename
DstDesc
,
typename
SliceLengths
,
typename
SubLengths
,
typename
ThreadClusterLengths
,
typename
ThreadClusterArrangeOrder
,
typename
SrcDimAccessOrder
,
typename
DstDimAccessOrder
,
index_t
SrcVectorAccessDim
,
index_t
DstVectorAccessDim
,
index_t
SrcDataPerAccess
,
index_t
DstDataPerAccess
>
struct
BlockwiseGenericTensorSliceCopy_v1_deprecated
{
static
constexpr
index_t
nDim
=
SrcDesc
::
GetNumOfDimension
();
static
constexpr
index_t
nOriginalDimSrc
=
SrcDesc
::
GetOriginalTensorDescriptor
().
GetNumOfDimension
();
static
constexpr
index_t
nOriginalDimDst
=
DstDesc
::
GetOriginalTensorDescriptor
().
GetNumOfDimension
();
// per-thread offset
index_t
mThreadSrcOffset
;
index_t
mThreadDstOffset
;
// "mThreadSrcOriginalMultiId", "mThreadSrcPartialOffsets, "mThreadDstOriginalMultiId",
// "mThreadDstPartialOffsets" are always calculated inside constructor, and would be
// updated if slicing-window is moved. However, they will not be used if you always move
// the slicing-window along a non-merged dimension. In that case, compiler should be
// able to remove these calculation.
// TODO: make sure compiler would actually remove them in that case
// partial offset in each (merged) dimension
Array
<
index_t
,
nDim
>
mThreadSrcPartialOffsets
;
Array
<
index_t
,
nDim
>
mThreadDstPartialOffsets
;
// multi-id of original tensor
Array
<
index_t
,
nOriginalDimSrc
>
mThreadSrcOriginalMultiId
;
Array
<
index_t
,
nOriginalDimDst
>
mThreadDstOriginalMultiId
;
__device__
BlockwiseGenericTensorSliceCopy_v1_deprecated
(
Array
<
index_t
,
nDim
>
src_block_data_id_begin
,
Array
<
index_t
,
nDim
>
dst_block_data_id_begin
)
{
// check NDim consistency
static_assert
(
nDim
==
SrcDesc
::
GetNumOfDimension
()
&&
nDim
==
DstDesc
::
GetNumOfDimension
()
&&
nDim
==
SliceLengths
::
GetSize
()
&&
nDim
==
SubLengths
::
GetSize
()
&&
nDim
==
ThreadClusterLengths
::
GetSize
()
&&
nDim
==
ThreadClusterArrangeOrder
::
GetSize
()
&&
nDim
==
SrcDimAccessOrder
::
GetSize
()
&&
nDim
==
DstDimAccessOrder
::
GetSize
(),
"wrong"
);
// check thread arrange order and read/write access order are valid
static_assert
(
is_valid_sequence_map
<
ThreadClusterArrangeOrder
>::
value
&&
is_valid_sequence_map
<
SrcDimAccessOrder
>::
value
&&
is_valid_sequence_map
<
DstDimAccessOrder
>::
value
,
"wrong!"
);
// thread cluster
constexpr
auto
thread_cluster_desc
=
make_ConstantTensorDescriptor_packed
(
ThreadClusterLengths
::
ReorderGivenNew2Old
(
ThreadClusterArrangeOrder
{}));
// BlockSize
static_assert
(
BlockSize
==
thread_cluster_desc
.
GetElementSize
(),
"wrong! BlockSize"
);
// divide work
constexpr
auto
data_per_cluster_per_dims
=
SubLengths
{}
*
ThreadClusterLengths
{};
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
IDim
)
{
static_assert
(
SliceLengths
::
Get
(
IDim
)
%
data_per_cluster_per_dims
.
Get
(
IDim
)
==
0
,
"wrong! cannot evenly divide sliced tensor into cluster"
);
});
constexpr
auto
repeat_lengths
=
SliceLengths
{}
/
data_per_cluster_per_dims
;
// additional check for merged dimension
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
IDim_
)
{
// src
static_if
<
SrcDesc
::
ContainMultipleOriginalDimensions
(
IDim_
)
>
{}([
&
](
auto
)
{
constexpr
auto
IDim
=
decltype
(
IDim_
){};
// on a merged dimension that constains multiple original dimensions,
// the length of the last original dimension need to evenly dividable by its
// sub-length,
// so each thread is effectively reading a normal (not merged) tensor
constexpr
auto
idim_last_original_src
=
SrcDesc
::
GetContainedOriginalDimensions
(
IDim
).
Back
();
static_assert
(
SrcDesc
::
GetOriginalTensorDescriptor
().
GetLength
(
idim_last_original_src
)
%
SubLengths
::
Get
(
IDim
)
==
0
,
"wrong!"
);
// merged dimension should have repeat_lengths = 1
static_assert
(
repeat_lengths
[
IDim
]
==
1
,
"wrong! repeat_lengths shoud be 1 on merged dimension"
);
});
// dst
static_if
<
DstDesc
::
ContainMultipleOriginalDimensions
(
IDim_
)
>
{}([
&
](
auto
)
{
constexpr
auto
IDim
=
decltype
(
IDim_
){};
// on a merged dimension that constains multiple original dimensions,
// the length of the last original dimension need to evenly dividable by its
// sub-length,
// so each thread is effectively reading a normal (not merged) tensor
constexpr
auto
idim_last_original_dst
=
DstDesc
::
GetContainedOriginalDimensions
(
IDim
).
Back
();
static_assert
(
DstDesc
::
GetOriginalTensorDescriptor
().
GetLength
(
idim_last_original_dst
)
%
SubLengths
::
Get
(
IDim
)
==
0
,
"wrong!"
);
// merged dimension should have repeat_lengths = 1
static_assert
(
repeat_lengths
[
IDim
]
==
1
,
"wrong! repeat_lengths shoud be 1 on merged dimension"
);
});
});
// calculate mThreadSrcOffset, mThreadDstOffset
const
auto
thread_cluster_id
=
thread_cluster_desc
.
GetMultiIndexFrom1dIndex
(
get_thread_local_1d_id
());
const
auto
data_cluster_id
=
reorder_array_given_old2new
(
thread_cluster_id
,
ThreadClusterArrangeOrder
{});
const
auto
thread_data_id_begin
=
data_cluster_id
*
SubLengths
{};
// original multi-id
mThreadSrcOriginalMultiId
=
SrcDesc
::
GetOriginalMultiIndexFromMultiIndex
(
src_block_data_id_begin
+
thread_data_id_begin
);
mThreadDstOriginalMultiId
=
DstDesc
::
GetOriginalMultiIndexFromMultiIndex
(
dst_block_data_id_begin
+
thread_data_id_begin
);
// partial offset on each dimension
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
IDim
)
{
constexpr
auto
src_partial_original_dims
=
SrcDesc
::
GetContainedOriginalDimensions
(
IDim
);
constexpr
auto
src_partial_original_desc
=
SrcDesc
::
GetOriginalTensorDescriptor
().
Extract
(
src_partial_original_dims
);
mThreadSrcPartialOffsets
(
IDim
)
=
src_partial_original_desc
.
GetOffsetFromMultiIndex
(
extract_array
(
mThreadSrcOriginalMultiId
,
src_partial_original_dims
));
});
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
IDim
)
{
constexpr
auto
dst_partial_original_dims
=
DstDesc
::
GetContainedOriginalDimensions
(
IDim
);
constexpr
auto
dst_partial_original_desc
=
DstDesc
::
GetOriginalTensorDescriptor
().
Extract
(
dst_partial_original_dims
);
mThreadDstPartialOffsets
(
IDim
)
=
dst_partial_original_desc
.
GetOffsetFromMultiIndex
(
extract_array
(
mThreadDstOriginalMultiId
,
dst_partial_original_dims
));
});
// complete offset
mThreadSrcOffset
=
accumulate_on_array
(
mThreadSrcPartialOffsets
,
math
::
plus
<
index_t
>
{},
static_cast
<
index_t
>
(
0
));
mThreadDstOffset
=
accumulate_on_array
(
mThreadDstPartialOffsets
,
math
::
plus
<
index_t
>
{},
static_cast
<
index_t
>
(
0
));
}
__device__
static
constexpr
auto
GetRegisterBufferDescriptor
()
{
constexpr
auto
repeat_lengths
=
SliceLengths
{}
/
(
SubLengths
{}
*
ThreadClusterLengths
{});
return
make_ConstantTensorDescriptor_packed
(
SubLengths
{}
*
repeat_lengths
);
}
__device__
static
constexpr
index_t
GetThreadBufferSize
()
{
return
GetRegisterBufferDescriptor
().
GetElementSpace
();
}
template
<
typename
TData
>
__device__
void
RunLoadThreadBuffer
(
const
TData
*
__restrict__
p_src
,
TData
*
__restrict__
p_buffer
)
const
{
constexpr
auto
thread_sub_tensor_lengths
=
SubLengths
{};
constexpr
auto
data_per_cluster_per_dims
=
thread_sub_tensor_lengths
*
ThreadClusterLengths
{};
constexpr
auto
repeat_lengths
=
SliceLengths
{}
/
(
SubLengths
{}
*
ThreadClusterLengths
{});
constexpr
auto
thread_buffer_desc
=
GetRegisterBufferDescriptor
();
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
static_ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_id
)
{
constexpr
auto
src_thread_data_id_begin
=
repeat_id
*
data_per_cluster_per_dims
;
constexpr
auto
buffer_data_id_begin
=
repeat_id
*
thread_sub_tensor_lengths
;
constexpr
index_t
src_offset
=
SrcDesc
::
GetOffsetFromMultiIndex
(
src_thread_data_id_begin
);
constexpr
index_t
buffer_offset
=
thread_buffer_desc
.
GetOffsetFromMultiIndex
(
buffer_data_id_begin
);
#else
ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_id
)
{
const
auto
src_thread_data_id_begin
=
repeat_id
*
data_per_cluster_per_dims
;
const
auto
buffer_data_id_begin
=
repeat_id
*
thread_sub_tensor_lengths
;
const
index_t
src_offset
=
SrcDesc
::
GetOffsetFromMultiIndex
(
src_thread_data_id_begin
);
const
index_t
buffer_offset
=
thread_buffer_desc
.
GetOffsetFromMultiIndex
(
buffer_data_id_begin
);
#endif
// By position the origin of the per-thread window at the point, where multi-index
// of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy
// is assuming each thread is copy a noraml (not merged) tensor.
// To satisfy this assumption, the user need to make sure that, on a merged dimension
// that constains multiple original dimensions, the length of the last original
// dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on
// the merged dimension need to be 1. These sanity checks are performed in constructor
// of BlockwiseGenericTensorSliceCopy_v1_deprecated
ThreadwiseGenericTensorSliceCopy_v1r2_deprecated
<
SrcDesc
,
decltype
(
thread_buffer_desc
),
SubLengths
,
SrcDimAccessOrder
,
SrcVectorAccessDim
,
SrcDataPerAccess
,
1
>
(
make_zero_array
<
index_t
,
nDim
>
(),
make_zero_array
<
index_t
,
nDim
>
())
.
Run
(
p_src
+
src_offset
+
mThreadSrcOffset
,
p_buffer
+
buffer_offset
);
});
}
template
<
typename
TData
>
__device__
void
RunStoreThreadBuffer
(
const
TData
*
__restrict__
p_buffer
,
TData
*
__restrict__
p_dst
)
const
{
constexpr
auto
thread_sub_tensor_lengths
=
SubLengths
{};
constexpr
auto
data_per_cluster_per_dims
=
thread_sub_tensor_lengths
*
ThreadClusterLengths
{};
constexpr
auto
repeat_lengths
=
SliceLengths
{}
/
(
SubLengths
{}
*
ThreadClusterLengths
{});
constexpr
auto
thread_buffer_desc
=
GetRegisterBufferDescriptor
();
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
static_ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_id
)
{
constexpr
auto
buffer_data_id_begin
=
repeat_id
*
thread_sub_tensor_lengths
;
constexpr
auto
dst_data_id_begin
=
repeat_id
*
data_per_cluster_per_dims
;
constexpr
index_t
buffer_offset
=
thread_buffer_desc
.
GetOffsetFromMultiIndex
(
buffer_data_id_begin
);
constexpr
index_t
dst_offset
=
DstDesc
::
GetOffsetFromMultiIndex
(
dst_data_id_begin
);
#else
ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_id
)
{
const
auto
buffer_data_id_begin
=
repeat_id
*
thread_sub_tensor_lengths
;
const
auto
dst_data_id_begin
=
repeat_id
*
data_per_cluster_per_dims
;
const
index_t
buffer_offset
=
thread_buffer_desc
.
GetOffsetFromMultiIndex
(
buffer_data_id_begin
);
const
index_t
dst_offset
=
DstDesc
::
GetOffsetFromMultiIndex
(
dst_data_id_begin
);
#endif
// By position the origin of the per-thread window at the point, where multi-index
// of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy
// is assuming each thread is copy a noraml (not merged) tensor.
// To satisfy this assumption, the user need to make sure that, on a merged dimension
// that constains multiple original dimensions, the length of the last original
// dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on
// the merged dimension need to be 1. These sanity checks are performed in constructor
// of BlockwiseGenericTensorSliceCopy_v1_deprecated
ThreadwiseGenericTensorSliceCopy_v1r2_deprecated
<
decltype
(
thread_buffer_desc
),
DstDesc
,
SubLengths
,
DstDimAccessOrder
,
DstVectorAccessDim
,
1
,
DstDataPerAccess
>
(
make_zero_array
<
index_t
,
nDim
>
(),
make_zero_array
<
index_t
,
nDim
>
())
.
Run
(
p_buffer
+
buffer_offset
,
p_dst
+
dst_offset
+
mThreadDstOffset
);
});
}
template
<
typename
TData
>
__device__
void
Run
(
const
TData
*
__restrict__
p_src
,
TData
*
__restrict__
p_dst
)
const
{
TData
p_buffer
[
GetThreadBufferSize
()];
RunLoadThreadBuffer
(
p_src
,
p_buffer
);
RunStoreThreadBuffer
(
p_buffer
,
p_dst
);
}
// When moving the slicing windows along a merged dimension, if the strides of the
// contained (by the merged dimension) original dimensions are not in descending order,
// then there is no guarantee that the new offset will be larger than the old offset
// for movement in positive direction (vice versue for movement in negative direction).
// As a result, there is the possiblity that the offset calculation may result in
// unsigned integer underflow (due to "-" operation). However, this hazard should not
// happen, as long as the users make sure the slicing window would not be moved out of
// the boundary of the tensor being sliced. This functions doesn't do runtime sanity
// check on out-of-bound slicing window, for performance reason
    template <index_t IDim_, index_t StepSize, bool PositiveDirection>
    __device__ void MoveSlicingWindowOnSourceTensor(
        Number<IDim_>, Number<StepSize>, integral_constant<bool, PositiveDirection> direction)
    {
        constexpr auto IDim = Number<IDim_>{};

        static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim)>{}([&](auto) {
            // Logic for a merged dimension. It also works for a non-merged dimension, but may be
            // unnecessarily complicated for the compiler to remove calculations that are useless
            // for a non-merged dimension.

            // extract partial original dimensions
            constexpr auto src_partial_original_dims =
                SrcDesc::GetContainedOriginalDimensions(IDim);

            constexpr auto src_partial_original_desc =
                SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims);

            // calculate new partial original multi-id
            auto old_src_partial_original_id =
                extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims);

            auto new_src_partial_original_id =
                src_partial_original_desc.UpdateMultiIndexGivenStepSizeOf1dIndex(
                    old_src_partial_original_id, StepSize, direction);

            // update "mThreadSrcOriginalMultiId"
            static_for<0, decltype(src_partial_original_dims)::GetSize(), 1>{}([&](auto I) {
                constexpr auto IDimOriginal = src_partial_original_dims[I];

                mThreadSrcOriginalMultiId(IDimOriginal) = new_src_partial_original_id[I];
            });

            // calculate new partial offset on this merged dimension
            const index_t old_src_partial_offset = mThreadSrcPartialOffsets[IDim];

            const index_t new_src_partial_offset =
                src_partial_original_desc.GetOffsetFromMultiIndex(new_src_partial_original_id);

            // update "mThreadSrcPartialOffsets"
            mThreadSrcPartialOffsets(IDim) = new_src_partial_offset;

            // update "mThreadSrcOffset"; do "+" before "-" to avoid underflow
            mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset;
        }).Else([&](auto) {
            // Logic for a non-merged dimension. If the slicing window is never moved on a
            // merged dimension, then "mThreadSrcOriginalMultiId" and "mThreadSrcPartialOffsets",
            // which are being calculated here, will never be used later. In that case, the
            // compiler should be able to remove these calculations.
            // TODO: make sure the compiler actually removes them in this case.
            // It is the user's responsibility to make sure the slicing window will not be moved
            // out of the boundary of the tensor being sliced. Otherwise, there might be hazards
            // like unsigned integer underflow. There is NO runtime sanity check to prevent the
            // hazard.
            constexpr auto IDimOriginal = SrcDesc::GetContainedOriginalDimensions(IDim).Front();

            static_if<PositiveDirection>{}([&](auto fwd) {
                mThreadSrcOffset += StepSize * fwd(SrcDesc{}).GetStride(IDim);

                mThreadSrcOriginalMultiId(IDimOriginal) += StepSize;

                mThreadSrcPartialOffsets(IDim) += StepSize * fwd(SrcDesc{}).GetStride(IDim);
            }).Else([&](auto fwd) {
                mThreadSrcOffset -= StepSize * fwd(SrcDesc{}).GetStride(IDim);

                mThreadSrcOriginalMultiId(IDimOriginal) -= StepSize;

                mThreadSrcPartialOffsets(IDim) -= StepSize * fwd(SrcDesc{}).GetStride(IDim);
            });
        });
    }
    template <typename T, bool PositiveDirection>
    __device__ void MoveSrcSliceWindow(T step_sizes,
                                       integral_constant<bool, PositiveDirection> positive_direction)
    {
        static_for<0, nDim, 1>{}([&](auto idim) {
            if(step_sizes[idim] != 0)
            {
                MoveSlicingWindowOnSourceTensor(idim, step_sizes[idim], positive_direction);
            }
        });
    }
};
// This version uses TensorCoordinate.
// It slices a (normal or merged) tensor and copies it into another (normal or merged) tensor.
// The memory layout (ordering of dimensions) can be different between src and dst.
template <index_t BlockSize,
          typename SrcDesc,
          typename DstDesc,
          typename SliceLengths,
          typename SubLengths,
          typename ThreadClusterLengths,
          typename ThreadClusterArrangeOrder,
          typename SrcDimAccessOrder,
          typename DstDimAccessOrder,
          index_t SrcVectorAccessDim,
          index_t DstVectorAccessDim,
          index_t SrcDataPerAccess,
          index_t DstDataPerAccess>
struct BlockwiseGenericTensorSliceCopy_v2_deprecated
{
    static constexpr index_t nDim = SrcDesc::GetNumOfDimension();

    using Index = MultiIndex<nDim>;

    __device__ constexpr BlockwiseGenericTensorSliceCopy_v2_deprecated(
        const Index& src_block_slice_origin, const Index& dst_block_slice_origin)
    {
        static_assert(nDim == SrcDesc::GetNumOfDimension() &&
                          nDim == DstDesc::GetNumOfDimension() &&
                          nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() &&
                          nDim == ThreadClusterLengths::GetSize() &&
                          nDim == ThreadClusterArrangeOrder::GetSize() &&
                          nDim == SrcDimAccessOrder::GetSize() &&
                          nDim == DstDimAccessOrder::GetSize(),
                      "wrong! nDim not consistent");

        static_assert(is_same<SliceLengths, decltype(SubLengths{} * ThreadClusterLengths{})>{},
                      "wrong! threads should be mapped to cover entire slicing window");

        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
            ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));

        static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
                      "wrong! BlockSize not consistent with ThreadClusterLengths");

        const auto thread_cluster_id =
            thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());

        const auto data_cluster_id =
            reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});

        const auto thread_data_id_begin = data_cluster_id * SubLengths{};

        mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin);
        mThreadwiseLoad.SetDstSliceOrigin(make_zero_array<index_t, nDim>());

        mThreadwiseStore.SetSrcSliceOrigin(make_zero_array<index_t, nDim>());
        mThreadwiseStore.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin);
    }

    __device__ static constexpr index_t GetThreadBufferSize()
    {
        return ThreadBufferDesc::GetElementSpace();
    }

    template <typename BlockSrcData,
              typename ThreadBufferData,
              AddressSpace BlockSrcAddressSpace,
              AddressSpace ThreadBufferAddressSpace>
    __device__ void
    RunLoadThreadBuffer(const BlockSrcData* p_block_src,
                        ThreadBufferData* p_thread_buffer,
                        integral_constant<AddressSpace, BlockSrcAddressSpace>,
                        integral_constant<AddressSpace, ThreadBufferAddressSpace>) const
    {
        constexpr auto block_src_address_space =
            integral_constant<AddressSpace, BlockSrcAddressSpace>{};
        constexpr auto thread_buffer_address_space =
            integral_constant<AddressSpace, ThreadBufferAddressSpace>{};

        mThreadwiseLoad.Run(
            p_block_src, p_thread_buffer, block_src_address_space, thread_buffer_address_space);
    }

    template <typename BlockSrcData, typename ThreadBufferData>
    __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src,
                                        ThreadBufferData* p_thread_buffer) const
    {
        constexpr auto generic_address_space =
            integral_constant<AddressSpace, AddressSpace::Generic>{};

        RunLoadThreadBuffer(
            p_block_src, p_thread_buffer, generic_address_space, generic_address_space);
    }

    template <typename ThreadBufferData,
              typename BlockDstData,
              AddressSpace ThreadBufferAddressSpace,
              AddressSpace BlockDstAddressSpace>
    __device__ void
    RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
                         BlockDstData* p_block_dst,
                         integral_constant<AddressSpace, ThreadBufferAddressSpace>,
                         integral_constant<AddressSpace, BlockDstAddressSpace>) const
    {
        constexpr auto thread_buffer_address_space =
            integral_constant<AddressSpace, ThreadBufferAddressSpace>{};
        constexpr auto block_dst_address_space =
            integral_constant<AddressSpace, BlockDstAddressSpace>{};

        mThreadwiseStore.Run(
            p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space);
    }

    template <typename ThreadBufferData, typename BlockDstData>
    __device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
                                         BlockDstData* p_block_dst) const
    {
        constexpr auto generic_address_space =
            integral_constant<AddressSpace, AddressSpace::Generic>{};

        RunStoreThreadBuffer(
            p_thread_buffer, p_block_dst, generic_address_space, generic_address_space);
    }

    template <typename BlockSrcData,
              typename BlockDstData,
              AddressSpace BlockSrcAddressSpace,
              AddressSpace BlockDstAddressSpace>
    __device__ void
    Run(const BlockSrcData* p_block_src,
        BlockDstData* p_block_dst,
        integral_constant<AddressSpace, BlockSrcAddressSpace> block_src_address_space,
        integral_constant<AddressSpace, BlockDstAddressSpace> block_dst_address_space) const
    {
        BlockSrcData p_thread_buffer[GetThreadBufferSize()];

        constexpr auto generic_address_space =
            integral_constant<AddressSpace, AddressSpace::Generic>{};

        RunLoadThreadBuffer(
            p_block_src, p_thread_buffer, block_src_address_space, generic_address_space);

        // if there is type conversion, it's done during store
        RunStoreThreadBuffer(
            p_thread_buffer, p_block_dst, generic_address_space, block_dst_address_space);
    }

    template <typename BlockSrcData, typename BlockDstData>
    __device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const
    {
        constexpr auto generic_address_space =
            integral_constant<AddressSpace, AddressSpace::Generic>{};

        Run(p_block_src, p_block_dst, generic_address_space, generic_address_space);
    }

    template <typename T, bool PositiveDirection>
    __device__ void MoveSrcSliceWindow(T step_sizes,
                                       integral_constant<bool, PositiveDirection> positive_direction)
    {
        mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction);
    }

    template <typename T, bool PositiveDirection>
    __device__ void MoveDstSliceWindow(T step_sizes,
                                       integral_constant<bool, PositiveDirection> positive_direction)
    {
        mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction);
    }

    private:
    using ThreadBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));

    using ThreadwiseLoad =
        ThreadwiseGenericTensorSliceCopy_v2r1_deprecated<SrcDesc,
                                                         ThreadBufferDesc,
                                                         SubLengths,
                                                         SrcDimAccessOrder,
                                                         SrcDimAccessOrder,
                                                         SrcVectorAccessDim,
                                                         SrcVectorAccessDim,
                                                         SrcDataPerAccess,
                                                         1>;

    using ThreadwiseStore =
        ThreadwiseGenericTensorSliceCopy_v2r1_deprecated<ThreadBufferDesc,
                                                         DstDesc,
                                                         SubLengths,
                                                         DstDimAccessOrder,
                                                         DstDimAccessOrder,
                                                         DstVectorAccessDim,
                                                         DstVectorAccessDim,
                                                         1,
                                                         DstDataPerAccess>;

    ThreadwiseLoad mThreadwiseLoad;
    ThreadwiseStore mThreadwiseStore;
};

} // namespace ck
#endif
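The merged-dimension bookkeeping in MoveSlicingWindowOnSourceTensor above is easier to follow with a small host-side analogue. The sketch below is not part of the kernel code and uses invented helper names (offset_from_multi_index, advance); it only illustrates that moving a window along a merged dimension means re-deriving the multi-index over the original dimensions and recomputing the offset from their strides, which is why the offset delta is generally not StepSize times a single stride.

// Host-side sketch (assumed names, not from the repository) of moving a slicing
// window along a merged dimension that combines two original dimensions.
#include <array>
#include <cstddef>
#include <iostream>

constexpr std::size_t kDims = 2;

// offset = sum_i idx[i] * stride[i], the analogue of GetOffsetFromMultiIndex
std::size_t offset_from_multi_index(const std::array<std::size_t, kDims>& idx,
                                    const std::array<std::size_t, kDims>& strides)
{
    std::size_t off = 0;
    for(std::size_t i = 0; i < kDims; ++i)
        off += idx[i] * strides[i];
    return off;
}

// Advance a multi-index over `lengths` by `step` positions in row-major order,
// a host analogue of UpdateMultiIndexGivenStepSizeOf1dIndex (positive direction).
void advance(std::array<std::size_t, kDims>& idx,
             const std::array<std::size_t, kDims>& lengths,
             std::size_t step)
{
    const std::size_t linear = idx[0] * lengths[1] + idx[1] + step;
    idx[0]                   = linear / lengths[1];
    idx[1]                   = linear % lengths[1];
}

int main()
{
    // A merged dimension built from two original dimensions of lengths {3, 4},
    // whose strides in the underlying tensor are {100, 1}.
    const std::array<std::size_t, kDims> lengths{3, 4}, strides{100, 1};
    std::array<std::size_t, kDims> idx{0, 3}; // merged 1-D position 3, offset 3

    advance(idx, lengths, 2);                          // merged position 3 -> 5, i.e. idx {1, 1}
    std::cout << offset_from_multi_index(idx, strides) // prints 101, not 3 + 2 * stride
              << '\n';
}

The jump from offset 3 to 101 for a step of 2 is exactly the situation the "do '+' before '-'" comment guards against: the per-dimension partial offset must be recomputed and swapped in, rather than incremented by a fixed stride.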
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp
deleted 100644 → 0
View file @ 506a823a
#ifndef CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP
#define CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP

#include "common_header.hpp"
#include "ConstantTensorDescriptor_deprecated.hpp"
#include "ConstantMergedTensorDescriptor_deprecated.hpp"
#include "tensor_coordinate_deprecated.hpp"

namespace ck {
// This threadwise copy allows vectorized access of src and dst.
// It allows the vector size to be different on src and dst.
// The dimension of vectorized access should be the same on src and dst.
// The dimension access order should be the same on src and dst.
// It is designed for cases where one of src and dst is a register, and
// the other is device memory or LDS.
template <typename SrcDesc,
          typename DstDesc,
          typename SliceLengths,
          typename DimAccessOrder,
          index_t VectorAccessDim,
          index_t SrcDataPerAccess,
          index_t DstDataPerAccess>
struct ThreadwiseGenericTensorSliceCopy_v1r2_deprecated
{
    static constexpr index_t nDim = SliceLengths::GetSize();

    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2_deprecated(
        Array<index_t, nDim> src_slice_origin, Array<index_t, nDim> dst_slice_origin)
        : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
    {
        static_assert(nDim == SrcDesc::GetNumOfDimension() &&
                          nDim == DstDesc::GetNumOfDimension() &&
                          nDim == SliceLengths::GetSize() && nDim == DimAccessOrder::GetSize(),
                      "wrong! # of dimensions not the same");

        static_assert(is_valid_sequence_map<DimAccessOrder>::value, "wrong! map is not valid");

        static_assert(
            SliceLengths{}[VectorAccessDim] % math::lcm(SrcDataPerAccess, DstDataPerAccess) == 0,
            "wrong! cannot evenly divide");

        // check vectorized memory access
        constexpr auto vector_access_dim = Number<VectorAccessDim>{};

        static_if<!SrcDesc::ContainMultipleOriginalDimensions(vector_access_dim)>{}([&](auto fwd) {
            static_assert(
                (fwd(SrcDesc{}).GetStride(vector_access_dim) == 1 || SrcDataPerAccess == 1),
                "wrong! vectorized access is allowed only if stride == 1");
        }).Else([&](auto fwd) {
            static_assert((fwd(SrcDesc{}).GetLastOriginalDimensionStride(vector_access_dim) == 1 ||
                           SrcDataPerAccess == 1),
                          "wrong! vectorized access is allowed only if stride == 1");
        });

        static_if<!DstDesc::ContainMultipleOriginalDimensions(vector_access_dim)>{}([&](auto fwd) {
            static_assert(
                (fwd(DstDesc{}).GetStride(vector_access_dim) == 1 || DstDataPerAccess == 1),
                "wrong! vectorized access is allowed only if stride == 1");
        }).Else([&](auto fwd) {
            static_assert((fwd(DstDesc{}).GetLastOriginalDimensionStride(vector_access_dim) == 1 ||
                           DstDataPerAccess == 1),
                          "wrong! vectorized access is allowed only if stride == 1");
        });
    }

    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2_deprecated()
        : ThreadwiseGenericTensorSliceCopy_v1r2_deprecated(make_zero_array<index_t, nDim>(),
                                                           make_zero_array<index_t, nDim>())
    {
    }

    __device__ void SetSrcSliceOrigin(Array<index_t, nDim> src_slice_origin)
    {
        mSrcSliceOrigin = src_slice_origin;
    }

    __device__ void SetDstSliceOrigin(Array<index_t, nDim> dst_slice_origin)
    {
        mDstSliceOrigin = dst_slice_origin;
    }

    template <class SrcData, class DstData>
    __device__ void Run(const SrcData* p_src, DstData* p_dst) const
    {
        using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
        using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;

        constexpr auto vector_access_dim = Number<VectorAccessDim>{};

        constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
        constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};

        constexpr auto long_vector_size = Number<math::lcm(SrcDataPerAccess, DstDataPerAccess)>{};

        constexpr auto long_vector_access_lengths = SliceLengths::Modify(
            vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);

        ford<decltype(long_vector_access_lengths), DimAccessOrder>{}(
            [&](auto long_vector_access_id) {
                // data id w.r.t slicing-window
                auto long_vector_data_begin_id = long_vector_access_id;
                long_vector_data_begin_id(vector_access_dim) =
                    long_vector_size * long_vector_access_id[vector_access_dim];

                // buffer to hold a long-vector
                SrcData p_src_long_vector[long_vector_size];
                DstData p_dst_long_vector[long_vector_size];

                // load data from src to the long-vector buffer
                for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
                {
                    auto scalar_id               = make_zero_array<index_t, nDim>();
                    scalar_id(vector_access_dim) = i * src_data_per_access;

                    const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(
                        mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id));

                    const index_t buffer_offset = i * src_data_per_access;

                    *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                        *reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
                }

                // type conversion
                for(index_t i = 0; i < long_vector_size; ++i)
                {
                    p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
                }

                // store data from the long-vector buffer to dst
                for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
                {
                    auto scalar_id               = make_zero_array<index_t, nDim>();
                    scalar_id(vector_access_dim) = i * dst_data_per_access;

                    const index_t buffer_offset = i * dst_data_per_access;

                    const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(
                        mDstSliceOrigin + (long_vector_data_begin_id + scalar_id));

                    *reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) =
                        *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
                }
            });
    }

    private:
    Array<index_t, nDim> mSrcSliceOrigin;
    Array<index_t, nDim> mDstSliceOrigin;
};
// This version uses TensorCoordinate_deprecated.
// This threadwise copy allows vectorized access of src and dst.
// It allows the dimensions of vectorized access to be different on src and dst.
// It also allows the vector size to be different on src and dst.
// It also allows the order of access to be different on src and dst.
// It uses registers as a buffer to hold all data moving from src to dst.
// It is designed for copying a small amount of data, where src and dst are
// device memory or LDS.
// When copying a large amount of data, let's hope the compiler will reduce the
// registers used for the buffer.
template <typename SrcDesc,
          typename DstDesc,
          typename SliceLengths,
          typename SrcDimAccessOrder,
          typename DstDimAccessOrder,
          index_t SrcVectorAccessDim,
          index_t DstVectorAccessDim,
          index_t SrcDataPerAccess,
          index_t DstDataPerAccess>
struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated
{
    static constexpr index_t nDim = SliceLengths::GetSize();

    using Index = MultiIndex<nDim>;

    using SrcCoordinate = typename TensorCoordinate_deprecated<SrcDesc>::type;
    using DstCoordinate = typename TensorCoordinate_deprecated<DstDesc>::type;

    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1_deprecated(
        const Index& src_slice_origin, const Index& dst_slice_origin)
        : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
    {
        static_assert(nDim == SrcDesc::GetNumOfDimension() &&
                          nDim == DstDesc::GetNumOfDimension() &&
                          nDim == SliceLengths::GetSize() &&
                          nDim == SrcDimAccessOrder::GetSize() &&
                          nDim == DstDimAccessOrder::GetSize(),
                      "wrong! # of dimensions not the same");

        static_assert(is_valid_sequence_map<SrcDimAccessOrder>::value &&
                          is_valid_sequence_map<DstDimAccessOrder>::value,
                      "wrong! map is not valid");

        static_assert(SliceLengths{}[SrcVectorAccessDim] % SrcDataPerAccess == 0 &&
                          SliceLengths{}[DstVectorAccessDim] % DstDataPerAccess == 0,
                      "wrong! cannot evenly divide");

        // check vectorized memory access
        constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
        constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};

        static_if<!SrcDesc::ContainMultipleOriginalDimensions(src_vector_access_dim)>{}(
            [&](auto fwd) {
                static_assert((fwd(SrcDesc{}).GetStride(src_vector_access_dim) == 1 ||
                               SrcDataPerAccess == 1),
                              "wrong! vectorized access is allowed only if stride == 1");
            })
            .Else([&](auto fwd) {
                static_assert(
                    (fwd(SrcDesc{}).GetLastOriginalDimensionStride(src_vector_access_dim) == 1 ||
                     SrcDataPerAccess == 1),
                    "wrong! vectorized access is allowed only if stride == 1");
            });

        static_if<!DstDesc::ContainMultipleOriginalDimensions(dst_vector_access_dim)>{}(
            [&](auto fwd) {
                static_assert((fwd(DstDesc{}).GetStride(dst_vector_access_dim) == 1 ||
                               DstDataPerAccess == 1),
                              "wrong! vectorized access is allowed only if stride == 1");
            })
            .Else([&](auto fwd) {
                static_assert(
                    (fwd(DstDesc{}).GetLastOriginalDimensionStride(dst_vector_access_dim) == 1 ||
                     DstDataPerAccess == 1),
                    "wrong! vectorized access is allowed only if stride == 1");
            });
    }

    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1_deprecated()
        : ThreadwiseGenericTensorSliceCopy_v2r1_deprecated(make_zero_array<index_t, nDim>(),
                                                           make_zero_array<index_t, nDim>())
    {
    }

    __device__ void SetSrcSliceOrigin(SrcCoordinate src_slice_origin)
    {
        mSrcSliceOrigin = src_slice_origin;
    }

    __device__ void SetDstSliceOrigin(DstCoordinate dst_slice_origin)
    {
        mDstSliceOrigin = dst_slice_origin;
    }
<
typename
TDesc
,
class
Lengths
>
struct
IsolateMergedDimLengths
{
template
<
typename
IDim
>
__device__
constexpr
index_t
operator
()(
IDim
idim
)
const
{
return
TDesc
::
ContainMultipleOriginalDimensions
(
idim
)
?
Lengths
{}[
idim
]
:
1
;
}
};
    template <typename SrcData,
              typename DstData,
              AddressSpace SrcAddressSpace,
              AddressSpace DstAddressSpace>
    __device__ void Run(const SrcData* p_src,
                        DstData* p_dst,
                        integral_constant<AddressSpace, SrcAddressSpace>,
                        integral_constant<AddressSpace, DstAddressSpace>) const
    {
        constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});

        SrcData p_src_buffer_[buffer_desc.GetElementSpace()];
        SrcData* p_src_buffer = p_src_buffer_;

        // copy data from src into buffer
        {
            using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;

            constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
            constexpr auto src_data_per_access   = Number<SrcDataPerAccess>{};

            constexpr auto src_access_lengths = SliceLengths::Modify(
                src_vector_access_dim,
                SliceLengths::Get(src_vector_access_dim) / src_data_per_access);

            // Offsets w.r.t. merged dimensions need to be calculated at run-time, while offsets
            // w.r.t. normal dimensions are known at compile time.
            // Below is a hack to isolate merged-dimension ids from normal-dimension ids, so the
            // corresponding offsets can be calculated separately at run-time and compile-time:
            // src_merged_dim_access_lengths has the same value as src_access_lengths on src's
            // merged dimensions, and has value 1 on normal dimensions;
            // src_normal_dim_access_lengths has the same value as src_access_lengths on src's
            // normal dimensions, and has value 1 on merged dimensions.
            constexpr auto src_merged_dim_access_lengths = typename sequence_gen<
                nDim, IsolateMergedDimLengths<SrcDesc, decltype(src_access_lengths)>>::type{};

            constexpr auto src_normal_dim_access_lengths =
                src_access_lengths + Number<1>{} - src_merged_dim_access_lengths;

            ford<decltype(src_merged_dim_access_lengths), SrcDimAccessOrder>{}(
                [&](auto src_merged_dim_access_id) {
                    auto src_merged_dim_data_id = src_merged_dim_access_id;
                    src_merged_dim_data_id(src_vector_access_dim) =
                        src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access;

                    // offset w.r.t. merged dimensions needs to be computed at run-time
                    const index_t src_merged_offset =
                        (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset();

                    ford<decltype(src_normal_dim_access_lengths), SrcDimAccessOrder>{}(
                        [&](auto src_normal_dim_access_id) {
                            auto src_normal_dim_data_id = src_normal_dim_access_id;
                            src_normal_dim_data_id(src_vector_access_dim) =
                                src_normal_dim_access_id[src_vector_access_dim] *
                                src_data_per_access;

                            // offset w.r.t. normal dimensions is known at compile-time
                            const index_t src_normal_offset =
                                SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id);

                            src_vector_t vector_data;

                            // Read a vector from src.
                            // 1. The source-code version can take src from all kinds of
                            //    memory spaces.
                            // 2. The intrinsic version using buffer_load can only take
                            //    src from global memory.
                            //
                            // Comment for loading from global memory:
                            // When:
                            //   1) using source code, in order for the compiler to emit optimal
                            //      load instructions, or
                            //   2) using the buffer_load intrinsic, in order for the ISA to be
                            //      valid,
                            // the following assumptions need to be satisfied:
                            //   1. p_src needs to be block-invariant (assumption)
                            //   2. src_normal_offset must be calculated at compile time
                            //      (guaranteed by the algorithm)
                            //   3. src_merged_offset can be a runtime value (no assumption imposed)
                            static_if<SrcAddressSpace == AddressSpace::Global>{}([&](auto fwd) {
#if CK_USE_AMD_BUFFER_ADDRESSING
                                vector_data = amd_intrinsic_buffer_load<SrcData, SrcDataPerAccess>(
                                    fwd(p_src), src_merged_offset, src_normal_offset);
#else
                                vector_data = *reinterpret_cast<const src_vector_t*>(
                                    &p_src[src_normal_offset + src_merged_offset]);
#endif
                            }).Else([&](auto) {
                                // src can be in any memory space
                                vector_data = *reinterpret_cast<const src_vector_t*>(
                                    &p_src[src_normal_offset + src_merged_offset]);
                            });

                            // unpack the vector into the buffer
                            for(index_t i = 0; i < SrcDataPerAccess; ++i)
                            {
                                auto scalar_id                   = make_zero_array<index_t, nDim>();
                                scalar_id(src_vector_access_dim) = i;

                                const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
                                    src_merged_dim_data_id + src_normal_dim_data_id + scalar_id);

                                p_src_buffer[buffer_offset] =
                                    reinterpret_cast<const SrcData*>(&vector_data)[i];
                            }
                        });
                });
        }
        // type conversion
        // TODO: would the compiler do a good job reusing registers for the buffer?
        DstData p_dst_buffer_[buffer_desc.GetElementSpace()];
        DstData* p_dst_buffer = p_dst_buffer_;

        ford<SliceLengths>{}([&](auto idx) {
            p_dst_buffer[buffer_desc.GetOffsetFromMultiIndex(idx)] =
                type_convert<DstData>{}(p_src_buffer[buffer_desc.GetOffsetFromMultiIndex(idx)]);
        });

        // copy data from buffer into dst
        {
            using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;

            constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
            constexpr auto dst_data_per_access   = Number<DstDataPerAccess>{};

            constexpr auto dst_access_lengths = SliceLengths::Modify(
                dst_vector_access_dim,
                SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);

            constexpr auto dst_merged_dim_access_lengths = typename sequence_gen<
                nDim, IsolateMergedDimLengths<DstDesc, decltype(dst_access_lengths)>>::type{};

            constexpr auto dst_normal_dim_access_lengths =
                dst_access_lengths + Number<1>{} - dst_merged_dim_access_lengths;

            ford<decltype(dst_merged_dim_access_lengths), DstDimAccessOrder>{}(
                [&](auto dst_merged_dim_access_id) {
                    auto dst_merged_dim_data_id = dst_merged_dim_access_id;
                    dst_merged_dim_data_id(dst_vector_access_dim) =
                        dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access;

                    // offset w.r.t. merged dimensions needs to be computed at run-time
                    const index_t dst_merged_offset =
                        (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset();

                    ford<decltype(dst_normal_dim_access_lengths), DstDimAccessOrder>{}(
                        [&](auto dst_normal_dim_access_id) {
                            auto dst_normal_dim_data_id = dst_normal_dim_access_id;
                            dst_normal_dim_data_id(dst_vector_access_dim) =
                                dst_normal_dim_access_id[dst_vector_access_dim] *
                                dst_data_per_access;

                            dst_vector_t vector_data;

                            // pack a vector from the buffer
                            for(index_t i = 0; i < DstDataPerAccess; ++i)
                            {
                                auto scalar_id                   = make_zero_array<index_t, nDim>();
                                scalar_id(dst_vector_access_dim) = i;

                                const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
                                    dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id);

                                reinterpret_cast<DstData*>(&vector_data)[i] =
                                    p_dst_buffer[buffer_offset];
                            }

                            // offset w.r.t. normal dimensions is known at compile-time
                            const index_t dst_normal_offset =
                                DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);

                            // Write the vector into dst.
                            // 1. The source-code version can take dst in all kinds of
                            //    memory spaces.
                            // 2. The intrinsic version using buffer_store can only take
                            //    dst in global memory.
                            //
                            // Comment for storing into global memory:
                            // When:
                            //   1) using source code, in order for the compiler to emit optimal
                            //      store instructions, or
                            //   2) using the buffer_store intrinsic, in order for the ISA to be
                            //      valid,
                            // the following assumptions need to be satisfied:
                            //   1. p_dst needs to be block-invariant (assumption)
                            //   2. dst_normal_offset must be calculated at compile time
                            //      (guaranteed by the algorithm)
                            //   3. dst_merged_offset can be a runtime value (no assumption imposed)
                            static_if<DstAddressSpace == AddressSpace::Global>{}([&](auto fwd) {
#if CK_USE_AMD_BUFFER_ADDRESSING
                                amd_intrinsic_buffer_store<DstData, DstDataPerAccess>(
                                    vector_data, fwd(p_dst), dst_merged_offset, dst_normal_offset);
#else
                                *reinterpret_cast<dst_vector_t*>(
                                    &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
#endif
                            }).Else([&](auto) {
                                // dst can be in any memory space
                                *reinterpret_cast<dst_vector_t*>(
                                    &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
                            });
                        });
                });
        }
    }
    template <typename SrcData, typename DstData>
    __device__ void Run(const SrcData* p_src, DstData* p_dst) const
    {
        constexpr auto generic_address_space =
            integral_constant<AddressSpace, AddressSpace::Generic>{};

        Run(p_src, p_dst, generic_address_space, generic_address_space);
    }

    // T can be Sequence or Array
    template <typename T, bool PositiveDirection>
    __device__ void MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
    {
        static_if<PositiveDirection>{}([&](auto) {
            mSrcSliceOrigin += step_sizes;
        }).Else([&](auto) { mSrcSliceOrigin -= step_sizes; });
    }

    template <typename T, bool PositiveDirection>
    __device__ void MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
    {
        static_if<PositiveDirection>{}([&](auto) {
            mDstSliceOrigin += step_sizes;
        }).Else([&](auto) { mDstSliceOrigin -= step_sizes; });
    }

    private:
    SrcCoordinate mSrcSliceOrigin;
    DstCoordinate mDstSliceOrigin;
};

} // namespace ck
#endif
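The long-vector scheme in ThreadwiseGenericTensorSliceCopy_v1r2_deprecated above (chunk size lcm(SrcDataPerAccess, DstDataPerAccess), load in src-sized pieces, convert element by element, store in dst-sized pieces) can be shown in a stripped-down, host-only form. The 1-D sketch below is an illustration only, not repository code, and ignores descriptors, access orders, and address spaces.

// Host-side 1-D analogue of the long-vector copy; chunking logic mirrors the kernel,
// everything else is simplified for illustration.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

template <std::size_t SrcPerAccess, std::size_t DstPerAccess, typename SrcT, typename DstT>
void copy_1d(const std::vector<SrcT>& src, std::vector<DstT>& dst)
{
    // working chunk = least common multiple of the two access widths
    constexpr std::size_t long_vec = std::lcm(SrcPerAccess, DstPerAccess);

    for(std::size_t begin = 0; begin + long_vec <= src.size(); begin += long_vec)
    {
        SrcT src_chunk[long_vec];
        DstT dst_chunk[long_vec];

        // load in SrcPerAccess-wide pieces (stand-in for the vectorized src reads)
        for(std::size_t i = 0; i < long_vec; i += SrcPerAccess)
            std::copy_n(&src[begin + i], SrcPerAccess, &src_chunk[i]);

        // per-element type conversion, as in the original
        for(std::size_t i = 0; i < long_vec; ++i)
            dst_chunk[i] = static_cast<DstT>(src_chunk[i]);

        // store in DstPerAccess-wide pieces (stand-in for the vectorized dst writes)
        for(std::size_t i = 0; i < long_vec; i += DstPerAccess)
            std::copy_n(&dst_chunk[i], DstPerAccess, &dst[begin + i]);
    }
}

int main()
{
    std::vector<float> src(12);
    std::iota(src.begin(), src.end(), 0.0f);
    std::vector<double> dst(12, 0.0);

    copy_1d<4, 2>(src, dst);      // lcm(4, 2) == 4, so chunks of 4 elements
    std::cout << dst[11] << '\n'; // prints 11
}

Using the lcm as the chunk size is what lets both sides keep their preferred vector width while touching each element exactly once, which is also why the slice length along the vector dimension must be divisible by that lcm (the static_assert in the constructor).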
composable_kernel/include/utility/math.hpp
View file @ 6fc49f91
...
@@ -3,6 +3,7 @@
 #include "config.hpp"
 #include "integral_constant.hpp"
+#include "number.hpp"
 #include "type.hpp"

 namespace ck {
...
driver/CMakeLists.txt
View file @ 6fc49f91
 set(TENSOR_SOURCE
-    src/tensor.cpp;
+    src/host_tensor.cpp;
     src/device.cpp;
 )
...
@@ -25,8 +25,6 @@ elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
 endif()

 add_executable(conv_driver ${CONV_SOURCE})
-add_executable(col2im_driver ${COL2IM_SOURCE})
 add_executable(conv_bwd_data_driver ${CONV_BWD_DATA_SOURCE})

 target_link_libraries(conv_driver PRIVATE host)
-target_link_libraries(col2im_driver PRIVATE host)
 target_link_libraries(conv_bwd_data_driver PRIVATE host)
driver/include/conv_common.hpp
View file @ 6fc49f91
 #ifndef CONV_COMMON_HPP
 #define CONV_COMMON_HPP

-#include "ConstantTensorDescriptor_deprecated.hpp"
 #include "tensor_descriptor.hpp"

-template <class InDesc,
-          class WeiDesc,
-          class ConvStrides,
-          class ConvDilations,
-          class LowerPads,
-          class UpperPads>
-constexpr auto get_convolution_output_default_4d_tensor_descriptor_deprecated(
-    InDesc, WeiDesc, ConvStrides, ConvDilations, LowerPads, UpperPads)
-{
-    using namespace ck;
-
-    constexpr auto in_desc  = InDesc{};
-    constexpr auto wei_desc = WeiDesc{};
-
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    static_assert(in_desc.GetNumOfDimension() == 4, "input nDim is not 4");
-    static_assert(wei_desc.GetNumOfDimension() == 4, "weight nDim is not 4");
-    static_assert(in_desc.GetLength(I1) == wei_desc.GetLength(I1),
-                  "input & weight dimension not consistent");
-
-    constexpr index_t N  = in_desc.GetLength(I0);
-    constexpr index_t Hi = in_desc.GetLength(I2);
-    constexpr index_t Wi = in_desc.GetLength(I3);
-
-    constexpr index_t K = wei_desc.GetLength(I0);
-    constexpr index_t Y = wei_desc.GetLength(I2);
-    constexpr index_t X = wei_desc.GetLength(I3);
-
-    constexpr index_t HPadLow = LowerPads{}.Get(I0);
-    constexpr index_t WPadLow = LowerPads{}.Get(I1);
-
-    constexpr index_t HPadUp = UpperPads{}.Get(I0);
-    constexpr index_t WPadUp = UpperPads{}.Get(I1);
-
-    constexpr index_t YEff = (Y - 1) * ConvDilations{}[0] + 1;
-    constexpr index_t XEff = (X - 1) * ConvDilations{}[1] + 1;
-
-    constexpr index_t Ho = (Hi + HPadLow + HPadUp - YEff) / ConvStrides{}[0] + 1;
-    constexpr index_t Wo = (Wi + WPadLow + WPadUp - XEff) / ConvStrides{}[1] + 1;
-
-    return make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});
-}
-
 template <class InDesc,
           class WeiDesc,
           class ConvStrides,
...
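For readers skimming the diff, the arithmetic inside the deleted helper is the usual convolution output-size formula: the effective filter size is (filter − 1) · dilation + 1, and the output size is (input + pad_low + pad_up − effective) / stride + 1. A minimal standalone restatement (illustrative numbers only, not taken from the repository):

#include <iostream>

// Same output-size arithmetic as the deleted helper, written as plain runtime code.
int conv_out_size(int in, int pad_low, int pad_up, int filter, int dilation, int stride)
{
    const int eff = (filter - 1) * dilation + 1; // effective (dilated) filter size
    return (in + pad_low + pad_up - eff) / stride + 1;
}

int main()
{
    // Hi = 28, symmetric padding 1, 3x3 filter, dilation 1, stride 1 -> Ho = 28
    std::cout << conv_out_size(28, 1, 1, 3, 1, 1) << '\n';
    // Wi = 28, no padding, filter 3, dilation 2 (XEff = 5), stride 2 -> Wo = 12
    std::cout << conv_out_size(28, 0, 0, 3, 2, 2) << '\n';
}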
driver/include/device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp
View file @ 6fc49f91
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
-#include "tensor.hpp"
+#include "host_tensor.hpp"
 #include "gridwise_operation_wrapper.hpp"
 #include "gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp"
...
driver/include/device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw.hpp
View file @ 6fc49f91
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
-#include "tensor.hpp"
+#include "host_tensor.hpp"
 #include "gridwise_operation_wrapper.hpp"
 #include "gridwise_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw_lds_double_buffer.hpp"
...
driver/include/device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp
View file @ 6fc49f91
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
-#include "tensor.hpp"
+#include "host_tensor.hpp"
 #include "gridwise_operation_wrapper.hpp"
 #include "gridwise_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp"
...
driver/include/device_convolution_backward_data_implicit_gemm_v3r1_nchw_kcyx_nkhw.hpp
View file @ 6fc49f91
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
-#include "tensor.hpp"
+#include "host_tensor.hpp"
 #include "gridwise_operation_wrapper.hpp"
 #include "gridwise_convolution_backward_data_implicit_gemm_v3r1_nchw_kcyx_nkhw.hpp"
...
driver/include/device_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
View file @ 6fc49f91
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
-#include "tensor.hpp"
+#include "host_tensor.hpp"
 #include "gridwise_operation_wrapper.hpp"
 #include "gridwise_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
...
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
View file @ 6fc49f91
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
-#include "tensor.hpp"
+#include "host_tensor.hpp"
 #include "gridwise_operation_wrapper.hpp"
 #include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp"
...
driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
View file @ 6fc49f91
 #include <unistd.h>
 #include "device.hpp"
-#include "tensor.hpp"
+#include "host_tensor.hpp"
 #include "gridwise_operation_wrapper.hpp"
 #include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
...
driver/include/device_tensor.hpp
View file @ 6fc49f91
 #pragma once
-#include "tensor.hpp"
+#include "host_tensor.hpp"
 #include "common_header.hpp"
-#include "ConstantTensorDescriptor_deprecated.hpp"
 #include "tensor_descriptor.hpp"

-template <typename ConstTensorDesc, std::size_t... Is>
-auto make_TensorDescriptor_impl(ConstTensorDesc, std::integer_sequence<std::size_t, Is...>)
+template <typename TensorDesc, std::size_t... Is>
+auto make_HostTensorDescriptor_impl(TensorDesc, std::integer_sequence<std::size_t, Is...>)
 {
-    std::initializer_list<std::size_t> lengths = {ConstTensorDesc::GetLengths()[Is]...};
-    std::initializer_list<std::size_t> strides = {ConstTensorDesc::GetStrides()[Is]...};
+    std::initializer_list<std::size_t> lengths = {TensorDesc::GetLengths()[Is]...};
+    std::initializer_list<std::size_t> strides = {TensorDesc::GetStrides()[Is]...};

-    return TensorDescriptor(lengths, strides);
+    return HostTensorDescriptor(lengths, strides);
 }

-template <typename ConstTensorDesc>
-auto make_TensorDescriptor(ConstTensorDesc)
+template <typename TensorDesc>
+auto make_HostTensorDescriptor(TensorDesc)
 {
-    return make_TensorDescriptor_impl(
-        ConstTensorDesc{},
-        std::make_integer_sequence<std::size_t, ConstTensorDesc::GetNumOfDimension()>{});
+    return make_HostTensorDescriptor_impl(
+        TensorDesc{},
+        std::make_integer_sequence<std::size_t, TensorDesc::GetNumOfDimension()>{});
 }

-template <typename ConstTensorDesc>
-void ostream_ConstantTensorDescriptor(ConstTensorDesc, std::ostream& os = std::cout)
+template <typename TensorDesc>
+void ostream_tensor_descriptor(TensorDesc, std::ostream& os = std::cout)
 {
-    ostream_TensorDescriptor(make_TensorDescriptor(ConstTensorDesc{}), os);
+    ostream_HostTensorDescriptor(make_HostTensorDescriptor(TensorDesc{}), os);
 }
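The helper above bridges a compile-time descriptor and the runtime host descriptor by expanding the descriptor's lengths and strides through an index sequence. A minimal, self-contained sketch of that pattern follows; MiniDesc and lengths_to_vector are invented for illustration and are not part of the repository.

// Sketch of expanding a compile-time lengths container into a runtime std::vector
// via std::index_sequence, the same idea make_HostTensorDescriptor_impl relies on.
#include <array>
#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

struct MiniDesc // stand-in for a compile-time tensor descriptor
{
    static constexpr std::array<std::size_t, 4> GetLengths() { return {8, 3, 32, 32}; }
    static constexpr std::array<std::size_t, 4> GetStrides() { return {3072, 1024, 32, 1}; }
};

template <typename Desc, std::size_t... Is>
std::vector<std::size_t> lengths_to_vector(Desc, std::index_sequence<Is...>)
{
    return {Desc::GetLengths()[Is]...}; // pack expansion over the dimension indices
}

int main()
{
    auto lens = lengths_to_vector(MiniDesc{}, std::make_index_sequence<4>{});
    std::cout << lens[0] << ' ' << lens[3] << '\n'; // prints "8 32"
}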
driver/include/host_conv.hpp
View file @ 6fc49f91
 #pragma once
-#include "tensor.hpp"
+#include "host_tensor.hpp"

 template <class TIn,
           class TWei,
...
driver/include/host_conv_bwd_data.hpp
View file @ 6fc49f91
 #pragma once
-#include "tensor.hpp"
+#include "host_tensor.hpp"

 template <typename TIn,
           typename TWei,
...
driver/include/tensor.hpp → driver/include/host_tensor.hpp
View file @ 6fc49f91
-#ifndef TENSOR_HPP
-#define TENSOR_HPP
+#ifndef HOST_TENSOR_HPP
+#define HOST_TENSOR_HPP

 #include <thread>
 #include <vector>
...
@@ -65,26 +65,26 @@ auto construct_f_unpack_args(F, T args)
     return construct_f_unpack_args_impl<F>(args, std::make_index_sequence<N>{});
 }

-struct TensorDescriptor
+struct HostTensorDescriptor
 {
-    TensorDescriptor() = delete;
+    HostTensorDescriptor() = delete;

     template <typename X>
-    TensorDescriptor(std::vector<X> lens);
+    HostTensorDescriptor(std::vector<X> lens);

     template <typename X, typename Y>
-    TensorDescriptor(std::vector<X> lens, std::vector<Y> strides);
+    HostTensorDescriptor(std::vector<X> lens, std::vector<Y> strides);

     void CalculateStrides();

     template <class Range>
-    TensorDescriptor(const Range& lens) : mLens(lens.begin(), lens.end())
+    HostTensorDescriptor(const Range& lens) : mLens(lens.begin(), lens.end())
     {
         this->CalculateStrides();
     }

     template <class Range1, class Range2>
-    TensorDescriptor(const Range1& lens, const Range2& strides)
+    HostTensorDescriptor(const Range1& lens, const Range2& strides)
         : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
     {
     }
...
@@ -205,7 +205,7 @@ struct Tensor
     {
     }

-    Tensor(const TensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {}
+    Tensor(const HostTensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {}

     template <class G>
     void GenerateTensorValue(G g, std::size_t num_thread = 1)
...
@@ -267,11 +267,11 @@ struct Tensor
     typename std::vector<T>::const_iterator end() const { return mData.end(); }

-    TensorDescriptor mDesc;
+    HostTensorDescriptor mDesc;
     std::vector<T> mData;
 };

-void ostream_TensorDescriptor(const TensorDescriptor& desc, std::ostream& os = std::cout)
+void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os = std::cout)
 {
     os << "dim " << desc.GetNumOfDimension() << ", ";
...
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment