gaoqiong / composable_kernel / Commits / 724e984b

Commit 724e984b, authored Sep 11, 2019 by Chao Liu (parent ca42e910)

    enabling padding for chwn format

Showing 12 changed files with 810 additions and 326 deletions (+810 -326)
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp  +169 -200
composable_kernel/include/tensor_description/multi_index_transform.hpp  +56 -9
composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp  +98 -44
composable_kernel/include/tensor_description/tensor_descriptor.hpp  +69 -9
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp  +143 -20
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp  +160 -7
composable_kernel/include/utility/array.hpp  +3 -14
composable_kernel/include/utility/array_helper.hpp  +86 -2
composable_kernel/include/utility/config_nvidia.hpp.in  +9 -2
composable_kernel/include/utility/sequence.hpp  +2 -4
driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp  +1 -1
driver/src/driver.cpp  +14 -14
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp
(diff collapsed)
composable_kernel/include/tensor_description/multi_index_transform.hpp

@@ -22,17 +22,27 @@ struct PassThrough
     __host__ __device__ static constexpr auto GetUpperLengths() { return Sequence<Length>{}; }

-    __host__ __device__ static constexpr auto CalculateLowerIndex(UpperIndex idx_up)
+    __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
     {
         return idx_up;
     }

-    __host__ __device__ static constexpr auto CalculateLowerIndexDiff(UpperIndex idx_up_diff)
+    __host__ __device__ static constexpr auto
+    CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
+                            const UpperIndex& /* idx_up_old */,
+                            const LowerIndex& /* idx_low_old */)
     {
         return idx_up_diff;
     }

     __host__ __device__ static constexpr bool IsLinearTransform() { return true; }

+    // TODO: should this function be here? should it be specific for padding check?
+    __host__ __device__ static constexpr bool IsUpperIndexInPaddingArea(const UpperIndex& /* idx_up */)
+    {
+        return false;
+    }
 };

 // LowLengths: Sequence<...>
@@ -55,17 +65,39 @@ struct Pad
         return GetLowerLengths() + LeftPads{} + RightPads{};
     }

-    __host__ __device__ static constexpr auto CalculateLowerIndex(UpperIndex idx_up)
+    __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
     {
         return idx_up - LeftPads{};
     }

-    __host__ __device__ static constexpr auto CalculateLowerIndexDiff(UpperIndex idx_up_diff)
+    __host__ __device__ static constexpr auto
+    CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
+                            const UpperIndex& /* idx_up_old */,
+                            const LowerIndex& /* idx_low_old */)
     {
         return idx_up_diff;
     }

     __host__ __device__ static constexpr bool IsLinearTransform() { return true; }

+    // TODO: should this function be here? should it be specific for padding check?
+    __host__ __device__ constexpr bool IsUpperIndexInPaddingArea(const UpperIndex& idx_up) const
+    {
+        bool flag = false;
+
+        static_for<0, nDim, 1>{}([&](auto idim) {
+            // only check if there is left-padding
+            static_if<(LeftPads::At(idim) != 0)>{}(
+                [&](auto) { flag = flag || idx_up[idim] < LeftPads::At(idim); });
+
+            // only check if there is right-padding
+            static_if<(RightPads::At(idim) != 0)>{}([&](auto) {
+                flag = flag || idx_up[idim] >= LeftPads::At(idim) + LowLengths::At(idim);
+            });
+        });
+
+        return flag;
+    }
 };

 // LowLengths: Sequence<...>
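The Pad transform above maps a padded (upper) index to a data (lower) index by subtracting LeftPads, and its new IsUpperIndexInPaddingArea flags indices that fall in a margin. A minimal host-side sketch of the same per-dimension check, in plain C++ rather than ck's static_for/static_if machinery (names here are illustrative, not ck's API):

#include <cassert>

// One-dimensional version of the padding-area test: an upper index lies in
// padding if it is left of the data window (idx_up < left_pad) or right of it
// (idx_up >= left_pad + low_length).
bool is_in_padding_area(int idx_up, int left_pad, int right_pad, int low_length)
{
    bool flag = false;
    if(left_pad != 0) // only check if there is left-padding
        flag = flag || (idx_up < left_pad);
    if(right_pad != 0) // only check if there is right-padding
        flag = flag || (idx_up >= left_pad + low_length);
    return flag;
}

int main()
{
    // lower length 4 with pads (1, 2): upper indices run 0..6, data window is 1..4
    assert(is_in_padding_area(0, 1, 2, 4));  // left margin
    assert(!is_in_padding_area(1, 1, 2, 4)); // first data element
    assert(!is_in_padding_area(4, 1, 2, 4)); // last data element
    assert(is_in_padding_area(5, 1, 2, 4));  // right margin
}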
@@ -124,7 +156,7 @@ struct Merge
             .PushBack(Number<1>{});

         // calculate index in each of the dimensions in the order of their dimension
 #if 1
+        // would compile to same ISA?
         static_for<0, nDimLow - 1, 1>{}(
             lambda_CalculateLowerIndex<decltype(pseudo_low_strides)>(itmp, idx_low));
@@ -138,8 +170,10 @@ struct Merge
     }

     // idx_low_diff depends on idx_low_old, so idx_low need to be up-to-date
-    __host__ __device__ static constexpr auto CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
-                                                                      const LowerIndex& idx_low_old)
+    __host__ __device__ static constexpr auto
+    CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
+                            const UpperIndex& /* idx_up_old */,
+                            const LowerIndex& idx_low_old)
     {
         LowerIndex idx_low_diff;
@@ -149,6 +183,13 @@ struct Merge
     }

     __host__ __device__ static constexpr bool IsLinearTransform() { return false; }

+    // TODO: should this function be here? should it be specific for padding check?
+    __host__ __device__ static constexpr bool IsUpperIndexInPaddingArea(const UpperIndex& /* idx_up */)
+    {
+        return false;
+    }
 };

 // UpLengths: Sequence<...>
@@ -189,7 +230,10 @@ struct Unmerge
         return idx_low;
     }

-    __host__ __device__ static constexpr auto CalculateLowerIndexDiff(const UpperIndex& idx_up_diff)
+    __host__ __device__ static constexpr auto
+    CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
+                            const UpperIndex& /* idx_up_old */,
+                            const LowerIndex& /* idx_low_old */)
     {
         return CalculateLowerIndex(idx_up_diff);
     }
@@ -240,7 +284,10 @@ struct Embed
         return idx_low;
     }

-    __host__ __device__ static constexpr auto CalculateLowerIndexDiff(const UpperIndex& idx_up_diff)
+    __host__ __device__ static constexpr auto
+    CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
+                            const UpperIndex& /* idx_up_old */,
+                            const LowerIndex& /* idx_low_old */)
     {
         LowerIndex idx_low_diff{0};
composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp

@@ -3,26 +3,28 @@
 #include "common_header.hpp"
 #include "dimension.hpp"
-#include "dimension_transform.hpp"
+#include "multi_index_transform.hpp"
 #include "tensor_descriptor.hpp"

 namespace ck {

-template <class NativeTensorDesc>
+template <typename TensorDesc>
+struct TensorCoordinate_v2;
+
+template <typename NativeTensorDesc>
 struct NativeTensorCoordinate
 {
     using type             = NativeTensorCoordinate;
     using tensor_desc_type = NativeTensorDesc;

-    using Index = tensor_desc_type::Index;
-    static constexpr index_t nDim = Index::GetSize();
+    static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();
+    using Index                   = MultiIndex<nDim>;

     __host__ __device__ constexpr NativeTensorCoordinate(Index idx)
-        : mOffset{GetTensorDesriptor().GetOffset(idx)}
+        : mIndex(idx), mOffset(tensor_desc_type::CalculateOffset(idx))
     {
     }

-    template <class... Xs>
+    template <typename... Xs>
     __host__ __device__ constexpr NativeTensorCoordinate(Xs... xs)
         : NativeTensorCoordinate(Index{xs...})
     {
@@ -36,82 +38,103 @@ struct NativeTensorCoordinate
...
@@ -36,82 +38,103 @@ struct NativeTensorCoordinate
__host__
__device__
static
constexpr
auto
GetTensorDescriptor
()
{
return
tensor_desc_type
{};
}
__host__
__device__
static
constexpr
auto
GetTensorDescriptor
()
{
return
tensor_desc_type
{};
}
__host__
__device__
constexpr
index_t
GetOffset
()
const
{
return
mOffset
;
}
__host__
__device__
constexpr
const
Index
&
GetIndex
()
const
{
return
mIndex
;
}
__host__
__device__
constexpr
const
index_t
&
GetOffset
()
const
{
return
mOffset
;
}
__host__
__device__
type
operator
+=
(
Index
idx_diff
)
__host__
__device__
constexpr
type
operator
+=
(
const
Index
&
idx_diff
)
{
{
mOffset
+=
tensor_desc_type
::
GetOffsetDiff
(
idx_diff
);
// mIndex is updated here, but some (or all) of its entries may never be used
mIndex
+=
idx_diff
;
mOffset
+=
tensor_desc_type
::
CalculateOffset
(
idx_diff
);
return
*
this
;
return
*
this
;
}
}
__host__
__device__
type
operator
-=
(
Index
idx_diff
)
__host__
__device__
constexpr
type
operator
-=
(
const
Index
&
idx_diff
)
{
{
mOffset
-=
tensor_desc_type
::
GetOffsetFromMultiIndex
(
idx_diff
);
// mIndex is updated here, but some (or all) of its entries may never be used
mIndex
-=
idx_diff
;
mOffset
-=
tensor_desc_type
::
CalculateOffset
(
idx_diff
);
return
*
this
;
return
*
this
;
}
}
__host__
__device__
constexpr
type
operator
+
(
Index
idx_diff
)
const
__host__
__device__
constexpr
type
operator
+
(
const
Index
&
idx_diff
)
const
{
{
type
coord
=
*
this
;
type
coord
=
*
this
;
coord
+=
idx_diff
;
coord
+=
idx_diff
;
return
coord
;
return
coord
;
}
}
__host__
__device__
constexpr
type
operator
-
(
Index
idx_diff
)
const
__host__
__device__
constexpr
type
operator
-
(
const
Index
&
idx_diff
)
const
{
{
type
coord
=
*
this
;
type
coord
=
*
this
;
coord
-=
idx_diff
;
coord
-=
idx_diff
;
return
coord
;
return
coord
;
}
}
// TODO: should this function be here? should it be specific for padding check?
__host__
__device__
static
constexpr
bool
IsAnyLevelIndexInPaddingArea
()
{
return
false
;
}
private:
private:
// mIndex may be saved and update, however, the value of some (or all) of its entries may
// never be used. Compiler should be able to remove these entries as well as its calculation
// as dead code.
// TODO: make sure compiler indeed remove these dead code
Index
mIndex
;
index_t
mOffset
;
index_t
mOffset
;
};
};
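NativeTensorCoordinate now stores the multi-index alongside a cached linear offset, and operator+= updates the offset from the index difference alone; this works because a native descriptor's offset is linear in the index, so offset(idx + diff) == offset(idx) + offset(diff). A standalone sketch of that invariant (illustrative names, 2-D row-major; not ck code):

#include <array>
#include <cassert>

// offset of a 2-D index under fixed strides; a linear function of the index
int offset(std::array<int, 2> idx, std::array<int, 2> strides)
{
    return idx[0] * strides[0] + idx[1] * strides[1];
}

int main()
{
    const std::array<int, 2> strides{8, 1}; // row-major, row length 8
    std::array<int, 2> idx{3, 2};
    int cached = offset(idx, strides);

    // move the coordinate by a diff and update the cached offset incrementally,
    // as NativeTensorCoordinate::operator+= does
    const std::array<int, 2> diff{1, 4};
    idx = {idx[0] + diff[0], idx[1] + diff[1]};
    cached += offset(diff, strides);

    assert(cached == offset(idx, strides)); // linearity keeps the cache exact
}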
-template <class TransformedTensorDesc>
+template <typename TransformedTensorDesc>
 struct TransformedTensorCoordinate
 {
-    using type             = TransformedTensorCoordinate;
     using tensor_desc_type = TransformedTensorDesc;
-    using Index            = tensor_desc_type::UpperIndex;
-    using lower_coordinate_type =
-        TensorCoordiante_v2<decltype(GetTensorDescriptor().GetLowerTensorDescriptor())>::type;
-    static constexpr index_t nDim = Index::GetSize();
+    using LowerCoord = typename TensorCoordinate_v2<decltype(
+        tensor_desc_type::GetLowerTensorDescriptor())>::type;
+    using UpperCoord = TransformedTensorCoordinate;
+
+    static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();
+    using UpperIndex              = MultiIndex<nDim>;

-    __host__ __device__ constexpr TransformedTensorCoordinate(Index idx)
-        : mIndex{idx}, mCoordLow{GetTensorDescriptor().GetLowerIndex(idx)}
+    __host__ __device__ constexpr TransformedTensorCoordinate(UpperIndex idx)
+        : mIndexUp{idx}, mCoordLow{tensor_desc_type::CalculateLowerIndex(idx)}
     {
     }

-    template <class... Xs>
+    template <typename... Xs>
     __host__ __device__ constexpr TransformedTensorCoordinate(Xs... xs)
-        : TransformedTensorCoordinate(Index{xs...})
+        : TransformedTensorCoordinate(UpperIndex{xs...})
     {
     }

     template <index_t... Xs>
     __host__ __device__ constexpr TransformedTensorCoordinate(Sequence<Xs...>)
-        : TransformedTensorCoordinate(Index{Xs...})
+        : TransformedTensorCoordinate(UpperIndex{Xs...})
     {
     }

     __host__ __device__ static constexpr auto GetTensorDescriptor() { return tensor_desc_type{}; }

-    __host__ __device__ constexpr index_t GetOffset() const { return mCoordLow.GetOffset(); }
+    __host__ __device__ constexpr const LowerCoord& GetLowerCoordinate() const { return mCoordLow; }
+
+    __host__ __device__ constexpr const UpperIndex& GetUpperIndex() const { return mIndexUp; }

-    __host__ __device__ constexpr Index GetIndex() const { return mIndex; }
+    __host__ __device__ constexpr const UpperIndex& GetIndex() const { return GetUpperIndex(); }
+
+    __host__ __device__ constexpr const index_t& GetOffset() const
+    {
+        return GetLowerCoordinate().GetOffset();
+    }

-    __host__ __device__ type operator+=(Index idx_up_diff)
+    __host__ __device__ constexpr UpperCoord operator+=(const UpperIndex& idx_up_diff)
     {
         // For transformation of multi-index difference, not all transformation functions need to
         // know the old lower-index or the old upper-index. We pass both of them to the
         // transformation function. The transformation function itself decides to use them or not.
-        mCoordLow += tensor_desc_type::GetLowerIndexDiff(idx_up_diff, mIndexUp, mCoordLow.GetIndex());
+        mCoordLow += tensor_desc_type::CalculateLowerIndexDiff(
+            idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());

         // mIndexUp is updated here, but some (or all) of its entries may never be used
         mIndexUp += idx_up_diff;
@@ -119,11 +142,35 @@ struct TransformedTensorCoordinate
         return *this;
     }

-    __host__ __device__ constexpr type operator+(Index idx_up_diff) const
-    {
-        type coord = *this;
-        coord += idx_diff;
-        return coord;
-    }
+    __host__ __device__ constexpr UpperCoord operator-=(const UpperIndex& idx_up_diff)
+    {
+        mCoordLow -= tensor_desc_type::CalculateLowerIndexDiff(
+            idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());
+
+        mIndexUp -= idx_up_diff;
+
+        return *this;
+    }
+
+    __host__ __device__ constexpr UpperCoord operator+(const UpperIndex& idx_up_diff) const
+    {
+        UpperCoord coord_up = *this;
+        coord_up += idx_up_diff;
+        return coord_up;
+    }
+
+    __host__ __device__ constexpr UpperCoord operator-(const UpperIndex& idx_up_diff) const
+    {
+        UpperCoord coord_up = *this;
+        coord_up -= idx_up_diff;
+        return coord_up;
+    }
+
+    // TODO: should this function be here? should it be specific for padding check?
+    __host__ __device__ constexpr bool IsAnyLevelIndexInPaddingArea() const
+    {
+        return tensor_desc_type::IsUpperIndexInPaddingArea(GetIndex()) ||
+               mCoordLow.IsAnyLevelIndexInPaddingArea();
+    }

     private:
@@ -131,22 +178,22 @@ struct TransformedTensorCoordinate
...
@@ -131,22 +178,22 @@ struct TransformedTensorCoordinate
// never be used. Compiler should be able to remove these entries as well as its calculation
// never be used. Compiler should be able to remove these entries as well as its calculation
// as dead code.
// as dead code.
// TODO: make sure compiler indeed remove these dead code
// TODO: make sure compiler indeed remove these dead code
Index
mIndexUp
;
Upper
Index
mIndexUp
;
l
ower
_c
oord
inate_type
mCoordLow
;
L
ower
C
oord
mCoordLow
;
};
};
template
<
class
TensorDesc
>
template
<
typename
TensorDesc
>
struct
TensorCoordinate_v2
struct
TensorCoordinate_v2
{
{
private:
private:
template
<
class
...
Ts
>
template
<
typename
...
Ts
>
__host__
__device__
static
constexpr
auto
__host__
__device__
static
constexpr
auto
MakeDummyTensorCoordinate
(
NativeTensorDescriptor
<
Ts
...
>
)
MakeDummyTensorCoordinate
(
NativeTensorDescriptor
<
Ts
...
>
)
{
{
return
NativeTensorCoordinate
<
NativeTensorDescriptor
<
Ts
...
>>
();
return
NativeTensorCoordinate
<
NativeTensorDescriptor
<
Ts
...
>>
();
}
}
template
<
class
...
Ts
>
template
<
typename
...
Ts
>
__host__
__device__
static
constexpr
auto
__host__
__device__
static
constexpr
auto
MakeDummyTensorCoordinate
(
TransformedTensorDescriptor
<
Ts
...
>
)
MakeDummyTensorCoordinate
(
TransformedTensorDescriptor
<
Ts
...
>
)
{
{
...
@@ -156,5 +203,12 @@ struct TensorCoordinate_v2
...
@@ -156,5 +203,12 @@ struct TensorCoordinate_v2
public:
public:
using
type
=
decltype
(
MakeDummyTensorCoordinate
(
TensorDesc
{}));
using
type
=
decltype
(
MakeDummyTensorCoordinate
(
TensorDesc
{}));
};
};
template
<
typename
TensorDesc
>
__host__
__device__
constexpr
auto
make_tensor_coordinate_v2
(
TensorDesc
,
MultiIndex
<
TensorDesc
::
GetNumOfDimension
()
>
idx
)
{
return
typename
TensorCoordinate_v2
<
TensorDesc
>::
type
(
idx
);
}
}
}
#endif
#endif
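The comment inside operator+= explains why the diff transformation now takes three arguments: linear transforms (PassThrough, Pad) can ignore the old indices, but Merge needs the old lower index because a step in the merged dimension can carry across the lower dimensions. A toy sketch of that calling convention for a two-dimensional merge (illustrative, not ck's implementation):

#include <array>
#include <cassert>

// Toy Merge: upper index u <-> lower index (l0, l1), l1 fastest, u = l0 * W + l1.
// The lower-index *difference* for a given upper-index difference depends on the
// old indices (carries), which is why ck passes idx_up_diff, idx_up_old and
// idx_low_old to the transform.
constexpr int W = 4;

std::array<int, 2> lower_index_diff(int idx_up_diff, int idx_up_old,
                                    std::array<int, 2> idx_low_old)
{
    const int u_new = idx_up_old + idx_up_diff;
    const std::array<int, 2> low_new{u_new / W, u_new % W};
    return {low_new[0] - idx_low_old[0], low_new[1] - idx_low_old[1]};
}

int main()
{
    // old upper index 6 -> lower (1, 2); stepping by +3 crosses a row boundary,
    // so the lower diff is (+1, -1), not (0, +3)
    const auto d = lower_index_diff(3, 6, {1, 2});
    assert(d[0] == 1 && d[1] == -1);
}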
composable_kernel/include/tensor_description/tensor_descriptor.hpp

@@ -64,6 +64,18 @@ struct NativeTensorDescriptor
         return GetStrides(typename arithmetic_sequence_gen<0, nDim, 1>::type{});
     }

+    __host__ __device__ static constexpr index_t GetElementSize()
+    {
+        return accumulate_on_sequence(GetLengths(), math::multiplies<index_t>{}, Number<1>{});
+    }
+
+    __host__ __device__ static constexpr index_t GetElementSpace()
+    {
+        return accumulate_on_sequence(
+            (GetLengths() - Number<1>{}) * GetStrides(), math::plus<index_t>{}, Number<1>{});
+    }
+
+    // TODO: this cannot return constepxr because of use of lambda
     __host__ __device__ static constexpr index_t CalculateOffset(const Index& idx)
     {
         index_t offset = 0;
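GetElementSpace above computes how many elements a possibly strided tensor spans: one plus the sum over dimensions of (length - 1) * stride, which is exactly what the accumulate_on_sequence call folds up at compile time. A quick host-side check of the same arithmetic (illustrative):

#include <cassert>

int main()
{
    // lengths {2, 3}, strides {6, 2}: the largest reachable offset is
    // (2 - 1) * 6 + (3 - 1) * 2 = 10, so the tensor spans 11 elements
    const int lengths[2] = {2, 3};
    const int strides[2] = {6, 2};

    int element_space = 1;
    for(int d = 0; d < 2; ++d)
        element_space += (lengths[d] - 1) * strides[d];

    assert(element_space == 11);

    // GetElementSize, by contrast, is the plain product of the lengths
    assert(lengths[0] * lengths[1] == 6);
}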
@@ -73,6 +85,12 @@ struct NativeTensorDescriptor
         return offset;
     }

+    // TODO: remove this
+    __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(const Index& idx)
+    {
+        return CalculateOffset(idx);
+    }
+
     __host__ __device__ static constexpr index_t CalculateOffsetDiff(const Index& idx_diff)
     {
         index_t offset_diff = 0;

@@ -100,6 +118,12 @@ struct NativeTensorDescriptor
     {
         return Tuple<>{};
     }

+    // TODO: should this function be here? should it be specific for padding check?
+    __host__ __device__ static constexpr bool IsUpperIndexInPaddingArea(const Index& /* idx */)
+    {
+        return false;
+    }
 };

 // LowerTensorDescriptor
@@ -248,6 +272,17 @@ struct TransformedTensorDescriptor
         return GetLengths(Sequence<IDim, IDims...>{});
     }

+    __host__ __device__ static constexpr index_t GetElementSize()
+    {
+        return accumulate_on_sequence(GetLengths(), math::multiplies<index_t>{}, Number<1>{});
+    }
+
+    __host__ __device__ static constexpr index_t GetElementSpace()
+    {
+        // TODO: Is this the correct definition for transformed tensor?
+        return GetLowerTensorDescriptor().GetElementSpace();
+    }
+
     // TODO: right now return value is constexpr because use of non-constepxr lambda
     __host__ __device__ static constexpr LowerIndex CalculateLowerIndex(const UpperIndex& idx_up)
     {
@@ -256,8 +291,8 @@ struct TransformedTensorDescriptor
         static_for<0, nTransform, 1>{}([&](auto itran) {
             constexpr auto tran = Transforms{}.At(itran);

-            auto idx_low_part = pick_array_element(idx_low, LowDimensionIds{}.At(itran));
             const auto idx_up_part = pick_array_element(idx_up, UpDimensionIds{}.At(itran));
+            auto idx_low_part = pick_array_element(idx_low, LowDimensionIds{}.At(itran));

             // this assume each lower (single) index is only assocaited with one transformation,
             // which is required for index transformation, and has been checked during constructor
@@ -269,26 +304,29 @@ struct TransformedTensorDescriptor
     }

     // TODO: right now return value is constexpr because use of non-constepxr lambda
-    __host__ __device__ static constexpr LowerIndex
-    CalculateLowerIndexDiff(const UpperIndex& idx_up_diff, const LowerIndex& idx_low_old)
+    __host__ __device__ static constexpr LowerIndex CalculateLowerIndexDiff(
+        const UpperIndex& idx_up_diff, const UpperIndex& idx_up_old, const LowerIndex& idx_low_old)
     {
         LowerIndex idx_low_diff;

         static_for<0, nTransform, 1>{}([&](auto itran) {
-            constexpr auto tran = Transforms::At(itran);
+            constexpr auto tran = Transforms{}.At(itran);

-            const auto idx_up_diff_part =
-                pick_array_element(idx_up_diff, UpDimensionIds::At(itran));
+            const auto idx_up_diff_part =
+                pick_array_element(idx_up_diff, UpDimensionIds{}.At(itran));

-            auto idx_low_diff_part = pick_array_element(idx_low_diff, LowDimensionIds::At(itran));
+            const auto idx_up_old_part = pick_array_element(idx_up_old, UpDimensionIds{}.At(itran));

-            const auto idx_low_old_part =
-                pick_array_element(idx_low_old, LowDimensionIds::At(itran));
+            const auto idx_low_old_part =
+                pick_array_element(idx_low_old, LowDimensionIds{}.At(itran));
+
+            auto idx_low_diff_part = pick_array_element(idx_low_diff, LowDimensionIds{}.At(itran));

             // this assume each lower (single) index is associated with only one transformation,
             // which is required for index transformation, and has been checked during constructor
             // of TransformedTensorDescriptor
-            idx_low_diff_part = tran.CalculateLowerIndex(idx_up_diff_part, idx_low_old_part);
+            idx_low_diff_part = tran.CalculateLowerIndexDiff(
+                to_array(idx_up_diff_part), to_array(idx_up_old_part), to_array(idx_low_old_part));
         });

         return idx_low_diff;
@@ -299,6 +337,12 @@ struct TransformedTensorDescriptor
         return GetLowerTensorDescriptor().CalculateOffset(CalculateLowerIndex(idx_up));
     }

+    // TODO: remove this
+    __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(const UpperIndex& idx_up)
+    {
+        return CalculateOffset(idx_up);
+    }
+
 #if 0
     template <index_t IDim>
     __host__ __device__ static constexpr bool IsLinearDimension(Number<IDim>)
@@ -321,6 +365,22 @@ struct TransformedTensorDescriptor
         // not implemented
     }
 #endif

+    // TODO: should this function be here? should it be specific for padding check?
+    __host__ __device__ static constexpr bool IsUpperIndexInPaddingArea(const UpperIndex& idx_up)
+    {
+        bool flag = false;
+
+        static_for<0, nTransform, 1>{}([&](auto itran) {
+            constexpr auto tran = Transforms{}.At(itran);
+
+            const auto idx_up_part = pick_array_element(idx_up, UpDimensionIds{}.At(itran));
+
+            flag = flag || tran.IsUpperIndexInPaddingArea(to_array(idx_up_part));
+        });
+
+        return flag;
+    }
 };

 template <index_t... Lengths, index_t... Strides>

@@ -337,7 +397,7 @@ __host__ __device__ constexpr auto make_native_tensor_descriptor_packed(Lengths)
             Lengths::PopFront(), math::multiplies<index_t>{}, Number<1>{})
             .PushBack(Number<1>{});

-    return make_NativeTensorDescriptor(Lengths{}, strides);
+    return make_native_tensor_descriptor(Lengths{}, strides);
 }

 template <typename LowTensorDescriptor,
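make_native_tensor_descriptor_packed derives packed strides from the lengths by a reverse running product: drop the first length, take suffix products, and push a trailing stride of 1. The same computation with plain loops instead of Sequence arithmetic (illustrative sketch):

#include <cassert>
#include <vector>

// packed (contiguous) strides: stride[d] = product of lengths[d+1 .. n-1], last is 1
std::vector<int> packed_strides(const std::vector<int>& lengths)
{
    std::vector<int> strides(lengths.size(), 1);
    for(int d = static_cast<int>(lengths.size()) - 2; d >= 0; --d)
        strides[d] = strides[d + 1] * lengths[d + 1];
    return strides;
}

int main()
{
    const auto s = packed_strides({2, 3, 4});
    assert(s[0] == 12 && s[1] == 4 && s[2] == 1);
}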
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp

@@ -7,6 +7,8 @@
 #include "tensor_coordinate.hpp"
 #include "tensor_view.hpp"
 #include "threadwise_generic_tensor_slice_copy.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_coordinate_v2.hpp"

 #ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1

@@ -418,6 +420,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
     }
 };

+// This version use TensorCoordiante
 // Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
 // memory layout (ordering of dimensions) can be different between src and dst.
 template <index_t BlockSize,
@@ -518,7 +521,7 @@ struct BlockwiseGenericTensorSliceCopy_v2
     }

     private:
-    using RegisterBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));
+    using RegisterBufferDesc = decltype(make_native_tensor_descriptor_packed(SubLengths{}));

     using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v2r1<SrcDesc,
                                                                 RegisterBufferDesc,

@@ -544,6 +547,7 @@ struct BlockwiseGenericTensorSliceCopy_v2
     ThreadwiseStore mThreadwiseStore;
 };

+// this version use TensorView and TensorCoordinate
 template <index_t BlockSize,
           class SrcTensor,
           class DstTensor,

@@ -639,25 +643,25 @@ struct BlockwiseGenericTensorSliceCopy_v3
     using ThreadBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));

     using ThreadBufferTensor = NormalTensorView<ThreadBufferDesc, data_type>;

-    using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v3<SrcTensor,
+    using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v3r1<SrcTensor,
                                                                ThreadBufferTensor,
                                                                SubLengths,
                                                                SrcDimAccessOrder,
                                                                SrcVectorAccessDim,
                                                                SrcDataPerAccess,
                                                                1>;

-    using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v3<ThreadBufferTensor,
+    using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v3r1<ThreadBufferTensor,
                                                                 DstTensor,
                                                                 SubLengths,
                                                                 DstDimAccessOrder,
                                                                 DstVectorAccessDim,
                                                                 1,
                                                                 DstDataPerAccess>;

     data_type mpBuffer[ThreadBufferDesc::GetElementSpace()];
@@ -667,6 +671,125 @@ struct BlockwiseGenericTensorSliceCopy_v3
     ThreadwiseStore mThreadwiseStore;
 };

+template <index_t BlockSize,
+          class SrcDesc,
+          class DstDesc,
+          class SliceLengths,
+          class SubLengths,
+          class ThreadClusterLengths,
+          class ThreadClusterArrangeOrder,
+          class SrcDimAccessOrder,
+          class DstDimAccessOrder,
+          index_t SrcVectorAccessDim,
+          index_t DstVectorAccessDim,
+          index_t SrcDataPerAccess,
+          index_t DstDataPerAccess>
+struct BlockwiseGenericTensorSliceCopy_v4
+{
+    static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
+
+    using SrcCoord = typename TensorCoordinate_v2<SrcDesc>::type;
+    using DstCoord = typename TensorCoordinate_v2<DstDesc>::type;
+
+    __device__ constexpr BlockwiseGenericTensorSliceCopy_v4(SrcCoord src_block_slice_origin,
+                                                            DstCoord dst_block_slice_origin)
+    {
+        static_assert(nDim == SrcDesc::GetNumOfDimension() &&
+                          nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::Size() &&
+                          nDim == SubLengths::Size() && nDim == ThreadClusterLengths::Size() &&
+                          nDim == ThreadClusterArrangeOrder::Size() &&
+                          nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(),
+                      "wrong! nDim not consistent");
+
+        static_assert(is_same<SliceLengths, decltype(SubLengths{} * ThreadClusterLengths{})>{},
+                      "wrong! threads should be mapped to cover entire slicing window");
+
+        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
+            ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
+
+        static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
+                      "wrong! BlockSize not consistent with ThreadClusterLengths");
+
+        const auto thread_cluster_id =
+            thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
+
+        const auto data_cluster_id =
+            reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});
+
+        const auto thread_data_id_begin = data_cluster_id * SubLengths{};
+
+        mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin);
+        mThreadwiseLoad.SetDstSliceOrigin(make_zero_array<index_t, nDim>());
+
+        mThreadwiseStore.SetSrcSliceOrigin(make_zero_array<index_t, nDim>());
+        mThreadwiseStore.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin);
+    }
+
+    __device__ static constexpr index_t GetRegisterBufferSize()
+    {
+        return RegisterBufferDesc::GetElementSpace();
+    }
+
+    template <class TData>
+    __device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
+    {
+        mThreadwiseLoad.Run(p_src, p_buffer);
+    }
+
+    template <class TData>
+    __device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
+    {
+        mThreadwiseStore.Run(p_buffer, p_dst);
+    }
+
+    template <class TData>
+    __device__ void Run(const TData* p_src, TData* p_dst) const
+    {
+        TData p_buffer[GetRegisterBufferSize()];
+
+        mThreadwiseLoad.Run(p_src, p_buffer);
+        mThreadwiseStore.Run(p_buffer, p_dst);
+    }
+
+    template <class T, bool PositiveDirection>
+    __device__ void MoveSrcSlicingWindow(T step_sizes,
+                                         integral_constant<bool, PositiveDirection> positive_direction)
+    {
+        mThreadwiseLoad.MoveSrcSlicingWindow(step_sizes, positive_direction);
+    }
+
+    template <class T, bool PositiveDirection>
+    __device__ void MoveDstSlicingWindow(T step_sizes,
+                                         integral_constant<bool, PositiveDirection> positive_direction)
+    {
+        mThreadwiseStore.MoveDstSlicingWindow(step_sizes, positive_direction);
+    }
+
+    private:
+    using RegisterBufferDesc = decltype(make_native_tensor_descriptor_packed(SubLengths{}));
+
+    using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v4r2<SrcDesc,
+                                                                 RegisterBufferDesc,
+                                                                 SubLengths,
+                                                                 SrcDimAccessOrder,
+                                                                 SrcVectorAccessDim,
+                                                                 SrcDataPerAccess,
+                                                                 1>;
+
+    using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v4r2<RegisterBufferDesc,
+                                                                  DstDesc,
+                                                                  SubLengths,
+                                                                  DstDimAccessOrder,
+                                                                  DstVectorAccessDim,
+                                                                  1,
+                                                                  DstDataPerAccess>;
+
+    ThreadwiseLoad mThreadwiseLoad;
+    ThreadwiseStore mThreadwiseStore;
+};

 } // namespace ck
 #endif
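The new BlockwiseGenericTensorSliceCopy_v4 constructor splits a block-level slice among threads: the flat thread id is decoded into a thread-cluster multi-index, mapped back to data order, and scaled by SubLengths to get each thread's starting corner inside the block slice. A sketch of that mapping in 2-D, assuming the identity ThreadClusterArrangeOrder so the reorder steps are no-ops (illustrative, not ck code):

#include <cassert>

int main()
{
    // a 2x4 cluster of threads, each owning a 4x1 sub-slice -> an 8x4 block slice
    const int cluster[2] = {2, 4};
    const int sub[2]     = {4, 1};

    const int tid = 5; // flat thread id within the block (BlockSize = 2*4 = 8)

    // decode the flat id into a cluster multi-index, last dimension fastest
    const int cid[2] = {tid / cluster[1], tid % cluster[1]};

    // each thread's slice origin is cluster-index * sub-lengths
    const int begin[2] = {cid[0] * sub[0], cid[1] * sub[1]};

    assert(cid[0] == 1 && cid[1] == 1);
    assert(begin[0] == 4 && begin[1] == 1);
}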
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp

@@ -6,6 +6,8 @@
 #include "ConstantMergedTensorDescriptor.hpp"
 #include "tensor_coordinate.hpp"
 #include "tensor_view.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_coordinate_v2.hpp"

 #ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0

@@ -427,6 +429,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2
     Array<index_t, nDim> mDstSliceOrigin;
 };

+// This version use TensorCoordinate
 // This threadwise copy allow vector access of src and dst.
 // It allows the dimensions of vector access to be different on src and dst.
 // It also allows the vector size to be different on src and dst.

@@ -774,6 +777,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
     DstCoordinate mDstSliceOrigin;
 };

+// this version use TensorView and TensorCoordinate
 template <class SrcTensor,
           class DstTensor,
           class SliceLengths,
@@ -783,7 +787,7 @@ template <class SrcTensor,
           index_t DstVectorAccessDim,
           index_t SrcDataPerAccess,
           index_t DstDataPerAccess>
-struct ThreadwiseGenericTensorSliceCopy_v3
+struct ThreadwiseGenericTensorSliceCopy_v3r1
 {
     static constexpr index_t nDim = SrcTensor::GetNumOfDimension();

     using data_type = remove_cv_t<typename SrcTensor::data_type>;

@@ -791,10 +795,10 @@ struct ThreadwiseGenericTensorSliceCopy_v3
     using SrcCoordinate = typename SrcTensor::coordinate_type;
     using DstCoordinate = typename DstTensor::coordinate_type;

-    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v3(SrcTensor src,
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v3r1(SrcTensor src,
                                                              SrcCoordinate src_slice_origin,
                                                              DstTensor dst,
                                                              DstCoordinate dst_slice_origin)
         : mSrc{src},
           mDst{dst},
           mSrcSlice{src.Slice(src_slice_origin, SliceLengths{})},

@@ -821,8 +825,8 @@ struct ThreadwiseGenericTensorSliceCopy_v3
                       "wrong! vectorized access is not allowed");
     }

-    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v3()
-        : ThreadwiseGenericTensorSliceCopy_v3(
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v3r1()
+        : ThreadwiseGenericTensorSliceCopy_v3r1(
               SrcTensor{}, SrcCoordinate{}, DstTensor{}, DstCoordinate{})
     {
     }
@@ -940,5 +944,154 @@ struct ThreadwiseGenericTensorSliceCopy_v3
     DstSlice mDstSlice;
 };

+// This version use multi-index transformation
+// This threadwise copy allow vector access of src and dst.
+// It allows the vector size to be different on src and dst.
+// The dimensions of vector access should be the same on src and dst.
+// The dimension access order should be the same on src and dst.
+// It is designed for cases, where one of src and dst is register, and
+// the other is device memory or LDS
+template <class SrcDesc,
+          class DstDesc,
+          class SliceLengths,
+          class DimAccessOrder,
+          index_t VectorAccessDim,
+          index_t SrcDataPerAccess,
+          index_t DstDataPerAccess>
+struct ThreadwiseGenericTensorSliceCopy_v4r2
+{
+    static constexpr index_t nDim = SliceLengths::Size();
+
+    using Index = MultiIndex<nDim>;
+
+    using SrcCoord = typename TensorCoordinate_v2<SrcDesc>::type;
+    using DstCoord = typename TensorCoordinate_v2<DstDesc>::type;
+
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2(SrcCoord src_slice_origin,
+                                                               DstCoord dst_slice_origin)
+        : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
+    {
+        static_assert(nDim == SrcDesc::GetNumOfDimension() &&
+                          nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::Size() &&
+                          nDim == DimAccessOrder::Size(),
+                      "wrong! # of dimensions not the same");
+
+        static_assert(is_valid_sequence_map<DimAccessOrder>{}, "wrong! map is not valid");
+
+        static_assert(
+            SliceLengths{}[VectorAccessDim] % math::lcm(SrcDataPerAccess, DstDataPerAccess) == 0,
+            "wrong! cannot evenly divide");
+
+        // TODO:: sanity-check if vectorized memory access is allowed on src and dst
+    }
+
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2()
+        : ThreadwiseGenericTensorSliceCopy_v4r2(make_zero_array<index_t, nDim>(),
+                                                make_zero_array<index_t, nDim>())
+    {
+    }
+
+    __device__ void SetSrcSliceOrigin(SrcCoord src_slice_origin)
+    {
+        mSrcSliceOrigin = src_slice_origin;
+    }
+
+    __device__ void SetDstSliceOrigin(DstCoord dst_slice_origin)
+    {
+        mDstSliceOrigin = dst_slice_origin;
+    }
+
+    template <class TData>
+    __device__ void Run(const TData* p_src, TData* p_dst) const
+    {
+        using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
+        using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
+
+        constexpr auto vector_access_dim = Number<VectorAccessDim>{};
+
+        constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
+        constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
+
+        constexpr auto long_vector_size = Number<math::lcm(SrcDataPerAccess, DstDataPerAccess)>{};
+
+        constexpr auto long_vector_access_lengths = SliceLengths::Modify(
+            vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
+
+        ford<decltype(long_vector_access_lengths), DimAccessOrder>{}(
+            [&](auto long_vector_access_id) {
+                // data id w.r.t slicing-window
+                auto long_vector_data_begin_id = long_vector_access_id;
+                long_vector_data_begin_id(vector_access_dim) =
+                    long_vector_size * long_vector_access_id[vector_access_dim];
+
+                // buffer to hold a long-vector
+                TData p_long_vector[long_vector_size];
+
+                // set 0
+                for(index_t i = 0; i < long_vector_size; ++i)
+                {
+                    p_long_vector[i] = 0;
+                }
+
+                // load data from src to the long-vector buffer
+                for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
+                {
+                    auto scalar_id = make_zero_array<index_t, nDim>();
+                    scalar_id(vector_access_dim) = i * src_data_per_access;
+
+                    const auto src_coord =
+                        mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id);
+
+                    // check for padding
+                    // TODO: still kind of messy
+                    if(!src_coord.IsAnyLevelIndexInPaddingArea())
+                    {
+                        const index_t src_offset =
+                            (mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id)).GetOffset();
+
+                        const index_t buffer_offset = i * src_data_per_access;
+
+                        *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
+                            *reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
+                    }
+                }
+
+                // store data from the long-vector buffer to dst
+                for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
+                {
+                    auto scalar_id = make_zero_array<index_t, nDim>();
+                    scalar_id(vector_access_dim) = i * dst_data_per_access;
+
+                    const index_t buffer_offset = i * dst_data_per_access;
+
+                    const index_t dst_offset =
+                        (mDstSliceOrigin + (long_vector_data_begin_id + scalar_id)).GetOffset();
+
+                    *reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) =
+                        *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
+                }
+            });
+    }
+
+    template <class T, bool PositiveDirection>
+    __device__ void MoveSrcSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
+    {
+        static_if<PositiveDirection>{}([&](auto) { mSrcSliceOrigin += step_sizes; })
+            .Else([&](auto) { mSrcSliceOrigin -= step_sizes; });
+    }
+
+    template <class T, bool PositiveDirection>
+    __device__ void MoveDstSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
+    {
+        static_if<PositiveDirection>{}([&](auto) { mDstSliceOrigin += step_sizes; })
+            .Else([&](auto) { mDstSliceOrigin -= step_sizes; });
+    }
+
+    private:
+    SrcCoord mSrcSliceOrigin;
+    DstCoord mDstSliceOrigin;
+};

 } // namespace ck
 #endif
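ThreadwiseGenericTensorSliceCopy_v4r2::Run moves data in "long vectors" of lcm(SrcDataPerAccess, DstDataPerAccess) elements, so each long vector is covered by whole vector loads on the src side and whole vector stores on the dst side. The bookkeeping, checked on the host (illustrative; uses std::lcm where the kernel uses math::lcm):

#include <cassert>
#include <numeric> // std::lcm, C++17

int main()
{
    const int src_per_access = 4; // e.g. float4 loads
    const int dst_per_access = 2; // e.g. float2 stores
    const int long_vector    = std::lcm(src_per_access, dst_per_access); // = 4

    const int slice_len = 12; // slice length along the vector-access dimension
    assert(slice_len % long_vector == 0); // the "cannot evenly divide" static_assert

    // per long vector: whole accesses only, no partial loads or stores
    assert(long_vector / src_per_access == 1); // one src load
    assert(long_vector / dst_per_access == 2); // two dst stores

    // number of long-vector steps taken along that dimension
    assert(slice_len / long_vector == 3);
}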
composable_kernel/include/utility/array.hpp

@@ -23,20 +23,9 @@ struct Array
         static_assert(sizeof...(Xs) + 1 == NSize, "wrong! size");
     }

-#if 0
-    template <typename T>
-    __host__ __device__ explicit constexpr Array(const T& x)
-    {
-        static_assert(T::Size() == NSize, "wrong! size");
-        static_for<0, NSize, 1>{}([&](auto i){
-            mData[i] = x.At(i);
-        })
-    }
-#endif
-
     __host__ __device__ static constexpr index_t Size() { return NSize; }

+    // TODO: remove
     __host__ __device__ static constexpr index_t GetSize() { return Size(); }

     template <index_t I>
@@ -265,8 +254,8 @@ __host__ __device__ constexpr auto extract_array(const Array<TData, NSize>& old_
     return new_array;
 }

 // emulate constepxr lambda for array
+// math
 template <typename F, typename X, typename Y, typename Z>
 struct lambda_array_math
 {
     const F& f;
composable_kernel/include/utility/array_helper.hpp

@@ -5,8 +5,8 @@
 namespace ck {

-template <typename T, index_t NSize>
-__host__ __device__ void print_array(const char* s, Array<T, NSize> a)
+template <index_t NSize>
+__host__ __device__ void print_array(const char* s, Array<unsigned_t, NSize> a)
 {
     constexpr index_t nsize = a.GetSize();
@@ -89,5 +89,89 @@ __host__ __device__ void print_array(const char* s, Array<T, NSize> a)
     });
 }

+template <index_t NSize>
+__host__ __device__ void print_array(const char* s, Array<signed_t, NSize> a)
+{
+    constexpr index_t nsize = a.GetSize();
+
+    static_assert(nsize > 0 && nsize <= 10, "wrong!");
+
+    static_if<nsize == 1>{}([&](auto) { printf("%s size %d, {%d}\n", s, nsize, a[0]); });
+
+    static_if<nsize == 2>{}([&](auto) { printf("%s size %d, {%d %d}\n", s, nsize, a[0], a[1]); });
+
+    static_if<nsize == 3>{}(
+        [&](auto) { printf("%s size %d, {%d %d %d}\n", s, nsize, a[0], a[1], a[2]); });
+
+    static_if<nsize == 4>{}(
+        [&](auto) { printf("%s size %d, {%d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3]); });
+
+    static_if<nsize == 5>{}([&](auto) {
+        printf("%s size %d, {%d %d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3], a[4]);
+    });
+
+    static_if<nsize == 6>{}([&](auto) {
+        printf("%s size %d, {%d %d %d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3], a[4], a[5]);
+    });
+
+    static_if<nsize == 7>{}([&](auto) {
+        printf("%s size %d, {%d %d %d %d %d %d %d}\n",
+               s, nsize, a[0], a[1], a[2], a[3], a[4], a[5], a[6]);
+    });
+
+    static_if<nsize == 8>{}([&](auto) {
+        printf("%s size %d, {%d %d %d %d %d %d %d %d}\n",
+               s, nsize, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
+    });
+
+    static_if<nsize == 9>{}([&](auto) {
+        printf("%s size %d, {%d %d %d %d %d %d %d %d %d}\n",
+               s, nsize, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8]);
+    });
+
+    static_if<nsize == 10>{}([&](auto) {
+        printf("%s size %d, {%d %d %d %d %d %d %d %d %d %d}\n",
+               s, nsize, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8], a[9]);
+    });
+}

 } // namespace ck
 #endif
composable_kernel/include/utility/config_nvidia.hpp.in

@@ -15,6 +15,15 @@
 namespace ck {

+using unsigned_t = uint32_t;
+using signed_t   = int;
+
+#if 0 // debug
+using index_t = unsigned_t;
+#else
+using index_t = signed_t;
+#endif
+
 // For some reason, CUDA need this definition, otherwise
 // compiler won't generate optimal load and store instruction, and
 // kernel would produce wrong result, indicating the compiler fail to generate correct

@@ -22,8 +31,6 @@ namespace ck {
 using float2_t = float2;
 using float4_t = float4;

-using index_t = uint32_t;
-
 template <class T>
 __device__ void fused_multiply_accumulate(T& d, const T& s0, const T& s1)
 {
composable_kernel/include/utility/sequence.hpp

@@ -537,11 +537,9 @@ struct sequence_unique_sort
 };

 template <typename SeqMap>
 struct is_valid_sequence_map
+    : is_same<typename arithmetic_sequence_gen<0, SeqMap::Size(), 1>::type,
+              typename sequence_sort<SeqMap, math::less<index_t>>::type>
 {
-    static constexpr bool value =
-        is_same<typename arithmetic_sequence_gen<0, SeqMap::Size(), 1>::type,
-                typename sequence_sort<SeqMap, math::less<index_t>>::type>{};
 };

 template <typename SeqMap>
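is_valid_sequence_map now inherits from is_same<...> instead of defining a value member, keeping it usable as an integral-constant type: a map is valid iff sorting it yields the arithmetic sequence 0, 1, ..., N-1, i.e. it is a permutation. An equivalent runtime check (illustrative):

#include <algorithm>
#include <cassert>
#include <vector>

// a sequence map is valid iff it is a permutation of 0 .. N-1,
// i.e. sorting it gives the arithmetic sequence 0, 1, ..., N-1
bool is_valid_map(std::vector<int> m)
{
    std::sort(m.begin(), m.end());
    for(int i = 0; i < static_cast<int>(m.size()); ++i)
        if(m[i] != i)
            return false;
    return true;
}

int main()
{
    assert(is_valid_map({0, 2, 1}));  // a permutation: valid
    assert(!is_valid_map({0, 0, 2})); // repeated index: invalid
}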
driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp

@@ -115,7 +115,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded(InDesc,
     constexpr index_t OutThreadCopyDataPerAccess_N = 4;
 #endif

-#if 0 // debug
+#if 1 // debug
     constexpr index_t GridSize =
         (N / NPerBlock) * (K / KPerBlock) * (Ho / HoPerBlock) * (Wo / WoPerBlock);
 #else
driver/src/driver.cpp

@@ -73,25 +73,25 @@ int main(int argc, char* argv[])
     using namespace ck;

 #if 1
-    constexpr index_t N  = 10;
-    constexpr index_t C  = 10;
-    constexpr index_t HI = 10;
-    constexpr index_t WI = 10;
-    constexpr index_t K  = 10;
-    constexpr index_t Y  = 1;
-    constexpr index_t X  = 1;
+    constexpr index_t N  = 32;
+    constexpr index_t C  = 8;
+    constexpr index_t HI = 2;
+    constexpr index_t WI = 2;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;

     using ConvStrides   = Sequence<1, 1>;
     using ConvDilations = Sequence<1, 1>;

-    constexpr index_t HPad = 3;
-    constexpr index_t WPad = 3;
+    constexpr index_t HPad = 1;
+    constexpr index_t WPad = 1;
 #elif 1
     // 3x3, 34x34
     constexpr index_t N  = 64;
     constexpr index_t C  = 256;
-    constexpr index_t HI = 34;
-    constexpr index_t WI = 34;
+    constexpr index_t HI = 32;
+    constexpr index_t WI = 32;
     constexpr index_t K  = 128;
     constexpr index_t Y  = 3;
     constexpr index_t X  = 3;
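With the new debug configuration (HI = WI = 2, Y = X = 3, HPad = WPad = 1, unit stride and dilation), the padded convolution produces an output the same size as the input, so the grid-size products in the device header divide evenly. The standard output-size arithmetic as a quick check (not code from this commit):

#include <cassert>

int main()
{
    const int HI = 2, Y = 3, HPad = 1, stride = 1, dilation = 1;

    // effective filter extent and the usual convolution output-size formula
    const int Y_eff = (Y - 1) * dilation + 1;               // = 3
    const int Ho    = (HI + 2 * HPad - Y_eff) / stride + 1;

    assert(Ho == 2); // 2 + 2*1 - 3 + 1 = 2: same-size output
}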
@@ -99,8 +99,8 @@ int main(int argc, char* argv[])
     using ConvStrides   = Sequence<1, 1>;
     using ConvDilations = Sequence<1, 1>;

-    constexpr index_t HPad = 0;
-    constexpr index_t WPad = 0;
+    constexpr index_t HPad = 1;
+    constexpr index_t WPad = 1;
 #elif 0
     // 1x1 filter, 8x8 image
     // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%

@@ -434,7 +434,7 @@ int main(int argc, char* argv[])
     if(do_verification)
     {
-#if 1
+#if 0
         if(Y == 3 && X == 3 && ConvStrides{}[0] == 1 && ConvStrides{}[1] == 1 &&
            ConvDilations{}[0] == 1 && ConvDilations{}[1] == 1)
         {