gaoqiong / composable_kernel · Commits · 724e984b

Commit 724e984b authored Sep 11, 2019 by Chao Liu

enabling padding for chwn format

parent ca42e910

Showing 12 changed files with 810 additions and 326 deletions (+810 / -326)
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp (+169 / -200)
composable_kernel/include/tensor_description/multi_index_transform.hpp (+56 / -9)
composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp (+98 / -44)
composable_kernel/include/tensor_description/tensor_descriptor.hpp (+69 / -9)
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp (+143 / -20)
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp (+160 / -7)
composable_kernel/include/utility/array.hpp (+3 / -14)
composable_kernel/include/utility/array_helper.hpp (+86 / -2)
composable_kernel/include/utility/config_nvidia.hpp.in (+9 / -2)
composable_kernel/include/utility/sequence.hpp (+2 / -4)
driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp (+1 / -1)
driver/src/driver.cpp (+14 / -14)
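The common thread of the changes below: every multi-index transform, tensor descriptor, and tensor coordinate gains an "is this upper index in the padding area?" query, and the new threadwise copy (v4r2) consults it so that source loads falling into the left/right pad region are skipped and the corresponding buffer entries stay zero. As a rough, standalone illustration of that behavior, not part of this commit and not the library's API (all names below are made up for the sketch):

    // Sketch only: a zero-filled "padded read" in one dimension, mirroring what
    // the Pad transform plus the padding check accomplish for the H/W dimensions.
    #include <cstdio>

    // The upper index i runs over [0, len + left_pad + right_pad); the lower
    // (physical) index is i - left_pad, and indices that land in the pad area
    // read as 0 instead of touching memory.
    float padded_read_1d(const float* data, int len, int left_pad, int i)
    {
        const bool in_pad = (i < left_pad) || (i >= left_pad + len); // padding-area check
        return in_pad ? 0.0f : data[i - left_pad];
    }

    int main()
    {
        const float x[2] = {1.0f, 2.0f}; // a 2-wide row, padded by 1 on each side
        for(int i = 0; i < 4; ++i)
            std::printf("%g ", padded_read_1d(x, 2, 1, i)); // prints: 0 1 2 0
        std::printf("\n");
    }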
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp
(This diff is collapsed and not shown.)
composable_kernel/include/tensor_description/multi_index_transform.hpp
...

@@ -22,17 +22,27 @@ struct PassThrough
    __host__ __device__ static constexpr auto GetUpperLengths() { return Sequence<Length>{}; }

-    __host__ __device__ static constexpr auto CalculateLowerIndex(UpperIndex idx_up)
+    __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
    {
        return idx_up;
    }

-    __host__ __device__ static constexpr auto CalculateLowerIndexDiff(UpperIndex idx_up_diff)
+    __host__ __device__ static constexpr auto CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
+                                                                      const UpperIndex& /* idx_up_old */,
+                                                                      const LowerIndex& /* idx_low_old */)
    {
        return idx_up_diff;
    }

+    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
+
+    // TODO: should this function be here? should it be specific for padding check?
+    __host__ __device__ static constexpr bool IsUpperIndexInPaddingArea(const UpperIndex& /* idx_up */)
+    {
+        return false;
+    }
};

// LowLengths: Sequence<...>

...
@@ -55,17 +65,39 @@ struct Pad
        return GetLowerLengths() + LeftPads{} + RightPads{};
    }

-    __host__ __device__ static constexpr auto CalculateLowerIndex(UpperIndex idx_up)
+    __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
    {
        return idx_up - LeftPads{};
    }

-    __host__ __device__ static constexpr auto CalculateLowerIndexDiff(UpperIndex idx_up_diff)
+    __host__ __device__ static constexpr auto CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
+                                                                      const UpperIndex& /* idx_up_old */,
+                                                                      const LowerIndex& /* idx_low_old */)
    {
        return idx_up_diff;
    }

+    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
+
+    // TODO: should this function be here? should it be specific for padding check?
+    __host__ __device__ constexpr bool IsUpperIndexInPaddingArea(const UpperIndex& idx_up) const
+    {
+        bool flag = false;
+
+        static_for<0, nDim, 1>{}([&](auto idim) {
+            // only check if there is left-padding
+            static_if<(LeftPads::At(idim) != 0)>{}(
+                [&](auto) { flag = flag || idx_up[idim] < LeftPads::At(idim); });
+
+            // only check if there is right-padding
+            static_if<(RightPads::At(idim) != 0)>{}([&](auto) {
+                flag = flag || idx_up[idim] >= LeftPads::At(idim) + LowLengths::At(idim);
+            });
+        });
+
+        return flag;
+    }
};

// LowLengths: Sequence<...>

...
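For orientation (a sketch, not part of this commit): Pad only shifts the index by LeftPads, and the new check above reports whether the unshifted upper index still lands inside the unpadded lengths; the compile-time static_if merely skips the comparison for dimensions with no padding. The same check written as plain run-time C++ over ordinary arrays, with array stand-ins for the Sequence parameters, looks like this:

    #include <cstddef>

    // Run-time analogue of Pad::IsUpperIndexInPaddingArea for ndim dimensions.
    // low_lengths/left_pads/right_pads play the role of LowLengths/LeftPads/RightPads.
    bool is_upper_index_in_padding_area(const int* idx_up,
                                        const int* low_lengths,
                                        const int* left_pads,
                                        const int* right_pads,
                                        std::size_t ndim)
    {
        bool flag = false;
        for(std::size_t d = 0; d < ndim; ++d)
        {
            if(left_pads[d] != 0)
                flag = flag || idx_up[d] < left_pads[d];
            if(right_pads[d] != 0)
                flag = flag || idx_up[d] >= left_pads[d] + low_lengths[d];
        }
        return flag;
    }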
@@ -124,7 +156,7 @@ struct Merge
            .PushBack(Number<1>{});

        // calculate index in each of the dimensions in the order of their dimension
-#if 1
+#if 1 // would compile to same ISA?
        static_for<0, nDimLow - 1, 1>{}(
            lambda_CalculateLowerIndex<decltype(pseudo_low_strides)>(itmp, idx_low));

...
@@ -138,8 +170,10 @@ struct Merge
    }

    // idx_low_diff depends on idx_low_old, so idx_low need to be up-to-date
-    __host__ __device__ static constexpr auto CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
-                                                                      const LowerIndex& idx_low_old)
+    __host__ __device__ static constexpr auto CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
+                                                                      const UpperIndex& /* idx_up_old */,
+                                                                      const LowerIndex& idx_low_old)
    {
        LowerIndex idx_low_diff;

...
@@ -149,6 +183,13 @@ struct Merge
    }

+    __host__ __device__ static constexpr bool IsLinearTransform() { return false; }
+
+    // TODO: should this function be here? should it be specific for padding check?
+    __host__ __device__ static constexpr bool IsUpperIndexInPaddingArea(const UpperIndex& /* idx_up */)
+    {
+        return false;
+    }
};

// UpLengths: Sequence<...>

...
@@ -189,7 +230,10 @@ struct Unmerge
        return idx_low;
    }

-    __host__ __device__ static constexpr auto CalculateLowerIndexDiff(const UpperIndex& idx_up_diff)
+    __host__ __device__ static constexpr auto CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
+                                                                      const UpperIndex& /* idx_up_old */,
+                                                                      const LowerIndex& /* idx_low_old */)
    {
        return CalculateLowerIndex(idx_up_diff);
    }

...
@@ -240,7 +284,10 @@ struct Embed
        return idx_low;
    }

-    __host__ __device__ static constexpr auto CalculateLowerIndexDiff(const UpperIndex& idx_up_diff)
+    __host__ __device__ static constexpr auto CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
+                                                                      const UpperIndex& /* idx_up_old */,
+                                                                      const LowerIndex& /* idx_low_old */)
    {
        LowerIndex idx_low_diff{0};

...
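A note on the recurring signature change in this file: every transform now takes the same three arguments for an index-difference update — the upper-index step, the old upper index, and the old lower index — even when it ignores the last two (PassThrough, Pad, Unmerge, and Embed mark them as unused; Merge genuinely needs the old lower index, because carries across merged dimensions depend on where the coordinate currently is). A minimal sketch of why that extra state matters, using a plain 2-D merge of an (h, w) pair into one linear index (illustrative code, not the library's):

    #include <cassert>

    // Linear index of a (h, w) pair in a row-major H x W grid ("merged" dimension).
    int merged_index(int h, int w, int W) { return h * W + w; }

    // Moving the merged index by +d is NOT a fixed (dh, dw) step: whether w wraps
    // around (and h picks up a carry) depends on the old (h, w). That is why
    // Merge::CalculateLowerIndexDiff needs idx_low_old, while linear transforms
    // such as PassThrough and Pad can ignore it.
    void step_merged(int& h, int& w, int W, int d)
    {
        const int linear = merged_index(h, w, W) + d;
        h                = linear / W;
        w                = linear % W;
    }

    int main()
    {
        int h = 0, w = 2;
        step_merged(h, w, /*W=*/3, /*d=*/1); // from (0,2) the same +1 step...
        assert(h == 1 && w == 0);            // ...wraps w and carries into h
    }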
composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp
...

@@ -3,26 +3,28 @@
#include "common_header.hpp"
#include "dimension.hpp"
-#include "dimension_transform.hpp"
+#include "multi_index_transform.hpp"
#include "tensor_descriptor.hpp"

namespace ck {

-template <class NativeTensorDesc>
+template <typename TensorDesc>
+struct TensorCoordinate_v2;
+
+template <typename NativeTensorDesc>
struct NativeTensorCoordinate
{
-    using type             = NativeTensorCoordinate;
-    using tensor_desc_type = NativeTensorDesc;
-    using Index            = tensor_desc_type::Index;
-    static constexpr index_t nDim = Index::GetSize();
+    using type             = NativeTensorCoordinate;
+    using tensor_desc_type = NativeTensorDesc;
+    static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();
+    using Index = MultiIndex<nDim>;

    __host__ __device__ constexpr NativeTensorCoordinate(Index idx)
-        : mOffset{GetTensorDesriptor().GetOffset(idx)}
+        : mIndex(idx), mOffset(tensor_desc_type::CalculateOffset(idx))
    {
    }

-    template <class... Xs>
+    template <typename... Xs>
    __host__ __device__ constexpr NativeTensorCoordinate(Xs... xs)
        : NativeTensorCoordinate(Index{xs...})
    {

...
@@ -36,82 +38,103 @@ struct NativeTensorCoordinate
    __host__ __device__ static constexpr auto GetTensorDescriptor() { return tensor_desc_type{}; }

-    __host__ __device__ constexpr index_t GetOffset() const { return mOffset; }
+    __host__ __device__ constexpr const Index& GetIndex() const { return mIndex; }
+
+    __host__ __device__ constexpr const index_t& GetOffset() const { return mOffset; }

-    __host__ __device__ type operator+=(Index idx_diff)
+    __host__ __device__ constexpr type operator+=(const Index& idx_diff)
    {
-        mOffset += tensor_desc_type::GetOffsetDiff(idx_diff);
+        // mIndex is updated here, but some (or all) of its entries may never be used
+        mIndex += idx_diff;
+
+        mOffset += tensor_desc_type::CalculateOffset(idx_diff);

        return *this;
    }

-    __host__ __device__ type operator-=(Index idx_diff)
+    __host__ __device__ constexpr type operator-=(const Index& idx_diff)
    {
-        mOffset -= tensor_desc_type::GetOffsetFromMultiIndex(idx_diff);
+        // mIndex is updated here, but some (or all) of its entries may never be used
+        mIndex -= idx_diff;
+
+        mOffset -= tensor_desc_type::CalculateOffset(idx_diff);

        return *this;
    }

-    __host__ __device__ constexpr type operator+(Index idx_diff) const
+    __host__ __device__ constexpr type operator+(const Index& idx_diff) const
    {
        type coord = *this;
        coord += idx_diff;
        return coord;
    }

-    __host__ __device__ constexpr type operator-(Index idx_diff) const
+    __host__ __device__ constexpr type operator-(const Index& idx_diff) const
    {
        type coord = *this;
        coord -= idx_diff;
        return coord;
    }

+    // TODO: should this function be here? should it be specific for padding check?
+    __host__ __device__ static constexpr bool IsAnyLevelIndexInPaddingArea() { return false; }
+
    private:
+    // mIndex may be saved and update, however, the value of some (or all) of its entries may
+    // never be used. Compiler should be able to remove these entries as well as its calculation
+    // as dead code.
+    // TODO: make sure compiler indeed remove these dead code
+    Index mIndex;
    index_t mOffset;
};

-template <class TransformedTensorDesc>
+template <typename TransformedTensorDesc>
struct TransformedTensorCoordinate
{
    using type             = TransformedTensorCoordinate;
    using tensor_desc_type = TransformedTensorDesc;
-    using Index            = tensor_desc_type::UpperIndex;
-    using lower_coordinate_type =
-        TensorCoordiante_v2<decltype(GetTensorDescriptor().GetLowerTensorDescriptor())>::type;
-    static constexpr index_t nDim = Index::GetSize();
+    using LowerCoord =
+        typename TensorCoordinate_v2<decltype(tensor_desc_type::GetLowerTensorDescriptor())>::type;
+    using UpperCoord = TransformedTensorCoordinate;
+    static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();
+    using UpperIndex = MultiIndex<nDim>;

-    __host__ __device__ constexpr TransformedTensorCoordinate(Index idx)
-        : mIndex{idx}, mCoordLow{GetTensorDescriptor().GetLowerIndex(idx)}
+    __host__ __device__ constexpr TransformedTensorCoordinate(UpperIndex idx)
+        : mIndexUp{idx}, mCoordLow{tensor_desc_type::CalculateLowerIndex(idx)}
    {
    }

-    template <class... Xs>
+    template <typename... Xs>
    __host__ __device__ constexpr TransformedTensorCoordinate(Xs... xs)
-        : TransformedTensorCoordinate(Index{xs...})
+        : TransformedTensorCoordinate(UpperIndex{xs...})
    {
    }

    template <index_t... Xs>
    __host__ __device__ constexpr TransformedTensorCoordinate(Sequence<Xs...>)
-        : TransformedTensorCoordinate(Index{Xs...})
+        : TransformedTensorCoordinate(UpperIndex{Xs...})
    {
    }

    __host__ __device__ static constexpr auto GetTensorDescriptor() { return tensor_desc_type{}; }

-    __host__ __device__ constexpr index_t GetOffset() const { return mCoordLow.GetOffset(); }
+    __host__ __device__ constexpr const LowerCoord& GetLowerCoordinate() const { return mCoordLow; }
+
+    __host__ __device__ constexpr const UpperIndex& GetUpperIndex() const { return mIndexUp; }

-    __host__ __device__ constexpr Index GetIndex() const { return mIndexUp; }
+    __host__ __device__ constexpr const UpperIndex& GetIndex() const { return GetUpperIndex(); }
+
+    __host__ __device__ constexpr const index_t& GetOffset() const
+    {
+        return GetLowerCoordinate().GetOffset();
+    }

-    __host__ __device__ type operator+=(Index idx_up_diff)
+    __host__ __device__ constexpr UpperCoord operator+=(const UpperIndex& idx_up_diff)
    {
-        mCoordLow += tensor_desc_type::GetLowerIndexDiff(idx_up_diff, mIndexUp, mCoordLow.GetIndex());
+        // For transformation of multi-index difference, not all transformation functions need to
+        // know the old lower-index or the old upper-index. We pass both of them to the
+        // transformation function. The transformation function itself decides to use them or not.
+        mCoordLow += tensor_desc_type::CalculateLowerIndexDiff(
+            idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());

+        // mIndexUp is updated here, but some (or all) of its entries may never be used
        mIndexUp += idx_up_diff;

...
@@ -119,11 +142,35 @@ struct TransformedTensorCoordinate
        return *this;
    }

-    __host__ __device__ constexpr type operator+(Index idx_up_diff) const
+    __host__ __device__ constexpr UpperCoord operator-=(const UpperIndex& idx_up_diff)
    {
-        type coord = *this;
-        coord += idx_diff;
-        return coord;
+        mCoordLow -= tensor_desc_type::CalculateLowerIndexDiff(
+            idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());
+
+        mIndexUp -= idx_up_diff;
+
+        return *this;
    }

+    __host__ __device__ constexpr UpperCoord operator+(const UpperIndex& idx_up_diff) const
+    {
+        UpperCoord coord_up = *this;
+        coord_up += idx_up_diff;
+        return coord_up;
+    }
+
+    __host__ __device__ constexpr UpperCoord operator-(const UpperIndex& idx_up_diff) const
+    {
+        UpperCoord coord_up = *this;
+        coord_up -= idx_up_diff;
+        return coord_up;
+    }
+
+    // TODO: should this function be here? should it be specific for padding check?
+    __host__ __device__ constexpr bool IsAnyLevelIndexInPaddingArea() const
+    {
+        return tensor_desc_type::IsUpperIndexInPaddingArea(GetIndex()) ||
+               mCoordLow.IsAnyLevelIndexInPaddingArea();
+    }

    private:

...
@@ -131,22 +178,22 @@ struct TransformedTensorCoordinate
    // never be used. Compiler should be able to remove these entries as well as its calculation
    // as dead code.
    // TODO: make sure compiler indeed remove these dead code
-    Index mIndexUp;
-    lower_coordinate_type mCoordLow;
+    UpperIndex mIndexUp;
+    LowerCoord mCoordLow;
};

-template <class TensorDesc>
+template <typename TensorDesc>
struct TensorCoordinate_v2
{
    private:
-    template <class... Ts>
+    template <typename... Ts>
    __host__ __device__ static constexpr auto MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
    {
        return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>();
    }

-    template <class... Ts>
+    template <typename... Ts>
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
    {

...
@@ -156,5 +203,12 @@ struct TensorCoordinate_v2
    public:
    using type = decltype(MakeDummyTensorCoordinate(TensorDesc{}));
};

+template <typename TensorDesc>
+__host__ __device__ constexpr auto
+make_tensor_coordinate_v2(TensorDesc, MultiIndex<TensorDesc::GetNumOfDimension()> idx)
+{
+    return typename TensorCoordinate_v2<TensorDesc>::type(idx);
+}
+
}
#endif
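Taken together, the coordinate classes above now cache both the running (upper) index and the running offset, so a copy loop can advance by a small multi-index step and then cheaply ask GetOffset() and IsAnyLevelIndexInPaddingArea(), the latter recursing through every transformation level. A stripped-down, standalone imitation of that pattern for a 2-D padded row-major view (illustrative only, independent of the library's types):

    #include <cstdio>

    // A toy coordinate for a 2-D row-major tensor of size len_h x len_w, viewed
    // through a pad of pad_h/pad_w on each border: it tracks the padded (upper)
    // index and the physical offset together, the way NativeTensorCoordinate /
    // TransformedTensorCoordinate track mIndex(Up), mOffset and mCoordLow.
    struct ToyPaddedCoord
    {
        int h, w;         // upper (padded) index
        int len_h, len_w; // unpadded lengths
        int pad_h, pad_w; // left pads (right pads only extend the upper range)

        void move(int dh, int dw) { h += dh; w += dw; } // analogue of operator+=

        int offset() const { return (h - pad_h) * len_w + (w - pad_w); }

        bool in_padding_area() const
        {
            return h < pad_h || h >= pad_h + len_h || w < pad_w || w >= pad_w + len_w;
        }
    };

    int main()
    {
        ToyPaddedCoord c{0, 0, /*len_h=*/2, /*len_w=*/2, /*pad_h=*/1, /*pad_w=*/1};
        std::printf("%d\n", c.in_padding_area()); // 1: (0,0) sits in the top-left pad
        c.move(1, 1);                             // step to (1,1) = physical (0,0)
        std::printf("%d %d\n", c.in_padding_area(), c.offset()); // 0 0
    }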
composable_kernel/include/tensor_description/tensor_descriptor.hpp
...

@@ -64,6 +64,18 @@ struct NativeTensorDescriptor
        return GetStrides(typename arithmetic_sequence_gen<0, nDim, 1>::type{});
    }

+    __host__ __device__ static constexpr index_t GetElementSize()
+    {
+        return accumulate_on_sequence(GetLengths(), math::multiplies<index_t>{}, Number<1>{});
+    }
+
+    __host__ __device__ static constexpr index_t GetElementSpace()
+    {
+        return accumulate_on_sequence(
+            (GetLengths() - Number<1>{}) * GetStrides(), math::plus<index_t>{}, Number<1>{});
+    }
+
    // TODO: this cannot return constepxr because of use of lambda
    __host__ __device__ static constexpr index_t CalculateOffset(const Index& idx)
    {
        index_t offset = 0;

...
@@ -73,6 +85,12 @@ struct NativeTensorDescriptor
        return offset;
    }

+    // TODO: remove this
+    __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(const Index& idx)
+    {
+        return CalculateOffset(idx);
+    }
+
    __host__ __device__ static constexpr index_t CalculateOffsetDiff(const Index& idx_diff)
    {
        index_t offset_diff = 0;

...
@@ -100,6 +118,12 @@ struct NativeTensorDescriptor
    {
        return Tuple<>{};
    }

+    // TODO: should this function be here? should it be specific for padding check?
+    __host__ __device__ static constexpr bool IsUpperIndexInPaddingArea(const Index& /* idx */)
+    {
+        return false;
+    }
};

// LowerTensorDescriptor

...
@@ -248,6 +272,17 @@ struct TransformedTensorDescriptor
        return GetLengths(Sequence<IDim, IDims...>{});
    }

+    __host__ __device__ static constexpr index_t GetElementSize()
+    {
+        return accumulate_on_sequence(GetLengths(), math::multiplies<index_t>{}, Number<1>{});
+    }
+
+    __host__ __device__ static constexpr index_t GetElementSpace()
+    {
+        // TODO: Is this the correct definition for transformed tensor?
+        return GetLowerTensorDescriptor().GetElementSpace();
+    }
+
    // TODO: right now return value is constexpr because use of non-constepxr lambda
    __host__ __device__ static constexpr LowerIndex CalculateLowerIndex(const UpperIndex& idx_up)
    {

...
@@ -256,8 +291,8 @@ struct TransformedTensorDescriptor
        static_for<0, nTransform, 1>{}([&](auto itran) {
            constexpr auto tran = Transforms{}.At(itran);

-            auto idx_low_part = pick_array_element(idx_low, LowDimensionIds{}.At(itran));
            const auto idx_up_part = pick_array_element(idx_up, UpDimensionIds{}.At(itran));
+            auto idx_low_part = pick_array_element(idx_low, LowDimensionIds{}.At(itran));

            // this assume each lower (single) index is only assocaited with one transformation,
            // which is required for index transformation, and has been checked during constructor

...
@@ -269,26 +304,29 @@ struct TransformedTensorDescriptor
    }

    // TODO: right now return value is constexpr because use of non-constepxr lambda
-    __host__ __device__ static constexpr LowerIndex CalculateLowerIndexDiff(
-        const UpperIndex& idx_up_diff, const LowerIndex& idx_low_old)
+    __host__ __device__ static constexpr LowerIndex CalculateLowerIndexDiff(
+        const UpperIndex& idx_up_diff, const UpperIndex& idx_up_old, const LowerIndex& idx_low_old)
    {
        LowerIndex idx_low_diff;

        static_for<0, nTransform, 1>{}([&](auto itran) {
-            constexpr auto tran = Transforms::At(itran);
+            constexpr auto tran = Transforms{}.At(itran);

            const auto idx_up_diff_part =
-                pick_array_element(idx_up_diff, UpDimensionIds::At(itran));
+                pick_array_element(idx_up_diff, UpDimensionIds{}.At(itran));

-            auto idx_low_diff_part = pick_array_element(idx_low_diff, LowDimensionIds::At(itran));
+            const auto idx_up_old_part = pick_array_element(idx_up_old, UpDimensionIds{}.At(itran));

            const auto idx_low_old_part =
-                pick_array_element(idx_low_old, LowDimensionIds::At(itran));
+                pick_array_element(idx_low_old, LowDimensionIds{}.At(itran));

+            auto idx_low_diff_part = pick_array_element(idx_low_diff, LowDimensionIds{}.At(itran));
+
            // this assume each lower (single) index is associated with only one transformation,
            // which is required for index transformation, and has been checked during constructor
            // of TransformedTensorDescriptor
-            idx_low_diff_part = tran.CalculateLowerIndexDiff(idx_up_diff_part, idx_low_old_part);
+            idx_low_diff_part = tran.CalculateLowerIndexDiff(
+                to_array(idx_up_diff_part), to_array(idx_up_old_part), to_array(idx_low_old_part));
        });

        return idx_low_diff;

...
@@ -299,6 +337,12 @@ struct TransformedTensorDescriptor
        return GetLowerTensorDescriptor().CalculateOffset(CalculateLowerIndex(idx_up));
    }

+    // TODO: remove this
+    __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(const UpperIndex& idx_up)
+    {
+        return CalculateOffset(idx_up);
+    }
+
#if 0
    template <index_t IDim>
    __host__ __device__ static constexpr bool IsLinearDimension(Number<IDim>)

...
@@ -321,6 +365,22 @@ struct TransformedTensorDescriptor
        // not implemented
    }
#endif

+    // TODO: should this function be here? should it be specific for padding check?
+    __host__ __device__ static constexpr bool IsUpperIndexInPaddingArea(const UpperIndex& idx_up)
+    {
+        bool flag = false;
+
+        static_for<0, nTransform, 1>{}([&](auto itran) {
+            constexpr auto tran = Transforms{}.At(itran);
+
+            const auto idx_up_part = pick_array_element(idx_up, UpDimensionIds{}.At(itran));
+
+            flag = flag || tran.IsUpperIndexInPaddingArea(to_array(idx_up_part));
+        });
+
+        return flag;
+    }
};

template <index_t... Lengths, index_t... Strides>

...
@@ -337,7 +397,7 @@ __host__ __device__ constexpr auto make_native_tensor_descriptor_packed(Lengths)
            Lengths::PopFront(), math::multiplies<index_t>{}, Number<1>{})
            .PushBack(Number<1>{});

-    return make_NativeTensorDescriptor(Lengths{}, strides);
+    return make_native_tensor_descriptor(Lengths{}, strides);
}

template <typename LowTensorDescriptor,

...
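The descriptor-level check added above is just an OR-reduction: the upper index is split into the per-transformation pieces (via UpDimensionIds), each transformation votes on whether its piece lands in a pad region, and any vote marks the whole index as padded. A compact standalone rendering of that reduction (illustrative, not the library's code):

    #include <functional>
    #include <vector>

    // Each toy "transform" owns the dimensions listed in dim_ids and reports whether
    // that sub-index is padded; the descriptor ORs the answers together, as
    // TransformedTensorDescriptor::IsUpperIndexInPaddingArea does above.
    struct ToyTransform
    {
        std::vector<int> dim_ids;                            // which upper dims it owns
        std::function<bool(const std::vector<int>&)> is_pad; // check on its sub-index
    };

    bool is_upper_index_in_padding_area(const std::vector<int>& idx_up,
                                        const std::vector<ToyTransform>& transforms)
    {
        bool flag = false;
        for(const auto& t : transforms)
        {
            std::vector<int> part;
            for(int d : t.dim_ids)
                part.push_back(idx_up[d]);
            flag = flag || t.is_pad(part);
        }
        return flag;
    }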
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
...

@@ -7,6 +7,8 @@
#include "tensor_coordinate.hpp"
#include "tensor_view.hpp"
#include "threadwise_generic_tensor_slice_copy.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_coordinate_v2.hpp"

#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1

...
@@ -418,6 +420,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
    }
};

+// This version use TensorCoordiante
// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
// memory layout (ordering of dimensions) can be different between src and dst.
template <index_t BlockSize,

...
@@ -518,7 +521,7 @@ struct BlockwiseGenericTensorSliceCopy_v2
    }

    private:
-    using RegisterBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));
+    using RegisterBufferDesc = decltype(make_native_tensor_descriptor_packed(SubLengths{}));

    using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v2r1<SrcDesc,
                                                                 RegisterBufferDesc,

...
@@ -544,6 +547,7 @@ struct BlockwiseGenericTensorSliceCopy_v2
    ThreadwiseStore mThreadwiseStore;
};

+// this version use TensorView and TensorCoordinate
template <index_t BlockSize,
          class SrcTensor,
          class DstTensor,

...
@@ -639,25 +643,25 @@ struct BlockwiseGenericTensorSliceCopy_v3
    using ThreadBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));

    using ThreadBufferTensor = NormalTensorView<ThreadBufferDesc, data_type>;

-    using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v3<SrcTensor,
-                                                               ThreadBufferTensor,
-                                                               SubLengths,
-                                                               SrcDimAccessOrder,
-                                                               SrcDimAccessOrder,
-                                                               SrcVectorAccessDim,
-                                                               SrcVectorAccessDim,
-                                                               SrcDataPerAccess,
-                                                               1>;
-
-    using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v3<ThreadBufferTensor,
-                                                                DstTensor,
-                                                                SubLengths,
-                                                                DstDimAccessOrder,
-                                                                DstDimAccessOrder,
-                                                                DstVectorAccessDim,
-                                                                DstVectorAccessDim,
-                                                                1,
-                                                                DstDataPerAccess>;
+    using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v3r1<SrcTensor,
+                                                                 ThreadBufferTensor,
+                                                                 SubLengths,
+                                                                 SrcDimAccessOrder,
+                                                                 SrcDimAccessOrder,
+                                                                 SrcVectorAccessDim,
+                                                                 SrcVectorAccessDim,
+                                                                 SrcDataPerAccess,
+                                                                 1>;
+
+    using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v3r1<ThreadBufferTensor,
+                                                                  DstTensor,
+                                                                  SubLengths,
+                                                                  DstDimAccessOrder,
+                                                                  DstDimAccessOrder,
+                                                                  DstVectorAccessDim,
+                                                                  DstVectorAccessDim,
+                                                                  1,
+                                                                  DstDataPerAccess>;

    data_type mpBuffer[ThreadBufferDesc::GetElementSpace()];

...
@@ -667,6 +671,125 @@ struct BlockwiseGenericTensorSliceCopy_v3
    ThreadwiseStore mThreadwiseStore;
};

+template <index_t BlockSize,
+          class SrcDesc,
+          class DstDesc,
+          class SliceLengths,
+          class SubLengths,
+          class ThreadClusterLengths,
+          class ThreadClusterArrangeOrder,
+          class SrcDimAccessOrder,
+          class DstDimAccessOrder,
+          index_t SrcVectorAccessDim,
+          index_t DstVectorAccessDim,
+          index_t SrcDataPerAccess,
+          index_t DstDataPerAccess>
+struct BlockwiseGenericTensorSliceCopy_v4
+{
+    static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
+
+    using SrcCoord = typename TensorCoordinate_v2<SrcDesc>::type;
+    using DstCoord = typename TensorCoordinate_v2<DstDesc>::type;
+
+    __device__ constexpr BlockwiseGenericTensorSliceCopy_v4(SrcCoord src_block_slice_origin,
+                                                            DstCoord dst_block_slice_origin)
+    {
+        static_assert(nDim == SrcDesc::GetNumOfDimension() &&
+                          nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::Size() &&
+                          nDim == SubLengths::Size() && nDim == ThreadClusterLengths::Size() &&
+                          nDim == ThreadClusterArrangeOrder::Size() &&
+                          nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(),
+                      "wrong! nDim not consistent");
+
+        static_assert(is_same<SliceLengths, decltype(SubLengths{} * ThreadClusterLengths{})>{},
+                      "wrong! threads should be mapped to cover entire slicing window");
+
+        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
+            ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
+
+        static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
+                      "wrong! BlockSize not consistent with ThreadClusterLengths");
+
+        const auto thread_cluster_id =
+            thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
+
+        const auto data_cluster_id =
+            reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});
+
+        const auto thread_data_id_begin = data_cluster_id * SubLengths{};
+
+        mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin);
+        mThreadwiseLoad.SetDstSliceOrigin(make_zero_array<index_t, nDim>());
+
+        mThreadwiseStore.SetSrcSliceOrigin(make_zero_array<index_t, nDim>());
+        mThreadwiseStore.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin);
+    }
+
+    __device__ static constexpr index_t GetRegisterBufferSize()
+    {
+        return RegisterBufferDesc::GetElementSpace();
+    }
+
+    template <class TData>
+    __device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
+    {
+        mThreadwiseLoad.Run(p_src, p_buffer);
+    }
+
+    template <class TData>
+    __device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
+    {
+        mThreadwiseStore.Run(p_buffer, p_dst);
+    }
+
+    template <class TData>
+    __device__ void Run(const TData* p_src, TData* p_dst) const
+    {
+        TData p_buffer[GetRegisterBufferSize()];
+
+        mThreadwiseLoad.Run(p_src, p_buffer);
+        mThreadwiseStore.Run(p_buffer, p_dst);
+    }
+
+    template <class T, bool PositiveDirection>
+    __device__ void MoveSrcSlicingWindow(T step_sizes,
+                                         integral_constant<bool, PositiveDirection> positive_direction)
+    {
+        mThreadwiseLoad.MoveSrcSlicingWindow(step_sizes, positive_direction);
+    }
+
+    template <class T, bool PositiveDirection>
+    __device__ void MoveDstSlicingWindow(T step_sizes,
+                                         integral_constant<bool, PositiveDirection> positive_direction)
+    {
+        mThreadwiseStore.MoveDstSlicingWindow(step_sizes, positive_direction);
+    }
+
+    private:
+    using RegisterBufferDesc = decltype(make_native_tensor_descriptor_packed(SubLengths{}));
+
+    using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v4r2<SrcDesc,
+                                                                 RegisterBufferDesc,
+                                                                 SubLengths,
+                                                                 SrcDimAccessOrder,
+                                                                 SrcVectorAccessDim,
+                                                                 SrcDataPerAccess,
+                                                                 1>;
+
+    using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v4r2<RegisterBufferDesc,
+                                                                  DstDesc,
+                                                                  SubLengths,
+                                                                  DstDimAccessOrder,
+                                                                  DstVectorAccessDim,
+                                                                  1,
+                                                                  DstDataPerAccess>;
+
+    ThreadwiseLoad mThreadwiseLoad;
+    ThreadwiseStore mThreadwiseStore;
+};
+
} // namespace ck
#endif
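The key line in the new v4 constructor is thread_data_id_begin = data_cluster_id * SubLengths{}: the block's slice is a grid of ThreadClusterLengths thread tiles, each thread owning a SubLengths-sized sub-tile, which is why SliceLengths must equal SubLengths * ThreadClusterLengths (the second static_assert). A small worked example of that mapping, ignoring ThreadClusterArrangeOrder for simplicity; the numbers below are made up for illustration:

    #include <cstdio>

    int main()
    {
        // Suppose a 2-D slice of 8 x 16 elements is copied by a 2 x 4 thread cluster,
        // each thread handling a 4 x 4 sub-tile: SliceLengths = SubLengths * ThreadClusterLengths.
        const int sub_lengths[2]     = {4, 4};
        const int cluster_lengths[2] = {2, 4};

        // Thread t gets cluster coordinates (row-major over the cluster) and its data
        // origin inside the block slice is cluster_id * sub_lengths.
        for(int t = 0; t < cluster_lengths[0] * cluster_lengths[1]; ++t)
        {
            const int cid[2]   = {t / cluster_lengths[1], t % cluster_lengths[1]};
            const int begin[2] = {cid[0] * sub_lengths[0], cid[1] * sub_lengths[1]};
            std::printf("thread %d copies rows [%d,%d) x cols [%d,%d)\n",
                        t, begin[0], begin[0] + sub_lengths[0], begin[1], begin[1] + sub_lengths[1]);
        }
    }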
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
...

@@ -6,6 +6,8 @@
#include "ConstantMergedTensorDescriptor.hpp"
#include "tensor_coordinate.hpp"
#include "tensor_view.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_coordinate_v2.hpp"

#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0

...
@@ -427,6 +429,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2
    Array<index_t, nDim> mDstSliceOrigin;
};

+// This version use TensorCoordinate
// This threadwise copy allow vector access of src and dst.
// It allows the dimensions of vector access to be different on src and dst.
// It also allows the vector size to be different on src and dst.

...
@@ -774,6 +777,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
    DstCoordinate mDstSliceOrigin;
};

+// this version use TensorView and TensorCoordinate
template <class SrcTensor,
          class DstTensor,
          class SliceLengths,

...
@@ -783,7 +787,7 @@ template <class SrcTensor,
          index_t DstVectorAccessDim,
          index_t SrcDataPerAccess,
          index_t DstDataPerAccess>
-struct ThreadwiseGenericTensorSliceCopy_v3
+struct ThreadwiseGenericTensorSliceCopy_v3r1
{
    static constexpr index_t nDim = SrcTensor::GetNumOfDimension();

    using data_type = remove_cv_t<typename SrcTensor::data_type>;

...
@@ -791,10 +795,10 @@ struct ThreadwiseGenericTensorSliceCopy_v3
    using SrcCoordinate = typename SrcTensor::coordinate_type;
    using DstCoordinate = typename DstTensor::coordinate_type;

-    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v3(SrcTensor src,
-                                                             SrcCoordinate src_slice_origin,
-                                                             DstTensor dst,
-                                                             DstCoordinate dst_slice_origin)
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v3r1(SrcTensor src,
+                                                               SrcCoordinate src_slice_origin,
+                                                               DstTensor dst,
+                                                               DstCoordinate dst_slice_origin)
        : mSrc{src},
          mDst{dst},
          mSrcSlice{src.Slice(src_slice_origin, SliceLengths{})},

...
@@ -821,8 +825,8 @@ struct ThreadwiseGenericTensorSliceCopy_v3
                      "wrong! vectorized access is not allowed");
    }

-    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v3()
-        : ThreadwiseGenericTensorSliceCopy_v3(
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v3r1()
+        : ThreadwiseGenericTensorSliceCopy_v3r1(
              SrcTensor{}, SrcCoordinate{}, DstTensor{}, DstCoordinate{})
    {
    }

...
@@ -940,5 +944,154 @@ struct ThreadwiseGenericTensorSliceCopy_v3
    DstSlice mDstSlice;
};

+// This version use multi-index transformation
+// This threadwise copy allow vector access of src and dst.
+// It allows the vector size to be different on src and dst.
+// The dimensions of vector access should be the same on src and dst.
+// The dimension access order should be the same on src and dst.
+// It is designed for cases, where one of src and dst is register, and
+// the other is device memory or LDS
+template <class SrcDesc,
+          class DstDesc,
+          class SliceLengths,
+          class DimAccessOrder,
+          index_t VectorAccessDim,
+          index_t SrcDataPerAccess,
+          index_t DstDataPerAccess>
+struct ThreadwiseGenericTensorSliceCopy_v4r2
+{
+    static constexpr index_t nDim = SliceLengths::Size();
+
+    using Index = MultiIndex<nDim>;
+
+    using SrcCoord = typename TensorCoordinate_v2<SrcDesc>::type;
+    using DstCoord = typename TensorCoordinate_v2<DstDesc>::type;
+
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2(SrcCoord src_slice_origin,
+                                                               DstCoord dst_slice_origin)
+        : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
+    {
+        static_assert(nDim == SrcDesc::GetNumOfDimension() &&
+                          nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::Size() &&
+                          nDim == DimAccessOrder::Size(),
+                      "wrong! # of dimensions not the same");
+
+        static_assert(is_valid_sequence_map<DimAccessOrder>{}, "wrong! map is not valid");
+
+        static_assert(SliceLengths{}[VectorAccessDim] %
+                              math::lcm(SrcDataPerAccess, DstDataPerAccess) ==
+                          0,
+                      "wrong! cannot evenly divide");
+
+        // TODO:: sanity-check if vectorized memory access is allowed on src and dst
+    }
+
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2()
+        : ThreadwiseGenericTensorSliceCopy_v4r2(make_zero_array<index_t, nDim>(),
+                                                make_zero_array<index_t, nDim>())
+    {
+    }
+
+    __device__ void SetSrcSliceOrigin(SrcCoord src_slice_origin)
+    {
+        mSrcSliceOrigin = src_slice_origin;
+    }
+
+    __device__ void SetDstSliceOrigin(DstCoord dst_slice_origin)
+    {
+        mDstSliceOrigin = dst_slice_origin;
+    }
+
+    template <class TData>
+    __device__ void Run(const TData* p_src, TData* p_dst) const
+    {
+        using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
+        using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
+
+        constexpr auto vector_access_dim = Number<VectorAccessDim>{};
+
+        constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
+        constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
+
+        constexpr auto long_vector_size = Number<math::lcm(SrcDataPerAccess, DstDataPerAccess)>{};
+
+        constexpr auto long_vector_access_lengths = SliceLengths::Modify(
+            vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
+
+        ford<decltype(long_vector_access_lengths), DimAccessOrder>{}(
+            [&](auto long_vector_access_id) {
+                // data id w.r.t slicing-window
+                auto long_vector_data_begin_id = long_vector_access_id;
+                long_vector_data_begin_id(vector_access_dim) =
+                    long_vector_size * long_vector_access_id[vector_access_dim];
+
+                // buffer to hold a long-vector
+                TData p_long_vector[long_vector_size];
+
+                // set 0
+                for(index_t i = 0; i < long_vector_size; ++i)
+                {
+                    p_long_vector[i] = 0;
+                }
+
+                // load data from src to the long-vector buffer
+                for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
+                {
+                    auto scalar_id               = make_zero_array<index_t, nDim>();
+                    scalar_id(vector_access_dim) = i * src_data_per_access;
+
+                    const auto src_coord = mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id);
+
+                    // check for padding
+                    // TODO: still kind of messy
+                    if(!src_coord.IsAnyLevelIndexInPaddingArea())
+                    {
+                        const index_t src_offset =
+                            (mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id)).GetOffset();
+
+                        const index_t buffer_offset = i * src_data_per_access;
+
+                        *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
+                            *reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
+                    }
+                }
+
+                // store data from the long-vector buffer to dst
+                for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
+                {
+                    auto scalar_id               = make_zero_array<index_t, nDim>();
+                    scalar_id(vector_access_dim) = i * dst_data_per_access;
+
+                    const index_t buffer_offset = i * dst_data_per_access;
+
+                    const index_t dst_offset =
+                        (mDstSliceOrigin + (long_vector_data_begin_id + scalar_id)).GetOffset();
+
+                    *reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) =
+                        *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
+                }
+            });
+    }
+
+    template <class T, bool PositiveDirection>
+    __device__ void MoveSrcSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
+    {
+        static_if<PositiveDirection>{}([&](auto) { mSrcSliceOrigin += step_sizes; })
+            .Else([&](auto) { mSrcSliceOrigin -= step_sizes; });
+    }
+
+    template <class T, bool PositiveDirection>
+    __device__ void MoveDstSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
+    {
+        static_if<PositiveDirection>{}([&](auto) { mDstSliceOrigin += step_sizes; })
+            .Else([&](auto) { mDstSliceOrigin -= step_sizes; });
+    }
+
+    private:
+    SrcCoord mSrcSliceOrigin;
+    DstCoord mDstSliceOrigin;
+};
+
} // namespace ck
#endif
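What v4r2's Run does, step by step: it walks the slice in chunks of lcm(SrcDataPerAccess, DstDataPerAccess) elements along the vector dimension, zero-fills a small local buffer, loads each source sub-vector only when its coordinate is not in the padding area (so padded positions stay zero), and then writes the buffer out in destination-sized vectors. Zero-filling up front is what turns "skip the load" into "read a zero" without any branching on the store side. A scalar, standalone rendition of that inner loop for one long vector (illustrative only, not the library's code):

    #include <cstdio>

    // One "long vector" worth of the copy in ThreadwiseGenericTensorSliceCopy_v4r2::Run,
    // with vector accesses replaced by scalars and padding modelled by a callback.
    void copy_long_vector(const float* src, float* dst, int long_vector_size,
                          bool (*src_elem_is_padded)(int))
    {
        float buffer[64]; // assume long_vector_size <= 64 for this sketch

        for(int i = 0; i < long_vector_size; ++i)
            buffer[i] = 0.0f; // padded reads are never performed, so they stay zero

        for(int i = 0; i < long_vector_size; ++i)
            if(!src_elem_is_padded(i))
                buffer[i] = src[i];

        for(int i = 0; i < long_vector_size; ++i)
            dst[i] = buffer[i];
    }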
composable_kernel/include/utility/array.hpp
...

@@ -23,20 +23,9 @@ struct Array
        static_assert(sizeof...(Xs) + 1 == NSize, "wrong! size");
    }

-#if 0
-    template <typename T>
-    __host__ __device__ explicit constexpr Array(const T& x)
-    {
-        static_assert(T::Size() == NSize, "wrong! size");
-        static_for<0, NSize, 1>{}([&](auto i){
-            mData[i] = x.At(i);
-        })
-    }
-#endif
+    __host__ __device__ static constexpr index_t Size() { return NSize; }

+    // TODO: remove
+    __host__ __device__ static constexpr index_t GetSize() { return Size(); }

    template <index_t I>

...
@@ -265,8 +254,8 @@ __host__ __device__ constexpr auto extract_array(const Array<TData, NSize>& old_
    return new_array;
}

-template <typename F, typename X, typename Y, typename Z>
-// emulate constepxr lambda for array
+// math
+
+// emulate constepxr lambda for array
+template <typename F, typename X, typename Y, typename Z>
struct lambda_array_math
{
    const F& f;

...
composable_kernel/include/utility/array_helper.hpp
...

@@ -5,8 +5,8 @@
namespace ck {

-template <typename T, index_t NSize>
-__host__ __device__ void print_array(const char* s, Array<T, NSize> a)
+template <index_t NSize>
+__host__ __device__ void print_array(const char* s, Array<unsigned_t, NSize> a)
{
    constexpr index_t nsize = a.GetSize();

...
@@ -89,5 +89,89 @@ __host__ __device__ void print_array(const char* s, Array<T, NSize> a)
    });
}

+template <index_t NSize>
+__host__ __device__ void print_array(const char* s, Array<signed_t, NSize> a)
+{
+    constexpr index_t nsize = a.GetSize();
+
+    static_assert(nsize > 0 && nsize <= 10, "wrong!");
+
+    static_if<nsize == 1>{}([&](auto) { printf("%s size %d, {%d}\n", s, nsize, a[0]); });
+
+    static_if<nsize == 2>{}([&](auto) { printf("%s size %d, {%d %d}\n", s, nsize, a[0], a[1]); });
+
+    static_if<nsize == 3>{}(
+        [&](auto) { printf("%s size %d, {%d %d %d}\n", s, nsize, a[0], a[1], a[2]); });
+
+    static_if<nsize == 4>{}(
+        [&](auto) { printf("%s size %d, {%d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3]); });
+
+    static_if<nsize == 5>{}([&](auto) {
+        printf("%s size %d, {%d %d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3], a[4]);
+    });
+
+    static_if<nsize == 6>{}([&](auto) {
+        printf("%s size %d, {%d %d %d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3], a[4], a[5]);
+    });
+
+    static_if<nsize == 7>{}([&](auto) {
+        printf("%s size %d, {%d %d %d %d %d %d %d}\n",
+               s, nsize, a[0], a[1], a[2], a[3], a[4], a[5], a[6]);
+    });
+
+    static_if<nsize == 8>{}([&](auto) {
+        printf("%s size %d, {%d %d %d %d %d %d %d %d}\n",
+               s, nsize, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
+    });
+
+    static_if<nsize == 9>{}([&](auto) {
+        printf("%s size %d, {%d %d %d %d %d %d %d %d %d}\n",
+               s, nsize, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8]);
+    });
+
+    static_if<nsize == 10>{}([&](auto) {
+        printf("%s size %d, {%d %d %d %d %d %d %d %d %d %d}\n",
+               s, nsize, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8], a[9]);
+    });
+}
+
} // namespace ck
#endif
composable_kernel/include/utility/config_nvidia.hpp.in
...

@@ -15,6 +15,15 @@
namespace ck {

+using unsigned_t = uint32_t;
+using signed_t   = int;
+
+#if 0 // debug
+using index_t = unsigned_t;
+#else
+using index_t = signed_t;
+#endif
+
// For some reason, CUDA need this definition, otherwise
// compiler won't generate optimal load and store instruction, and
// kernel would produce wrong result, indicating the compiler fail to generate correct

...
@@ -22,8 +31,6 @@ namespace ck {
using float2_t = float2;
using float4_t = float4;

-using index_t = uint32_t;
-
template <class T>
__device__ void fused_multiply_accumulate(T& d, const T& s0, const T& s1)
{

...
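Switching index_t from uint32_t to a signed int here plausibly matters for the padding work: Pad::CalculateLowerIndex computes idx_up - LeftPads, which is legitimately negative while the coordinate sits in the left pad, and an unsigned index would silently wrap instead of staying comparable against zero. The connection to padding is an inference, not stated in the commit. A two-line illustration:

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        std::int32_t  s = 0 - 1;   // signed lower index in the left pad: -1, easy to range-check
        std::uint32_t u = 0u - 1u; // unsigned wraps to 4294967295, so "idx < 0" can never catch it
        std::printf("%d %u\n", s, u);
    }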
composable_kernel/include/utility/sequence.hpp
...

@@ -537,11 +537,9 @@ struct sequence_unique_sort
};

template <typename SeqMap>
-struct is_valid_sequence_map
+struct is_valid_sequence_map
+    : is_same<typename arithmetic_sequence_gen<0, SeqMap::Size(), 1>::type,
+              typename sequence_sort<SeqMap, math::less<index_t>>::type>
{
-    static constexpr bool value =
-        is_same<typename arithmetic_sequence_gen<0, SeqMap::Size(), 1>::type,
-                typename sequence_sort<SeqMap, math::less<index_t>>::type>{};
};

template <typename SeqMap>

...
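The reworked is_valid_sequence_map now derives from is_same<...> instead of exposing only a ::value member, so it can be used directly as a boolean constant expression, e.g. static_assert(is_valid_sequence_map<DimAccessOrder>{}, ...) in the new copy classes. The property it encodes is simply "the map is a permutation of 0..N-1": sorting it must reproduce the arithmetic sequence. A standalone run-time equivalent of that check (illustrative only):

    #include <algorithm>
    #include <vector>

    // A dimension-access order is valid iff it is a permutation of 0..N-1,
    // i.e. sorting it yields the arithmetic sequence 0, 1, ..., N-1.
    bool is_valid_sequence_map(std::vector<int> m)
    {
        std::sort(m.begin(), m.end());
        for(int i = 0; i < static_cast<int>(m.size()); ++i)
            if(m[i] != i)
                return false;
        return true;
    }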
driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp
...

@@ -115,7 +115,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded(InDesc,
    constexpr index_t OutThreadCopyDataPerAccess_N = 4;
#endif

-#if 0 // debug
+#if 1 // debug
    constexpr index_t GridSize =
        (N / NPerBlock) * (K / KPerBlock) * (Ho / HoPerBlock) * (Wo / WoPerBlock);
#else

...
driver/src/driver.cpp
...

@@ -73,25 +73,25 @@ int main(int argc, char* argv[])
    using namespace ck;

#if 1
-    constexpr index_t N  = 10;
-    constexpr index_t C  = 10;
-    constexpr index_t HI = 10;
-    constexpr index_t WI = 10;
-    constexpr index_t K  = 10;
-    constexpr index_t Y  = 1;
-    constexpr index_t X  = 1;
+    constexpr index_t N  = 32;
+    constexpr index_t C  = 8;
+    constexpr index_t HI = 2;
+    constexpr index_t WI = 2;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;

    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;

-    constexpr index_t HPad = 3;
-    constexpr index_t WPad = 3;
+    constexpr index_t HPad = 1;
+    constexpr index_t WPad = 1;
#elif 1
    // 3x3, 34x34
    constexpr index_t N  = 64;
    constexpr index_t C  = 256;
-    constexpr index_t HI = 34;
-    constexpr index_t WI = 34;
+    constexpr index_t HI = 32;
+    constexpr index_t WI = 32;
    constexpr index_t K  = 128;
    constexpr index_t Y  = 3;
    constexpr index_t X  = 3;

...
@@ -99,8 +99,8 @@ int main(int argc, char* argv[])
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;

-    constexpr index_t HPad = 0;
-    constexpr index_t WPad = 0;
+    constexpr index_t HPad = 1;
+    constexpr index_t WPad = 1;
#elif 0
    // 1x1 filter, 8x8 image
    // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%

...
@@ -434,7 +434,7 @@ int main(int argc, char* argv[])
    if(do_verification)
    {
-#if 1
+#if 0
        if(Y == 3 && X == 3 && ConvStrides{}[0] == 1 && ConvStrides{}[1] == 1 &&
           ConvDilations{}[0] == 1 && ConvDilations{}[1] == 1)
        {

...
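For the new default problem in the first #if branch, with stride 1 and dilation 1 the padded output size works out as Ho = HI + 2*HPad - Y + 1; with HI = WI = 2, HPad = WPad = 1 and a 3x3 filter that gives a 2x2 output, so the tiny padded case exercises every border. It also matches the second branch's change from HI = 34 (no pad) to HI = 32 with HPad = 1: the output stays 32x32 either way. A quick constexpr check of that arithmetic (the formula is the standard convolution output-size relation, assumed here rather than quoted from the driver):

    #include <cassert>

    constexpr int conv_out_len(int in_len, int pad_left, int pad_right, int filter, int stride = 1,
                               int dilation = 1)
    {
        return (in_len + pad_left + pad_right - dilation * (filter - 1) - 1) / stride + 1;
    }

    int main()
    {
        static_assert(conv_out_len(2, 1, 1, 3) == 2, "HI=2, HPad=1, Y=3 -> Ho=2");
        static_assert(conv_out_len(32, 1, 1, 3) == 32, "HI=32, HPad=1, Y=3 -> Ho=32 (same size)");
    }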