Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
yangql
composable_kernel-1
Commits
67ad47e7
You need to sign in or sign up before continuing.
Commit
67ad47e7
authored
Aug 16, 2021
by
Chao Liu
Browse files
refactor
parent
16effa76
Changes
14
Hide whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
63 additions
and
69 deletions
+63
-69
composable_kernel/include/tensor_description/cluster_descriptor.hpp
..._kernel/include/tensor_description/cluster_descriptor.hpp
+1
-1
composable_kernel/include/tensor_description/multi_index_transform.hpp
...rnel/include/tensor_description/multi_index_transform.hpp
+14
-14
composable_kernel/include/tensor_description/tensor_adaptor.hpp
...able_kernel/include/tensor_description/tensor_adaptor.hpp
+1
-1
composable_kernel/include/tensor_description/tensor_descriptor.hpp
...e_kernel/include/tensor_description/tensor_descriptor.hpp
+1
-1
composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp
...l/include/tensor_description/tensor_descriptor_helper.hpp
+6
-6
composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp
...lude/tensor_operation/blockwise_tensor_slice_transfer.hpp
+1
-1
composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer_v2.hpp
...e/tensor_operation/blockwise_tensor_slice_transfer_v2.hpp
+1
-1
composable_kernel/include/tensor_operation/gridwise_contraction_dlops_v1r2.hpp
...lude/tensor_operation/gridwise_contraction_dlops_v1r2.hpp
+8
-8
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r2.hpp
...nel/include/tensor_operation/gridwise_gemm_dlops_v1r2.hpp
+6
-6
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r3.hpp
...nel/include/tensor_operation/gridwise_gemm_dlops_v1r3.hpp
+8
-8
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp
...ernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp
+3
-3
composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp
...el/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp
+4
-4
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v2.hpp
.../tensor_operation/threadwise_tensor_slice_transfer_v2.hpp
+9
-9
composable_kernel/include/utility/math.hpp
composable_kernel/include/utility/math.hpp
+0
-6
No files found.
composable_kernel/include/tensor_description/cluster_descriptor.hpp
View file @
67ad47e7
...
@@ -8,7 +8,7 @@ namespace ck {
...
@@ -8,7 +8,7 @@ namespace ck {
template
<
typename
Lengths
,
template
<
typename
Lengths
,
typename
ArrangeOrder
=
typename
arithmetic_sequence_gen
<
0
,
Lengths
::
Size
(),
1
>
::
type
>
typename
ArrangeOrder
=
typename
arithmetic_sequence_gen
<
0
,
Lengths
::
Size
(),
1
>
::
type
>
__host__
__device__
constexpr
auto
make_cluster_descriptor
_v2
(
__host__
__device__
constexpr
auto
make_cluster_descriptor
(
const
Lengths
&
lengths
,
const
Lengths
&
lengths
,
ArrangeOrder
order
=
typename
arithmetic_sequence_gen
<
0
,
Lengths
::
Size
(),
1
>::
type
{})
ArrangeOrder
order
=
typename
arithmetic_sequence_gen
<
0
,
Lengths
::
Size
(),
1
>::
type
{})
{
{
...
...
composable_kernel/include/tensor_description/multi_index_transform.hpp
View file @
67ad47e7
...
@@ -481,11 +481,11 @@ struct Merge_v1_carry_check
...
@@ -481,11 +481,11 @@ struct Merge_v1_carry_check
using
LowerIndex
=
MultiIndex
<
NDimLow
>
;
using
LowerIndex
=
MultiIndex
<
NDimLow
>
;
using
UpperIndex
=
MultiIndex
<
1
>
;
using
UpperIndex
=
MultiIndex
<
1
>
;
using
LowLengthsScan
=
decltype
(
using
LowLengthsScan
=
container_reverse_exclusive_scan
(
LowLengths
{},
math
::
multiplies
_v2
{},
Number
<
1
>
{}));
decltype
(
container_reverse_exclusive_scan
(
LowLengths
{},
math
::
multiplies
{},
Number
<
1
>
{}));
using
UpLengths
=
using
UpLengths
=
decltype
(
make_tuple
(
container_reduce
(
LowLengths
{},
math
::
multiplies
_v2
{},
Number
<
1
>
{})));
decltype
(
make_tuple
(
container_reduce
(
LowLengths
{},
math
::
multiplies
{},
Number
<
1
>
{})));
LowLengths
low_lengths_
;
LowLengths
low_lengths_
;
LowLengthsScan
low_lengths_scan_
;
LowLengthsScan
low_lengths_scan_
;
...
@@ -496,8 +496,8 @@ struct Merge_v1_carry_check
...
@@ -496,8 +496,8 @@ struct Merge_v1_carry_check
__host__
__device__
constexpr
Merge_v1_carry_check
(
const
LowLengths
&
low_lengths
)
__host__
__device__
constexpr
Merge_v1_carry_check
(
const
LowLengths
&
low_lengths
)
:
low_lengths_
{
low_lengths
},
:
low_lengths_
{
low_lengths
},
low_lengths_scan_
{
low_lengths_scan_
{
container_reverse_exclusive_scan
(
low_lengths
,
math
::
multiplies
_v2
{},
Number
<
1
>
{})},
container_reverse_exclusive_scan
(
low_lengths
,
math
::
multiplies
{},
Number
<
1
>
{})},
up_lengths_
{
make_tuple
(
container_reduce
(
low_lengths
,
math
::
multiplies
_v2
{},
Number
<
1
>
{}))}
up_lengths_
{
make_tuple
(
container_reduce
(
low_lengths
,
math
::
multiplies
{},
Number
<
1
>
{}))}
{
{
static_assert
(
LowerIndex
::
Size
()
==
NDimLow
,
"wrong!"
);
static_assert
(
LowerIndex
::
Size
()
==
NDimLow
,
"wrong!"
);
}
}
...
@@ -1037,7 +1037,7 @@ struct Merge_v2_magic_division
...
@@ -1037,7 +1037,7 @@ struct Merge_v2_magic_division
using
UpperIndex
=
MultiIndex
<
1
>
;
using
UpperIndex
=
MultiIndex
<
1
>
;
using
UpLengths
=
using
UpLengths
=
decltype
(
make_tuple
(
container_reduce
(
LowLengths
{},
math
::
multiplies
_v2
{},
Number
<
1
>
{})));
decltype
(
make_tuple
(
container_reduce
(
LowLengths
{},
math
::
multiplies
{},
Number
<
1
>
{})));
using
LowLengthsMagicDivisorMultipiler
=
decltype
(
using
LowLengthsMagicDivisorMultipiler
=
decltype
(
generate_tuple
(
lambda_merge_generate_MagicDivision_calculate_magic_multiplier
<
LowLengths
>
{},
generate_tuple
(
lambda_merge_generate_MagicDivision_calculate_magic_multiplier
<
LowLengths
>
{},
...
@@ -1062,7 +1062,7 @@ struct Merge_v2_magic_division
...
@@ -1062,7 +1062,7 @@ struct Merge_v2_magic_division
low_lengths_magic_divisor_shift_
{
generate_tuple
(
low_lengths_magic_divisor_shift_
{
generate_tuple
(
[
&
](
auto
i
)
{
return
MagicDivision
::
CalculateMagicShift
(
low_lengths
[
i
]);
},
[
&
](
auto
i
)
{
return
MagicDivision
::
CalculateMagicShift
(
low_lengths
[
i
]);
},
Number
<
NDimLow
>
{})},
Number
<
NDimLow
>
{})},
up_lengths_
{
make_tuple
(
container_reduce
(
low_lengths
,
math
::
multiplies
_v2
{},
Number
<
1
>
{}))}
up_lengths_
{
make_tuple
(
container_reduce
(
low_lengths
,
math
::
multiplies
{},
Number
<
1
>
{}))}
{
{
static_assert
(
LowerIndex
::
Size
()
==
NDimLow
,
"wrong!"
);
static_assert
(
LowerIndex
::
Size
()
==
NDimLow
,
"wrong!"
);
}
}
...
@@ -1188,11 +1188,11 @@ struct Merge_v2r2_magic_division
...
@@ -1188,11 +1188,11 @@ struct Merge_v2r2_magic_division
using
LowerIndex
=
MultiIndex
<
NDimLow
>
;
using
LowerIndex
=
MultiIndex
<
NDimLow
>
;
using
UpperIndex
=
MultiIndex
<
1
>
;
using
UpperIndex
=
MultiIndex
<
1
>
;
using
LowLengthsScan
=
decltype
(
using
LowLengthsScan
=
container_reverse_exclusive_scan
(
LowLengths
{},
math
::
multiplies
_v2
{},
Number
<
1
>
{}));
decltype
(
container_reverse_exclusive_scan
(
LowLengths
{},
math
::
multiplies
{},
Number
<
1
>
{}));
using
UpLengths
=
using
UpLengths
=
decltype
(
make_tuple
(
container_reduce
(
LowLengths
{},
math
::
multiplies
_v2
{},
Number
<
1
>
{})));
decltype
(
make_tuple
(
container_reduce
(
LowLengths
{},
math
::
multiplies
{},
Number
<
1
>
{})));
using
LowLengthsScanMagicDivisorMultipiler
=
decltype
(
generate_tuple
(
using
LowLengthsScanMagicDivisorMultipiler
=
decltype
(
generate_tuple
(
lambda_merge_generate_MagicDivision_calculate_magic_multiplier
<
LowLengthsScan
>
{},
lambda_merge_generate_MagicDivision_calculate_magic_multiplier
<
LowLengthsScan
>
{},
...
@@ -1213,14 +1213,14 @@ struct Merge_v2r2_magic_division
...
@@ -1213,14 +1213,14 @@ struct Merge_v2r2_magic_division
__host__
__device__
constexpr
Merge_v2r2_magic_division
(
const
LowLengths
&
low_lengths
)
__host__
__device__
constexpr
Merge_v2r2_magic_division
(
const
LowLengths
&
low_lengths
)
:
low_lengths_
{
low_lengths
},
:
low_lengths_
{
low_lengths
},
low_lengths_scan_
{
low_lengths_scan_
{
container_reverse_exclusive_scan
(
low_lengths
,
math
::
multiplies
_v2
{},
Number
<
1
>
{})},
container_reverse_exclusive_scan
(
low_lengths
,
math
::
multiplies
{},
Number
<
1
>
{})},
low_lengths_scan_magic_divisor_multiplier_
{
generate_tuple
(
low_lengths_scan_magic_divisor_multiplier_
{
generate_tuple
(
[
&
](
auto
i
)
{
return
MagicDivision
::
CalculateMagicMultiplier
(
low_lengths_scan_
[
i
]);
},
[
&
](
auto
i
)
{
return
MagicDivision
::
CalculateMagicMultiplier
(
low_lengths_scan_
[
i
]);
},
Number
<
NDimLow
>
{})},
Number
<
NDimLow
>
{})},
low_lengths_scan_magic_divisor_shift_
{
generate_tuple
(
low_lengths_scan_magic_divisor_shift_
{
generate_tuple
(
[
&
](
auto
i
)
{
return
MagicDivision
::
CalculateMagicShift
(
low_lengths_scan_
[
i
]);
},
[
&
](
auto
i
)
{
return
MagicDivision
::
CalculateMagicShift
(
low_lengths_scan_
[
i
]);
},
Number
<
NDimLow
>
{})},
Number
<
NDimLow
>
{})},
up_lengths_
{
make_tuple
(
container_reduce
(
low_lengths
,
math
::
multiplies
_v2
{},
Number
<
1
>
{}))}
up_lengths_
{
make_tuple
(
container_reduce
(
low_lengths
,
math
::
multiplies
{},
Number
<
1
>
{}))}
{
{
static_assert
(
LowerIndex
::
Size
()
==
NDimLow
,
"wrong!"
);
static_assert
(
LowerIndex
::
Size
()
==
NDimLow
,
"wrong!"
);
}
}
...
@@ -1336,7 +1336,7 @@ struct UnMerge
...
@@ -1336,7 +1336,7 @@ struct UnMerge
using
UpperIndex
=
MultiIndex
<
NDimUp
>
;
using
UpperIndex
=
MultiIndex
<
NDimUp
>
;
using
UpLengthsScan
=
using
UpLengthsScan
=
decltype
(
container_reverse_exclusive_scan
(
UpLengths
{},
math
::
multiplies
_v2
{},
Number
<
1
>
{}));
decltype
(
container_reverse_exclusive_scan
(
UpLengths
{},
math
::
multiplies
{},
Number
<
1
>
{}));
UpLengths
up_lengths_
;
UpLengths
up_lengths_
;
UpLengthsScan
up_lengths_scan_
;
UpLengthsScan
up_lengths_scan_
;
...
@@ -1346,7 +1346,7 @@ struct UnMerge
...
@@ -1346,7 +1346,7 @@ struct UnMerge
__host__
__device__
constexpr
UnMerge
(
const
UpLengths
&
up_lengths
)
__host__
__device__
constexpr
UnMerge
(
const
UpLengths
&
up_lengths
)
:
up_lengths_
{
up_lengths
},
:
up_lengths_
{
up_lengths
},
up_lengths_scan_
{
up_lengths_scan_
{
container_reverse_exclusive_scan
(
up_lengths
,
math
::
multiplies
_v2
{},
Number
<
1
>
{})}
container_reverse_exclusive_scan
(
up_lengths
,
math
::
multiplies
{},
Number
<
1
>
{})}
{
{
}
}
...
...
composable_kernel/include/tensor_description/tensor_adaptor.hpp
View file @
67ad47e7
...
@@ -64,7 +64,7 @@ struct TensorAdaptor
...
@@ -64,7 +64,7 @@ struct TensorAdaptor
Number
<
ndim_top_
>
{});
Number
<
ndim_top_
>
{});
// TODO: make container_reduce support tuple of Number and index_t
// TODO: make container_reduce support tuple of Number and index_t
return
container_reduce
(
lengths
,
math
::
multiplies
_v2
{},
Number
<
1
>
{});
return
container_reduce
(
lengths
,
math
::
multiplies
{},
Number
<
1
>
{});
}
}
template
<
index_t
IDim
>
template
<
index_t
IDim
>
...
...
composable_kernel/include/tensor_description/tensor_descriptor.hpp
View file @
67ad47e7
...
@@ -69,7 +69,7 @@ struct TensorDescriptor
...
@@ -69,7 +69,7 @@ struct TensorDescriptor
Number
<
ndim_visible_
>
{});
Number
<
ndim_visible_
>
{});
// TODO: make container_reduce support tuple of Number and index_t
// TODO: make container_reduce support tuple of Number and index_t
return
container_reduce
(
lengths
,
math
::
multiplies
_v2
{},
Number
<
1
>
{});
return
container_reduce
(
lengths
,
math
::
multiplies
{},
Number
<
1
>
{});
}
}
template
<
index_t
IDim
>
template
<
index_t
IDim
>
...
...
composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp
View file @
67ad47e7
...
@@ -38,8 +38,8 @@ __host__ __device__ constexpr auto calculate_element_space_size_impl(const Lengt
...
@@ -38,8 +38,8 @@ __host__ __device__ constexpr auto calculate_element_space_size_impl(const Lengt
template
<
typename
...
Lengths
,
template
<
typename
...
Lengths
,
typename
...
Strides
,
typename
...
Strides
,
typename
enable_if
<
sizeof
...(
Lengths
)
==
sizeof
...(
Strides
),
bool
>
::
type
=
false
>
typename
enable_if
<
sizeof
...(
Lengths
)
==
sizeof
...(
Strides
),
bool
>
::
type
=
false
>
__host__
__device__
constexpr
auto
make_naive_tensor_descriptor
_v2
(
const
Tuple
<
Lengths
...
>&
lengths
,
__host__
__device__
constexpr
auto
make_naive_tensor_descriptor
(
const
Tuple
<
Lengths
...
>&
lengths
,
const
Tuple
<
Strides
...
>&
strides
)
const
Tuple
<
Strides
...
>&
strides
)
{
{
constexpr
index_t
N
=
sizeof
...(
Lengths
);
constexpr
index_t
N
=
sizeof
...(
Lengths
);
...
@@ -100,7 +100,7 @@ make_naive_tensor_descriptor_packed(const Tuple<Lengths...>& lengths)
...
@@ -100,7 +100,7 @@ make_naive_tensor_descriptor_packed(const Tuple<Lengths...>& lengths)
constexpr
auto
visible_dim_hidden_ids
=
typename
arithmetic_sequence_gen
<
1
,
N
+
1
,
1
>::
type
{};
constexpr
auto
visible_dim_hidden_ids
=
typename
arithmetic_sequence_gen
<
1
,
N
+
1
,
1
>::
type
{};
const
auto
element_space_size
=
container_reduce
(
lengths
,
math
::
multiplies
_v2
{},
Number
<
1
>
{});
const
auto
element_space_size
=
container_reduce
(
lengths
,
math
::
multiplies
{},
Number
<
1
>
{});
return
TensorDescriptor
<
remove_cv_t
<
decltype
(
transforms
)
>
,
return
TensorDescriptor
<
remove_cv_t
<
decltype
(
transforms
)
>
,
remove_cv_t
<
decltype
(
low_dim_hidden_idss
)
>
,
remove_cv_t
<
decltype
(
low_dim_hidden_idss
)
>
,
...
@@ -112,7 +112,7 @@ make_naive_tensor_descriptor_packed(const Tuple<Lengths...>& lengths)
...
@@ -112,7 +112,7 @@ make_naive_tensor_descriptor_packed(const Tuple<Lengths...>& lengths)
template
<
typename
...
Lengths
,
typename
Align
>
template
<
typename
...
Lengths
,
typename
Align
>
__host__
__device__
constexpr
auto
__host__
__device__
constexpr
auto
make_naive_tensor_descriptor_aligned
_v2
(
const
Tuple
<
Lengths
...
>&
lengths
,
Align
align
)
make_naive_tensor_descriptor_aligned
(
const
Tuple
<
Lengths
...
>&
lengths
,
Align
align
)
{
{
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
...
@@ -133,7 +133,7 @@ make_naive_tensor_descriptor_aligned_v2(const Tuple<Lengths...>& lengths, Align
...
@@ -133,7 +133,7 @@ make_naive_tensor_descriptor_aligned_v2(const Tuple<Lengths...>& lengths, Align
else
else
{
{
return
container_reduce
(
lengths
,
return
container_reduce
(
lengths
,
math
::
multiplies
_v2
{},
math
::
multiplies
{},
Number
<
stride_n_minus_2
>
{},
Number
<
stride_n_minus_2
>
{},
i
+
I1
,
i
+
I1
,
Number
<
N
-
1
>
{},
Number
<
N
-
1
>
{},
...
@@ -142,7 +142,7 @@ make_naive_tensor_descriptor_aligned_v2(const Tuple<Lengths...>& lengths, Align
...
@@ -142,7 +142,7 @@ make_naive_tensor_descriptor_aligned_v2(const Tuple<Lengths...>& lengths, Align
},
},
Number
<
N
>
{});
Number
<
N
>
{});
return
make_naive_tensor_descriptor
_v2
(
lengths
,
strides
);
return
make_naive_tensor_descriptor
(
lengths
,
strides
);
}
}
}
// namespace ck
}
// namespace ck
...
...
composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp
View file @
67ad47e7
...
@@ -143,7 +143,7 @@ struct BlockwiseTensorSliceTransfer_v4
...
@@ -143,7 +143,7 @@ struct BlockwiseTensorSliceTransfer_v4
private:
private:
static
constexpr
auto
thread_cluster_desc_
=
static
constexpr
auto
thread_cluster_desc_
=
make_cluster_descriptor
_v2
(
ThreadClusterLengths
{},
ThreadClusterArrangeOrder
{});
make_cluster_descriptor
(
ThreadClusterLengths
{},
ThreadClusterArrangeOrder
{});
using
ThreadwiseTransfer
=
using
ThreadwiseTransfer
=
ThreadwiseTensorSliceTransfer_v3
<
ThreadSliceLengths
,
ThreadwiseTensorSliceTransfer_v3
<
ThreadSliceLengths
,
...
...
composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer_v2.hpp
View file @
67ad47e7
...
@@ -131,7 +131,7 @@ struct BlockwiseTensorSliceTransfer_v4r1
...
@@ -131,7 +131,7 @@ struct BlockwiseTensorSliceTransfer_v4r1
private:
private:
static
constexpr
auto
thread_cluster_desc_
=
static
constexpr
auto
thread_cluster_desc_
=
make_cluster_descriptor
_v2
(
ThreadClusterLengths
{},
ThreadClusterArrangeOrder
{});
make_cluster_descriptor
(
ThreadClusterLengths
{},
ThreadClusterArrangeOrder
{});
using
ThreadwiseTransfer
=
using
ThreadwiseTransfer
=
ThreadwiseTensorSliceTransfer_v3r1
<
ThreadSliceLengths
,
ThreadwiseTensorSliceTransfer_v3r1
<
ThreadSliceLengths
,
...
...
composable_kernel/include/tensor_operation/gridwise_contraction_dlops_v1r2.hpp
View file @
67ad47e7
...
@@ -110,13 +110,13 @@ struct GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN
...
@@ -110,13 +110,13 @@ struct GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN
// A matrix in LDS memory, dst of blockwise copy
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
a_block_desc_gk0_gm0_gm10_gm11_gk1
=
make_naive_tensor_descriptor_aligned
_v2
(
constexpr
auto
a_block_desc_gk0_gm0_gm10_gm11_gk1
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
GK0PerBlock
>
{},
GM0
,
I1
,
Number
<
GM1PerBlockGM11
>
{},
GK1
),
make_tuple
(
Number
<
GK0PerBlock
>
{},
GM0
,
I1
,
Number
<
GM1PerBlockGM11
>
{},
GK1
),
max_lds_align
);
max_lds_align
);
// B matrix in LDS memory, dst of blockwise copy
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
b_block_desc_gk0_gn0_gn10_gn11_gk1
=
make_naive_tensor_descriptor_aligned
_v2
(
constexpr
auto
b_block_desc_gk0_gn0_gn10_gn11_gk1
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
GK0PerBlock
>
{},
GN0
,
I1
,
Number
<
GN1PerBlockGN11
>
{},
GK1
),
make_tuple
(
Number
<
GK0PerBlock
>
{},
GN0
,
I1
,
Number
<
GN1PerBlockGN11
>
{},
GK1
),
max_lds_align
);
max_lds_align
);
...
@@ -248,10 +248,10 @@ struct GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN
...
@@ -248,10 +248,10 @@ struct GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN
constexpr
auto
BN
=
GN0
*
GN11
;
constexpr
auto
BN
=
GN0
*
GN11
;
constexpr
auto
BM1
=
constexpr
auto
BM1
=
Number
<
container_reduce
(
BM10BN10ThreadClusterBM10Xs
{},
math
::
multiplies
_v2
{},
I1
)
*
Number
<
container_reduce
(
BM10BN10ThreadClusterBM10Xs
{},
math
::
multiplies
{},
I1
)
*
BM1PerThreadBM11
>
{};
BM1PerThreadBM11
>
{};
constexpr
auto
BN1
=
constexpr
auto
BN1
=
Number
<
container_reduce
(
BM10BN10ThreadClusterBN10Xs
{},
math
::
multiplies
_v2
{},
I1
)
*
Number
<
container_reduce
(
BM10BN10ThreadClusterBN10Xs
{},
math
::
multiplies
{},
I1
)
*
BN1PerThreadBN11
>
{};
BN1PerThreadBN11
>
{};
constexpr
auto
BM0
=
BM
/
BM1
;
constexpr
auto
BM0
=
BM
/
BM1
;
...
@@ -354,24 +354,24 @@ struct GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN
...
@@ -354,24 +354,24 @@ struct GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN
// A matrix in LDS memory, dst of blockwise copy
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
a_block_desc_gk0_gm0_gm10_gm11_gk1
=
make_naive_tensor_descriptor_aligned
_v2
(
constexpr
auto
a_block_desc_gk0_gm0_gm10_gm11_gk1
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
GK0PerBlock
>
{},
GM0
,
I1
,
Number
<
GM1PerBlockGM11
>
{},
GK1
),
make_tuple
(
Number
<
GK0PerBlock
>
{},
GM0
,
I1
,
Number
<
GM1PerBlockGM11
>
{},
GK1
),
max_lds_align
);
max_lds_align
);
// B matrix in LDS memory, dst of blockwise copy
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
b_block_desc_gk0_gn0_gn10_gn11_gk1
=
make_naive_tensor_descriptor_aligned
_v2
(
constexpr
auto
b_block_desc_gk0_gn0_gn10_gn11_gk1
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
GK0PerBlock
>
{},
GN0
,
I1
,
Number
<
GN1PerBlockGN11
>
{},
GK1
),
make_tuple
(
Number
<
GK0PerBlock
>
{},
GN0
,
I1
,
Number
<
GN1PerBlockGN11
>
{},
GK1
),
max_lds_align
);
max_lds_align
);
// A matrix in LDS memory for blockwise GEMM
// A matrix in LDS memory for blockwise GEMM
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
a_block_desc_gk0_bm_gk1
=
make_naive_tensor_descriptor_aligned
_v2
(
constexpr
auto
a_block_desc_gk0_bm_gk1
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
GK0PerBlock
>
{},
GM0
*
Number
<
GM1PerBlockGM11
>
{},
GK1
),
max_lds_align
);
make_tuple
(
Number
<
GK0PerBlock
>
{},
GM0
*
Number
<
GM1PerBlockGM11
>
{},
GK1
),
max_lds_align
);
// B matrix in LDS memory for blockwise GEMM
// B matrix in LDS memory for blockwise GEMM
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
b_block_desc_gk0_bn_gk1
=
make_naive_tensor_descriptor_aligned
_v2
(
constexpr
auto
b_block_desc_gk0_bn_gk1
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
GK0PerBlock
>
{},
GN0
*
Number
<
GN1PerBlockGN11
>
{},
GK1
),
max_lds_align
);
make_tuple
(
Number
<
GK0PerBlock
>
{},
GN0
*
Number
<
GN1PerBlockGN11
>
{},
GK1
),
max_lds_align
);
static_assert
(
a_block_desc_gk0_gm0_gm10_gm11_gk1
.
GetElementSpaceSize
()
==
static_assert
(
a_block_desc_gk0_gm0_gm10_gm11_gk1
.
GetElementSpaceSize
()
==
...
...
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r2.hpp
View file @
67ad47e7
...
@@ -166,12 +166,12 @@ struct GridwiseGemmDlops_km_kn_mn_v1r2
...
@@ -166,12 +166,12 @@ struct GridwiseGemmDlops_km_kn_mn_v1r2
// A matrix in LDS memory, dst of blockwise copy
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
a_k_m_block_desc
=
make_naive_tensor_descriptor_aligned
_v2
(
constexpr
auto
a_k_m_block_desc
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
MPerBlockM1
>
{}),
max_lds_align
);
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
MPerBlockM1
>
{}),
max_lds_align
);
// B matrix in LDS memory, dst of blockwise copy
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
b_k_n_block_desc
=
make_naive_tensor_descriptor_aligned
_v2
(
constexpr
auto
b_k_n_block_desc
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
NPerBlockN1
>
{}),
max_lds_align
);
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
NPerBlockN1
>
{}),
max_lds_align
);
// LDS allocation for A and B: be careful of alignment
// LDS allocation for A and B: be careful of alignment
...
@@ -351,22 +351,22 @@ struct GridwiseGemmDlops_km_kn_mn_v1r2
...
@@ -351,22 +351,22 @@ struct GridwiseGemmDlops_km_kn_mn_v1r2
// A matrix in LDS memory, dst of blockwise copy
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
a_k_m_block_desc
=
make_naive_tensor_descriptor_aligned
_v2
(
constexpr
auto
a_k_m_block_desc
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
MPerBlockM1
>
{}),
max_lds_align
);
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
MPerBlockM1
>
{}),
max_lds_align
);
// B matrix in LDS memory, dst of blockwise copy
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
b_k_n_block_desc
=
make_naive_tensor_descriptor_aligned
_v2
(
constexpr
auto
b_k_n_block_desc
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
NPerBlockN1
>
{}),
max_lds_align
);
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
NPerBlockN1
>
{}),
max_lds_align
);
// A matrix in LDS memory, dst of blockwise copy
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
a_k_m0_m1_block_desc
=
make_naive_tensor_descriptor_aligned
_v2
(
constexpr
auto
a_k_m0_m1_block_desc
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
KPerBlock
>
{},
I1
,
Number
<
MPerBlockM1
>
{}),
max_lds_align
);
make_tuple
(
Number
<
KPerBlock
>
{},
I1
,
Number
<
MPerBlockM1
>
{}),
max_lds_align
);
// B matrix in LDS memory, dst of blockwise copy
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
b_k_n0_n1_block_desc
=
make_naive_tensor_descriptor_aligned
_v2
(
constexpr
auto
b_k_n0_n1_block_desc
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
KPerBlock
>
{},
I1
,
Number
<
NPerBlockN1
>
{}),
max_lds_align
);
make_tuple
(
Number
<
KPerBlock
>
{},
I1
,
Number
<
NPerBlockN1
>
{}),
max_lds_align
);
// A matrix blockwise copy
// A matrix blockwise copy
...
...
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r3.hpp
View file @
67ad47e7
...
@@ -163,12 +163,12 @@ struct GridwiseGemmDlops_km_kn_mn_v1r3
...
@@ -163,12 +163,12 @@ struct GridwiseGemmDlops_km_kn_mn_v1r3
// TODO: check alignment
// TODO: check alignment
// A matrix in LDS memory, dst of blockwise copy
// A matrix in LDS memory, dst of blockwise copy
constexpr
auto
a_k_m_block_desc
=
make_naive_tensor_descriptor_aligned
_v2
(
constexpr
auto
a_k_m_block_desc
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
MPerBlockM1
>
{},
K1
),
max_lds_align
);
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
MPerBlockM1
>
{},
K1
),
max_lds_align
);
// TODO: check alignment
// TODO: check alignment
// B matrix in LDS memory, dst of blockwise copy
// B matrix in LDS memory, dst of blockwise copy
constexpr
auto
b_k_n_block_desc
=
make_naive_tensor_descriptor_aligned
_v2
(
constexpr
auto
b_k_n_block_desc
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
NPerBlockN1
>
{},
K1
),
max_lds_align
);
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
NPerBlockN1
>
{},
K1
),
max_lds_align
);
// TODO: check alignment
// TODO: check alignment
...
@@ -274,10 +274,10 @@ struct GridwiseGemmDlops_km_kn_mn_v1r3
...
@@ -274,10 +274,10 @@ struct GridwiseGemmDlops_km_kn_mn_v1r3
const
auto
N0
=
N
/
N1
;
const
auto
N0
=
N
/
N1
;
constexpr
auto
M11
=
constexpr
auto
M11
=
Number
<
container_reduce
(
M11N11ThreadClusterM110Xs
{},
math
::
multiplies
_v2
{},
I1
)
*
Number
<
container_reduce
(
M11N11ThreadClusterM110Xs
{},
math
::
multiplies
{},
I1
)
*
M1PerThreadM111
>
{};
M1PerThreadM111
>
{};
constexpr
auto
N11
=
constexpr
auto
N11
=
Number
<
container_reduce
(
M11N11ThreadClusterN110Xs
{},
math
::
multiplies
_v2
{},
I1
)
*
Number
<
container_reduce
(
M11N11ThreadClusterN110Xs
{},
math
::
multiplies
{},
I1
)
*
N1PerThreadN111
>
{};
N1PerThreadN111
>
{};
constexpr
auto
M10
=
M1
/
M11
;
constexpr
auto
M10
=
M1
/
M11
;
...
@@ -354,23 +354,23 @@ struct GridwiseGemmDlops_km_kn_mn_v1r3
...
@@ -354,23 +354,23 @@ struct GridwiseGemmDlops_km_kn_mn_v1r3
// TODO: check alignment
// TODO: check alignment
// A matrix in LDS memory, dst of blockwise copy
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
a_k0_m0_m1_k1_block_desc
=
make_naive_tensor_descriptor_aligned
_v2
(
constexpr
auto
a_k0_m0_m1_k1_block_desc
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
KPerBlock
>
{},
I1
,
Number
<
MPerBlockM1
>
{},
K1
),
max_lds_align
);
make_tuple
(
Number
<
KPerBlock
>
{},
I1
,
Number
<
MPerBlockM1
>
{},
K1
),
max_lds_align
);
// TODO: check alignment
// TODO: check alignment
// B matrix in LDS memory, dst of blockwise copy
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
b_k0_n0_n1_k1_block_desc
=
make_naive_tensor_descriptor_aligned
_v2
(
constexpr
auto
b_k0_n0_n1_k1_block_desc
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
KPerBlock
>
{},
I1
,
Number
<
NPerBlockN1
>
{},
K1
),
max_lds_align
);
make_tuple
(
Number
<
KPerBlock
>
{},
I1
,
Number
<
NPerBlockN1
>
{},
K1
),
max_lds_align
);
// TODO: check alignment
// TODO: check alignment
// A matrix in LDS memory, for blockwise GEMM
// A matrix in LDS memory, for blockwise GEMM
constexpr
auto
a_k0_m_k1_block_desc
=
make_naive_tensor_descriptor_aligned
_v2
(
constexpr
auto
a_k0_m_k1_block_desc
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
MPerBlockM1
>
{},
K1
),
max_lds_align
);
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
MPerBlockM1
>
{},
K1
),
max_lds_align
);
// TODO: check alignment
// TODO: check alignment
// B matrix in LDS memory, for blockwise GEMM
// B matrix in LDS memory, for blockwise GEMM
constexpr
auto
b_k0_n_k1_block_desc
=
make_naive_tensor_descriptor_aligned
_v2
(
constexpr
auto
b_k0_n_k1_block_desc
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
NPerBlockN1
>
{},
K1
),
max_lds_align
);
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
NPerBlockN1
>
{},
K1
),
max_lds_align
);
static_assert
(
a_k0_m0_m1_k1_block_desc
.
GetElementSpaceSize
()
==
static_assert
(
a_k0_m0_m1_k1_block_desc
.
GetElementSpaceSize
()
==
...
...
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp
View file @
67ad47e7
...
@@ -58,7 +58,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
...
@@ -58,7 +58,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
// A matrix in LDS memory, dst of blockwise copy
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
a_e_k_desc
=
make_naive_tensor_descriptor_aligned
_v2
(
constexpr
auto
a_e_k_desc
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
E
>
{},
Number
<
KPerBlock
>
{}),
max_lds_align
);
make_tuple
(
Number
<
E
>
{},
Number
<
KPerBlock
>
{}),
max_lds_align
);
// LDS allocation for A and B: be careful of alignment
// LDS allocation for A and B: be careful of alignment
...
@@ -132,10 +132,10 @@ struct GridwiseGemmDlops_km_kn_mn_v3
...
@@ -132,10 +132,10 @@ struct GridwiseGemmDlops_km_kn_mn_v3
// A matrix in LDS memory, dst of blockwise copy
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
a_e_k_block_desc
=
make_naive_tensor_descriptor_aligned
_v2
(
constexpr
auto
a_e_k_block_desc
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
EPerBlock
>
{},
Number
<
KPerBlock
>
{}),
max_lds_align
);
make_tuple
(
Number
<
EPerBlock
>
{},
Number
<
KPerBlock
>
{}),
max_lds_align
);
constexpr
auto
a_e_k_desc
=
make_naive_tensor_descriptor_aligned
_v2
(
constexpr
auto
a_e_k_desc
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
E
>
{},
Number
<
KPerBlock
>
{}),
max_lds_align
);
make_tuple
(
Number
<
E
>
{},
Number
<
KPerBlock
>
{}),
max_lds_align
);
// B matrix in LDS memory, dst of blockwise copy
// B matrix in LDS memory, dst of blockwise copy
...
...
composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp
View file @
67ad47e7
...
@@ -148,12 +148,12 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
...
@@ -148,12 +148,12 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
// A matrix in LDS memory, dst of blockwise copy
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
a_k0_m_k1_block_desc
=
make_naive_tensor_descriptor_aligned
_v2
(
constexpr
auto
a_k0_m_k1_block_desc
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
MPerBlock
>
{},
K1
),
max_lds_align
);
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
MPerBlock
>
{},
K1
),
max_lds_align
);
// B matrix in LDS memory, dst of blockwise copy
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
b_k0_n_k1_block_desc
=
make_naive_tensor_descriptor_aligned
_v2
(
constexpr
auto
b_k0_n_k1_block_desc
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
NPerBlock
>
{},
K1
),
max_lds_align
);
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
NPerBlock
>
{},
K1
),
max_lds_align
);
// LDS allocation for A and B: be careful of alignment
// LDS allocation for A and B: be careful of alignment
...
@@ -290,12 +290,12 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
...
@@ -290,12 +290,12 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
// A matrix in LDS memory, dst of blockwise copy
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
a_k0_m_k1_block_desc
=
make_naive_tensor_descriptor_aligned
_v2
(
constexpr
auto
a_k0_m_k1_block_desc
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
MPerBlock
>
{},
K1
),
max_lds_align
);
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
MPerBlock
>
{},
K1
),
max_lds_align
);
// B matrix in LDS memory, dst of blockwise copy
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
b_k0_n_k1_block_desc
=
make_naive_tensor_descriptor_aligned
_v2
(
constexpr
auto
b_k0_n_k1_block_desc
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
NPerBlock
>
{},
K1
),
max_lds_align
);
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
NPerBlock
>
{},
K1
),
max_lds_align
);
// A matrix blockwise copy
// A matrix blockwise copy
...
...
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v2.hpp
View file @
67ad47e7
...
@@ -91,13 +91,13 @@ struct ThreadwiseTensorSliceTransfer_v3r1
...
@@ -91,13 +91,13 @@ struct ThreadwiseTensorSliceTransfer_v3r1
container_reverse_exclusive_scan
(
container_reverse_exclusive_scan
(
container_reorder_given_new2old
(
src_vector_tensor_lengths
,
container_reorder_given_new2old
(
src_vector_tensor_lengths
,
SrcVectorTensorContiguousDimOrder
{}),
SrcVectorTensorContiguousDimOrder
{}),
math
::
multiplies
_v2
{},
math
::
multiplies
{},
I1
),
I1
),
SrcVectorTensorContiguousDimOrder
{});
SrcVectorTensorContiguousDimOrder
{});
constexpr
auto
src_vector_desc
=
constexpr
auto
src_vector_desc
=
make_naive_tensor_descriptor
_v2
(
sequence_to_tuple_of_number
(
src_vector_tensor_lengths
),
make_naive_tensor_descriptor
(
sequence_to_tuple_of_number
(
src_vector_tensor_lengths
),
sequence_to_tuple_of_number
(
src_vector_tensor_strides
));
sequence_to_tuple_of_number
(
src_vector_tensor_strides
));
// access order and lengths
// access order and lengths
constexpr
auto
src_access_lengths
=
SliceLengths
{}
/
src_vector_tensor_lengths
;
constexpr
auto
src_access_lengths
=
SliceLengths
{}
/
src_vector_tensor_lengths
;
...
@@ -259,13 +259,13 @@ struct ThreadwiseTensorSliceTransfer_v3r1
...
@@ -259,13 +259,13 @@ struct ThreadwiseTensorSliceTransfer_v3r1
container_reverse_exclusive_scan
(
container_reverse_exclusive_scan
(
container_reorder_given_new2old
(
dst_vector_tensor_lengths
,
container_reorder_given_new2old
(
dst_vector_tensor_lengths
,
DstVectorTensorContiguousDimOrder
{}),
DstVectorTensorContiguousDimOrder
{}),
math
::
multiplies
_v2
{},
math
::
multiplies
{},
I1
),
I1
),
DstVectorTensorContiguousDimOrder
{});
DstVectorTensorContiguousDimOrder
{});
constexpr
auto
dst_vector_desc
=
constexpr
auto
dst_vector_desc
=
make_naive_tensor_descriptor
_v2
(
sequence_to_tuple_of_number
(
dst_vector_tensor_lengths
),
make_naive_tensor_descriptor
(
sequence_to_tuple_of_number
(
dst_vector_tensor_lengths
),
sequence_to_tuple_of_number
(
dst_vector_tensor_strides
));
sequence_to_tuple_of_number
(
dst_vector_tensor_strides
));
// dst access order and lengths
// dst access order and lengths
constexpr
auto
dst_access_lengths
=
SliceLengths
{}
/
dst_vector_tensor_lengths
;
constexpr
auto
dst_access_lengths
=
SliceLengths
{}
/
dst_vector_tensor_lengths
;
...
@@ -699,13 +699,13 @@ struct ThreadwiseTensorSliceTransfer_v4r1
...
@@ -699,13 +699,13 @@ struct ThreadwiseTensorSliceTransfer_v4r1
container_reverse_exclusive_scan
(
container_reverse_exclusive_scan
(
container_reorder_given_new2old
(
src_vector_tensor_lengths
,
container_reorder_given_new2old
(
src_vector_tensor_lengths
,
SrcVectorTensorContiguousDimOrder
{}),
SrcVectorTensorContiguousDimOrder
{}),
math
::
multiplies
_v2
{},
math
::
multiplies
{},
I1
),
I1
),
SrcVectorTensorContiguousDimOrder
{});
SrcVectorTensorContiguousDimOrder
{});
constexpr
auto
src_vector_desc
=
constexpr
auto
src_vector_desc
=
make_naive_tensor_descriptor
_v2
(
sequence_to_tuple_of_number
(
src_vector_tensor_lengths
),
make_naive_tensor_descriptor
(
sequence_to_tuple_of_number
(
src_vector_tensor_lengths
),
sequence_to_tuple_of_number
(
src_vector_tensor_strides
));
sequence_to_tuple_of_number
(
src_vector_tensor_strides
));
// access order and lengths
// access order and lengths
constexpr
auto
access_lengths
=
SliceLengths
{}
/
src_vector_tensor_lengths
;
constexpr
auto
access_lengths
=
SliceLengths
{}
/
src_vector_tensor_lengths
;
...
...
composable_kernel/include/utility/math.hpp
View file @
67ad47e7
...
@@ -28,13 +28,7 @@ struct minus
...
@@ -28,13 +28,7 @@ struct minus
__host__
__device__
constexpr
T
operator
()(
T
a
,
T
b
)
const
{
return
a
-
b
;
}
__host__
__device__
constexpr
T
operator
()(
T
a
,
T
b
)
const
{
return
a
-
b
;
}
};
};
template
<
typename
T
>
struct
multiplies
struct
multiplies
{
__host__
__device__
constexpr
T
operator
()(
T
a
,
T
b
)
const
{
return
a
*
b
;
}
};
struct
multiplies_v2
{
{
template
<
typename
A
,
typename
B
>
template
<
typename
A
,
typename
B
>
__host__
__device__
constexpr
auto
operator
()(
const
A
&
a
,
const
B
&
b
)
const
__host__
__device__
constexpr
auto
operator
()(
const
A
&
a
,
const
B
&
b
)
const
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment