gaoqiong / composable_kernel
"torchvision/git@developer.sourcefind.cn:OpenDAS/vision.git" did not exist on "02a8c0ad20f392514e987fa5400e04aa40ec1a00"
Commit 67ad47e7, authored Aug 16, 2021 by Chao Liu (parent 16effa76)

refactor
Showing 14 changed files with 63 additions and 69 deletions
composable_kernel/include/tensor_description/cluster_descriptor.hpp                 +1  -1
composable_kernel/include/tensor_description/multi_index_transform.hpp              +14 -14
composable_kernel/include/tensor_description/tensor_adaptor.hpp                     +1  -1
composable_kernel/include/tensor_description/tensor_descriptor.hpp                  +1  -1
composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp           +6  -6
composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp      +1  -1
composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer_v2.hpp   +1  -1
composable_kernel/include/tensor_operation/gridwise_contraction_dlops_v1r2.hpp      +8  -8
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r2.hpp             +6  -6
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r3.hpp             +8  -8
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp               +3  -3
composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp            +4  -4
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v2.hpp  +9  -9
composable_kernel/include/utility/math.hpp                                          +0  -6
composable_kernel/include/tensor_description/cluster_descriptor.hpp
@@ -8,7 +8,7 @@ namespace ck {
 template <typename Lengths,
           typename ArrangeOrder = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type>
-__host__ __device__ constexpr auto make_cluster_descriptor_v2(
+__host__ __device__ constexpr auto make_cluster_descriptor(
     const Lengths& lengths,
     ArrangeOrder order = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type{})
 {
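For readers unfamiliar with the helper renamed above: a cluster descriptor maps a flat (thread) id onto a multi-dimensional cluster index according to Lengths and ArrangeOrder. Below is a rough standalone illustration of that mapping in plain C++; the function name cluster_index and the exact ordering convention are ours for illustration only, not the CK implementation.

#include <array>
#include <cstddef>
#include <iostream>

// Rough illustration: turn a flat id into a multi-index over `lengths`,
// visiting dimensions in `order` (last entry of `order` varies fastest).
template <std::size_t N>
std::array<std::size_t, N> cluster_index(std::size_t flat_id,
                                         const std::array<std::size_t, N>& lengths,
                                         const std::array<std::size_t, N>& order)
{
    std::array<std::size_t, N> idx{};
    for(std::size_t k = N; k-- > 0;) // fastest-varying dimension first
    {
        const std::size_t dim = order[k];
        idx[dim]              = flat_id % lengths[dim];
        flat_id /= lengths[dim];
    }
    return idx;
}

int main()
{
    // 2 x 4 thread cluster, dimension 1 varying fastest.
    const std::array<std::size_t, 2> lengths{2, 4};
    const std::array<std::size_t, 2> order{0, 1};

    for(std::size_t tid = 0; tid < 8; ++tid)
    {
        const auto idx = cluster_index(tid, lengths, order);
        std::cout << "tid " << tid << " -> (" << idx[0] << ", " << idx[1] << ")\n";
    }
    return 0;
}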
composable_kernel/include/tensor_description/multi_index_transform.hpp
@@ -481,11 +481,11 @@ struct Merge_v1_carry_check
     using LowerIndex = MultiIndex<NDimLow>;
     using UpperIndex = MultiIndex<1>;
 
     using LowLengthsScan =
-        decltype(container_reverse_exclusive_scan(LowLengths{}, math::multiplies_v2{}, Number<1>{}));
+        decltype(container_reverse_exclusive_scan(LowLengths{}, math::multiplies{}, Number<1>{}));
 
     using UpLengths =
-        decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies_v2{}, Number<1>{})));
+        decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies{}, Number<1>{})));
 
     LowLengths low_lengths_;
     LowLengthsScan low_lengths_scan_;

@@ -496,8 +496,8 @@ struct Merge_v1_carry_check
     __host__ __device__ constexpr Merge_v1_carry_check(const LowLengths& low_lengths)
         : low_lengths_{low_lengths},
           low_lengths_scan_{
-              container_reverse_exclusive_scan(low_lengths, math::multiplies_v2{}, Number<1>{})},
-          up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies_v2{}, Number<1>{}))}
+              container_reverse_exclusive_scan(low_lengths, math::multiplies{}, Number<1>{})},
+          up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies{}, Number<1>{}))}
     {
         static_assert(LowerIndex::Size() == NDimLow, "wrong!");
     }

@@ -1037,7 +1037,7 @@ struct Merge_v2_magic_division
     using UpperIndex = MultiIndex<1>;
 
     using UpLengths =
-        decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies_v2{}, Number<1>{})));
+        decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies{}, Number<1>{})));
 
     using LowLengthsMagicDivisorMultipiler = decltype(generate_tuple(
         lambda_merge_generate_MagicDivision_calculate_magic_multiplier<LowLengths>{},

@@ -1062,7 +1062,7 @@ struct Merge_v2_magic_division
           low_lengths_magic_divisor_shift_{generate_tuple(
               [&](auto i) { return MagicDivision::CalculateMagicShift(low_lengths[i]); },
               Number<NDimLow>{})},
-          up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies_v2{}, Number<1>{}))}
+          up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies{}, Number<1>{}))}
     {
         static_assert(LowerIndex::Size() == NDimLow, "wrong!");
     }

@@ -1188,11 +1188,11 @@ struct Merge_v2r2_magic_division
     using LowerIndex = MultiIndex<NDimLow>;
     using UpperIndex = MultiIndex<1>;
 
     using LowLengthsScan =
-        decltype(container_reverse_exclusive_scan(LowLengths{}, math::multiplies_v2{}, Number<1>{}));
+        decltype(container_reverse_exclusive_scan(LowLengths{}, math::multiplies{}, Number<1>{}));
 
     using UpLengths =
-        decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies_v2{}, Number<1>{})));
+        decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies{}, Number<1>{})));
 
     using LowLengthsScanMagicDivisorMultipiler = decltype(generate_tuple(
         lambda_merge_generate_MagicDivision_calculate_magic_multiplier<LowLengthsScan>{},

@@ -1213,14 +1213,14 @@ struct Merge_v2r2_magic_division
     __host__ __device__ constexpr Merge_v2r2_magic_division(const LowLengths& low_lengths)
         : low_lengths_{low_lengths},
           low_lengths_scan_{
-              container_reverse_exclusive_scan(low_lengths, math::multiplies_v2{}, Number<1>{})},
+              container_reverse_exclusive_scan(low_lengths, math::multiplies{}, Number<1>{})},
           low_lengths_scan_magic_divisor_multiplier_{generate_tuple(
               [&](auto i) { return MagicDivision::CalculateMagicMultiplier(low_lengths_scan_[i]); },
               Number<NDimLow>{})},
           low_lengths_scan_magic_divisor_shift_{generate_tuple(
               [&](auto i) { return MagicDivision::CalculateMagicShift(low_lengths_scan_[i]); },
               Number<NDimLow>{})},
-          up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies_v2{}, Number<1>{}))}
+          up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies{}, Number<1>{}))}
     {
         static_assert(LowerIndex::Size() == NDimLow, "wrong!");
     }

@@ -1336,7 +1336,7 @@ struct UnMerge
     using UpperIndex = MultiIndex<NDimUp>;
 
     using UpLengthsScan =
-        decltype(container_reverse_exclusive_scan(UpLengths{}, math::multiplies_v2{}, Number<1>{}));
+        decltype(container_reverse_exclusive_scan(UpLengths{}, math::multiplies{}, Number<1>{}));
 
     UpLengths up_lengths_;
     UpLengthsScan up_lengths_scan_;

@@ -1346,7 +1346,7 @@ struct UnMerge
     __host__ __device__ constexpr UnMerge(const UpLengths& up_lengths)
         : up_lengths_{up_lengths},
           up_lengths_scan_{
-              container_reverse_exclusive_scan(up_lengths, math::multiplies_v2{}, Number<1>{})}
+              container_reverse_exclusive_scan(up_lengths, math::multiplies{}, Number<1>{})}
     {
     }
composable_kernel/include/tensor_description/tensor_adaptor.hpp
@@ -64,7 +64,7 @@ struct TensorAdaptor
             Number<ndim_top_>{});
 
         // TODO: make container_reduce support tuple of Number and index_t
-        return container_reduce(lengths, math::multiplies_v2{}, Number<1>{});
+        return container_reduce(lengths, math::multiplies{}, Number<1>{});
     }
 
     template <index_t IDim>
composable_kernel/include/tensor_description/tensor_descriptor.hpp
@@ -69,7 +69,7 @@ struct TensorDescriptor
             Number<ndim_visible_>{});
 
         // TODO: make container_reduce support tuple of Number and index_t
-        return container_reduce(lengths, math::multiplies_v2{}, Number<1>{});
+        return container_reduce(lengths, math::multiplies{}, Number<1>{});
     }
 
     template <index_t IDim>
composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp
@@ -38,8 +38,8 @@ __host__ __device__ constexpr auto calculate_element_space_size_impl(const Lengt
 template <typename... Lengths,
           typename... Strides,
           typename enable_if<sizeof...(Lengths) == sizeof...(Strides), bool>::type = false>
-__host__ __device__ constexpr auto
-make_naive_tensor_descriptor_v2(const Tuple<Lengths...>& lengths, const Tuple<Strides...>& strides)
+__host__ __device__ constexpr auto
+make_naive_tensor_descriptor(const Tuple<Lengths...>& lengths, const Tuple<Strides...>& strides)
 {
     constexpr index_t N = sizeof...(Lengths);

@@ -100,7 +100,7 @@ make_naive_tensor_descriptor_packed(const Tuple<Lengths...>& lengths)
     constexpr auto visible_dim_hidden_ids = typename arithmetic_sequence_gen<1, N + 1, 1>::type{};
 
-    const auto element_space_size = container_reduce(lengths, math::multiplies_v2{}, Number<1>{});
+    const auto element_space_size = container_reduce(lengths, math::multiplies{}, Number<1>{});
 
     return TensorDescriptor<remove_cv_t<decltype(transforms)>,
                             remove_cv_t<decltype(low_dim_hidden_idss)>,

@@ -112,7 +112,7 @@ make_naive_tensor_descriptor_packed(const Tuple<Lengths...>& lengths)
 template <typename... Lengths, typename Align>
 __host__ __device__ constexpr auto
-make_naive_tensor_descriptor_aligned_v2(const Tuple<Lengths...>& lengths, Align align)
+make_naive_tensor_descriptor_aligned(const Tuple<Lengths...>& lengths, Align align)
 {
     constexpr auto I1 = Number<1>{};

@@ -133,7 +133,7 @@ make_naive_tensor_descriptor_aligned_v2(const Tuple<Lengths...>& lengths, Align
             else
             {
                 return container_reduce(lengths,
-                                        math::multiplies_v2{},
+                                        math::multiplies{},
                                         Number<stride_n_minus_2>{},
                                         i + I1,
                                         Number<N - 1>{},

@@ -142,7 +142,7 @@ make_naive_tensor_descriptor_aligned_v2(const Tuple<Lengths...>& lengths, Align
         },
         Number<N>{});
 
-    return make_naive_tensor_descriptor_v2(lengths, strides);
+    return make_naive_tensor_descriptor(lengths, strides);
 }
 
 } // namespace ck
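As the last hunk above shows, make_naive_tensor_descriptor_aligned builds a set of strides and forwards them to make_naive_tensor_descriptor(lengths, strides); the strides are row-major with the second-to-last stride rounded up to the alignment (the Number<stride_n_minus_2> term in the hunk). A minimal run-time sketch of that stride computation follows, using hypothetical helper names rather than CK's compile-time Number<> machinery:

#include <array>
#include <cstddef>
#include <iostream>

// Round x up to the next multiple of `align`.
constexpr std::size_t least_multiple(std::size_t x, std::size_t align)
{
    return ((x + align - 1) / align) * align;
}

// Row-major strides where the stride of dimension N-2 is padded to `align`
// (illustration of the idea; assumes N >= 2):
//   stride[N-1] = 1
//   stride[N-2] = round_up(lengths[N-1], align)
//   stride[i]   = stride[i+1] * lengths[i+1] for i < N-2
template <std::size_t N>
constexpr std::array<std::size_t, N> aligned_strides(const std::array<std::size_t, N>& lengths,
                                                     std::size_t align)
{
    std::array<std::size_t, N> strides{};
    strides[N - 1] = 1;
    strides[N - 2] = least_multiple(lengths[N - 1], align);
    for(std::size_t i = N - 2; i-- > 0;)
        strides[i] = strides[i + 1] * lengths[i + 1];
    return strides;
}

int main()
{
    // A 4 x 3 x 5 tensor padded so rows start on 8-element boundaries.
    constexpr std::array<std::size_t, 3> lengths{4, 3, 5};
    constexpr auto strides = aligned_strides(lengths, 8);
    for(auto s : strides)
        std::cout << s << " "; // prints: 24 8 1
    std::cout << "\n";
    return 0;
}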
composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp
@@ -143,7 +143,7 @@ struct BlockwiseTensorSliceTransfer_v4
     private:
     static constexpr auto thread_cluster_desc_ =
-        make_cluster_descriptor_v2(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
+        make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
 
     using ThreadwiseTransfer =
         ThreadwiseTensorSliceTransfer_v3<ThreadSliceLengths,
composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer_v2.hpp
@@ -131,7 +131,7 @@ struct BlockwiseTensorSliceTransfer_v4r1
     private:
     static constexpr auto thread_cluster_desc_ =
-        make_cluster_descriptor_v2(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
+        make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
 
     using ThreadwiseTransfer =
         ThreadwiseTensorSliceTransfer_v3r1<ThreadSliceLengths,
composable_kernel/include/tensor_operation/gridwise_contraction_dlops_v1r2.hpp
@@ -110,13 +110,13 @@ struct GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN
         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto a_block_desc_gk0_gm0_gm10_gm11_gk1 = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_block_desc_gk0_gm0_gm10_gm11_gk1 = make_naive_tensor_descriptor_aligned(
             make_tuple(Number<GK0PerBlock>{}, GM0, I1, Number<GM1PerBlockGM11>{}, GK1),
             max_lds_align);
 
         // B matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto b_block_desc_gk0_gn0_gn10_gn11_gk1 = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_block_desc_gk0_gn0_gn10_gn11_gk1 = make_naive_tensor_descriptor_aligned(
             make_tuple(Number<GK0PerBlock>{}, GN0, I1, Number<GN1PerBlockGN11>{}, GK1),
             max_lds_align);

@@ -248,10 +248,10 @@ struct GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN
         constexpr auto BN = GN0 * GN11;
 
         constexpr auto BM1 =
-            Number<container_reduce(BM10BN10ThreadClusterBM10Xs{}, math::multiplies_v2{}, I1) *
+            Number<container_reduce(BM10BN10ThreadClusterBM10Xs{}, math::multiplies{}, I1) *
                    BM1PerThreadBM11>{};
         constexpr auto BN1 =
-            Number<container_reduce(BM10BN10ThreadClusterBN10Xs{}, math::multiplies_v2{}, I1) *
+            Number<container_reduce(BM10BN10ThreadClusterBN10Xs{}, math::multiplies{}, I1) *
                    BN1PerThreadBN11>{};
 
         constexpr auto BM0 = BM / BM1;

@@ -354,24 +354,24 @@ struct GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN
         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto a_block_desc_gk0_gm0_gm10_gm11_gk1 = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_block_desc_gk0_gm0_gm10_gm11_gk1 = make_naive_tensor_descriptor_aligned(
             make_tuple(Number<GK0PerBlock>{}, GM0, I1, Number<GM1PerBlockGM11>{}, GK1),
             max_lds_align);
 
         // B matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto b_block_desc_gk0_gn0_gn10_gn11_gk1 = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_block_desc_gk0_gn0_gn10_gn11_gk1 = make_naive_tensor_descriptor_aligned(
             make_tuple(Number<GK0PerBlock>{}, GN0, I1, Number<GN1PerBlockGN11>{}, GK1),
             max_lds_align);
 
         // A matrix in LDS memory for blockwise GEMM
         // be careful of LDS alignment
-        constexpr auto a_block_desc_gk0_bm_gk1 = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_block_desc_gk0_bm_gk1 = make_naive_tensor_descriptor_aligned(
             make_tuple(Number<GK0PerBlock>{}, GM0 * Number<GM1PerBlockGM11>{}, GK1),
             max_lds_align);
 
         // B matrix in LDS memory for blockwise GEMM
         // be careful of LDS alignment
-        constexpr auto b_block_desc_gk0_bn_gk1 = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_block_desc_gk0_bn_gk1 = make_naive_tensor_descriptor_aligned(
             make_tuple(Number<GK0PerBlock>{}, GN0 * Number<GN1PerBlockGN11>{}, GK1),
             max_lds_align);
 
         static_assert(a_block_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize() ==
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r2.hpp
@@ -166,12 +166,12 @@ struct GridwiseGemmDlops_km_kn_mn_v1r2
         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned(
             make_tuple(Number<KPerBlock>{}, Number<MPerBlockM1>{}), max_lds_align);
 
         // B matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned(
             make_tuple(Number<KPerBlock>{}, Number<NPerBlockN1>{}), max_lds_align);
 
         // LDS allocation for A and B: be careful of alignment

@@ -351,22 +351,22 @@ struct GridwiseGemmDlops_km_kn_mn_v1r2
         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned(
             make_tuple(Number<KPerBlock>{}, Number<MPerBlockM1>{}), max_lds_align);
 
         // B matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned(
             make_tuple(Number<KPerBlock>{}, Number<NPerBlockN1>{}), max_lds_align);
 
         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto a_k_m0_m1_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_k_m0_m1_block_desc = make_naive_tensor_descriptor_aligned(
             make_tuple(Number<KPerBlock>{}, I1, Number<MPerBlockM1>{}), max_lds_align);
 
         // B matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto b_k_n0_n1_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_k_n0_n1_block_desc = make_naive_tensor_descriptor_aligned(
             make_tuple(Number<KPerBlock>{}, I1, Number<NPerBlockN1>{}), max_lds_align);
 
         // A matrix blockwise copy
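The recurring "be careful of LDS alignment" comments in these gridwise kernels refer to carving the A and B tiles out of a single LDS allocation: if each tile's element space is rounded up to max_lds_align, the following tile starts on an aligned boundary. A standalone sketch of that accounting with hypothetical tile sizes (an illustration of the idea, not the CK code):

#include <cstddef>
#include <iostream>

// Round x up to the next multiple of `align`.
constexpr std::size_t least_multiple(std::size_t x, std::size_t align)
{
    return ((x + align - 1) / align) * align;
}

int main()
{
    // Hypothetical tile shapes: A is KPerBlock x MPerBlock, B is KPerBlock x NPerBlock.
    constexpr std::size_t KPerBlock = 8, MPerBlock = 100, NPerBlock = 128;
    constexpr std::size_t max_lds_align = 64; // in elements

    // Round each tile's element count up so the next tile starts aligned.
    constexpr std::size_t a_space = least_multiple(KPerBlock * MPerBlock, max_lds_align); // 832
    constexpr std::size_t b_space = least_multiple(KPerBlock * NPerBlock, max_lds_align); // 1024

    std::cout << "A tile at offset 0, B tile at offset " << a_space
              << ", total LDS elements " << a_space + b_space << "\n";
    return 0;
}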
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r3.hpp
@@ -163,12 +163,12 @@ struct GridwiseGemmDlops_km_kn_mn_v1r3
         // TODO: check alignment
         // A matrix in LDS memory, dst of blockwise copy
-        constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned(
             make_tuple(Number<KPerBlock>{}, Number<MPerBlockM1>{}, K1), max_lds_align);
 
         // TODO: check alignment
         // B matrix in LDS memory, dst of blockwise copy
-        constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned(
             make_tuple(Number<KPerBlock>{}, Number<NPerBlockN1>{}, K1), max_lds_align);
 
         // TODO: check alignment

@@ -274,10 +274,10 @@ struct GridwiseGemmDlops_km_kn_mn_v1r3
         const auto N0 = N / N1;
 
         constexpr auto M11 =
-            Number<container_reduce(M11N11ThreadClusterM110Xs{}, math::multiplies_v2{}, I1) *
+            Number<container_reduce(M11N11ThreadClusterM110Xs{}, math::multiplies{}, I1) *
                    M1PerThreadM111>{};
         constexpr auto N11 =
-            Number<container_reduce(M11N11ThreadClusterN110Xs{}, math::multiplies_v2{}, I1) *
+            Number<container_reduce(M11N11ThreadClusterN110Xs{}, math::multiplies{}, I1) *
                    N1PerThreadN111>{};
 
         constexpr auto M10 = M1 / M11;

@@ -354,23 +354,23 @@ struct GridwiseGemmDlops_km_kn_mn_v1r3
         // TODO: check alignment
         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto a_k0_m0_m1_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_k0_m0_m1_k1_block_desc = make_naive_tensor_descriptor_aligned(
             make_tuple(Number<KPerBlock>{}, I1, Number<MPerBlockM1>{}, K1), max_lds_align);
 
         // TODO: check alignment
         // B matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto b_k0_n0_n1_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_k0_n0_n1_k1_block_desc = make_naive_tensor_descriptor_aligned(
             make_tuple(Number<KPerBlock>{}, I1, Number<NPerBlockN1>{}, K1), max_lds_align);
 
         // TODO: check alignment
         // A matrix in LDS memory, for blockwise GEMM
-        constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned(
             make_tuple(Number<KPerBlock>{}, Number<MPerBlockM1>{}, K1), max_lds_align);
 
         // TODO: check alignment
         // B matrix in LDS memory, for blockwise GEMM
-        constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned(
             make_tuple(Number<KPerBlock>{}, Number<NPerBlockN1>{}, K1), max_lds_align);
 
         static_assert(a_k0_m0_m1_k1_block_desc.GetElementSpaceSize() ==
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp
@@ -58,7 +58,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto a_e_k_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_e_k_desc = make_naive_tensor_descriptor_aligned(
             make_tuple(Number<E>{}, Number<KPerBlock>{}), max_lds_align);
 
         // LDS allocation for A and B: be careful of alignment

@@ -132,10 +132,10 @@ struct GridwiseGemmDlops_km_kn_mn_v3
         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto a_e_k_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_e_k_block_desc = make_naive_tensor_descriptor_aligned(
             make_tuple(Number<EPerBlock>{}, Number<KPerBlock>{}), max_lds_align);
 
-        constexpr auto a_e_k_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_e_k_desc = make_naive_tensor_descriptor_aligned(
             make_tuple(Number<E>{}, Number<KPerBlock>{}), max_lds_align);
 
         // B matrix in LDS memory, dst of blockwise copy
composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp
@@ -148,12 +148,12 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned(
             make_tuple(Number<KPerBlock>{}, Number<MPerBlock>{}, K1), max_lds_align);
 
         // B matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned(
             make_tuple(Number<KPerBlock>{}, Number<NPerBlock>{}, K1), max_lds_align);
 
         // LDS allocation for A and B: be careful of alignment

@@ -290,12 +290,12 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned(
             make_tuple(Number<KPerBlock>{}, Number<MPerBlock>{}, K1), max_lds_align);
 
         // B matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned(
             make_tuple(Number<KPerBlock>{}, Number<NPerBlock>{}, K1), max_lds_align);
 
         // A matrix blockwise copy
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v2.hpp
@@ -91,13 +91,13 @@ struct ThreadwiseTensorSliceTransfer_v3r1
             container_reverse_exclusive_scan(
                 container_reorder_given_new2old(src_vector_tensor_lengths,
                                                 SrcVectorTensorContiguousDimOrder{}),
-                math::multiplies_v2{},
+                math::multiplies{},
                 I1),
             SrcVectorTensorContiguousDimOrder{});
 
-        constexpr auto src_vector_desc = make_naive_tensor_descriptor_v2(
+        constexpr auto src_vector_desc = make_naive_tensor_descriptor(
             sequence_to_tuple_of_number(src_vector_tensor_lengths),
             sequence_to_tuple_of_number(src_vector_tensor_strides));
 
         // access order and lengths
         constexpr auto src_access_lengths = SliceLengths{} / src_vector_tensor_lengths;

@@ -259,13 +259,13 @@ struct ThreadwiseTensorSliceTransfer_v3r1
             container_reverse_exclusive_scan(
                 container_reorder_given_new2old(dst_vector_tensor_lengths,
                                                 DstVectorTensorContiguousDimOrder{}),
-                math::multiplies_v2{},
+                math::multiplies{},
                 I1),
             DstVectorTensorContiguousDimOrder{});
 
-        constexpr auto dst_vector_desc = make_naive_tensor_descriptor_v2(
+        constexpr auto dst_vector_desc = make_naive_tensor_descriptor(
             sequence_to_tuple_of_number(dst_vector_tensor_lengths),
             sequence_to_tuple_of_number(dst_vector_tensor_strides));
 
         // dst access order and lengths
         constexpr auto dst_access_lengths = SliceLengths{} / dst_vector_tensor_lengths;

@@ -699,13 +699,13 @@ struct ThreadwiseTensorSliceTransfer_v4r1
             container_reverse_exclusive_scan(
                 container_reorder_given_new2old(src_vector_tensor_lengths,
                                                 SrcVectorTensorContiguousDimOrder{}),
-                math::multiplies_v2{},
+                math::multiplies{},
                 I1),
             SrcVectorTensorContiguousDimOrder{});
 
-        constexpr auto src_vector_desc = make_naive_tensor_descriptor_v2(
+        constexpr auto src_vector_desc = make_naive_tensor_descriptor(
             sequence_to_tuple_of_number(src_vector_tensor_lengths),
             sequence_to_tuple_of_number(src_vector_tensor_strides));
 
         // access order and lengths
         constexpr auto access_lengths = SliceLengths{} / src_vector_tensor_lengths;
composable_kernel/include/utility/math.hpp
@@ -28,13 +28,7 @@ struct minus
     __host__ __device__ constexpr T operator()(T a, T b) const { return a - b; }
 };
 
-template <typename T>
 struct multiplies
 {
-    __host__ __device__ constexpr T operator()(T a, T b) const { return a * b; }
-};
-
-struct multiplies_v2
-{
     template <typename A, typename B>
     __host__ __device__ constexpr auto operator()(const A& a, const B& b) const
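For context, the multiplies that survives this hunk is the generic functor (previously multiplies_v2): its operator() is templated on both operand types, so the result type is deduced instead of being pinned to a single T, which is what lets it fold mixed compile-time Number<> and run-time integers in container_reduce. A minimal standalone sketch of that idea in plain C++ follows; the reduce helper here is a stand-in for illustration, not CK's container_reduce:

#include <cstddef>
#include <iostream>

// Generic multiply functor in the style of the surviving math::multiplies:
// operator() is templated on both operands, so mixed operand types work
// and the result type is deduced rather than forced to one T.
struct multiplies
{
    template <typename A, typename B>
    constexpr auto operator()(const A& a, const B& b) const
    {
        return a * b;
    }
};

// Tiny stand-in for a container_reduce-style fold over a parameter pack.
template <typename F, typename Init, typename... Xs>
constexpr auto reduce(F f, Init init, Xs... xs)
{
    auto acc = init;
    ((acc = f(acc, xs)), ...);
    return acc;
}

int main()
{
    // Mixed operand types fold without narrowing to a single fixed T.
    constexpr auto n = reduce(multiplies{}, std::size_t{1}, 4, 8, 2);
    std::cout << n << "\n"; // prints 64
    return 0;
}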
@gaoqiong mentioned this in commit dfb80c4e39ec7b304c3ebc88bab2a204bc4906b9 · Dec 05, 2023