Commit 1f9546e0 authored Dec 12, 2024 by root
Merge branch 'develop' into gemm_bf16_sk_muozturk
parents 78394194 86990558
Changes 484
Showing 4 changed files with 377 additions and 2 deletions
include/ck_tile/ops/permute/kernel/generic_permute_kernel.hpp      +169 -0
include/ck_tile/ops/permute/pipeline/generic_petmute_problem.hpp    +28 -0
include/ck_tile/ops/reduce.hpp                                        +4 -0
include/ck_tile/ops/reduce/block/block_reduce.hpp                   +176 -2
include/ck_tile/ops/permute/kernel/generic_permute_kernel.hpp  0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core.hpp"
#include "ck_tile/ops/common.hpp"
// #include "ck_tile/ops/permute/pipeline/generic_petmute_problem.hpp"

namespace ck_tile {

/* independent host side argument, no template
 */
struct GenericPermuteHostArgs
{
    static constexpr index_t kMaxRanks = 8; // TODO: hardcoded

    const void* p_src;
    void* p_dst;
    index_t rank;
    index_t shape[kMaxRanks]; // input shape
    index_t perm[kMaxRanks];  // permute index
};
/*
    simulate torch.permute:
        x_ = x_.view(x.shape[0],
                     x.shape[1]//16, 16,
                     x.shape[2]//32, 4, 8)
        x_ = x_.permute(0, 1, 3, 4, 2, 5)
        x_ = x_.contiguous()
        x_ = x_.view(x.shape[0], x.shape[1], x.shape[2])

    this kernel is not meant to be performant (just OK); it provides functional support for
    permutations of up to kMaxRanks dims with a single kernel
*/
template <typename Problem_>
struct GenericPermute
{
    using Problem  = ck_tile::remove_cvref_t<Problem_>;
    using DataType = remove_cvref_t<typename Problem::DataType>;

    static constexpr index_t kBlockSize = Problem::kBlockSize;
    static constexpr index_t kMaxRanks  = Problem::kMaxRanks;
    static constexpr bool KeepLastDim   = Problem::KeepLastDim;

    struct __attribute__((packed)) Kargs
    {
        const void* p_src;
        void* p_dst;
        // index_t rank;
        index_t num_elements;
        index_t perm_length[kMaxRanks]; // tensor length after permutation
        index_t perm_stride[kMaxRanks]; // tensor stride after permutation
    };

    CK_TILE_HOST static constexpr index_t TotalElements(const GenericPermuteHostArgs& h)
    {
        index_t n = 1;
        for(auto i = 0; i < h.rank; i++)
        {
            n *= h.shape[i];
        }
        return n;
    }

    CK_TILE_HOST static constexpr Kargs MakeKargs(const GenericPermuteHostArgs& h)
    {
        Kargs a;
        a.p_src = h.p_src;
        a.p_dst = h.p_dst;

        // assert rank <= kMaxRanks
        index_t i = 0;
        index_t perm[kMaxRanks];
        index_t x_shape[kMaxRanks];
        index_t x_stride[kMaxRanks];
        // index_t perm_length[kMaxRanks];
        for(; i < h.rank; i++)
        {
            x_shape[i] = h.shape[i];
            perm[i]    = h.perm[i];
        }
        for(; i < kMaxRanks; i++)
        {
            x_shape[i] = 1;
            perm[i]    = i; // will index to len = 1
        }

        index_t stride = 1;
        for(index_t j = kMaxRanks - 1; j >= 0; j--)
        {
            x_stride[j] = stride;
            stride *= x_shape[j];
        }

        for(index_t j = 0; j < kMaxRanks; j++)
        {
            a.perm_length[j] = x_shape[perm[j]];
            a.perm_stride[j] = x_stride[perm[j]];
        }
        a.num_elements = TotalElements(h);

        return a;
    }

    CK_TILE_HOST static constexpr auto GridSize(GenericPermuteHostArgs h)
    {
        auto total = TotalElements(h);
        auto grids = dim3((total + BlockSize() - 1) / BlockSize());
        // printf("### total:%d, grids:%dx%dx%d\n", total, );
        return grids;
    }

    CK_TILE_HOST_DEVICE static constexpr auto BlockSize() { return Problem::kBlockSize; }

    CK_TILE_DEVICE void operator()(Kargs kargs) const
    {
        index_t id = blockIdx.x * BlockSize() + threadIdx.x;
        if(id >= kargs.num_elements)
            return;

        const auto perm_length =
            generate_tuple([&](auto I) { return kargs.perm_length[I]; }, number<kMaxRanks>{});
        const auto perm_stride =
            generate_tuple([&](auto I) { return kargs.perm_stride[I]; }, number<kMaxRanks>{});

        const DataType* p_src = reinterpret_cast<const DataType*>(kargs.p_src);
        DataType* p_dst       = reinterpret_cast<DataType*>(kargs.p_dst);

        const auto src_view_0 = make_naive_tensor_view<address_space_enum::global>(
            p_src, perm_length, perm_stride, number<1>{}, number<1>{});

        const auto src_view = transform_tensor_view(
            src_view_0,
            make_tuple(make_merge_transform(perm_length)),
            make_tuple(typename arithmetic_sequence_gen<0, kMaxRanks, 1>::type{}),
            make_tuple(sequence<0>{}));

        auto dst_view_0 = make_naive_tensor_view_packed<address_space_enum::global>(
            p_dst, perm_length, number<1>{});

        auto dst_view = transform_tensor_view(
            dst_view_0,
            make_tuple(make_merge_transform(perm_length)),
            make_tuple(typename arithmetic_sequence_gen<0, kMaxRanks, 1>::type{}),
            make_tuple(sequence<0>{}));

        // TODO: hard code to vector 1
        using vector_t = thread_buffer<DataType, 1>;

        const auto src_coord =
            make_tensor_coordinate(src_view.get_tensor_descriptor(), array<index_t, 1>{id});
        const auto dst_coord =
            make_tensor_coordinate(dst_view.get_tensor_descriptor(), array<index_t, 1>{id});

        // printf("src id:%d, os:%d\n", id, src_coord.get_offset());
        // printf("dst id:%d, os:%d\n", id, dst_coord.get_offset());

        const vector_t x = src_view.template get_vectorized_elements<vector_t>(src_coord, 0);
        dst_view.template set_vectorized_elements<vector_t>(dst_coord, 0, x);
    }
};

} // namespace ck_tile
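
Below is a hedged host-side usage sketch that is not part of this commit; it only shows how the structs above fit together for a rank-3 (N, H, W) -> (N, W, H) permute. The data type, pointer arguments, and the kernel-launch helper are placeholders.

// Hypothetical usage sketch (not part of this commit): host-side setup for permuting a
// rank-3 tensor from (N, H, W) to (N, W, H). Pointer arguments and the launch helper are
// placeholders; adapt them to your own harness.
#include "ck_tile/ops/permute/kernel/generic_permute_kernel.hpp"
#include "ck_tile/ops/permute/pipeline/generic_petmute_problem.hpp"

inline void permute_nhw_to_nwh(const void* p_src_dev, void* p_dst_dev,
                               ck_tile::index_t N, ck_tile::index_t H, ck_tile::index_t W)
{
    using Problem = ck_tile::GenericPermuteProblem<ck_tile::fp16_t>; // 256 threads, up to 8 ranks
    using Kernel  = ck_tile::GenericPermute<Problem>;

    ck_tile::GenericPermuteHostArgs h{};
    h.p_src = p_src_dev;
    h.p_dst = p_dst_dev;
    h.rank  = 3;
    h.shape[0] = N; h.shape[1] = H; h.shape[2] = W;
    h.perm[0]  = 0; h.perm[1]  = 2; h.perm[2]  = 1; // (N, H, W) -> (N, W, H)

    auto kargs = Kernel::MakeKargs(h); // flattens shape/perm into perm_length/perm_stride
    auto grids = Kernel::GridSize(h);  // ceil(num_elements / kBlockSize) blocks
    (void)kargs;
    (void)grids;
    // launch Kernel{}(kargs) on <grids, Kernel::BlockSize()> with your usual HIP/ck_tile
    // kernel-launch helper (omitted here).
}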
include/ck_tile/ops/permute/pipeline/generic_petmute_problem.hpp  0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core/utility/type_traits.hpp"

namespace ck_tile {

template <typename DataType_,
          index_t kBlockSize_ = 256,
          index_t kMaxRanks_  = 8,
          bool KeepLastDim_   = false>
struct GenericPermuteProblem
{
    using DataType = remove_cvref_t<DataType_>;

    static constexpr index_t kBlockSize = kBlockSize_;
    static constexpr index_t kMaxRanks  = kMaxRanks_;
    /* KeepLastDim:
     * does the last dim stay the same? if so, this helps enable vector load
     *   permute(0, 2, 4, 1, 3, 5) -> true
     *   permute(0, 3, 2, 1)       -> false
     */
    static constexpr bool KeepLastDim = KeepLastDim_; // TODO: not used(?)
};

} // namespace ck_tile
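
As a hedged illustration (not in the commit) of how the KeepLastDim flag is meant to be chosen, two hypothetical instantiations follow; note the flag is still marked "TODO: not used(?)" above, so it has no effect in this commit.

// Hypothetical instantiations (not part of this commit), assuming ck_tile::bf16_t:
// last dim unchanged -> candidate for vector loads once KeepLastDim is actually wired up
using ProblemKeepLast = ck_tile::GenericPermuteProblem<ck_tile::bf16_t, 256, 8, /*KeepLastDim_=*/true>; // e.g. permute(0, 2, 4, 1, 3, 5)
// last dim moves -> generic scalar path, all defaults
using ProblemGeneric = ck_tile::GenericPermuteProblem<ck_tile::bf16_t>; // e.g. permute(0, 3, 2, 1)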
include/ck_tile/ops/reduce.hpp

@@ -4,4 +4,8 @@
#pragma once

#include "ck_tile/ops/reduce/block/block_reduce.hpp"
#include "ck_tile/ops/reduce/block/block_reduce2d.hpp"
#include "ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp"
#include "ck_tile/ops/reduce/block/block_reduce2d_problem.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
include/ck_tile/ops/reduce/block/block_reduce.hpp
// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core.hpp"
#include <tuple>

// This file does not support cross-warp reduce

namespace ck_tile {

/*
 * TODO: block_tile_reduce_sync() currently has a limitation:
 * the Y dims must have at least one dim that is not reduced
 */
// synchronize reduce result (cross lane reduction and broadcast on replicated dimension)
template <typename AccDistributedTensor_, typename ReduceFunc, bool WithBroadcast = true>
CK_TILE_DEVICE void block_tile_reduce_sync(AccDistributedTensor_& acc_tensor,
...
@@ -22,7 +28,7 @@ CK_TILE_DEVICE void block_tile_reduce_sync(AccDistributedTensor_& acc_tensor,
     constexpr index_t idim_p_lane = NDimP - 1;

-    const auto ps_idx = make_array<index_t>(get_block_id(), get_lane_id());
+    const auto ps_idx = detail::get_partition_index(acc_tensor.get_tile_distribution());
     const auto rs_idx =
         acc_tensor.get_tile_distribution().calculate_rs_index_from_ps_index(ps_idx);

     constexpr index_t thread_buf_size = AccDistributedTensor_::get_thread_buffer_size();
...
@@ -104,6 +110,65 @@ CK_TILE_DEVICE void block_tile_reduce_sync(AccDistributedTensor_& acc_tensor,
    });
}

/*
 * this version is faster: it uses xor to do the reduce, so no broadcast is needed anymore
 * TODO: the limitation is that a to-be-reduced P dim can only map to one R dim?
 */
template <typename AccDistributedTensor_, typename ReduceFunc>
CK_TILE_DEVICE void block_tile_reduce_xor_sync(AccDistributedTensor_& acc_tensor,
                                               const ReduceFunc& reduce_func)
{
    using Dstr             = typename AccDistributedTensor_::StaticTileDistribution;
    using DstrEncode       = typename Dstr::DstrEncode;
    using DstrEncodeDetail = typename DstrEncode::detail;

    constexpr index_t NDimP = Dstr::get_num_of_dimension_p();
    constexpr index_t NDimR = Dstr::get_num_of_dimension_r();

    constexpr index_t idim_p_lane = NDimP - 1;

    constexpr index_t thread_buf_size = AccDistributedTensor_::get_thread_buffer_size();

    // loop over thread data
    static_for<0, thread_buf_size, 1>{}([&](auto i) {
        auto v_local = acc_tensor.get_thread_buffer()[i];

        // cross-lane reduce for replication
        // only reduce on the R dimension that the lane id maps to
        static_for<0, NDimR, 1>{}([&](auto idim_r) {
            // FIXME: nasty to use does_p_own_r_
            if constexpr(DstrEncodeDetail::does_p_own_r_[idim_p_lane][idim_r])
            {
                constexpr index_t r_length = DstrEncode::rs_lengths_[idim_r];

                constexpr index_t lid_over_rid_derivative =
                    DstrEncodeDetail::ps_over_rs_derivative_[idim_p_lane][idim_r];

                static_assert(is_power_of_two_integer(r_length),
                              "wrong! only support power of 2 reduction");

                constexpr index_t nstage = integer_log2_floor(r_length);

                // reduction sweep forward
                static_for<0, nstage, 1>{}([&](auto istage) {
                    // xor with the stage-dependent stride to find the partner lane
                    index_t src_lane =
                        __lane_id() ^ (number<lid_over_rid_derivative << istage.value>{}.value);

                    // pull data from remote lane
                    const auto v_remote = warp_shuffle(v_local, src_lane);

                    // reduce
                    v_local = reduce_func(v_local, v_remote);
                });
            }
        });

        acc_tensor.get_thread_buffer()(i) = v_local;
    });
}
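
To see why the xor variant needs no broadcast, here is a hedged host-side simulation (not part of the commit) of the butterfly above, assuming r_length = 8 lanes, lid_over_rid_derivative = 1, and a sum reduce_func: after log2(8) = 3 stages every lane already holds the full reduction.

// Illustration only (not part of this commit): host-side simulation of the xor butterfly.
#include <array>
#include <cstdio>

int main()
{
    std::array<int, 8> lane_val{1, 2, 3, 4, 5, 6, 7, 8}; // one value per "lane"

    for(int istage = 0; istage < 3; ++istage) // nstage = integer_log2_floor(r_length) = 3
    {
        std::array<int, 8> next = lane_val;
        for(int lane = 0; lane < 8; ++lane)
        {
            int src_lane = lane ^ (1 << istage);                // partner lane, as in the xor above
            next[lane]   = lane_val[lane] + lane_val[src_lane]; // reduce_func = plus
        }
        lane_val = next;
    }

    std::printf("every lane now holds %d\n", lane_val[0]); // prints 36; same on all lanes
    return 0;
}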
// FIXME: this is for 2D to 1D reduce only, need to support n-D
template <typename AccDistributedTensor_,
          typename InDistributedTensor_,
...
@@ -175,6 +240,10 @@ CK_TILE_DEVICE void block_tile_reduce(AccDistributedTensor_& acc_tensor,
#endif
}

/*
 * TODO: block_tile_reduce() currently has a limitation:
 * the Y dims must have at least one dim that is not reduced
 */
template <typename AccDataType_,
          typename InDistributedTensor_,
          index_t... InReduceDims,
...
@@ -208,4 +277,109 @@ CK_TILE_DEVICE auto block_tile_reduce(const InDistributedTensor_& in_tensor,
    return acc_tensor;
}
// this version only supports 2D->1D reduce (reduce-dim=seq<0, 1>)
// this version only supports the case where the in/acc/out datatypes are the same
// this version will call the thread/warp reduce plus sync in one function call
//
template <typename InDistributedTensor_>
struct BlockReduce2D
{
    using InDistributedTensor = remove_cvref_t<InDistributedTensor_>;
    using InDataType          = typename InDistributedTensor::DataType;

    CK_TILE_HOST_DEVICE BlockReduce2D(const InDistributedTensor& t_, const InDataType& reduce_init_)
        : t(t_), reduce_init(reduce_init_)
    {
    }

    CK_TILE_HOST_DEVICE constexpr auto MakeDstBlockTile() const
    {
        using ReduceDim = sequence<1>; // hard coded

        constexpr auto acc_dstr = make_static_tile_distribution(
            ck_tile::detail::make_reduce_tile_distribution_encoding(
                InDistributedTensor::get_tile_distribution().get_static_tile_distribution_encoding(),
                ReduceDim{}));

        auto dst_ = make_static_distributed_tensor<InDataType>(acc_dstr);

        // init acc_tensor
        tile_elementwise_inout([&](auto& x_) { x_ = type_convert<InDataType>(reduce_init); }, dst_);

        return dst_;
    }

    // return the number of pixels each lane needs to reduce
    CK_TILE_HOST_DEVICE constexpr auto get_reduce_length_y() const
    {
        constexpr auto spans = InDistributedTensor::get_distributed_spans();
    }
    // Here ReducePacksPerXDim does not have the same meaning as in static_uford/sweep_tile_uspan;
    // it is the number of packs along the X dims, and the unpacks along the Y dim are computed
    // internally. For simplicity, we only support packing along the row dimension:
    // ReducePacksPerXDim always has 2 elements and the first element is always ignored. Also for
    // simplicity, we always scan from right to left to find along which Y dim to split.
    template <typename ReduceFunc,
              typename ReduceSyncFunc,
              typename ReducePacksPerXDim = uniform_sequence_gen_t<2, 1>>
    CK_TILE_HOST_DEVICE auto operator()(const ReduceFunc& reduce_func,
                                        const ReduceSyncFunc& reduce_sync_func,
                                        ReducePacksPerXDim = {}) const
    {
        constexpr auto spans = InDistributedTensor::get_distributed_spans();

        constexpr auto row_y_unpacks = [&]() {
            constexpr auto row_y_lengths = typename decltype(spans[number<1>{}])::Impl{};
            constexpr auto row_y_size = reduce_on_sequence(row_y_lengths, multiplies{}, number<1>{});
            constexpr auto row_y_packs = ReducePacksPerXDim{}.at(number<1>{});

            static_assert(row_y_size % row_y_packs == 0);

            constexpr auto row_y_slice_size = row_y_size / row_y_packs;

            constexpr auto slice_info = slice_sequence(row_y_lengths, number<row_y_slice_size>{});
            constexpr auto unpacks    = slice_info[number<1>{}];
            return unpacks;
        }();

        auto acc_tensor = MakeDstBlockTile();

        // in-thread reduction
        // FIXME: hard coded to be 2D to 1D reduction
        sweep_tile_span(spans[number<0>{}], [&](auto dstr_idx_i0) {
            constexpr auto acc_dstr_idx = make_tuple(dstr_idx_i0);

            auto acc = acc_tensor[acc_dstr_idx];

            sweep_tile_uspan(
                spans[number<1>{}],
                [&](auto... dstr_idx_i1) {
                    acc = reduce_func(acc, t[make_tuple(dstr_idx_i0, dstr_idx_i1)]...);
                },
                row_y_unpacks);

            acc_tensor(acc_dstr_idx) = acc;
        });

        // TODO: always use xor to do the cross-lane reduce
        block_tile_reduce_xor_sync(acc_tensor, reduce_sync_func);

        return acc_tensor;
    }

    template <typename ReduceFunc>
    CK_TILE_HOST_DEVICE auto operator()(const ReduceFunc& reduce_func) const
    {
        return operator()(reduce_func, reduce_func);
    }

    InDistributedTensor t;
    InDataType reduce_init;
};

// deduction guide
template <typename T>
CK_TILE_HOST_DEVICE_EXTERN BlockReduce2D(const T&, const typename T::DataType&) -> BlockReduce2D<T>;

} // namespace ck_tile
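
Finally, a hedged device-side usage sketch of BlockReduce2D (not part of the commit): in_tile stands for any 2D static distributed tensor already produced by the surrounding block kernel, and the sum functor is a placeholder.

// Hypothetical device-side fragment (not part of this commit): row-wise sum of a 2D
// distributed tile using the BlockReduce2D helper added above.
template <typename InTile>
CK_TILE_DEVICE auto row_sum(const InTile& in_tile)
{
    using DataType = typename InTile::DataType;

    // reduce_init = 0, reduce_func = plus; the single-argument operator() reuses the same
    // functor for the cross-lane (xor) sync step
    auto reducer = ck_tile::BlockReduce2D{in_tile, ck_tile::type_convert<DataType>(0)};
    return reducer([](auto a, auto b) { return a + b; });
}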