Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
fbfad6c2
Commit
fbfad6c2
authored
Nov 01, 2024
by
dummycoderfe
Browse files
format codes
parent
9964919d
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
61 additions
and
52 deletions
+61
-52
CMakeLists.txt
CMakeLists.txt
+5
-0
cmake/EnableCompilerWarnings.cmake
cmake/EnableCompilerWarnings.cmake
+1
-0
example/ck_tile/12_moe_sorting/moe_sorting.cpp
example/ck_tile/12_moe_sorting/moe_sorting.cpp
+6
-6
example/ck_tile/12_moe_sorting/moe_sorting_api.cpp
example/ck_tile/12_moe_sorting/moe_sorting_api.cpp
+5
-5
example/ck_tile/12_moe_sorting/moe_sorting_api.hpp
example/ck_tile/12_moe_sorting/moe_sorting_api.hpp
+1
-1
include/ck_tile/host/reference/reference_moe_sorting.hpp
include/ck_tile/host/reference/reference_moe_sorting.hpp
+13
-12
include/ck_tile/ops/moe_sorting/kernel/moe_sorting_kernel.hpp
...ude/ck_tile/ops/moe_sorting/kernel/moe_sorting_kernel.hpp
+30
-28
No files found.
CMakeLists.txt
View file @
fbfad6c2
...
...
@@ -494,6 +494,11 @@ include_directories(BEFORE
${
HIP_INCLUDE_DIRS
}
)
SET
(
BUILD_DEV ON CACHE BOOL
"BUILD_DEV"
)
if
(
BUILD_DEV
)
add_compile_options
(
-Werror
)
add_compile_options
(
-Weverything
)
endif
()
message
(
"CMAKE_CXX_FLAGS:
${
CMAKE_CXX_FLAGS
}
"
)
if
(
"
${
CMAKE_CXX_COMPILER_ID
}
"
MATCHES
"Clang"
)
...
...
cmake/EnableCompilerWarnings.cmake
View file @
fbfad6c2
...
...
@@ -66,6 +66,7 @@ else()
-Wunreachable-code
-Wunused
-Wno-reserved-identifier
-Werror
-Wno-option-ignored
-Wsign-compare
-Wno-extra-semi-stmt
...
...
example/ck_tile/12_moe_sorting/moe_sorting.cpp
View file @
fbfad6c2
...
...
@@ -19,7 +19,7 @@ auto create_args(int argc, char* argv[])
{
ck_tile
::
ArgParser
arg_parser
;
arg_parser
.
insert
(
"v"
,
"1"
,
"weather do CPU validation or not"
)
.
insert
(
"pr_i"
,
"
fp16
"
,
"in
put
data type.
fp16/fp32 (representing 8/16/32 bit data
)"
)
.
insert
(
"pr_i"
,
"
int32
"
,
"in
dex
data type.
(currently only int32 supported now
)"
)
.
insert
(
"pr_w"
,
"fp32"
,
"output weight data type(currently only fp32 supported now)"
)
.
insert
(
"t"
,
"128"
,
"number of input tokens"
)
.
insert
(
"e"
,
"8"
,
"number of experts"
)
...
...
@@ -63,7 +63,7 @@ template <typename WeightType, typename IndexType = ck_tile::index_t>
bool
test_moe_sorting
(
ck_tile
::
ArgParser
args
)
{
int
validate
=
args
.
get_int
(
"v"
);
std
::
string
in
put
_prec
=
args
.
get_str
(
"pr_i"
);
std
::
string
in
dex
_prec
=
args
.
get_str
(
"pr_i"
);
std
::
string
weight_prec
=
args
.
get_str
(
"pr_w"
);
int
tokens
=
args
.
get_int
(
"t"
);
int
experts
=
args
.
get_int
(
"e"
);
...
...
@@ -115,7 +115,7 @@ bool test_moe_sorting(ck_tile::ArgParser args)
topk_ids_dev
.
ToDevice
(
topk_ids_host
.
data
());
weights_dev
.
ToDevice
(
weights_host
.
data
());
moe_sorting_trait
trait
{
in
put
_prec
,
weight_prec
,
experts
,
topk
,
unit_size
,
tokens
};
moe_sorting_trait
trait
{
in
dex
_prec
,
weight_prec
,
experts
,
topk
,
unit_size
,
tokens
};
moe_sorting_kargs
karg
{
topk_ids_dev
.
GetDeviceBuffer
(),
weights_dev
.
GetDeviceBuffer
(),
...
...
@@ -135,7 +135,7 @@ bool test_moe_sorting(ck_tile::ArgParser args)
repeat
};
auto
ms
=
moe_sorting
(
trait
,
karg
,
sc
);
printf
(
"[%s|%s]tokens:%d, experts:%d, topk:%d, st_i:%d, ms:%f , "
,
in
put
_prec
.
c_str
(),
in
dex
_prec
.
c_str
(),
weight_prec
.
c_str
(),
tokens
,
experts
,
...
...
@@ -192,11 +192,11 @@ int main(int argc, char** argv)
auto
[
result
,
args
]
=
create_args
(
argc
,
argv
);
if
(
!
result
)
return
-
1
;
std
::
string
in
put
_prec
=
args
.
get_str
(
"pr_i"
);
std
::
string
in
dex
_prec
=
args
.
get_str
(
"pr_i"
);
std
::
string
weight_prec
=
args
.
get_str
(
"pr_w"
);
bool
r
=
true
;
if
(
weight_prec
.
compare
(
"fp32"
)
==
0
)
if
(
weight_prec
.
compare
(
"fp32"
)
==
0
&&
index_prec
.
compare
(
"int32"
)
==
0
)
{
r
&=
test_moe_sorting
<
float
,
ck_tile
::
index_t
>
(
args
);
}
...
...
example/ck_tile/12_moe_sorting/moe_sorting_api.cpp
View file @
fbfad6c2
...
...
@@ -5,11 +5,11 @@
float
moe_sorting
(
moe_sorting_trait
t
,
moe_sorting_kargs
a
,
ck_tile
::
stream_config
s
)
{
if
(
t
.
weight_type
==
"fp32"
)
if
(
t
.
weight_type
==
"fp32"
&&
t
.
index_type
==
"int32"
)
{
using
index_t
=
ck_tile
::
index_t
;
using
ms_weight_type
=
float
;
using
ms_problem
=
ck_tile
::
MoeSortingProblem
<
index_t
,
ms_weight_type
>
;
using
index_t
=
ck_tile
::
index_t
;
using
ms_weight_type
=
float
;
using
ms_problem
=
ck_tile
::
MoeSortingProblem
<
index_t
,
ms_weight_type
>
;
// using ms_pipeline = ck_tile::MoeSortingPipeline<ms_problem>;
using
kernel
=
ck_tile
::
MoeSortingKernel
<
ms_problem
>
;
auto
kargs
=
kernel
::
MakeKargs
(
a
);
...
...
@@ -17,7 +17,7 @@ float moe_sorting(moe_sorting_trait t, moe_sorting_kargs a, ck_tile::stream_conf
const
dim3
blocks
=
ck_tile
::
max
(
t
.
experts
,
ck_tile
::
get_warp_size
());
const
size_t
lds_size
=
((
blocks
.
x
+
1
)
*
t
.
experts
+
(
t
.
experts
+
1
))
*
sizeof
(
index_t
);
float
ave_time
=
ck_tile
::
launch_kernel
(
s
,
ck_tile
::
make_kernel
<
64
,
1
>
(
kernel
{},
grids
,
blocks
,
lds_size
,
kargs
));
s
,
ck_tile
::
make_kernel
(
kernel
{},
grids
,
blocks
,
lds_size
,
kargs
));
return
ave_time
;
}
return
-
1
;
...
...
example/ck_tile/12_moe_sorting/moe_sorting_api.hpp
View file @
fbfad6c2
...
...
@@ -9,7 +9,7 @@
struct
moe_sorting_trait
{
std
::
string
in
put
_type
;
std
::
string
in
dex
_type
;
std
::
string
weight_type
;
// currently always float
int
experts
;
int
topk
;
...
...
include/ck_tile/host/reference/reference_moe_sorting.hpp
View file @
fbfad6c2
...
...
@@ -19,11 +19,11 @@ CK_TILE_HOST void reference_moe_sorting(const HostTensor<IndexType>& topk_ids,
const
index_t
unit_size
)
{
const
index_t
num_token
=
topk_ids
.
mDesc
.
get_lengths
()[
0
];
const
index_t
topk
=
topk_ids
.
mDesc
.
get_lengths
()[
1
];
const
index_t
topk
=
topk_ids
.
mDesc
.
get_lengths
()[
1
];
std
::
vector
<
std
::
vector
<
IndexType
>>
expert_tokens
(
experts
,
std
::
vector
<
IndexType
>
(
unit_size
,
num_token
));
std
::
vector
<
std
::
vector
<
WeightType
>>
expert_token_weights
(
experts
,
std
::
vector
<
WeightType
>
(
unit_size
,
0
));
std
::
vector
<
std
::
vector
<
WeightType
>>
expert_token_weights
(
experts
,
std
::
vector
<
WeightType
>
(
unit_size
,
0
));
std
::
vector
<
IndexType
>
expert_slices
(
experts
,
1
);
std
::
vector
<
IndexType
>
expert_slice_idxs
(
experts
,
0
);
...
...
@@ -31,7 +31,7 @@ CK_TILE_HOST void reference_moe_sorting(const HostTensor<IndexType>& topk_ids,
{
for
(
index_t
k
=
0
;
k
<
topk
;
k
++
)
{
IndexType
e
=
topk_ids
(
t
,
k
);
IndexType
e
=
topk_ids
(
t
,
k
);
WeightType
w
=
weights
(
t
,
k
);
index_t
idx
=
expert_slice_idxs
[
e
];
if
(
idx
>
expert_slices
[
e
]
*
unit_size
-
1
)
...
...
@@ -40,10 +40,10 @@ CK_TILE_HOST void reference_moe_sorting(const HostTensor<IndexType>& topk_ids,
index_t
new_size
=
expert_slices
[
e
]
*
unit_size
;
expert_tokens
[
e
].
resize
(
new_size
);
expert_token_weights
[
e
].
resize
(
new_size
);
for
(
index_t
i
dx
=
(
expert_slices
[
e
]
-
1
)
*
unit_size
;
i
dx
<
new_size
;
i
dx
++
)
for
(
index_t
i
=
(
expert_slices
[
e
]
-
1
)
*
unit_size
;
i
<
new_size
;
i
++
)
{
expert_tokens
[
e
][
i
dx
]
=
num_token
;
expert_token_weights
[
e
][
i
dx
]
=
0
;
expert_tokens
[
e
][
i
]
=
num_token
;
expert_token_weights
[
e
][
i
]
=
0
;
}
}
...
...
@@ -53,15 +53,16 @@ CK_TILE_HOST void reference_moe_sorting(const HostTensor<IndexType>& topk_ids,
}
}
IndexType
*
out_tokens
=
sorted_token_ids
.
data
();
WeightType
*
out_weights
=
sorted_weight
.
data
();
IndexType
*
out_expert_id
=
sorted_expert_ids
.
data
();
IndexType
*
out_tokens
=
sorted_token_ids
.
data
();
WeightType
*
out_weights
=
sorted_weight
.
data
();
IndexType
*
out_expert_id
=
sorted_expert_ids
.
data
();
for
(
index_t
e
=
0
;
e
<
experts
;
e
++
)
{
memcpy
(
out_tokens
,
expert_tokens
[
e
].
data
(),
sizeof
(
index_t
)
*
expert_slices
[
e
]
*
unit_size
);
out_tokens
+=
expert_slices
[
e
]
*
unit_size
;
memcpy
(
out_weights
,
expert_token_weights
[
e
].
data
(),
sizeof
(
WeightType
)
*
expert_slices
[
e
]
*
unit_size
);
memcpy
(
out_weights
,
expert_token_weights
[
e
].
data
(),
sizeof
(
WeightType
)
*
expert_slices
[
e
]
*
unit_size
);
out_weights
+=
expert_slices
[
e
]
*
unit_size
;
for
(
index_t
s
=
0
;
s
<
expert_slices
[
e
];
s
++
)
...
...
include/ck_tile/ops/moe_sorting/kernel/moe_sorting_kernel.hpp
View file @
fbfad6c2
...
...
@@ -30,7 +30,7 @@ template <typename Problem_>
struct
MoeSortingKernel
{
// using Pipeline = remove_cvref_t<Pipeline_>;
using
Problem
=
remove_cvref_t
<
Problem_
>
;
using
Problem
=
remove_cvref_t
<
Problem_
>
;
using
IndexType
=
typename
Problem
::
IndexType
;
using
WeightType
=
typename
Problem
::
WeightType
;
...
...
@@ -55,11 +55,12 @@ struct MoeSortingKernel
index_t
*
total_tokens_post_pad
,
const
index_t
num_experts
,
const
index_t
unit_size
,
const
size
_t
numel
,
const
index
_t
numel
,
const
index_t
topk
)
const
{
const
size_t
tokens_per_thread
=
integer_divide_ceil
(
numel
,
blockDim
.
x
);
const
size_t
start_idx
=
threadIdx
.
x
*
tokens_per_thread
;
const
index_t
tokens_per_thread
=
integer_divide_ceil
(
numel
,
blockDim
.
x
);
const
index_t
tid
=
static_cast
<
index_t
>
(
threadIdx
.
x
);
const
index_t
start_idx
=
tid
*
tokens_per_thread
;
extern
__shared__
index_t
shared_mem
[];
...
...
@@ -68,34 +69,35 @@ struct MoeSortingKernel
for
(
int
i
=
0
;
i
<
num_experts
;
++
i
)
{
tokens_cnts
[
calc_index
(
num_experts
,
t
hreadIdx
.
x
+
1
,
i
)]
=
0
;
tokens_cnts
[
calc_index
(
num_experts
,
t
id
+
1
,
i
)]
=
0
;
}
for
(
int
i
=
start_idx
;
i
<
numel
&&
i
<
start_idx
+
tokens_per_thread
;
++
i
)
for
(
int
i
=
start_idx
;
i
<
numel
&&
i
<
start_idx
+
tokens_per_thread
;
++
i
)
{
++
tokens_cnts
[
calc_index
(
num_experts
,
t
hreadIdx
.
x
+
1
,
topk_id
[
i
])];
++
tokens_cnts
[
calc_index
(
num_experts
,
t
id
+
1
,
topk_id
[
i
])];
}
__syncthreads
();
if
(
t
hreadIdx
.
x
<
num_experts
)
if
(
t
id
<
num_experts
)
{
tokens_cnts
[
calc_index
(
num_experts
,
0
,
t
hreadIdx
.
x
)]
=
0
;
for
(
int
i
=
1
;
i
<=
blockDim
.
x
;
++
i
)
tokens_cnts
[
calc_index
(
num_experts
,
0
,
t
id
)]
=
0
;
for
(
int
i
=
1
;
i
<=
static_cast
<
index_t
>
(
blockDim
.
x
)
;
++
i
)
{
tokens_cnts
[
calc_index
(
num_experts
,
i
,
t
hreadIdx
.
x
)]
+=
tokens_cnts
[
calc_index
(
num_experts
,
i
-
1
,
t
hreadIdx
.
x
)];
tokens_cnts
[
calc_index
(
num_experts
,
i
,
t
id
)]
+=
tokens_cnts
[
calc_index
(
num_experts
,
i
-
1
,
t
id
)];
}
}
__syncthreads
();
if
(
t
hreadIdx
.
x
==
0
)
if
(
t
id
==
0
)
{
cumsum
[
0
]
=
0
;
for
(
int
i
=
1
;
i
<=
num_experts
;
++
i
)
{
cumsum
[
i
]
=
cumsum
[
i
-
1
]
+
max
(
integer_divide_ceil
(
tokens_cnts
[
calc_index
(
num_experts
,
blockDim
.
x
,
i
-
1
)],
unit_size
),
max
(
integer_divide_ceil
(
tokens_cnts
[
calc_index
(
num_experts
,
blockDim
.
x
,
i
-
1
)],
unit_size
),
1
)
*
unit_size
;
}
...
...
@@ -103,11 +105,11 @@ struct MoeSortingKernel
}
__syncthreads
();
if
(
t
hreadIdx
.
x
<
num_experts
)
if
(
t
id
<
num_experts
)
{
for
(
int
i
=
cumsum
[
t
hreadIdx
.
x
];
i
<
cumsum
[
t
hreadIdx
.
x
+
1
];
i
+=
unit_size
)
for
(
int
i
=
cumsum
[
t
id
];
i
<
cumsum
[
t
id
+
1
];
i
+=
unit_size
)
{
expert_ids
[
i
/
unit_size
]
=
t
hreadIdx
.
x
;
expert_ids
[
i
/
unit_size
]
=
t
id
;
}
}
...
...
@@ -115,17 +117,17 @@ struct MoeSortingKernel
{
index_t
expert_id
=
topk_id
[
i
];
index_t
rank_post_pad
=
tokens_cnts
[
calc_index
(
num_experts
,
t
hreadIdx
.
x
,
expert_id
)]
+
cumsum
[
expert_id
];
tokens_cnts
[
calc_index
(
num_experts
,
t
id
,
expert_id
)]
+
cumsum
[
expert_id
];
sorted_token_ids
[
rank_post_pad
]
=
i
/
topk
;
sorted_weights
[
rank_post_pad
]
=
weights
[
i
];
++
tokens_cnts
[
calc_index
(
num_experts
,
t
hreadIdx
.
x
,
expert_id
)];
++
tokens_cnts
[
calc_index
(
num_experts
,
t
id
,
expert_id
)];
}
const
index_t
prefill_token
=
numel
/
topk
;
if
(
t
hreadIdx
.
x
<
num_experts
)
if
(
t
id
<
num_experts
)
{
index_t
expert_offset
=
cumsum
[
t
hreadIdx
.
x
]
+
tokens_cnts
[
calc_index
(
num_experts
,
blockDim
.
x
,
t
hreadIdx
.
x
)];
while
(
expert_offset
<
cumsum
[
t
hreadIdx
.
x
+
1
])
cumsum
[
t
id
]
+
tokens_cnts
[
calc_index
(
num_experts
,
blockDim
.
x
,
t
id
)];
while
(
expert_offset
<
cumsum
[
t
id
+
1
])
{
sorted_token_ids
[
expert_offset
]
=
prefill_token
;
sorted_weights
[
expert_offset
]
=
static_cast
<
WeightType
>
(
0.0
);
...
...
@@ -137,12 +139,12 @@ struct MoeSortingKernel
CK_TILE_DEVICE
void
operator
()(
Kargs
kargs
)
const
{
const
size_t
numel
=
kargs
.
tokens
*
kargs
.
topk
;
return
moe_align_block_size_kernel
(
static_cast
<
const
IndexType
*>
(
kargs
.
p_topk_ids
),
static_cast
<
const
WeightType
*>
(
kargs
.
p_weights
),
static_cast
<
IndexType
*>
(
kargs
.
sorted_token_ids
),
static_cast
<
WeightType
*>
(
kargs
.
sorted_weights
),
static_cast
<
IndexType
*>
(
kargs
.
expert_ids
),
static_cast
<
IndexType
*>
(
kargs
.
total_tokens_post_pad
),
return
moe_align_block_size_kernel
(
static_cast
<
const
IndexType
*>
(
kargs
.
p_topk_ids
),
static_cast
<
const
WeightType
*>
(
kargs
.
p_weights
),
static_cast
<
IndexType
*>
(
kargs
.
sorted_token_ids
),
static_cast
<
WeightType
*>
(
kargs
.
sorted_weights
),
static_cast
<
IndexType
*>
(
kargs
.
expert_ids
),
static_cast
<
IndexType
*>
(
kargs
.
total_tokens_post_pad
),
kargs
.
num_experts
,
kargs
.
unit_size
,
numel
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment