Commit fbfad6c2 authored by dummycoderfe

format codes

parent 9964919d
@@ -494,6 +494,11 @@ include_directories(BEFORE
     ${HIP_INCLUDE_DIRS}
 )
+SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
+if(BUILD_DEV)
+    add_compile_options(-Werror)
+    add_compile_options(-Weverything)
+endif()
 message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
 if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
...
@@ -66,6 +66,7 @@ else()
         -Wunreachable-code
         -Wunused
         -Wno-reserved-identifier
+        -Werror
         -Wno-option-ignored
         -Wsign-compare
         -Wno-extra-semi-stmt
...
@@ -19,7 +19,7 @@ auto create_args(int argc, char* argv[])
 {
     ck_tile::ArgParser arg_parser;
     arg_parser.insert("v", "1", "weather do CPU validation or not")
-        .insert("pr_i", "fp16", "input data type. fp16/fp32 (representing 8/16/32 bit data)")
+        .insert("pr_i", "int32", "index data type. (currently only int32 supported now)")
         .insert("pr_w", "fp32", "output weight data type(currently only fp32 supported now)")
         .insert("t", "128", "number of input tokens")
         .insert("e", "8", "number of experts")
@@ -63,7 +63,7 @@ template <typename WeightType, typename IndexType = ck_tile::index_t>
 bool test_moe_sorting(ck_tile::ArgParser args)
 {
     int validate = args.get_int("v");
-    std::string input_prec = args.get_str("pr_i");
+    std::string index_prec = args.get_str("pr_i");
     std::string weight_prec = args.get_str("pr_w");
     int tokens = args.get_int("t");
     int experts = args.get_int("e");
@@ -115,7 +115,7 @@ bool test_moe_sorting(ck_tile::ArgParser args)
     topk_ids_dev.ToDevice(topk_ids_host.data());
     weights_dev.ToDevice(weights_host.data());
-    moe_sorting_trait trait{input_prec, weight_prec, experts, topk, unit_size, tokens};
+    moe_sorting_trait trait{index_prec, weight_prec, experts, topk, unit_size, tokens};
     moe_sorting_kargs karg{topk_ids_dev.GetDeviceBuffer(),
                            weights_dev.GetDeviceBuffer(),
@@ -135,7 +135,7 @@ bool test_moe_sorting(ck_tile::ArgParser args)
                            repeat};
     auto ms = moe_sorting(trait, karg, sc);
     printf("[%s|%s]tokens:%d, experts:%d, topk:%d, st_i:%d, ms:%f , ",
-           input_prec.c_str(),
+           index_prec.c_str(),
            weight_prec.c_str(),
            tokens,
            experts,
@@ -192,11 +192,11 @@ int main(int argc, char** argv)
     auto [result, args] = create_args(argc, argv);
     if(!result)
         return -1;
-    std::string input_prec = args.get_str("pr_i");
+    std::string index_prec = args.get_str("pr_i");
     std::string weight_prec = args.get_str("pr_w");
     bool r = true;
-    if(weight_prec.compare("fp32") == 0)
+    if(weight_prec.compare("fp32") == 0 && index_prec.compare("int32") == 0)
     {
         r &= test_moe_sorting<float, ck_tile::index_t>(args);
     }
...
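The example driver maps the runtime precision strings onto a single compile-time instantiation and rejects everything else. Below is a minimal standalone sketch of that dispatch pattern; `run_case` is a hypothetical stand-in for `test_moe_sorting<WeightType, IndexType>(args)`, and only the fp32-weight / int32-index combination wired up in this commit is handled.

```cpp
// Sketch of the string-to-template dispatch used by the example driver.
// run_case is a hypothetical stand-in for test_moe_sorting<W, I>(args).
#include <cstdint>
#include <string>

template <typename WeightType, typename IndexType>
bool run_case()
{
    return true; // real code would build tensors and call the kernel here
}

bool dispatch(const std::string& weight_prec, const std::string& index_prec)
{
    bool r = true;
    if(weight_prec == "fp32" && index_prec == "int32")
        r &= run_case<float, std::int32_t>(); // the only supported combination so far
    else
        r = false; // every other combination is rejected, matching the example
    return r;
}
```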
@@ -5,11 +5,11 @@
 float moe_sorting(moe_sorting_trait t, moe_sorting_kargs a, ck_tile::stream_config s)
 {
-    if(t.weight_type == "fp32")
+    if(t.weight_type == "fp32" && t.index_type == "int32")
     {
         using index_t = ck_tile::index_t;
         using ms_weight_type = float;
         using ms_problem = ck_tile::MoeSortingProblem<index_t, ms_weight_type>;
         // using ms_pipeline = ck_tile::MoeSortingPipeline<ms_problem>;
         using kernel = ck_tile::MoeSortingKernel<ms_problem>;
         auto kargs = kernel::MakeKargs(a);
@@ -17,7 +17,7 @@ float moe_sorting(moe_sorting_trait t, moe_sorting_kargs a, ck_tile::stream_conf
         const dim3 blocks = ck_tile::max(t.experts, ck_tile::get_warp_size());
         const size_t lds_size = ((blocks.x + 1) * t.experts + (t.experts + 1)) * sizeof(index_t);
         float ave_time = ck_tile::launch_kernel(
-            s, ck_tile::make_kernel<64, 1>(kernel{}, grids, blocks, lds_size, kargs));
+            s, ck_tile::make_kernel(kernel{}, grids, blocks, lds_size, kargs));
         return ave_time;
     }
     return -1;
...
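The launcher sizes the dynamic LDS buffer as `((blocks.x + 1) * experts + (experts + 1)) * sizeof(index_t)`, which corresponds to the `(blockDim.x + 1) x experts` token-counter table plus the `(experts + 1)` cumulative-sum array the kernel keeps in shared memory. A hedged host-side sketch of that calculation follows, assuming a 64-lane warp and a 32-bit `index_t`.

```cpp
// Host-side sketch of the LDS sizing used at launch (assumed 64-lane warps,
// 32-bit index_t): a (block_threads + 1) x experts counter table followed by
// an (experts + 1) cumulative-sum array.
#include <algorithm>
#include <cstddef>
#include <cstdint>

using index_t = std::int32_t;

std::size_t moe_sorting_lds_bytes(int experts, int warp_size = 64)
{
    const int block_threads = std::max(experts, warp_size); // blocks.x in the launcher
    const std::size_t counter_table = static_cast<std::size_t>(block_threads + 1) * experts;
    const std::size_t cumsum_array = static_cast<std::size_t>(experts) + 1;
    return (counter_table + cumsum_array) * sizeof(index_t);
}
```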
@@ -9,7 +9,7 @@
 struct moe_sorting_trait
 {
-    std::string input_type;
+    std::string index_type;
     std::string weight_type; // currently always float
     int experts;
     int topk;
...
@@ -19,11 +19,11 @@ CK_TILE_HOST void reference_moe_sorting(const HostTensor<IndexType>& topk_ids,
                                         const index_t unit_size)
 {
     const index_t num_token = topk_ids.mDesc.get_lengths()[0];
     const index_t topk = topk_ids.mDesc.get_lengths()[1];
     std::vector<std::vector<IndexType>> expert_tokens(experts,
                                                       std::vector<IndexType>(unit_size, num_token));
-    std::vector<std::vector<WeightType>> expert_token_weights(experts,
-                                                              std::vector<WeightType>(unit_size, 0));
+    std::vector<std::vector<WeightType>> expert_token_weights(
+        experts, std::vector<WeightType>(unit_size, 0));
     std::vector<IndexType> expert_slices(experts, 1);
     std::vector<IndexType> expert_slice_idxs(experts, 0);
@@ -31,7 +31,7 @@ CK_TILE_HOST void reference_moe_sorting(const HostTensor<IndexType>& topk_ids,
     {
         for(index_t k = 0; k < topk; k++)
         {
             IndexType e = topk_ids(t, k);
             WeightType w = weights(t, k);
             index_t idx = expert_slice_idxs[e];
             if(idx > expert_slices[e] * unit_size - 1)
@@ -40,10 +40,10 @@ CK_TILE_HOST void reference_moe_sorting(const HostTensor<IndexType>& topk_ids,
                 index_t new_size = expert_slices[e] * unit_size;
                 expert_tokens[e].resize(new_size);
                 expert_token_weights[e].resize(new_size);
-                for(index_t idx = (expert_slices[e] - 1) * unit_size; idx < new_size; idx++)
+                for(index_t i = (expert_slices[e] - 1) * unit_size; i < new_size; i++)
                 {
-                    expert_tokens[e][idx] = num_token;
-                    expert_token_weights[e][idx] = 0;
+                    expert_tokens[e][i] = num_token;
+                    expert_token_weights[e][i] = 0;
                 }
             }
@@ -53,15 +53,16 @@ CK_TILE_HOST void reference_moe_sorting(const HostTensor<IndexType>& topk_ids,
         }
     }
     IndexType* out_tokens = sorted_token_ids.data();
     WeightType* out_weights = sorted_weight.data();
     IndexType* out_expert_id = sorted_expert_ids.data();
     for(index_t e = 0; e < experts; e++)
     {
         memcpy(out_tokens, expert_tokens[e].data(), sizeof(index_t) * expert_slices[e] * unit_size);
         out_tokens += expert_slices[e] * unit_size;
-        memcpy(
-            out_weights, expert_token_weights[e].data(), sizeof(WeightType) * expert_slices[e] * unit_size);
+        memcpy(out_weights,
+               expert_token_weights[e].data(),
+               sizeof(WeightType) * expert_slices[e] * unit_size);
         out_weights += expert_slices[e] * unit_size;
         for(index_t s = 0; s < expert_slices[e]; s++)
...
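The host reference buckets every `(token, topk)` pair by expert, pads each bucket to a whole number of `unit_size` blocks with a sentinel token id equal to `num_token` and a zero weight, and then concatenates the buckets in expert order while recording one expert id per block. A self-contained sketch of that scheme, using only `std` containers and illustrative names:

```cpp
// Standalone sketch of the host reference's bucketing scheme (illustrative
// names, std containers only): append each (token, topk) pair to its expert's
// bucket, pad each bucket to a multiple of unit_size with sentinel entries,
// then concatenate the buckets in expert order.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

struct SortedOutput
{
    std::vector<std::int32_t> token_ids;  // sentinel = num_tokens for padding slots
    std::vector<float> weights;           // 0.0f for padding slots
    std::vector<std::int32_t> expert_ids; // one entry per unit_size block
};

SortedOutput sort_by_expert(const std::vector<std::vector<std::int32_t>>& topk_ids,
                            const std::vector<std::vector<float>>& topk_weights,
                            int experts,
                            int unit_size)
{
    const std::int32_t num_tokens = static_cast<std::int32_t>(topk_ids.size());
    std::vector<std::vector<std::int32_t>> bucket_tokens(experts);
    std::vector<std::vector<float>> bucket_weights(experts);

    for(std::int32_t t = 0; t < num_tokens; ++t)
        for(std::size_t k = 0; k < topk_ids[t].size(); ++k)
        {
            const std::int32_t e = topk_ids[t][k];
            bucket_tokens[e].push_back(t);
            bucket_weights[e].push_back(topk_weights[t][k]);
        }

    SortedOutput out;
    for(int e = 0; e < experts; ++e)
    {
        // round each bucket up to at least one full unit_size block
        const std::size_t blocks = std::max<std::size_t>(
            1, (bucket_tokens[e].size() + unit_size - 1) / unit_size);
        bucket_tokens[e].resize(blocks * unit_size, num_tokens); // sentinel padding
        bucket_weights[e].resize(blocks * unit_size, 0.0f);

        out.token_ids.insert(out.token_ids.end(), bucket_tokens[e].begin(), bucket_tokens[e].end());
        out.weights.insert(out.weights.end(), bucket_weights[e].begin(), bucket_weights[e].end());
        for(std::size_t b = 0; b < blocks; ++b)
            out.expert_ids.push_back(e);
    }
    return out;
}
```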
@@ -30,7 +30,7 @@ template <typename Problem_>
 struct MoeSortingKernel
 {
     // using Pipeline = remove_cvref_t<Pipeline_>;
     using Problem = remove_cvref_t<Problem_>;
     using IndexType = typename Problem::IndexType;
     using WeightType = typename Problem::WeightType;
@@ -55,11 +55,12 @@ struct MoeSortingKernel
                                                   index_t* total_tokens_post_pad,
                                                   const index_t num_experts,
                                                   const index_t unit_size,
-                                                  const size_t numel,
+                                                  const index_t numel,
                                                   const index_t topk) const
     {
-        const size_t tokens_per_thread = integer_divide_ceil(numel, blockDim.x);
-        const size_t start_idx = threadIdx.x * tokens_per_thread;
+        const index_t tokens_per_thread = integer_divide_ceil(numel, blockDim.x);
+        const index_t tid = static_cast<index_t>(threadIdx.x);
+        const index_t start_idx = tid * tokens_per_thread;
 
         extern __shared__ index_t shared_mem[];
@@ -68,34 +69,35 @@ struct MoeSortingKernel
         for(int i = 0; i < num_experts; ++i)
         {
-            tokens_cnts[calc_index(num_experts, threadIdx.x + 1, i)] = 0;
+            tokens_cnts[calc_index(num_experts, tid + 1, i)] = 0;
         }
-        for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i)
+        for(int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i)
         {
-            ++tokens_cnts[calc_index(num_experts, threadIdx.x + 1, topk_id[i])];
+            ++tokens_cnts[calc_index(num_experts, tid + 1, topk_id[i])];
         }
 
         __syncthreads();
 
-        if(threadIdx.x < num_experts)
+        if(tid < num_experts)
         {
-            tokens_cnts[calc_index(num_experts, 0, threadIdx.x)] = 0;
-            for(int i = 1; i <= blockDim.x; ++i)
+            tokens_cnts[calc_index(num_experts, 0, tid)] = 0;
+            for(int i = 1; i <= static_cast<index_t>(blockDim.x); ++i)
             {
-                tokens_cnts[calc_index(num_experts, i, threadIdx.x)] +=
-                    tokens_cnts[calc_index(num_experts, i - 1, threadIdx.x)];
+                tokens_cnts[calc_index(num_experts, i, tid)] +=
+                    tokens_cnts[calc_index(num_experts, i - 1, tid)];
            }
        }
 
         __syncthreads();
 
-        if(threadIdx.x == 0)
+        if(tid == 0)
         {
             cumsum[0] = 0;
             for(int i = 1; i <= num_experts; ++i)
             {
                 cumsum[i] =
                     cumsum[i - 1] +
-                    max(integer_divide_ceil(tokens_cnts[calc_index(num_experts, blockDim.x, i - 1)], unit_size),
+                    max(integer_divide_ceil(tokens_cnts[calc_index(num_experts, blockDim.x, i - 1)],
+                                            unit_size),
                         1) *
                         unit_size;
             }
@@ -103,11 +105,11 @@ struct MoeSortingKernel
         }
 
         __syncthreads();
 
-        if(threadIdx.x < num_experts)
+        if(tid < num_experts)
         {
-            for(int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1]; i += unit_size)
+            for(int i = cumsum[tid]; i < cumsum[tid + 1]; i += unit_size)
             {
-                expert_ids[i / unit_size] = threadIdx.x;
+                expert_ids[i / unit_size] = tid;
             }
         }
@@ -115,17 +117,17 @@ struct MoeSortingKernel
         {
             index_t expert_id = topk_id[i];
             index_t rank_post_pad =
-                tokens_cnts[calc_index(num_experts, threadIdx.x, expert_id)] + cumsum[expert_id];
+                tokens_cnts[calc_index(num_experts, tid, expert_id)] + cumsum[expert_id];
             sorted_token_ids[rank_post_pad] = i / topk;
             sorted_weights[rank_post_pad] = weights[i];
-            ++tokens_cnts[calc_index(num_experts, threadIdx.x, expert_id)];
+            ++tokens_cnts[calc_index(num_experts, tid, expert_id)];
         }
 
         const index_t prefill_token = numel / topk;
-        if(threadIdx.x < num_experts)
+        if(tid < num_experts)
         {
             index_t expert_offset =
-                cumsum[threadIdx.x] + tokens_cnts[calc_index(num_experts, blockDim.x, threadIdx.x)];
-            while(expert_offset < cumsum[threadIdx.x + 1])
+                cumsum[tid] + tokens_cnts[calc_index(num_experts, blockDim.x, tid)];
+            while(expert_offset < cumsum[tid + 1])
             {
                 sorted_token_ids[expert_offset] = prefill_token;
                 sorted_weights[expert_offset] = static_cast<WeightType>(0.0);
@@ -137,12 +139,12 @@ struct MoeSortingKernel
     CK_TILE_DEVICE void operator()(Kargs kargs) const
     {
         const size_t numel = kargs.tokens * kargs.topk;
-        return moe_align_block_size_kernel(static_cast<const IndexType *>(kargs.p_topk_ids),
-                                           static_cast<const WeightType *>(kargs.p_weights),
-                                           static_cast<IndexType *>(kargs.sorted_token_ids),
-                                           static_cast<WeightType *>(kargs.sorted_weights),
-                                           static_cast<IndexType *>(kargs.expert_ids),
-                                           static_cast<IndexType *>(kargs.total_tokens_post_pad),
+        return moe_align_block_size_kernel(static_cast<const IndexType*>(kargs.p_topk_ids),
+                                           static_cast<const WeightType*>(kargs.p_weights),
+                                           static_cast<IndexType*>(kargs.sorted_token_ids),
+                                           static_cast<WeightType*>(kargs.sorted_weights),
+                                           static_cast<IndexType*>(kargs.expert_ids),
+                                           static_cast<IndexType*>(kargs.total_tokens_post_pad),
                                            kargs.num_experts,
                                            kargs.unit_size,
                                            numel,
...
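In the kernel, the single-thread cumsum step rounds each expert's token count up to at least one full `unit_size` block before accumulating offsets, and the final entry becomes `total_tokens_post_pad`. A serial sketch of that formula with a small worked example (plain `std::vector` stands in for the LDS arrays; names are illustrative):

```cpp
// Serial sketch of the padded prefix sum computed by thread 0 in the kernel:
// each expert's token count is rounded up to at least one full unit_size
// block, and cumsum[e] gives the start offset of expert e in the padded,
// sorted output (cumsum[num_experts] is the total padded token count).
#include <algorithm>
#include <cstdint>
#include <vector>

std::vector<std::int32_t> padded_cumsum(const std::vector<std::int32_t>& tokens_per_expert,
                                        std::int32_t unit_size)
{
    const std::size_t num_experts = tokens_per_expert.size();
    std::vector<std::int32_t> cumsum(num_experts + 1, 0);
    for(std::size_t e = 0; e < num_experts; ++e)
    {
        const std::int32_t blocks =
            std::max<std::int32_t>(1, (tokens_per_expert[e] + unit_size - 1) / unit_size);
        cumsum[e + 1] = cumsum[e] + blocks * unit_size;
    }
    return cumsum;
}

// Example: counts {5, 0, 9} with unit_size 4 give cumsum {0, 8, 12, 24},
// i.e. 24 is total_tokens_post_pad for this block.
```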