Commit fbfad6c2 authored by dummycoderfe's avatar dummycoderfe
Browse files

format codes

parent 9964919d
......@@ -494,6 +494,11 @@ include_directories(BEFORE
${HIP_INCLUDE_DIRS}
)
SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
if(BUILD_DEV)
add_compile_options(-Werror)
add_compile_options(-Weverything)
endif()
message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
......
......@@ -66,6 +66,7 @@ else()
-Wunreachable-code
-Wunused
-Wno-reserved-identifier
-Werror
-Wno-option-ignored
-Wsign-compare
-Wno-extra-semi-stmt
......
......@@ -19,7 +19,7 @@ auto create_args(int argc, char* argv[])
{
ck_tile::ArgParser arg_parser;
arg_parser.insert("v", "1", "weather do CPU validation or not")
.insert("pr_i", "fp16", "input data type. fp16/fp32 (representing 8/16/32 bit data)")
.insert("pr_i", "int32", "index data type. (currently only int32 supported now)")
.insert("pr_w", "fp32", "output weight data type(currently only fp32 supported now)")
.insert("t", "128", "number of input tokens")
.insert("e", "8", "number of experts")
......@@ -63,7 +63,7 @@ template <typename WeightType, typename IndexType = ck_tile::index_t>
bool test_moe_sorting(ck_tile::ArgParser args)
{
int validate = args.get_int("v");
std::string input_prec = args.get_str("pr_i");
std::string index_prec = args.get_str("pr_i");
std::string weight_prec = args.get_str("pr_w");
int tokens = args.get_int("t");
int experts = args.get_int("e");
......@@ -115,7 +115,7 @@ bool test_moe_sorting(ck_tile::ArgParser args)
topk_ids_dev.ToDevice(topk_ids_host.data());
weights_dev.ToDevice(weights_host.data());
moe_sorting_trait trait{input_prec, weight_prec, experts, topk, unit_size, tokens};
moe_sorting_trait trait{index_prec, weight_prec, experts, topk, unit_size, tokens};
moe_sorting_kargs karg{topk_ids_dev.GetDeviceBuffer(),
weights_dev.GetDeviceBuffer(),
......@@ -135,7 +135,7 @@ bool test_moe_sorting(ck_tile::ArgParser args)
repeat};
auto ms = moe_sorting(trait, karg, sc);
printf("[%s|%s]tokens:%d, experts:%d, topk:%d, st_i:%d, ms:%f , ",
input_prec.c_str(),
index_prec.c_str(),
weight_prec.c_str(),
tokens,
experts,
......@@ -192,11 +192,11 @@ int main(int argc, char** argv)
auto [result, args] = create_args(argc, argv);
if(!result)
return -1;
std::string input_prec = args.get_str("pr_i");
std::string index_prec = args.get_str("pr_i");
std::string weight_prec = args.get_str("pr_w");
bool r = true;
if(weight_prec.compare("fp32") == 0)
if(weight_prec.compare("fp32") == 0 && index_prec.compare("int32") == 0)
{
r &= test_moe_sorting<float, ck_tile::index_t>(args);
}
......
......@@ -5,7 +5,7 @@
float moe_sorting(moe_sorting_trait t, moe_sorting_kargs a, ck_tile::stream_config s)
{
if(t.weight_type == "fp32")
if(t.weight_type == "fp32" && t.index_type == "int32")
{
using index_t = ck_tile::index_t;
using ms_weight_type = float;
......@@ -17,7 +17,7 @@ float moe_sorting(moe_sorting_trait t, moe_sorting_kargs a, ck_tile::stream_conf
const dim3 blocks = ck_tile::max(t.experts, ck_tile::get_warp_size());
const size_t lds_size = ((blocks.x + 1) * t.experts + (t.experts + 1)) * sizeof(index_t);
float ave_time = ck_tile::launch_kernel(
s, ck_tile::make_kernel<64, 1>(kernel{}, grids, blocks, lds_size, kargs));
s, ck_tile::make_kernel(kernel{}, grids, blocks, lds_size, kargs));
return ave_time;
}
return -1;
......
......@@ -9,7 +9,7 @@
struct moe_sorting_trait
{
std::string input_type;
std::string index_type;
std::string weight_type; // currently always float
int experts;
int topk;
......
......@@ -22,8 +22,8 @@ CK_TILE_HOST void reference_moe_sorting(const HostTensor<IndexType>& topk_ids,
const index_t topk = topk_ids.mDesc.get_lengths()[1];
std::vector<std::vector<IndexType>> expert_tokens(experts,
std::vector<IndexType>(unit_size, num_token));
std::vector<std::vector<WeightType>> expert_token_weights(experts,
std::vector<WeightType>(unit_size, 0));
std::vector<std::vector<WeightType>> expert_token_weights(
experts, std::vector<WeightType>(unit_size, 0));
std::vector<IndexType> expert_slices(experts, 1);
std::vector<IndexType> expert_slice_idxs(experts, 0);
......@@ -40,10 +40,10 @@ CK_TILE_HOST void reference_moe_sorting(const HostTensor<IndexType>& topk_ids,
index_t new_size = expert_slices[e] * unit_size;
expert_tokens[e].resize(new_size);
expert_token_weights[e].resize(new_size);
for(index_t idx = (expert_slices[e] - 1) * unit_size; idx < new_size; idx++)
for(index_t i = (expert_slices[e] - 1) * unit_size; i < new_size; i++)
{
expert_tokens[e][idx] = num_token;
expert_token_weights[e][idx] = 0;
expert_tokens[e][i] = num_token;
expert_token_weights[e][i] = 0;
}
}
......@@ -60,8 +60,9 @@ CK_TILE_HOST void reference_moe_sorting(const HostTensor<IndexType>& topk_ids,
{
memcpy(out_tokens, expert_tokens[e].data(), sizeof(index_t) * expert_slices[e] * unit_size);
out_tokens += expert_slices[e] * unit_size;
memcpy(
out_weights, expert_token_weights[e].data(), sizeof(WeightType) * expert_slices[e] * unit_size);
memcpy(out_weights,
expert_token_weights[e].data(),
sizeof(WeightType) * expert_slices[e] * unit_size);
out_weights += expert_slices[e] * unit_size;
for(index_t s = 0; s < expert_slices[e]; s++)
......
......@@ -55,11 +55,12 @@ struct MoeSortingKernel
index_t* total_tokens_post_pad,
const index_t num_experts,
const index_t unit_size,
const size_t numel,
const index_t numel,
const index_t topk) const
{
const size_t tokens_per_thread = integer_divide_ceil(numel, blockDim.x);
const size_t start_idx = threadIdx.x * tokens_per_thread;
const index_t tokens_per_thread = integer_divide_ceil(numel, blockDim.x);
const index_t tid = static_cast<index_t>(threadIdx.x);
const index_t start_idx = tid * tokens_per_thread;
extern __shared__ index_t shared_mem[];
......@@ -68,34 +69,35 @@ struct MoeSortingKernel
for(int i = 0; i < num_experts; ++i)
{
tokens_cnts[calc_index(num_experts, threadIdx.x + 1, i)] = 0;
tokens_cnts[calc_index(num_experts, tid + 1, i)] = 0;
}
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i)
for(int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i)
{
++tokens_cnts[calc_index(num_experts, threadIdx.x + 1, topk_id[i])];
++tokens_cnts[calc_index(num_experts, tid + 1, topk_id[i])];
}
__syncthreads();
if(threadIdx.x < num_experts)
if(tid < num_experts)
{
tokens_cnts[calc_index(num_experts, 0, threadIdx.x)] = 0;
for(int i = 1; i <= blockDim.x; ++i)
tokens_cnts[calc_index(num_experts, 0, tid)] = 0;
for(int i = 1; i <= static_cast<index_t>(blockDim.x); ++i)
{
tokens_cnts[calc_index(num_experts, i, threadIdx.x)] +=
tokens_cnts[calc_index(num_experts, i - 1, threadIdx.x)];
tokens_cnts[calc_index(num_experts, i, tid)] +=
tokens_cnts[calc_index(num_experts, i - 1, tid)];
}
}
__syncthreads();
if(threadIdx.x == 0)
if(tid == 0)
{
cumsum[0] = 0;
for(int i = 1; i <= num_experts; ++i)
{
cumsum[i] =
cumsum[i - 1] +
max(integer_divide_ceil(tokens_cnts[calc_index(num_experts, blockDim.x, i - 1)], unit_size),
max(integer_divide_ceil(tokens_cnts[calc_index(num_experts, blockDim.x, i - 1)],
unit_size),
1) *
unit_size;
}
......@@ -103,11 +105,11 @@ struct MoeSortingKernel
}
__syncthreads();
if(threadIdx.x < num_experts)
if(tid < num_experts)
{
for(int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1]; i += unit_size)
for(int i = cumsum[tid]; i < cumsum[tid + 1]; i += unit_size)
{
expert_ids[i / unit_size] = threadIdx.x;
expert_ids[i / unit_size] = tid;
}
}
......@@ -115,17 +117,17 @@ struct MoeSortingKernel
{
index_t expert_id = topk_id[i];
index_t rank_post_pad =
tokens_cnts[calc_index(num_experts, threadIdx.x, expert_id)] + cumsum[expert_id];
tokens_cnts[calc_index(num_experts, tid, expert_id)] + cumsum[expert_id];
sorted_token_ids[rank_post_pad] = i / topk;
sorted_weights[rank_post_pad] = weights[i];
++tokens_cnts[calc_index(num_experts, threadIdx.x, expert_id)];
++tokens_cnts[calc_index(num_experts, tid, expert_id)];
}
const index_t prefill_token = numel / topk;
if(threadIdx.x < num_experts)
if(tid < num_experts)
{
index_t expert_offset =
cumsum[threadIdx.x] + tokens_cnts[calc_index(num_experts, blockDim.x, threadIdx.x)];
while(expert_offset < cumsum[threadIdx.x + 1])
cumsum[tid] + tokens_cnts[calc_index(num_experts, blockDim.x, tid)];
while(expert_offset < cumsum[tid + 1])
{
sorted_token_ids[expert_offset] = prefill_token;
sorted_weights[expert_offset] = static_cast<WeightType>(0.0);
......@@ -137,12 +139,12 @@ struct MoeSortingKernel
CK_TILE_DEVICE void operator()(Kargs kargs) const
{
const size_t numel = kargs.tokens * kargs.topk;
return moe_align_block_size_kernel(static_cast<const IndexType *>(kargs.p_topk_ids),
static_cast<const WeightType *>(kargs.p_weights),
static_cast<IndexType *>(kargs.sorted_token_ids),
static_cast<WeightType *>(kargs.sorted_weights),
static_cast<IndexType *>(kargs.expert_ids),
static_cast<IndexType *>(kargs.total_tokens_post_pad),
return moe_align_block_size_kernel(static_cast<const IndexType*>(kargs.p_topk_ids),
static_cast<const WeightType*>(kargs.p_weights),
static_cast<IndexType*>(kargs.sorted_token_ids),
static_cast<WeightType*>(kargs.sorted_weights),
static_cast<IndexType*>(kargs.expert_ids),
static_cast<IndexType*>(kargs.total_tokens_post_pad),
kargs.num_experts,
kargs.unit_size,
numel,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment