Commit 3679054a authored by fsx950223
Browse files

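Optimize sqrt: compute 1 / sqrt(var + epsilon) once per mean/variance offset and multiply by it, instead of dividing by sqrt(var + epsilon) for every row element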

parent 50034ff0
...@@ -206,7 +206,7 @@ struct GridwiseSparseEmbeddingsForwardLayernorm ...@@ -206,7 +206,7 @@ struct GridwiseSparseEmbeddingsForwardLayernorm
constexpr auto mean_var_offset = constexpr auto mean_var_offset =
mean_var_buf_desc.CalculateOffset(make_tuple(i_dim_sub_, i_dim_vec_)); mean_var_buf_desc.CalculateOffset(make_tuple(i_dim_sub_, i_dim_vec_));
auto divisor = 1 / __builtin_amdgcn_sqrtf(var_thread_buf(Number<mean_var_offset>{}) + epsilon);
static_for<0, RowVectorSize, 1>{}([&](auto i_row_vec_) { static_for<0, RowVectorSize, 1>{}([&](auto i_row_vec_) {
constexpr auto register_offset = thread_buf_desc.CalculateOffset( constexpr auto register_offset = thread_buf_desc.CalculateOffset(
make_tuple(i_dim_sub_, i_dim_vec_, i_row_sub_, i_row_vec_)); make_tuple(i_dim_sub_, i_dim_vec_, i_row_sub_, i_row_vec_));
...@@ -214,8 +214,7 @@ struct GridwiseSparseEmbeddingsForwardLayernorm ...@@ -214,8 +214,7 @@ struct GridwiseSparseEmbeddingsForwardLayernorm
gamma_beta_buf_desc.CalculateOffset(make_tuple(i_row_sub_, i_row_vec_)); gamma_beta_buf_desc.CalculateOffset(make_tuple(i_row_sub_, i_row_vec_));
auto acc_val = acc_thread_buf[Number<register_offset>{}]; auto acc_val = acc_thread_buf[Number<register_offset>{}];
acc_val = (acc_val - mean_thread_buf(Number<mean_var_offset>{})) / acc_val = (acc_val - mean_thread_buf(Number<mean_var_offset>{})) * divisor;
sqrt(var_thread_buf(Number<mean_var_offset>{}) + epsilon);
acc_val = acc_val * gamma_thread_buf[Number<gamma_beta_offset>{}] + acc_val = acc_val * gamma_thread_buf[Number<gamma_beta_offset>{}] +
beta_thread_buf[Number<gamma_beta_offset>{}]; beta_thread_buf[Number<gamma_beta_offset>{}];
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment