Commit 1f131e76 authored by aska-0096
Browse files

fp8 sanity

parent 487826b3
......@@ -122,9 +122,9 @@ float add_rmsnorm2d_rdquant_fwd_b16_(add_rmsnorm2d_rdquant_fwd_traits t,
if(a.n<8192){
if(t.save_x){
if (a.n % 8 == 0)
r = add_rmsnorm2d_rdquant_fwd_<trait_<input_data_type, quantized_data_type, 1, 4, 1, 256, 8, true, true, false>>(s, a);
r = add_rmsnorm2d_rdquant_fwd_<trait_<input_data_type, quantized_data_type, 1, 2, 1, 512, 8, true, true, false>>(s, a);
else if (a.n % 4 == 0)
r = add_rmsnorm2d_rdquant_fwd_<trait_<input_data_type, quantized_data_type, 1, 8, 1, 256, 4, true, true, false>>(s, a);
r = add_rmsnorm2d_rdquant_fwd_<trait_<input_data_type, quantized_data_type, 1, 4, 1, 512, 4, true, true, false>>(s, a);
else if (a.n % 2 == 0)
r = add_rmsnorm2d_rdquant_fwd_<trait_<input_data_type, quantized_data_type, 1, 4, 1, 1024, 2, true, true, false>>(s, a);
else
......@@ -132,9 +132,9 @@ float add_rmsnorm2d_rdquant_fwd_b16_(add_rmsnorm2d_rdquant_fwd_traits t,
}
else{
if (a.n % 8 == 0)
r = add_rmsnorm2d_rdquant_fwd_<trait_<input_data_type, quantized_data_type, 1, 4, 1, 256, 8, true, false, false>>(s, a);
r = add_rmsnorm2d_rdquant_fwd_<trait_<input_data_type, quantized_data_type, 1, 2, 1, 512, 8, true, false, false>>(s, a);
else if (a.n % 4 == 0)
r = add_rmsnorm2d_rdquant_fwd_<trait_<input_data_type, quantized_data_type, 1, 8, 1, 256, 4, true, false, false>>(s, a);
r = add_rmsnorm2d_rdquant_fwd_<trait_<input_data_type, quantized_data_type, 1, 4, 1, 512, 4, true, false, false>>(s, a);
else if (a.n % 2 == 0)
r = add_rmsnorm2d_rdquant_fwd_<trait_<input_data_type, quantized_data_type, 1, 4, 1, 1024, 2, true, false, false>>(s, a);
else
......@@ -143,9 +143,9 @@ float add_rmsnorm2d_rdquant_fwd_b16_(add_rmsnorm2d_rdquant_fwd_traits t,
}
else{
if (a.n % 8 == 0)
r = add_rmsnorm2d_rdquant_fwd_<trait_<input_data_type, quantized_data_type, 1, 4, 1, 256, 8, false, false, false>>(s, a);
r = add_rmsnorm2d_rdquant_fwd_<trait_<input_data_type, quantized_data_type, 1, 2, 1, 512, 8, false, false, false>>(s, a);
else if (a.n % 4 == 0)
r = add_rmsnorm2d_rdquant_fwd_<trait_<input_data_type, quantized_data_type, 1, 8, 1, 256, 4, false, false, false>>(s, a);
r = add_rmsnorm2d_rdquant_fwd_<trait_<input_data_type, quantized_data_type, 1, 4, 1, 512, 4, false, false, false>>(s, a);
else if (a.n % 2 == 0)
r = add_rmsnorm2d_rdquant_fwd_<trait_<input_data_type, quantized_data_type, 1, 4, 1, 1024, 2, false, false, false>>(s, a);
else
......@@ -154,9 +154,9 @@ float add_rmsnorm2d_rdquant_fwd_b16_(add_rmsnorm2d_rdquant_fwd_traits t,
}
else if(a.n > 8192) {
if (a.n % 8 == 0)
r = add_rmsnorm2d_rdquant_fwd_<trait_<input_data_type, quantized_data_type, 1, 4, 1, 256, 8, true, true, true>>(s, a);
r = add_rmsnorm2d_rdquant_fwd_<trait_<input_data_type, quantized_data_type, 1, 2, 1, 512, 8, true, true, true>>(s, a);
else if (a.n % 4 == 0)
r = add_rmsnorm2d_rdquant_fwd_<trait_<input_data_type, quantized_data_type, 1, 8, 1, 256, 4, true, true, true>>(s, a);
r = add_rmsnorm2d_rdquant_fwd_<trait_<input_data_type, quantized_data_type, 1, 4, 1, 512, 4, true, true, true>>(s, a);
else if (a.n % 2 == 0)
r = add_rmsnorm2d_rdquant_fwd_<trait_<input_data_type, quantized_data_type, 1, 4, 1, 1024, 2, true, true, true>>(s, a);
else
......
......@@ -6,28 +6,28 @@
// clang-format off
// Explicit instantiations of add_rmsnorm2d_rdquant_fwd_ for ck_tile::bf16_t input,
// with ck_tile::int8_t and ck_tile::fp8_t as the quantized output type.
// Every entry in this list has the last trait_ flag (3p) set to false.
//
// Legend for the trait_ arguments that follow the two data types
// (NOTE(review): expanded from the abbreviations below; confirm against the trait_ definition):
//   rm / rn : presumably repeat count along M / N
//   tm / tn : presumably threads along M / N
//   vn      : presumably vector width along N (the dispatcher keys this on a.n % 8 / 4 / 2)
//   pd      : presumably pad-N flag
//   x       : save-x flag (the dispatcher sets it from t.save_x)
//   3p      : presumably selects the three-pass pipeline (the dispatcher sets it when a.n > 8192)
// NOTE(review): both the "4, 1, 256, 8" / "8, 1, 256, 4" and the "2, 1, 512, 8" / "4, 1, 512, 4"
// shapes are listed; verify which of them the dispatcher still references.
// rm rn tm tn vn pd x 3p
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 4, 1, 256, 8, true, true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 8, 1, 256, 4, true, true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 2, 1, 512, 8, true, true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 4, 1, 512, 4, true, true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 4, 1, 1024, 2, true, true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 8, 1, 1024, 1, true, true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 4, 1, 256, 8, true, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 8, 1, 256, 4, true, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 2, 1, 512, 8, true, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 4, 1, 512, 4, true, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 4, 1, 1024, 2, true, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 8, 1, 1024, 1, true, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 4, 1, 256, 8, false, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 8, 1, 256, 4, false, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 2, 1, 512, 8, false, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 4, 1, 512, 4, false, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 4, 1, 1024, 2, false, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 8, 1, 1024, 1, false, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::fp8_t, 1, 4, 1, 256, 8, true, true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::fp8_t, 1, 8, 1, 256, 4, true, true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::fp8_t, 1, 2, 1, 512, 8, true, true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::fp8_t, 1, 4, 1, 512, 4, true, true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::fp8_t, 1, 4, 1, 1024, 2, true, true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::fp8_t, 1, 8, 1, 1024, 1, true, true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::fp8_t, 1, 4, 1, 256, 8, true, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::fp8_t, 1, 8, 1, 256, 4, true, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::fp8_t, 1, 2, 1, 512, 8, true, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::fp8_t, 1, 4, 1, 512, 4, true, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::fp8_t, 1, 4, 1, 1024, 2, true, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::fp8_t, 1, 8, 1, 1024, 1, true, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::fp8_t, 1, 4, 1, 256, 8, false, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::fp8_t, 1, 8, 1, 256, 4, false, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::fp8_t, 1, 2, 1, 512, 8, false, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::fp8_t, 1, 4, 1, 512, 4, false, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::fp8_t, 1, 4, 1, 1024, 2, false, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::fp8_t, 1, 8, 1, 1024, 1, false, false, false>>(const S&, A);
......
......@@ -6,12 +6,12 @@
// clang-format off
// Explicit instantiations of add_rmsnorm2d_rdquant_fwd_ for ck_tile::bf16_t input,
// with ck_tile::int8_t and ck_tile::fp8_t as the quantized output type.
// Every entry in this list has all three trailing trait_ flags (pd, x, 3p) set to true.
//
// Legend for the trait_ arguments that follow the two data types
// (NOTE(review): expanded from the abbreviations below; confirm against the trait_ definition):
//   rm / rn : presumably repeat count along M / N
//   tm / tn : presumably threads along M / N
//   vn      : presumably vector width along N
//   pd      : presumably pad-N flag
//   x       : presumably save-x flag
//   3p      : presumably selects the three-pass pipeline
// rm rn tm tn vn pd x 3p
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 4, 1, 256, 8, true, true, true>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 8, 1, 256, 4, true, true, true>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 2, 1, 512, 8, true, true, true>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 4, 1, 512, 4, true, true, true>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 4, 1, 1024, 2, true, true, true>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 8, 1, 1024, 1, true, true, true>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::fp8_t, 1, 4, 1, 256, 8, true, true, true>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::fp8_t, 1, 8, 1, 256, 4, true, true, true>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::fp8_t, 1, 2, 1, 512, 8, true, true, true>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::fp8_t, 1, 4, 1, 512, 4, true, true, true>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::fp8_t, 1, 4, 1, 1024, 2, true, true, true>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::fp8_t, 1, 8, 1, 1024, 1, true, true, true>>(const S&, A);
// clang-format on
......@@ -6,29 +6,28 @@
// clang-format off
// Explicit instantiations of add_rmsnorm2d_rdquant_fwd_ for ck_tile::fp16_t input,
// with ck_tile::int8_t and ck_tile::fp8_t as the quantized output type.
// Every entry in this list has the last trait_ flag (3p) set to false.
//
// Legend for the trait_ arguments that follow the two data types
// (NOTE(review): expanded from the abbreviations below; confirm against the trait_ definition):
//   rm / rn : presumably repeat count along M / N
//   tm / tn : presumably threads along M / N
//   vn      : presumably vector width along N
//   pd      : presumably pad-N flag
//   x       : presumably save-x flag
//   3p      : presumably selects the three-pass pipeline
// rm rn tm tn vn pd x 3p
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::int8_t, 1, 4, 1, 256, 8, true, true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::int8_t, 1, 8, 1, 256, 4, true, true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::int8_t, 1, 2, 1, 512, 8, true, true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::int8_t, 1, 4, 1, 512, 4, true, true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::int8_t, 1, 4, 1, 1024, 2, true, true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::int8_t, 1, 8, 1, 1024, 1, true, true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::int8_t, 1, 4, 1, 256, 8, true, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::int8_t, 1, 8, 1, 256, 4, true, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::int8_t, 1, 2, 1, 512, 8, true, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::int8_t, 1, 4, 1, 512, 4, true, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::int8_t, 1, 4, 1, 1024, 2, true, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::int8_t, 1, 8, 1, 1024, 1, true, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::int8_t, 1, 4, 1, 256, 8, false, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::int8_t, 1, 8, 1, 256, 4, false, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::int8_t, 1, 2, 1, 512, 8, false, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::int8_t, 1, 4, 1, 512, 4, false, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::int8_t, 1, 4, 1, 1024, 2, false, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::int8_t, 1, 8, 1, 1024, 1, false, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::fp8_t, 1, 4, 1, 256, 8, true, true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::fp8_t, 1, 8, 1, 256, 4, true, true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::fp8_t, 1, 2, 1, 512, 8, true, true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::fp8_t, 1, 4, 1, 512, 4, true, true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::fp8_t, 1, 4, 1, 1024, 2, true, true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::fp8_t, 1, 8, 1, 1024, 1, true, true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::fp8_t, 1, 4, 1, 256, 8, true, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::fp8_t, 1, 8, 1, 256, 4, true, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::fp8_t, 1, 2, 1, 512, 8, true, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::fp8_t, 1, 4, 1, 512, 4, true, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::fp8_t, 1, 4, 1, 1024, 2, true, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::fp8_t, 1, 8, 1, 1024, 1, true, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::fp8_t, 1, 4, 1, 256, 8, false, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::fp8_t, 1, 8, 1, 256, 4, false, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::fp8_t, 1, 2, 1, 512, 8, false, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::fp8_t, 1, 4, 1, 512, 4, false, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::fp8_t, 1, 4, 1, 1024, 2, false, false, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::fp8_t, 1, 8, 1, 1024, 1, false, false, false>>(const S&, A);
// clang-format on
......@@ -6,12 +6,12 @@
// clang-format off
// Explicit instantiations of add_rmsnorm2d_rdquant_fwd_ for ck_tile::fp16_t input,
// with ck_tile::int8_t and ck_tile::fp8_t as the quantized output type.
// Every entry in this list has all three trailing trait_ flags (pd, x, 3p) set to true.
//
// Legend for the trait_ arguments that follow the two data types
// (NOTE(review): expanded from the abbreviations below; confirm against the trait_ definition):
//   rm / rn : presumably repeat count along M / N
//   tm / tn : presumably threads along M / N
//   vn      : presumably vector width along N
//   pd      : presumably pad-N flag
//   x       : presumably save-x flag
//   3p      : presumably selects the three-pass pipeline
// rm rn tm tn vn pd x 3p
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::int8_t, 1, 4, 1, 256, 8, true, true, true>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::int8_t, 1, 8, 1, 256, 4, true, true, true>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::int8_t, 1, 2, 1, 512, 8, true, true, true>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::int8_t, 1, 4, 1, 512, 4, true, true, true>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::int8_t, 1, 4, 1, 1024, 2, true, true, true>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::int8_t, 1, 8, 1, 1024, 1, true, true, true>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::fp8_t, 1, 4, 1, 256, 8, true, true, true>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::fp8_t, 1, 8, 1, 256, 4, true, true, true>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::fp8_t, 1, 2, 1, 512, 8, true, true, true>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::fp8_t, 1, 4, 1, 512, 4, true, true, true>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::fp8_t, 1, 4, 1, 1024, 2, true, true, true>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::fp8_t, 1, 8, 1, 1024, 1, true, true, true>>(const S&, A);
// clang-format on
......@@ -22,12 +22,7 @@ CK_TILE_HOST void reference_rowwise_quantization2d(const HostTensor<XDataType>&
// scale = amax / 127 for int8
auto v_scale = type_convert<XDataType>(scale_m(m));
auto v_qx = v_x / v_scale;
qx_m_n(m, n) = saturates<QXDataType>{}(v_qx);
if(m == 0 && n == 4)
printf("Qy: %lf, Satruates Qy: %lf\n",
type_convert<float>(v_qx),
type_convert<float>(qx_m_n(m, n)));
qx_m_n(m, n) = type_convert<QXDataType>(saturates<QXDataType>{}(v_qx));
}
};
......
......@@ -89,7 +89,7 @@ struct AddRmsnorm2dRdquantFwdPipelineOnePass
auto x = tile_elementwise_in(
[&](const auto& a_, const auto& b_) {
return type_convert<ComputeDataType>(a_) + type_convert<ComputeDataType>(b_);
return type_convert<ComputeDataType>(a_+b_);
},
a,
b);
......@@ -157,7 +157,7 @@ struct AddRmsnorm2dRdquantFwdPipelineOnePass
sweep_tile(qy, [&, yscale_ = yscale](auto idx) {
constexpr auto i_idx = make_tuple(idx[number<0>{}]);
auto qy_ = y[idx] / yscale_[i_idx];
qy(idx) = saturates<QYDataType>{}(qy_);
qy(idx) = type_convert<QYDataType>(saturates<QYDataType>{}(qy_));
});
store_tile(qy_window, qy);
}
......
......@@ -260,7 +260,7 @@ struct AddRmsnorm2dRdquantFwdPipelineThreePass
const auto x_ = type_convert<ComputeDataType>(x[idx]);
auto y_ = x_ * inv_rms[i_idx] * gamma_;
auto qy_ = y_ / yscale[i_idx];
qy(idx) = saturates<QYDataType>{}(qy_);
qy(idx) = type_convert<QYDataType>(saturates<QYDataType>{}(qy_));
});
store_tile(qy_window, qy);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment