Merge branch 'develop_v2.4'

5b82e699 · wenjh · 9a815d0b · 7f946529 · 5b82e699 · 5b82e699
Commit 5b82e699 authored Jun 12, 2025 by wenjh
3 changed files
--- a/tests/pytorch/test_int8_blockwise_layers.py
+++ b/tests/pytorch/test_int8_blockwise_layers.py
@@ -167,7 +167,7 @@ class TestFP8BlockScalingRecipeLayerNormLinear(TestFP8RecipeLayerNormLinearBase)
            dtype=dtype,
            y_error=0.9,
            ln_out_error=0.5,
-            dgrad_error=1.5,
+            dgrad_error=1,
            wgrad_error=1,
            bgrad_error=0.5,
            recipe1_golden_tensors=None,

--- a/transformer_engine/common/recipe/fp8_block_scaling.cu
+++ b/transformer_engine/common/recipe/fp8_block_scaling.cu
@@ -116,7 +116,12 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
      if (h_in_input < h && w_in_input < w && idx_in_input >= start_offset &&
          idx_in_input < end_offset) {
        float inp = static_cast<float>(input_minus_offset[idx_in_input]) * scale;
-        smem[h_in_smem][w_in_smem] = static_cast<OType>(inp);
+        if constexpr(std::is_same_v<OType, int8_t>) {
+          smem[h_in_smem][w_in_smem] = static_cast<OType>(lroundf(fmaxf(-127.0f, fminf(127.0f, inp))));
+        }
+        else {
+          smem[h_in_smem][w_in_smem] = static_cast<OType>(inp);
+        }
        skip_store = false;
      }
    }

--- a/transformer_engine/common/transpose/quantize_transpose_square_blockwise.cu
+++ b/transformer_engine/common/transpose/quantize_transpose_square_blockwise.cu
@@ -431,9 +431,15 @@ __global__ void __launch_bounds__(THREADS_PER_BLOCK) block_scaled_cast_transpose
      for (int j = 0; j < THREAD_TILE_DIM_X; j++) {
        // Step 3: Store cast output
        CType scale_data = block_tile_scale;
+        OType scaled_elt = 0;
-        OType scaled_elt =
+        if constexpr(std::is_same_v<OType, int8_t>) {
+          scaled_elt =
+            static_cast<OType>(lroundf(fmaxf(-127.0f, fminf(127.0f, static_cast<CType>(thrd_tile_input[i].data.elt[j]) * scale_data))));
+        }
+        else {
+          scaled_elt =
            static_cast<OType>(static_cast<CType>(thrd_tile_input[i].data.elt[j]) * scale_data);
+        }
        tmp_output_c.data.elt[j] = scaled_elt;
        // Step 4: do transpose within thread tile
        if constexpr (kReturnTranspose) {