Unverified commit 29b84c16, authored by Oleg Goncharov, committed by GitHub

[Common] Fix NVFP4 tuned-kernel numerics (#2639)



* Fixed scaling-factor computation for FP32 to match the reference implementation.
Signed-off-by: Oleg Goncharov <ogoncharov@nvidia.com>

* Uncommented the tuned kernel path
Signed-off-by: Oleg Goncharov <ogoncharov@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



---------
Signed-off-by: Oleg Goncharov <ogoncharov@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
parent 94ba75d7
@@ -1168,10 +1168,10 @@ void quantize_transpose(const Tensor &input, const Tensor *noop, Tensor *output,
   // TODO(Frank): Is there a better way to do this?
   bool return_transpose = output->has_columnwise_data();
-  // if (!use_2d_quantization && (input.dtype() == DType::kBFloat16)) {
-  //   quantize_transpose_tuned_1D(input, noop, output, quant_config, stream);
-  //   return;
-  // }
+  if (!use_2d_quantization && (input.dtype() == DType::kBFloat16)) {
+    quantize_transpose_tuned_1D(input, noop, output, quant_config, stream);
+    return;
+  }
   constexpr bool COMPUTE_ACTIVATIONS = false;
   using ParamOP = Empty;
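The hunk above restores the early dispatch to the tuned 1D kernel that had been commented out. As a rough, standalone restatement (not Transformer Engine code; the enum below is assumed for illustration), the condition that now routes work to quantize_transpose_tuned_1D can be read as a simple predicate:

enum class DType { kBFloat16, kFloat16, kFloat32 };  // assumed mirror of the enum used in the diff

// Returns true when the tuned 1D NVFP4 quantize+transpose kernel is taken:
// a BF16 input with 1D (row-wise block) scaling. 2D-scaled or non-BF16 inputs
// fall through to the generic path later in quantize_transpose().
bool takes_tuned_1d_path(bool use_2d_quantization, DType input_dtype) {
  return !use_2d_quantization && (input_dtype == DType::kBFloat16);
}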
@@ -163,9 +163,24 @@ __device__ __forceinline__ float get_amax_of_pair(const IType2 pair) {
 template <typename SF_TYPE>
 __device__ __forceinline__ SF_TYPE
 compute_nvfp4_scaling_coefficient(const nvfp4_scale_t S_dec_block, const float S_enc) {
-  constexpr float float_max = detail::TypeExtrema<SF_TYPE>::max;
-  const float scale_rcp = fminf(S_enc / static_cast<float>(S_dec_block), float_max);
-  return static_cast<SF_TYPE>(scale_rcp);
+  NVTE_DEVICE_ERROR("Unsupported scaling-factor type. Only FP32 and BF16 are supported.");
 }
+
+template <>
+__device__ __forceinline__ float compute_nvfp4_scaling_coefficient<float>(
+    const nvfp4_scale_t S_dec_block, const float S_enc) {
+  const float S_dec = 1.0f / S_enc;
+  const float scale_rcp =
+      fminf(1.0f / (static_cast<float>(S_dec_block) * S_dec), detail::TypeExtrema<float>::max);
+  return scale_rcp;
+}
+
+template <>
+__device__ __forceinline__ bf16
+compute_nvfp4_scaling_coefficient<bf16>(const nvfp4_scale_t S_dec_block, const float S_enc) {
+  const float scale_rcp =
+      fminf(S_enc / (static_cast<float>(S_dec_block)), detail::TypeExtrema<bf16>::max);
+  return static_cast<bf16>(scale_rcp);
+}
 
 template <bool USE_STOCHASTIC_ROUNDING, bool USE_FAST_MATH>
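The FP32 specialization above is the numerics fix named in the commit message: rather than folding everything into one division (S_enc / S_dec_block), it first materializes S_dec = 1/S_enc and then takes the reciprocal of the product, matching the reference implementation. The two expressions are algebraically equivalent but each FP32 operation rounds, so they can disagree in the last ULP. A minimal host-side sketch of the comparison (illustrative only, not part of the commit; the example scale values are arbitrary):

#include <cmath>
#include <cstdio>
#include <limits>

int main() {
  const float float_max = std::numeric_limits<float>::max();
  const float S_enc = 448.0f / 6.0f;  // hypothetical global encode scale
  const float S_dec_block = 0.8125f;  // hypothetical per-block decode scale

  // Old expression: a single rounded division, then clamp to FP32 max.
  const float old_coeff = std::fminf(S_enc / S_dec_block, float_max);

  // Reference-style expression (what the new FP32 specialization computes):
  // materialize S_dec = 1/S_enc, then take the reciprocal of the product.
  const float S_dec = 1.0f / S_enc;
  const float new_coeff = std::fminf(1.0f / (S_dec_block * S_dec), float_max);

  // The two results can differ in the last ULP for some inputs; that rounding
  // difference is what caused the tuned kernel's numerics mismatch.
  std::printf("old: %.9g  new: %.9g  equal: %d\n", old_coeff, new_coeff,
              old_coeff == new_coeff);
  return 0;
}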