Unverified commit a0a89a8e, authored by Oleg Goncharov, committed by GitHub
Browse files

[Common] Disabled the tuned NVFP4 kernels (#2615)



* Disabled the tuned NVFP4 kernels
Signed-off-by: Oleg Goncharov <ogoncharov@nvidia.com>

* Disabled fast math in cpp tests
Signed-off-by: Oleg Goncharov <ogoncharov@nvidia.com>

---------
Signed-off-by: Oleg Goncharov <ogoncharov@nvidia.com>
parent 52ee5ea0
......@@ -677,11 +677,6 @@ std::vector<ActivationType> Activation_types = {
ActivationType::Identity
};
std::vector<bool> use_fast_nvfp4_scaling_vec = {
false,
true
};
} // namespace
class FusedCastTransposeNVFP4TestSuite : public ::testing::TestWithParam
......@@ -743,7 +738,7 @@ INSTANTIATE_TEST_SUITE_P(
::testing::ValuesIn(Activation_types),
::testing::ValuesIn(tensor_dims),
::testing::Values(DType::kBFloat16),
::testing::ValuesIn(use_fast_nvfp4_scaling_vec)),
::testing::Values(false)),
[](const testing::TestParamInfo<FusedCastTransposeNVFP4TestSuite::ParamType>& info) {
std::string name = to_string(std::get<0>(info.param));
const auto& shape = std::get<1>(info.param);
......
......@@ -1168,10 +1168,10 @@ void quantize_transpose(const Tensor &input, const Tensor *noop, Tensor *output,
// TODO(Frank): Is there a better way to do this?
bool return_transpose = output->has_columnwise_data();
if (!use_2d_quantization && (input.dtype() == DType::kBFloat16)) {
quantize_transpose_tuned_1D(input, noop, output, quant_config, stream);
return;
}
// if (!use_2d_quantization && (input.dtype() == DType::kBFloat16)) {
// quantize_transpose_tuned_1D(input, noop, output, quant_config, stream);
// return;
// }
constexpr bool COMPUTE_ACTIVATIONS = false;
using ParamOP = Empty;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.