Unverified Commit 11ac24cf authored by Jan Bielak, committed by GitHub

Refactor normalization.cpp to use quantizer logic introduced in #1952 (#2006)



Refactor normalization.cpp to use quantizer logic introduced in #1952 instead of manual quantization
Signed-off-by: Jan Bielak <jbielak@nvidia.com>
parent 020428f0
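The change is easiest to read as an API consolidation: the per-quantizer branching that every forward op used to carry (compute amax, optionally allreduce it, derive the scale, then cast) moves behind a single virtual `quantize` call on the quantizer object, as introduced in #1952. The toy sketch below illustrates only that pattern; it is not TransformerEngine's actual code, and the `Tensor` alias, simplified class names, and the E4M3 constant 448 are stand-ins for the real TensorWrapper/CUDA machinery.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <memory>
#include <vector>

// Toy stand-in for TensorWrapper; the real code operates on CUDA tensors.
using Tensor = std::vector<float>;

struct Quantizer {
  virtual void quantize(const Tensor &in, Tensor &out) = 0;
  virtual ~Quantizer() = default;
};

// Mirrors the deleted current-scaling branch: amax -> scale -> cast,
// but owned by the quantizer instead of repeated at every call site.
struct CurrentScalingQuantizer : Quantizer {
  bool force_pow_2_scales = true;
  float amax_epsilon = 1e-12f;  // avoids divide-by-zero for all-zero inputs

  void quantize(const Tensor &in, Tensor &out) override {
    float amax = 0.0f;
    for (float v : in) amax = std::max(amax, std::fabs(v));
    amax = std::max(amax, amax_epsilon);
    float scale = 448.0f / amax;  // map amax onto the FP8 E4M3 max normal
    if (force_pow_2_scales) scale = std::exp2(std::floor(std::log2(scale)));
    out.resize(in.size());
    for (size_t i = 0; i < in.size(); ++i) out[i] = in[i] * scale;  // cast stub
  }
};

int main() {
  std::unique_ptr<Quantizer> q = std::make_unique<CurrentScalingQuantizer>();
  Tensor in = {0.5f, -2.0f, 1.25f}, out;
  q->quantize(in, out);  // the call site shrinks to this one line, as in the diff
  std::printf("%g %g %g\n", out[0], out[1], out[2]);
}
```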
@@ -139,45 +139,7 @@ std::vector<py::object> layernorm_fwd(py::handle input, py::handle weight, Maybe
   // Quantize output if using unfused kernel
   if (force_unfused_kernel) {
-    QuantizationConfigWrapper quant_config;
-    if (IsFloat8CurrentScalingQuantizers(quantizer.ptr())) {
-      // my_quantizer here has to be a Float8CurrentScalingQuantizer
-      auto my_quantizer_cs = static_cast<Float8CurrentScalingQuantizer *>(my_quantizer.get());
-      NVTE_SCOPED_GIL_RELEASE({
-        nvte_compute_amax(unquantized_out_cu.data(), out_cu.data(),
-                          at::cuda::getCurrentCUDAStream());
-      });
-      // check if we need to do amax reduction (depending on model parallel configs)
-      if (my_quantizer_cs->with_amax_reduction) {
-        c10::intrusive_ptr<dist_group_type> process_group_ptr =
-            my_quantizer_cs->amax_reduction_group;
-        // construct torch tensor from NVTEBasicTensor without reallocating memory
-        at::Tensor &amax_tensor_torch = my_quantizer_cs->amax;
-        std::vector<at::Tensor> tensors = {amax_tensor_torch};
-        // allreduce amax tensor
-        c10d::AllreduceOptions allreduce_opts;
-        allreduce_opts.reduceOp = c10d::ReduceOp::MAX;
-        process_group_ptr->allreduce(tensors, allreduce_opts)->wait();
-      }
-      quant_config.set_force_pow_2_scales(my_quantizer_cs->force_pow_2_scales);
-      quant_config.set_amax_epsilon(my_quantizer_cs->amax_epsilon);
-      NVTE_SCOPED_GIL_RELEASE({
-        nvte_compute_scale_from_amax(out_cu.data(), quant_config, at::cuda::getCurrentCUDAStream());
-      });
-      // set amax ptr to null in te_output TensorWrapper to avoid atomic amax updates in kernel
-      out_cu.set_amax(nullptr, DType::kFloat32, out_cu.defaultShape);
-    } else if (IsFloat8BlockwiseQuantizers(quantizer.ptr())) {
-      auto my_quantizer_bw = static_cast<Float8BlockQuantizer *>(my_quantizer.get());
-      quant_config.set_force_pow_2_scales(my_quantizer_bw->force_pow_2_scales);
-      quant_config.set_amax_epsilon(my_quantizer_bw->amax_epsilon);
-      if (my_quantizer_bw->all_gather_usage) {
-        quant_config.set_float8_block_scale_tensor_format(Float8BlockScaleTensorFormat::COMPACT);
-      }
-    }
-    NVTE_SCOPED_GIL_RELEASE({
-      nvte_quantize_v2(unquantized_out_cu.data(), out_cu.data(), quant_config,
-                       at::cuda::getCurrentCUDAStream());
-    });
+    my_quantizer->quantize(unquantized_out_cu, out_cu);
   }
   return {out, py::cast(mu), py::cast(rsigma)};
@@ -300,45 +262,7 @@ std::vector<py::object> rmsnorm_fwd(const py::handle &input, const py::handle &w
   // Quantize output if using unfused kernel
   if (force_unfused_kernel) {
-    QuantizationConfigWrapper quant_config;
-    if (IsFloat8CurrentScalingQuantizers(quantizer.ptr())) {
-      // my_quantizer here has to be a Float8CurrentScalingQuantizer
-      auto my_quantizer_cs = static_cast<Float8CurrentScalingQuantizer *>(my_quantizer.get());
-      NVTE_SCOPED_GIL_RELEASE({
-        nvte_compute_amax(unquantized_out_cu.data(), out_cu.data(),
-                          at::cuda::getCurrentCUDAStream());
-      });
-      // check if we need to do amax reduction (depending on model parallel configs)
-      if (my_quantizer_cs->with_amax_reduction) {
-        c10::intrusive_ptr<dist_group_type> process_group_ptr =
-            my_quantizer_cs->amax_reduction_group;
-        // construct torch tensor from NVTEBasicTensor without reallocating memory
-        at::Tensor &amax_tensor_torch = my_quantizer_cs->amax;
-        std::vector<at::Tensor> tensors = {amax_tensor_torch};
-        // allreduce amax tensor
-        c10d::AllreduceOptions allreduce_opts;
-        allreduce_opts.reduceOp = c10d::ReduceOp::MAX;
-        process_group_ptr->allreduce(tensors, allreduce_opts)->wait();
-      }
-      quant_config.set_force_pow_2_scales(my_quantizer_cs->force_pow_2_scales);
-      quant_config.set_amax_epsilon(my_quantizer_cs->amax_epsilon);
-      NVTE_SCOPED_GIL_RELEASE({
-        nvte_compute_scale_from_amax(out_cu.data(), quant_config, at::cuda::getCurrentCUDAStream());
-      });
-      // set amax ptr to null in te_output TensorWrapper to avoid atomic amax updates in kernel
-      out_cu.set_amax(nullptr, DType::kFloat32, out_cu.defaultShape);
-    } else if (IsFloat8BlockwiseQuantizers(quantizer.ptr())) {
-      auto my_quantizer_bw = static_cast<Float8BlockQuantizer *>(my_quantizer.get());
-      quant_config.set_force_pow_2_scales(my_quantizer_bw->force_pow_2_scales);
-      quant_config.set_amax_epsilon(my_quantizer_bw->amax_epsilon);
-      if (my_quantizer_bw->all_gather_usage) {
-        quant_config.set_float8_block_scale_tensor_format(Float8BlockScaleTensorFormat::COMPACT);
-      }
-    }
-    NVTE_SCOPED_GIL_RELEASE({
-      nvte_quantize_v2(unquantized_out_cu.data(), out_cu.data(), quant_config,
-                       at::cuda::getCurrentCUDAStream());
-    });
+    my_quantizer->quantize(unquantized_out_cu, out_cu);
   }
   return {out, py::none(), py::cast(rsigma)};
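Both hunks delete the same distributed step: when `with_amax_reduction` is set, the locally computed amax is MAX-allreduced across the model-parallel process group before the scale is derived, so every rank quantizes with an identical scale. A minimal sketch of that step in isolation, using the same c10d calls as the deleted lines (the free function and its parameters are hypothetical placeholders for the quantizer's `amax_reduction_group` and `amax` members):

```cpp
#include <torch/csrc/distributed/c10d/ProcessGroup.hpp>
#include <torch/torch.h>

// MAX-allreduce a single-element amax tensor across a process group, as the
// deleted branches did before computing the FP8 scale from it.
void allreduce_amax(const c10::intrusive_ptr<c10d::ProcessGroup> &pg,
                    at::Tensor &amax) {
  std::vector<at::Tensor> tensors = {amax};  // allreduce takes a tensor list
  c10d::AllreduceOptions opts;
  opts.reduceOp = c10d::ReduceOp::MAX;
  pg->allreduce(tensors, opts)->wait();  // block: the scale depends on the result
}
```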