Unverified Commit 402ea54b authored by Kirthi Shankar Sivamani's avatar Kirthi Shankar Sivamani Committed by GitHub
Browse files

[C] NVFP4 quantization for `GroupedTensor` (#2655)



* NVFP4 GroupedQuantize
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
Signed-off-by: Zhongbo Zhu <zhongboz@nvidia.com>
Co-authored-by: Zhongbo Zhu <zhongboz@nvidia.com>

* fix fp4
Signed-off-by: Zhongbo Zhu <zhongboz@nvidia.com>

* Remove unnecessary file
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

---------
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
Signed-off-by: Zhongbo Zhu <zhongboz@nvidia.com>
Co-authored-by: Zhongbo Zhu <zhongboz@nvidia.com>
parent ac81c85b
...@@ -173,10 +173,12 @@ list(APPEND transformer_engine_cuda_arch_specific_sources
     cast/cast.cu
     gemm/cutlass_grouped_gemm.cu
     hadamard_transform/group_hadamard_transform.cu
     hadamard_transform/graph_safe_group_hadamard_transform.cu
     hadamard_transform/hadamard_transform.cu
     hadamard_transform/hadamard_transform_cast_fusion.cu
     hadamard_transform/group_hadamard_transform_cast_fusion.cu
     hadamard_transform/group_row_cast_col_hadamard_transform_cast_fusion.cu
     hadamard_transform/graph_safe_group_row_cast_col_hadamard_transform_cast_fusion.cu
     multi_tensor/compute_scale.cu
     recipe/mxfp8_scaling.cu
     transpose/quantize_transpose_square_blockwise.cu
...
...@@ -86,6 +86,24 @@ void nvte_group_hadamard_transform_amax(const NVTETensor input, NVTETensor* outputs,
                                        int random_sign_mask, int random_sign_mask_t,
                                        cudaStream_t stream);
/*! \brief Grouped-tensor amax with Hadamard transform (graph safe, device-managed grouping).
 *
 * This function is experimental and the API is not stable.
 *
 * This API assumes that the split info (grouping of tensors) lives on device and is unknown to
 * the host; the host therefore never inspects the grouping, which makes this API safe to capture
 * in a CUDA graph. The grouped-tensor argument is passed as a single device-side structure
 * rather than as host-side per-tensor splits.
 *
 * \param[in]     input              NVTEGroupedTensor holding the grouped input tensors.
 * \param[in,out] output             NVTEGroupedTensor receiving the amax results. Only the
 *                                   row-wise and column-wise amaxes are updated.
 * \param[in]     random_sign_mask   16-bit sign mask applied in the random Hadamard
 *                                   transform (RHT).
 * \param[in]     random_sign_mask_t 16-bit sign mask applied in the transposed RHT.
 * \param[in]     stream             CUDA stream used for the operation.
 */
void nvte_group_hadamard_transform_amax_graph_safe(const NVTEGroupedTensor input,
                                                   NVTEGroupedTensor output, int random_sign_mask,
                                                   int random_sign_mask_t, cudaStream_t stream);
/*!
 * \brief Perform the grouped-tensor columnwise Hadamard transform cast fusion operation.
 *
...@@ -124,6 +142,22 @@ void nvte_group_hadamard_transform_cast_fusion(const NVTETensor input, NVTETensor output,
                                               const NVTEQuantizationConfig quant_config,
                                               NVTETensor quant_workspace, cudaStream_t stream);
/*!
 * \brief Perform the grouped-tensor Hadamard transform cast fusion operation in graph-safe mode.
 *
 * This function is experimental and the API is not stable. The group_ prefix means the grouped
 * inputs are concatenated contiguously in memory.
 *
 * \param[in]     input           NVTEGroupedTensor holding the grouped input tensors.
 * \param[in,out] output          NVTEGroupedTensor receiving the row-wise and column-wise
 *                                quantized results.
 * \param[in]     hadamard_matrix Hadamard matrix used for the transformation.
 * \param[in]     quant_config    Quantization configuration.
 * \param[in]     quant_workspace Workspace buffer. Must be at least 4 bytes.
 * \param[in]     stream          CUDA stream used for the operation.
 */
void nvte_group_hadamard_transform_cast_fusion_graph_safe(
    const NVTEGroupedTensor input, NVTEGroupedTensor output, const NVTETensor hadamard_matrix,
    const NVTEQuantizationConfig quant_config, NVTETensor quant_workspace, cudaStream_t stream);
#ifdef __cplusplus
} // extern "C"
#endif
...
...@@ -296,6 +296,17 @@ void nvte_multi_tensor_compute_scale_inv_e8m0_cuda(int chunk_size, NVTETensor **
void nvte_group_amax(const NVTETensor input, NVTETensor *outputs, const size_t *split_sections,
                     size_t num_tensors, cudaStream_t stream);
/*! \brief Grouped-tensor amax without doing a Hadamard transform.
 *
 * This function is experimental and the API is not stable.
 *
 * Graph-safe variant: the grouped-tensor arguments are passed as single structures,
 * with no host-side per-tensor splits.
 *
 * \param[in]     input  NVTEGroupedTensor holding the grouped input tensors.
 * \param[in,out] output NVTEGroupedTensor receiving the computed amax values.
 * \param[in]     stream CUDA stream used for the operation.
 */
void nvte_group_amax_graph_safe(const NVTEGroupedTensor input, NVTEGroupedTensor output,
                                cudaStream_t stream);
#ifdef __cplusplus
} // extern "C"
#endif
...
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment