Debug doc generation (#2576)

Debug Doxygen and LaTeX warnings Signed-off-by: Tim Moon <tmoon@nvidia.com>

Debug doc generation (#2576)
Debug Doxygen and LaTeX warnings Signed-off-by: Tim Moon <tmoon@nvidia.com>
2f8ae81c · Tim Moon · GitHub · 32f403fd · 2f8ae81c · 2f8ae81c
Unverified Commit 2f8ae81c authored Jan 09, 2026 by Tim Moon Committed by GitHub Jan 09, 2026
11 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -41,3 +41,4 @@ compile_commands.json
 .nfs
 tensor_dumps/
 artifacts/
+*.DS_Store
--- a/transformer_engine/common/include/transformer_engine/cast.h
+++ b/transformer_engine/common/include/transformer_engine/cast.h
@@ -130,7 +130,7 @@ void nvte_quantize_v2(const NVTETensor input, NVTETensor output,
 *  \param[in]     stream           CUDA stream used for the operation.
 */
 void nvte_quantize_dbias(const NVTETensor input, NVTETensor output, NVTETensor dbias,
-                         NVTETensor workplace, cudaStream_t stream);
+                         NVTETensor workspace, cudaStream_t stream);

 /*! \brief Computes backward of GeLU operation on the input, then casts to FP8/MXFP8.
 *         Additionally, reduces the result of the GeLU backward along columns.
@@ -264,6 +264,7 @@ void nvte_dequantize(const NVTETensor input, NVTETensor output, cudaStream_t str
 *  \param[in]      inputs           List of input tensors to be cast.
 *  \param[in,out]  outputs          List of output quantized tensors.
 *  \param[in]      quant_config     (Optional) Quantization configurations.
+ *  \param[in]      num_tensors      Number of input and output tensors.
 *  \param[in]      stream           CUDA stream used for the operation.
 */
 void nvte_multi_tensor_quantize(const NVTETensor *inputs, NVTETensor *outputs,

--- a/transformer_engine/common/include/transformer_engine/cast_transpose_noop.h
+++ b/transformer_engine/common/include/transformer_engine/cast_transpose_noop.h
@@ -4,8 +4,8 @@
 * See LICENSE for license information.
 ************************************************************************/

-/*! \file transpose_with_noop.h
- *  \brief Functions handling transposes with no-op.
+/*! \file cast_transpose_noop.h
+ *  \brief Transpose functions with no-op flag.
 */

 #ifndef TRANSFORMER_ENGINE_CAST_TRANSPOSE_WITH_NOOP_H_

--- a/transformer_engine/common/include/transformer_engine/fused_attn.h
+++ b/transformer_engine/common/include/transformer_engine/fused_attn.h
@@ -409,7 +409,6 @@ void nvte_fused_attn_bwd_qkvpacked(
 *  \param[in]     softmax_type              Attention softmax type.
 *  \param[in]     window_size_left          Sliding window size (the left half).
 *  \param[in]     window_size_right         Sliding window size (the right half).
- *  \param[in]     deterministic             Whether to execute with deterministic behaviours.
 *  \param[in]     workspace                 Workspace tensor.
 *  \param[in]     stream                    CUDA stream used for this operation.
 */
@@ -673,7 +672,7 @@ void nvte_populate_rng_state_async(NVTETensor rng_state_dst, const NVTETensor se
 *  \param[in]     len                      batch_size x sequence_length.
 *  \param[in]     stream                   CUDA stream used for this operation.
 */
-uint32_t nvte_get_runtime_num_segments(NVTETensor cu_seqlen, NVTETensor workspace, size_t len,
+uint32_t nvte_get_runtime_num_segments(NVTETensor cu_seqlens, NVTETensor workspace, size_t len,
                                       cudaStream_t stream);

 /*!  \brief Set the seed and offset for RNG state.
@@ -830,8 +829,7 @@ void nvte_convert_thd_to_bshd(NVTETensor tensor, NVTETensor cu_seqlens, NVTETens
 *  \param[in]     tensor           Input tensor.
 *  \param[in]     cu_seqlens       Cumulative sequence lengths, [batch_size + 1].
 *  \param[out]    new_tensor       Output tensor.
- *  \param[in]     b                Batch size.
- *  \param[in]     max_seq_len      Maximum sequence length.
+ *  \param[in]     t                Packed sequence length.
 *  \param[in]     stream           CUDA stream used for this operation.
 */
 void nvte_convert_bshd_to_thd(NVTETensor tensor, NVTETensor cu_seqlens, NVTETensor new_tensor,

--- a/transformer_engine/common/include/transformer_engine/gemm.h
+++ b/transformer_engine/common/include/transformer_engine/gemm.h
@@ -255,9 +255,11 @@ class MatmulConfigWrapper {
  MatmulConfigWrapper(const MatmulConfigWrapper &) = delete;
  MatmulConfigWrapper &operator=(const MatmulConfigWrapper &) = delete;

+  /*! \brief Move constructor. */
  MatmulConfigWrapper(MatmulConfigWrapper &&other) : config_{other.config_} {
    other.config_ = nullptr;
  }
+  /*! \brief Move-assignment operator. */
  MatmulConfigWrapper &operator=(MatmulConfigWrapper &&other) {
    if (config_ != nullptr) {
      nvte_destroy_matmul_config(config_);

--- a/transformer_engine/common/include/transformer_engine/multi_tensor.h
+++ b/transformer_engine/common/include/transformer_engine/multi_tensor.h
@@ -288,7 +288,7 @@ void nvte_multi_tensor_compute_scale_inv_e8m0_cuda(int chunk_size, NVTETensor **
 *  and populate the amax of the corresponding output tensor.
 *
 *  \param[in]      input            Input tensor.
- *  \param[in,out]  amaxes           Array of output tensors. Only the amax is updated.
+ *  \param[in,out]  outputs          Array of output tensors. Only the amax is updated.
 *  \param[in]      split_sections   Size of each tensor split along dimension 0.
 *  \param[in]      num_tensors      Number of tensor splits.
 *  \param[in]      stream           CUDA stream used for the operation.

--- a/transformer_engine/common/include/transformer_engine/normalization.h
+++ b/transformer_engine/common/include/transformer_engine/normalization.h
@@ -163,11 +163,16 @@ void nvte_rmsnorm_bwd_add(const NVTETensor dz, const NVTETensor x, const NVTETen
                          NVTETensor dgamma, NVTETensor workspace, const int multiprocessorCount,
                          const bool zero_centered_gamma, cudaStream_t stream);

-/*! \brief Helper to enable cuDNN backend for normalization
+/*! \brief Set whether to enable cuDNN backend for normalization forward.
 *
- *  \param[in]     bool              Enable if True
+ *  \param[in]  enable  Whether to enable cuDNN backend.
 */
 void nvte_enable_cudnn_norm_fwd(bool enable);
+
+/*! \brief Set whether to enable cuDNN backend for normalization backward.
+ *
+ *  \param[in]  enable  Whether to enable cuDNN backend.
+ */
 void nvte_enable_cudnn_norm_bwd(bool enable);

 /*! \brief Control whether norm computes `gamma += 1.0` for zero-centered gamma
@@ -176,11 +181,14 @@ void nvte_enable_cudnn_norm_bwd(bool enable);
 *  Currently this only applies to the CuDNN backend. If CuDNN is not used,
 *  this setting has no effect.
 *
- *  \param[in]     bool              Enable if True
+ *  \param[in]  enable  Whether to enable zero-centered gamma.
 */
 void nvte_enable_zero_centered_gamma_in_weight_dtype(bool enable);

+#ifdef __cplusplus
+/*! \brief Normalization function type */
 enum class NVTE_Norm_Type { LayerNorm, RMSNorm };
+#endif

 #ifdef __cplusplus
 }  // extern "C"

--- a/transformer_engine/common/include/transformer_engine/recipe.h
+++ b/transformer_engine/common/include/transformer_engine/recipe.h
@@ -23,7 +23,7 @@ extern "C" {
 * the last, the last entry shifts to the second to last) and the
 * first entry is set to zero. The scaling factor is estimated so the
 * FP8 tensor's maximum absolute value is
- * @f$ 2^{-\text{margin}} \text{max}_\text{fp8\_dtype} @f$.
+ * @f$ 2^{-margin} \max_{fp8\_dtype} @f$.
 *
 *  \param[in] amax_history             History of maximum absolute values.
 *                                      Shape: [history_length, num_scales]
@@ -54,7 +54,7 @@ void nvte_delayed_scaling_recipe_amax_and_scale_update(
 * the last, the last entry shifts to the second to last) and the
 * first entry is set to zero. The scaling factor is estimated so the
 * FP8 tensor's maximum absolute value is
- * @f$ 2^{-\text{margin}} \text{max}_\text{fp8\_dtype} @f$.
+ * @f$ 2^{-margin} \max_{fp8\_dtype} @f$.
 *
 *  \param[in] amax_reduction_buffer    The contiguous buffer used for amax reduction.
 *                                      Shape: [num_scales * num_tensors]

--- a/transformer_engine/common/include/transformer_engine/swizzle.h
+++ b/transformer_engine/common/include/transformer_engine/swizzle.h
@@ -34,6 +34,7 @@ void nvte_swizzle_scaling_factors(const NVTETensor input, NVTETensor output, cud
 *
 *  \param[in]     inputs       Input tensors with non-swizzled scale_inv.
 *  \param[in,out] outputs      Output tensors which hosts swizzled scale_inv.
+ *  \param[in]     num_tensors  Number of input and output tensors.
 *  \param[in]     stream       CUDA stream used for the operation.
 *
 *  Requirements:

--- a/transformer_engine/common/include/transformer_engine/transformer_engine.h
+++ b/transformer_engine/common/include/transformer_engine/transformer_engine.h
@@ -51,8 +51,11 @@ struct NVTEShape {
 *  It does not own the memory it points to.
 */
 struct NVTEBasicTensor {
+  /*! Pointer to data buffer. */
  void *data_ptr;
+  /*! Data type. */
  NVTEDType dtype;
+  /*! Tensor shape. */
  NVTEShape shape;
 };

@@ -144,7 +147,7 @@ void *nvte_tensor_columnwise_data(const NVTETensor tensor);
 *
 *  \param[data] Pointer to start of shape array. If NULL, the shape
 *               will be filled with zeros.
- *  \param[data] Number of dimensions (must be <= 14)
+ *  \param[ndim] Number of dimensions (must be <= 14)
 *
 *  \return A shape. The shape will own its own copy of the data.
 */
@@ -177,7 +180,7 @@ size_t nvte_tensor_ndims(const NVTETensor tensor);
 /*! \brief Get the size of a specific tensor dimension.
 *
 *  \param[in] tensor Tensor.
- *  \param[in] size_t Dimension index.
+ *  \param[in] dim    Dimension index.
 *
 *  \return Size of the tensor at the specified dimension.
 */
@@ -258,8 +261,7 @@ NVTEShape nvte_tensor_scale_inv_shape(const NVTETensor tensor);
 /*! \brief Reset tensor value to zero.
 *
 *  \param[in] tensor Tensor.
- *
- *  \return A scale_inv shape of the input tensor.
+ *  \param[in] stream CUDA stream to use for the operation.
 */
 void nvte_zero_tensor(const NVTETensor tensor, cudaStream_t stream);

@@ -539,7 +541,7 @@ enum class DType {
 /*! \brief Check if TE datatype is FP8
 *
 * Return true if TE datatype is FP8
- *  \param[in] DType      TE Datatype of interest
+ *  \param[in] t      TE Datatype of interest
 */
 inline bool is_fp8_dtype(const DType t) {
  return t == DType::kFloat8E4M3 || t == DType::kFloat8E5M2;
@@ -548,14 +550,14 @@ inline bool is_fp8_dtype(const DType t) {
 /*! \brief Check if TE datatype is FP4
 *
 * Return true if TE datatype is FP4
- *  \param[in] DType      TE Datatype of interest
+ *  \param[in] t      TE Datatype of interest
 */
 inline bool is_fp4_dtype(const DType t) { return t == DType::kFloat4E2M1; }

 /*! \brief Check if TE datatype is high precision (FP32, FP16, BF16)
 *
 * Return true if TE datatype is high precision
- *  \param[in] DType      TE Datatype of interest
+ *  \param[in] t      TE Datatype of interest
 */
 inline bool is_high_precision_dtype(const DType t) {
  return t == DType::kFloat32 || t == DType::kBFloat16 || t == DType::kFloat16;
@@ -579,6 +581,7 @@ class TensorWrapper {
   *  \param[in] scale_dptr      Pointer to the scale value.
   *  \param[in] scale_inv_shape Shape of scale_inv
   *  \param[in] scale_inv_dptr  Pointer to the inverse of scale value.
+   *  \param[in] scaling_mode    Tensor data format.
   */
  TensorWrapper(void *dptr, const NVTEShape &shape, const DType dtype, float *amax_dptr = nullptr,
                float *scale_dptr = nullptr, float *scale_inv_dptr = nullptr,
@@ -615,6 +618,7 @@ class TensorWrapper {
   *  \param[in] scale_dptr      Pointer to the scale value.
   *  \param[in] scale_inv_shape Shape of scale_inv
   *  \param[in] scale_inv_dptr  Pointer to the inverse of scale value.
+   *  \param[in] scaling_mode    Tensor data format.
   */
  TensorWrapper(void *dptr, const std::vector<size_t> &shape, const DType dtype,
                float *amax_dptr = nullptr, float *scale_dptr = nullptr,
@@ -766,7 +770,7 @@ class TensorWrapper {

  /*! \brief Get the size of this TensorWrapper in the given dimension.
   *
-   *  \param[in] size_t Dimension index.
+   *  \param[in] dim Dimension index.
   *
   *  \return Size of this TensorWrapper in given dimension.
   */
@@ -935,9 +939,11 @@ class QuantizationConfigWrapper {
  QuantizationConfigWrapper(const QuantizationConfigWrapper &) = delete;
  QuantizationConfigWrapper &operator=(const QuantizationConfigWrapper &) = delete;

+  /*! \brief Move constructor. */
  QuantizationConfigWrapper(QuantizationConfigWrapper &&other) : config_{other.config_} {
    other.config_ = nullptr;
  }
+  /*! \brief Move-assignment operator. */
  QuantizationConfigWrapper &operator=(QuantizationConfigWrapper &&other) {
    if (config_ != nullptr) {
      nvte_destroy_quantization_config(config_);

--- a/transformer_engine/common/include/transformer_engine/transpose.h
+++ b/transformer_engine/common/include/transformer_engine/transpose.h
@@ -231,7 +231,7 @@ void nvte_cast_transpose_dbias_dsrelu(const NVTETensor input, const NVTETensor a
 *  - columnwise data of `output` is equal to `transpose(cast(dact(input)))`
 *
 *  \param[in]     input               Input tensor of shape [N, H].
- *  \param[in]     gated_act_input     Tensor used as input to the forward of
+ *  \param[in]     act_input           Tensor used as input to the forward of
 *                                     gated activation operation.
 *                                     Shape [N, H * 2].
 *  \param[in,out] output              Result of the cast.
@@ -250,7 +250,7 @@ void nvte_dgeglu_cast_transpose(const NVTETensor input, const NVTETensor act_inp
 *  - columnwise data of `output` is equal to `transpose(cast(dact(input)))`
 *
 *  \param[in]     input               Input tensor of shape [N, H].
- *  \param[in]     gated_act_input     Tensor used as input to the forward of
+ *  \param[in]     act_input           Tensor used as input to the forward of
 *                                     gated activation operation.
 *                                     Shape [N, H * 2].
 *  \param[in,out] output              Result of the cast.
@@ -269,7 +269,7 @@ void nvte_dswiglu_cast_transpose(const NVTETensor input, const NVTETensor act_in
 *  - columnwise data of `output` is equal to `transpose(cast(dact(input)))`
 *
 *  \param[in]     input               Input tensor of shape [N, H].
- *  \param[in]     gated_act_input     Tensor used as input to the forward of
+ *  \param[in]     act_input           Tensor used as input to the forward of
 *                                     gated activation operation.
 *                                     Shape [N, H * 2].
 *  \param[in,out] output              Result of the cast.
@@ -288,7 +288,7 @@ void nvte_dreglu_cast_transpose(const NVTETensor input, const NVTETensor act_inp
 *  - columnwise data of `output` is equal to `transpose(cast(dact(input)))`
 *
 *  \param[in]     input               Input tensor of shape [N, H].
- *  \param[in]     gated_act_input     Tensor used as input to the forward of
+ *  \param[in]     act_input           Tensor used as input to the forward of
 *                                     gated activation operation.
 *                                     Shape [N, H * 2].
 *  \param[in,out] output              Result of the cast.
@@ -307,7 +307,7 @@ void nvte_dqgeglu_cast_transpose(const NVTETensor input, const NVTETensor act_in
 *  - columnwise data of `output` is equal to `transpose(cast(dact(input)))`
 *
 *  \param[in]     input               Input tensor of shape [N, H].
- *  \param[in]     gated_act_input     Tensor used as input to the forward of
+ *  \param[in]     act_input           Tensor used as input to the forward of
 *                                     gated activation operation.
 *                                     Shape [N, H * 2].
 *  \param[in,out] output              Result of the cast.