Unverified commit a3ba4dff authored by Xin Yao, committed by GitHub

Fix cpp warnings (#1639)



* fix cpp warning
Signed-off-by: Xin Yao <xiny@nvidia.com>

* more fix
Signed-off-by: Xin Yao <xiny@nvidia.com>

---------
Signed-off-by: Xin Yao <xiny@nvidia.com>
parent ba605f18
@@ -96,8 +96,6 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK)
constexpr size_t in_mem = in_act_mem + in_gate_mem;
constexpr size_t out_act_mem = buff_size_aligned_out;
constexpr size_t out_gate_mem = buff_size_aligned_out;
- constexpr size_t out_mem = out_act_mem + out_gate_mem;
- // const size_t in_transaction_size = grad_mem + in_mem;
constexpr size_t in_transaction_size = buff_elems * sizeof(IType);
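The two removed lines above fit a pattern that recurs through the next several hunks: locals (often `constexpr`) whose last remaining use was in commented-out code, so the compiler flags them as unused. A minimal sketch of that warning class, with illustrative names that are not from this commit:

```cpp
// g++ -Wall -c example.cpp  (NVCC similarly reports "variable was declared but never referenced")
#include <cstddef>

size_t total_shmem() {
  constexpr size_t act_mem = 1024;
  constexpr size_t gate_mem = 1024;
  // constexpr size_t out_mem = act_mem + gate_mem;  // only referenced from comments:
  //                                                 // warning: unused variable 'out_mem'
  return act_mem + gate_mem;  // deleting the dead constant silences the warning
}
```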
@@ -108,7 +106,6 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK)
IType *in_gate_sh = reinterpret_cast<IType *>(dshmem + grad_mem + in_act_mem);
OType *out_act_sh = reinterpret_cast<OType *>(dshmem + grad_mem + in_mem);
OType *out_gate_sh = reinterpret_cast<OType *>(dshmem + grad_mem + in_mem + out_act_mem);
- // uint64_t *mbar = reinterpret_cast<uint64_t *>(dshmem + grad_mem + in_mem + out_mem);
const uint64_t *TMAP_grad_in = reinterpret_cast<const uint64_t *>(&tensor_map_grad);
const uint64_t *TMAP_in_act = reinterpret_cast<const uint64_t *>(&tensor_map_input_act);
@@ -289,7 +286,6 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK)
#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
constexpr bool USE_ROWWISE_SCALING = SCALE_DIM_X > 1;
constexpr bool USE_COLWISE_SCALING = SCALE_DIM_Y > 1;
- constexpr bool COMPUTE_IN_ROWWISE_SECTION = !USE_COLWISE_SCALING;
constexpr size_t SCALES_ROWWISE_PER_CHUNK_Y = CHUNK_DIM_Y; // 128
constexpr size_t SCALES_ROWWISE_PER_CHUNK_X = CHUNK_DIM_X / SCALE_DIM_X; // 4 = 128 / 32
@@ -826,8 +822,6 @@ void cast_mxfp8_gated(const Tensor &grad, const Tensor &gated_input, Tensor *out
size_t scale_stride_rowwise = USE_ROWWISE_SCALING ? output->scale_inv.shape[1] : 1;
size_t scale_stride_colwise = USE_COLWISE_SCALING ? output->columnwise_scale_inv.shape[1] : 1;
float *const amax_ptr = reinterpret_cast<float *>(output->amax.dptr);
e8m0_t *const scales_rowwise_ptr =
USE_ROWWISE_SCALING ? reinterpret_cast<e8m0_t *>(output->scale_inv.dptr) : nullptr;
e8m0_t *const scales_colwise_ptr =
......
@@ -142,7 +142,6 @@ __global__ void __launch_bounds__(MXFP8_THREADS_PER_CHUNK)
OType out_colwise_sh[MXFP8_BUFFERS_NUM][MXFP8_SHMEM_DIM_Y][MXFP8_SHMEM_DIM_X];
constexpr int shmem_buff_size = sizeof(in_sh) / MXFP8_BUFFERS_NUM;
constexpr int transaction_size = shmem_buff_size * (IS_DACT ? 2 : 1);
const bool is_master_thread = (threadIdx.x == 0);
@@ -513,7 +512,6 @@ __global__ void __launch_bounds__(FP8_THREADS_PER_CHUNK)
__shared__ alignas(128) OType out_sh[FP8_BUFFERS_NUM][FP8_SHMEM_DIM_Y][FP8_SHMEM_DIM_X];
constexpr int shmem_buff_size = sizeof(in_sh) / FP8_BUFFERS_NUM;
constexpr int transaction_size = shmem_buff_size * (IS_DACT ? 2 : 1);
const bool is_master_thread = (threadIdx.x == 0);
@@ -927,7 +925,6 @@ void mxfp8_quantize(const Tensor &input, const Tensor *act_input,
bool use_colwise_scaling = output->has_columnwise_data();
checkCuDriverContext(stream);
NVTE_CHECK(input.has_data(), "Cannot quantize tensor without rowwise data.");
const auto &input_shape = input.data.shape;
NVTE_CHECK(is_fp8_dtype(output->dtype()), "Output must have FP8 type.");
if (use_rowwise_scaling) {
......
@@ -56,7 +56,6 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK)
const size_t scales_stride) {
#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
constexpr bool USE_ROWWISE_SCALING = SCALE_DIM_X > 1;
constexpr bool USE_COLWISE_SCALING = SCALE_DIM_Y > 1;
constexpr size_t SCALES_ROWWISE_PER_CHUNK_Y = CHUNK_DIM_Y; // 128
constexpr size_t SCALES_ROWWISE_PER_CHUNK_X = CHUNK_DIM_X / SCALE_DIM_X; // 4 = 128 / 32
@@ -66,7 +65,6 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK)
constexpr size_t THREADS_PER_SCALE_X_ROWWISE =
DIVUP(SCALE_DIM_X, ELEMS_PER_THREAD); // 2 = 32 / 16
constexpr size_t SUBWARP_WIDTH = THREADS_PER_SCALE_X_ROWWISE; // 2
const int chunk_offset_Y = blockIdx.y * CHUNK_DIM_Y;
const int chunk_offset_X = blockIdx.x * CHUNK_DIM_X;
......
@@ -157,15 +157,15 @@ void CommOverlap::copy_into_buffer(py::handle input, py::handle quantizer, bool
char *ubuf_ptr = reinterpret_cast<char *>(_ubuf.dptr());
if (local_chunk) {
- if (input_tensor.numel() * _tp_size > (int64_t)_ubuf.numel())
+ if (input_tensor.numel() * _tp_size > _ubuf.numel())
NVTE_ERROR("input is larger than the local communication buffer!");
- if (input_tensor.element_size() != (int64_t)_ubuf.element_size())
+ if (input_tensor.element_size() != _ubuf.element_size())
NVTE_ERROR("input data type does not match communication buffer!");
ubuf_ptr += (_ubuf.numel() / _tp_size) * _tp_id * _ubuf.element_size();
} else {
- if (input_tensor.numel() > (int64_t)_ubuf.numel())
+ if (input_tensor.numel() > _ubuf.numel())
NVTE_ERROR("input is larger than the global communication buffer!");
- if (input_tensor.element_size() != (int64_t)_ubuf.element_size())
+ if (input_tensor.element_size() != _ubuf.element_size())
NVTE_ERROR("input data type does not match communication buffer!");
}
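These paired changes drop explicit `(int64_t)` casts. A plausible reading, given that the commit targets compiler warnings: both `numel()` calls already return the same unsigned type, so the cast itself created the signed/unsigned mismatch in the comparison. A minimal sketch under that assumption (the `TensorRef` type and values are illustrative, not the repository's):

```cpp
// g++ -Wall -c example.cpp   (-Wsign-compare is part of -Wall for C++)
#include <cstddef>
#include <cstdint>

struct TensorRef {
  std::size_t numel() const { return 1024; }  // assumed unsigned, like the real buffers
};

bool too_large(const TensorRef& input, const TensorRef& ubuf, int tp_size) {
  // return input.numel() * tp_size > (std::int64_t)ubuf.numel();  // size_t vs int64_t [-Wsign-compare]
  return input.numel() * tp_size > ubuf.numel();                   // both sides unsigned: no warning
}
```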
@@ -189,7 +189,7 @@ py::object CommOverlap::get_buffer(py::handle quantizer, bool local_chunk,
std::vector<int64_t> torch_shape;
if (shape.has_value()) {
torch_shape = shape.value();
- auto requested = product(torch_shape);
+ size_t requested = product(torch_shape);
auto expected = local_chunk ? _ubuf.numel() / _tp_size : _ubuf.numel();
NVTE_CHECK(requested == expected, "Number of elements in the requested shape (", requested,
") does not match allocated buffer size (", expected, ")!");
@@ -253,18 +253,18 @@ void CommOverlapP2P::copy_into_buffer(py::handle input, py::handle quantizer, bo
at::cuda::CUDAStream stream_main = at::cuda::getCurrentCUDAStream();
if (local_chunk) {
// Copy input to the target ubuf chunk by rank offset
- if (input_tensor.numel() * _tp_size > (int64_t)_ubuf.numel())
+ if (input_tensor.numel() * _tp_size > _ubuf.numel())
NVTE_ERROR("input is larger than the local communication buffer!");
- if (input_tensor.element_size() != (int64_t)_ubuf.element_size())
+ if (input_tensor.element_size() != _ubuf.element_size())
NVTE_ERROR("input data type does not match communication buffer!");
NVTE_CHECK_CUDA(cudaMemcpyAsync(_ubufs[_tp_id].dptr(), input_ptr,
input_tensor.numel() * input_tensor.element_size(),
cudaMemcpyDeviceToDevice, (cudaStream_t)stream_main));
} else {
- if (input_tensor.numel() > (int64_t)_ubuf.numel())
+ if (input_tensor.numel() > _ubuf.numel())
NVTE_ERROR("input is larger than the global communication buffer!");
- if (input_tensor.element_size() != (int64_t)_ubuf.element_size())
+ if (input_tensor.element_size() != _ubuf.element_size())
NVTE_ERROR("input data type does not match communication buffer!");
NVTE_CHECK_CUDA(cudaMemcpyAsync(_ubuf.dptr(), input_ptr,
input_tensor.numel() * input_tensor.element_size(),
@@ -280,7 +280,7 @@ py::object CommOverlapP2P::get_buffer(py::handle quantizer, bool local_chunk,
std::vector<int64_t> torch_shape;
if (shape.has_value()) {
torch_shape = shape.value();
- auto requested = product(torch_shape);
+ size_t requested = product(torch_shape);
auto expected = local_chunk ? _ubufs[_tp_id].numel() : _ubuf.numel();
NVTE_CHECK(requested == expected, "Number of elements in the requested shape (", requested,
") does not match allocated buffer size (", expected, ")!");
......
@@ -318,12 +318,11 @@ std::optional<std::vector<at::Tensor>> te_general_grouped_gemm(
std::vector<size_t> single_output_begins;
std::vector<size_t> single_output_ends;
- int slicing_dim;
if (single_output && D == std::nullopt) {
NVTE_ERROR("not implemented, D should be allocated for single output case.");
}
- void* output_data_ptr;
+ void* output_data_ptr = nullptr;
if (single_output) {
output_data_ptr = (*D)[0].data_ptr();
}
......
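Initializing `output_data_ptr` to `nullptr` addresses the maybe-uninitialized class of warning: the `NVTE_ERROR` guard above means the pointer is always assigned before use, but the compiler cannot prove that across an opaque call. A reduced sketch (function names here are illustrative):

```cpp
// g++ -Wall -O2 -c example.cpp   (-Wmaybe-uninitialized generally needs optimization enabled)
void fail(const char* msg);   // stand-in for NVTE_ERROR; the compiler cannot see that it throws
void consume(void* p);

void run(bool single_output, void* src) {
  if (!single_output) fail("not implemented, D should be allocated for single output case.");
  // void* out;              // warning: 'out' may be used uninitialized [-Wmaybe-uninitialized]
  void* out = nullptr;       // explicit init keeps every path well defined
  if (single_output) {
    out = src;
  }
  consume(out);
}
```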
@@ -17,7 +17,7 @@ void fused_multi_row_padding(at::Tensor input, at::Tensor output,
NVTE_CHECK(input.dim() == 2, "Dimension of input must equal 2.");
NVTE_CHECK(output.dim() == 2, "Dimension of output must equal 2.");
- const int num_tensors = input_row_list.size();
+ const auto num_tensors = input_row_list.size();
// Extract properties from PyTorch tensors
std::vector<void*> input_dptr_list, output_dptr_list;
std::vector<std::vector<size_t>> input_shape_list, output_shape_list;
......
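Switching `num_tensors` from `int` to `auto` lets it pick up the `size_t` returned by `std::vector::size()`, avoiding an implicit 64-to-32-bit narrowing. A minimal sketch (the vector's element type is illustrative):

```cpp
// g++ -Wall -Wconversion -c example.cpp
#include <cstddef>
#include <vector>

void pad_rows(const std::vector<std::size_t>& input_row_list) {
  // const int num_tensors = input_row_list.size();   // size_t -> int may truncate [-Wconversion]
  const auto num_tensors = input_row_list.size();     // deduces size_t; no conversion at all
  for (std::size_t i = 0; i < num_tensors; ++i) {
    // per-tensor work would go here
  }
}
```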
@@ -323,7 +323,8 @@ std::pair<TensorWrapper, py::object> Float8BlockQuantizer::create_tensor(
"Expected 1 or 2. Got ",
block_scaling_dim);
}
- scale_inv_rowwise = at::empty({sinv0, sinv1}, scale_opts);
+ scale_inv_rowwise =
+     at::empty({static_cast<int64_t>(sinv0), static_cast<int64_t>(sinv1)}, scale_opts);
tensor.set_rowwise_data(data_rowwise.data_ptr(), this->dtype, shape);
tensor.set_rowwise_scale_inv(scale_inv_rowwise.data_ptr(), DType::kFloat32,
std::vector<size_t>{sinv0, sinv1});
@@ -359,7 +360,8 @@ std::pair<TensorWrapper, py::object> Float8BlockQuantizer::create_tensor(
block_scaling_dim);
}
data_colwise = at::empty(torch_columnwise_shape, opts);
- scale_inv_colwise = at::empty({sinv0, sinv1}, scale_opts);
+ scale_inv_colwise =
+     at::empty({static_cast<int64_t>(sinv0), static_cast<int64_t>(sinv1)}, scale_opts);
tensor.set_columnwise_data(data_colwise.data_ptr(), this->dtype, columnwise_shape);
tensor.set_columnwise_scale_inv(scale_inv_colwise.data_ptr(), DType::kFloat32,
......
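The `static_cast<int64_t>` wrappers address narrowing inside a braced initializer list: `sinv0`/`sinv1` are presumably `size_t`, while `at::empty` takes `int64_t` extents, and a `size_t` to `int64_t` conversion inside braces is a narrowing conversion the compiler flags. A torch-free sketch of the same rule:

```cpp
// g++ -std=c++17 -Wall -c example.cpp
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<std::int64_t> make_shape(std::size_t rows, std::size_t cols) {
  // return {rows, cols};  // narrowing conversion of 'rows' from 'size_t' to 'int64_t' in braces
  return {static_cast<std::int64_t>(rows), static_cast<std::int64_t>(cols)};
}
```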
@@ -21,7 +21,7 @@ std::vector<py::object> fused_multi_quantize(std::vector<py::handle> input_list,
auto none = py::none();
// create TE tensors from input
- for (int i = 0; i < input_list.size(); i++) {
+ for (size_t i = 0; i < input_list.size(); i++) {
auto input_tensor = makeTransformerEngineTensor(input_list[i], none);
const NVTEShape input_shape = input_tensor.shape();
......
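The loop-index change is the classic sign-compare fix: `std::vector::size()` returns `size_t`, so an `int` induction variable forces a signed/unsigned comparison on every iteration. A minimal sketch:

```cpp
// g++ -Wall -c example.cpp   (-Wsign-compare is part of -Wall for C++)
#include <cstddef>
#include <vector>

int count_nonzero(const std::vector<int>& v) {
  int n = 0;
  // for (int i = 0; i < v.size(); i++)         // int vs size_t [-Wsign-compare]
  for (std::size_t i = 0; i < v.size(); i++) {  // index type matches what size() returns
    if (v[i] != 0) ++n;
  }
  return n;
}
```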