Unverified Commit c972f5a7 authored by Kirthi Shankar Sivamani, committed by GitHub

[C][PyTorch] Move multi-tensor kernels from PyTorch extensions to core (#1744)



* Move multi-tensor kernels from PyTorch extensions to core
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* Add int16 type to core (for storing fp32 param remainders)
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* Fix core build
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* Apply the same fix to scale
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* Fix perf, memory, vars
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* Re-add device guard for multi-device
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* Fix junk output dtype for non-per-tensor
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* Fixes for tests and upgrade mcore version
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* Fix core tests
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

---------
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
parent e17fab14
/*************************************************************************
* Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See LICENSE for license information.
************************************************************************/
#include "extensions.h"
void multi_tensor_scale_cuda(int chunk_size, at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists, float scale) {
using namespace transformer_engine;
using namespace transformer_engine::pytorch;
auto noop_flag_cu = makeTransformerEngineTensor(noop_flag);
auto [_, __, tensor_lists_ptr, num_lists, num_tensors] =
makeTransformerEngineTensorList(tensor_lists);
int device_id = tensor_lists[0][0].device().index();
nvte_multi_tensor_scale_cuda(chunk_size, noop_flag_cu.data(), tensor_lists_ptr.data(), num_lists,
num_tensors, scale, device_id, at::cuda::getCurrentCUDAStream());
}
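For context, a minimal usage sketch, not part of this commit: apex-style multi-tensor scale treats tensor_lists as {inputs, outputs}, so aliasing the two lists gives an in-place update. The helper name, chunk size, and list layout below are illustrative assumptions.

#include <ATen/ATen.h>
#include <vector>

// Hypothetical helper: unscale a list of gradients in place by 1/loss_scale.
// Assumes the {inputs, outputs} list layout of the apex-style scale kernel.
void unscale_grads_sketch(const std::vector<at::Tensor> &grads, float loss_scale) {
  // Device int32 flag; the kernel skips its work if the flag is set to nonzero.
  at::Tensor noop_flag = at::zeros({1}, grads[0].options().dtype(at::kInt));
  std::vector<std::vector<at::Tensor>> tensor_lists = {grads, grads};  // in place
  multi_tensor_scale_cuda(/*chunk_size=*/65536, noop_flag, tensor_lists,
                          /*scale=*/1.0f / loss_scale);
}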
/*************************************************************************
* Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See LICENSE for license information.
************************************************************************/
#include "extensions.h"
void multi_tensor_sgd_cuda(int chunk_size, at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists, float wd,
float momentum, float dampening, float lr, bool nesterov, bool first_run,
bool wd_after_momentum, float scale) {
using namespace transformer_engine;
using namespace transformer_engine::pytorch;
auto noop_flag_cu = makeTransformerEngineTensor(noop_flag);
auto [_, __, tensor_lists_ptr, num_lists, num_tensors] =
makeTransformerEngineTensorList(tensor_lists);
int device_id = tensor_lists[0][0].device().index();
nvte_multi_tensor_sgd_cuda(chunk_size, noop_flag_cu.data(), tensor_lists_ptr.data(), num_lists,
num_tensors, wd, momentum, dampening, lr, nesterov, first_run,
wd_after_momentum, scale, device_id, at::cuda::getCurrentCUDAStream());
}
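A hedged sketch of how a fused optimizer step might call this binding. The three-list layout {grads, params, momentum_buffers} mirrors apex's FusedSGD and is an assumption here, as are the helper name and chunk size; the core kernel defines the actual contract.

// Hypothetical fused momentum-SGD step over all parameters at once.
// List layout {grads, params, momentum_buffers} is assumed (apex FusedSGD style).
void fused_sgd_step_sketch(const std::vector<at::Tensor> &grads,
                           const std::vector<at::Tensor> &params,
                           const std::vector<at::Tensor> &momenta, float lr, float momentum) {
  at::Tensor noop_flag = at::zeros({1}, params[0].options().dtype(at::kInt));
  multi_tensor_sgd_cuda(/*chunk_size=*/65536, noop_flag, {grads, params, momenta},
                        /*wd=*/0.0f, momentum, /*dampening=*/0.0f, lr,
                        /*nesterov=*/false, /*first_run=*/false,
                        /*wd_after_momentum=*/false, /*scale=*/1.0f);
}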
/*************************************************************************
* Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See LICENSE for license information.
************************************************************************/
#pragma once

#include <ATen/ATen.h>

// Forward/backward compatibility hack around
// https://github.com/pytorch/pytorch/commit/3aeb78079bcd68282fe9117088e138b77318e288
// pending more future-proof guidance from upstream.
// struct TypeShim
// {
// const at::Type& payload;
// TypeShim(const at::Type& type) : payload(type) {}
// // Enable trivial conversion to a const at::Type& for pre-3aeb78
// operator const at::Type&(){ return payload; };
// // Enable dispatch switch statements to take *this directly for post-3aeb78
// //operator at::ScalarType(){ return payload.; };
// };
#define DISPATCH_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...) \
switch (TYPE) { \
case at::ScalarType::Float: { \
using scalar_t_##LEVEL = float; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::Half: { \
using scalar_t_##LEVEL = at::Half; \
__VA_ARGS__; \
break; \
} \
default: \
AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
}
#define DISPATCH_FLOAT_HALF_AND_BFLOAT(TYPE, LEVEL, NAME, ...) \
switch (TYPE) { \
case at::ScalarType::Float: { \
using scalar_t_##LEVEL = float; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::Half: { \
using scalar_t_##LEVEL = at::Half; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::BFloat16: { \
using scalar_t_##LEVEL = at::BFloat16; \
__VA_ARGS__; \
break; \
} \
default: \
AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
}
#define DISPATCH_FLOAT_HALF_AND_BYTE(TYPE, LEVEL, NAME, ...) \
switch (TYPE) { \
case at::ScalarType::Float: { \
using scalar_t_##LEVEL = float; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::Half: { \
using scalar_t_##LEVEL = at::Half; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::Byte: { \
using scalar_t_##LEVEL = uint8_t; \
__VA_ARGS__; \
break; \
} \
default: \
AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
}
#define DISPATCH_DOUBLE_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...) \
switch (TYPE) { \
case at::ScalarType::Double: { \
using scalar_t_##LEVEL = double; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::Float: { \
using scalar_t_##LEVEL = float; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::Half: { \
using scalar_t_##LEVEL = at::Half; \
__VA_ARGS__; \
break; \
} \
default: \
AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
}
#define DISPATCH_DOUBLE_FLOAT_HALF_AND_BFLOAT(TYPE, LEVEL, NAME, ...) \
switch (TYPE) { \
case at::ScalarType::Double: { \
using scalar_t_##LEVEL = double; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::Float: { \
using scalar_t_##LEVEL = float; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::Half: { \
using scalar_t_##LEVEL = at::Half; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::BFloat16: { \
using scalar_t_##LEVEL = at::BFloat16; \
__VA_ARGS__; \
break; \
} \
default: \
AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
}
#define DISPATCH_DOUBLE_AND_FLOAT(TYPE, LEVEL, NAME, ...) \
switch (TYPE) { \
case at::ScalarType::Double: { \
using scalar_t_##LEVEL = double; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::Float: { \
using scalar_t_##LEVEL = float; \
__VA_ARGS__; \
break; \
} \
default: \
AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
}
#define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...) \
switch (TYPE) { \
case at::ScalarType::Half: { \
using scalar_t = at::Half; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::BFloat16: { \
using scalar_t = at::BFloat16; \
__VA_ARGS__; \
break; \
} \
default: \
AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
}
#define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \
switch (TYPEIN) { \
case at::ScalarType::Float: { \
using scalar_t_in = float; \
switch (TYPEOUT) { \
case at::ScalarType::Float: { \
using scalar_t_out = float; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::Half: { \
using scalar_t_out = at::Half; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::BFloat16: { \
using scalar_t_out = at::BFloat16; \
__VA_ARGS__; \
break; \
} \
default: \
AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \
} \
break; \
} \
case at::ScalarType::Half: { \
using scalar_t_in = at::Half; \
using scalar_t_out = at::Half; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::BFloat16: { \
using scalar_t_in = at::BFloat16; \
using scalar_t_out = at::BFloat16; \
__VA_ARGS__; \
break; \
} \
default: \
AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'"); \
}
#define DISPATCH_DOUBLE_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \
switch (TYPEIN) { \
case at::ScalarType::Double: { \
using scalar_t_in = double; \
switch (TYPEOUT) { \
case at::ScalarType::Double: { \
using scalar_t_out = double; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::Float: { \
using scalar_t_out = float; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::Half: { \
using scalar_t_out = at::Half; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::BFloat16: { \
using scalar_t_out = at::BFloat16; \
__VA_ARGS__; \
break; \
} \
default: \
AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \
} \
break; \
} \
case at::ScalarType::Float: { \
using scalar_t_in = float; \
switch (TYPEOUT) { \
case at::ScalarType::Float: { \
using scalar_t_out = float; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::Half: { \
using scalar_t_out = at::Half; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::BFloat16: { \
using scalar_t_out = at::BFloat16; \
__VA_ARGS__; \
break; \
} \
default: \
AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \
} \
break; \
} \
case at::ScalarType::Half: { \
using scalar_t_in = at::Half; \
using scalar_t_out = at::Half; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::BFloat16: { \
using scalar_t_in = at::BFloat16; \
using scalar_t_out = at::BFloat16; \
__VA_ARGS__; \
break; \
} \
default: \
AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'"); \
}
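To make the LEVEL parameter concrete, a small hedged example (the helper name is invented, and the snippet assumes this header is included): each macro expands to a switch over TYPE, and the body passed as __VA_ARGS__ sees the dispatched C++ type under the alias scalar_t_LEVEL.

// Hypothetical helper showing macro usage: returns sizeof the element type.
// Inside the body, scalar_t_0 is the type alias bound by LEVEL == 0.
inline size_t element_size_sketch(const at::Tensor &t) {
  size_t n = 0;
  DISPATCH_FLOAT_HALF_AND_BFLOAT(t.scalar_type(), 0, "element_size_sketch",
                                 n = sizeof(scalar_t_0);)
  return n;
}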
template <typename T>
__device__ __forceinline__ T
reduce_block_into_lanes(T *x, T val, int lanes = 1,
bool share_result = false) { // lanes is intended to be <= 32.
int tid = threadIdx.x + threadIdx.y * blockDim.x;
int blockSize = blockDim.x * blockDim.y; // blockSize is intended to be a multiple of 32.
if (blockSize >= 64) {
x[tid] = val;
__syncthreads();
}
#pragma unroll
for (int i = (blockSize >> 1); i >= 64; i >>= 1) {
if (tid < i) x[tid] = x[tid] + x[tid + i];
__syncthreads();
}
T final;
if (tid < 32) {
if (blockSize >= 64)
final = x[tid] + x[tid + 32];
else
final = val;
// __SYNCWARP();
#pragma unroll
for (int i = 16; i >= lanes; i >>= 1) final = final + __shfl_down_sync(0xffffffff, final, i);
}
if (share_result) {
if (tid < lanes) x[tid] = final; // EpilogueOp
// Make sure the smem result is visible to all warps.
}
__syncthreads();
// Avoid potential write before read race when reduce_block_into_lanes is called back to back
return final;
}
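A hedged CUDA sketch of how this reduction is typically driven (the kernel and launch sizes are invented for illustration): each thread contributes one value, the block reduces through shared memory sized to the thread count, and with lanes == 1 only tid 0 returns the valid total.

// Hypothetical kernel: one partial sum per block via reduce_block_into_lanes.
// Launch with 256 threads per block (blockDim = {256, 1}).
__global__ void block_sum_sketch(const float *in, float *block_sums, int n) {
  __shared__ float smem[256];  // one shared slot per thread
  int gid = blockIdx.x * blockDim.x + threadIdx.x;
  float v = (gid < n) ? in[gid] : 0.0f;
  // With lanes == 1, only tid 0 holds the valid block total.
  float total = reduce_block_into_lanes(smem, v, 1, false);
  if (threadIdx.x == 0 && threadIdx.y == 0) block_sums[blockIdx.x] = total;
}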
template <typename T>
__device__ __forceinline__ T
reduce_block_into_lanes_max_op(T *x, T val, int lanes = 1,
bool share_result = false) { // lanes is intended to be <= 32.
int tid = threadIdx.x + threadIdx.y * blockDim.x;
int blockSize = blockDim.x * blockDim.y; // blockSize is intended to be a multiple of 32.
if (blockSize >= 64) {
x[tid] = val;
__syncthreads();
}
#pragma unroll
for (int i = (blockSize >> 1); i >= 64; i >>= 1) {
if (tid < i) x[tid] = fmaxf(fabsf(x[tid]), fabsf(x[tid + i]));
__syncthreads();
}
T final;
if (tid < 32) {
if (blockSize >= 64)
final = fmaxf(fabsf(x[tid]), fabsf(x[tid + 32]));
else
final = val;
// __SYNCWARP();
#pragma unroll
for (int i = 16; i >= lanes; i >>= 1)
final = fmaxf(fabsf(final), fabsf(__shfl_down_sync(0xffffffff, final, i)));
}
if (share_result) {
if (tid < lanes) x[tid] = final; // EpilogueOp
// Make sure the smem result is visible to all warps.
__syncthreads();
}
return final;
}
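Note that the max-op variant folds fabsf into every comparison, so it reduces toward max |x| rather than max x, which is the per-block building block of an L-infinity norm. A hedged usage sketch (kernel invented, same assumed 256-thread launch as above):

// Hypothetical kernel: per-block max absolute value via the max-op variant.
__global__ void block_inf_norm_sketch(const float *in, float *block_maxes, int n) {
  __shared__ float smem[256];  // one shared slot per thread
  int gid = blockIdx.x * blockDim.x + threadIdx.x;
  float v = (gid < n) ? in[gid] : 0.0f;
  float m = reduce_block_into_lanes_max_op(smem, v, 1, false);
  if (threadIdx.x == 0 && threadIdx.y == 0) block_maxes[blockIdx.x] = m;
}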