Unverified commit a0fde1b0 authored by Jeff Daily, committed by GitHub

[ROCm] add support for ROCm/HIP device (#6086)



* [ROCm] add support for ROCm/HIP

- CMakeLists.txt ROCm updates, also replace glob with explicit file list
- initial warpSize interop changes
- helpers/hipify.sh script added
- .gitignore to ignore generated hip source files

* more rocm updates

- disable compiler warnings
- move PercentileDevice __device__ template function into header
- bug fixes for __host__ __device__ and __HIP__ preprocessor symbols

* more bug fixes

* warp 32 vs 64 updates

* lint fixes

* missing device_index variable

* accidental inclusion of hip headers

* copyright notice compliance

* Update CMakeLists.txt
Co-authored-by: James Lamb <jaylamb20@gmail.com>

* fix lint issue

* clean up

* Update CMakeLists.txt
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>

* Update CMakeLists.txt
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>

* clean up CMakeLists.txt

use WARPSIZE

* use WARPSIZE

* fix share buffer size

---------
Co-authored-by: shiyu1994 <shiyu_k1994@qq.com>
Co-authored-by: James Lamb <jaylamb20@gmail.com>
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
Co-authored-by: Yu Shi <yushi2@microsoft.com>
parent 4c376953
......@@ -463,3 +463,6 @@ dask-worker-space/
*.pub
*.rdp
*_rsa
# hipify-perl -inplace leaves behind *.prehip files
*.prehip
......@@ -4,6 +4,7 @@ option(USE_GPU "Enable GPU-accelerated training" OFF)
option(USE_SWIG "Enable SWIG to generate Java API" OFF)
option(USE_TIMETAG "Set to ON to output time costs" OFF)
option(USE_CUDA "Enable CUDA-accelerated training " OFF)
option(USE_ROCM "Enable ROCm-accelerated training " OFF)
option(USE_DEBUG "Set to ON for Debug mode" OFF)
option(USE_SANITIZER "Use sanitizer flags" OFF)
set(
......@@ -160,6 +161,11 @@ if(USE_CUDA)
set(USE_OPENMP ON CACHE BOOL "CUDA requires OpenMP" FORCE)
endif()
if(USE_ROCM)
enable_language(HIP)
set(USE_OPENMP ON CACHE BOOL "ROCm requires OpenMP" FORCE)
endif()
if(USE_OPENMP)
if(APPLE)
find_package(OpenMP)
......@@ -261,6 +267,34 @@ if(USE_CUDA)
endif()
endif()
if(USE_ROCM)
find_package(HIP)
include_directories(${HIP_INCLUDE_DIRS})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_AMD__")
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} ${OpenMP_CXX_FLAGS} -fPIC -Wall")
# avoid warning: unused variable 'mask' due to __shfl_down_sync work-around
set(DISABLED_WARNINGS "${DISABLED_WARNINGS} -Wno-unused-variable")
# avoid warning: 'hipHostAlloc' is deprecated: use hipHostMalloc instead
set(DISABLED_WARNINGS "${DISABLED_WARNINGS} -Wno-deprecated-declarations")
# avoid many warnings about missing overrides
set(DISABLED_WARNINGS "${DISABLED_WARNINGS} -Wno-inconsistent-missing-override")
# avoid warning: shift count >= width of type in feature_histogram.hpp
set(DISABLED_WARNINGS "${DISABLED_WARNINGS} -Wno-shift-count-overflow")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${DISABLED_WARNINGS}")
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} ${DISABLED_WARNINGS}")
if(USE_DEBUG)
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -g -O0")
else()
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -O3")
endif()
message(STATUS "CMAKE_HIP_FLAGS: ${CMAKE_HIP_FLAGS}")
add_definitions(-DUSE_ROCM)
endif()
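For context on the -Wno-unused-variable entry above: kernels in this patch keep the CUDA-style full mask for __shfl_down_sync, but on ROCm the interop shim (cuda_rocm_interop.h, shown further down) rewrites the call to __shfl_down, which takes no mask, so the mask variable becomes unread. A minimal sketch of the pattern, assuming a single-warp launch; the kernel name is illustrative and not part of the patch:

// On ROCm, __shfl_down_sync(mask, v, offset) expands to __shfl_down(v, offset),
// leaving 'mask' unread; without -Wno-unused-variable this produces a warning.
__global__ void WarpSumSketch(const float* in, float* out) {
  const unsigned int mask = 0xffffffff;      // read on CUDA, dropped by the ROCm macro
  float v = in[threadIdx.x];                 // assumes blockDim.x == warpSize
  for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
    v += __shfl_down_sync(mask, v, offset);  // becomes __shfl_down(v, offset) on ROCm
  }
  if (threadIdx.x == 0) out[0] = v;
}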
include(CheckCXXSourceCompiles)
check_cxx_source_compiles("
#include <xmmintrin.h>
......
/*!
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
* Modifications Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
*/
#ifndef LIGHTGBM_CUDA_CUDA_ALGORITHMS_HPP_
......@@ -14,6 +15,7 @@
#include <LightGBM/bin.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/cuda/cuda_rocm_interop.h>
#include <LightGBM/utils/log.h>
#include <algorithm>
......@@ -174,7 +176,7 @@ __device__ __forceinline__ void GlobalMemoryPrefixSum(T* array, const size_t len
for (size_t index = start; index < end; ++index) {
thread_sum += array[index];
}
__shared__ T shared_mem[32];
__shared__ T shared_mem[WARPSIZE];
const T thread_base = ShufflePrefixSumExclusive<T>(thread_sum, shared_mem);
if (start < end) {
array[start] += thread_base;
......@@ -483,7 +485,7 @@ __device__ void ShuffleSortedPrefixSumDevice(const VAL_T* in_values,
const INDEX_T* sorted_indices,
REDUCE_VAL_T* out_values,
const INDEX_T num_data) {
__shared__ REDUCE_VAL_T shared_buffer[32];
__shared__ REDUCE_VAL_T shared_buffer[WARPSIZE];
const INDEX_T num_data_per_thread = (num_data + static_cast<INDEX_T>(blockDim.x) - 1) / static_cast<INDEX_T>(blockDim.x);
const INDEX_T start = num_data_per_thread * static_cast<INDEX_T>(threadIdx.x);
const INDEX_T end = min(start + num_data_per_thread, num_data);
......@@ -572,8 +574,48 @@ __device__ VAL_T PercentileDevice(const VAL_T* values,
INDEX_T* indices,
REDUCE_WEIGHT_T* weights_prefix_sum,
const double alpha,
const INDEX_T len);
const INDEX_T len) {
if (len <= 1) {
return values[0];
}
if (!USE_WEIGHT) {
BitonicArgSortDevice<VAL_T, INDEX_T, ASCENDING, BITONIC_SORT_NUM_ELEMENTS / 2, 10>(values, indices, len);
const double float_pos = (1.0f - alpha) * len;
const INDEX_T pos = static_cast<INDEX_T>(float_pos);
if (pos < 1) {
return values[indices[0]];
} else if (pos >= len) {
return values[indices[len - 1]];
} else {
const double bias = float_pos - pos;
const VAL_T v1 = values[indices[pos - 1]];
const VAL_T v2 = values[indices[pos]];
return static_cast<VAL_T>(v1 - (v1 - v2) * bias);
}
} else {
BitonicArgSortDevice<VAL_T, INDEX_T, ASCENDING, BITONIC_SORT_NUM_ELEMENTS / 4, 9>(values, indices, len);
ShuffleSortedPrefixSumDevice<WEIGHT_T, REDUCE_WEIGHT_T, INDEX_T>(weights, indices, weights_prefix_sum, len);
const REDUCE_WEIGHT_T threshold = weights_prefix_sum[len - 1] * (1.0f - alpha);
__shared__ INDEX_T pos;
if (threadIdx.x == 0) {
pos = len;
}
__syncthreads();
for (INDEX_T index = static_cast<INDEX_T>(threadIdx.x); index < len; index += static_cast<INDEX_T>(blockDim.x)) {
if (weights_prefix_sum[index] > threshold && (index == 0 || weights_prefix_sum[index - 1] <= threshold)) {
pos = index;
}
}
__syncthreads();
pos = min(pos, len - 1);
if (pos == 0 || pos == len - 1) {
return values[pos];
}
const VAL_T v1 = values[indices[pos - 1]];
const VAL_T v2 = values[indices[pos]];
return static_cast<VAL_T>(v1 - (v1 - v2) * (threshold - weights_prefix_sum[pos - 1]) / (weights_prefix_sum[pos] - weights_prefix_sum[pos - 1]));
}
}
} // namespace LightGBM
......
/*!
* Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
*/
#ifdef USE_CUDA
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIP__)
// ROCm doesn't have __shfl_down_sync, only __shfl_down without mask.
// Since mask is full 0xffffffff, we can use __shfl_down instead.
#define __shfl_down_sync(mask, val, offset) __shfl_down(val, offset)
#define __shfl_up_sync(mask, val, offset) __shfl_up(val, offset)
// ROCm warpSize is constexpr and is either 32 or 64 depending on gfx arch.
#define WARPSIZE warpSize
// ROCm doesn't have atomicAdd_block, but it should be semantically the same as atomicAdd
#define atomicAdd_block atomicAdd
#else
// CUDA warpSize is not a constexpr, but always 32
#define WARPSIZE 32
#endif
#endif
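With WARPSIZE defined this way, shared-memory staging buffers for shuffle-based reductions can be sized portably: a block of up to 1024 threads contains 1024 / WARPSIZE warps, so one staging slot per warp is enough on both 32-wide and 64-wide hardware. Below is a minimal sketch of the pattern the WARPSIZE buffer changes in this patch follow; the kernel name and parameters are illustrative, not part of the patch:

#include <LightGBM/cuda/cuda_rocm_interop.h>

// Illustrative block-wide sum; assumes blockDim.x is a multiple of warpSize and at most 1024.
template <typename T>
__global__ void BlockSumSketch(const T* in, T* out, int n) {
  __shared__ T warp_sums[1024 / WARPSIZE];   // one staging slot per warp
  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int warp_id = threadIdx.x / warpSize;
  T value = (threadIdx.x < static_cast<unsigned int>(n)) ? in[threadIdx.x] : static_cast<T>(0);
  // warp-level reduction via shuffles; the mask argument is ignored on ROCm
  for (unsigned int offset = warpSize / 2; offset > 0; offset >>= 1) {
    value += __shfl_down_sync(0xffffffff, value, offset);
  }
  if (lane == 0) warp_sums[warp_id] = value;
  __syncthreads();
  // first warp reduces the per-warp partial sums
  if (warp_id == 0) {
    value = (lane < blockDim.x / warpSize) ? warp_sums[lane] : static_cast<T>(0);
    for (unsigned int offset = warpSize / 2; offset > 0; offset >>= 1) {
      value += __shfl_down_sync(0xffffffff, value, offset);
    }
    if (lane == 0) out[0] = value;
  }
}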
......@@ -2,6 +2,7 @@
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for
* license information.
* Modifications Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
*/
#ifdef USE_CUDA
......@@ -40,24 +41,24 @@ class CUDASplitInfo {
uint32_t* cat_threshold = nullptr;
int* cat_threshold_real = nullptr;
__device__ CUDASplitInfo() {
__host__ __device__ CUDASplitInfo() {
num_cat_threshold = 0;
cat_threshold = nullptr;
cat_threshold_real = nullptr;
}
__device__ ~CUDASplitInfo() {
__host__ __device__ ~CUDASplitInfo() {
if (num_cat_threshold > 0) {
if (cat_threshold != nullptr) {
cudaFree(cat_threshold);
CUDASUCCESS_OR_FATAL(cudaFree(cat_threshold));
}
if (cat_threshold_real != nullptr) {
cudaFree(cat_threshold_real);
CUDASUCCESS_OR_FATAL(cudaFree(cat_threshold_real));
}
}
}
__device__ CUDASplitInfo& operator=(const CUDASplitInfo& other) {
__host__ __device__ CUDASplitInfo& operator=(const CUDASplitInfo& other) {
is_valid = other.is_valid;
leaf_index = other.leaf_index;
gain = other.gain;
......
/*!
* Copyright (c) 2020 IBM Corporation, Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
* Modifications Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
*/
#ifndef LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_
#define LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_
......@@ -45,7 +46,7 @@ struct CHAllocator {
n = SIZE_ALIGNED(n);
#ifdef USE_CUDA
if (LGBM_config_::current_device == lgbm_device_cuda) {
cudaError_t ret = cudaHostAlloc(&ptr, n*sizeof(T), cudaHostAllocPortable);
cudaError_t ret = cudaHostAlloc(reinterpret_cast<void**>(&ptr), n*sizeof(T), cudaHostAllocPortable);
if (ret != cudaSuccess) {
Log::Warning("Defaulting to malloc in CHAllocator!!!");
ptr = reinterpret_cast<T*>(_mm_malloc(n*sizeof(T), 16));
......
/*!
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
* Modifications Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
*/
#ifdef USE_CUDA
#include <LightGBM/cuda/cuda_algorithms.hpp>
#include <LightGBM/cuda/cuda_rocm_interop.h>
namespace LightGBM {
template <typename T>
__global__ void ShufflePrefixSumGlobalKernel(T* values, size_t len, T* block_prefix_sum_buffer) {
__shared__ T shared_mem_buffer[32];
__shared__ T shared_mem_buffer[WARPSIZE];
const size_t index = static_cast<size_t>(threadIdx.x + blockIdx.x * blockDim.x);
T value = 0;
if (index < len) {
......@@ -26,7 +28,7 @@ __global__ void ShufflePrefixSumGlobalKernel(T* values, size_t len, T* block_pre
template <typename T>
__global__ void ShufflePrefixSumGlobalReduceBlockKernel(T* block_prefix_sum_buffer, int num_blocks) {
__shared__ T shared_mem_buffer[32];
__shared__ T shared_mem_buffer[WARPSIZE];
const int num_blocks_per_thread = (num_blocks + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 2) / (GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1);
int thread_block_start = threadIdx.x == 0 ? 0 : (threadIdx.x - 1) * num_blocks_per_thread;
int thread_block_end = threadIdx.x == 0 ? 0 : min(thread_block_start + num_blocks_per_thread, num_blocks);
......@@ -96,7 +98,7 @@ void BitonicArgSortItemsGlobal(
template <typename T>
__global__ void BlockReduceSum(T* block_buffer, const data_size_t num_blocks) {
__shared__ T shared_buffer[32];
__shared__ T shared_buffer[WARPSIZE];
T thread_sum = 0;
for (data_size_t block_index = static_cast<data_size_t>(threadIdx.x); block_index < num_blocks; block_index += static_cast<data_size_t>(blockDim.x)) {
thread_sum += block_buffer[block_index];
......@@ -109,7 +111,7 @@ __global__ void BlockReduceSum(T* block_buffer, const data_size_t num_blocks) {
template <typename VAL_T, typename REDUCE_T>
__global__ void ShuffleReduceSumGlobalKernel(const VAL_T* values, const data_size_t num_value, REDUCE_T* block_buffer) {
__shared__ REDUCE_T shared_buffer[32];
__shared__ REDUCE_T shared_buffer[WARPSIZE];
const data_size_t data_index = static_cast<data_size_t>(blockIdx.x * blockDim.x + threadIdx.x);
const REDUCE_T value = (data_index < num_value ? static_cast<REDUCE_T>(values[data_index]) : 0.0f);
const REDUCE_T reduce_value = ShuffleReduceSum<REDUCE_T>(value, shared_buffer, blockDim.x);
......@@ -131,7 +133,7 @@ template void ShuffleReduceSumGlobal<double, double>(const double* values, size_
template <typename VAL_T, typename REDUCE_T>
__global__ void ShuffleReduceMinGlobalKernel(const VAL_T* values, const data_size_t num_value, REDUCE_T* block_buffer) {
__shared__ REDUCE_T shared_buffer[32];
__shared__ REDUCE_T shared_buffer[WARPSIZE];
const data_size_t data_index = static_cast<data_size_t>(blockIdx.x * blockDim.x + threadIdx.x);
const REDUCE_T value = (data_index < num_value ? static_cast<REDUCE_T>(values[data_index]) : 0.0f);
const REDUCE_T reduce_value = ShuffleReduceMin<REDUCE_T>(value, shared_buffer, blockDim.x);
......@@ -142,7 +144,7 @@ __global__ void ShuffleReduceMinGlobalKernel(const VAL_T* values, const data_siz
template <typename T>
__global__ void ShuffleBlockReduceMin(T* block_buffer, const data_size_t num_blocks) {
__shared__ T shared_buffer[32];
__shared__ T shared_buffer[WARPSIZE];
T thread_min = 0;
for (data_size_t block_index = static_cast<data_size_t>(threadIdx.x); block_index < num_blocks; block_index += static_cast<data_size_t>(blockDim.x)) {
const T value = block_buffer[block_index];
......@@ -168,7 +170,7 @@ template void ShuffleReduceMinGlobal<label_t, double>(const label_t* values, siz
template <typename VAL_T, typename REDUCE_T>
__global__ void ShuffleReduceDotProdGlobalKernel(const VAL_T* values1, const VAL_T* values2, const data_size_t num_value, REDUCE_T* block_buffer) {
__shared__ REDUCE_T shared_buffer[32];
__shared__ REDUCE_T shared_buffer[WARPSIZE];
const data_size_t data_index = static_cast<data_size_t>(blockIdx.x * blockDim.x + threadIdx.x);
const REDUCE_T value1 = (data_index < num_value ? static_cast<REDUCE_T>(values1[data_index]) : 0.0f);
const REDUCE_T value2 = (data_index < num_value ? static_cast<REDUCE_T>(values2[data_index]) : 0.0f);
......@@ -191,7 +193,7 @@ template void ShuffleReduceDotProdGlobal<label_t, double>(const label_t* values1
template <typename INDEX_T, typename VAL_T, typename REDUCE_T>
__global__ void GlobalInclusiveArgPrefixSumKernel(
const INDEX_T* sorted_indices, const VAL_T* in_values, REDUCE_T* out_values, REDUCE_T* block_buffer, data_size_t num_data) {
__shared__ REDUCE_T shared_buffer[32];
__shared__ REDUCE_T shared_buffer[WARPSIZE];
const data_size_t data_index = static_cast<data_size_t>(threadIdx.x + blockIdx.x * blockDim.x);
REDUCE_T value = static_cast<REDUCE_T>(data_index < num_data ? in_values[sorted_indices[data_index]] : 0);
__syncthreads();
......@@ -206,7 +208,7 @@ __global__ void GlobalInclusiveArgPrefixSumKernel(
template <typename T>
__global__ void GlobalInclusivePrefixSumReduceBlockKernel(T* block_buffer, data_size_t num_blocks) {
__shared__ T shared_buffer[32];
__shared__ T shared_buffer[WARPSIZE];
T thread_sum = 0;
const data_size_t num_blocks_per_thread = (num_blocks + static_cast<data_size_t>(blockDim.x)) / static_cast<data_size_t>(blockDim.x);
const data_size_t thread_start_block_index = static_cast<data_size_t>(threadIdx.x) * num_blocks_per_thread;
......@@ -441,72 +443,6 @@ void BitonicArgSortGlobal<data_size_t, int, true>(const data_size_t* values, int
BitonicArgSortGlobalHelper<data_size_t, int, true>(values, indices, len);
}
template <typename VAL_T, typename INDEX_T, typename WEIGHT_T, typename REDUCE_WEIGHT_T, bool ASCENDING, bool USE_WEIGHT>
__device__ VAL_T PercentileDevice(const VAL_T* values,
const WEIGHT_T* weights,
INDEX_T* indices,
REDUCE_WEIGHT_T* weights_prefix_sum,
const double alpha,
const INDEX_T len) {
if (len <= 1) {
return values[0];
}
if (!USE_WEIGHT) {
BitonicArgSortDevice<VAL_T, INDEX_T, ASCENDING, BITONIC_SORT_NUM_ELEMENTS / 2, 10>(values, indices, len);
const double float_pos = (1.0f - alpha) * len;
const INDEX_T pos = static_cast<INDEX_T>(float_pos);
if (pos < 1) {
return values[indices[0]];
} else if (pos >= len) {
return values[indices[len - 1]];
} else {
const double bias = float_pos - pos;
const VAL_T v1 = values[indices[pos - 1]];
const VAL_T v2 = values[indices[pos]];
return static_cast<VAL_T>(v1 - (v1 - v2) * bias);
}
} else {
BitonicArgSortDevice<VAL_T, INDEX_T, ASCENDING, BITONIC_SORT_NUM_ELEMENTS / 4, 9>(values, indices, len);
ShuffleSortedPrefixSumDevice<WEIGHT_T, REDUCE_WEIGHT_T, INDEX_T>(weights, indices, weights_prefix_sum, len);
const REDUCE_WEIGHT_T threshold = weights_prefix_sum[len - 1] * (1.0f - alpha);
__shared__ INDEX_T pos;
if (threadIdx.x == 0) {
pos = len;
}
__syncthreads();
for (INDEX_T index = static_cast<INDEX_T>(threadIdx.x); index < len; index += static_cast<INDEX_T>(blockDim.x)) {
if (weights_prefix_sum[index] > threshold && (index == 0 || weights_prefix_sum[index - 1] <= threshold)) {
pos = index;
}
}
__syncthreads();
pos = min(pos, len - 1);
if (pos == 0 || pos == len - 1) {
return values[pos];
}
const VAL_T v1 = values[indices[pos - 1]];
const VAL_T v2 = values[indices[pos]];
return static_cast<VAL_T>(v1 - (v1 - v2) * (threshold - weights_prefix_sum[pos - 1]) / (weights_prefix_sum[pos] - weights_prefix_sum[pos - 1]));
}
}
template __device__ double PercentileDevice<double, data_size_t, label_t, double, false, true>(
const double* values,
const label_t* weights,
data_size_t* indices,
double* weights_prefix_sum,
const double alpha,
const data_size_t len);
template __device__ double PercentileDevice<double, data_size_t, label_t, double, false, false>(
const double* values,
const label_t* weights,
data_size_t* indices,
double* weights_prefix_sum,
const double alpha,
const data_size_t len);
} // namespace LightGBM
#endif // USE_CUDA
......@@ -2,11 +2,13 @@
* Copyright (c) 2022 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for
* license information.
* Modifications Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
*/
#ifdef USE_CUDA
#include <LightGBM/cuda/cuda_algorithms.hpp>
#include <LightGBM/cuda/cuda_rocm_interop.h>
#include "cuda_binary_metric.hpp"
#include "cuda_pointwise_metric.hpp"
......@@ -17,7 +19,7 @@ namespace LightGBM {
template <typename CUDA_METRIC, bool USE_WEIGHTS>
__global__ void EvalKernel(const data_size_t num_data, const label_t* labels, const label_t* weights,
const double* scores, double* reduce_block_buffer, const double param) {
__shared__ double shared_mem_buffer[32];
__shared__ double shared_mem_buffer[WARPSIZE];
const data_size_t index = static_cast<data_size_t>(threadIdx.x + blockIdx.x * blockDim.x);
double point_metric = 0.0;
if (index < num_data) {
......
......@@ -2,20 +2,23 @@
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for
* license information.
* Modifications Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
*/
#ifdef USE_CUDA
#include <algorithm>
#include "cuda_binary_objective.hpp"
#include <LightGBM/cuda/cuda_rocm_interop.h>
#include <algorithm>
namespace LightGBM {
template <bool USE_WEIGHT>
__global__ void BoostFromScoreKernel_1_BinaryLogloss(const label_t* cuda_labels, const data_size_t num_data, double* out_cuda_sum_labels,
double* out_cuda_sum_weights, const label_t* cuda_weights) {
__shared__ double shared_buffer[32];
__shared__ double shared_buffer[WARPSIZE];
const uint32_t mask = 0xffffffff;
const uint32_t warpLane = threadIdx.x % warpSize;
const uint32_t warpID = threadIdx.x / warpSize;
......
......@@ -2,6 +2,7 @@
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for
* license information.
* Modifications Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
*/
#ifdef USE_CUDA
......@@ -307,6 +308,9 @@ __global__ void GetGradientsKernel_LambdarankNDCG_Sorted(
void CUDALambdarankNDCG::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const {
const int num_blocks = (num_queries_ + NUM_QUERY_PER_BLOCK - 1) / NUM_QUERY_PER_BLOCK;
const data_size_t num_rank_label = static_cast<int>(label_gain_.size());
const int device_index = GetCUDADevice(__FILE__, __LINE__);
cudaDeviceProp device_prop;
CUDASUCCESS_OR_FATAL(cudaGetDeviceProperties(&device_prop, device_index));
#define GetGradientsKernel_LambdarankNDCG_ARGS \
score, cuda_labels_, num_data_, \
......@@ -321,7 +325,7 @@ void CUDALambdarankNDCG::LaunchGetGradientsKernel(const double* score, score_t*
gradients, hessians
if (max_items_in_query_aligned_ <= 1024) {
if (num_rank_label <= 32) {
if (num_rank_label <= 32 && device_prop.warpSize == 32) {
GetGradientsKernel_LambdarankNDCG<false, 32><<<num_blocks, max_items_in_query_aligned_>>>(GetGradientsKernel_LambdarankNDCG_ARGS);
} else if (num_rank_label <= 64) {
GetGradientsKernel_LambdarankNDCG<false, 64><<<num_blocks, max_items_in_query_aligned_>>>(GetGradientsKernel_LambdarankNDCG_ARGS);
......@@ -337,7 +341,7 @@ void CUDALambdarankNDCG::LaunchGetGradientsKernel(const double* score, score_t*
GetGradientsKernel_LambdarankNDCG<false, 2048><<<num_blocks, max_items_in_query_aligned_>>>(GetGradientsKernel_LambdarankNDCG_ARGS);
}
} else if (max_items_in_query_aligned_ <= 2048) {
if (num_rank_label <= 32) {
if (num_rank_label <= 32 && device_prop.warpSize == 32) {
GetGradientsKernel_LambdarankNDCG<true, 32><<<num_blocks, 1024>>>(GetGradientsKernel_LambdarankNDCG_ARGS);
} else if (num_rank_label <= 64) {
GetGradientsKernel_LambdarankNDCG<true, 64><<<num_blocks, 1024>>>(GetGradientsKernel_LambdarankNDCG_ARGS);
......@@ -354,7 +358,7 @@ void CUDALambdarankNDCG::LaunchGetGradientsKernel(const double* score, score_t*
}
} else {
BitonicArgSortItemsGlobal(score, num_queries_, cuda_query_boundaries_, cuda_item_indices_buffer_.RawData());
if (num_rank_label <= 32) {
if (num_rank_label <= 32 && device_prop.warpSize == 32) {
GetGradientsKernel_LambdarankNDCG_Sorted<32><<<num_blocks, 1024>>>(GetGradientsKernel_LambdarankNDCG_Sorted_ARGS);
} else if (num_rank_label <= 64) {
GetGradientsKernel_LambdarankNDCG_Sorted<64><<<num_blocks, 1024>>>(GetGradientsKernel_LambdarankNDCG_Sorted_ARGS);
......@@ -405,7 +409,7 @@ __global__ void GetGradientsKernel_RankXENDCG_SharedMemory(
const data_size_t block_reduce_size = query_item_count >= 1024 ? 1024 : query_item_count;
__shared__ double shared_rho[SHARED_MEMORY_SIZE];
// one shared slot per warp; warpSize is 32 on CUDA, 32 or 64 on ROCm
__shared__ double shared_buffer[32];
__shared__ double shared_buffer[1024 / WARPSIZE];
__shared__ double shared_params[SHARED_MEMORY_SIZE];
__shared__ score_t shared_lambdas[SHARED_MEMORY_SIZE];
__shared__ double reduce_result;
......@@ -523,7 +527,7 @@ __global__ void GetGradientsKernel_RankXENDCG_GlobalMemory(
double* cuda_params_buffer_pointer = cuda_params_buffer + item_index_start;
const data_size_t block_reduce_size = query_item_count > 1024 ? 1024 : query_item_count;
// one shared slot per warp, so the buffer size is 1024 / WARPSIZE
__shared__ double shared_buffer[32];
__shared__ double shared_buffer[1024 / WARPSIZE];
__shared__ double reduce_result;
if (query_item_count <= 1) {
for (data_size_t i = 0; i <= query_item_count; ++i) {
......
......@@ -2,14 +2,17 @@
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for
* license information.
* Modifications Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
*/
#ifdef USE_CUDA
#include <algorithm>
#include "cuda_best_split_finder.hpp"
#include <LightGBM/cuda/cuda_algorithms.hpp>
#include "cuda_best_split_finder.hpp"
#include <LightGBM/cuda/cuda_rocm_interop.h>
#include <algorithm>
namespace LightGBM {
......@@ -161,9 +164,9 @@ __device__ void FindBestSplitsForLeafKernelInner(
}
}
__shared__ uint32_t best_thread_index;
__shared__ double shared_double_buffer[32];
__shared__ bool shared_bool_buffer[32];
__shared__ uint32_t shared_int_buffer[32];
__shared__ double shared_double_buffer[WARPSIZE];
__shared__ bool shared_bool_buffer[WARPSIZE];
__shared__ uint32_t shared_int_buffer[WARPSIZE];
const unsigned int threadIdx_x = threadIdx.x;
const bool skip_sum = REVERSE ?
(task->skip_default_bin && (task->num_bin - 1 - threadIdx_x) == static_cast<int>(task->default_bin)) :
......@@ -361,9 +364,9 @@ __device__ void FindBestSplitsDiscretizedForLeafKernelInner(
}
}
__shared__ uint32_t best_thread_index;
__shared__ double shared_double_buffer[32];
__shared__ bool shared_bool_buffer[32];
__shared__ uint32_t shared_int_buffer[64];
__shared__ double shared_double_buffer[WARPSIZE];
__shared__ bool shared_bool_buffer[WARPSIZE];
__shared__ uint32_t shared_int_buffer[2 * WARPSIZE]; // need 2 * WARPSIZE since the actual ACC_HIST_TYPE could be long int
const unsigned int threadIdx_x = threadIdx.x;
const bool skip_sum = REVERSE ?
(task->skip_default_bin && (task->num_bin - 1 - threadIdx_x) == static_cast<int>(task->default_bin)) :
......@@ -515,9 +518,9 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner(
const double parent_output,
// output parameters
CUDASplitInfo* cuda_best_split_info) {
__shared__ double shared_gain_buffer[32];
__shared__ bool shared_found_buffer[32];
__shared__ uint32_t shared_thread_index_buffer[32];
__shared__ double shared_gain_buffer[WARPSIZE];
__shared__ bool shared_found_buffer[WARPSIZE];
__shared__ uint32_t shared_thread_index_buffer[WARPSIZE];
__shared__ uint32_t best_thread_index;
const double cnt_factor = num_data / sum_hessians;
const double min_gain_shift = parent_gain + min_gain_to_split;
......@@ -605,8 +608,8 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner(
} else {
__shared__ double shared_value_buffer[NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER];
__shared__ int16_t shared_index_buffer[NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER];
__shared__ uint16_t shared_mem_buffer_uint16[32];
__shared__ double shared_mem_buffer_double[32];
__shared__ uint16_t shared_mem_buffer_uint16[WARPSIZE];
__shared__ double shared_mem_buffer_double[WARPSIZE];
__shared__ int used_bin;
l2 += cat_l2;
uint16_t is_valid_bin = 0;
......@@ -1086,9 +1089,9 @@ __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory(
}
}
__shared__ uint32_t best_thread_index;
__shared__ double shared_double_buffer[32];
__shared__ bool shared_found_buffer[32];
__shared__ uint32_t shared_thread_index_buffer[32];
__shared__ double shared_double_buffer[WARPSIZE];
__shared__ bool shared_found_buffer[WARPSIZE];
__shared__ uint32_t shared_thread_index_buffer[WARPSIZE];
const unsigned int threadIdx_x = threadIdx.x;
const uint32_t feature_num_bin_minus_offset = task->num_bin - task->mfb_offset;
if (!REVERSE) {
......@@ -1301,9 +1304,9 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory(
data_size_t* hist_index_buffer_ptr,
// output parameters
CUDASplitInfo* cuda_best_split_info) {
__shared__ double shared_gain_buffer[32];
__shared__ bool shared_found_buffer[32];
__shared__ uint32_t shared_thread_index_buffer[32];
__shared__ double shared_gain_buffer[WARPSIZE];
__shared__ bool shared_found_buffer[WARPSIZE];
__shared__ uint32_t shared_thread_index_buffer[WARPSIZE];
__shared__ uint32_t best_thread_index;
const double cnt_factor = num_data / sum_hessians;
const double min_gain_shift = parent_gain + min_gain_to_split;
......@@ -1390,7 +1393,7 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory(
sum_right_hessian, lambda_l1, l2, right_output);
}
} else {
__shared__ uint16_t shared_mem_buffer_uint16[32];
__shared__ uint16_t shared_mem_buffer_uint16[WARPSIZE];
__shared__ int used_bin;
l2 += cat_l2;
uint16_t is_valid_bin = 0;
......@@ -1927,9 +1930,9 @@ __global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, const i
const int num_blocks_per_leaf,
const bool larger_only,
const int num_leaves) {
__shared__ double shared_gain_buffer[32];
__shared__ bool shared_found_buffer[32];
__shared__ uint32_t shared_thread_index_buffer[32];
__shared__ double shared_gain_buffer[WARPSIZE];
__shared__ bool shared_found_buffer[WARPSIZE];
__shared__ uint32_t shared_thread_index_buffer[WARPSIZE];
const uint32_t threadIdx_x = threadIdx.x;
const uint32_t blockIdx_x = blockIdx.x;
......@@ -2113,8 +2116,8 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel(
__global__ void FindBestFromAllSplitsKernel(const int cur_num_leaves,
CUDASplitInfo* cuda_leaf_best_split_info,
int* cuda_best_split_info_buffer) {
__shared__ double gain_shared_buffer[32];
__shared__ int leaf_index_shared_buffer[32];
__shared__ double gain_shared_buffer[WARPSIZE];
__shared__ int leaf_index_shared_buffer[WARPSIZE];
double thread_best_gain = kMinScore;
int thread_best_leaf_index = -1;
const int threadIdx_x = static_cast<int>(threadIdx.x);
......
......@@ -2,6 +2,7 @@
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for
* license information.
* Modifications Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
*/
#ifdef USE_CUDA
......@@ -9,6 +10,7 @@
#include "cuda_data_partition.hpp"
#include <LightGBM/cuda/cuda_algorithms.hpp>
#include <LightGBM/cuda/cuda_rocm_interop.h>
#include <LightGBM/tree.h>
#include <algorithm>
......@@ -290,7 +292,7 @@ __global__ void GenDataToLeftBitVectorKernel(
uint16_t* block_to_left_offset,
data_size_t* block_to_left_offset_buffer,
data_size_t* block_to_right_offset_buffer) {
__shared__ uint16_t shared_mem_buffer[32];
__shared__ uint16_t shared_mem_buffer[WARPSIZE];
uint16_t thread_to_left_offset_cnt = 0;
const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x;
if (local_data_index < num_data_in_leaf) {
......@@ -585,7 +587,7 @@ __global__ void GenDataToLeftBitVectorKernel_Categorical(
const uint8_t split_default_to_left,
uint16_t* block_to_left_offset,
data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer) {
__shared__ uint16_t shared_mem_buffer[32];
__shared__ uint16_t shared_mem_buffer[WARPSIZE];
uint16_t thread_to_left_offset_cnt = 0;
const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x;
if (local_data_index < num_data_in_leaf) {
......@@ -683,7 +685,7 @@ __global__ void AggregateBlockOffsetKernel0(
data_size_t* block_to_right_offset_buffer, data_size_t* cuda_leaf_data_start,
data_size_t* cuda_leaf_data_end, data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices,
const data_size_t num_blocks) {
__shared__ uint32_t shared_mem_buffer[32];
__shared__ uint32_t shared_mem_buffer[WARPSIZE];
__shared__ uint32_t to_left_total_count;
const data_size_t num_data_in_leaf = cuda_leaf_num_data[left_leaf_index];
const unsigned int blockDim_x = blockDim.x;
......@@ -747,7 +749,7 @@ __global__ void AggregateBlockOffsetKernel1(
data_size_t* block_to_right_offset_buffer, data_size_t* cuda_leaf_data_start,
data_size_t* cuda_leaf_data_end, data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices,
const data_size_t num_blocks) {
__shared__ uint32_t shared_mem_buffer[32];
__shared__ uint32_t shared_mem_buffer[WARPSIZE];
__shared__ uint32_t to_left_total_count;
const data_size_t num_data_in_leaf = cuda_leaf_num_data[left_leaf_index];
const unsigned int threadIdx_x = threadIdx.x;
......@@ -1078,7 +1080,7 @@ __global__ void RenewDiscretizedTreeLeavesKernel(
double* leaf_grad_stat_buffer,
double* leaf_hess_stat_buffer,
double* leaf_values) {
__shared__ double shared_mem_buffer[32];
__shared__ double shared_mem_buffer[WARPSIZE];
const int leaf_index = static_cast<int>(blockIdx.x);
const data_size_t* data_indices_in_leaf = data_indices + leaf_data_start[leaf_index];
const data_size_t num_data_in_leaf = leaf_num_data[leaf_index];
......
......@@ -22,7 +22,7 @@ __global__ void ReduceMinMaxKernel(
score_t* grad_max_block_buffer,
score_t* hess_min_block_buffer,
score_t* hess_max_block_buffer) {
__shared__ score_t shared_mem_buffer[32];
__shared__ score_t shared_mem_buffer[WARPSIZE];
const data_size_t index = static_cast<data_size_t>(threadIdx.x + blockIdx.x * blockDim.x);
score_t grad_max_val = kMinScore;
score_t grad_min_val = kMaxScore;
......@@ -56,7 +56,7 @@ __global__ void ReduceBlockMinMaxKernel(
score_t* grad_max_block_buffer,
score_t* hess_min_block_buffer,
score_t* hess_max_block_buffer) {
__shared__ score_t shared_mem_buffer[32];
__shared__ score_t shared_mem_buffer[WARPSIZE];
score_t grad_max_val = kMinScore;
score_t grad_min_val = kMaxScore;
score_t hess_max_val = kMinScore;
......
......@@ -2,6 +2,7 @@
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for
* license information.
* Modifications Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
*/
#ifdef USE_CUDA
......@@ -9,6 +10,7 @@
#include "cuda_histogram_constructor.hpp"
#include <LightGBM/cuda/cuda_algorithms.hpp>
#include <LightGBM/cuda/cuda_rocm_interop.h>
#include <algorithm>
......@@ -742,7 +744,7 @@ __global__ void FixHistogramKernel(
const int* cuda_need_fix_histogram_features,
const uint32_t* cuda_need_fix_histogram_features_num_bin_aligned,
const CUDALeafSplitsStruct* cuda_smaller_leaf_splits) {
__shared__ hist_t shared_mem_buffer[32];
__shared__ hist_t shared_mem_buffer[WARPSIZE];
const unsigned int blockIdx_x = blockIdx.x;
const int feature_index = cuda_need_fix_histogram_features[blockIdx_x];
const uint32_t num_bin_aligned = cuda_need_fix_histogram_features_num_bin_aligned[blockIdx_x];
......@@ -833,7 +835,7 @@ __global__ void FixHistogramDiscretizedKernel(
const int* cuda_need_fix_histogram_features,
const uint32_t* cuda_need_fix_histogram_features_num_bin_aligned,
const CUDALeafSplitsStruct* cuda_smaller_leaf_splits) {
__shared__ int64_t shared_mem_buffer[32];
__shared__ int64_t shared_mem_buffer[WARPSIZE];
const unsigned int blockIdx_x = blockIdx.x;
const int feature_index = cuda_need_fix_histogram_features[blockIdx_x];
const uint32_t num_bin_aligned = cuda_need_fix_histogram_features_num_bin_aligned[blockIdx_x];
......
......@@ -2,6 +2,7 @@
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for
* license information.
* Modifications Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
*/
......@@ -9,6 +10,7 @@
#include "cuda_leaf_splits.hpp"
#include <LightGBM/cuda/cuda_algorithms.hpp>
#include <LightGBM/cuda/cuda_rocm_interop.h>
namespace LightGBM {
......@@ -16,7 +18,7 @@ template <bool USE_INDICES>
__global__ void CUDAInitValuesKernel1(const score_t* cuda_gradients, const score_t* cuda_hessians,
const data_size_t num_data, const data_size_t* cuda_bagging_data_indices,
double* cuda_sum_of_gradients, double* cuda_sum_of_hessians) {
__shared__ double shared_mem_buffer[32];
__shared__ double shared_mem_buffer[WARPSIZE];
const data_size_t data_index = static_cast<data_size_t>(threadIdx.x + blockIdx.x * blockDim.x);
double gradient = 0.0f;
double hessian = 0.0f;
......@@ -43,7 +45,7 @@ __global__ void CUDAInitValuesKernel2(
const data_size_t* cuda_data_indices_in_leaf,
hist_t* cuda_hist_in_leaf,
CUDALeafSplitsStruct* cuda_struct) {
__shared__ double shared_mem_buffer[32];
__shared__ double shared_mem_buffer[WARPSIZE];
double thread_sum_of_gradients = 0.0f;
double thread_sum_of_hessians = 0.0f;
for (int block_index = static_cast<int>(threadIdx.x); block_index < num_blocks_to_reduce; block_index += static_cast<int>(blockDim.x)) {
......@@ -88,7 +90,7 @@ __global__ void CUDAInitValuesKernel3(const int16_t* cuda_gradients_and_hessians
const score_t* grad_scale_pointer, const score_t* hess_scale_pointer) {
const score_t grad_scale = *grad_scale_pointer;
const score_t hess_scale = *hess_scale_pointer;
__shared__ int64_t shared_mem_buffer[32];
__shared__ int64_t shared_mem_buffer[WARPSIZE];
const data_size_t data_index = static_cast<data_size_t>(threadIdx.x + blockIdx.x * blockDim.x);
int64_t int_gradient = 0;
int64_t int_hessian = 0;
......@@ -119,7 +121,7 @@ __global__ void CUDAInitValuesKernel4(
const data_size_t* cuda_data_indices_in_leaf,
hist_t* cuda_hist_in_leaf,
CUDALeafSplitsStruct* cuda_struct) {
__shared__ double shared_mem_buffer[32];
__shared__ double shared_mem_buffer[WARPSIZE];
double thread_sum_of_gradients = 0.0f;
double thread_sum_of_hessians = 0.0f;
int64_t thread_sum_of_gradients_hessians = 0;
......
......@@ -2,14 +2,16 @@
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for
* license information.
* Modifications Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
*/
#ifdef USE_CUDA
#include <LightGBM/cuda/cuda_algorithms.hpp>
#include "cuda_single_gpu_tree_learner.hpp"
#include <LightGBM/cuda/cuda_algorithms.hpp>
#include <LightGBM/cuda/cuda_rocm_interop.h>
#include <algorithm>
namespace LightGBM {
......@@ -167,7 +169,7 @@ void CUDASingleGPUTreeLearner::LaunchReduceLeafStatKernel(
template <typename T, bool IS_INNER>
__global__ void CalcBitsetLenKernel(const CUDASplitInfo* best_split_info, size_t* out_len_buffer) {
__shared__ size_t shared_mem_buffer[32];
__shared__ size_t shared_mem_buffer[WARPSIZE];
const T* vals = nullptr;
if (IS_INNER) {
vals = reinterpret_cast<const T*>(best_split_info->cat_threshold);
......@@ -178,7 +180,7 @@ __global__ void CalcBitsetLenKernel(const CUDASplitInfo* best_split_info, size_t
size_t len = 0;
if (i < best_split_info->num_cat_threshold) {
const T val = vals[i];
len = (val / 32) + 1;
len = (val / WARPSIZE) + 1;
}
const size_t block_max_len = ShuffleReduceMax<size_t>(len, shared_mem_buffer, blockDim.x);
if (threadIdx.x == 0) {
......@@ -187,7 +189,7 @@ __global__ void CalcBitsetLenKernel(const CUDASplitInfo* best_split_info, size_t
}
__global__ void ReduceBlockMaxLen(size_t* out_len_buffer, const int num_blocks) {
__shared__ size_t shared_mem_buffer[32];
__shared__ size_t shared_mem_buffer[WARPSIZE];
size_t max_len = 0;
for (int i = static_cast<int>(threadIdx.x); i < num_blocks; i += static_cast<int>(blockDim.x)) {
max_len = max(out_len_buffer[i], max_len);
......@@ -210,7 +212,7 @@ __global__ void CUDAConstructBitsetKernel(const CUDASplitInfo* best_split_info,
if (i < best_split_info->num_cat_threshold) {
const T val = vals[i];
// can use add instead of or here, because each bit will only be added once
atomicAdd_system(out + (val / 32), (0x1 << (val % 32)));
atomicAdd_system(out + (val / WARPSIZE), (0x1 << (val % WARPSIZE)));
}
}
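The "add instead of or" comment above relies on each threshold value being inserted at most once: since no bit is ever added twice, the addition can never carry into a neighbouring bit and is equivalent to a bitwise OR. A small illustration with hypothetical helper names, using 32-bit bitset words as in the original code:

// Illustrative only: both helpers set bit 'val' of a bitset stored in 32-bit words.
__device__ void SetBitWithOr(uint32_t* bitset, uint32_t val) {
  atomicOr(bitset + (val / 32), 1u << (val % 32));
}
__device__ void SetBitWithAdd(uint32_t* bitset, uint32_t val) {
  // equivalent to the OR form only when this bit has not been set before,
  // which holds here because each categorical threshold value is unique
  atomicAdd(bitset + (val / 32), 1u << (val % 32));
}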
......