Commit 61ec4f1a authored by Jeff Daily

[ROCm] re-add support for ROCm builds

Previously, #6086 added ROCm support, but after numerous rebases it lost
critical changes. This PR restores the ROCm build.

There are many source file changes, but most were automated using the
following script:

```bash
for f in $(grep -rl '#ifdef USE_CUDA')
do
    sed -i 's@#ifdef USE_CUDA@#if defined(USE_CUDA) || defined(USE_ROCM)@g' "$f"
done

for f in $(grep -rl '#endif  // USE_CUDA')
do
    sed -i 's@#endif  // USE_CUDA@#endif  // USE_CUDA || USE_ROCM@g' "$f"
done
```
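
For clarity, the net effect of those two substitutions on a typical guard looks like this (representative before/after, mirroring the changes throughout the diff below):

```cpp
// Before:
//   #ifdef USE_CUDA
//   ...device-only code...
//   #endif  // USE_CUDA

// After:
#if defined(USE_CUDA) || defined(USE_ROCM)
// ...device-only code, now shared by CUDA and ROCm builds...
#endif  // USE_CUDA || USE_ROCM
```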
parent 336a77df
@@ -36,7 +36,7 @@ endif()
project(lightgbm LANGUAGES C CXX)
if(USE_CUDA)
if(USE_CUDA OR USE_ROCM)
set(CMAKE_CXX_STANDARD 17)
elseif(BUILD_CPP_TEST)
set(CMAKE_CXX_STANDARD 14)
@@ -480,10 +480,21 @@ set(
src/cuda/cuda_algorithms.cu
)
if(USE_CUDA)
if(USE_CUDA OR USE_ROCM)
list(APPEND LGBM_SOURCES ${LGBM_CUDA_SOURCES})
endif()
if(USE_ROCM)
set(CU_FILES "")
foreach(file IN LISTS LGBM_CUDA_SOURCES)
string(REGEX MATCH "\\.cu$" is_cu_file "${file}")
if(is_cu_file)
list(APPEND CU_FILES "${file}")
endif()
endforeach()
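# Compile the shared .cu sources with the HIP toolchain in ROCm builds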
set_source_files_properties(${CU_FILES} PROPERTIES LANGUAGE HIP)
endif()
add_library(lightgbm_objs OBJECT ${LGBM_SOURCES})
if(BUILD_CLI)
@@ -632,6 +643,10 @@ if(USE_CUDA)
endif()
endif()
if(USE_ROCM)
target_link_libraries(lightgbm_objs PUBLIC hip::host)
endif()
if(WIN32)
if(MINGW OR CYGWIN)
target_link_libraries(lightgbm_objs PUBLIC ws2_32 iphlpapi)
@@ -600,13 +600,13 @@ class MultiValBin {
virtual MultiValBin* Clone() = 0;
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
virtual const void* GetRowWiseData(uint8_t* bit_type,
size_t* total_size,
bool* is_sparse,
const void** out_data_ptr,
uint8_t* data_ptr_bit_type) const = 0;
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
};
inline uint32_t BinMapper::ValueToBin(double value) const {
@@ -7,10 +7,12 @@
#ifndef LIGHTGBM_CUDA_CUDA_ALGORITHMS_HPP_
#define LIGHTGBM_CUDA_CUDA_ALGORITHMS_HPP_
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
#if defined(USE_CUDA)
#include <cuda.h>
#include <cuda_runtime.h>
#endif
#include <stdio.h>
#include <LightGBM/bin.h>
@@ -619,5 +621,5 @@ __device__ VAL_T PercentileDevice(const VAL_T* values,
} // namespace LightGBM
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
#endif // LIGHTGBM_CUDA_CUDA_ALGORITHMS_HPP_
@@ -3,7 +3,7 @@
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
#ifndef LIGHTGBM_CUDA_CUDA_COLUMN_DATA_HPP_
#define LIGHTGBM_CUDA_CUDA_COLUMN_DATA_HPP_
@@ -139,4 +139,4 @@ class CUDAColumnData {
#endif // LIGHTGBM_CUDA_CUDA_COLUMN_DATA_HPP_
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
@@ -3,7 +3,7 @@
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
#ifndef LIGHTGBM_CUDA_CUDA_METADATA_HPP_
#define LIGHTGBM_CUDA_CUDA_METADATA_HPP_
@@ -55,4 +55,4 @@ class CUDAMetadata {
#endif // LIGHTGBM_CUDA_CUDA_METADATA_HPP_
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
@@ -7,7 +7,7 @@
#ifndef LIGHTGBM_CUDA_CUDA_METRIC_HPP_
#define LIGHTGBM_CUDA_CUDA_METRIC_HPP_
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
#include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/metric.h>
@@ -39,6 +39,6 @@ class CUDAMetricInterface: public HOST_METRIC {
} // namespace LightGBM
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
#endif // LIGHTGBM_CUDA_CUDA_METRIC_HPP_
@@ -7,7 +7,7 @@
#ifndef LIGHTGBM_CUDA_CUDA_OBJECTIVE_FUNCTION_HPP_
#define LIGHTGBM_CUDA_CUDA_OBJECTIVE_FUNCTION_HPP_
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
#include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/objective_function.h>
@@ -81,6 +81,6 @@ class CUDAObjectiveInterface: public HOST_OBJECTIVE {
} // namespace LightGBM
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
#endif // LIGHTGBM_CUDA_CUDA_OBJECTIVE_FUNCTION_HPP_
@@ -5,10 +5,12 @@
#ifndef LIGHTGBM_CUDA_CUDA_RANDOM_HPP_
#define LIGHTGBM_CUDA_CUDA_RANDOM_HPP_
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
#if defined(USE_CUDA)
#include <cuda.h>
#include <cuda_runtime.h>
#endif
namespace LightGBM {
@@ -69,6 +71,6 @@ class CUDARandom {
} // namespace LightGBM
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
#endif // LIGHTGBM_CUDA_CUDA_RANDOM_HPP_
/*!
* Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
*/
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIP__)
// ROCm doesn't have __shfl_down_sync, only __shfl_down without mask.
@@ -12,9 +12,38 @@
#define WARPSIZE warpSize
// ROCm doesn't have atomicAdd_block, but it should be semantically the same as atomicAdd
#define atomicAdd_block atomicAdd
#else
// hipify
#include <hip/hip_runtime.h>
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaFree hipFree
#define cudaFreeHost hipFreeHost
#define cudaGetDevice hipGetDevice
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaGetErrorName hipGetErrorName
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaHostAlloc hipHostAlloc
#define cudaHostAllocPortable hipHostAllocPortable
#define cudaMalloc hipMalloc
#define cudaMemcpy hipMemcpy
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemoryTypeHost hipMemoryTypeHost
#define cudaMemset hipMemset
#define cudaPointerAttributes hipPointerAttribute_t
#define cudaPointerGetAttributes hipPointerGetAttributes
#define cudaSetDevice hipSetDevice
#define cudaStreamCreate hipStreamCreate
#define cudaStreamDestroy hipStreamDestroy
#define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess
#else // __HIP_PLATFORM_AMD__ || __HIP__
// CUDA warpSize is not a constexpr, but always 32
#define WARPSIZE 32
#endif
#endif
#endif // USE_CUDA || USE_ROCM
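
Taken together, these aliases let LightGBM's existing CUDA sources compile unchanged under ROCm. A minimal sketch of the idea, using a hypothetical helper that is not part of this commit:

```cpp
#include <cstdio>
#include <cstddef>

#include <LightGBM/cuda/cuda_utils.hu>  // pulls in the CUDA headers, or the ROCm aliases above

// Allocates a device buffer using CUDA runtime API names; in a ROCm build the
// macros above rewrite these to hipMalloc, hipGetErrorString, and so on.
void* AllocDeviceBuffer(std::size_t n_bytes) {
  void* ptr = nullptr;
  cudaError_t err = cudaMalloc(&ptr, n_bytes);
  if (err != cudaSuccess) {
    std::fprintf(stderr, "device allocation failed: %s\n", cudaGetErrorString(err));
    return nullptr;
  }
  return ptr;
}
```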
@@ -3,7 +3,7 @@
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
#ifndef LIGHTGBM_CUDA_CUDA_ROW_DATA_HPP_
#define LIGHTGBM_CUDA_CUDA_ROW_DATA_HPP_
@@ -177,4 +177,4 @@ class CUDARowData {
} // namespace LightGBM
#endif // LIGHTGBM_CUDA_CUDA_ROW_DATA_HPP_
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
@@ -5,7 +5,7 @@
* Modifications Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
*/
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
#ifndef LIGHTGBM_CUDA_CUDA_SPLIT_INFO_HPP_
#define LIGHTGBM_CUDA_CUDA_SPLIT_INFO_HPP_
@@ -105,4 +105,4 @@ class CUDASplitInfo {
#endif // LIGHTGBM_CUDA_CUDA_SPLIT_INFO_HPP_
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
@@ -3,7 +3,7 @@
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
#ifndef LIGHTGBM_CUDA_CUDA_TREE_HPP_
#define LIGHTGBM_CUDA_CUDA_TREE_HPP_
@@ -170,4 +170,4 @@ class CUDATree : public Tree {
#endif // LIGHTGBM_CUDA_CUDA_TREE_HPP_
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
@@ -6,10 +6,14 @@
#ifndef LIGHTGBM_CUDA_CUDA_UTILS_H_
#define LIGHTGBM_CUDA_CUDA_UTILS_H_
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
#if defined(USE_CUDA)
#include <cuda.h>
#include <cuda_runtime.h>
#else
#include <LightGBM/cuda/cuda_rocm_interop.h>
#endif
#include <stdio.h>
#include <LightGBM/utils/log.h>
@@ -207,6 +211,6 @@ static __device__ T SafeLog(T x) {
} // namespace LightGBM
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
#endif // LIGHTGBM_CUDA_CUDA_UTILS_H_
@@ -8,10 +8,13 @@
#include <LightGBM/utils/common.h>
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
#if defined(USE_CUDA)
#include <cuda.h>
#include <cuda_runtime.h>
#endif
#endif // USE_CUDA
#include <LightGBM/cuda/cuda_utils.hu>
#endif // USE_CUDA || USE_ROCM
#include <stdio.h>
enum LGBM_Device {
@@ -44,7 +47,7 @@ struct CHAllocator {
T* ptr;
if (n == 0) return NULL;
n = SIZE_ALIGNED(n);
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
if (LGBM_config_::current_device == lgbm_device_cuda) {
cudaError_t ret = cudaHostAlloc(reinterpret_cast<void**>(&ptr), n*sizeof(T), cudaHostAllocPortable);
if (ret != cudaSuccess) {
@@ -63,17 +66,17 @@ struct CHAllocator {
void deallocate(T* p, std::size_t n) {
(void)n; // UNUSED
if (p == NULL) return;
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
if (LGBM_config_::current_device == lgbm_device_cuda) {
cudaPointerAttributes attributes;
cudaPointerGetAttributes(&attributes, p);
#if CUDA_VERSION >= 10000
CUDASUCCESS_OR_FATAL(cudaPointerGetAttributes(&attributes, p));
#if CUDA_VERSION >= 10000 || defined(USE_ROCM)
if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) {
cudaFreeHost(p);
CUDASUCCESS_OR_FATAL(cudaFreeHost(p));
}
#else
if ((attributes.memoryType == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) {
cudaFreeHost(p);
CUDASUCCESS_OR_FATAL(cudaFreeHost(p));
}
#endif
} else {
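
For context, CHAllocator is a standard-allocator shim: when the active device is a GPU it serves pinned (page-locked) host memory via cudaHostAlloc with the portable flag, which is what lets later cudaMemcpyAsync transfers overlap with compute; otherwise it falls back to ordinary heap allocation. A hedged usage sketch, with an illustrative element type and size that are not taken from this diff:

```cpp
#include <vector>

// Host-side gradient buffer: pinned through cudaHostAlloc/hipHostAlloc when a
// GPU device is selected, plain heap memory otherwise (illustrative only).
std::vector<float, CHAllocator<float>> gradients(1 << 20);
```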
@@ -318,13 +318,13 @@ class Metadata {
/*! \brief Disable copy */
Metadata(const Metadata&) = delete;
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
CUDAMetadata* cuda_metadata() const { return cuda_metadata_.get(); }
void CreateCUDAMetadata(const int gpu_device_id);
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
private:
/*! \brief Load weights from file */
@@ -391,9 +391,9 @@ class Metadata {
bool position_load_from_file_;
bool query_load_from_file_;
bool init_score_load_from_file_;
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
std::unique_ptr<CUDAMetadata> cuda_metadata_;
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
};
@@ -997,13 +997,13 @@ class Dataset {
return feature_groups_[feature_group_index]->feature_min_bin(sub_feature_index);
}
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
const CUDAColumnData* cuda_column_data() const {
return cuda_column_data_.get();
}
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
private:
void SerializeHeader(BinaryWriter* serializer);
@@ -1062,9 +1062,9 @@ class Dataset {
/*! \brief mutex for thread-safe calls */
std::mutex mutex_;
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
std::unique_ptr<CUDAColumnData> cuda_column_data_;
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
std::string parser_config_str_;
};
@@ -108,7 +108,7 @@ class ObjectiveFunction {
*/
virtual bool IsCUDAObjective() const { return false; }
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
/*!
* \brief Convert output for CUDA version
*/
@@ -118,7 +118,7 @@ class ObjectiveFunction {
virtual bool NeedConvertOutputCUDA () const { return false; }
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
};
} // namespace LightGBM
@@ -38,9 +38,9 @@ class SampleStrategy {
std::vector<data_size_t, Common::AlignmentAllocator<data_size_t, kAlignedSize>>& bag_data_indices() { return bag_data_indices_; }
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
CUDAVector<data_size_t>& cuda_bag_data_indices() { return cuda_bag_data_indices_; }
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
void UpdateObjectiveFunction(const ObjectiveFunction* objective_function) {
objective_function_ = objective_function;
@@ -76,10 +76,10 @@ class SampleStrategy {
/*! \brief whether need to resize the gradient vectors */
bool need_resize_gradients_;
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
/*! \brief Buffer for bag_data_indices_ on GPU, used only with cuda */
CUDAVector<data_size_t> cuda_bag_data_indices_;
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
};
} // namespace LightGBM
@@ -219,7 +219,7 @@ class MultiValBinWrapper {
}
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
const void* GetRowWiseData(
uint8_t* bit_type,
size_t* total_size,
@@ -235,7 +235,7 @@ class MultiValBinWrapper {
return multi_val_bin_->GetRowWiseData(bit_type, total_size, is_sparse, out_data_ptr, data_ptr_bit_type);
}
}
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
private:
bool is_use_subcol_ = false;
@@ -280,9 +280,9 @@ struct TrainingShareStates {
const std::vector<uint32_t>& feature_hist_offsets() const { return feature_hist_offsets_; }
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
const std::vector<uint32_t>& column_hist_offsets() const { return column_hist_offsets_; }
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
bool IsSparseRowwise() {
return (multi_val_bin_wrapper_ != nullptr && multi_val_bin_wrapper_->IsSparse());
@@ -332,7 +332,7 @@
}
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
const void* GetRowWiseData(uint8_t* bit_type,
size_t* total_size,
bool* is_sparse,
@@ -347,13 +347,13 @@
return nullptr;
}
}
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
private:
std::vector<uint32_t> feature_hist_offsets_;
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
std::vector<uint32_t> column_hist_offsets_;
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
int num_hist_total_bin_ = 0;
std::unique_ptr<MultiValBinWrapper> multi_val_bin_wrapper_;
std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>> hist_buf_;
@@ -321,9 +321,9 @@ class Tree {
inline bool is_linear() const { return is_linear_; }
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
inline bool is_cuda_tree() const { return is_cuda_tree_; }
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
inline void SetIsLinear(bool is_linear) {
is_linear_ = is_linear;
@@ -534,10 +534,10 @@ class Tree {
std::vector<std::vector<int>> leaf_features_;
/* \brief features used in leaf linear models; indexing is relative to used_features_ */
std::vector<std::vector<int>> leaf_features_inner_;
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
/*! \brief Marks whether this tree is a CUDATree */
bool is_cuda_tree_;
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
};
inline void Tree::Split(int leaf, int feature, int real_feature,
@@ -105,33 +105,33 @@ class BaggingSampleStrategy : public SampleStrategy {
Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_);
// set bagging data to tree learner
if (!is_use_subset_) {
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
if (config_->device_type == std::string("cuda")) {
CopyFromHostToCUDADevice<data_size_t>(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast<size_t>(num_data_), __FILE__, __LINE__);
tree_learner->SetBaggingData(nullptr, cuda_bag_data_indices_.RawData(), bag_data_cnt_);
} else {
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
tree_learner->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_);
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
}
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
} else {
// get subset
tmp_subset_->ReSize(bag_data_cnt_);
tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(),
bag_data_cnt_, false);
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
if (config_->device_type == std::string("cuda")) {
CopyFromHostToCUDADevice<data_size_t>(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast<size_t>(num_data_), __FILE__, __LINE__);
tree_learner->SetBaggingData(tmp_subset_.get(), cuda_bag_data_indices_.RawData(),
bag_data_cnt_);
} else {
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
tree_learner->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(),
bag_data_cnt_);
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
}
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
}
}
}
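
One subtlety in the hunk above is the brace pairing across preprocessor blocks: the `else {` opened inside the guarded region is closed by a `}` that is itself guarded, so CPU-only builds compile the host call unconditionally. Schematically, with illustrative names:

```cpp
void SetDeviceData();  // device path (illustrative)
void SetHostData();    // host path (illustrative)

void SetBagging(bool use_device) {
#if defined(USE_CUDA) || defined(USE_ROCM)
  if (use_device) {
    SetDeviceData();   // compiled only in CUDA/ROCm builds
  } else {
#endif  // USE_CUDA || USE_ROCM
    SetHostData();     // unconditional in CPU-only builds
#if defined(USE_CUDA) || defined(USE_ROCM)
  }
#endif  // USE_CUDA || USE_ROCM
}
```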
@@ -161,11 +161,11 @@ class BaggingSampleStrategy : public SampleStrategy {
bag_data_cnt_ = static_cast<data_size_t>(config_->bagging_fraction * num_data_);
}
bag_data_indices_.resize(num_data_);
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
if (config_->device_type == std::string("cuda")) {
cuda_bag_data_indices_.Resize(num_data_);
}
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
if (!config_->bagging_by_query) {
bagging_runner_.ReSize(num_data_);
} else {
@@ -206,9 +206,9 @@ class BaggingSampleStrategy : public SampleStrategy {
} else {
bag_data_cnt_ = num_data_;
bag_data_indices_.clear();
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
cuda_bag_data_indices_.Clear();
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
bagging_runner_.ReSize(0);
is_use_subset_ = false;
}