/*!
 * Copyright (c) 2021 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for
 * license information.
 */
#ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_
#define LIGHTGBM_TREELEARNER_CUDA_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_

#ifdef USE_CUDA_EXP

// NOTE(review): the bracketed include targets were missing from the reviewed
// text. <memory> and <vector> follow from the std::unique_ptr / std::vector
// members below, and <cstdint> is added for uint32_t. The three LightGBM paths
// are a best-effort restoration -- confirm against the repository.
#include <LightGBM/cuda/cuda_row_data.hpp>
#include <LightGBM/feature_group.h>
#include <LightGBM/tree.h>

#include <cstdint>
#include <memory>
#include <vector>

#include "cuda_leaf_splits.hpp"

// Compile-time constants. Nothing in this header consumes them; presumably they
// are read by the kernels and launchers defined out of line -- TODO confirm.
#define NUM_DATA_PER_THREAD (400)
// Spelling ("THRADS") is kept as-is: renaming would break any code that
// already refers to this macro.
#define NUM_THRADS_PER_BLOCK (504)
#define NUM_FEATURE_PER_THREAD_GROUP (28)
#define SUBTRACT_BLOCK_SIZE (1024)
#define FIX_HISTOGRAM_SHARED_MEM_SIZE (1024)
#define FIX_HISTOGRAM_BLOCK_SIZE (512)
#define USED_HISTOGRAM_BUFFER_NUM (8)

namespace LightGBM {

/*!
 * \brief Holds the host-side metadata and the device buffers (see the
 *        "held by this object" section below) used for histogram construction.
 *
 * Only cuda_hist() and cuda_hist_pointer() are defined inline; every other
 * member function is declared here and defined elsewhere.
 */
class CUDAHistogramConstructor {
 public:
  /*!
   * \param train_data training dataset
   * \param num_leaves maximum number of leaves
   * \param num_threads number of threads
   * \param feature_hist_offsets offsets in histogram of all features
   * \param min_data_in_leaf minimum number of data allowed per leaf
   * \param min_sum_hessian_in_leaf minimum sum value of hessians allowed per leaf
   * \param gpu_device_id GPU device index
   * \param gpu_use_dp use double precision histogram per block
   */
  CUDAHistogramConstructor(
    const Dataset* train_data,
    const int num_leaves,
    const int num_threads,
    const std::vector<uint32_t>& feature_hist_offsets,
    const int min_data_in_leaf,
    const double min_sum_hessian_in_leaf,
    const int gpu_device_id,
    const bool gpu_use_dp);

  ~CUDAHistogramConstructor();

  void Init(const Dataset* train_data, TrainingShareStates* share_state);

  void ConstructHistogramForLeaf(
    const CUDALeafSplitsStruct* cuda_smaller_leaf_splits,
    const CUDALeafSplitsStruct* cuda_larger_leaf_splits,
    const data_size_t num_data_in_smaller_leaf,
    const data_size_t num_data_in_larger_leaf,
    const double sum_hessians_in_smaller_leaf,
    const double sum_hessians_in_larger_leaf);

  void ResetTrainingData(const Dataset* train_data, TrainingShareStates* share_states);

  void ResetConfig(const Config* config);

  void BeforeTrain(const score_t* gradients, const score_t* hessians);

  /*! \brief read-only access to cuda_hist_ */
  const hist_t* cuda_hist() const { return cuda_hist_; }

  /*! \brief mutable access to cuda_hist_ */
  hist_t* cuda_hist_pointer() { return cuda_hist_; }

 private:
  void InitFeatureMetaInfo(const Dataset* train_data, const std::vector<uint32_t>& feature_hist_offsets);

  void CalcConstructHistogramKernelDim(
    int* grid_dim_x,
    int* grid_dim_y,
    int* block_dim_x,
    int* block_dim_y,
    const data_size_t num_data_in_smaller_leaf);

  // NOTE(review): the template parameter lists of the four member templates
  // below were missing from the reviewed text and have been restored on a
  // best-effort basis. They must match the out-of-line definitions exactly --
  // confirm before merging.
  template <typename HIST_TYPE, int SHARED_HIST_SIZE>
  void LaunchConstructHistogramKernelInner(
    const CUDALeafSplitsStruct* cuda_smaller_leaf_splits,
    const data_size_t num_data_in_smaller_leaf);

  template <typename HIST_TYPE, int SHARED_HIST_SIZE, typename BIN_TYPE>
  void LaunchConstructHistogramKernelInner0(
    const CUDALeafSplitsStruct* cuda_smaller_leaf_splits,
    const data_size_t num_data_in_smaller_leaf);

  template <typename HIST_TYPE, int SHARED_HIST_SIZE, typename BIN_TYPE, typename PTR_TYPE>
  void LaunchConstructHistogramKernelInner1(
    const CUDALeafSplitsStruct* cuda_smaller_leaf_splits,
    const data_size_t num_data_in_smaller_leaf);

  template <typename HIST_TYPE, int SHARED_HIST_SIZE, typename BIN_TYPE, typename PTR_TYPE, bool USE_GLOBAL_MEM_BUFFER>
  void LaunchConstructHistogramKernelInner2(
    const CUDALeafSplitsStruct* cuda_smaller_leaf_splits,
    const data_size_t num_data_in_smaller_leaf);

  void LaunchConstructHistogramKernel(
    const CUDALeafSplitsStruct* cuda_smaller_leaf_splits,
    const data_size_t num_data_in_smaller_leaf);

  void LaunchSubtractHistogramKernel(
    const CUDALeafSplitsStruct* cuda_smaller_leaf_splits,
    const CUDALeafSplitsStruct* cuda_larger_leaf_splits);

  // Host memory

  // Element types of the vectors in this section mirror the matching device
  // pointers declared further down (uint32_t* / int*).

  /*! \brief size of training data */
  data_size_t num_data_;
  /*! \brief number of features in training data */
  int num_features_;
  /*! \brief maximum number of leaves */
  int num_leaves_;
  /*! \brief number of threads */
  int num_threads_;
  /*! \brief total number of bins in histogram */
  int num_total_bin_;
  /*! \brief number of bins per feature */
  std::vector<uint32_t> feature_num_bins_;
  /*! \brief offsets in histogram of all features */
  std::vector<uint32_t> feature_hist_offsets_;
  /*! \brief most frequent bins in each feature */
  std::vector<uint32_t> feature_most_freq_bins_;
  /*! \brief minimum number of data allowed per leaf */
  int min_data_in_leaf_;
  /*! \brief minimum sum value of hessians allowed per leaf */
  double min_sum_hessian_in_leaf_;
  /*! \brief cuda stream for histogram construction */
  cudaStream_t cuda_stream_;
  /*! \brief indices of feature whose histograms need to be fixed */
  std::vector<int> need_fix_histogram_features_;
  /*! \brief aligned number of bins of the features whose histograms need to be fixed */
  // Spelling ("aligend") is kept as-is: the member is referenced by the
  // out-of-line member function definitions.
  std::vector<uint32_t> need_fix_histogram_features_num_bin_aligend_;
  /*! \brief minimum number of blocks allowed in the y dimension */
  const int min_grid_dim_y_ = 160;

  // CUDA memory, held by this object

  // Raw pointers start out null so they hold a well-defined value until they
  // are assigned.

  /*! \brief CUDA row wise data */
  // NOTE(review): the pointee type was missing from the reviewed text;
  // CUDARowData is a best-effort restoration -- confirm.
  std::unique_ptr<CUDARowData> cuda_row_data_;
  /*! \brief number of bins per feature */
  uint32_t* cuda_feature_num_bins_ = nullptr;
  /*! \brief offsets in histogram of all features */
  uint32_t* cuda_feature_hist_offsets_ = nullptr;
  /*! \brief most frequent bins in each feature */
  uint32_t* cuda_feature_most_freq_bins_ = nullptr;
  /*! \brief CUDA histograms */
  hist_t* cuda_hist_ = nullptr;
  /*! \brief CUDA histograms buffer for each block */
  float* cuda_hist_buffer_ = nullptr;
  /*! \brief indices of feature whose histograms need to be fixed */
  int* cuda_need_fix_histogram_features_ = nullptr;
  /*! \brief aligned number of bins of the features whose histograms need to be fixed */
  uint32_t* cuda_need_fix_histogram_features_num_bin_aligned_ = nullptr;

  // CUDA memory, held by other object

  /*! \brief gradients on CUDA */
  const score_t* cuda_gradients_ = nullptr;
  /*! \brief hessians on CUDA */
  const score_t* cuda_hessians_ = nullptr;

  /*! \brief GPU device index */
  const int gpu_device_id_;
  /*! \brief use double precision histogram per block */
  const bool gpu_use_dp_;
};

}  // namespace LightGBM

#endif  // USE_CUDA_EXP

#endif  // LIGHTGBM_TREELEARNER_CUDA_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_