/*!
 * Copyright (c) 2021 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for
 * license information.
 */
#ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_
#define LIGHTGBM_TREELEARNER_CUDA_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_

#ifdef USE_CUDA

#include <LightGBM/cuda/cuda_row_data.hpp>
#include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/feature_group.h>
#include <LightGBM/tree.h>

#include <cstddef>
#include <cstdint>
#include <memory>
#include <vector>

#include "cuda_leaf_splits.hpp"

// Compile-time constants for the CUDA histogram code.
// NOTE(review): this header only defines the values; the descriptions below
// are read off the macro names and must be confirmed against the kernels.

// presumably the number of data rows processed by one thread -- TODO confirm
#define NUM_DATA_PER_THREAD (400)
// NOTE(review): "THRADS" looks like a misspelling of "THREADS"; the name is
// left unchanged because code outside this header may reference it.
// 504 == 18 * NUM_FEATURE_PER_THREAD_GROUP, and 504 is not a multiple of 32
// -- TODO confirm this block size is intentional.
#define NUM_THRADS_PER_BLOCK (504)
// presumably the number of features assigned to one group of threads -- TODO confirm
#define NUM_FEATURE_PER_THREAD_GROUP (28)
// presumably the thread-block size of the histogram subtraction kernel -- TODO confirm
#define SUBTRACT_BLOCK_SIZE (1024)
// presumably the shared-memory element count of the "fix histogram" kernel -- TODO confirm
#define FIX_HISTOGRAM_SHARED_MEM_SIZE (1024)
// presumably the thread-block size of the "fix histogram" kernel -- TODO confirm
#define FIX_HISTOGRAM_BLOCK_SIZE (512)
// presumably the number of histogram buffers in use -- TODO confirm
#define USED_HISTOGRAM_BUFFER_NUM (8)

namespace LightGBM {

class CUDAHistogramConstructor {
 public:
  CUDAHistogramConstructor(
    const Dataset* train_data,
    const int num_leaves,
    const int num_threads,
    const std::vector<uint32_t>& feature_hist_offsets,
    const int min_data_in_leaf,
    const double min_sum_hessian_in_leaf,
    const int gpu_device_id,
41
42
43
    const bool gpu_use_dp,
    const bool use_discretized_grad,
    const int grad_discretized_bins);
44
45
46
47
48
49
50
51
52
53
54

  ~CUDAHistogramConstructor();

  void Init(const Dataset* train_data, TrainingShareStates* share_state);

  void ConstructHistogramForLeaf(
    const CUDALeafSplitsStruct* cuda_smaller_leaf_splits,
    const CUDALeafSplitsStruct* cuda_larger_leaf_splits,
    const data_size_t num_data_in_smaller_leaf,
    const data_size_t num_data_in_larger_leaf,
    const double sum_hessians_in_smaller_leaf,
55
56
57
58
59
60
61
62
63
64
    const double sum_hessians_in_larger_leaf,
    const uint8_t num_bits_in_histogram_bins);

  void SubtractHistogramForLeaf(
    const CUDALeafSplitsStruct* cuda_smaller_leaf_splits,
    const CUDALeafSplitsStruct* cuda_larger_leaf_splits,
    const bool use_discretized_grad,
    const uint8_t parent_num_bits_in_histogram_bins,
    const uint8_t smaller_num_bits_in_histogram_bins,
    const uint8_t larger_num_bits_in_histogram_bins);
65
66
67
68
69
70
71

  void ResetTrainingData(const Dataset* train_data, TrainingShareStates* share_states);

  void ResetConfig(const Config* config);

  void BeforeTrain(const score_t* gradients, const score_t* hessians);

72
  const hist_t* cuda_hist() const { return cuda_hist_.RawData(); }
73

74
  hist_t* cuda_hist_pointer() { return cuda_hist_.RawData(); }
75
76
77
78
79
80
81
82
83
84
85
86
87
88

 private:
  void InitFeatureMetaInfo(const Dataset* train_data, const std::vector<uint32_t>& feature_hist_offsets);

  void CalcConstructHistogramKernelDim(
    int* grid_dim_x,
    int* grid_dim_y,
    int* block_dim_x,
    int* block_dim_y,
    const data_size_t num_data_in_smaller_leaf);

  template <typename HIST_TYPE, size_t SHARED_HIST_SIZE>
  void LaunchConstructHistogramKernelInner(
    const CUDALeafSplitsStruct* cuda_smaller_leaf_splits,
89
90
    const data_size_t num_data_in_smaller_leaf,
    const uint8_t num_bits_in_histogram_bins);
91
92
93
94

  template <typename HIST_TYPE, size_t SHARED_HIST_SIZE, typename BIN_TYPE>
  void LaunchConstructHistogramKernelInner0(
    const CUDALeafSplitsStruct* cuda_smaller_leaf_splits,
95
96
    const data_size_t num_data_in_smaller_leaf,
    const uint8_t num_bits_in_histogram_bins);
97
98
99
100

  template <typename HIST_TYPE, size_t SHARED_HIST_SIZE, typename BIN_TYPE, typename PTR_TYPE>
  void LaunchConstructHistogramKernelInner1(
    const CUDALeafSplitsStruct* cuda_smaller_leaf_splits,
101
102
    const data_size_t num_data_in_smaller_leaf,
    const uint8_t num_bits_in_histogram_bins);
103
104
105
106

  template <typename HIST_TYPE, size_t SHARED_HIST_SIZE, typename BIN_TYPE, typename PTR_TYPE, bool USE_GLOBAL_MEM_BUFFER>
  void LaunchConstructHistogramKernelInner2(
    const CUDALeafSplitsStruct* cuda_smaller_leaf_splits,
107
108
    const data_size_t num_data_in_smaller_leaf,
    const uint8_t num_bits_in_histogram_bins);
109
110
111

  void LaunchConstructHistogramKernel(
    const CUDALeafSplitsStruct* cuda_smaller_leaf_splits,
112
113
    const data_size_t num_data_in_smaller_leaf,
    const uint8_t num_bits_in_histogram_bins);
114
115
116

  void LaunchSubtractHistogramKernel(
    const CUDALeafSplitsStruct* cuda_smaller_leaf_splits,
117
118
119
120
121
    const CUDALeafSplitsStruct* cuda_larger_leaf_splits,
    const bool use_discretized_grad,
    const uint8_t parent_num_bits_in_histogram_bins,
    const uint8_t smaller_num_bits_in_histogram_bins,
    const uint8_t larger_num_bits_in_histogram_bins);
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159

  // Host memory

  /*! \brief size of training data */
  data_size_t num_data_;
  /*! \brief number of features in training data */
  int num_features_;
  /*! \brief maximum number of leaves */
  int num_leaves_;
  /*! \brief number of threads */
  int num_threads_;
  /*! \brief total number of bins in histogram */
  int num_total_bin_;
  /*! \brief number of bins per feature */
  std::vector<uint32_t> feature_num_bins_;
  /*! \brief offsets in histogram of all features */
  std::vector<uint32_t> feature_hist_offsets_;
  /*! \brief most frequent bins in each feature */
  std::vector<uint32_t> feature_most_freq_bins_;
  /*! \brief minimum number of data allowed per leaf */
  int min_data_in_leaf_;
  /*! \brief minimum sum value of hessians allowed per leaf */
  double min_sum_hessian_in_leaf_;
  /*! \brief cuda stream for histogram construction */
  cudaStream_t cuda_stream_;
  /*! \brief indices of feature whose histograms need to be fixed */
  std::vector<int> need_fix_histogram_features_;
  /*! \brief aligned number of bins of the features whose histograms need to be fixed */
  std::vector<uint32_t> need_fix_histogram_features_num_bin_aligend_;
  /*! \brief minimum number of blocks allowed in the y dimension */
  const int min_grid_dim_y_ = 160;


  // CUDA memory, held by this object

  /*! \brief CUDA row wise data */
  std::unique_ptr<CUDARowData> cuda_row_data_;
  /*! \brief number of bins per feature */
160
  CUDAVector<uint32_t> cuda_feature_num_bins_;
161
  /*! \brief offsets in histogram of all features */
162
  CUDAVector<uint32_t> cuda_feature_hist_offsets_;
163
  /*! \brief most frequent bins in each feature */
164
  CUDAVector<uint32_t> cuda_feature_most_freq_bins_;
165
  /*! \brief CUDA histograms */
166
  CUDAVector<hist_t> cuda_hist_;
167
  /*! \brief CUDA histograms buffer for each block */
168
  CUDAVector<float> cuda_hist_buffer_;
169
  /*! \brief indices of feature whose histograms need to be fixed */
170
  CUDAVector<int> cuda_need_fix_histogram_features_;
171
  /*! \brief aligned number of bins of the features whose histograms need to be fixed */
172
173
174
  CUDAVector<uint32_t> cuda_need_fix_histogram_features_num_bin_aligned_;
  /*! \brief histogram buffer used in histogram subtraction with different number of bits for histogram bins */
  CUDAVector<hist_t> hist_buffer_for_num_bit_change_;
175
176
177
178
179
180
181
182
183
184
185
186

  // CUDA memory, held by other object

  /*! \brief gradients on CUDA */
  const score_t* cuda_gradients_;
  /*! \brief hessians on CUDA */
  const score_t* cuda_hessians_;

  /*! \brief GPU device index */
  const int gpu_device_id_;
  /*! \brief use double precision histogram per block */
  const bool gpu_use_dp_;
187
188
189
190
  /*! \brief whether to use quantized gradients */
  const bool use_quantized_grad_;
  /*! \brief the number of bins to quantized gradients */
  const int num_grad_quant_bins_;
191
192
193
194
};

}  // namespace LightGBM

#endif  // USE_CUDA
#endif  // LIGHTGBM_TREELEARNER_CUDA_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_