/*!
 * Copyright (c) 2017 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
#ifndef LIGHTGBM_TREELEARNER_GPU_TREE_LEARNER_H_
#define LIGHTGBM_TREELEARNER_GPU_TREE_LEARNER_H_

#include <LightGBM/dataset.h>
#include <LightGBM/feature_group.h>
#include <LightGBM/tree.h>
#include <LightGBM/utils/array_args.h>
#include <LightGBM/utils/random.h>

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <memory>
#include <random>
#include <string>
#include <vector>

#include "data_partition.hpp"
#include "feature_histogram.hpp"
#include "leaf_splits.hpp"
#include "serial_tree_learner.h"
#include "split_info.hpp"

#ifdef USE_GPU

#define BOOST_COMPUTE_THREAD_SAFE
#define BOOST_COMPUTE_HAVE_THREAD_LOCAL
// Use Boost.Compute on-disk kernel cache
#define BOOST_COMPUTE_USE_OFFLINE_CACHE
#include <boost/compute/core.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/align/aligned_allocator.hpp>

// Only Json is used from json11 (see Train() signature); avoid a
// namespace-wide using-directive at header scope.
using json11::Json;
namespace LightGBM {

/*!
* \brief GPU-based parallel learning algorithm.
*        Offloads histogram construction of dense feature-groups to an
*        OpenCL device via Boost.Compute; everything else falls back to
*        SerialTreeLearner.
*/
class GPUTreeLearner: public SerialTreeLearner {
 public:
  explicit GPUTreeLearner(const Config* tree_config);
  ~GPUTreeLearner();
  void Init(const Dataset* train_data, bool is_constant_hessian) override;
  void ResetTrainingData(const Dataset* train_data) override;
  Tree* Train(const score_t* gradients, const score_t *hessians,
              bool is_constant_hessian, const Json& forced_split_json) override;

  void SetBaggingData(const data_size_t* used_indices, data_size_t num_data) override {
    SerialTreeLearner::SetBaggingData(used_indices, num_data);
    // determine if we are using bagging before we construct the data partition
    // thus we can start data movement to GPU earlier
    if (used_indices != nullptr) {
      if (num_data != num_data_) {
        use_bagging_ = true;
        return;
      }
    }
    use_bagging_ = false;
  }

 protected:
  void BeforeTrain() override;
  bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) override;
  void FindBestSplits() override;
  void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override;
  void ConstructHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) override;

 private:
  /*! \brief 4-byte feature tuple used by GPU kernels */
  struct Feature4 {
    uint8_t s[4];
  };

  /*! \brief Single-precision histogram entry type used on the GPU */
  using gpu_hist_t = float;

  /*!
  * \brief Find the best number of workgroups processing one feature for maximizing efficiency
  * \param leaf_num_data The number of data examples on the current leaf being processed
  * \return Log2 of the best number for workgroups per feature, in range 0...kMaxLogWorkgroupsPerFeature
  */
  int GetNumWorkgroupsPerFeature(data_size_t leaf_num_data);

  /*!
  * \brief Initialize GPU device, context and command queues
  *        Also compiles the OpenCL kernel
  * \param platform_id OpenCL platform ID
  * \param device_id OpenCL device ID
  */
  void InitGPU(int platform_id, int device_id);

  /*!
  * \brief Allocate memory for GPU computation
  */
  void AllocateGPUMemory();

  /*!
  * \brief Compile OpenCL GPU source code to kernel binaries
  */
  void BuildGPUKernels();

  /*!
   * \brief Returns OpenCL kernel build log when compiled with option opts
   * \param opts OpenCL build options
   * \return OpenCL build log
  */
  std::string GetBuildLog(const std::string &opts);

  /*!
  * \brief Setup GPU kernel arguments, preparing for launching
  */
  void SetupKernelArguments();

  /*!
   * \brief Compute GPU feature histogram for the current leaf.
   *        Indices, gradients and hessians have been copied to the device.
   * \param leaf_num_data Number of data on current leaf
   * \param use_all_features Set to true to not use feature masks, with a faster kernel
  */
  void GPUHistogram(data_size_t leaf_num_data, bool use_all_features);

  /*!
   * \brief Wait for GPU kernel execution and read histogram
   * \param histograms Destination of histogram results from GPU.
  */
  template <typename HistType>
  void WaitAndGetHistograms(hist_t* histograms);

  /*!
   * \brief Construct GPU histogram asynchronously.
   *        Interface is similar to Dataset::ConstructHistograms().
   * \param is_feature_used A predicate vector for enabling each feature
   * \param data_indices Array of data example IDs to be included in histogram, will be copied to GPU.
   *                     Set to nullptr to skip copy to GPU.
   * \param num_data Number of data examples to be included in histogram
   * \param gradients Array of gradients for all examples.
   * \param hessians Array of hessians for all examples.
   * \param ordered_gradients Ordered gradients will be generated and copied to GPU when gradients is not nullptr,
   *                     Set gradients to nullptr to skip copy to GPU.
   * \param ordered_hessians Ordered hessians will be generated and copied to GPU when hessians is not nullptr,
   *                     Set hessians to nullptr to skip copy to GPU.
   * \return true if GPU kernel is launched, false if GPU is not used
  */
  bool ConstructGPUHistogramsAsync(
    const std::vector<int8_t>& is_feature_used,
    const data_size_t* data_indices, data_size_t num_data,
    const score_t* gradients, const score_t* hessians,
    score_t* ordered_gradients, score_t* ordered_hessians);

  /*! \brief Log2 of max number of workgroups per feature */
  const int kMaxLogWorkgroupsPerFeature = 10;  // 2^10
  /*! \brief Max total number of workgroups with preallocated workspace.
   *         If we use more than this number of workgroups, we have to reallocate subhistograms */
  int preallocd_max_num_wg_ = 1024;

  /*! \brief True if bagging is used */
  bool use_bagging_ = false;

  /*! \brief GPU device object */
  boost::compute::device dev_;
  /*! \brief GPU context object */
  boost::compute::context ctx_;
  /*! \brief GPU command queue object */
  boost::compute::command_queue queue_;
  /*! \brief GPU kernel for 256 bins */
  const char *kernel256_src_ = {
  #include "ocl/histogram256.cl"
  };
  /*! \brief GPU kernel for 64 bins */
  const char *kernel64_src_ = {
  #include "ocl/histogram64.cl"
  };
  /*! \brief GPU kernel for 16 bins */
  const char *kernel16_src_ = {
  #include "ocl/histogram16.cl"
  };
  /*! \brief Currently used kernel source */
  std::string kernel_source_;
  /*! \brief Currently used kernel name */
  std::string kernel_name_;

  /*! \brief an array of histogram kernels with different number
     of workgroups per feature */
  std::vector<boost::compute::kernel> histogram_kernels_;
  /*! \brief an array of histogram kernels with different number
     of workgroups per feature, with all features enabled to avoid branches */
  std::vector<boost::compute::kernel> histogram_allfeats_kernels_;
  /*! \brief an array of histogram kernels with different number
     of workgroups per feature, and processing the whole dataset */
  std::vector<boost::compute::kernel> histogram_fulldata_kernels_;
  /*! \brief total number of feature-groups */
  int num_feature_groups_;
  /*! \brief total number of dense feature-groups, which will be processed on GPU */
  int num_dense_feature_groups_;
  /*! \brief On GPU we read one DWORD (4-byte) of features of one example once.
   *  With bin size > 16, there are 4 features per DWORD.
   *  With bin size <=16, there are 8 features per DWORD.
   * */
  int dword_features_;
  /*! \brief total number of dense feature-group tuples on GPU.
   * Each feature tuple is 4-byte (4 features if each feature takes a byte) */
  int num_dense_feature4_;
  /*! \brief Max number of bins of training data, used to determine
   * which GPU kernel to use */
  int max_num_bin_;
  /*! \brief Used GPU kernel bin size (64, 256) */
  int device_bin_size_;
  /*! \brief Size of histogram bin entry, depending if single or double precision is used */
  size_t hist_bin_entry_sz_;
  /*! \brief Indices of all dense feature-groups */
  std::vector<int> dense_feature_group_map_;
  /*! \brief Indices of all sparse feature-groups */
  std::vector<int> sparse_feature_group_map_;
  /*! \brief Multipliers of all dense feature-groups, used for redistributing bins */
  std::vector<int> device_bin_mults_;
  /*! \brief GPU memory object holding the training data */
  std::unique_ptr<boost::compute::vector<Feature4>> device_features_;
  /*! \brief GPU memory object holding the ordered gradient */
  boost::compute::buffer device_gradients_;
  /*! \brief Pinned memory object for ordered gradient */
  boost::compute::buffer pinned_gradients_;
  /*! \brief Pointer to pinned memory of ordered gradient */
  void * ptr_pinned_gradients_ = nullptr;
  /*! \brief GPU memory object holding the ordered hessian */
  boost::compute::buffer device_hessians_;
  /*! \brief Pinned memory object for ordered hessian */
  boost::compute::buffer pinned_hessians_;
  /*! \brief Pointer to pinned memory of ordered hessian */
  void * ptr_pinned_hessians_ = nullptr;
  /*! \brief A vector of feature mask. 1 = feature used, 0 = feature not used */
  std::vector<char, boost::alignment::aligned_allocator<char, 4096>> feature_masks_;
  /*! \brief GPU memory object holding the feature masks */
  boost::compute::buffer device_feature_masks_;
  /*! \brief Pinned memory object for feature masks */
  boost::compute::buffer pinned_feature_masks_;
  /*! \brief Pointer to pinned memory of feature masks */
  void * ptr_pinned_feature_masks_ = nullptr;
  /*! \brief GPU memory object holding indices of the leaf being processed */
  std::unique_ptr<boost::compute::vector<data_size_t>> device_data_indices_;
  /*! \brief GPU memory object holding counters for workgroup coordination */
  std::unique_ptr<boost::compute::vector<int>> sync_counters_;
  /*! \brief GPU memory object holding temporary sub-histograms per workgroup */
  std::unique_ptr<boost::compute::vector<char>> device_subhistograms_;
  /*! \brief Host memory object for histogram output (GPU will write to Host memory directly) */
  boost::compute::buffer device_histogram_outputs_;
  /*! \brief Host memory pointer for histogram outputs
   *  (initialized to nullptr for consistency with the other pinned pointers) */
  void * host_histogram_outputs_ = nullptr;
  /*! \brief OpenCL waitlist object for waiting for data transfer before kernel execution */
  boost::compute::wait_list kernel_wait_obj_;
  /*! \brief OpenCL waitlist object for reading output histograms after kernel execution */
  boost::compute::wait_list histograms_wait_obj_;
  /*! \brief Asynchronous waiting object for copying indices */
  boost::compute::future<void> indices_future_;
  /*! \brief Asynchronous waiting object for copying gradients */
  boost::compute::event gradients_future_;
  /*! \brief Asynchronous waiting object for copying hessians */
  boost::compute::event hessians_future_;
};

}  // namespace LightGBM
#else

// When GPU support is not compiled in, quit with an error message

namespace LightGBM {
271

272
class GPUTreeLearner: public SerialTreeLearner {
Nikita Titov's avatar
Nikita Titov committed
273
 public:
Guolin Ke's avatar
Guolin Ke committed
274
  #pragma warning(disable : 4702)
Guolin Ke's avatar
Guolin Ke committed
275
  explicit GPUTreeLearner(const Config* tree_config) : SerialTreeLearner(tree_config) {
276
277
    Log::Fatal("GPU Tree Learner was not enabled in this build.\n"
               "Please recompile with CMake option -DUSE_GPU=1");
278
279
280
  }
};

281
}  // namespace LightGBM

#endif   // USE_GPU

#endif   // LightGBM_TREELEARNER_GPU_TREE_LEARNER_H_