/*!
 * Copyright (c) 2017 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
#ifdef USE_GPU

#include "gpu_tree_learner.h"

#include <LightGBM/bin.h>
#include <LightGBM/network.h>
#include <LightGBM/utils/array_args.h>

#include <algorithm>

#include "../io/dense_bin.hpp"
#include "../io/dense_nbits_bin.hpp"

#define GPU_DEBUG 0

namespace LightGBM {

GPUTreeLearner::GPUTreeLearner(const Config* config)
  :SerialTreeLearner(config) {
  use_bagging_ = false;
  Log::Info("This is the GPU trainer!!");
}

GPUTreeLearner::~GPUTreeLearner() {
  if (ptr_pinned_gradients_) {
    queue_.enqueue_unmap_buffer(pinned_gradients_, ptr_pinned_gradients_);
  }
  if (ptr_pinned_hessians_) {
    queue_.enqueue_unmap_buffer(pinned_hessians_, ptr_pinned_hessians_);
  }
  if (ptr_pinned_feature_masks_) {
    queue_.enqueue_unmap_buffer(pinned_feature_masks_, ptr_pinned_feature_masks_);
  }
}

void GPUTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) {
  // initialize SerialTreeLearner
  SerialTreeLearner::Init(train_data, is_constant_hessian);
  // some additional variables needed for GPU trainer
  num_feature_groups_ = train_data_->num_feature_groups();
  // Initialize GPU buffers and kernels
  InitGPU(config_->gpu_platform_id, config_->gpu_device_id);
}

// some functions used for debugging the GPU histogram construction
#if GPU_DEBUG > 0

void PrintHistograms(HistogramBinEntry* h, size_t size) {
  size_t total = 0;
  for (size_t i = 0; i < size; ++i) {
    printf("%03lu=%9.3g,%9.3g,%7d\t", i, h[i].sum_gradients, h[i].sum_hessians, h[i].cnt);
    total += h[i].cnt;
    if ((i & 3) == 3)
        printf("\n");
  }
  printf("\nTotal examples: %lu\n", total);
}

union Float_t {
    int64_t i;
    double f;
    static int64_t ulp_diff(Float_t a, Float_t b) {
      return abs(a.i - b.i);
    }
};

void CompareHistograms(HistogramBinEntry* h1, HistogramBinEntry* h2, size_t size, int feature_id) {
  size_t i;
  Float_t a, b;
  for (i = 0; i < size; ++i) {
    a.f = h1[i].sum_gradients;
    b.f = h2[i].sum_gradients;
    int32_t ulps = Float_t::ulp_diff(a, b);
    if (h1[i].cnt != h2[i].cnt) {
      printf("%d != %d\n", h1[i].cnt, h2[i].cnt);
      goto err;
    }
    if (ulps > 0) {
      // printf("grad %g != %g (%d ULPs)\n", h1[i].sum_gradients, h2[i].sum_gradients, ulps);
      // goto err;
    }
    a.f = h1[i].sum_hessians;
    b.f = h2[i].sum_hessians;
    ulps = Float_t::ulp_diff(a, b);
    if (ulps > 0) {
      // printf("hessian %g != %g (%d ULPs)\n", h1[i].sum_hessians, h2[i].sum_hessians, ulps);
      // goto err;
    }
  }
  return;
err:
  Log::Warning("Mismatched histograms found for feature %d at location %lu.", feature_id, i);
  std::cin.get();
  PrintHistograms(h1, size);
  printf("\n");
  PrintHistograms(h2, size);
  std::cin.get();
}
#endif

int GPUTreeLearner::GetNumWorkgroupsPerFeature(data_size_t leaf_num_data) {
  // we roughly want 256 workgroups per device, and we have num_dense_feature4_ feature tuples.
  // also guarantee that there are at least 2K examples per workgroup
  double x = 256.0 / num_dense_feature4_;
  int exp_workgroups_per_feature = static_cast<int>(ceil(log2(x)));
  double t = leaf_num_data / 1024.0;
  #if GPU_DEBUG >= 4
  printf("Computing histogram for %d examples and (%d * %d) feature groups\n", leaf_num_data, dword_features_, num_dense_feature4_);
  printf("We can have at most %d workgroups per feature4 for efficiency reasons.\n"
         "Best workgroup size per feature for full utilization is %d\n", static_cast<int>(ceil(t)), (1 << exp_workgroups_per_feature));
  #endif
  exp_workgroups_per_feature = std::min(exp_workgroups_per_feature, static_cast<int>(ceil(log(static_cast<double>(t))/log(2.0))));
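  // net effect: aim for roughly 256 workgroups in total across all feature4 tuples, but cap the
  // per-tuple workgroup count (the leaf_num_data / 1024 term above) so each workgroup still gets
  // a reasonably large slice of the leaf to process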
  if (exp_workgroups_per_feature < 0)
      exp_workgroups_per_feature = 0;
  if (exp_workgroups_per_feature > kMaxLogWorkgroupsPerFeature)
      exp_workgroups_per_feature = kMaxLogWorkgroupsPerFeature;
  // return 0;
  return exp_workgroups_per_feature;
}

void GPUTreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_features) {
  // we have already copied ordered gradients, ordered hessians and indices to GPU
  // decide the best number of workgroups working on one feature4 tuple
  // set work group size based on feature size
  // each 2^exp_workgroups_per_feature workgroups work on a feature4 tuple
  int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(leaf_num_data);
  int num_workgroups = (1 << exp_workgroups_per_feature) * num_dense_feature4_;
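  // total launch size: 2^exp_workgroups_per_feature workgroups per feature4 tuple,
  // 256 work-items each (see the enqueue_1d_range_kernel calls below)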
  if (num_workgroups > preallocd_max_num_wg_) {
    preallocd_max_num_wg_ = num_workgroups;
    Log::Info("Increasing preallocd_max_num_wg_ to %d for launching more workgroups", preallocd_max_num_wg_);
    device_subhistograms_.reset(new boost::compute::vector<char>(
                              preallocd_max_num_wg_ * dword_features_ * device_bin_size_ * hist_bin_entry_sz_, ctx_));
    // we need to refresh the kernel arguments after reallocating
    for (int i = 0; i <= kMaxLogWorkgroupsPerFeature; ++i) {
      // only the sub-histogram buffer (argument 7) needs to be re-set after reallocation
      histogram_kernels_[i].set_arg(7, *device_subhistograms_);
      histogram_allfeats_kernels_[i].set_arg(7, *device_subhistograms_);
      histogram_fulldata_kernels_[i].set_arg(7, *device_subhistograms_);
    }
  }
  #if GPU_DEBUG >= 4
  printf("Setting exp_workgroups_per_feature to %d, using %u work groups\n", exp_workgroups_per_feature, num_workgroups);
  printf("Constructing histogram with %d examples\n", leaf_num_data);
  #endif

  // the GPU kernel will process all features in one call, and each
  // 2^exp_workgroups_per_feature (compile time constant) workgroup will
  // process one feature4 tuple

  if (use_all_features) {
    histogram_allfeats_kernels_[exp_workgroups_per_feature].set_arg(4, leaf_num_data);
  } else {
    histogram_kernels_[exp_workgroups_per_feature].set_arg(4, leaf_num_data);
  }
  // for the root node, indices are not copied
  if (leaf_num_data != num_data_) {
    indices_future_.wait();
  }
  // for constant hessian, hessians are not copied except for the root node
  if (!is_constant_hessian_) {
    hessians_future_.wait();
  }
  gradients_future_.wait();
  // there will be 2^exp_workgroups_per_feature = num_workgroups / num_dense_feature4 sub-histograms per feature4 tuple,
  // and we will launch num_workgroups workgroups for this kernel,
  // with threads covering all features
  // the queue is asynchronous, so we can call WaitAndGetHistograms() before we start processing dense feature groups
  if (leaf_num_data == num_data_) {
    kernel_wait_obj_ = boost::compute::wait_list(queue_.enqueue_1d_range_kernel(histogram_fulldata_kernels_[exp_workgroups_per_feature], 0, num_workgroups * 256, 256));
  } else {
    if (use_all_features) {
      kernel_wait_obj_ = boost::compute::wait_list(
                         queue_.enqueue_1d_range_kernel(histogram_allfeats_kernels_[exp_workgroups_per_feature], 0, num_workgroups * 256, 256));
    } else {
      kernel_wait_obj_ = boost::compute::wait_list(
                         queue_.enqueue_1d_range_kernel(histogram_kernels_[exp_workgroups_per_feature], 0, num_workgroups * 256, 256));
    }
  }
  // copy the results asynchronously. Size depends on if double precision is used
  size_t output_size = num_dense_feature4_ * dword_features_ * device_bin_size_ * hist_bin_entry_sz_;
  boost::compute::event histogram_wait_event;
  host_histogram_outputs_ = (void*)queue_.enqueue_map_buffer_async(device_histogram_outputs_, boost::compute::command_queue::map_read,
                                                                   0, output_size, histogram_wait_event, kernel_wait_obj_);
  // we will wait for this object in WaitAndGetHistograms
  histograms_wait_obj_ = boost::compute::wait_list(histogram_wait_event);
}

template <typename HistType>
void GPUTreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms) {
  HistType* hist_outputs = reinterpret_cast<HistType*>(host_histogram_outputs_);
  // when the output is ready, the computation is done
  histograms_wait_obj_.wait();
  #pragma omp parallel for schedule(static)
  for (int i = 0; i < num_dense_feature_groups_; ++i) {
    if (!feature_masks_[i]) {
      continue;
    }
    int dense_group_index = dense_feature_group_map_[i];
    auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index);
    int bin_size = train_data_->FeatureGroupNumBin(dense_group_index);
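    // if the bin multiplier is 1, the device histogram maps 1:1 onto the host histogram;
    // otherwise each original bin was spread over several device bins and is summed back below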
    if (device_bin_mults_[i] == 1) {
      for (int j = 0; j < bin_size; ++j) {
        old_histogram_array[j].sum_gradients = hist_outputs[i * device_bin_size_+ j].sum_gradients;
        old_histogram_array[j].sum_hessians = hist_outputs[i * device_bin_size_ + j].sum_hessians;
        old_histogram_array[j].cnt = (data_size_t)hist_outputs[i * device_bin_size_ + j].cnt;
      }
    } else {
      // values of this feature have been redistributed to multiple bins; a reduction is needed here
      int ind = 0;
      for (int j = 0; j < bin_size; ++j) {
        double sum_g = 0.0, sum_h = 0.0;
        size_t cnt = 0;
        for (int k = 0; k < device_bin_mults_[i]; ++k) {
          sum_g += hist_outputs[i * device_bin_size_+ ind].sum_gradients;
          sum_h += hist_outputs[i * device_bin_size_+ ind].sum_hessians;
          cnt += hist_outputs[i * device_bin_size_ + ind].cnt;
          ind++;
        }
        old_histogram_array[j].sum_gradients = sum_g;
        old_histogram_array[j].sum_hessians = sum_h;
        old_histogram_array[j].cnt = (data_size_t)cnt;
      }
    }
  }
  queue_.enqueue_unmap_buffer(device_histogram_outputs_, host_histogram_outputs_);
}

void GPUTreeLearner::AllocateGPUMemory() {
  num_dense_feature_groups_ = 0;
  for (int i = 0; i < num_feature_groups_; ++i) {
    if (ordered_bins_[i] == nullptr) {
      num_dense_feature_groups_++;
    }
  }
  // how many feature-group tuples we have
  num_dense_feature4_ = (num_dense_feature_groups_ + (dword_features_ - 1)) / dword_features_;
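  // a "feature4" tuple packs dword_features_ dense feature groups into one 32-bit word
  // (4 one-byte features, or 8 four-bit features for small bin counts); round up here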
  // leave some safe margin for prefetching
  // 256 work-items per workgroup. Each work-item prefetches one tuple for that feature
  int allocated_num_data_ = num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature);
  // clear sparse/dense maps
  dense_feature_group_map_.clear();
  device_bin_mults_.clear();
  sparse_feature_group_map_.clear();
  // do nothing if no features can be processed on GPU
  if (!num_dense_feature_groups_) {
    Log::Warning("GPU acceleration is disabled because no non-trivial dense features can be found");
    return;
  }
  // allocate memory for all features (FIXME: 4 GB barrier on some devices, need to split to multiple buffers)
  device_features_.reset();
  device_features_ = std::unique_ptr<boost::compute::vector<Feature4>>(new boost::compute::vector<Feature4>(num_dense_feature4_ * num_data_, ctx_));
  // unpin old buffer if necessary before destructing them
  if (ptr_pinned_gradients_) {
    queue_.enqueue_unmap_buffer(pinned_gradients_, ptr_pinned_gradients_);
  }
  if (ptr_pinned_hessians_) {
    queue_.enqueue_unmap_buffer(pinned_hessians_, ptr_pinned_hessians_);
  }
  if (ptr_pinned_feature_masks_) {
    queue_.enqueue_unmap_buffer(pinned_feature_masks_, ptr_pinned_feature_masks_);
  }
  // make ordered_gradients and hessians larger (including extra room for prefetching), and pin them
  ordered_gradients_.reserve(allocated_num_data_);
  ordered_hessians_.reserve(allocated_num_data_);
  pinned_gradients_ = boost::compute::buffer();  // deallocate
  pinned_gradients_ = boost::compute::buffer(ctx_, allocated_num_data_ * sizeof(score_t),
                                             boost::compute::memory_object::read_write | boost::compute::memory_object::use_host_ptr,
                                             ordered_gradients_.data());
  ptr_pinned_gradients_ = queue_.enqueue_map_buffer(pinned_gradients_, boost::compute::command_queue::map_write_invalidate_region,
                                                    0, allocated_num_data_ * sizeof(score_t));
  pinned_hessians_ = boost::compute::buffer();  // deallocate
  pinned_hessians_  = boost::compute::buffer(ctx_, allocated_num_data_ * sizeof(score_t),
                                             boost::compute::memory_object::read_write | boost::compute::memory_object::use_host_ptr,
                                             ordered_hessians_.data());
  ptr_pinned_hessians_ = queue_.enqueue_map_buffer(pinned_hessians_, boost::compute::command_queue::map_write_invalidate_region,
                                                   0, allocated_num_data_ * sizeof(score_t));
  // allocate space for gradients and hessians on device
  // we will copy gradients and hessians in after ordered_gradients_ and ordered_hessians_ are constructed
  device_gradients_ = boost::compute::buffer();  // deallocate
  device_gradients_ = boost::compute::buffer(ctx_, allocated_num_data_ * sizeof(score_t),
                      boost::compute::memory_object::read_only, nullptr);
  device_hessians_ = boost::compute::buffer();  // deallocate
  device_hessians_  = boost::compute::buffer(ctx_, allocated_num_data_ * sizeof(score_t),
                      boost::compute::memory_object::read_only, nullptr);
  // allocate feature mask, for disabling some feature-groups' histogram calculation
  feature_masks_.resize(num_dense_feature4_ * dword_features_);
  device_feature_masks_ = boost::compute::buffer();  // deallocate
  device_feature_masks_ = boost::compute::buffer(ctx_, num_dense_feature4_ * dword_features_,
                          boost::compute::memory_object::read_only, nullptr);
  pinned_feature_masks_ = boost::compute::buffer(ctx_, num_dense_feature4_ * dword_features_,
                                             boost::compute::memory_object::read_write | boost::compute::memory_object::use_host_ptr,
                                             feature_masks_.data());
  ptr_pinned_feature_masks_ = queue_.enqueue_map_buffer(pinned_feature_masks_, boost::compute::command_queue::map_write_invalidate_region,
                                                        0, num_dense_feature4_ * dword_features_);
  memset(ptr_pinned_feature_masks_, 0, num_dense_feature4_ * dword_features_);
  // copy indices to the device
  device_data_indices_.reset();
  device_data_indices_ = std::unique_ptr<boost::compute::vector<data_size_t>>(new boost::compute::vector<data_size_t>(allocated_num_data_, ctx_));
  boost::compute::fill(device_data_indices_->begin(), device_data_indices_->end(), 0, queue_);
  // histogram bin entry size depends on the precision (single/double)
  hist_bin_entry_sz_ = config_->gpu_use_dp ? sizeof(HistogramBinEntry) : sizeof(GPUHistogramBinEntry);
  Log::Info("Size of histogram bin entry: %d", hist_bin_entry_sz_);
  // create output buffer, each feature has a histogram with device_bin_size_ bins,
  // each work group generates a sub-histogram of dword_features_ features.
  if (!device_subhistograms_) {
    // only initialize once here, as this will not need to change when ResetTrainingData() is called
    device_subhistograms_ = std::unique_ptr<boost::compute::vector<char>>(new boost::compute::vector<char>(
                              preallocd_max_num_wg_ * dword_features_ * device_bin_size_ * hist_bin_entry_sz_, ctx_));
  }
  // create atomic counters for inter-group coordination
  sync_counters_.reset();
  sync_counters_ = std::unique_ptr<boost::compute::vector<int>>(new boost::compute::vector<int>(
                    num_dense_feature4_, ctx_));
  boost::compute::fill(sync_counters_->begin(), sync_counters_->end(), 0, queue_);
  // The output buffer is allocated to host directly, to overlap compute and data transfer
  device_histogram_outputs_ = boost::compute::buffer();  // deallocate
  device_histogram_outputs_ = boost::compute::buffer(ctx_, num_dense_feature4_ * dword_features_ * device_bin_size_ * hist_bin_entry_sz_,
                           boost::compute::memory_object::write_only | boost::compute::memory_object::alloc_host_ptr, nullptr);
  // find the dense feature-groups and group them into the Feature4 data structure (several feature-groups packed into 4 bytes)
  int k = 0, copied_feature4 = 0;
  std::vector<int> dense_dword_ind(dword_features_);
  for (int i = 0; i < num_feature_groups_; ++i) {
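    // for each dense feature group, pick a power-of-two multiplier so that
    // (multiplier * num_bin) <= device_bin_size_; its values are spread over the multiplied
    // bins (presumably to reduce collisions) and summed back in WaitAndGetHistograms()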
    // looking for dword_features_ non-sparse feature-groups
    if (ordered_bins_[i] == nullptr) {
      dense_dword_ind[k] = i;
      // decide if we need to redistribute the bin
      double t = device_bin_size_ / static_cast<double>(train_data_->FeatureGroupNumBin(i));
      // multiplier must be a power of 2
      device_bin_mults_.push_back(static_cast<int>(round(pow(2, floor(log2(t))))));
      // device_bin_mults_.push_back(1);
      #if GPU_DEBUG >= 1
      printf("feature-group %d using multiplier %d\n", i, device_bin_mults_.back());
      #endif
      k++;
    } else {
      sparse_feature_group_map_.push_back(i);
    }
    // found
    if (k == dword_features_) {
      k = 0;
      for (int j = 0; j < dword_features_; ++j) {
        dense_feature_group_map_.push_back(dense_dword_ind[j]);
      }
      copied_feature4++;
    }
  }
  // for data transfer time
  auto start_time = std::chrono::steady_clock::now();
  // Now generate new data structure feature4, and copy data to the device
  int nthreads = std::min(omp_get_max_threads(), static_cast<int>(dense_feature_group_map_.size()) / dword_features_);
  nthreads = std::max(nthreads, 1);
  std::vector<Feature4*> host4_vecs(nthreads);
  std::vector<boost::compute::buffer> host4_bufs(nthreads);
  std::vector<Feature4*> host4_ptrs(nthreads);
  // preallocate arrays for all threads, and pin them
  for (int i = 0; i < nthreads; ++i) {
    host4_vecs[i] = reinterpret_cast<Feature4*>(boost::alignment::aligned_alloc(4096, num_data_ * sizeof(Feature4)));
    host4_bufs[i] = boost::compute::buffer(ctx_, num_data_ * sizeof(Feature4),
                    boost::compute::memory_object::read_write | boost::compute::memory_object::use_host_ptr,
                    host4_vecs[i]);
    host4_ptrs[i] = reinterpret_cast<Feature4*>(queue_.enqueue_map_buffer(host4_bufs[i], boost::compute::command_queue::map_write_invalidate_region,
                    0, num_data_ * sizeof(Feature4)));
  }
  // building Feature4 bundles; each thread handles dword_features_ features
  #pragma omp parallel for schedule(static)
  for (int i = 0; i < static_cast<int>(dense_feature_group_map_.size() / dword_features_); ++i) {
    int tid = omp_get_thread_num();
    Feature4* host4 = host4_ptrs[tid];
    auto dense_ind = dense_feature_group_map_.begin() + i * dword_features_;
    auto dev_bin_mult = device_bin_mults_.begin() + i * dword_features_;
    #if GPU_DEBUG >= 1
    printf("Copying feature group ");
    for (int l = 0; l < dword_features_; ++l) {
      printf("%d ", dense_ind[l]);
    }
    printf("to devices\n");
    #endif
    if (dword_features_ == 8) {
      // one feature datapoint is 4 bits
      BinIterator* bin_iters[8];
      for (int s_idx = 0; s_idx < 8; ++s_idx) {
        bin_iters[s_idx] = train_data_->FeatureGroupIterator(dense_ind[s_idx]);
        if (dynamic_cast<Dense4bitsBinIterator*>(bin_iters[s_idx]) == 0) {
          Log::Fatal("GPU tree learner assumes that all bins are Dense4bitsBin when num_bin <= 16, but feature %d is not", dense_ind[s_idx]);
        }
      }
      // this guarantees that the RawGet() function is inlined, rather than using virtual function dispatching
      Dense4bitsBinIterator iters[8] = {
        *static_cast<Dense4bitsBinIterator*>(bin_iters[0]),
        *static_cast<Dense4bitsBinIterator*>(bin_iters[1]),
        *static_cast<Dense4bitsBinIterator*>(bin_iters[2]),
        *static_cast<Dense4bitsBinIterator*>(bin_iters[3]),
        *static_cast<Dense4bitsBinIterator*>(bin_iters[4]),
        *static_cast<Dense4bitsBinIterator*>(bin_iters[5]),
        *static_cast<Dense4bitsBinIterator*>(bin_iters[6]),
        *static_cast<Dense4bitsBinIterator*>(bin_iters[7])};
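      // pack eight 4-bit bin values per example into the four bytes of a Feature4;
      // the "+ ((j+k) & (mult - 1))" term spreads consecutive examples of the same original
      // bin across the redistributed device bins, which are reduced back on the host later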
      for (int j = 0; j < num_data_; ++j) {
        host4[j].s[0] = (uint8_t)((iters[0].RawGet(j) * dev_bin_mult[0] + ((j+0) & (dev_bin_mult[0] - 1)))
                      |((iters[1].RawGet(j) * dev_bin_mult[1] + ((j+1) & (dev_bin_mult[1] - 1))) << 4));
        host4[j].s[1] = (uint8_t)((iters[2].RawGet(j) * dev_bin_mult[2] + ((j+2) & (dev_bin_mult[2] - 1)))
                      |((iters[3].RawGet(j) * dev_bin_mult[3] + ((j+3) & (dev_bin_mult[3] - 1))) << 4));
        host4[j].s[2] = (uint8_t)((iters[4].RawGet(j) * dev_bin_mult[4] + ((j+4) & (dev_bin_mult[4] - 1)))
                      |((iters[5].RawGet(j) * dev_bin_mult[5] + ((j+5) & (dev_bin_mult[5] - 1))) << 4));
        host4[j].s[3] = (uint8_t)((iters[6].RawGet(j) * dev_bin_mult[6] + ((j+6) & (dev_bin_mult[6] - 1)))
                      |((iters[7].RawGet(j) * dev_bin_mult[7] + ((j+7) & (dev_bin_mult[7] - 1))) << 4));
      }
    } else if (dword_features_ == 4) {
      // one feature datapoint is one byte
      for (int s_idx = 0; s_idx < 4; ++s_idx) {
        BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_ind[s_idx]);
        // this guarantees that the RawGet() function is inlined, rather than using virtual function dispatching
        if (dynamic_cast<DenseBinIterator<uint8_t>*>(bin_iter) != 0) {
          // Dense bin
          DenseBinIterator<uint8_t> iter = *static_cast<DenseBinIterator<uint8_t>*>(bin_iter);
          for (int j = 0; j < num_data_; ++j) {
            host4[j].s[s_idx] = (uint8_t)(iter.RawGet(j) * dev_bin_mult[s_idx] + ((j+s_idx) & (dev_bin_mult[s_idx] - 1)));
          }
        } else if (dynamic_cast<Dense4bitsBinIterator*>(bin_iter) != 0) {
          // Dense 4-bit bin
          Dense4bitsBinIterator iter = *static_cast<Dense4bitsBinIterator*>(bin_iter);
          for (int j = 0; j < num_data_; ++j) {
            host4[j].s[s_idx] = (uint8_t)(iter.RawGet(j) * dev_bin_mult[s_idx] + ((j+s_idx) & (dev_bin_mult[s_idx] - 1)));
          }
        } else {
          Log::Fatal("Bug in GPU tree builder: only DenseBin and Dense4bitsBin are supported");
        }
      }
    } else {
      Log::Fatal("Bug in GPU tree builder: dword_features_ can only be 4 or 8");
    }
    queue_.enqueue_write_buffer(device_features_->get_buffer(),
                        i * num_data_ * sizeof(Feature4), num_data_ * sizeof(Feature4), host4);
    #if GPU_DEBUG >= 1
    printf("first example of feature-group tuple is: %d %d %d %d\n", host4[0].s0, host4[0].s1, host4[0].s2, host4[0].s3);
    printf("Feature-groups copied to device with multipliers ");
    for (int l = 0; l < dword_features_; ++l) {
      printf("%d ", dev_bin_mult[l]);
    }
    printf("\n");
    #endif
  }
  // working on the remaining (less than dword_features_) feature groups
  if (k != 0) {
    Feature4* host4 = host4_ptrs[0];
    if (dword_features_ == 8) {
      memset(host4, 0, num_data_ * sizeof(Feature4));
    }
    #if GPU_DEBUG >= 1
    printf("%d features left\n", k);
    #endif
    for (int i = 0; i < k; ++i) {
      if (dword_features_ == 8) {
        BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_dword_ind[i]);
        if (dynamic_cast<Dense4bitsBinIterator*>(bin_iter) != 0) {
          Dense4bitsBinIterator iter = *static_cast<Dense4bitsBinIterator*>(bin_iter);
          #pragma omp parallel for schedule(static)
          for (int j = 0; j < num_data_; ++j) {
            host4[j].s[i >> 1] |= (uint8_t)((iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
                                + ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)))
                               << ((i & 1) << 2));
          }
        } else {
          Log::Fatal("GPU tree learner assumes that all bins are Dense4bitsBin when num_bin <= 16, but feature %d is not", dense_dword_ind[i]);
        }
      } else if (dword_features_ == 4) {
        BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_dword_ind[i]);
        if (dynamic_cast<DenseBinIterator<uint8_t>*>(bin_iter) != 0) {
          DenseBinIterator<uint8_t> iter = *static_cast<DenseBinIterator<uint8_t>*>(bin_iter);
          #pragma omp parallel for schedule(static)
          for (int j = 0; j < num_data_; ++j) {
            host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
                          + ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)));
          }
        } else if (dynamic_cast<Dense4bitsBinIterator*>(bin_iter) != 0) {
          Dense4bitsBinIterator iter = *static_cast<Dense4bitsBinIterator*>(bin_iter);
          #pragma omp parallel for schedule(static)
          for (int j = 0; j < num_data_; ++j) {
            host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
                          + ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)));
          }
        } else {
          Log::Fatal("BUG in GPU tree builder: only DenseBin and Dense4bitsBin are supported");
        }
      } else {
        Log::Fatal("Bug in GPU tree builder: dword_features_ can only be 4 or 8");
      }
    }
    // fill the leftover features
    if (dword_features_ == 8) {
      #pragma omp parallel for schedule(static)
      for (int j = 0; j < num_data_; ++j) {
        for (int i = k; i < dword_features_; ++i) {
          // fill this empty feature with some "random" value
          host4[j].s[i >> 1] |= (uint8_t)((j & 0xf) << ((i & 1) << 2));
        }
      }
    } else if (dword_features_ == 4) {
      #pragma omp parallel for schedule(static)
      for (int j = 0; j < num_data_; ++j) {
        for (int i = k; i < dword_features_; ++i) {
          // fill this empty feature with some "random" value
          host4[j].s[i] = (uint8_t)j;
        }
      }
    }
    // copying the last 1 to (dword_features - 1) feature-groups in the last tuple
    queue_.enqueue_write_buffer(device_features_->get_buffer(),
                        (num_dense_feature4_ - 1) * num_data_ * sizeof(Feature4), num_data_ * sizeof(Feature4), host4);
    #if GPU_DEBUG >= 1
    printf("Last features copied to device\n");
    #endif
    for (int i = 0; i < k; ++i) {
      dense_feature_group_map_.push_back(dense_dword_ind[i]);
    }
  }
  // deallocate pinned space for feature copying
  for (int i = 0; i < nthreads; ++i) {
      queue_.enqueue_unmap_buffer(host4_bufs[i], host4_ptrs[i]);
      host4_bufs[i] = boost::compute::buffer();
      boost::alignment::aligned_free(host4_vecs[i]);
  }
  // data transfer time
  std::chrono::duration<double, std::milli> end_time = std::chrono::steady_clock::now() - start_time;
  Log::Info("%d dense feature groups (%.2f MB) transferred to GPU in %f secs. %d sparse feature groups",
            static_cast<int>(dense_feature_group_map_.size()),
            ((dense_feature_group_map_.size() + (dword_features_ - 1)) / dword_features_) * num_data_ * sizeof(Feature4) / (1024.0 * 1024.0),
            end_time.count() * 1e-3, static_cast<int>(sparse_feature_group_map_.size()));
  #if GPU_DEBUG >= 1
  printf("Dense feature group list (size %lu): ", dense_feature_group_map_.size());
  for (int i = 0; i < num_dense_feature_groups_; ++i) {
    printf("%d ", dense_feature_group_map_[i]);
  }
  printf("\n");
  printf("Sparse feature group list (size %lu): ", sparse_feature_group_map_.size());
  for (int i = 0; i < num_feature_groups_ - num_dense_feature_groups_; ++i) {
    printf("%d ", sparse_feature_group_map_[i]);
  }
  printf("\n");
  #endif
}

std::string GPUTreeLearner::GetBuildLog(const std::string &opts) {
  boost::compute::program program = boost::compute::program::create_with_source(kernel_source_, ctx_);
  try {
    program.build(opts);
  }
  catch (boost::compute::opencl_error &e) {
    auto error_code = e.error_code();
    std::string log("No log available.\n");
    // for other types of failure, build log might not be available; program.build_log() can crash
    if (error_code == CL_INVALID_PROGRAM || error_code == CL_BUILD_PROGRAM_FAILURE) {
      try {
        log = program.build_log();
      }
      catch(...) {
        // Something bad happened. Just return "No log available."
      }
    }
    return log;
  }
  // build is okay, log may contain warnings
  return program.build_log();
}

void GPUTreeLearner::BuildGPUKernels() {
  Log::Info("Compiling OpenCL Kernel with %d bins...", device_bin_size_);
  // destroy any old kernels
  histogram_kernels_.clear();
  histogram_allfeats_kernels_.clear();
  histogram_fulldata_kernels_.clear();
  // create OpenCL kernels for different number of workgroups per feature
  histogram_kernels_.resize(kMaxLogWorkgroupsPerFeature+1);
  histogram_allfeats_kernels_.resize(kMaxLogWorkgroupsPerFeature+1);
  histogram_fulldata_kernels_.resize(kMaxLogWorkgroupsPerFeature+1);
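  // three kernel variants are built for every workgroup count: a generic one (feature mask +
  // index array), an "all features" one with the mask checks compiled out, and a "full data"
  // one that ignores the index array (used for the root node)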
  // currently we don't use constant memory
  int use_constants = 0;
  OMP_INIT_EX();
  #pragma omp parallel for schedule(guided)
  for (int i = 0; i <= kMaxLogWorkgroupsPerFeature; ++i) {
    OMP_LOOP_EX_BEGIN();
    boost::compute::program program;
    std::ostringstream opts;
    // compile the GPU kernel depending on whether double precision is used, whether the hessian is constant, etc.
    opts << " -D POWER_FEATURE_WORKGROUPS=" << i
         << " -D USE_CONSTANT_BUF=" << use_constants << " -D USE_DP_FLOAT=" << int(config_->gpu_use_dp)
         << " -D CONST_HESSIAN=" << int(is_constant_hessian_)
         << " -cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math";
    #if GPU_DEBUG >= 1
    std::cout << "Building GPU kernels with options: " << opts.str() << std::endl;
    #endif
    // kernel with indices in an array
    try {
      program = boost::compute::program::build_with_source(kernel_source_, ctx_, opts.str());
    }
    catch (boost::compute::opencl_error &e) {
      #pragma omp critical
      {
        std::cerr << "Build Options:" << opts.str() << std::endl;
        std::cerr << "Build Log:" << std::endl << GetBuildLog(opts.str()) << std::endl;
        Log::Fatal("Cannot build GPU program: %s", e.what());
      }
    }
    histogram_kernels_[i] = program.create_kernel(kernel_name_);

    // kernel with all features enabled, with eliminated branches
    opts << " -D ENABLE_ALL_FEATURES=1";
    try {
      program = boost::compute::program::build_with_source(kernel_source_, ctx_, opts.str());
    }
    catch (boost::compute::opencl_error &e) {
      #pragma omp critical
      {
        std::cerr << "Build Options:" << opts.str() << std::endl;
        std::cerr << "Build Log:" << std::endl << GetBuildLog(opts.str()) << std::endl;
        Log::Fatal("Cannot build GPU program: %s", e.what());
      }
    }
    histogram_allfeats_kernels_[i] = program.create_kernel(kernel_name_);

    // kernel with all data indices (for root node, and assumes that root node always uses all features)
    opts << " -D IGNORE_INDICES=1";
    try {
      program = boost::compute::program::build_with_source(kernel_source_, ctx_, opts.str());
    }
    catch (boost::compute::opencl_error &e) {
      #pragma omp critical
      {
        std::cerr << "Build Options:" << opts.str() << std::endl;
        std::cerr << "Build Log:" << std::endl << GetBuildLog(opts.str()) << std::endl;
        Log::Fatal("Cannot build GPU program: %s", e.what());
      }
    }
    histogram_fulldata_kernels_[i] = program.create_kernel(kernel_name_);
    OMP_LOOP_EX_END();
  }
  OMP_THROW_EX();
  Log::Info("GPU programs have been built");
}

void GPUTreeLearner::SetupKernelArguments() {
  // do nothing if no features can be processed on GPU
  if (!num_dense_feature_groups_) {
    return;
  }
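  // kernel argument layout assumed by the set_arg() calls elsewhere in this file:
  // 4 = number of data points in the current leaf, 6 = hessians (or the constant hessian value),
  // 7 = sub-histogram buffer (re-set whenever device_subhistograms_ is reallocated)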
  for (int i = 0; i <= kMaxLogWorkgroupsPerFeature; ++i) {
    // The only argument that needs to be changed later is num_data_
    if (is_constant_hessian_) {
      // hessian is passed as a parameter, but it is not available now.
      // hessian will be set in BeforeTrain()
      histogram_kernels_[i].set_args(*device_features_, device_feature_masks_, num_data_,
                                         *device_data_indices_, num_data_, device_gradients_, 0.0f,
                                         *device_subhistograms_, *sync_counters_, device_histogram_outputs_);
      histogram_allfeats_kernels_[i].set_args(*device_features_, device_feature_masks_, num_data_,
                                         *device_data_indices_, num_data_, device_gradients_, 0.0f,
                                         *device_subhistograms_, *sync_counters_, device_histogram_outputs_);
      histogram_fulldata_kernels_[i].set_args(*device_features_, device_feature_masks_, num_data_,
                                          *device_data_indices_, num_data_, device_gradients_, 0.0f,
                                          *device_subhistograms_, *sync_counters_, device_histogram_outputs_);
    } else {
      histogram_kernels_[i].set_args(*device_features_, device_feature_masks_, num_data_,
                                         *device_data_indices_, num_data_, device_gradients_, device_hessians_,
                                         *device_subhistograms_, *sync_counters_, device_histogram_outputs_);
      histogram_allfeats_kernels_[i].set_args(*device_features_, device_feature_masks_, num_data_,
                                         *device_data_indices_, num_data_, device_gradients_, device_hessians_,
                                         *device_subhistograms_, *sync_counters_, device_histogram_outputs_);
      histogram_fulldata_kernels_[i].set_args(*device_features_, device_feature_masks_, num_data_,
                                          *device_data_indices_, num_data_, device_gradients_, device_hessians_,
                                          *device_subhistograms_, *sync_counters_, device_histogram_outputs_);
    }
  }
}

void GPUTreeLearner::InitGPU(int platform_id, int device_id) {
  // Get the max bin size, used for selecting best GPU kernel
  max_num_bin_ = 0;
  #if GPU_DEBUG >= 1
  printf("bin size: ");
  #endif
  for (int i = 0; i < num_feature_groups_; ++i) {
    #if GPU_DEBUG >= 1
    printf("%d, ", train_data_->FeatureGroupNumBin(i));
    #endif
    max_num_bin_ = std::max(max_num_bin_, train_data_->FeatureGroupNumBin(i));
  }
  #if GPU_DEBUG >= 1
  printf("\n");
  #endif
  // initialize GPU
  dev_ = boost::compute::system::default_device();
  if (platform_id >= 0 && device_id >= 0) {
    const std::vector<boost::compute::platform> platforms = boost::compute::system::platforms();
    if (static_cast<int>(platforms.size()) > platform_id) {
      const std::vector<boost::compute::device> platform_devices = platforms[platform_id].devices();
      if (static_cast<int>(platform_devices.size()) > device_id) {
        Log::Info("Using requested OpenCL platform %d device %d", platform_id, device_id);
        dev_ = platform_devices[device_id];
      }
    }
  }
  // determine which kernel to use based on the max number of bins
  if (max_num_bin_ <= 16) {
    kernel_source_ = kernel16_src_;
    kernel_name_ = "histogram16";
    device_bin_size_ = 16;
    dword_features_ = 8;
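    // with at most 16 bins a value fits in 4 bits, so 8 features are packed per 32-bit word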
  } else if (max_num_bin_ <= 64) {
    kernel_source_ = kernel64_src_;
    kernel_name_ = "histogram64";
    device_bin_size_ = 64;
    dword_features_ = 4;
  } else if (max_num_bin_ <= 256) {
    kernel_source_ = kernel256_src_;
    kernel_name_ = "histogram256";
    device_bin_size_ = 256;
    dword_features_ = 4;
  } else {
    Log::Fatal("bin size %d cannot run on GPU", max_num_bin_);
  }
  if (max_num_bin_ == 65) {
    Log::Warning("Setting max_bin to 63 is suggested for best performance");
  }
  if (max_num_bin_ == 17) {
    Log::Warning("Setting max_bin to 15 is suggested for best performance");
  }
  ctx_ = boost::compute::context(dev_);
  queue_ = boost::compute::command_queue(ctx_, dev_);
  Log::Info("Using GPU Device: %s, Vendor: %s", dev_.name().c_str(), dev_.vendor().c_str());
  BuildGPUKernels();
  AllocateGPUMemory();
  // setup GPU kernel arguments after allocating all the buffers
  SetupKernelArguments();
}

Tree* GPUTreeLearner::Train(const score_t* gradients, const score_t *hessians,
                            bool is_constant_hessian, Json& forced_split_json) {
  // check if we need to recompile the GPU kernel (is_constant_hessian changed)
  // this should rarely occur
  if (is_constant_hessian != is_constant_hessian_) {
    Log::Info("Recompiling GPU kernel because hessian is %sa constant now", is_constant_hessian ? "" : "not ");
    is_constant_hessian_ = is_constant_hessian;
    BuildGPUKernels();
    SetupKernelArguments();
  }
  return SerialTreeLearner::Train(gradients, hessians, is_constant_hessian, forced_split_json);
}

void GPUTreeLearner::ResetTrainingData(const Dataset* train_data) {
  SerialTreeLearner::ResetTrainingData(train_data);
  num_feature_groups_ = train_data_->num_feature_groups();
  // GPU memory has to be reallocated because the data may have changed
  AllocateGPUMemory();
  // setup GPU kernel arguments after allocating all the buffers
  SetupKernelArguments();
}

void GPUTreeLearner::BeforeTrain() {
  #if GPU_DEBUG >= 2
  printf("Copying intial full gradients and hessians to device\n");
  #endif
  // Copy initial full hessians and gradients to GPU.
  // We start copying as early as possible, instead of at ConstructHistogram().
  if (!use_bagging_ && num_dense_feature_groups_) {
    if (!is_constant_hessian_) {
      hessians_future_ = queue_.enqueue_write_buffer_async(device_hessians_, 0, num_data_ * sizeof(score_t), hessians_);
    } else {
      // setup hessian parameters only
      score_t const_hessian = hessians_[0];
      for (int i = 0; i <= kMaxLogWorkgroupsPerFeature; ++i) {
        // hessian is passed as a parameter
        histogram_kernels_[i].set_arg(6, const_hessian);
        histogram_allfeats_kernels_[i].set_arg(6, const_hessian);
        histogram_fulldata_kernels_[i].set_arg(6, const_hessian);
      }
    }
    gradients_future_ = queue_.enqueue_write_buffer_async(device_gradients_, 0, num_data_ * sizeof(score_t), gradients_);
  }

  SerialTreeLearner::BeforeTrain();

  // use bagging
  if (data_partition_->leaf_count(0) != num_data_ && num_dense_feature_groups_) {
    // On GPU, we start copying indices, gradients and hessians now, instead of at ConstructHistogram()
    // copy used gradients and hessians to ordered buffer
    const data_size_t* indices = data_partition_->indices();
    data_size_t cnt = data_partition_->leaf_count(0);
    #if GPU_DEBUG > 0
    printf("Using bagging, examples count = %d\n", cnt);
    #endif
    // transfer the indices to GPU
    indices_future_ = boost::compute::copy_async(indices, indices + cnt, device_data_indices_->begin(), queue_);
    if (!is_constant_hessian_) {
      #pragma omp parallel for schedule(static)
      for (data_size_t i = 0; i < cnt; ++i) {
        ordered_hessians_[i] = hessians_[indices[i]];
      }
      // transfer hessian to GPU
      hessians_future_ = queue_.enqueue_write_buffer_async(device_hessians_, 0, cnt * sizeof(score_t), ordered_hessians_.data());
    } else {
      // setup hessian parameters only
      score_t const_hessian = hessians_[indices[0]];
      for (int i = 0; i <= kMaxLogWorkgroupsPerFeature; ++i) {
        // hessian is passed as a parameter
        histogram_kernels_[i].set_arg(6, const_hessian);
        histogram_allfeats_kernels_[i].set_arg(6, const_hessian);
        histogram_fulldata_kernels_[i].set_arg(6, const_hessian);
      }
    }
    #pragma omp parallel for schedule(static)
    for (data_size_t i = 0; i < cnt; ++i) {
      ordered_gradients_[i] = gradients_[indices[i]];
    }
    // transfer gradients to GPU
    gradients_future_ = queue_.enqueue_write_buffer_async(device_gradients_, 0, cnt * sizeof(score_t), ordered_gradients_.data());
  }
}

bool GPUTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) {
  int smaller_leaf;
  data_size_t num_data_in_left_child = GetGlobalDataCountInLeaf(left_leaf);
  data_size_t num_data_in_right_child = GetGlobalDataCountInLeaf(right_leaf);
  // only have root
  if (right_leaf < 0) {
    smaller_leaf = -1;
  } else if (num_data_in_left_child < num_data_in_right_child) {
    smaller_leaf = left_leaf;
  } else {
    smaller_leaf = right_leaf;
  }

  // Copy indices, gradients and hessians as early as possible
  if (smaller_leaf >= 0 && num_dense_feature_groups_) {
    // only need to initialize for smaller leaf
    // Get leaf boundary
    const data_size_t* indices = data_partition_->indices();
    data_size_t begin = data_partition_->leaf_begin(smaller_leaf);
    data_size_t end = begin + data_partition_->leaf_count(smaller_leaf);

    // copy indices to the GPU:
    #if GPU_DEBUG >= 2
    Log::Info("Copying indices, gradients and hessians to GPU...");
    printf("Indices size %d being copied (left = %d, right = %d)\n", end - begin, num_data_in_left_child, num_data_in_right_child);
    #endif
    indices_future_ = boost::compute::copy_async(indices + begin, indices + end, device_data_indices_->begin(), queue_);

    if (!is_constant_hessian_) {
      #pragma omp parallel for schedule(static)
      for (data_size_t i = begin; i < end; ++i) {
        ordered_hessians_[i - begin] = hessians_[indices[i]];
      }
      // copy ordered hessians to the GPU:
      hessians_future_ = queue_.enqueue_write_buffer_async(device_hessians_, 0, (end - begin) * sizeof(score_t), ptr_pinned_hessians_);
    }

    #pragma omp parallel for schedule(static)
    for (data_size_t i = begin; i < end; ++i) {
      ordered_gradients_[i - begin] = gradients_[indices[i]];
    }
    // copy ordered gradients to the GPU:
    gradients_future_ = queue_.enqueue_write_buffer_async(device_gradients_, 0, (end - begin) * sizeof(score_t), ptr_pinned_gradients_);

    #if GPU_DEBUG >= 2
    Log::Info("Gradients/hessians/indices copied to device with size %d", end - begin);
    #endif
  }
  return SerialTreeLearner::BeforeFindBestSplit(tree, left_leaf, right_leaf);
}

bool GPUTreeLearner::ConstructGPUHistogramsAsync(
  const std::vector<int8_t>& is_feature_used,
  const data_size_t* data_indices, data_size_t num_data,
  const score_t* gradients, const score_t* hessians,
  score_t* ordered_gradients, score_t* ordered_hessians) {
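  // returns true only when a GPU kernel has actually been launched for this leaf;
  // the caller must then call WaitAndGetHistograms() to collect the results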
  if (num_data <= 0) {
    return false;
  }
  // do nothing if no features can be processed on GPU
  if (!num_dense_feature_groups_) {
    return false;
  }
  // copy data indices if it is not null
  if (data_indices != nullptr && num_data != num_data_) {
    indices_future_ = boost::compute::copy_async(data_indices, data_indices + num_data, device_data_indices_->begin(), queue_);
  }
  // generate and copy ordered_gradients if gradients is not null
  if (gradients != nullptr) {
    if (num_data != num_data_) {
      #pragma omp parallel for schedule(static)
      for (data_size_t i = 0; i < num_data; ++i) {
        ordered_gradients[i] = gradients[data_indices[i]];
      }
      gradients_future_ = queue_.enqueue_write_buffer_async(device_gradients_, 0, num_data * sizeof(score_t), ptr_pinned_gradients_);
    } else {
      gradients_future_ = queue_.enqueue_write_buffer_async(device_gradients_, 0, num_data * sizeof(score_t), gradients);
    }
  }
  // generate and copy ordered_hessians if hessians is not null
  if (hessians != nullptr && !is_constant_hessian_) {
    if (num_data != num_data_) {
      #pragma omp parallel for schedule(static)
      for (data_size_t i = 0; i < num_data; ++i) {
        ordered_hessians[i] = hessians[data_indices[i]];
      }
      hessians_future_ = queue_.enqueue_write_buffer_async(device_hessians_, 0, num_data * sizeof(score_t), ptr_pinned_hessians_);
    } else {
      hessians_future_ = queue_.enqueue_write_buffer_async(device_hessians_, 0, num_data * sizeof(score_t), hessians);
    }
  }
  // convert indices in is_feature_used to feature-group indices
  std::vector<int8_t> is_feature_group_used(num_feature_groups_, 0);
  #pragma omp parallel for schedule(static, 1024) if (num_features_ >= 2048)
  for (int i = 0; i < num_features_; ++i) {
    if (is_feature_used[i]) {
      is_feature_group_used[train_data_->Feature2Group(i)] = 1;
    }
  }
  // construct the feature masks for dense feature-groups
  int used_dense_feature_groups = 0;
  #pragma omp parallel for schedule(static, 1024) reduction(+:used_dense_feature_groups) if (num_dense_feature_groups_ >= 2048)
  for (int i = 0; i < num_dense_feature_groups_; ++i) {
    if (is_feature_group_used[dense_feature_group_map_[i]]) {
      feature_masks_[i] = 1;
      ++used_dense_feature_groups;
    } else {
      feature_masks_[i] = 0;
    }
  }
  bool use_all_features = used_dense_feature_groups == num_dense_feature_groups_;
  // if no feature group is used, just return and do not use GPU
  if (used_dense_feature_groups == 0) {
    return false;
  }
#if GPU_DEBUG >= 1
  printf("Feature masks:\n");
  for (unsigned int i = 0; i < feature_masks_.size(); ++i) {
    printf("%d ", feature_masks_[i]);
  }
  printf("\n");
  printf("%d feature groups, %d used, %d\n", num_dense_feature_groups_, used_dense_feature_groups, use_all_features);
#endif
  // if not all feature groups are used, we need to transfer the feature mask to GPU
  // otherwise, we will use a specialized GPU kernel with all feature groups enabled
  if (!use_all_features) {
    queue_.enqueue_write_buffer(device_feature_masks_, 0, num_dense_feature4_ * dword_features_, ptr_pinned_feature_masks_);
  }
  // All data have been prepared, now run the GPU kernel
  GPUHistogram(num_data, use_all_features);
  return true;
}

void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) {
  std::vector<int8_t> is_sparse_feature_used(num_features_, 0);
  std::vector<int8_t> is_dense_feature_used(num_features_, 0);
  #pragma omp parallel for schedule(static)
  for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
    if (!is_feature_used_[feature_index]) continue;
    if (!is_feature_used[feature_index]) continue;
    if (ordered_bins_[train_data_->Feature2Group(feature_index)]) {
      is_sparse_feature_used[feature_index] = 1;
    } else {
      is_dense_feature_used[feature_index] = 1;
    }
  }
  // construct smaller leaf
  HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1;
  // ConstructGPUHistogramsAsync will return true if there are available feature groups dispatched to the GPU
  bool is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used,
    nullptr, smaller_leaf_splits_->num_data_in_leaf(),
    nullptr, nullptr,
    nullptr, nullptr);
  // then construct sparse features on CPU
  // We set data_indices to null to avoid rebuilding ordered gradients/hessians
  train_data_->ConstructHistograms(is_sparse_feature_used,
    nullptr, smaller_leaf_splits_->num_data_in_leaf(),
    smaller_leaf_splits_->LeafIndex(),
    ordered_bins_, gradients_, hessians_,
    ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_,
    ptr_smaller_leaf_hist_data);
  // wait for GPU to finish, only if GPU is actually used
  if (is_gpu_used) {
    if (config_->gpu_use_dp) {
      // use double precision
      WaitAndGetHistograms<HistogramBinEntry>(ptr_smaller_leaf_hist_data);
    } else {
      // use single precision
      WaitAndGetHistograms<GPUHistogramBinEntry>(ptr_smaller_leaf_hist_data);
    }
  }

  // Compare GPU histogram with CPU histogram, useful for debugging GPU code problems
  // #define GPU_DEBUG_COMPARE
  #ifdef GPU_DEBUG_COMPARE
  for (int i = 0; i < num_dense_feature_groups_; ++i) {
    if (!feature_masks_[i])
      continue;
    int dense_feature_group_index = dense_feature_group_map_[i];
    size_t size = train_data_->FeatureGroupNumBin(dense_feature_group_index);
    HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1;
    HistogramBinEntry* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index);
    HistogramBinEntry* gpu_histogram = new HistogramBinEntry[size];
    data_size_t num_data = smaller_leaf_splits_->num_data_in_leaf();
    printf("Comparing histogram for feature %d size %d, %lu bins\n", dense_feature_group_index, num_data, size);
    std::copy(current_histogram, current_histogram + size, gpu_histogram);
    std::memset(current_histogram, 0, train_data_->FeatureGroupNumBin(dense_feature_group_index) * sizeof(HistogramBinEntry));
    train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram(
      num_data != num_data_ ? smaller_leaf_splits_->data_indices() : nullptr,
      num_data,
      num_data != num_data_ ? ordered_gradients_.data() : gradients_,
      num_data != num_data_ ? ordered_hessians_.data() : hessians_,
      current_histogram);
    CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index);
    std::copy(gpu_histogram, gpu_histogram + size, current_histogram);
    delete [] gpu_histogram;
  }
  #endif

  if (larger_leaf_histogram_array_ != nullptr && !use_subtract) {
    // construct larger leaf
    HistogramBinEntry* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - 1;
    is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used,
      larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(),
      gradients_, hessians_,
      ordered_gradients_.data(), ordered_hessians_.data());
    // then construct sparse features on CPU
    // We set data_indices to null to avoid rebuilding ordered gradients/hessians
    train_data_->ConstructHistograms(is_sparse_feature_used,
      nullptr, larger_leaf_splits_->num_data_in_leaf(),
      larger_leaf_splits_->LeafIndex(),
      ordered_bins_, gradients_, hessians_,
      ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_,
      ptr_larger_leaf_hist_data);
    // wait for GPU to finish, only if GPU is actually used
    if (is_gpu_used) {
      if (config_->gpu_use_dp) {
        // use double precision
        WaitAndGetHistograms<HistogramBinEntry>(ptr_larger_leaf_hist_data);
      } else {
        // use single precision
        WaitAndGetHistograms<GPUHistogramBinEntry>(ptr_larger_leaf_hist_data);
      }
    }
  }
}

void GPUTreeLearner::FindBestSplits() {
  SerialTreeLearner::FindBestSplits();

#if GPU_DEBUG >= 3
  for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
    if (!is_feature_used_[feature_index]) continue;
    if (parent_leaf_histogram_array_ != nullptr
        && !parent_leaf_histogram_array_[feature_index].is_splittable()) {
      smaller_leaf_histogram_array_[feature_index].set_is_splittable(false);
      continue;
    }
    size_t bin_size = train_data_->FeatureNumBin(feature_index) + 1;
    printf("Feature %d smaller leaf:\n", feature_index);
    PrintHistograms(smaller_leaf_histogram_array_[feature_index].RawData() - 1, bin_size);
    if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) { continue; }
    printf("Feature %d larger leaf:\n", feature_index);
    PrintHistograms(larger_leaf_histogram_array_[feature_index].RawData() - 1, bin_size);
  }
#endif
}

void GPUTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) {
  const SplitInfo& best_split_info = best_split_per_leaf_[best_Leaf];
#if GPU_DEBUG >= 2
  printf("Splitting leaf %d with feature %d thresh %d gain %f stat %f %f %f %f\n", best_Leaf, best_split_info.feature, best_split_info.threshold, best_split_info.gain, best_split_info.left_sum_gradient, best_split_info.right_sum_gradient, best_split_info.left_sum_hessian, best_split_info.right_sum_hessian);
#endif
  SerialTreeLearner::Split(tree, best_Leaf, left_leaf, right_leaf);
  if (Network::num_machines() == 1) {
    // do some sanity check for the GPU algorithm
    if (best_split_info.left_count < best_split_info.right_count) {
      if ((best_split_info.left_count != smaller_leaf_splits_->num_data_in_leaf()) ||
          (best_split_info.right_count!= larger_leaf_splits_->num_data_in_leaf())) {
        Log::Fatal("Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf());
      }
    } else {
      double smaller_min = smaller_leaf_splits_->min_constraint();
      double smaller_max = smaller_leaf_splits_->max_constraint();
      double larger_min = larger_leaf_splits_->min_constraint();
      double larger_max = larger_leaf_splits_->max_constraint();
      smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian);
      larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian);
      smaller_leaf_splits_->SetValueConstraint(smaller_min, smaller_max);
      larger_leaf_splits_->SetValueConstraint(larger_min, larger_max);
      if ((best_split_info.left_count != larger_leaf_splits_->num_data_in_leaf()) ||
          (best_split_info.right_count!= smaller_leaf_splits_->num_data_in_leaf())) {
        Log::Fatal("Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf());
      }
    }
  }
}

}   // namespace LightGBM
#endif  // USE_GPU