/*!
 * Copyright (c) 2017 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
#ifdef USE_GPU
#include "gpu_tree_learner.h"

#include <LightGBM/bin.h>
#include <LightGBM/network.h>
#include <LightGBM/utils/array_args.h>

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

#include "../io/dense_bin.hpp"

#define GPU_DEBUG 0

namespace LightGBM {

Guolin Ke's avatar
Guolin Ke committed
26
27
/*!
 * \brief Construct the GPU tree learner.
 * \param config Training configuration; GPU platform/device ids from it are
 *        consumed later in Init() -> InitGPU().
 */
GPUTreeLearner::GPUTreeLearner(const Config* config)
  :SerialTreeLearner(config) {
  // the serial learner's bagging path is switched off unconditionally here
  // NOTE(review): presumably bagging is handled differently for the GPU path — confirm
  use_bagging_ = false;
  Log::Info("This is the GPU trainer!!");
}

/*!
 * \brief Destructor: unmap any host-pinned staging buffers that are still
 *        mapped, so the OpenCL buffers can be released cleanly.
 */
GPUTreeLearner::~GPUTreeLearner() {
  // each pointer is non-null only while the corresponding buffer is mapped
  if (ptr_pinned_gradients_) {
    queue_.enqueue_unmap_buffer(pinned_gradients_, ptr_pinned_gradients_);
  }
  if (ptr_pinned_hessians_) {
    queue_.enqueue_unmap_buffer(pinned_hessians_, ptr_pinned_hessians_);
  }
  if (ptr_pinned_feature_masks_) {
    queue_.enqueue_unmap_buffer(pinned_feature_masks_, ptr_pinned_feature_masks_);
  }
}

/*!
 * \brief Initialize the learner: run the serial-learner setup first, then
 *        set up GPU buffers and compile the OpenCL kernels.
 * \param train_data Training dataset.
 * \param is_constant_hessian Whether the objective produces a constant Hessian.
 */
void GPUTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) {
  // initialize SerialTreeLearner
  SerialTreeLearner::Init(train_data, is_constant_hessian);
  // some additional variables needed for GPU trainer
  num_feature_groups_ = train_data_->num_feature_groups();
  // Initialize GPU buffers and kernels on the configured platform/device
  InitGPU(config_->gpu_platform_id, config_->gpu_device_id);
}

// some functions used for debugging the GPU histogram construction
#if GPU_DEBUG > 0

/*!
 * \brief Dump a histogram (gradient/Hessian per bin) to stdout, four bins per
 *        line, followed by the Hessian sum. Debug-only helper.
 * \param h histogram array (interleaved grad/hess entries, read via GET_GRAD/GET_HESS)
 * \param size number of bins
 */
void PrintHistograms(hist_t* h, size_t size) {
  double total_hess = 0;
  for (size_t i = 0; i < size; ++i) {
    // %zu is the portable conversion for size_t; the previous %lu is wrong on
    // LLP64 targets (e.g. 64-bit Windows) where size_t is not unsigned long
    printf("%03zu=%9.3g,%9.3g\t", i, GET_GRAD(h, i), GET_HESS(h, i));
    if ((i & 3) == 3)
        printf("\n");
    total_hess += GET_HESS(h, i);
  }
  printf("\nSum hessians: %9.3g\n", total_hess);
}

67
/*!
 * \brief Debug helper that reinterprets a double's bits as a signed 64-bit
 *        integer so two doubles can be compared in units-in-the-last-place.
 */
union Float_t {
    int64_t i;  // bit pattern of f, viewed as a signed integer
    double f;   // the floating-point value under comparison
    /*! \brief Distance between a and b in ULPs (0 means bit-identical). */
    static int64_t ulp_diff(Float_t a, Float_t b) {
      // use std::llabs: a plain abs() call may resolve to the int overload
      // and silently truncate the 64-bit difference
      return std::llabs(a.i - b.i);
    }
};
74

75

76
/*!
 * \brief Compare two histograms bin-by-bin; on a Hessian mismatch, print both
 *        histograms and pause for interactive inspection. Debug-only helper.
 * \param h1, h2 histograms to compare (grad/hess read via GET_GRAD/GET_HESS)
 * \param size number of bins
 * \param feature_id feature index, used only in the warning message
 */
void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id) {
  size_t i;
  Float_t a, b;
  for (i = 0; i < size; ++i) {
    a.f = GET_GRAD(h1, i);
    b.f = GET_GRAD(h2, i);
    // keep the full 64-bit ULP distance; the former int32_t could truncate
    // large differences and miss (or misreport) mismatches
    int64_t ulps = Float_t::ulp_diff(a, b);
    if (ulps > 0) {
      // gradient check intentionally disabled; kept for manual debugging
      // printf("grad %g != %g (%d ULPs)\n", GET_GRAD(h1, i), GET_GRAD(h2, i), ulps);
      // goto err;
    }
    a.f = GET_HESS(h1, i);
    b.f = GET_HESS(h2, i);
    ulps = Float_t::ulp_diff(a, b);
    if (std::fabs(a.f - b.f) >= 1e-20) {
      // %lld matches the (cast) 64-bit ULP count; %d would be a format mismatch
      printf("hessian %g != %g (%lld ULPs)\n", GET_HESS(h1, i), GET_HESS(h2, i),
             static_cast<long long>(ulps));
      goto err;
    }
  }
  return;
err:
  // %zu is the portable conversion for the size_t index (was %lu)
  Log::Warning("Mismatched histograms found for feature %d at location %zu.", feature_id, i);
  std::cin.get();
  PrintHistograms(h1, size);
  printf("\n");
  PrintHistograms(h2, size);
  std::cin.get();
}
#endif

/*!
 * \brief Choose log2 of the number of workgroups assigned to each feature4
 *        tuple for a histogram launch.
 * \param leaf_num_data number of examples in the leaf being processed
 * \return exponent in [0, kMaxLogWorkgroupsPerFeature]
 */
int GPUTreeLearner::GetNumWorkgroupsPerFeature(data_size_t leaf_num_data) {
  // we roughly want 256 workgroups per device, and we have num_dense_feature4_ feature tuples.
  // also guarantee that there are at least 2K examples per workgroup
  const double target_per_tuple = 256.0 / num_dense_feature4_;
  int exp_workgroups_per_feature = static_cast<int>(ceil(log2(target_per_tuple)));
  const double t = leaf_num_data / 1024.0;
  #if GPU_DEBUG >= 4
  printf("Computing histogram for %d examples and (%d * %d) feature groups\n", leaf_num_data, dword_features_, num_dense_feature4_);
  printf("We can have at most %d workgroups per feature4 for efficiency reasons.\n"
         "Best workgroup size per feature for full utilization is %d\n", static_cast<int>(ceil(t)), (1 << exp_workgroups_per_feature));
  #endif
  // do not spawn more workgroups than there are ~2K-example slices of data
  exp_workgroups_per_feature = std::min(exp_workgroups_per_feature, static_cast<int>(ceil(log(static_cast<double>(t))/log(2.0))));
  // clamp into the supported range [0, kMaxLogWorkgroupsPerFeature]
  exp_workgroups_per_feature = std::max(exp_workgroups_per_feature, 0);
  exp_workgroups_per_feature = std::min(exp_workgroups_per_feature, kMaxLogWorkgroupsPerFeature);
  return exp_workgroups_per_feature;
}

/*!
 * \brief Launch the OpenCL histogram kernel(s) for one leaf and schedule the
 *        asynchronous read-back of the result buffer.
 * \param leaf_num_data number of examples in the leaf
 * \param use_all_features if true, launch the kernel variant that ignores the
 *        per-feature-group mask
 */
void GPUTreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_features) {
  // we have already copied ordered gradients, ordered Hessians and indices to GPU
  // decide the best number of workgroups working on one feature4 tuple
  // set work group size based on feature size
  // each 2^exp_workgroups_per_feature workgroups work on a feature4 tuple
  int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(leaf_num_data);
  int num_workgroups = (1 << exp_workgroups_per_feature) * num_dense_feature4_;
  if (num_workgroups > preallocd_max_num_wg_) {
    // grow the sub-histogram scratch buffer to fit the larger launch
    preallocd_max_num_wg_ = num_workgroups;
    Log::Info("Increasing preallocd_max_num_wg_ to %d for launching more workgroups", preallocd_max_num_wg_);
    device_subhistograms_.reset(new boost::compute::vector<char>(
                              preallocd_max_num_wg_ * dword_features_ * device_bin_size_ * hist_bin_entry_sz_, ctx_));
    // we need to refresh the kernel arguments after reallocating
    for (int i = 0; i <= kMaxLogWorkgroupsPerFeature; ++i) {
      // The only argument that needs to be changed later is num_data_
      histogram_kernels_[i].set_arg(7, *device_subhistograms_);
      histogram_allfeats_kernels_[i].set_arg(7, *device_subhistograms_);
      histogram_fulldata_kernels_[i].set_arg(7, *device_subhistograms_);
    }
  }
  #if GPU_DEBUG >= 4
  printf("Setting exp_workgroups_per_feature to %d, using %u work groups\n", exp_workgroups_per_feature, num_workgroups);
  printf("Constructing histogram with %d examples\n", leaf_num_data);
  #endif
  // the GPU kernel will process all features in one call, and each
  // 2^exp_workgroups_per_feature (compile time constant) workgroup will
  // process one feature4 tuple
  if (use_all_features) {
    histogram_allfeats_kernels_[exp_workgroups_per_feature].set_arg(4, leaf_num_data);
  } else {
    histogram_kernels_[exp_workgroups_per_feature].set_arg(4, leaf_num_data);
  }
  // for the root node, indices are not copied
  if (leaf_num_data != num_data_) {
    indices_future_.wait();
  }
  // for constant hessian, hessians are not copied except for the root node
  if (!share_state_->is_constant_hessian) {
    hessians_future_.wait();
  }
  gradients_future_.wait();
  // there will be 2^exp_workgroups_per_feature = num_workgroups / num_dense_feature4 sub-histogram per feature4
  // and we will launch num_feature workgroups for this kernel
  // will launch threads for all features
  // the queue should be asynchronous, and we can call WaitAndGetHistograms() before we start processing dense feature groups
  if (leaf_num_data == num_data_) {
    // root node: a dedicated kernel that needs no index array
    kernel_wait_obj_ = boost::compute::wait_list(
      queue_.enqueue_1d_range_kernel(histogram_fulldata_kernels_[exp_workgroups_per_feature], 0, num_workgroups * 256, 256));
  } else {
    if (use_all_features) {
      kernel_wait_obj_ = boost::compute::wait_list(
        queue_.enqueue_1d_range_kernel(histogram_allfeats_kernels_[exp_workgroups_per_feature], 0, num_workgroups * 256, 256));
    } else {
      kernel_wait_obj_ = boost::compute::wait_list(
        queue_.enqueue_1d_range_kernel(histogram_kernels_[exp_workgroups_per_feature], 0, num_workgroups * 256, 256));
    }
  }
  // copy the results asynchronously. Size depends on if double precision is used
  size_t output_size = num_dense_feature4_ * dword_features_ * device_bin_size_ * hist_bin_entry_sz_;
  boost::compute::event histogram_wait_event;
  host_histogram_outputs_ = reinterpret_cast<void*>(queue_.enqueue_map_buffer_async(
    device_histogram_outputs_, boost::compute::command_queue::map_read, 0, output_size, histogram_wait_event, kernel_wait_obj_));
  // we will wait for this object in WaitAndGetHistograms
  histograms_wait_obj_ = boost::compute::wait_list(histogram_wait_event);
}

/*!
 * \brief Wait for the asynchronous histogram read-back scheduled by
 *        GPUHistogram(), then scatter the device results into the host
 *        histogram layout (undoing the per-group bin redistribution).
 * \tparam HistType element type of the device output (float or double,
 *         depending on gpu_use_dp)
 * \param histograms destination host histogram array
 */
template <typename HistType>
void GPUTreeLearner::WaitAndGetHistograms(hist_t* histograms) {
  HistType* hist_outputs = reinterpret_cast<HistType*>(host_histogram_outputs_);
  // when the output is ready, the computation is done
  histograms_wait_obj_.wait();
  #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
  for (int i = 0; i < num_dense_feature_groups_; ++i) {
    // skip groups that were masked out for this launch
    if (!feature_masks_[i]) {
      continue;
    }
    int dense_group_index = dense_feature_group_map_[i];
    // * 2 because host histograms interleave gradient and Hessian entries
    auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index) * 2;
    int bin_size = train_data_->FeatureGroupNumBin(dense_group_index);
    if (device_bin_mults_[i] == 1) {
      // one device bin per host bin: straight copy
      for (int j = 0; j < bin_size; ++j) {
        GET_GRAD(old_histogram_array, j) = GET_GRAD(hist_outputs, i * device_bin_size_+ j);
        GET_HESS(old_histogram_array, j) = GET_HESS(hist_outputs, i * device_bin_size_+ j);
      }
    } else {
      // values of this feature has been redistributed to multiple bins; need a reduction here
      int ind = 0;
      for (int j = 0; j < bin_size; ++j) {
        double sum_g = 0.0, sum_h = 0.0;
        for (int k = 0; k < device_bin_mults_[i]; ++k) {
          sum_g += GET_GRAD(hist_outputs, i * device_bin_size_+ ind);
          sum_h += GET_HESS(hist_outputs, i * device_bin_size_+ ind);
          ind++;
        }
        GET_GRAD(old_histogram_array, j) = sum_g;
        GET_HESS(old_histogram_array, j) = sum_h;
      }
    }
  }
  // release the mapped output buffer so the device can reuse it
  queue_.enqueue_unmap_buffer(device_histogram_outputs_, host_histogram_outputs_);
}

/*!
 * \brief Allocate all GPU-side buffers and copy the dense feature groups to
 *        the device, packed dword_features_ (4 or 8) groups per Feature4
 *        tuple. Also (re)creates pinned host staging buffers for gradients,
 *        Hessians and the feature mask.
 */
void GPUTreeLearner::AllocateGPUMemory() {
  num_dense_feature_groups_ = 0;
  for (int i = 0; i < num_feature_groups_; ++i) {
    if (!train_data_->IsMultiGroup(i)) {
      num_dense_feature_groups_++;
    }
  }
  // how many feature-group tuples we have
  num_dense_feature4_ = (num_dense_feature_groups_ + (dword_features_ - 1)) / dword_features_;
  // leave some safe margin for prefetching
  // 256 work-items per workgroup. Each work-item prefetches one tuple for that feature
  int allocated_num_data_ = num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature);
  // clear sparse/dense maps
  dense_feature_group_map_.clear();
  device_bin_mults_.clear();
  sparse_feature_group_map_.clear();
  // do nothing if no features can be processed on GPU
  if (!num_dense_feature_groups_) {
    Log::Warning("GPU acceleration is disabled because no non-trivial dense features can be found");
    return;
  }
  // allocate memory for all features (FIXME: 4 GB barrier on some devices, need to split to multiple buffers)
  device_features_.reset();
  device_features_ = std::unique_ptr<boost::compute::vector<Feature4>>(new boost::compute::vector<Feature4>(static_cast<uint64_t>(num_dense_feature4_) * num_data_, ctx_));
  // unpin old buffer if necessary before destructing them
  if (ptr_pinned_gradients_) {
    queue_.enqueue_unmap_buffer(pinned_gradients_, ptr_pinned_gradients_);
  }
  if (ptr_pinned_hessians_) {
    queue_.enqueue_unmap_buffer(pinned_hessians_, ptr_pinned_hessians_);
  }
  if (ptr_pinned_feature_masks_) {
    queue_.enqueue_unmap_buffer(pinned_feature_masks_, ptr_pinned_feature_masks_);
  }
  // make ordered_gradients and Hessians larger (including extra room for prefetching), and pin them
  ordered_gradients_.reserve(allocated_num_data_);
  ordered_hessians_.reserve(allocated_num_data_);
  pinned_gradients_ = boost::compute::buffer();  // deallocate
  pinned_gradients_ = boost::compute::buffer(ctx_, allocated_num_data_ * sizeof(score_t),
                                             boost::compute::memory_object::read_write | boost::compute::memory_object::use_host_ptr,
                                             ordered_gradients_.data());
  ptr_pinned_gradients_ = queue_.enqueue_map_buffer(pinned_gradients_, boost::compute::command_queue::map_write_invalidate_region,
                                                    0, allocated_num_data_ * sizeof(score_t));
  pinned_hessians_ = boost::compute::buffer();  // deallocate
  pinned_hessians_ = boost::compute::buffer(ctx_, allocated_num_data_ * sizeof(score_t),
                                             boost::compute::memory_object::read_write | boost::compute::memory_object::use_host_ptr,
                                             ordered_hessians_.data());
  ptr_pinned_hessians_ = queue_.enqueue_map_buffer(pinned_hessians_, boost::compute::command_queue::map_write_invalidate_region,
                                                   0, allocated_num_data_ * sizeof(score_t));
  // allocate space for gradients and Hessians on device
  // we will copy gradients and Hessians in after ordered_gradients_ and ordered_hessians_ are constructed
  device_gradients_ = boost::compute::buffer();  // deallocate
  device_gradients_ = boost::compute::buffer(ctx_, allocated_num_data_ * sizeof(score_t),
                      boost::compute::memory_object::read_only, nullptr);
  device_hessians_ = boost::compute::buffer();  // deallocate
  device_hessians_ = boost::compute::buffer(ctx_, allocated_num_data_ * sizeof(score_t),
                      boost::compute::memory_object::read_only, nullptr);
  // allocate feature mask, for disabling some feature-groups' histogram calculation
  feature_masks_.resize(num_dense_feature4_ * dword_features_);
  device_feature_masks_ = boost::compute::buffer();  // deallocate
  device_feature_masks_ = boost::compute::buffer(ctx_, num_dense_feature4_ * dword_features_,
                          boost::compute::memory_object::read_only, nullptr);
  pinned_feature_masks_ = boost::compute::buffer(ctx_, num_dense_feature4_ * dword_features_,
                                             boost::compute::memory_object::read_write | boost::compute::memory_object::use_host_ptr,
                                             feature_masks_.data());
  ptr_pinned_feature_masks_ = queue_.enqueue_map_buffer(pinned_feature_masks_, boost::compute::command_queue::map_write_invalidate_region,
                                                        0, num_dense_feature4_ * dword_features_);
  memset(ptr_pinned_feature_masks_, 0, num_dense_feature4_ * dword_features_);
  // copy indices to the device
  device_data_indices_.reset();
  device_data_indices_ = std::unique_ptr<boost::compute::vector<data_size_t>>(new boost::compute::vector<data_size_t>(allocated_num_data_, ctx_));
  boost::compute::fill(device_data_indices_->begin(), device_data_indices_->end(), 0, queue_);
  // histogram bin entry size depends on the precision (single/double)
  hist_bin_entry_sz_ = config_->gpu_use_dp ? sizeof(hist_t) * 2 : sizeof(gpu_hist_t) * 2;
  Log::Info("Size of histogram bin entry: %d", hist_bin_entry_sz_);
  // create output buffer, each feature has a histogram with device_bin_size_ bins,
  // each work group generates a sub-histogram of dword_features_ features.
  if (!device_subhistograms_) {
    // only initialize once here, as this will not need to change when ResetTrainingData() is called
    device_subhistograms_ = std::unique_ptr<boost::compute::vector<char>>(new boost::compute::vector<char>(
                              preallocd_max_num_wg_ * dword_features_ * device_bin_size_ * hist_bin_entry_sz_, ctx_));
  }
  // create atomic counters for inter-group coordination
  sync_counters_.reset();
  sync_counters_ = std::unique_ptr<boost::compute::vector<int>>(new boost::compute::vector<int>(
                    num_dense_feature4_, ctx_));
  boost::compute::fill(sync_counters_->begin(), sync_counters_->end(), 0, queue_);
  // The output buffer is allocated to host directly, to overlap compute and data transfer
  device_histogram_outputs_ = boost::compute::buffer();  // deallocate
  device_histogram_outputs_ = boost::compute::buffer(ctx_, num_dense_feature4_ * dword_features_ * device_bin_size_ * hist_bin_entry_sz_,
                           boost::compute::memory_object::write_only | boost::compute::memory_object::alloc_host_ptr, nullptr);
  // find the dense feature-groups and group then into Feature4 data structure (several feature-groups packed into 4 bytes)
  int k = 0, copied_feature4 = 0;
  std::vector<int> dense_dword_ind(dword_features_);
  for (int i = 0; i < num_feature_groups_; ++i) {
    // looking for dword_features_ non-sparse feature-groups
    if (!train_data_->IsMultiGroup(i)) {
      dense_dword_ind[k] = i;
      // decide if we need to redistribute the bin
      double t = device_bin_size_ / static_cast<double>(train_data_->FeatureGroupNumBin(i));
      // multiplier must be a power of 2
      device_bin_mults_.push_back(static_cast<int>(round(pow(2, floor(log2(t))))));
      // device_bin_mults_.push_back(1);
      #if GPU_DEBUG >= 1
      printf("feature-group %d using multiplier %d\n", i, device_bin_mults_.back());
      #endif
      k++;
    } else {
      sparse_feature_group_map_.push_back(i);
    }
    // found a full tuple of dword_features_ dense groups
    if (k == dword_features_) {
      k = 0;
      for (int j = 0; j < dword_features_; ++j) {
        dense_feature_group_map_.push_back(dense_dword_ind[j]);
      }
      copied_feature4++;
    }
  }
  // for data transfer time
  auto start_time = std::chrono::steady_clock::now();
  // Now generate new data structure feature4, and copy data to the device
  int nthreads = std::min(OMP_NUM_THREADS(), static_cast<int>(dense_feature_group_map_.size()) / dword_features_);
  nthreads = std::max(nthreads, 1);
  std::vector<Feature4*> host4_vecs(nthreads);
  std::vector<boost::compute::buffer> host4_bufs(nthreads);
  std::vector<Feature4*> host4_ptrs(nthreads);
  // preallocate arrays for all threads, and pin them
  for (int i = 0; i < nthreads; ++i) {
    host4_vecs[i] = reinterpret_cast<Feature4*>(boost::alignment::aligned_alloc(4096, num_data_ * sizeof(Feature4)));
    host4_bufs[i] = boost::compute::buffer(ctx_, num_data_ * sizeof(Feature4),
                    boost::compute::memory_object::read_write | boost::compute::memory_object::use_host_ptr,
                    host4_vecs[i]);
    host4_ptrs[i] = reinterpret_cast<Feature4*>(queue_.enqueue_map_buffer(host4_bufs[i], boost::compute::command_queue::map_write_invalidate_region,
                    0, num_data_ * sizeof(Feature4)));
  }
  // building Feature4 bundles; each thread handles dword_features_ features
  #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
  for (int i = 0; i < static_cast<int>(dense_feature_group_map_.size() / dword_features_); ++i) {
    int tid = omp_get_thread_num();
    Feature4* host4 = host4_ptrs[tid];
    auto dense_ind = dense_feature_group_map_.begin() + i * dword_features_;
    auto dev_bin_mult = device_bin_mults_.begin() + i * dword_features_;
    #if GPU_DEBUG >= 1
    printf("Copying feature group ");
    for (int l = 0; l < dword_features_; ++l) {
      printf("%d ", dense_ind[l]);
    }
    printf("to devices\n");
    #endif
    if (dword_features_ == 8) {
      // one feature datapoint is 4 bits
      BinIterator* bin_iters[8];
      for (int s_idx = 0; s_idx < 8; ++s_idx) {
        bin_iters[s_idx] = train_data_->FeatureGroupIterator(dense_ind[s_idx]);
        if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[s_idx]) == 0) {
          Log::Fatal("GPU tree learner assumes that all bins are Dense4bitsBin when num_bin <= 16, but feature %d is not", dense_ind[s_idx]);
        }
      }
      // this guarantees that the RawGet() function is inlined, rather than using virtual function dispatching
      DenseBinIterator<uint8_t, true> iters[8] = {
        *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[0]),
        *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[1]),
        *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[2]),
        *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[3]),
        *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[4]),
        *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[5]),
        *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[6]),
        *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[7])};
      // pack two 4-bit bins per byte; the (j+l) & (mult-1) term spreads
      // values across the redistributed bins
      for (int j = 0; j < num_data_; ++j) {
        host4[j].s[0] = (uint8_t)((iters[0].RawGet(j) * dev_bin_mult[0] + ((j+0) & (dev_bin_mult[0] - 1)))
                      |((iters[1].RawGet(j) * dev_bin_mult[1] + ((j+1) & (dev_bin_mult[1] - 1))) << 4));
        host4[j].s[1] = (uint8_t)((iters[2].RawGet(j) * dev_bin_mult[2] + ((j+2) & (dev_bin_mult[2] - 1)))
                      |((iters[3].RawGet(j) * dev_bin_mult[3] + ((j+3) & (dev_bin_mult[3] - 1))) << 4));
        host4[j].s[2] = (uint8_t)((iters[4].RawGet(j) * dev_bin_mult[4] + ((j+4) & (dev_bin_mult[4] - 1)))
                      |((iters[5].RawGet(j) * dev_bin_mult[5] + ((j+5) & (dev_bin_mult[5] - 1))) << 4));
        host4[j].s[3] = (uint8_t)((iters[6].RawGet(j) * dev_bin_mult[6] + ((j+6) & (dev_bin_mult[6] - 1)))
                      |((iters[7].RawGet(j) * dev_bin_mult[7] + ((j+7) & (dev_bin_mult[7] - 1))) << 4));
      }
    } else if (dword_features_ == 4) {
      // one feature datapoint is one byte
      for (int s_idx = 0; s_idx < 4; ++s_idx) {
        BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_ind[s_idx]);
        // this guarantees that the RawGet() function is inlined, rather than using virtual function dispatching
        if (dynamic_cast<DenseBinIterator<uint8_t, false>*>(bin_iter) != 0) {
          // Dense bin
          DenseBinIterator<uint8_t, false> iter = *static_cast<DenseBinIterator<uint8_t, false>*>(bin_iter);
          for (int j = 0; j < num_data_; ++j) {
            host4[j].s[s_idx] = (uint8_t)(iter.RawGet(j) * dev_bin_mult[s_idx] + ((j+s_idx) & (dev_bin_mult[s_idx] - 1)));
          }
        } else if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iter) != 0) {
          // Dense 4-bit bin
          DenseBinIterator<uint8_t, true> iter = *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iter);
          for (int j = 0; j < num_data_; ++j) {
            host4[j].s[s_idx] = (uint8_t)(iter.RawGet(j) * dev_bin_mult[s_idx] + ((j+s_idx) & (dev_bin_mult[s_idx] - 1)));
          }
        } else {
          Log::Fatal("Bug in GPU tree builder: only DenseBin and Dense4bitsBin are supported");
        }
      }
    } else {
      Log::Fatal("Bug in GPU tree builder: dword_features_ can only be 4 or 8");
    }
    // serialize writes to the shared device buffer (one queue shared by all threads)
    #pragma omp critical
    queue_.enqueue_write_buffer(device_features_->get_buffer(),
                        static_cast<uint64_t>(i) * num_data_ * sizeof(Feature4), num_data_ * sizeof(Feature4), host4);
    #if GPU_DEBUG >= 1
    printf("first example of feature-group tuple is: %d %d %d %d\n", host4[0].s[0], host4[0].s[1], host4[0].s[2], host4[0].s[3]);
    printf("Feature-groups copied to device with multipliers ");
    for (int l = 0; l < dword_features_; ++l) {
      printf("%d ", dev_bin_mult[l]);
    }
    printf("\n");
    #endif
  }
  // working on the remaining (less than dword_features_) feature groups
  if (k != 0) {
    Feature4* host4 = host4_ptrs[0];
    if (dword_features_ == 8) {
      // 4-bit packing ORs into place, so the buffer must start zeroed
      memset(host4, 0, num_data_ * sizeof(Feature4));
    }
    #if GPU_DEBUG >= 1
    printf("%d features left\n", k);
    #endif
    for (int i = 0; i < k; ++i) {
      if (dword_features_ == 8) {
        BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_dword_ind[i]);
        if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iter) != 0) {
          DenseBinIterator<uint8_t, true> iter = *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iter);
          #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
          for (int j = 0; j < num_data_; ++j) {
            host4[j].s[i >> 1] |= (uint8_t)((iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
                                + ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)))
                               << ((i & 1) << 2));
          }
        } else {
          Log::Fatal("GPU tree learner assumes that all bins are Dense4bitsBin when num_bin <= 16, but feature %d is not", dense_dword_ind[i]);
        }
      } else if (dword_features_ == 4) {
        BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_dword_ind[i]);
        if (dynamic_cast<DenseBinIterator<uint8_t, false>*>(bin_iter) != 0) {
          DenseBinIterator<uint8_t, false> iter = *static_cast<DenseBinIterator<uint8_t, false>*>(bin_iter);
          #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
          for (int j = 0; j < num_data_; ++j) {
            host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
                          + ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)));
          }
        } else if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iter) != 0) {
          DenseBinIterator<uint8_t, true> iter = *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iter);
          #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
          for (int j = 0; j < num_data_; ++j) {
            host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
                          + ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)));
          }
        } else {
          Log::Fatal("BUG in GPU tree builder: only DenseBin and Dense4bitsBin are supported");
        }
      } else {
        Log::Fatal("Bug in GPU tree builder: dword_features_ can only be 4 or 8");
      }
    }
    // fill the leftover features
    if (dword_features_ == 8) {
      #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
      for (int j = 0; j < num_data_; ++j) {
        for (int i = k; i < dword_features_; ++i) {
          // fill this empty feature with some "random" value
          host4[j].s[i >> 1] |= (uint8_t)((j & 0xf) << ((i & 1) << 2));
        }
      }
    } else if (dword_features_ == 4) {
      #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
      for (int j = 0; j < num_data_; ++j) {
        for (int i = k; i < dword_features_; ++i) {
          // fill this empty feature with some "random" value
          host4[j].s[i] = (uint8_t)j;
        }
      }
    }
    // copying the last 1 to (dword_features - 1) feature-groups in the last tuple
    queue_.enqueue_write_buffer(device_features_->get_buffer(),
                        (num_dense_feature4_ - 1) * static_cast<uint64_t>(num_data_) * sizeof(Feature4), num_data_ * sizeof(Feature4), host4);
    #if GPU_DEBUG >= 1
    printf("Last features copied to device\n");
    #endif
    for (int i = 0; i < k; ++i) {
      dense_feature_group_map_.push_back(dense_dword_ind[i]);
    }
  }
  // deallocate pinned space for feature copying
  for (int i = 0; i < nthreads; ++i) {
      queue_.enqueue_unmap_buffer(host4_bufs[i], host4_ptrs[i]);
      host4_bufs[i] = boost::compute::buffer();
      boost::alignment::aligned_free(host4_vecs[i]);
  }
  // data transfer time
  std::chrono::duration<double, std::milli> end_time = std::chrono::steady_clock::now() - start_time;
  // NOTE(review): %d is paired with size_t arguments and %f with a duration
  // object here — format/argument mismatch; verify against Log::Info's
  // printf-style contract
  Log::Info("%d dense feature groups (%.2f MB) transferred to GPU in %f secs. %d sparse feature groups",
            dense_feature_group_map_.size(), ((dense_feature_group_map_.size() + (dword_features_ - 1)) / dword_features_) * num_data_ * sizeof(Feature4) / (1024.0 * 1024.0),
            end_time * 1e-3, sparse_feature_group_map_.size());
  #if GPU_DEBUG >= 1
  printf("Dense feature group list (size %lu): ", dense_feature_group_map_.size());
  for (int i = 0; i < num_dense_feature_groups_; ++i) {
    printf("%d ", dense_feature_group_map_[i]);
  }
  printf("\n");
  printf("Sparse feature group list (size %lu): ", sparse_feature_group_map_.size());
  for (int i = 0; i < num_feature_groups_ - num_dense_feature_groups_; ++i) {
    printf("%d ", sparse_feature_group_map_[i]);
  }
  printf("\n");
  #endif
}

544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
std::string GPUTreeLearner::GetBuildLog(const std::string &opts) {
  boost::compute::program program = boost::compute::program::create_with_source(kernel_source_, ctx_);
  try {
    program.build(opts);
  }
  catch (boost::compute::opencl_error &e) {
    auto error_code = e.error_code();
    std::string log("No log available.\n");
    // for other types of failure, build log might not be available; program.build_log() can crash
    if (error_code == CL_INVALID_PROGRAM || error_code == CL_BUILD_PROGRAM_FAILURE) {
      try {
        log = program.build_log();
      }
      catch(...) {
        // Something bad happened. Just return "No log available."
      }
    }
    return log;
  }
  // build is okay, log may contain warnings
  return program.build_log();
}

567
568
569
570
571
572
573
574
575
576
577
578
void GPUTreeLearner::BuildGPUKernels() {
  Log::Info("Compiling OpenCL Kernel with %d bins...", device_bin_size_);
  // destroy any old kernels
  histogram_kernels_.clear();
  histogram_allfeats_kernels_.clear();
  histogram_fulldata_kernels_.clear();
  // create OpenCL kernels for different number of workgroups per feature
  histogram_kernels_.resize(kMaxLogWorkgroupsPerFeature+1);
  histogram_allfeats_kernels_.resize(kMaxLogWorkgroupsPerFeature+1);
  histogram_fulldata_kernels_.resize(kMaxLogWorkgroupsPerFeature+1);
  // currently we don't use constant memory
  int use_constants = 0;
579
  OMP_INIT_EX();
580
  #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided)
581
  for (int i = 0; i <= kMaxLogWorkgroupsPerFeature; ++i) {
582
    OMP_LOOP_EX_BEGIN();
583
584
    boost::compute::program program;
    std::ostringstream opts;
585
    // compile the GPU kernel depending if double precision is used, constant hessian is used, etc.
586
    opts << " -D POWER_FEATURE_WORKGROUPS=" << i
Guolin Ke's avatar
Guolin Ke committed
587
         << " -D USE_CONSTANT_BUF=" << use_constants << " -D USE_DP_FLOAT=" << int(config_->gpu_use_dp)
588
         << " -D CONST_HESSIAN=" << int(share_state_->is_constant_hessian)
589
         << " -cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math";
590
591
592
593
594
595
596
597
    #if GPU_DEBUG >= 1
    std::cout << "Building GPU kernels with options: " << opts.str() << std::endl;
    #endif
    // kernel with indices in an array
    try {
      program = boost::compute::program::build_with_source(kernel_source_, ctx_, opts.str());
    }
    catch (boost::compute::opencl_error &e) {
598
599
600
601
602
      #pragma omp critical
      {
        std::cerr << "Build Options:" << opts.str() << std::endl;
        std::cerr << "Build Log:" << std::endl << GetBuildLog(opts.str()) << std::endl;
        Log::Fatal("Cannot build GPU program: %s", e.what());
603
604
605
      }
    }
    histogram_kernels_[i] = program.create_kernel(kernel_name_);
606

Andrew Ziem's avatar
Andrew Ziem committed
607
    // kernel with all features enabled, with eliminated branches
608
609
610
611
612
    opts << " -D ENABLE_ALL_FEATURES=1";
    try {
      program = boost::compute::program::build_with_source(kernel_source_, ctx_, opts.str());
    }
    catch (boost::compute::opencl_error &e) {
613
614
615
616
617
      #pragma omp critical
      {
        std::cerr << "Build Options:" << opts.str() << std::endl;
        std::cerr << "Build Log:" << std::endl << GetBuildLog(opts.str()) << std::endl;
        Log::Fatal("Cannot build GPU program: %s", e.what());
618
619
620
621
622
623
624
625
626
627
      }
    }
    histogram_allfeats_kernels_[i] = program.create_kernel(kernel_name_);

    // kernel with all data indices (for root node, and assumes that root node always uses all features)
    opts << " -D IGNORE_INDICES=1";
    try {
      program = boost::compute::program::build_with_source(kernel_source_, ctx_, opts.str());
    }
    catch (boost::compute::opencl_error &e) {
628
629
630
631
632
      #pragma omp critical
      {
        std::cerr << "Build Options:" << opts.str() << std::endl;
        std::cerr << "Build Log:" << std::endl << GetBuildLog(opts.str()) << std::endl;
        Log::Fatal("Cannot build GPU program: %s", e.what());
633
634
635
      }
    }
    histogram_fulldata_kernels_[i] = program.create_kernel(kernel_name_);
636
    OMP_LOOP_EX_END();
637
  }
638
  OMP_THROW_EX();
639
640
641
642
643
644
645
646
647
648
  Log::Info("GPU programs have been built");
}

void GPUTreeLearner::SetupKernelArguments() {
  // do nothing if no features can be processed on GPU
  if (!num_dense_feature_groups_) {
    return;
  }
  for (int i = 0; i <= kMaxLogWorkgroupsPerFeature; ++i) {
    // The only argument that needs to be changed later is num_data_
649
    if (share_state_->is_constant_hessian) {
650
      // hessian is passed as a parameter, but it is not available now.
651
652
653
654
655
656
657
658
659
660
      // hessian will be set in BeforeTrain()
      histogram_kernels_[i].set_args(*device_features_, device_feature_masks_, num_data_,
                                         *device_data_indices_, num_data_, device_gradients_, 0.0f,
                                         *device_subhistograms_, *sync_counters_, device_histogram_outputs_);
      histogram_allfeats_kernels_[i].set_args(*device_features_, device_feature_masks_, num_data_,
                                         *device_data_indices_, num_data_, device_gradients_, 0.0f,
                                         *device_subhistograms_, *sync_counters_, device_histogram_outputs_);
      histogram_fulldata_kernels_[i].set_args(*device_features_, device_feature_masks_, num_data_,
                                          *device_data_indices_, num_data_, device_gradients_, 0.0f,
                                          *device_subhistograms_, *sync_counters_, device_histogram_outputs_);
661
    } else {
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
      histogram_kernels_[i].set_args(*device_features_, device_feature_masks_, num_data_,
                                         *device_data_indices_, num_data_, device_gradients_, device_hessians_,
                                         *device_subhistograms_, *sync_counters_, device_histogram_outputs_);
      histogram_allfeats_kernels_[i].set_args(*device_features_, device_feature_masks_, num_data_,
                                         *device_data_indices_, num_data_, device_gradients_, device_hessians_,
                                         *device_subhistograms_, *sync_counters_, device_histogram_outputs_);
      histogram_fulldata_kernels_[i].set_args(*device_features_, device_feature_masks_, num_data_,
                                          *device_data_indices_, num_data_, device_gradients_, device_hessians_,
                                          *device_subhistograms_, *sync_counters_, device_histogram_outputs_);
    }
  }
}

void GPUTreeLearner::InitGPU(int platform_id, int device_id) {
  // Get the max bin size, used for selecting best GPU kernel
  max_num_bin_ = 0;
  #if GPU_DEBUG >= 1
  printf("bin size: ");
  #endif
  for (int i = 0; i < num_feature_groups_; ++i) {
682
683
684
    if (train_data_->IsMultiGroup(i)) {
      continue;
    }
685
686
687
688
689
690
691
692
693
694
695
696
    #if GPU_DEBUG >= 1
    printf("%d, ", train_data_->FeatureGroupNumBin(i));
    #endif
    max_num_bin_ = std::max(max_num_bin_, train_data_->FeatureGroupNumBin(i));
  }
  #if GPU_DEBUG >= 1
  printf("\n");
  #endif
  // initialize GPU
  dev_ = boost::compute::system::default_device();
  if (platform_id >= 0 && device_id >= 0) {
    const std::vector<boost::compute::platform> platforms = boost::compute::system::platforms();
697
    if (static_cast<int>(platforms.size()) > platform_id) {
698
      const std::vector<boost::compute::device> platform_devices = platforms[platform_id].devices();
699
      if (static_cast<int>(platform_devices.size()) > device_id) {
700
701
        Log::Info("Using requested OpenCL platform %d device %d", platform_id, device_id);
        dev_ = platform_devices[device_id];
702
703
704
      }
    }
  }
705
706
  // determine which kernel to use based on the max number of bins
  if (max_num_bin_ <= 16) {
Guolin Ke's avatar
Guolin Ke committed
707
708
    // the +9 skips extra characters ")", newline, "#endif" and newline at the beginning
    kernel_source_ = kernel16_src_ + 9;
709
710
711
    kernel_name_ = "histogram16";
    device_bin_size_ = 16;
    dword_features_ = 8;
712
  } else if (max_num_bin_ <= 64) {
Guolin Ke's avatar
Guolin Ke committed
713
714
    // the +9 skips extra characters ")", newline, "#endif" and newline at the beginning
    kernel_source_ = kernel64_src_ + 9;
715
716
717
    kernel_name_ = "histogram64";
    device_bin_size_ = 64;
    dword_features_ = 4;
718
  } else if (max_num_bin_ <= 256) {
Guolin Ke's avatar
Guolin Ke committed
719
720
    // the +9 skips extra characters ")", newline, "#endif" and newline at the beginning
    kernel_source_ = kernel256_src_ + 9;
721
722
723
    kernel_name_ = "histogram256";
    device_bin_size_ = 256;
    dword_features_ = 4;
724
  } else {
725
726
    Log::Fatal("bin size %d cannot run on GPU", max_num_bin_);
  }
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747

  // ignore the feature groups that contain categorical features when producing warnings about max_bin.
  // these groups may contain larger number of bins due to categorical features, but not due to the setting of max_bin.
  int max_num_bin_no_categorical = 0;
  int cur_feature_group = 0;
  bool categorical_feature_found = false;
  for (int inner_feature_index = 0; inner_feature_index < num_features_; ++inner_feature_index) {
    const int feature_group = train_data_->Feature2Group(inner_feature_index);
    const BinMapper* feature_bin_mapper = train_data_->FeatureBinMapper(inner_feature_index);
    if (feature_bin_mapper->bin_type() == BinType::CategoricalBin) {
      categorical_feature_found = true;
    }
    if (feature_group != cur_feature_group || inner_feature_index == num_features_ - 1) {
      if (!categorical_feature_found) {
        max_num_bin_no_categorical = std::max(max_num_bin_no_categorical, train_data_->FeatureGroupNumBin(cur_feature_group));
      }
      categorical_feature_found = false;
      cur_feature_group = feature_group;
    }
  }
  if (max_num_bin_no_categorical == 65) {
James Lamb's avatar
James Lamb committed
748
    Log::Warning("Setting max_bin to 63 is suggested for best performance");
749
  }
750
  if (max_num_bin_no_categorical == 17) {
James Lamb's avatar
James Lamb committed
751
    Log::Warning("Setting max_bin to 15 is suggested for best performance");
752
753
754
755
756
757
758
759
760
761
  }
  ctx_ = boost::compute::context(dev_);
  queue_ = boost::compute::command_queue(ctx_, dev_);
  Log::Info("Using GPU Device: %s, Vendor: %s", dev_.name().c_str(), dev_.vendor().c_str());
  BuildGPUKernels();
  AllocateGPUMemory();
  // setup GPU kernel arguments after we allocating all the buffers
  SetupKernelArguments();
}

/*!
 * \brief Train one tree; delegates to the serial learner, whose virtual
 *        hooks (BeforeTrain, ConstructHistograms, ...) are overridden here
 *        to offload dense histogram construction to the GPU.
 */
Tree* GPUTreeLearner::Train(const score_t* gradients, const score_t *hessians, bool is_first_tree) {
  return SerialTreeLearner::Train(gradients, hessians, is_first_tree);
}

void GPUTreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) {
  SerialTreeLearner::ResetTrainingDataInner(train_data, is_constant_hessian, reset_multi_val_bin);
768
769
770
771
772
773
774
  num_feature_groups_ = train_data_->num_feature_groups();
  // GPU memory has to been reallocated because data may have been changed
  AllocateGPUMemory();
  // setup GPU kernel arguments after we allocating all the buffers
  SetupKernelArguments();
}

void GPUTreeLearner::ResetIsConstantHessian(bool is_constant_hessian) {
Nikita Titov's avatar
Nikita Titov committed
776
  if (is_constant_hessian != share_state_->is_constant_hessian) {
777
    SerialTreeLearner::ResetIsConstantHessian(is_constant_hessian);
Nikita Titov's avatar
Nikita Titov committed
778
779
    BuildGPUKernels();
    SetupKernelArguments();
780
781
782
  }
}

void GPUTreeLearner::BeforeTrain() {
  #if GPU_DEBUG >= 2
785
  printf("Copying initial full gradients and hessians to device\n");
786
787
788
789
  #endif
  // Copy initial full hessians and gradients to GPU.
  // We start copying as early as possible, instead of at ConstructHistogram().
  if (!use_bagging_ && num_dense_feature_groups_) {
790
    if (!share_state_->is_constant_hessian) {
791
      hessians_future_ = queue_.enqueue_write_buffer_async(device_hessians_, 0, num_data_ * sizeof(score_t), hessians_);
792
    } else {
793
      // setup hessian parameters only
794
      score_t const_hessian = hessians_[0];
795
796
797
798
799
800
801
      for (int i = 0; i <= kMaxLogWorkgroupsPerFeature; ++i) {
        // hessian is passed as a parameter
        histogram_kernels_[i].set_arg(6, const_hessian);
        histogram_allfeats_kernels_[i].set_arg(6, const_hessian);
        histogram_fulldata_kernels_[i].set_arg(6, const_hessian);
      }
    }
802
    gradients_future_ = queue_.enqueue_write_buffer_async(device_gradients_, 0, num_data_ * sizeof(score_t), gradients_);
803
804
805
806
807
808
  }

  SerialTreeLearner::BeforeTrain();

  // use bagging
  if (data_partition_->leaf_count(0) != num_data_ && num_dense_feature_groups_) {
Andrew Ziem's avatar
Andrew Ziem committed
809
810
    // On GPU, we start copying indices, gradients and Hessians now, instead at ConstructHistogram()
    // copy used gradients and Hessians to ordered buffer
811
812
813
814
815
816
817
    const data_size_t* indices = data_partition_->indices();
    data_size_t cnt = data_partition_->leaf_count(0);
    #if GPU_DEBUG > 0
    printf("Using bagging, examples count = %d\n", cnt);
    #endif
    // transfer the indices to GPU
    indices_future_ = boost::compute::copy_async(indices, indices + cnt, device_data_indices_->begin(), queue_);
818
    if (!share_state_->is_constant_hessian) {
819
      #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
820
821
822
823
      for (data_size_t i = 0; i < cnt; ++i) {
        ordered_hessians_[i] = hessians_[indices[i]];
      }
      // transfer hessian to GPU
824
      hessians_future_ = queue_.enqueue_write_buffer_async(device_hessians_, 0, cnt * sizeof(score_t), ordered_hessians_.data());
825
    } else {
826
      // setup hessian parameters only
827
      score_t const_hessian = hessians_[indices[0]];
828
829
830
831
832
833
834
      for (int i = 0; i <= kMaxLogWorkgroupsPerFeature; ++i) {
        // hessian is passed as a parameter
        histogram_kernels_[i].set_arg(6, const_hessian);
        histogram_allfeats_kernels_[i].set_arg(6, const_hessian);
        histogram_fulldata_kernels_[i].set_arg(6, const_hessian);
      }
    }
835
    #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
836
837
838
839
    for (data_size_t i = 0; i < cnt; ++i) {
      ordered_gradients_[i] = gradients_[indices[i]];
    }
    // transfer gradients to GPU
840
    gradients_future_ = queue_.enqueue_write_buffer_async(device_gradients_, 0, cnt * sizeof(score_t), ordered_gradients_.data());
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
  }
}

bool GPUTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) {
  int smaller_leaf;
  data_size_t num_data_in_left_child = GetGlobalDataCountInLeaf(left_leaf);
  data_size_t num_data_in_right_child = GetGlobalDataCountInLeaf(right_leaf);
  // only have root
  if (right_leaf < 0) {
    smaller_leaf = -1;
  } else if (num_data_in_left_child < num_data_in_right_child) {
    smaller_leaf = left_leaf;
  } else {
    smaller_leaf = right_leaf;
  }

Andrew Ziem's avatar
Andrew Ziem committed
857
  // Copy indices, gradients and Hessians as early as possible
858
859
860
861
862
863
864
865
866
  if (smaller_leaf >= 0 && num_dense_feature_groups_) {
    // only need to initialize for smaller leaf
    // Get leaf boundary
    const data_size_t* indices = data_partition_->indices();
    data_size_t begin = data_partition_->leaf_begin(smaller_leaf);
    data_size_t end = begin + data_partition_->leaf_count(smaller_leaf);

    // copy indices to the GPU:
    #if GPU_DEBUG >= 2
Andrew Ziem's avatar
Andrew Ziem committed
867
    Log::Info("Copying indices, gradients and Hessians to GPU...");
868
    printf("Indices size %d being copied (left = %d, right = %d)\n", end - begin, num_data_in_left_child, num_data_in_right_child);
869
870
871
    #endif
    indices_future_ = boost::compute::copy_async(indices + begin, indices + end, device_data_indices_->begin(), queue_);

872
    if (!share_state_->is_constant_hessian) {
873
      #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
874
875
876
      for (data_size_t i = begin; i < end; ++i) {
        ordered_hessians_[i - begin] = hessians_[indices[i]];
      }
Andrew Ziem's avatar
Andrew Ziem committed
877
      // copy ordered Hessians to the GPU:
878
      hessians_future_ = queue_.enqueue_write_buffer_async(device_hessians_, 0, (end - begin) * sizeof(score_t), ptr_pinned_hessians_);
879
880
    }

881
    #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
882
883
884
885
    for (data_size_t i = begin; i < end; ++i) {
      ordered_gradients_[i - begin] = gradients_[indices[i]];
    }
    // copy ordered gradients to the GPU:
886
    gradients_future_ = queue_.enqueue_write_buffer_async(device_gradients_, 0, (end - begin) * sizeof(score_t), ptr_pinned_gradients_);
887
888

    #if GPU_DEBUG >= 2
Andrew Ziem's avatar
Andrew Ziem committed
889
    Log::Info("Gradients/Hessians/indices copied to device with size %d", end - begin);
890
891
892
893
894
895
896
897
    #endif
  }
  return SerialTreeLearner::BeforeFindBestSplit(tree, left_leaf, right_leaf);
}

bool GPUTreeLearner::ConstructGPUHistogramsAsync(
  const std::vector<int8_t>& is_feature_used,
  const data_size_t* data_indices, data_size_t num_data,
898
899
  const score_t* gradients, const score_t* hessians,
  score_t* ordered_gradients, score_t* ordered_hessians) {
900
901
902
903
904
905
906
  if (num_data <= 0) {
    return false;
  }
  // do nothing if no features can be processed on GPU
  if (!num_dense_feature_groups_) {
    return false;
  }
907

908
909
910
911
912
913
914
  // copy data indices if it is not null
  if (data_indices != nullptr && num_data != num_data_) {
    indices_future_ = boost::compute::copy_async(data_indices, data_indices + num_data, device_data_indices_->begin(), queue_);
  }
  // generate and copy ordered_gradients if gradients is not null
  if (gradients != nullptr) {
    if (num_data != num_data_) {
915
      #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
916
917
918
      for (data_size_t i = 0; i < num_data; ++i) {
        ordered_gradients[i] = gradients[data_indices[i]];
      }
919
      gradients_future_ = queue_.enqueue_write_buffer_async(device_gradients_, 0, num_data * sizeof(score_t), ptr_pinned_gradients_);
920
    } else {
921
      gradients_future_ = queue_.enqueue_write_buffer_async(device_gradients_, 0, num_data * sizeof(score_t), gradients);
922
923
    }
  }
Andrew Ziem's avatar
Andrew Ziem committed
924
  // generate and copy ordered_hessians if Hessians is not null
925
  if (hessians != nullptr && !share_state_->is_constant_hessian) {
926
    if (num_data != num_data_) {
927
      #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
928
929
930
      for (data_size_t i = 0; i < num_data; ++i) {
        ordered_hessians[i] = hessians[data_indices[i]];
      }
931
      hessians_future_ = queue_.enqueue_write_buffer_async(device_hessians_, 0, num_data * sizeof(score_t), ptr_pinned_hessians_);
932
    } else {
933
      hessians_future_ = queue_.enqueue_write_buffer_async(device_hessians_, 0, num_data * sizeof(score_t), hessians);
934
935
936
937
    }
  }
  // converted indices in is_feature_used to feature-group indices
  std::vector<int8_t> is_feature_group_used(num_feature_groups_, 0);
938
  #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1024) if (num_features_ >= 2048)
939
  for (int i = 0; i < num_features_; ++i) {
940
    if (is_feature_used[i]) {
941
942
943
944
945
      is_feature_group_used[train_data_->Feature2Group(i)] = 1;
    }
  }
  // construct the feature masks for dense feature-groups
  int used_dense_feature_groups = 0;
946
  #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1024) reduction(+:used_dense_feature_groups) if (num_dense_feature_groups_ >= 2048)
947
948
949
950
  for (int i = 0; i < num_dense_feature_groups_; ++i) {
    if (is_feature_group_used[dense_feature_group_map_[i]]) {
      feature_masks_[i] = 1;
      ++used_dense_feature_groups;
951
    } else {
952
953
954
955
956
957
958
959
960
      feature_masks_[i] = 0;
    }
  }
  bool use_all_features = used_dense_feature_groups == num_dense_feature_groups_;
  // if no feature group is used, just return and do not use GPU
  if (used_dense_feature_groups == 0) {
    return false;
  }
#if GPU_DEBUG >= 1
961
  printf("Feature masks:\n");
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
  for (unsigned int i = 0; i < feature_masks_.size(); ++i) {
    printf("%d ", feature_masks_[i]);
  }
  printf("\n");
  printf("%d feature groups, %d used, %d\n", num_dense_feature_groups_, used_dense_feature_groups, use_all_features);
#endif
  // if not all feature groups are used, we need to transfer the feature mask to GPU
  // otherwise, we will use a specialized GPU kernel with all feature groups enabled
  if (!use_all_features) {
    queue_.enqueue_write_buffer(device_feature_masks_, 0, num_dense_feature4_ * dword_features_, ptr_pinned_feature_masks_);
  }
  // All data have been prepared, now run the GPU kernel
  GPUHistogram(num_data, use_all_features);
  return true;
}

void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) {
  std::vector<int8_t> is_sparse_feature_used(num_features_, 0);
  std::vector<int8_t> is_dense_feature_used(num_features_, 0);
981
  #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
982
  for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
983
    if (!col_sampler_.is_feature_used_bytree()[feature_index]) continue;
984
    if (!is_feature_used[feature_index]) continue;
985
    if (train_data_->IsMultiGroup(train_data_->Feature2Group(feature_index))) {
986
      is_sparse_feature_used[feature_index] = 1;
987
    } else {
988
989
990
991
      is_dense_feature_used[feature_index] = 1;
    }
  }
  // construct smaller leaf
992
  hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset;
Andrew Ziem's avatar
Andrew Ziem committed
993
  // ConstructGPUHistogramsAsync will return true if there are available feature groups dispatched to GPU
994
995
996
997
998
  bool is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used,
    nullptr, smaller_leaf_splits_->num_data_in_leaf(),
    nullptr, nullptr,
    nullptr, nullptr);
  // then construct sparse features on CPU
999
  train_data_->ConstructHistograms<false, 0>(is_sparse_feature_used,
1000
1001
    smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(),
    gradients_, hessians_,
1002
1003
    ordered_gradients_.data(), ordered_hessians_.data(),
    share_state_.get(),
1004
1005
1006
    ptr_smaller_leaf_hist_data);
  // wait for GPU to finish, only if GPU is actually used
  if (is_gpu_used) {
Guolin Ke's avatar
Guolin Ke committed
1007
    if (config_->gpu_use_dp) {
1008
      // use double precision
1009
      WaitAndGetHistograms<hist_t>(ptr_smaller_leaf_hist_data);
1010
    } else {
1011
      // use single precision
1012
      WaitAndGetHistograms<gpu_hist_t>(ptr_smaller_leaf_hist_data);
1013
1014
1015
    }
  }

Andrew Ziem's avatar
Andrew Ziem committed
1016
  // Compare GPU histogram with CPU histogram, useful for debugging GPU code problem
1017
1018
1019
1020
1021
1022
1023
  // #define GPU_DEBUG_COMPARE
  #ifdef GPU_DEBUG_COMPARE
  for (int i = 0; i < num_dense_feature_groups_; ++i) {
    if (!feature_masks_[i])
      continue;
    int dense_feature_group_index = dense_feature_group_map_[i];
    size_t size = train_data_->FeatureGroupNumBin(dense_feature_group_index);
1024
    hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset;
1025
1026
    hist_t* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index) * 2;
    hist_t* gpu_histogram = new hist_t[size * 2];
1027
1028
    data_size_t num_data = smaller_leaf_splits_->num_data_in_leaf();
    printf("Comparing histogram for feature %d size %d, %lu bins\n", dense_feature_group_index, num_data, size);
1029
1030
    std::copy(current_histogram, current_histogram + size * 2, gpu_histogram);
    std::memset(current_histogram, 0, size * sizeof(hist_t) * 2);
1031
1032
1033
1034
    if (train_data_->FeatureGroupBin(dense_feature_group_index) == nullptr) {
      continue;
    }
    if (num_data != num_data_) {
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
      train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram(
        smaller_leaf_splits_->data_indices(),
        0,
        num_data,
        ordered_gradients_.data(),
        ordered_hessians_.data(),
        current_histogram);
    } else {
      train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram(
        0,
        num_data,
        gradients_,
        hessians_,
        current_histogram);
    }
1050
    CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index);
1051
    std::copy(gpu_histogram, gpu_histogram + size * 2, current_histogram);
1052
1053
1054
1055
1056
1057
    delete [] gpu_histogram;
  }
  #endif

  if (larger_leaf_histogram_array_ != nullptr && !use_subtract) {
    // construct larger leaf
1058
    hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - kHistOffset;
1059
1060
1061
1062
1063
    is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used,
      larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(),
      gradients_, hessians_,
      ordered_gradients_.data(), ordered_hessians_.data());
    // then construct sparse features on CPU
1064
    train_data_->ConstructHistograms<false, 0>(is_sparse_feature_used,
1065
1066
      larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(),
      gradients_, hessians_,
1067
1068
      ordered_gradients_.data(), ordered_hessians_.data(),
      share_state_.get(),
1069
1070
1071
      ptr_larger_leaf_hist_data);
    // wait for GPU to finish, only if GPU is actually used
    if (is_gpu_used) {
Guolin Ke's avatar
Guolin Ke committed
1072
      if (config_->gpu_use_dp) {
1073
        // use double precision
1074
        WaitAndGetHistograms<hist_t>(ptr_larger_leaf_hist_data);
1075
      } else {
1076
        // use single precision
1077
        WaitAndGetHistograms<gpu_hist_t>(ptr_larger_leaf_hist_data);
1078
1079
1080
1081
1082
      }
    }
  }
}

/*!
 * \brief Find best splits via the serial learner; at GPU_DEBUG >= 3 also
 *        dump the per-feature histograms of both leaves for inspection.
 */
void GPUTreeLearner::FindBestSplits(const Tree* tree) {
  SerialTreeLearner::FindBestSplits(tree);

#if GPU_DEBUG >= 3
  for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
    if (!col_sampler_.is_feature_used_bytree()[feature_index]) continue;
    // mirror the serial learner's pruning: a non-splittable parent histogram
    // means the child cannot be splittable either
    if (parent_leaf_histogram_array_ != nullptr
        && !parent_leaf_histogram_array_[feature_index].is_splittable()) {
      smaller_leaf_histogram_array_[feature_index].set_is_splittable(false);
      continue;
    }
    size_t bin_size = train_data_->FeatureNumBin(feature_index) + 1;
    printf("Feature %d smaller leaf:\n", feature_index);
    PrintHistograms(smaller_leaf_histogram_array_[feature_index].RawData() - kHistOffset, bin_size);
    if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->leaf_index() < 0) {
      continue;
    }
    printf("Feature %d larger leaf:\n", feature_index);
    PrintHistograms(larger_leaf_histogram_array_[feature_index].RawData() - kHistOffset, bin_size);
  }
#endif
}

/*!
 * \brief Apply the chosen split, then (single-machine only) sanity-check the
 *        GPU histogram by comparing the recorded split counts against the
 *        leaf partitions, fixing up smaller/larger leaf assignment when the
 *        left child is not the smaller one.
 */
void GPUTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) {
  const SplitInfo& best_split_info = best_split_per_leaf_[best_Leaf];
#if GPU_DEBUG >= 2
  printf("Splitting leaf %d with feature %d thresh %d gain %f stat %f %f %f %f\n", best_Leaf, best_split_info.feature, best_split_info.threshold, best_split_info.gain, best_split_info.left_sum_gradient, best_split_info.right_sum_gradient, best_split_info.left_sum_hessian, best_split_info.right_sum_hessian);
#endif
  SerialTreeLearner::Split(tree, best_Leaf, left_leaf, right_leaf);
  if (Network::num_machines() == 1) {
    // do some sanity check for the GPU algorithm
    if (best_split_info.left_count < best_split_info.right_count) {
      if ((best_split_info.left_count != smaller_leaf_splits_->num_data_in_leaf()) ||
          (best_split_info.right_count!= larger_leaf_splits_->num_data_in_leaf())) {
        Log::Fatal("Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf());
      }
    } else {
      // left child is the larger one: re-init the split objects accordingly
      smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian, best_split_info.right_output);
      larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian, best_split_info.left_output);
      if ((best_split_info.left_count != larger_leaf_splits_->num_data_in_leaf()) ||
          (best_split_info.right_count!= smaller_leaf_splits_->num_data_in_leaf())) {
        Log::Fatal("Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf());
      }
    }
  }
}

}   // namespace LightGBM
#endif  // USE_GPU