cuda_histogram_constructor.cpp 7.93 KB
Newer Older
1
2
3
4
5
6
/*!
 * Copyright (c) 2021 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for
 * license information.
 */

7
#if defined(USE_CUDA) || defined(USE_ROCM)
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22

#include "cuda_histogram_constructor.hpp"

#include <algorithm>

namespace LightGBM {

CUDAHistogramConstructor::CUDAHistogramConstructor(
  const Dataset* train_data,
  const int num_leaves,
  const int num_threads,
  const std::vector<uint32_t>& feature_hist_offsets,
  const int min_data_in_leaf,
  const double min_sum_hessian_in_leaf,
  const int gpu_device_id,
23
24
25
  const bool gpu_use_dp,
  const bool use_quantized_grad,
  const int num_grad_quant_bins):
26
27
28
29
30
31
32
  num_data_(train_data->num_data()),
  num_features_(train_data->num_features()),
  num_leaves_(num_leaves),
  num_threads_(num_threads),
  min_data_in_leaf_(min_data_in_leaf),
  min_sum_hessian_in_leaf_(min_sum_hessian_in_leaf),
  gpu_device_id_(gpu_device_id),
33
34
35
  gpu_use_dp_(gpu_use_dp),
  use_quantized_grad_(use_quantized_grad),
  num_grad_quant_bins_(num_grad_quant_bins) {
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
  InitFeatureMetaInfo(train_data, feature_hist_offsets);
  cuda_row_data_.reset(nullptr);
}

CUDAHistogramConstructor::~CUDAHistogramConstructor() {
  gpuAssert(cudaStreamDestroy(cuda_stream_), __FILE__, __LINE__);
}

void CUDAHistogramConstructor::InitFeatureMetaInfo(const Dataset* train_data, const std::vector<uint32_t>& feature_hist_offsets) {
  need_fix_histogram_features_.clear();
  need_fix_histogram_features_num_bin_aligend_.clear();
  feature_num_bins_.clear();
  feature_most_freq_bins_.clear();
  for (int feature_index = 0; feature_index < train_data->num_features(); ++feature_index) {
    const BinMapper* bin_mapper = train_data->FeatureBinMapper(feature_index);
    const uint32_t most_freq_bin = bin_mapper->GetMostFreqBin();
    if (most_freq_bin != 0) {
      need_fix_histogram_features_.emplace_back(feature_index);
      uint32_t num_bin_ref = static_cast<uint32_t>(bin_mapper->num_bin()) - 1;
      uint32_t num_bin_aligned = 1;
      while (num_bin_ref > 0) {
        num_bin_aligned <<= 1;
        num_bin_ref >>= 1;
      }
      need_fix_histogram_features_num_bin_aligend_.emplace_back(num_bin_aligned);
    }
    feature_num_bins_.emplace_back(static_cast<uint32_t>(bin_mapper->num_bin()));
    feature_most_freq_bins_.emplace_back(most_freq_bin);
  }
  feature_hist_offsets_.clear();
  for (size_t i = 0; i < feature_hist_offsets.size(); ++i) {
    feature_hist_offsets_.emplace_back(feature_hist_offsets[i]);
  }
  if (feature_hist_offsets.empty()) {
    num_total_bin_ = 0;
  } else {
    num_total_bin_ = static_cast<int>(feature_hist_offsets.back());
  }
}

void CUDAHistogramConstructor::BeforeTrain(const score_t* gradients, const score_t* hessians) {
  cuda_gradients_ = gradients;
  cuda_hessians_ = hessians;
79
  cuda_hist_.SetValue(0);
80
81
82
}

void CUDAHistogramConstructor::Init(const Dataset* train_data, TrainingShareStates* share_state) {
83
84
  cuda_hist_.Resize(static_cast<size_t>(num_total_bin_ * 2 * num_leaves_));
  cuda_hist_.SetValue(0);
85

86
87
88
  cuda_feature_num_bins_.InitFromHostVector(feature_num_bins_);
  cuda_feature_hist_offsets_.InitFromHostVector(feature_hist_offsets_);
  cuda_feature_most_freq_bins_.InitFromHostVector(feature_most_freq_bins_);
89
90
91
92
93
94

  cuda_row_data_.reset(new CUDARowData(train_data, share_state, gpu_device_id_, gpu_use_dp_));
  cuda_row_data_->Init(train_data, share_state);

  CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_stream_));

95
96
  cuda_need_fix_histogram_features_.InitFromHostVector(need_fix_histogram_features_);
  cuda_need_fix_histogram_features_num_bin_aligned_.InitFromHostVector(need_fix_histogram_features_num_bin_aligend_);
97
98
99
100

  if (cuda_row_data_->NumLargeBinPartition() > 0) {
    int grid_dim_x = 0, grid_dim_y = 0, block_dim_x = 0, block_dim_y = 0;
    CalcConstructHistogramKernelDim(&grid_dim_x, &grid_dim_y, &block_dim_x, &block_dim_y, num_data_);
101
102
103
104
105
106
107
108
109
110
111
112
    const size_t buffer_size = static_cast<size_t>(grid_dim_y) * static_cast<size_t>(num_total_bin_);
    if (!use_quantized_grad_) {
      if (gpu_use_dp_) {
        // need to double the size of histogram buffer in global memory when using double precision in histogram construction
        cuda_hist_buffer_.Resize(buffer_size * 4);
      } else {
        cuda_hist_buffer_.Resize(buffer_size * 2);
      }
    } else {
      // use only half the size of histogram buffer in global memory when quantized training since each gradient and hessian takes only 2 bytes
      cuda_hist_buffer_.Resize(buffer_size);
    }
113
  }
114
  hist_buffer_for_num_bit_change_.Resize(num_total_bin_ * 2);
115
116
117
118
}

void CUDAHistogramConstructor::ConstructHistogramForLeaf(
  const CUDALeafSplitsStruct* cuda_smaller_leaf_splits,
119
  const CUDALeafSplitsStruct* /*cuda_larger_leaf_splits*/,
120
121
122
  const data_size_t num_data_in_smaller_leaf,
  const data_size_t num_data_in_larger_leaf,
  const double sum_hessians_in_smaller_leaf,
123
124
  const double sum_hessians_in_larger_leaf,
  const uint8_t num_bits_in_histogram_bins) {
125
126
127
128
  if ((num_data_in_smaller_leaf <= min_data_in_leaf_ || sum_hessians_in_smaller_leaf <= min_sum_hessian_in_leaf_) &&
    (num_data_in_larger_leaf <= min_data_in_leaf_ || sum_hessians_in_larger_leaf <= min_sum_hessian_in_leaf_)) {
    return;
  }
129
  LaunchConstructHistogramKernel(cuda_smaller_leaf_splits, num_data_in_smaller_leaf, num_bits_in_histogram_bins);
130
  SynchronizeCUDADevice(__FILE__, __LINE__);
131
132
133
134
135
136
137
138
139
}

void CUDAHistogramConstructor::SubtractHistogramForLeaf(
  const CUDALeafSplitsStruct* cuda_smaller_leaf_splits,
  const CUDALeafSplitsStruct* cuda_larger_leaf_splits,
  const bool use_quantized_grad,
  const uint8_t parent_num_bits_in_histogram_bins,
  const uint8_t smaller_num_bits_in_histogram_bins,
  const uint8_t larger_num_bits_in_histogram_bins) {
140
  global_timer.Start("CUDAHistogramConstructor::ConstructHistogramForLeaf::LaunchSubtractHistogramKernel");
141
142
  LaunchSubtractHistogramKernel(cuda_smaller_leaf_splits, cuda_larger_leaf_splits, use_quantized_grad,
                                parent_num_bits_in_histogram_bins, smaller_num_bits_in_histogram_bins, larger_num_bits_in_histogram_bins);
143
144
145
146
147
148
149
150
151
152
  global_timer.Stop("CUDAHistogramConstructor::ConstructHistogramForLeaf::LaunchSubtractHistogramKernel");
}

void CUDAHistogramConstructor::CalcConstructHistogramKernelDim(
  int* grid_dim_x,
  int* grid_dim_y,
  int* block_dim_x,
  int* block_dim_y,
  const data_size_t num_data_in_smaller_leaf) {
  *block_dim_x = cuda_row_data_->max_num_column_per_partition();
153
  *block_dim_y = NUM_THREADS_PER_BLOCK / cuda_row_data_->max_num_column_per_partition();
154
155
156
157
158
159
160
161
162
163
  *grid_dim_x = cuda_row_data_->num_feature_partitions();
  *grid_dim_y = std::max(min_grid_dim_y_,
    ((num_data_in_smaller_leaf + NUM_DATA_PER_THREAD - 1) / NUM_DATA_PER_THREAD + (*block_dim_y) - 1) / (*block_dim_y));
}

void CUDAHistogramConstructor::ResetTrainingData(const Dataset* train_data, TrainingShareStates* share_states) {
  num_data_ = train_data->num_data();
  num_features_ = train_data->num_features();
  InitFeatureMetaInfo(train_data, share_states->feature_hist_offsets());

164
165
166
167
168
  cuda_hist_.Resize(static_cast<size_t>(num_total_bin_ * 2 * num_leaves_));
  cuda_hist_.SetValue(0);
  cuda_feature_num_bins_.InitFromHostVector(feature_num_bins_);
  cuda_feature_hist_offsets_.InitFromHostVector(feature_hist_offsets_);
  cuda_feature_most_freq_bins_.InitFromHostVector(feature_most_freq_bins_);
169
170
171
172

  cuda_row_data_.reset(new CUDARowData(train_data, share_states, gpu_device_id_, gpu_use_dp_));
  cuda_row_data_->Init(train_data, share_states);

173
174
  cuda_need_fix_histogram_features_.InitFromHostVector(need_fix_histogram_features_);
  cuda_need_fix_histogram_features_num_bin_aligned_.InitFromHostVector(need_fix_histogram_features_num_bin_aligend_);
175
176
177
178
179
180
181
}

void CUDAHistogramConstructor::ResetConfig(const Config* config) {
  num_threads_ = OMP_NUM_THREADS();
  num_leaves_ = config->num_leaves;
  min_data_in_leaf_ = config->min_data_in_leaf;
  min_sum_hessian_in_leaf_ = config->min_sum_hessian_in_leaf;
182
183
  cuda_hist_.Resize(static_cast<size_t>(num_total_bin_ * 2 * num_leaves_));
  cuda_hist_.SetValue(0);
184
185
186
187
}

}  // namespace LightGBM

188
#endif  // USE_CUDA || USE_ROCM