/*!
 * Copyright (c) 2020 IBM Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
#ifndef LIGHTGBM_TREELEARNER_CUDA_TREE_LEARNER_H_
#define LIGHTGBM_TREELEARNER_CUDA_TREE_LEARNER_H_

#include <LightGBM/utils/random.h>
#include <LightGBM/utils/array_args.h>
#include <LightGBM/dataset.h>
#include <LightGBM/feature_group.h>
#include <LightGBM/tree.h>

#include <string>
#include <cmath>
#include <cstdio>
#include <memory>
#include <random>
#include <vector>
#ifdef USE_CUDA
#include <cuda_runtime.h>
#endif

#include "feature_histogram.hpp"
#include "serial_tree_learner.h"
#include "data_partition.hpp"
#include "split_info.hpp"
#include "leaf_splits.hpp"

#ifdef USE_CUDA
#include <LightGBM/cuda/vector_cudahost.h>
#include "cuda_kernel_launcher.h"


using json11::Json;

namespace LightGBM {

/*!
* \brief CUDA-based parallel learning algorithm.
*/
class CUDATreeLearner: public SerialTreeLearner {
 public:
    /*! \brief Construct a CUDA tree learner from the given config. */
    explicit CUDATreeLearner(const Config* tree_config);
    ~CUDATreeLearner();
    /*! \brief Initialize the learner: sets up GPU devices and copies the dense features. */
    void Init(const Dataset* train_data, bool is_constant_hessian) override;
    /*! \brief Re-initialize with new training data (e.g. after dataset changes). */
    void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override;
    /*! \brief Train one tree from the given gradients/hessians.
     *  NOTE(review): declared without `override` unlike the other virtuals — presumably it
     *  still overrides SerialTreeLearner::Train; confirm against the base class declaration. */
    Tree* Train(const score_t* gradients, const score_t *hessians);
    /*!
     * \brief Set bagging data; enables the GPU bagging path only when a strict subset
     *        of rows is used in-place (no materialized subset dataset).
     */
    void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override {
      SerialTreeLearner::SetBaggingData(subset, used_indices, num_data);
      // use_bagging_ is true only when indices are provided without a subset dataset
      // and fewer rows than the full training set are selected.
      if (subset == nullptr && used_indices != nullptr) {
        if (num_data != num_data_) {
          use_bagging_ = true;
          return;
        }
      }
      use_bagging_ = false;
    }

 protected:
    void BeforeTrain() override;
    bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) override;
    void FindBestSplits(const Tree* tree) override;
    void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override;
    void ConstructHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) override;

 private:
    /*! \brief Histogram entry type used on the GPU (single precision). */
    typedef float gpu_hist_t;

    /*!
     * \brief Find the best number of workgroups processing one feature for maximizing efficiency
     * \param leaf_num_data The number of data examples on the current leaf being processed
     * \return Log2 of the best number for workgroups per feature, in range 0...kMaxLogWorkgroupsPerFeature
     */
    int GetNumWorkgroupsPerFeature(data_size_t leaf_num_data);

    /*!
     * \brief Initialize GPU device(s)
     * \param num_gpu Maximum number of GPUs to use
     */
    void InitGPU(int num_gpu);

    /*!
     * \brief Allocate memory for GPU computation // alloc only
     */
    void CountDenseFeatureGroups();  // compute num_dense_feature_group
    void prevAllocateGPUMemory();  // compute CPU-side param calculation & Pin HostMemory
    void AllocateGPUMemory();

    /*!
     * \brief Reset (re-initialize) previously allocated GPU memory
     */
    void ResetGPUMemory();

    /*!
     * \brief Copy dense feature data from CPU to GPU
     */
    void copyDenseFeature();

    /*! 
     * \brief Compute GPU feature histogram for the current leaf.
     *        Indices, gradients and hessians have been copied to the device.
     * \param leaf_num_data Number of data on current leaf
     * \param use_all_features Set to true to not use feature masks, with a faster kernel
     */
    void GPUHistogram(data_size_t leaf_num_data, bool use_all_features);

    /*!
     * \brief Fill the per-device ThreadData slot consumed by the kernel-launcher thread.
     *        Copies device pointers, stream/event handles and launch parameters for
     *        one GPU into thread_data[device_id].
     * \param thread_data Array of per-device thread data (indexed by device_id)
     * \param device_id Which GPU this slot describes
     * \param histogram_size GPU kernel bin size (see histogram_size_)
     * \param leaf_num_data Number of data examples on the current leaf
     * \param use_all_features True to skip feature masking (faster kernel)
     * \param num_workgroups Total number of workgroups to launch
     * \param exp_workgroups_per_feature Log2 of workgroups per feature
     */
    void SetThreadData(ThreadData* thread_data, int device_id, int histogram_size,
                int leaf_num_data, bool use_all_features,
                int num_workgroups, int exp_workgroups_per_feature) {
      ThreadData* td = &thread_data[device_id];
      td->device_id             = device_id;
      td->histogram_size        = histogram_size;
      td->leaf_num_data         = leaf_num_data;
      td->num_data              = num_data_;
      td->use_all_features      = use_all_features;
      td->is_constant_hessian   = share_state_->is_constant_hessian;
      td->num_workgroups        = num_workgroups;
      td->stream                = stream_[device_id];
      td->device_features       = device_features_[device_id];
      td->device_feature_masks  = reinterpret_cast<uint8_t *>(device_feature_masks_[device_id]);
      td->device_data_indices   = device_data_indices_[device_id];
      td->device_gradients      = device_gradients_[device_id];
      td->device_hessians       = device_hessians_[device_id];
      // first hessian value — presumably used by the kernel when hessians are constant
      td->hessians_const        = hessians_[0];
      td->device_subhistograms  = device_subhistograms_[device_id];
      td->sync_counters         = sync_counters_[device_id];
      td->device_histogram_outputs   = device_histogram_outputs_[device_id];
      td->exp_workgroups_per_feature = exp_workgroups_per_feature;

      td->kernel_start           = &(kernel_start_[device_id]);
      td->kernel_wait_obj        = &(kernel_wait_obj_[device_id]);
      td->kernel_input_wait_time = &(kernel_input_wait_time_[device_id]);

      // Output/offset are in bytes: feature groups on this device x features per DWORD
      // x bins per feature x bytes per bin entry.
      size_t output_size = num_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_;
      size_t host_output_offset = offset_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_;
      td->output_size           = output_size;
      td->host_histogram_output = reinterpret_cast<char*>(host_histogram_outputs_) + host_output_offset;
      td->histograms_wait_obj   = &(histograms_wait_obj_[device_id]);
    }

    /*!
     * \brief Wait for GPU kernel execution and read histogram
     * \param leaf_histogram_array Destination of histogram results from GPU
     * \tparam HistType Histogram entry type (single or double precision)
     */
    template <typename HistType>
    void WaitAndGetHistograms(FeatureHistogram* leaf_histogram_array);

    /*!
     * \brief Construct GPU histogram asynchronously. 
     *        Interface is similar to Dataset::ConstructHistograms().
     * \param is_feature_used A predicate vector for enabling each feature
     * \param data_indices Array of data example IDs to be included in histogram, will be copied to GPU.
     *                     Set to nullptr to skip copy to GPU.
     * \param num_data Number of data examples to be included in histogram
     * \return true if GPU kernel is launched, false if GPU is not used
     */
    bool ConstructGPUHistogramsAsync(
      const std::vector<int8_t>& is_feature_used,
      const data_size_t* data_indices, data_size_t num_data);

    /*! brief Log2 of max number of workgroups per feature*/
    const int kMaxLogWorkgroupsPerFeature = 10;  // 2^10
    /*! brief Max total number of workgroups with preallocated workspace (per device).
     *        If we use more than this number of workgroups, we have to reallocate subhistograms */
    std::vector<int> preallocd_max_num_wg_;

    /*! \brief True if bagging is used */
    bool use_bagging_;

    /*! \brief Per-device CUDA streams (one per GPU) */
    std::vector<cudaStream_t> stream_;

    /*! \brief total number of feature-groups */
    int num_feature_groups_;
    /*! \brief total number of dense feature-groups, which will be processed on GPU */
    int num_dense_feature_groups_;
    /*! \brief Number of dense feature-groups assigned to each device */
    std::vector<int> num_gpu_feature_groups_;
    /*! \brief Starting feature-group offset of each device's assignment */
    std::vector<int> offset_gpu_feature_groups_;
    /*! \brief On GPU we read one DWORD (4-byte) of features of one example once.
     *  With bin size > 16, there are 4 features per DWORD.
     *  With bin size <=16, there are 8 features per DWORD.
     */
    int dword_features_;
    /*! \brief Max number of bins of training data, used to determine 
     * which GPU kernel to use */
    int max_num_bin_;
    /*! \brief Used GPU kernel bin size (64, 256) */
    int histogram_size_;
    /*! \brief Bin size actually used on the device (may differ from max_num_bin_) */
    int device_bin_size_;
    /*! \brief Size of histogram bin entry, depending if single or double precision is used */
    size_t hist_bin_entry_sz_;
    /*! \brief Indices of all dense feature-groups */
    std::vector<int> dense_feature_group_map_;
    /*! \brief Indices of all sparse feature-groups */
    std::vector<int> sparse_feature_group_map_;
    /*! \brief GPU memory object holding the training data (one per device) */
    std::vector<uint8_t*> device_features_;
    /*! \brief GPU memory object holding the ordered gradient (one per device) */
    std::vector<score_t*> device_gradients_;
    /*! \brief Pointer to pinned memory of ordered gradient */
    void * ptr_pinned_gradients_ = nullptr;
    /*! \brief GPU memory object holding the ordered hessian (one per device) */
    std::vector<score_t*> device_hessians_;
    /*! \brief Pointer to pinned memory of ordered hessian */
    void * ptr_pinned_hessians_ = nullptr;
    /*! \brief A vector of feature mask. 1 = feature used, 0 = feature not used */
    std::vector<char> feature_masks_;
    /*! \brief GPU memory object holding the feature masks (one per device) */
    std::vector<char*> device_feature_masks_;
    /*! \brief Pointer to pinned memory of feature masks */
    char* ptr_pinned_feature_masks_ = nullptr;
    /*! \brief GPU memory object holding indices of the leaf being processed (one per device) */
    std::vector<data_size_t*> device_data_indices_;
    /*! \brief GPU memory object holding counters for workgroup coordination (one per device) */
    std::vector<int*> sync_counters_;
    /*! \brief GPU memory object holding temporary sub-histograms per workgroup (one per device) */
    std::vector<char*> device_subhistograms_;
    /*! \brief Host memory object for histogram output (GPU will write to Host memory directly) */
    std::vector<void*> device_histogram_outputs_;
    /*! \brief Host memory pointer for histogram outputs */
    void *host_histogram_outputs_;
    /*! CUDA event signaled when input data transfer completes, waited on before kernel execution */
    std::vector<cudaEvent_t> kernel_wait_obj_;
    /*! CUDA event for reading output histograms after kernel execution */
    std::vector<cudaEvent_t> histograms_wait_obj_;
    /*! CUDA asynchronous waiting object for copying indices */
    std::vector<cudaEvent_t> indices_future_;
    /*! Asynchronous waiting object for copying gradients */
    std::vector<cudaEvent_t> gradients_future_;
    /*! Asynchronous waiting object for copying hessians */
    std::vector<cudaEvent_t> hessians_future_;
    /*! Asynchronous waiting object for copying dense features */
    std::vector<cudaEvent_t> features_future_;

    // host-side buffer for converting feature data into feature4 data
    int nthreads_;  // number of Feature4* vector on host4_vecs_
    /*! CUDA events marking kernel start, for timing */
    std::vector<cudaEvent_t> kernel_start_;
    std::vector<float> kernel_time_;  // measure histogram kernel time
    /*! Per-device time spent waiting for kernel input transfers */
    std::vector<std::chrono::duration<double, std::milli>> kernel_input_wait_time_;
    /*! Number of GPUs in use */
    int num_gpu_;
    int allocated_num_data_;  // allocated data instances
    pthread_t **cpu_threads_;  // pthread, 1 cpu thread / gpu
};

}  // namespace LightGBM
#else  // USE_CUDA

// When GPU support is not compiled in, quit with an error message

namespace LightGBM {

/*!
* \brief Stub used when LightGBM is built without CUDA support.
*        Construction always fails with a fatal error telling the user to rebuild.
*/
class CUDATreeLearner: public SerialTreeLearner {
 public:
    // Suppress MSVC warning C4702 (unreachable code) — Log::Fatal does not return,
    // so the compiler flags the implicit end of the constructor as unreachable.
    #pragma warning(disable : 4702)
    explicit CUDATreeLearner(const Config* tree_config) : SerialTreeLearner(tree_config) {
      Log::Fatal("CUDA Tree Learner was not enabled in this build.\n"
                 "Please recompile with CMake option -DUSE_CUDA=1");
    }
};

}  // namespace LightGBM

#endif  // USE_CUDA
#endif  // LIGHTGBM_TREELEARNER_CUDA_TREE_LEARNER_H_