/*!
 * Copyright (c) 2020 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
6
#ifndef LIGHTGBM_SRC_IO_MULTI_VAL_SPARSE_BIN_HPP_
#define LIGHTGBM_SRC_IO_MULTI_VAL_SPARSE_BIN_HPP_

#include <LightGBM/bin.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/utils/threading.h>

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <vector>

namespace LightGBM {

template <typename INDEX_T, typename VAL_T>
20
class MultiValSparseBin : public MultiValBin {
21
 public:
22
23
24
25
26
  explicit MultiValSparseBin(data_size_t num_data, int num_bin,
                             double estimate_element_per_row)
      : num_data_(num_data),
        num_bin_(num_bin),
        estimate_element_per_row_(estimate_element_per_row) {
27
    row_ptr_.resize(num_data_ + 1, 0);
28
    INDEX_T estimate_num_data = static_cast<INDEX_T>(estimate_element_per_row_ * 1.1 * num_data_);
29
    int num_threads = OMP_NUM_THREADS();
30
31
    if (num_threads > 1) {
      t_data_.resize(num_threads - 1);
32
33
34
      for (size_t i = 0; i < t_data_.size(); ++i) {
        t_data_[i].resize(estimate_num_data / num_threads);
      }
35
    }
36
37
    t_size_.resize(num_threads, 0);
    data_.resize(estimate_num_data / num_threads);
38
39
  }

40
  ~MultiValSparseBin() {}
41

42
  data_size_t num_data() const override { return num_data_; }
43

44
  int num_bin() const override { return num_bin_; }
45

46
47
48
49
  double num_element_per_row() const override {
    return estimate_element_per_row_;
  }

50
51
  const std::vector<uint32_t>& offsets() const override { return offsets_; }

52
53
54
  void PushOneRow(int tid, data_size_t idx,
                  const std::vector<uint32_t>& values) override {
    const int pre_alloc_size = 50;
55
    row_ptr_[idx + 1] = static_cast<INDEX_T>(values.size());
56
    if (tid == 0) {
57
58
      if (t_size_[tid] + row_ptr_[idx + 1] >
          static_cast<INDEX_T>(data_.size())) {
59
60
        data_.resize(t_size_[tid] + row_ptr_[idx + 1] * pre_alloc_size);
      }
61
      for (auto val : values) {
62
        data_[t_size_[tid]++] = static_cast<VAL_T>(val);
63
64
      }
    } else {
65
66
      if (t_size_[tid] + row_ptr_[idx + 1] >
          static_cast<INDEX_T>(t_data_[tid - 1].size())) {
67
68
69
        t_data_[tid - 1].resize(t_size_[tid] +
                                row_ptr_[idx + 1] * pre_alloc_size);
      }
70
      for (auto val : values) {
71
        t_data_[tid - 1][t_size_[tid]++] = static_cast<VAL_T>(val);
72
73
74
75
      }
    }
  }

76
  void MergeData(const INDEX_T* sizes) {
77
    Common::FunctionTimer fun_time("MultiValSparseBin::MergeData", global_timer);
78
    for (data_size_t i = 0; i < num_data_; ++i) {
79
80
81
      row_ptr_[i + 1] += row_ptr_[i];
    }
    if (t_data_.size() > 0) {
82
      std::vector<INDEX_T> offsets(1 + t_data_.size());
83
      offsets[0] = sizes[0];
84
      for (size_t tid = 0; tid < t_data_.size() - 1; ++tid) {
85
        offsets[tid + 1] = offsets[tid] + sizes[tid + 1];
86
      }
87
      data_.resize(row_ptr_[num_data_]);
88
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1)
89
      for (int tid = 0; tid < static_cast<int>(t_data_.size()); ++tid) {
90
        std::copy_n(t_data_[tid].data(), sizes[tid + 1],
91
                    data_.data() + offsets[tid]);
92
      }
93
94
    } else {
      data_.resize(row_ptr_[num_data_]);
95
    }
96
97
98
99
100
  }

  void FinishLoad() override {
    MergeData(t_size_.data());
    t_size_.clear();
101
102
103
104
    row_ptr_.shrink_to_fit();
    data_.shrink_to_fit();
    t_data_.clear();
    t_data_.shrink_to_fit();
105
106
107
    // update estimate_element_per_row_ by all data
    estimate_element_per_row_ =
        static_cast<double>(row_ptr_[num_data_]) / num_data_;
108
109
  }

110
  bool IsSparse() override { return true; }
111

Guolin Ke's avatar
Guolin Ke committed
112
  template <bool USE_INDICES, bool USE_PREFETCH, bool ORDERED>
113
114
115
116
  void ConstructHistogramInner(const data_size_t* data_indices,
                               data_size_t start, data_size_t end,
                               const score_t* gradients,
                               const score_t* hessians, hist_t* out) const {
117
    data_size_t i = start;
Guolin Ke's avatar
Guolin Ke committed
118
119
    hist_t* grad = out;
    hist_t* hess = out + 1;
120
    const VAL_T* data_ptr = data_.data();
Guolin Ke's avatar
Guolin Ke committed
121
    if (USE_PREFETCH) {
122
123
124
125
      const data_size_t pf_offset = 32 / sizeof(VAL_T);
      const data_size_t pf_end = end - pf_offset;

      for (; i < pf_end; ++i) {
Guolin Ke's avatar
Guolin Ke committed
126
        const auto idx = USE_INDICES ? data_indices[i] : i;
127
        const auto pf_idx =
Guolin Ke's avatar
Guolin Ke committed
128
129
130
            USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset;
        if (!ORDERED) {
          PREFETCH_T0(gradients + pf_idx);
131
132
133
          PREFETCH_T0(hessians + pf_idx);
        }
        PREFETCH_T0(row_ptr_.data() + pf_idx);
134
        PREFETCH_T0(data_ptr + row_ptr_[pf_idx]);
135
136
        const auto j_start = RowPtr(idx);
        const auto j_end = RowPtr(idx + 1);
137
138
        const score_t gradient = ORDERED ? gradients[i] : gradients[idx];
        const score_t hessian = ORDERED ? hessians[i] : hessians[idx];
139
        for (auto j = j_start; j < j_end; ++j) {
140
141
142
          const auto ti = static_cast<uint32_t>(data_ptr[j]) << 1;
          grad[ti] += gradient;
          hess[ti] += hessian;
143
144
145
146
        }
      }
    }
    for (; i < end; ++i) {
Guolin Ke's avatar
Guolin Ke committed
147
      const auto idx = USE_INDICES ? data_indices[i] : i;
148
149
      const auto j_start = RowPtr(idx);
      const auto j_end = RowPtr(idx + 1);
150
151
      const score_t gradient = ORDERED ? gradients[i] : gradients[idx];
      const score_t hessian = ORDERED ? hessians[i] : hessians[idx];
152
      for (auto j = j_start; j < j_end; ++j) {
153
154
155
        const auto ti = static_cast<uint32_t>(data_ptr[j]) << 1;
        grad[ti] += gradient;
        hess[ti] += hessian;
156
157
158
159
      }
    }
  }

160
161
162
  void ConstructHistogram(const data_size_t* data_indices, data_size_t start,
                          data_size_t end, const score_t* gradients,
                          const score_t* hessians, hist_t* out) const override {
Guolin Ke's avatar
Guolin Ke committed
163
    ConstructHistogramInner<true, true, false>(data_indices, start, end,
Nikita Titov's avatar
Nikita Titov committed
164
                                               gradients, hessians, out);
165
166
167
  }

  void ConstructHistogram(data_size_t start, data_size_t end,
168
169
                          const score_t* gradients, const score_t* hessians,
                          hist_t* out) const override {
Guolin Ke's avatar
Guolin Ke committed
170
171
    ConstructHistogramInner<false, false, false>(
        nullptr, start, end, gradients, hessians, out);
172
173
  }

Guolin Ke's avatar
Guolin Ke committed
174
175
176
177
178
179
  void ConstructHistogramOrdered(const data_size_t* data_indices,
                                 data_size_t start, data_size_t end,
                                 const score_t* gradients,
                                 const score_t* hessians,
                                 hist_t* out) const override {
    ConstructHistogramInner<true, true, true>(data_indices, start, end,
Nikita Titov's avatar
Nikita Titov committed
180
                                              gradients, hessians, out);
181
182
  }

183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
  template <bool USE_INDICES, bool USE_PREFETCH, bool ORDERED, typename PACKED_HIST_T, int HIST_BITS>
  void ConstructHistogramIntInner(const data_size_t* data_indices,
                               data_size_t start, data_size_t end,
                               const score_t* gradients_and_hessians, hist_t* out) const {
    data_size_t i = start;
    PACKED_HIST_T* out_ptr = reinterpret_cast<PACKED_HIST_T*>(out);
    const int16_t* gradients_and_hessians_ptr = reinterpret_cast<const int16_t*>(gradients_and_hessians);
    const VAL_T* data_ptr = data_.data();
    const INDEX_T* row_ptr_base = row_ptr_.data();
    if (USE_PREFETCH) {
      const data_size_t pf_offset = 32 / sizeof(VAL_T);
      const data_size_t pf_end = end - pf_offset;

      for (; i < pf_end; ++i) {
        const auto idx = USE_INDICES ? data_indices[i] : i;
        const auto pf_idx =
            USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset;
        if (!ORDERED) {
          PREFETCH_T0(gradients_and_hessians_ptr + pf_idx);
        }
        PREFETCH_T0(row_ptr_base + pf_idx);
        PREFETCH_T0(data_ptr + row_ptr_[pf_idx]);
        const auto j_start = RowPtr(idx);
        const auto j_end = RowPtr(idx + 1);
        const int16_t gradient_16 = ORDERED ? gradients_and_hessians_ptr[i] : gradients_and_hessians_ptr[idx];
        const PACKED_HIST_T gradient_packed = (HIST_BITS == 8) ? gradient_16 :
          ((static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) |
          static_cast<PACKED_HIST_T>(gradient_16 & 0xff));
        for (auto j = j_start; j < j_end; ++j) {
          const auto ti = static_cast<uint32_t>(data_ptr[j]);
          out_ptr[ti] += gradient_packed;
        }
      }
    }
    for (; i < end; ++i) {
      const auto idx = USE_INDICES ? data_indices[i] : i;
      const auto j_start = RowPtr(idx);
      const auto j_end = RowPtr(idx + 1);
      const int16_t gradient_16 = ORDERED ? gradients_and_hessians_ptr[i] : gradients_and_hessians_ptr[idx];
      const PACKED_HIST_T gradient_packed = (HIST_BITS == 8) ? gradient_16 :
          ((static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) |
          static_cast<PACKED_HIST_T>(gradient_16 & 0xff));
      for (auto j = j_start; j < j_end; ++j) {
        const auto ti = static_cast<uint32_t>(data_ptr[j]);
        out_ptr[ti] += gradient_packed;
      }
    }
  }

  void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start,
                          data_size_t end, const score_t* gradients,
                          const score_t* /*hessians*/, hist_t* out) const override {
    ConstructHistogramIntInner<true, true, false, int64_t, 32>(data_indices, start, end,
                                                               gradients, out);
  }

  void ConstructHistogramInt32(data_size_t start, data_size_t end,
                          const score_t* gradients, const score_t* /*hessians*/,
                          hist_t* out) const override {
    ConstructHistogramIntInner<false, false, false, int64_t, 32>(
        nullptr, start, end, gradients, out);
  }

  void ConstructHistogramOrderedInt32(const data_size_t* data_indices,
                                 data_size_t start, data_size_t end,
                                 const score_t* gradients,
                                 const score_t* /*hessians*/,
                                 hist_t* out) const override {
    ConstructHistogramIntInner<true, true, true, int64_t, 32>(data_indices, start, end,
                                                              gradients, out);
  }

  void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start,
                          data_size_t end, const score_t* gradients,
                          const score_t* /*hessians*/, hist_t* out) const override {
    ConstructHistogramIntInner<true, true, false, int32_t, 16>(data_indices, start, end,
                                                               gradients, out);
  }

  void ConstructHistogramInt16(data_size_t start, data_size_t end,
                          const score_t* gradients, const score_t* /*hessians*/,
                          hist_t* out) const override {
    ConstructHistogramIntInner<false, false, false, int32_t, 16>(
        nullptr, start, end, gradients, out);
  }

  void ConstructHistogramOrderedInt16(const data_size_t* data_indices,
                                 data_size_t start, data_size_t end,
                                 const score_t* gradients,
                                 const score_t* /*hessians*/,
                                 hist_t* out) const override {
    ConstructHistogramIntInner<true, true, true, int32_t, 16>(data_indices, start, end,
                                                              gradients, out);
  }

  void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start,
                          data_size_t end, const score_t* gradients,
                          const score_t* /*hessians*/, hist_t* out) const override {
    ConstructHistogramIntInner<true, true, false, int16_t, 8>(data_indices, start, end,
                                                               gradients, out);
  }

  void ConstructHistogramInt8(data_size_t start, data_size_t end,
                          const score_t* gradients, const score_t* /*hessians*/,
                          hist_t* out) const override {
    ConstructHistogramIntInner<false, false, false, int16_t, 8>(
        nullptr, start, end, gradients, out);
  }

  void ConstructHistogramOrderedInt8(const data_size_t* data_indices,
                                 data_size_t start, data_size_t end,
                                 const score_t* gradients,
                                 const score_t* /*hessians*/,
                                 hist_t* out) const override {
    ConstructHistogramIntInner<true, true, true, int16_t, 8>(data_indices, start, end,
                                                              gradients, out);
  }

301
  MultiValBin* CreateLike(data_size_t num_data, int num_bin, int,
302
303
                          double estimate_element_per_row,
                          const std::vector<uint32_t>& /*offsets*/) const override {
304
305
    return new MultiValSparseBin<INDEX_T, VAL_T>(num_data, num_bin,
                                                 estimate_element_per_row);
306
307
  }

308
  void ReSize(data_size_t num_data, int num_bin, int,
309
              double estimate_element_per_row, const std::vector<uint32_t>& /*offsets*/) override {
310
    num_data_ = num_data;
311
312
    num_bin_ = num_bin;
    estimate_element_per_row_ = estimate_element_per_row;
313
    INDEX_T estimate_num_data =
314
        static_cast<INDEX_T>(estimate_element_per_row_ * 1.1 * num_data_);
315
    size_t npart = 1 + t_data_.size();
316
317
    INDEX_T avg_num_data = static_cast<INDEX_T>(estimate_num_data / npart);
    if (static_cast<INDEX_T>(data_.size()) < avg_num_data) {
318
319
320
      data_.resize(avg_num_data, 0);
    }
    for (size_t i = 0; i < t_data_.size(); ++i) {
321
      if (static_cast<INDEX_T>(t_data_[i].size()) < avg_num_data) {
322
323
        t_data_[i].resize(avg_num_data, 0);
      }
324
    }
325
326
327
    if (num_data_ + 1 > static_cast<data_size_t>(row_ptr_.size())) {
      row_ptr_.resize(num_data_ + 1);
    }
328
329
  }

330
331
332
333
334
335
  template <bool SUBROW, bool SUBCOL>
  void CopyInner(const MultiValBin* full_bin, const data_size_t* used_indices,
                 data_size_t num_used_indices,
                 const std::vector<uint32_t>& lower,
                 const std::vector<uint32_t>& upper,
                 const std::vector<uint32_t>& delta) {
336
    const auto other =
337
        reinterpret_cast<const MultiValSparseBin<INDEX_T, VAL_T>*>(full_bin);
338
    if (SUBROW) {
Nikita Titov's avatar
Nikita Titov committed
339
      CHECK_EQ(num_data_, num_used_indices);
340
    }
Guolin Ke's avatar
Guolin Ke committed
341
342
343
344
    int n_block = 1;
    data_size_t block_size = num_data_;
    Threading::BlockInfo<data_size_t>(static_cast<int>(t_data_.size() + 1),
                                      num_data_, 1024, &n_block, &block_size);
345
    std::vector<INDEX_T> sizes(t_data_.size() + 1, 0);
346
    const int pre_alloc_size = 50;
347
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1)
348
349
350
351
    for (int tid = 0; tid < n_block; ++tid) {
      data_size_t start = tid * block_size;
      data_size_t end = std::min(num_data_, start + block_size);
      auto& buf = (tid == 0) ? data_ : t_data_[tid - 1];
352
      INDEX_T size = 0;
353
      for (data_size_t i = start; i < end; ++i) {
354
355
356
357
        const auto j_start =
            SUBROW ? other->RowPtr(used_indices[i]) : other->RowPtr(i);
        const auto j_end =
            SUBROW ? other->RowPtr(used_indices[i] + 1) : other->RowPtr(i + 1);
358
        if (size + (j_end - j_start) > static_cast<INDEX_T>(buf.size())) {
359
360
361
          buf.resize(size + (j_end - j_start) * pre_alloc_size);
        }
        int k = 0;
362
        const auto pre_size = size;
363
        for (auto j = j_start; j < j_end; ++j) {
364
365
366
367
368
369
370
371
372
373
          const auto val = other->data_[j];
          if (SUBCOL) {
            while (val >= upper[k]) {
              ++k;
            }
            if (val >= lower[k]) {
              buf[size++] = static_cast<VAL_T>(val - delta[k]);
            }
          } else {
            buf[size++] = val;
374
375
376
377
378
379
380
          }
        }
        row_ptr_[i + 1] = size - pre_size;
      }
      sizes[tid] = size;
    }
    MergeData(sizes.data());
381
382
  }

383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
  void CopySubrow(const MultiValBin* full_bin, const data_size_t* used_indices,
                  data_size_t num_used_indices) override {
    CopyInner<true, false>(full_bin, used_indices, num_used_indices,
                           std::vector<uint32_t>(), std::vector<uint32_t>(),
                           std::vector<uint32_t>());
  }

  void CopySubcol(const MultiValBin* full_bin, const std::vector<int>&,
                  const std::vector<uint32_t>& lower,
                  const std::vector<uint32_t>& upper,
                  const std::vector<uint32_t>& delta) override {
    CopyInner<false, true>(full_bin, nullptr, num_data_, lower, upper, delta);
  }

  void CopySubrowAndSubcol(const MultiValBin* full_bin,
                           const data_size_t* used_indices,
                           data_size_t num_used_indices,
                           const std::vector<int>&,
                           const std::vector<uint32_t>& lower,
                           const std::vector<uint32_t>& upper,
                           const std::vector<uint32_t>& delta) override {
    CopyInner<true, true>(full_bin, used_indices, num_used_indices, lower,
                          upper, delta);
  }

408
  inline INDEX_T RowPtr(data_size_t idx) const { return row_ptr_[idx]; }
409

410
  MultiValSparseBin<INDEX_T, VAL_T>* Clone() override;
411

412

413
  #ifdef USE_CUDA
414
415
416
417
418
  const void* GetRowWiseData(uint8_t* bit_type,
    size_t* total_size,
    bool* is_sparse,
    const void** out_data_ptr,
    uint8_t* data_ptr_bit_type) const override;
419
  #endif  // USE_CUDA
420

421
 private:
422
423
  data_size_t num_data_;
  int num_bin_;
424
  double estimate_element_per_row_;
425
  std::vector<VAL_T, Common::AlignmentAllocator<VAL_T, 32>> data_;
426
  std::vector<INDEX_T, Common::AlignmentAllocator<INDEX_T, 32>>
427
428
429
      row_ptr_;
  std::vector<std::vector<VAL_T, Common::AlignmentAllocator<VAL_T, 32>>>
      t_data_;
430
  std::vector<INDEX_T> t_size_;
431
  std::vector<uint32_t> offsets_;
432

433
  MultiValSparseBin(const MultiValSparseBin<INDEX_T, VAL_T>& other)
434
435
436
437
438
      : num_data_(other.num_data_),
        num_bin_(other.num_bin_),
        estimate_element_per_row_(other.estimate_element_per_row_),
        data_(other.data_),
        row_ptr_(other.row_ptr_) {}
439
440
};

441
442
443
template <typename INDEX_T, typename VAL_T>
MultiValSparseBin<INDEX_T, VAL_T>* MultiValSparseBin<INDEX_T, VAL_T>::Clone() {
  return new MultiValSparseBin<INDEX_T, VAL_T>(*this);
444
445
446
}

}  // namespace LightGBM
447

448
#endif  // LIGHTGBM_SRC_IO_MULTI_VAL_SPARSE_BIN_HPP_