/*!
 * Copyright (c) 2020 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for
 * license information.
 */
#ifndef LIGHTGBM_OBJECTIVE_RANK_OBJECTIVE_HPP_
#define LIGHTGBM_OBJECTIVE_RANK_OBJECTIVE_HPP_

9
10
11
#include <LightGBM/metric.h>
#include <LightGBM/objective_function.h>

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <iomanip>
#include <limits>
#include <sstream>
#include <string>
#include <vector>

namespace LightGBM {
21
22
23
24
25
26
27

/*!
 * \brief Objective function for Ranking
 */
class RankingObjective : public ObjectiveFunction {
 public:
  explicit RankingObjective(const Config& config)
28
29
30
31
      : seed_(config.objective_seed) {
    learning_rate_ = config.learning_rate;
    position_bias_regularization_ = config.lambdarank_position_bias_regularization;
  }
32
33
34
35
36
37
38
39
40
41
42

  explicit RankingObjective(const std::vector<std::string>&) : seed_(0) {}

  ~RankingObjective() {}

  void Init(const Metadata& metadata, data_size_t num_data) override {
    num_data_ = num_data;
    // get label
    label_ = metadata.label();
    // get weights
    weights_ = metadata.weights();
43
44
45
46
47
48
    // get positions
    positions_ = metadata.positions();
    // get position ids
    position_ids_ = metadata.position_ids();
    // get number of different position ids
    num_position_ids_ = static_cast<data_size_t>(metadata.num_position_ids());
49
50
51
52
53
54
    // get boundries
    query_boundaries_ = metadata.query_boundaries();
    if (query_boundaries_ == nullptr) {
      Log::Fatal("Ranking tasks require query information");
    }
    num_queries_ = metadata.num_queries();
55
56
    // initialize position bias vectors
    pos_biases_.resize(num_position_ids_, 0.0);
57
58
59
60
61
62
63
64
  }

  void GetGradients(const double* score, score_t* gradients,
                    score_t* hessians) const override {
#pragma omp parallel for schedule(guided)
    for (data_size_t i = 0; i < num_queries_; ++i) {
      const data_size_t start = query_boundaries_[i];
      const data_size_t cnt = query_boundaries_[i + 1] - query_boundaries_[i];
65
66
67
68
69
70
71
      std::vector<double> score_adjusted;
      if (num_position_ids_ > 0) {
        for (data_size_t j = 0; j < cnt; ++j) {
          score_adjusted.push_back(score[start + j] + pos_biases_[positions_[start + j]]);
        }
      }
      GetGradientsForOneQuery(i, cnt, label_ + start, num_position_ids_ > 0 ? score_adjusted.data() : score + start,
72
73
74
75
76
77
78
79
80
81
                              gradients + start, hessians + start);
      if (weights_ != nullptr) {
        for (data_size_t j = 0; j < cnt; ++j) {
          gradients[start + j] =
              static_cast<score_t>(gradients[start + j] * weights_[start + j]);
          hessians[start + j] =
              static_cast<score_t>(hessians[start + j] * weights_[start + j]);
        }
      }
    }
82
83
84
    if (num_position_ids_ > 0) {
      UpdatePositionBiasFactors(gradients, hessians);
    }
85
86
87
88
89
90
91
  }

  virtual void GetGradientsForOneQuery(data_size_t query_id, data_size_t cnt,
                                       const label_t* label,
                                       const double* score, score_t* lambdas,
                                       score_t* hessians) const = 0;

92
93
  virtual void UpdatePositionBiasFactors(const score_t* /*lambdas*/, const score_t* /*hessians*/) const {}

94
  const char* GetName() const override = 0;
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112

  std::string ToString() const override {
    std::stringstream str_buf;
    str_buf << GetName();
    return str_buf.str();
  }

  bool NeedAccuratePrediction() const override { return false; }

 protected:
  int seed_;
  data_size_t num_queries_;
  /*! \brief Number of data */
  data_size_t num_data_;
  /*! \brief Pointer of label */
  const label_t* label_;
  /*! \brief Pointer of weights */
  const label_t* weights_;
113
114
115
116
117
118
  /*! \brief Pointer of positions */
  const data_size_t* positions_;
  /*! \brief Pointer of position IDs */
  const std::string* position_ids_;
  /*! \brief Pointer of label */
  data_size_t num_position_ids_;
Andrew Ziem's avatar
Andrew Ziem committed
119
  /*! \brief Query boundaries */
120
  const data_size_t* query_boundaries_;
121
122
123
124
125
126
  /*! \brief Position bias factors */
  mutable std::vector<label_t> pos_biases_;
  /*! \brief Learning rate to update position bias factors */
  double learning_rate_;
  /*! \brief Position bias regularization */
  double position_bias_regularization_;
127
};
128

Guolin Ke's avatar
Guolin Ke committed
129
/*!
 * \brief Objective function for LambdaRank with NDCG
 */
class LambdarankNDCG : public RankingObjective {
 public:
  explicit LambdarankNDCG(const Config& config)
      : RankingObjective(config),
        sigmoid_(config.sigmoid),
        norm_(config.lambdarank_norm),
        truncation_level_(config.lambdarank_truncation_level) {
    label_gain_ = config.label_gain;
    // initialize DCG calculator
    DCGCalculator::DefaultLabelGain(&label_gain_);
    DCGCalculator::Init(label_gain_);
    sigmoid_table_.clear();
    inverse_max_dcgs_.clear();
    // sigmoid must be positive: it scales the pairwise logistic below
    if (sigmoid_ <= 0.0) {
      Log::Fatal("Sigmoid param %f should be greater than zero", sigmoid_);
    }
  }

  /*! \brief Constructor used when loading a model for prediction. */
  explicit LambdarankNDCG(const std::vector<std::string>& strs)
      : RankingObjective(strs) {}

  ~LambdarankNDCG() {}

  void Init(const Metadata& metadata, data_size_t num_data) override {
    RankingObjective::Init(metadata, num_data);
    DCGCalculator::CheckMetadata(metadata, num_queries_);
    DCGCalculator::CheckLabel(label_, num_data_);
    // cache 1 / maxDCG@k per query so deltaNDCG is a cheap multiply later
    inverse_max_dcgs_.resize(num_queries_);
#pragma omp parallel for schedule(static)
    for (data_size_t i = 0; i < num_queries_; ++i) {
      inverse_max_dcgs_[i] = DCGCalculator::CalMaxDCGAtK(
          truncation_level_, label_ + query_boundaries_[i],
          query_boundaries_[i + 1] - query_boundaries_[i]);

      // queries whose max DCG is 0 (e.g. all labels zero) keep 0, which
      // zeroes out all pairwise deltas below
      if (inverse_max_dcgs_[i] > 0.0) {
        inverse_max_dcgs_[i] = 1.0f / inverse_max_dcgs_[i];
      }
    }
    // construct Sigmoid table to speed up Sigmoid transform
    ConstructSigmoidTable();
  }

  inline void GetGradientsForOneQuery(data_size_t query_id, data_size_t cnt,
                                      const label_t* label, const double* score,
                                      score_t* lambdas,
                                      score_t* hessians) const override {
    // get max DCG on current query
    const double inverse_max_dcg = inverse_max_dcgs_[query_id];
    // initialize with zero
    for (data_size_t i = 0; i < cnt; ++i) {
      lambdas[i] = 0.0f;
      hessians[i] = 0.0f;
    }
    // get sorted indices for scores
    std::vector<data_size_t> sorted_idx(cnt);
    for (data_size_t i = 0; i < cnt; ++i) {
      sorted_idx[i] = i;
    }
    // stable sort keeps ties in original document order (descending score)
    std::stable_sort(
        sorted_idx.begin(), sorted_idx.end(),
        [score](data_size_t a, data_size_t b) { return score[a] > score[b]; });
    // get best and worst score
    const double best_score = score[sorted_idx[0]];
    data_size_t worst_idx = cnt - 1;
    if (worst_idx > 0 && score[sorted_idx[worst_idx]] == kMinScore) {
      worst_idx -= 1;
    }
    const double worst_score = score[sorted_idx[worst_idx]];
    double sum_lambdas = 0.0;
    // start accumulate lambdas by pairs that contain at least one document above truncation level
    for (data_size_t i = 0; i < cnt - 1 && i < truncation_level_; ++i) {
      if (score[sorted_idx[i]] == kMinScore) { continue; }
      for (data_size_t j = i + 1; j < cnt; ++j) {
        if (score[sorted_idx[j]] == kMinScore) { continue; }
        // skip pairs with the same labels
        if (label[sorted_idx[i]] == label[sorted_idx[j]]) { continue; }
        // high_rank = rank of the document with the larger label
        data_size_t high_rank, low_rank;
        if (label[sorted_idx[i]] > label[sorted_idx[j]]) {
          high_rank = i;
          low_rank = j;
        } else {
          high_rank = j;
          low_rank = i;
        }
        const data_size_t high = sorted_idx[high_rank];
        const int high_label = static_cast<int>(label[high]);
        const double high_score = score[high];
        const double high_label_gain = label_gain_[high_label];
        const double high_discount = DCGCalculator::GetDiscount(high_rank);
        const data_size_t low = sorted_idx[low_rank];
        const int low_label = static_cast<int>(label[low]);
        const double low_score = score[low];
        const double low_label_gain = label_gain_[low_label];
        const double low_discount = DCGCalculator::GetDiscount(low_rank);

        const double delta_score = high_score - low_score;

        // get dcg gap
        const double dcg_gap = high_label_gain - low_label_gain;
        // get discount of this pair
        const double paired_discount = fabs(high_discount - low_discount);
        // get delta NDCG (NDCG change if this pair were swapped)
        double delta_pair_NDCG = dcg_gap * paired_discount * inverse_max_dcg;
        // regularize the delta_pair_NDCG by score distance
        if (norm_ && best_score != worst_score) {
          delta_pair_NDCG /= (0.01f + fabs(delta_score));
        }
        // calculate lambda for this pair
        double p_lambda = GetSigmoid(delta_score);
        double p_hessian = p_lambda * (1.0f - p_lambda);
        // update
        p_lambda *= -sigmoid_ * delta_pair_NDCG;
        p_hessian *= sigmoid_ * sigmoid_ * delta_pair_NDCG;
        // push the low-label document down and the high-label document up
        lambdas[low] -= static_cast<score_t>(p_lambda);
        hessians[low] += static_cast<score_t>(p_hessian);
        lambdas[high] += static_cast<score_t>(p_lambda);
        hessians[high] += static_cast<score_t>(p_hessian);
        // lambda is negative, so use minus to accumulate
        sum_lambdas -= 2 * p_lambda;
      }
    }
    // optional normalization keeps the gradient magnitude stable across queries
    if (norm_ && sum_lambdas > 0) {
      double norm_factor = std::log2(1 + sum_lambdas) / sum_lambdas;
      for (data_size_t i = 0; i < cnt; ++i) {
        lambdas[i] = static_cast<score_t>(lambdas[i] * norm_factor);
        hessians[i] = static_cast<score_t>(hessians[i] * norm_factor);
      }
    }
  }

  /*!
   * \brief Table-based approximation of 1 / (1 + exp(score * sigmoid_)).
   * Out-of-range inputs are clamped to the table's end values.
   */
  inline double GetSigmoid(double score) const {
    if (score <= min_sigmoid_input_) {
      // too small, use lower bound
      return sigmoid_table_[0];
    } else if (score >= max_sigmoid_input_) {
      // too large, use upper bound
      return sigmoid_table_[_sigmoid_bins - 1];
    } else {
      return sigmoid_table_[static_cast<size_t>((score - min_sigmoid_input_) *
                                                sigmoid_table_idx_factor_)];
    }
  }

  /*! \brief Precompute the sigmoid lookup table used by GetSigmoid(). */
  void ConstructSigmoidTable() {
    // get boundary (shrink the input range as sigmoid_ grows)
    min_sigmoid_input_ = min_sigmoid_input_ / sigmoid_ / 2;
    max_sigmoid_input_ = -min_sigmoid_input_;
    sigmoid_table_.resize(_sigmoid_bins);
    // get score to bin factor
    sigmoid_table_idx_factor_ =
        _sigmoid_bins / (max_sigmoid_input_ - min_sigmoid_input_);
    // cache
    for (size_t i = 0; i < _sigmoid_bins; ++i) {
      const double score = i / sigmoid_table_idx_factor_ + min_sigmoid_input_;
      sigmoid_table_[i] = 1.0f / (1.0f + std::exp(score * sigmoid_));
    }
  }

  /*!
   * \brief Newton-Raphson update of per-position bias factors from the
   * accumulated lambdas/hessians, with L2 regularization.
   */
  void UpdatePositionBiasFactors(const score_t* lambdas, const score_t* hessians) const override {
    /// get number of threads
    int num_threads = 1;
    #pragma omp parallel
    #pragma omp master
    {
      num_threads = omp_get_num_threads();
    }
    // create per-thread buffers for first and second derivatives of utility w.r.t. position bias factors
    std::vector<double> bias_first_derivatives(num_position_ids_ * num_threads, 0.0);
    std::vector<double> bias_second_derivatives(num_position_ids_ * num_threads, 0.0);
    std::vector<int> instance_counts(num_position_ids_ * num_threads, 0);
    #pragma omp parallel for schedule(guided)
    for (data_size_t i = 0; i < num_data_; i++) {
      // get thread ID; each thread writes its own slice, so no atomics needed
      const int tid = omp_get_thread_num();
      size_t offset = static_cast<size_t>(positions_[i] + tid * num_position_ids_);
      // accumulate first derivatives of utility w.r.t. position bias factors, for each position
      bias_first_derivatives[offset] -= lambdas[i];
      // accumulate second derivatives of utility w.r.t. position bias factors, for each position
      bias_second_derivatives[offset] -= hessians[i];
      instance_counts[offset]++;
    }
    #pragma omp parallel for schedule(guided)
    for (data_size_t i = 0; i < num_position_ids_; i++) {
      double bias_first_derivative = 0.0;
      double bias_second_derivative = 0.0;
      int instance_count = 0;
      // aggregate derivatives from per-thread buffers
      for (int tid = 0; tid < num_threads; tid++) {
        size_t offset = static_cast<size_t>(i + tid * num_position_ids_);
        bias_first_derivative += bias_first_derivatives[offset];
        bias_second_derivative += bias_second_derivatives[offset];
        instance_count += instance_counts[offset];
      }
      // L2 regularization on position bias factors
      bias_first_derivative -= pos_biases_[i] * position_bias_regularization_ * instance_count;
      bias_second_derivative -= position_bias_regularization_ * instance_count;
      // do Newton-Raphson step to update position bias factors
      // (0.001 guards against division by a near-zero second derivative)
      pos_biases_[i] += learning_rate_ * bias_first_derivative / (std::abs(bias_second_derivative) + 0.001);
    }
    LogDebugPositionBiasFactors();
  }

  const char* GetName() const override { return "lambdarank"; }

 protected:
  /*! \brief Print current position bias factors at debug log level. */
  void LogDebugPositionBiasFactors() const {
    std::stringstream message_stream;
    message_stream << std::setw(15) << "position"
      << std::setw(15) << "bias_factor"
      << std::endl;
    Log::Debug(message_stream.str().c_str());
    message_stream.str("");
    for (int i = 0; i < num_position_ids_; ++i) {
      message_stream << std::setw(15) << position_ids_[i]
        << std::setw(15) << pos_biases_[i];
      Log::Debug(message_stream.str().c_str());
      message_stream.str("");
    }
  }
  /*! \brief Sigmoid param */
  double sigmoid_;
  /*! \brief Normalize the lambdas or not */
  bool norm_;
  /*! \brief Truncation position for max DCG */
  int truncation_level_;
  /*! \brief Cache inverse max DCG, speed up calculation */
  std::vector<double> inverse_max_dcgs_;
  /*! \brief Cache result for sigmoid transform to speed up */
  std::vector<double> sigmoid_table_;
  /*! \brief Gains for labels */
  std::vector<double> label_gain_;
  /*! \brief Number of bins in sigmoid table */
  size_t _sigmoid_bins = 1024 * 1024;
  /*! \brief Minimal input of sigmoid table */
  double min_sigmoid_input_ = -50;
  /*! \brief Maximal input of Sigmoid table */
  double max_sigmoid_input_ = 50;
  /*! \brief Factor that converts score to bin in Sigmoid table */
  double sigmoid_table_idx_factor_;
};

373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
/*!
 * \brief Implementation of the learning-to-rank objective function, XE_NDCG
 * [arxiv.org/abs/1911.09798].
 */
class RankXENDCG : public RankingObjective {
 public:
  explicit RankXENDCG(const Config& config) : RankingObjective(config) {}

  explicit RankXENDCG(const std::vector<std::string>& strs)
      : RankingObjective(strs) {}

  ~RankXENDCG() {}

  void Init(const Metadata& metadata, data_size_t num_data) override {
    RankingObjective::Init(metadata, num_data);
    for (data_size_t i = 0; i < num_queries_; ++i) {
      rands_.emplace_back(seed_ + i);
    }
  }

  inline void GetGradientsForOneQuery(data_size_t query_id, data_size_t cnt,
                                      const label_t* label, const double* score,
                                      score_t* lambdas,
                                      score_t* hessians) const override {
397
398
399
400
401
402
403
404
405
    // Skip groups with too few items.
    if (cnt <= 1) {
      for (data_size_t i = 0; i < cnt; ++i) {
        lambdas[i] = 0.0f;
        hessians[i] = 0.0f;
      }
      return;
    }

406
407
408
409
    // Turn scores into a probability distribution using Softmax.
    std::vector<double> rho(cnt, 0.0);
    Common::Softmax(score, rho.data(), cnt);

410
411
412
413
414
    // An auxiliary buffer of parameters used to form the ground-truth
    // distribution and compute the loss.
    std::vector<double> params(cnt);

    double inv_denominator = 0;
415
    for (data_size_t i = 0; i < cnt; ++i) {
416
417
      params[i] = Phi(label[i], rands_[query_id].NextFloat());
      inv_denominator += params[i];
418
419
    }
    // sum_labels will always be positive number
420
421
    inv_denominator = 1. / std::max<double>(kEpsilon, inv_denominator);

422
423
    // Approximate gradients and inverse Hessian.
    // First order terms.
424
    double sum_l1 = 0.0;
425
    for (data_size_t i = 0; i < cnt; ++i) {
426
427
428
429
430
      double term = -params[i] * inv_denominator + rho[i];
      lambdas[i] = static_cast<score_t>(term);
      // Params will now store terms needed to compute second-order terms.
      params[i] = term / (1. - rho[i]);
      sum_l1 += params[i];
431
    }
432
433
434
435
436
437
438
439
440
441
442
443
    // Second order terms.
    double sum_l2 = 0.0;
    for (data_size_t i = 0; i < cnt; ++i) {
      double term = rho[i] * (sum_l1 - params[i]);
      lambdas[i] += static_cast<score_t>(term);
      // Params will now store terms needed to compute third-order terms.
      params[i] = term / (1. - rho[i]);
      sum_l2 += params[i];
    }
    for (data_size_t i = 0; i < cnt; ++i) {
      lambdas[i] += static_cast<score_t>(rho[i] * (sum_l2 - params[i]));
      hessians[i] = static_cast<score_t>(rho[i] * (1.0 - rho[i]));
444
445
446
447
448
449
450
451
452
    }
  }

  double Phi(const label_t l, double g) const {
    return Common::Pow(2, static_cast<int>(l)) - g;
  }

  const char* GetName() const override { return "rank_xendcg"; }

453
 protected:
454
455
456
  mutable std::vector<Random> rands_;
};

Guolin Ke's avatar
Guolin Ke committed
457
}  // namespace LightGBM
458
#endif  // LightGBM_OBJECTIVE_RANK_OBJECTIVE_HPP_