rank_objective.hpp 17.4 KB
Newer Older
1
/*!
2
3
4
 * Copyright (c) 2020 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for
 * license information.
5
 */
Guolin Ke's avatar
Guolin Ke committed
6
7
8
#ifndef LIGHTGBM_OBJECTIVE_RANK_OBJECTIVE_HPP_
#define LIGHTGBM_OBJECTIVE_RANK_OBJECTIVE_HPP_

9
10
11
#include <LightGBM/metric.h>
#include <LightGBM/objective_function.h>

12
13
#include <algorithm>
#include <cmath>
Guolin Ke's avatar
Guolin Ke committed
14
15
#include <cstdio>
#include <cstring>
16
17
#include <limits>
#include <string>
Guolin Ke's avatar
Guolin Ke committed
18
19
20
#include <vector>

namespace LightGBM {
21
22
23
24
25
26
27

/*!
 * \brief Objective function for Ranking
 */
class RankingObjective : public ObjectiveFunction {
 public:
  explicit RankingObjective(const Config& config)
28
29
30
31
      : seed_(config.objective_seed) {
    learning_rate_ = config.learning_rate;
    position_bias_regularization_ = config.lambdarank_position_bias_regularization;
  }
32
33
34
35
36
37
38
39
40
41
42

  explicit RankingObjective(const std::vector<std::string>&) : seed_(0) {}

  ~RankingObjective() {}

  void Init(const Metadata& metadata, data_size_t num_data) override {
    num_data_ = num_data;
    // get label
    label_ = metadata.label();
    // get weights
    weights_ = metadata.weights();
43
44
45
46
47
48
    // get positions
    positions_ = metadata.positions();
    // get position ids
    position_ids_ = metadata.position_ids();
    // get number of different position ids
    num_position_ids_ = static_cast<data_size_t>(metadata.num_position_ids());
49
50
51
52
53
54
    // get boundries
    query_boundaries_ = metadata.query_boundaries();
    if (query_boundaries_ == nullptr) {
      Log::Fatal("Ranking tasks require query information");
    }
    num_queries_ = metadata.num_queries();
55
56
    // initialize position bias vectors
    pos_biases_.resize(num_position_ids_, 0.0);
57
58
  }

59
60
61
  void GetGradients(const double* score, const data_size_t num_sampled_queries, const data_size_t* sampled_query_indices,
                    score_t* gradients, score_t* hessians) const override {
    const data_size_t num_queries = (sampled_query_indices == nullptr ? num_queries_ : num_sampled_queries);
62
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided)
63
64
65
66
    for (data_size_t i = 0; i < num_queries; ++i) {
      const data_size_t query_index = (sampled_query_indices == nullptr ? i : sampled_query_indices[i]);
      const data_size_t start = query_boundaries_[query_index];
      const data_size_t cnt = query_boundaries_[query_index + 1] - query_boundaries_[query_index];
67
68
69
70
71
72
      std::vector<double> score_adjusted;
      if (num_position_ids_ > 0) {
        for (data_size_t j = 0; j < cnt; ++j) {
          score_adjusted.push_back(score[start + j] + pos_biases_[positions_[start + j]]);
        }
      }
73
      GetGradientsForOneQuery(query_index, cnt, label_ + start, num_position_ids_ > 0 ? score_adjusted.data() : score + start,
74
75
76
77
78
79
80
81
82
83
                              gradients + start, hessians + start);
      if (weights_ != nullptr) {
        for (data_size_t j = 0; j < cnt; ++j) {
          gradients[start + j] =
              static_cast<score_t>(gradients[start + j] * weights_[start + j]);
          hessians[start + j] =
              static_cast<score_t>(hessians[start + j] * weights_[start + j]);
        }
      }
    }
84
85
86
    if (num_position_ids_ > 0) {
      UpdatePositionBiasFactors(gradients, hessians);
    }
87
88
  }

89
90
91
92
  void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override {
    GetGradients(score, num_queries_, nullptr, gradients, hessians);
  }

93
94
95
96
97
  virtual void GetGradientsForOneQuery(data_size_t query_id, data_size_t cnt,
                                       const label_t* label,
                                       const double* score, score_t* lambdas,
                                       score_t* hessians) const = 0;

98
99
  virtual void UpdatePositionBiasFactors(const score_t* /*lambdas*/, const score_t* /*hessians*/) const {}

100
  const char* GetName() const override = 0;
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118

  std::string ToString() const override {
    std::stringstream str_buf;
    str_buf << GetName();
    return str_buf.str();
  }

  bool NeedAccuratePrediction() const override { return false; }

 protected:
  int seed_;
  data_size_t num_queries_;
  /*! \brief Number of data */
  data_size_t num_data_;
  /*! \brief Pointer of label */
  const label_t* label_;
  /*! \brief Pointer of weights */
  const label_t* weights_;
119
120
121
122
123
124
  /*! \brief Pointer of positions */
  const data_size_t* positions_;
  /*! \brief Pointer of position IDs */
  const std::string* position_ids_;
  /*! \brief Pointer of label */
  data_size_t num_position_ids_;
Andrew Ziem's avatar
Andrew Ziem committed
125
  /*! \brief Query boundaries */
126
  const data_size_t* query_boundaries_;
127
128
129
130
131
132
  /*! \brief Position bias factors */
  mutable std::vector<label_t> pos_biases_;
  /*! \brief Learning rate to update position bias factors */
  double learning_rate_;
  /*! \brief Position bias regularization */
  double position_bias_regularization_;
133
};
134

Guolin Ke's avatar
Guolin Ke committed
135
/*!
Andrew Ziem's avatar
Andrew Ziem committed
136
 * \brief Objective function for LambdaRank with NDCG
137
138
 */
class LambdarankNDCG : public RankingObjective {
Nikita Titov's avatar
Nikita Titov committed
139
 public:
140
141
142
143
144
  explicit LambdarankNDCG(const Config& config)
      : RankingObjective(config),
        sigmoid_(config.sigmoid),
        norm_(config.lambdarank_norm),
        truncation_level_(config.lambdarank_truncation_level) {
Guolin Ke's avatar
Guolin Ke committed
145
    label_gain_ = config.label_gain;
Guolin Ke's avatar
Guolin Ke committed
146
    // initialize DCG calculator
Guolin Ke's avatar
Guolin Ke committed
147
148
    DCGCalculator::DefaultLabelGain(&label_gain_);
    DCGCalculator::Init(label_gain_);
Guolin Ke's avatar
Guolin Ke committed
149
150
    sigmoid_table_.clear();
    inverse_max_dcgs_.clear();
Guolin Ke's avatar
Guolin Ke committed
151
    if (sigmoid_ <= 0.0) {
152
      Log::Fatal("Sigmoid param %f should be greater than zero", sigmoid_);
Guolin Ke's avatar
Guolin Ke committed
153
154
    }
  }
155

156
157
158
159
  explicit LambdarankNDCG(const std::vector<std::string>& strs)
      : RankingObjective(strs) {}

  ~LambdarankNDCG() {}
160

Guolin Ke's avatar
Guolin Ke committed
161
  void Init(const Metadata& metadata, data_size_t num_data) override {
162
    RankingObjective::Init(metadata, num_data);
163
    DCGCalculator::CheckMetadata(metadata, num_queries_);
164
    DCGCalculator::CheckLabel(label_, num_data_);
Guolin Ke's avatar
Guolin Ke committed
165
    inverse_max_dcgs_.resize(num_queries_);
166
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
Guolin Ke's avatar
Guolin Ke committed
167
    for (data_size_t i = 0; i < num_queries_; ++i) {
168
169
170
      inverse_max_dcgs_[i] = DCGCalculator::CalMaxDCGAtK(
          truncation_level_, label_ + query_boundaries_[i],
          query_boundaries_[i + 1] - query_boundaries_[i]);
Guolin Ke's avatar
Guolin Ke committed
171
172
173
174
175

      if (inverse_max_dcgs_[i] > 0.0) {
        inverse_max_dcgs_[i] = 1.0f / inverse_max_dcgs_[i];
      }
    }
Andrew Ziem's avatar
Andrew Ziem committed
176
    // construct Sigmoid table to speed up Sigmoid transform
Guolin Ke's avatar
Guolin Ke committed
177
178
179
    ConstructSigmoidTable();
  }

180
181
182
183
  inline void GetGradientsForOneQuery(data_size_t query_id, data_size_t cnt,
                                      const label_t* label, const double* score,
                                      score_t* lambdas,
                                      score_t* hessians) const override {
Guolin Ke's avatar
Guolin Ke committed
184
    // get max DCG on current query
185
    const double inverse_max_dcg = inverse_max_dcgs_[query_id];
Guolin Ke's avatar
Guolin Ke committed
186
187
188
189
190
191
    // initialize with zero
    for (data_size_t i = 0; i < cnt; ++i) {
      lambdas[i] = 0.0f;
      hessians[i] = 0.0f;
    }
    // get sorted indices for scores
192
    std::vector<data_size_t> sorted_idx(cnt);
Guolin Ke's avatar
Guolin Ke committed
193
    for (data_size_t i = 0; i < cnt; ++i) {
194
      sorted_idx[i] = i;
Guolin Ke's avatar
Guolin Ke committed
195
    }
196
197
198
    std::stable_sort(
        sorted_idx.begin(), sorted_idx.end(),
        [score](data_size_t a, data_size_t b) { return score[a] > score[b]; });
Guolin Ke's avatar
Guolin Ke committed
199
    // get best and worst score
200
201
202
203
204
205
206
    const double best_score = score[sorted_idx[0]];
    data_size_t worst_idx = cnt - 1;
    if (worst_idx > 0 && score[sorted_idx[worst_idx]] == kMinScore) {
      worst_idx -= 1;
    }
    const double worst_score = score[sorted_idx[worst_idx]];
    double sum_lambdas = 0.0;
207
208
209
210
211
212
213
214
215
216
217
218
219
220
    // start accmulate lambdas by pairs that contain at least one document above truncation level
    for (data_size_t i = 0; i < cnt - 1 && i < truncation_level_; ++i) {
      if (score[sorted_idx[i]] == kMinScore) { continue; }
      for (data_size_t j = i + 1; j < cnt; ++j) {
        if (score[sorted_idx[j]] == kMinScore) { continue; }
        // skip pairs with the same labels
        if (label[sorted_idx[i]] == label[sorted_idx[j]]) { continue; }
        data_size_t high_rank, low_rank;
        if (label[sorted_idx[i]] > label[sorted_idx[j]]) {
          high_rank = i;
          low_rank = j;
        } else {
          high_rank = j;
          low_rank = i;
221
        }
222
223
224
225
226
227
        const data_size_t high = sorted_idx[high_rank];
        const int high_label = static_cast<int>(label[high]);
        const double high_score = score[high];
        const double high_label_gain = label_gain_[high_label];
        const double high_discount = DCGCalculator::GetDiscount(high_rank);
        const data_size_t low = sorted_idx[low_rank];
Guolin Ke's avatar
Guolin Ke committed
228
        const int low_label = static_cast<int>(label[low]);
229
        const double low_score = score[low];
230
231
        const double low_label_gain = label_gain_[low_label];
        const double low_discount = DCGCalculator::GetDiscount(low_rank);
Guolin Ke's avatar
Guolin Ke committed
232

233
        const double delta_score = high_score - low_score;
Guolin Ke's avatar
Guolin Ke committed
234
235

        // get dcg gap
236
        const double dcg_gap = high_label_gain - low_label_gain;
Guolin Ke's avatar
Guolin Ke committed
237
        // get discount of this pair
238
        const double paired_discount = fabs(high_discount - low_discount);
Guolin Ke's avatar
Guolin Ke committed
239
        // get delta NDCG
240
        double delta_pair_NDCG = dcg_gap * paired_discount * inverse_max_dcg;
Guolin Ke's avatar
Guolin Ke committed
241
        // regular the delta_pair_NDCG by score distance
242
        if (norm_ && best_score != worst_score) {
243
244
          delta_pair_NDCG /= (0.01f + fabs(delta_score));
        }
Guolin Ke's avatar
Guolin Ke committed
245
        // calculate lambda for this pair
246
        double p_lambda = GetSigmoid(delta_score);
247
        double p_hessian = p_lambda * (1.0f - p_lambda);
Guolin Ke's avatar
Guolin Ke committed
248
        // update
249
250
        p_lambda *= -sigmoid_ * delta_pair_NDCG;
        p_hessian *= sigmoid_ * sigmoid_ * delta_pair_NDCG;
251
252
        lambdas[low] -= static_cast<score_t>(p_lambda);
        hessians[low] += static_cast<score_t>(p_hessian);
253
254
        lambdas[high] += static_cast<score_t>(p_lambda);
        hessians[high] += static_cast<score_t>(p_hessian);
255
256
        // lambda is negative, so use minus to accumulate
        sum_lambdas -= 2 * p_lambda;
Guolin Ke's avatar
Guolin Ke committed
257
258
      }
    }
259
260
261
262
263
264
265
    if (norm_ && sum_lambdas > 0) {
      double norm_factor = std::log2(1 + sum_lambdas) / sum_lambdas;
      for (data_size_t i = 0; i < cnt; ++i) {
        lambdas[i] = static_cast<score_t>(lambdas[i] * norm_factor);
        hessians[i] = static_cast<score_t>(hessians[i] * norm_factor);
      }
    }
Guolin Ke's avatar
Guolin Ke committed
266
267
  }

268
  inline double GetSigmoid(double score) const {
Guolin Ke's avatar
Guolin Ke committed
269
270
271
272
    if (score <= min_sigmoid_input_) {
      // too small, use lower bound
      return sigmoid_table_[0];
    } else if (score >= max_sigmoid_input_) {
273
      // too large, use upper bound
Guolin Ke's avatar
Guolin Ke committed
274
275
      return sigmoid_table_[_sigmoid_bins - 1];
    } else {
276
277
      return sigmoid_table_[static_cast<size_t>((score - min_sigmoid_input_) *
                                                sigmoid_table_idx_factor_)];
Guolin Ke's avatar
Guolin Ke committed
278
279
280
281
282
283
284
    }
  }

  void ConstructSigmoidTable() {
    // get boundary
    min_sigmoid_input_ = min_sigmoid_input_ / sigmoid_ / 2;
    max_sigmoid_input_ = -min_sigmoid_input_;
Guolin Ke's avatar
Guolin Ke committed
285
    sigmoid_table_.resize(_sigmoid_bins);
Guolin Ke's avatar
Guolin Ke committed
286
287
    // get score to bin factor
    sigmoid_table_idx_factor_ =
288
        _sigmoid_bins / (max_sigmoid_input_ - min_sigmoid_input_);
Guolin Ke's avatar
Guolin Ke committed
289
290
    // cache
    for (size_t i = 0; i < _sigmoid_bins; ++i) {
291
      const double score = i / sigmoid_table_idx_factor_ + min_sigmoid_input_;
292
      sigmoid_table_[i] = 1.0f / (1.0f + std::exp(score * sigmoid_));
Guolin Ke's avatar
Guolin Ke committed
293
294
295
    }
  }

296
297
  void UpdatePositionBiasFactors(const score_t* lambdas, const score_t* hessians) const override {
    /// get number of threads
298
    int num_threads = OMP_NUM_THREADS();
299
300
301
302
    // create per-thread buffers for first and second derivatives of utility w.r.t. position bias factors
    std::vector<double> bias_first_derivatives(num_position_ids_ * num_threads, 0.0);
    std::vector<double> bias_second_derivatives(num_position_ids_ * num_threads, 0.0);
    std::vector<int> instance_counts(num_position_ids_ * num_threads, 0);
303
    #pragma omp parallel for schedule(guided) num_threads(num_threads)
304
305
306
307
308
309
310
311
312
313
    for (data_size_t i = 0; i < num_data_; i++) {
      // get thread ID
      const int tid = omp_get_thread_num();
      size_t offset = static_cast<size_t>(positions_[i] + tid * num_position_ids_);
      // accumulate first derivatives of utility w.r.t. position bias factors, for each position
      bias_first_derivatives[offset] -= lambdas[i];
      // accumulate second derivatives of utility w.r.t. position bias factors, for each position
      bias_second_derivatives[offset] -= hessians[i];
      instance_counts[offset]++;
    }
314
    #pragma omp parallel for schedule(guided) num_threads(num_threads)
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
    for (data_size_t i = 0; i < num_position_ids_; i++) {
      double bias_first_derivative = 0.0;
      double bias_second_derivative = 0.0;
      int instance_count = 0;
      // aggregate derivatives from per-thread buffers
      for (int tid = 0; tid < num_threads; tid++) {
        size_t offset = static_cast<size_t>(i + tid * num_position_ids_);
        bias_first_derivative += bias_first_derivatives[offset];
        bias_second_derivative += bias_second_derivatives[offset];
        instance_count += instance_counts[offset];
      }
      // L2 regularization on position bias factors
      bias_first_derivative -= pos_biases_[i] * position_bias_regularization_ * instance_count;
      bias_second_derivative -= position_bias_regularization_ * instance_count;
      // do Newton-Raphson step to update position bias factors
      pos_biases_[i] += learning_rate_ * bias_first_derivative / (std::abs(bias_second_derivative) + 0.001);
    }
    LogDebugPositionBiasFactors();
  }

335
  const char* GetName() const override { return "lambdarank"; }
336

337
 protected:
338
339
340
341
342
343
344
345
346
347
348
349
350
351
  void LogDebugPositionBiasFactors() const {
    std::stringstream message_stream;
    message_stream << std::setw(15) << "position"
      << std::setw(15) << "bias_factor"
      << std::endl;
    Log::Debug(message_stream.str().c_str());
    message_stream.str("");
    for (int i = 0; i < num_position_ids_; ++i) {
      message_stream << std::setw(15) << position_ids_[i]
        << std::setw(15) << pos_biases_[i];
      Log::Debug(message_stream.str().c_str());
      message_stream.str("");
    }
  }
Andrew Ziem's avatar
Andrew Ziem committed
352
  /*! \brief Sigmoid param */
353
  double sigmoid_;
354
355
  /*! \brief Normalize the lambdas or not */
  bool norm_;
356
  /*! \brief Truncation position for max DCG */
357
358
359
  int truncation_level_;
  /*! \brief Cache inverse max DCG, speed up calculation */
  std::vector<double> inverse_max_dcgs_;
Guolin Ke's avatar
Guolin Ke committed
360
  /*! \brief Cache result for sigmoid transform to speed up */
361
  std::vector<double> sigmoid_table_;
362
  /*! \brief Gains for labels */
363
  std::vector<double> label_gain_;
Guolin Ke's avatar
Guolin Ke committed
364
365
366
  /*! \brief Number of bins in simoid table */
  size_t _sigmoid_bins = 1024 * 1024;
  /*! \brief Minimal input of sigmoid table */
367
  double min_sigmoid_input_ = -50;
Andrew Ziem's avatar
Andrew Ziem committed
368
  /*! \brief Maximal input of Sigmoid table */
369
  double max_sigmoid_input_ = 50;
Andrew Ziem's avatar
Andrew Ziem committed
370
  /*! \brief Factor that covert score to bin in Sigmoid table */
371
  double sigmoid_table_idx_factor_;
Guolin Ke's avatar
Guolin Ke committed
372
373
};

374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
/*!
 * \brief Implementation of the learning-to-rank objective function, XE_NDCG
 * [arxiv.org/abs/1911.09798].
 */
class RankXENDCG : public RankingObjective {
 public:
  explicit RankXENDCG(const Config& config) : RankingObjective(config) {}

  explicit RankXENDCG(const std::vector<std::string>& strs)
      : RankingObjective(strs) {}

  ~RankXENDCG() {}

  void Init(const Metadata& metadata, data_size_t num_data) override {
    RankingObjective::Init(metadata, num_data);
    for (data_size_t i = 0; i < num_queries_; ++i) {
      rands_.emplace_back(seed_ + i);
    }
  }

  inline void GetGradientsForOneQuery(data_size_t query_id, data_size_t cnt,
                                      const label_t* label, const double* score,
                                      score_t* lambdas,
                                      score_t* hessians) const override {
398
399
400
401
402
403
404
405
406
    // Skip groups with too few items.
    if (cnt <= 1) {
      for (data_size_t i = 0; i < cnt; ++i) {
        lambdas[i] = 0.0f;
        hessians[i] = 0.0f;
      }
      return;
    }

407
408
409
410
    // Turn scores into a probability distribution using Softmax.
    std::vector<double> rho(cnt, 0.0);
    Common::Softmax(score, rho.data(), cnt);

411
412
413
414
415
    // An auxiliary buffer of parameters used to form the ground-truth
    // distribution and compute the loss.
    std::vector<double> params(cnt);

    double inv_denominator = 0;
416
    for (data_size_t i = 0; i < cnt; ++i) {
417
418
      params[i] = Phi(label[i], rands_[query_id].NextFloat());
      inv_denominator += params[i];
419
420
    }
    // sum_labels will always be positive number
421
422
    inv_denominator = 1. / std::max<double>(kEpsilon, inv_denominator);

423
424
    // Approximate gradients and inverse Hessian.
    // First order terms.
425
    double sum_l1 = 0.0;
426
    for (data_size_t i = 0; i < cnt; ++i) {
427
428
429
430
431
      double term = -params[i] * inv_denominator + rho[i];
      lambdas[i] = static_cast<score_t>(term);
      // Params will now store terms needed to compute second-order terms.
      params[i] = term / (1. - rho[i]);
      sum_l1 += params[i];
432
    }
433
434
435
436
437
438
439
440
441
442
443
444
    // Second order terms.
    double sum_l2 = 0.0;
    for (data_size_t i = 0; i < cnt; ++i) {
      double term = rho[i] * (sum_l1 - params[i]);
      lambdas[i] += static_cast<score_t>(term);
      // Params will now store terms needed to compute third-order terms.
      params[i] = term / (1. - rho[i]);
      sum_l2 += params[i];
    }
    for (data_size_t i = 0; i < cnt; ++i) {
      lambdas[i] += static_cast<score_t>(rho[i] * (sum_l2 - params[i]));
      hessians[i] = static_cast<score_t>(rho[i] * (1.0 - rho[i]));
445
446
447
448
449
450
451
452
453
    }
  }

  double Phi(const label_t l, double g) const {
    return Common::Pow(2, static_cast<int>(l)) - g;
  }

  const char* GetName() const override { return "rank_xendcg"; }

454
 protected:
455
456
457
  mutable std::vector<Random> rands_;
};

Guolin Ke's avatar
Guolin Ke committed
458
}  // namespace LightGBM
459
#endif  // LightGBM_OBJECTIVE_RANK_OBJECTIVE_HPP_