dcg_calculator.cpp 5.54 KB
Newer Older
1
2
3
4
/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
Guolin Ke's avatar
Guolin Ke committed
5

6
#include <algorithm>
Guolin Ke's avatar
Guolin Ke committed
7
8
9
#include <cmath>
#include <vector>

10
11
12
#include <LightGBM/metric.h>
#include <LightGBM/utils/log.h>

Guolin Ke's avatar
Guolin Ke committed
13
14
15
namespace LightGBM {

/*! \brief Declaration for some static members */
16
17
std::vector<double> DCGCalculator::label_gain_;
std::vector<double> DCGCalculator::discount_;
Guolin Ke's avatar
Guolin Ke committed
18
19
const data_size_t DCGCalculator::kMaxPosition = 10000;

Guolin Ke's avatar
Guolin Ke committed
20
21

void DCGCalculator::DefaultEvalAt(std::vector<int>* eval_at) {
Guolin Ke's avatar
Guolin Ke committed
22
23
  auto& ref_eval_at = *eval_at;
  if (ref_eval_at.empty()) {
Guolin Ke's avatar
Guolin Ke committed
24
    for (int i = 1; i <= 5; ++i) {
Guolin Ke's avatar
Guolin Ke committed
25
      ref_eval_at.push_back(i);
Guolin Ke's avatar
Guolin Ke committed
26
27
28
    }
  } else {
    for (size_t i = 0; i < eval_at->size(); ++i) {
29
      CHECK_GT(ref_eval_at[i], 0);
Guolin Ke's avatar
Guolin Ke committed
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
    }
  }
}

void DCGCalculator::DefaultLabelGain(std::vector<double>* label_gain) {
  if (!label_gain->empty()) { return; }
  // label_gain = 2^i - 1, may overflow, so we use 31 here
  const int max_label = 31;
  label_gain->push_back(0.0f);
  for (int i = 1; i < max_label; ++i) {
    label_gain->push_back(static_cast<double>((1 << i) - 1));
  }
}

void DCGCalculator::Init(const std::vector<double>& input_label_gain) {
Guolin Ke's avatar
Guolin Ke committed
45
  label_gain_.resize(input_label_gain.size());
46
  for (size_t i = 0; i < input_label_gain.size(); ++i) {
Guolin Ke's avatar
Guolin Ke committed
47
    label_gain_[i] = static_cast<double>(input_label_gain[i]);
48
  }
Guolin Ke's avatar
Guolin Ke committed
49
  discount_.resize(kMaxPosition);
Guolin Ke's avatar
Guolin Ke committed
50
  for (data_size_t i = 0; i < kMaxPosition; ++i) {
51
    discount_[i] = 1.0 / std::log2(2.0 + i);
Guolin Ke's avatar
Guolin Ke committed
52
53
54
  }
}

55
double DCGCalculator::CalMaxDCGAtK(data_size_t k, const label_t* label, data_size_t num_data) {
56
  double ret = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
57
58
59
60
61
  // counts for all labels
  std::vector<data_size_t> label_cnt(label_gain_.size(), 0);
  for (data_size_t i = 0; i < num_data; ++i) {
    ++label_cnt[static_cast<int>(label[i])];
  }
62
  int top_label = static_cast<int>(label_gain_.size()) - 1;
Guolin Ke's avatar
Guolin Ke committed
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79

  if (k > num_data) { k = num_data; }
  //  start from top label, and accumulate DCG
  for (data_size_t j = 0; j < k; ++j) {
    while (top_label > 0 && label_cnt[top_label] <= 0) {
      top_label -= 1;
    }
    if (top_label < 0) {
      break;
    }
    ret += discount_[j] * label_gain_[top_label];
    label_cnt[top_label] -= 1;
  }
  return ret;
}

void DCGCalculator::CalMaxDCG(const std::vector<data_size_t>& ks,
80
                              const label_t* label,
Guolin Ke's avatar
Guolin Ke committed
81
                              data_size_t num_data,
82
                              std::vector<double>* out) {
Guolin Ke's avatar
Guolin Ke committed
83
84
85
86
87
  std::vector<data_size_t> label_cnt(label_gain_.size(), 0);
  // counts for all labels
  for (data_size_t i = 0; i < num_data; ++i) {
    ++label_cnt[static_cast<int>(label[i])];
  }
88
  double cur_result = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
89
  data_size_t cur_left = 0;
90
  int top_label = static_cast<int>(label_gain_.size()) - 1;
Guolin Ke's avatar
Guolin Ke committed
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
  // calculate k Max DCG by one pass
  for (size_t i = 0; i < ks.size(); ++i) {
    data_size_t cur_k = ks[i];
    if (cur_k > num_data) { cur_k = num_data; }
    for (data_size_t j = cur_left; j < cur_k; ++j) {
      while (top_label > 0 && label_cnt[top_label] <= 0) {
        top_label -= 1;
      }
      if (top_label < 0) {
        break;
      }
      cur_result += discount_[j] * label_gain_[top_label];
      label_cnt[top_label] -= 1;
    }
    (*out)[i] = cur_result;
    cur_left = cur_k;
  }
}


111
double DCGCalculator::CalDCGAtK(data_size_t k, const label_t* label,
112
                                const double* score, data_size_t num_data) {
Guolin Ke's avatar
Guolin Ke committed
113
  // get sorted indices by score
114
  std::vector<data_size_t> sorted_idx(num_data);
Guolin Ke's avatar
Guolin Ke committed
115
  for (data_size_t i = 0; i < num_data; ++i) {
116
    sorted_idx[i] = i;
Guolin Ke's avatar
Guolin Ke committed
117
  }
118
119
  std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
                   [score](data_size_t a, data_size_t b) {return score[a] > score[b]; });
Guolin Ke's avatar
Guolin Ke committed
120
121

  if (k > num_data) { k = num_data; }
122
  double dcg = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
123
124
125
126
127
128
129
130
  // calculate dcg
  for (data_size_t i = 0; i < k; ++i) {
    data_size_t idx = sorted_idx[i];
    dcg += label_gain_[static_cast<int>(label[idx])] * discount_[i];
  }
  return dcg;
}

131
void DCGCalculator::CalDCG(const std::vector<data_size_t>& ks, const label_t* label,
132
                           const double * score, data_size_t num_data, std::vector<double>* out) {
Guolin Ke's avatar
Guolin Ke committed
133
  // get sorted indices by score
134
  std::vector<data_size_t> sorted_idx(num_data);
Guolin Ke's avatar
Guolin Ke committed
135
  for (data_size_t i = 0; i < num_data; ++i) {
136
    sorted_idx[i] = i;
Guolin Ke's avatar
Guolin Ke committed
137
  }
138
139
  std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
                   [score](data_size_t a, data_size_t b) {return score[a] > score[b]; });
Guolin Ke's avatar
Guolin Ke committed
140

141
  double cur_result = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
142
143
144
145
146
147
148
149
150
151
152
153
154
155
  data_size_t cur_left = 0;
  // calculate multi dcg by one pass
  for (size_t i = 0; i < ks.size(); ++i) {
    data_size_t cur_k = ks[i];
    if (cur_k > num_data) { cur_k = num_data; }
    for (data_size_t j = cur_left; j < cur_k; ++j) {
      data_size_t idx = sorted_idx[j];
      cur_result += label_gain_[static_cast<int>(label[idx])] * discount_[j];
    }
    (*out)[i] = cur_result;
    cur_left = cur_k;
  }
}

156
void DCGCalculator::CheckLabel(const label_t* label, data_size_t num_data) {
157
  for (data_size_t i = 0; i < num_data; ++i) {
158
    label_t delta = std::fabs(label[i] - static_cast<int>(label[i]));
159
    if (delta > kEpsilon) {
160
161
      Log::Fatal("label should be int type (met %f) for ranking task,\n"
                 "for the gain of label, please set the label_gain parameter", label[i]);
162
    }
163
164
165
166
167
168
169

    if (label[i] < 0) {
      Log::Fatal("Label should be non-negative (met %f) for ranking task", label[i]);
    }

    if (static_cast<size_t>(label[i]) >= label_gain_.size()) {
      Log::Fatal("Label %zu is not less than the number of label mappings (%zu)", static_cast<size_t>(label[i]), label_gain_.size());
170
171
172
173
    }
  }
}

Guolin Ke's avatar
Guolin Ke committed
174
}  // namespace LightGBM