dcg_calculator.cpp 5.53 KB
Newer Older
1
2
3
4
/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
Guolin Ke's avatar
Guolin Ke committed
5
6
7
#include <LightGBM/metric.h>
#include <LightGBM/utils/log.h>

8
#include <algorithm>
Guolin Ke's avatar
Guolin Ke committed
9
10
11
12
13
14
#include <cmath>
#include <vector>

namespace LightGBM {

/*! \brief Declaration for some static members */
15
16
std::vector<double> DCGCalculator::label_gain_;
std::vector<double> DCGCalculator::discount_;
Guolin Ke's avatar
Guolin Ke committed
17
18
const data_size_t DCGCalculator::kMaxPosition = 10000;

Guolin Ke's avatar
Guolin Ke committed
19
20

void DCGCalculator::DefaultEvalAt(std::vector<int>* eval_at) {
Guolin Ke's avatar
Guolin Ke committed
21
22
  auto& ref_eval_at = *eval_at;
  if (ref_eval_at.empty()) {
Guolin Ke's avatar
Guolin Ke committed
23
    for (int i = 1; i <= 5; ++i) {
Guolin Ke's avatar
Guolin Ke committed
24
      ref_eval_at.push_back(i);
Guolin Ke's avatar
Guolin Ke committed
25
26
27
    }
  } else {
    for (size_t i = 0; i < eval_at->size(); ++i) {
Guolin Ke's avatar
Guolin Ke committed
28
      CHECK(ref_eval_at[i] > 0);
Guolin Ke's avatar
Guolin Ke committed
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
    }
  }
}

void DCGCalculator::DefaultLabelGain(std::vector<double>* label_gain) {
  if (!label_gain->empty()) { return; }
  // label_gain = 2^i - 1, may overflow, so we use 31 here
  const int max_label = 31;
  label_gain->push_back(0.0f);
  for (int i = 1; i < max_label; ++i) {
    label_gain->push_back(static_cast<double>((1 << i) - 1));
  }
}

void DCGCalculator::Init(const std::vector<double>& input_label_gain) {
Guolin Ke's avatar
Guolin Ke committed
44
  label_gain_.resize(input_label_gain.size());
45
  for (size_t i = 0; i < input_label_gain.size(); ++i) {
Guolin Ke's avatar
Guolin Ke committed
46
    label_gain_[i] = static_cast<double>(input_label_gain[i]);
47
  }
Guolin Ke's avatar
Guolin Ke committed
48
  discount_.resize(kMaxPosition);
Guolin Ke's avatar
Guolin Ke committed
49
  for (data_size_t i = 0; i < kMaxPosition; ++i) {
50
    discount_[i] = 1.0 / std::log2(2.0 + i);
Guolin Ke's avatar
Guolin Ke committed
51
52
53
  }
}

54
double DCGCalculator::CalMaxDCGAtK(data_size_t k, const label_t* label, data_size_t num_data) {
55
  double ret = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
56
57
58
59
60
  // counts for all labels
  std::vector<data_size_t> label_cnt(label_gain_.size(), 0);
  for (data_size_t i = 0; i < num_data; ++i) {
    ++label_cnt[static_cast<int>(label[i])];
  }
61
  int top_label = static_cast<int>(label_gain_.size()) - 1;
Guolin Ke's avatar
Guolin Ke committed
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78

  if (k > num_data) { k = num_data; }
  //  start from top label, and accumulate DCG
  for (data_size_t j = 0; j < k; ++j) {
    while (top_label > 0 && label_cnt[top_label] <= 0) {
      top_label -= 1;
    }
    if (top_label < 0) {
      break;
    }
    ret += discount_[j] * label_gain_[top_label];
    label_cnt[top_label] -= 1;
  }
  return ret;
}

void DCGCalculator::CalMaxDCG(const std::vector<data_size_t>& ks,
79
                              const label_t* label,
Guolin Ke's avatar
Guolin Ke committed
80
                              data_size_t num_data,
81
                              std::vector<double>* out) {
Guolin Ke's avatar
Guolin Ke committed
82
83
84
85
86
  std::vector<data_size_t> label_cnt(label_gain_.size(), 0);
  // counts for all labels
  for (data_size_t i = 0; i < num_data; ++i) {
    ++label_cnt[static_cast<int>(label[i])];
  }
87
  double cur_result = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
88
  data_size_t cur_left = 0;
89
  int top_label = static_cast<int>(label_gain_.size()) - 1;
Guolin Ke's avatar
Guolin Ke committed
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
  // calculate k Max DCG by one pass
  for (size_t i = 0; i < ks.size(); ++i) {
    data_size_t cur_k = ks[i];
    if (cur_k > num_data) { cur_k = num_data; }
    for (data_size_t j = cur_left; j < cur_k; ++j) {
      while (top_label > 0 && label_cnt[top_label] <= 0) {
        top_label -= 1;
      }
      if (top_label < 0) {
        break;
      }
      cur_result += discount_[j] * label_gain_[top_label];
      label_cnt[top_label] -= 1;
    }
    (*out)[i] = cur_result;
    cur_left = cur_k;
  }
}


110
double DCGCalculator::CalDCGAtK(data_size_t k, const label_t* label,
111
                                const double* score, data_size_t num_data) {
Guolin Ke's avatar
Guolin Ke committed
112
  // get sorted indices by score
113
  std::vector<data_size_t> sorted_idx(num_data);
Guolin Ke's avatar
Guolin Ke committed
114
  for (data_size_t i = 0; i < num_data; ++i) {
115
    sorted_idx[i] = i;
Guolin Ke's avatar
Guolin Ke committed
116
  }
117
118
  std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
                   [score](data_size_t a, data_size_t b) {return score[a] > score[b]; });
Guolin Ke's avatar
Guolin Ke committed
119
120

  if (k > num_data) { k = num_data; }
121
  double dcg = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
122
123
124
125
126
127
128
129
  // calculate dcg
  for (data_size_t i = 0; i < k; ++i) {
    data_size_t idx = sorted_idx[i];
    dcg += label_gain_[static_cast<int>(label[idx])] * discount_[i];
  }
  return dcg;
}

130
void DCGCalculator::CalDCG(const std::vector<data_size_t>& ks, const label_t* label,
131
                           const double * score, data_size_t num_data, std::vector<double>* out) {
Guolin Ke's avatar
Guolin Ke committed
132
  // get sorted indices by score
133
  std::vector<data_size_t> sorted_idx(num_data);
Guolin Ke's avatar
Guolin Ke committed
134
  for (data_size_t i = 0; i < num_data; ++i) {
135
    sorted_idx[i] = i;
Guolin Ke's avatar
Guolin Ke committed
136
  }
137
138
  std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
                   [score](data_size_t a, data_size_t b) {return score[a] > score[b]; });
Guolin Ke's avatar
Guolin Ke committed
139

140
  double cur_result = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
141
142
143
144
145
146
147
148
149
150
151
152
153
154
  data_size_t cur_left = 0;
  // calculate multi dcg by one pass
  for (size_t i = 0; i < ks.size(); ++i) {
    data_size_t cur_k = ks[i];
    if (cur_k > num_data) { cur_k = num_data; }
    for (data_size_t j = cur_left; j < cur_k; ++j) {
      data_size_t idx = sorted_idx[j];
      cur_result += label_gain_[static_cast<int>(label[idx])] * discount_[j];
    }
    (*out)[i] = cur_result;
    cur_left = cur_k;
  }
}

155
void DCGCalculator::CheckLabel(const label_t* label, data_size_t num_data) {
156
  for (data_size_t i = 0; i < num_data; ++i) {
157
    label_t delta = std::fabs(label[i] - static_cast<int>(label[i]));
158
    if (delta > kEpsilon) {
159
160
      Log::Fatal("label should be int type (met %f) for ranking task,\n"
                 "for the gain of label, please set the label_gain parameter", label[i]);
161
    }
162
163
164
165
166
167
168

    if (label[i] < 0) {
      Log::Fatal("Label should be non-negative (met %f) for ranking task", label[i]);
    }

    if (static_cast<size_t>(label[i]) >= label_gain_.size()) {
      Log::Fatal("Label %zu is not less than the number of label mappings (%zu)", static_cast<size_t>(label[i]), label_gain_.size());
169
170
171
172
    }
  }
}

Guolin Ke's avatar
Guolin Ke committed
173
}  // namespace LightGBM