/*!
 * Copyright (c) 2017 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
#ifndef LIGHTGBM_SRC_OBJECTIVE_XENTROPY_OBJECTIVE_HPP_
#define LIGHTGBM_SRC_OBJECTIVE_XENTROPY_OBJECTIVE_HPP_
#include <LightGBM/meta.h>
#include <LightGBM/objective_function.h>
#include <LightGBM/utils/common.h>

#include <string>

#include <algorithm>
#include <cmath>
#include <cstring>
#include <sstream>
#include <vector>

/*
 * Implements gradients and Hessians for the following point losses.
 * Target y is anything in interval [0, 1].
 *
 * (1) CrossEntropy; "xentropy";
 *
 * loss(y, p, w) = { -(1-y)*log(1-p)-y*log(p) }*w,
 * with probability p = 1/(1+exp(-f)), where f is being boosted
 *
 * ConvertToOutput: f -> p
 *
 * (2) CrossEntropyLambda; "xentlambda"
 *
 * loss(y, p, w) = -(1-y)*log(1-p)-y*log(p),
 * with p = 1-exp(-lambda*w), lambda = log(1+exp(f)), f being boosted, and w > 0
 *
 * ConvertToOutput: f -> lambda
 *
 * (1) and (2) are the same if w=1; but outputs still differ.
 *
 */

namespace LightGBM {
/*!
* \brief Objective function for cross-entropy (with optional linear weights)
*/
class CrossEntropy: public ObjectiveFunction {
Nikita Titov's avatar
Nikita Titov committed
45
 public:
Guolin Ke's avatar
Guolin Ke committed
46
47
  explicit CrossEntropy(const Config& config)
      : deterministic_(config.deterministic) {}
48

Guolin Ke's avatar
Guolin Ke committed
49
50
  explicit CrossEntropy(const std::vector<std::string>&)
      : deterministic_(false) {
51
52
53
54
55
56
57
58
59
60
  }

  ~CrossEntropy() {}

  void Init(const Metadata& metadata, data_size_t num_data) override {
    num_data_ = num_data;
    label_ = metadata.label();
    weights_ = metadata.weights();

    CHECK_NOTNULL(label_);
61
    Common::CheckElementsIntervalClosed<label_t>(label_, 0.0f, 1.0f, num_data_, GetName());
62
63
64
    Log::Info("[%s:%s]: (objective) labels passed interval [0, 1] check",  GetName(), __func__);

    if (weights_ != nullptr) {
65
      label_t minw;
66
      double sumw;
67
      Common::ObtainMinMaxSum(weights_, num_data_, &minw, static_cast<label_t*>(nullptr), &sumw);
68
      if (minw < 0.0f) {
69
        Log::Fatal("[%s]: at least one weight is negative", GetName());
70
71
      }
      if (sumw == 0.0f) {
72
        Log::Fatal("[%s]: sum of weights is zero", GetName());
73
74
75
76
77
      }
    }
  }

  void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override {
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
    // z = expit(score) = 1 / (1 + exp(-score))
    // gradient = z - label = expit(score) - label
    // Numerically more stable, see http://fa.bianp.net/blog/2019/evaluate_logistic/
    //     if score < 0:
    //         exp_tmp = exp(score)
    //         return ((1 - label) * exp_tmp - label) / (1 + exp_tmp)
    //     else:
    //         exp_tmp = exp(-score)
    //         return ((1 - label) - label * exp_tmp) / (1 + exp_tmp)
    // Note that optimal speed would be achieved, at the cost of precision, by
    //     return expit(score) - y_true
    // i.e. no "if else" and an own inline implementation of expit.
    // The case distinction score < 0 in the stable implementation does not
    // provide significant better precision apart from protecting overflow of exp(..).
    // The branch (if else), however, can incur runtime costs of up to 30%.
    // Instead, we help branch prediction by almost always ending in the first if clause
    // and making the second branch (else) a bit simpler. This has the exact same
    // precision but is faster than the stable implementation.
    // As branching criteria, we use the same cutoff as in log1pexp, see link above.
    // Note that the maximal value to get gradient = -1 with label = 1 is -37.439198610162731
    // (based on mpmath), and scipy.special.logit(np.finfo(float).eps) ~ -36.04365.
99
    if (weights_ == nullptr) {
Andrew Ziem's avatar
Andrew Ziem committed
100
      // compute pointwise gradients and Hessians with implied unit weights
101
      #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
102
      for (data_size_t i = 0; i < num_data_; ++i) {
103
104
105
106
107
108
109
110
111
        if (score[i] > -37.0) {
          const double exp_tmp = std::exp(-score[i]);
          gradients[i] = static_cast<score_t>(((1.0f - label_[i]) - label_[i] * exp_tmp) / (1.0f + exp_tmp));
          hessians[i] = static_cast<score_t>(exp_tmp / ((1 + exp_tmp) * (1 + exp_tmp)));
        } else {
          const double exp_tmp = std::exp(score[i]);
          gradients[i] = static_cast<score_t>(exp_tmp - label_[i]);
          hessians[i] = static_cast<score_t>(exp_tmp);
        }
112
113
      }
    } else {
Andrew Ziem's avatar
Andrew Ziem committed
114
      // compute pointwise gradients and Hessians with given weights
115
      #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
116
      for (data_size_t i = 0; i < num_data_; ++i) {
117
118
119
120
121
122
123
124
125
        if (score[i] > -37.0) {
          const double exp_tmp = std::exp(-score[i]);
          gradients[i] = static_cast<score_t>(((1.0f - label_[i]) - label_[i] * exp_tmp) / (1.0f + exp_tmp) * weights_[i]);
          hessians[i] = static_cast<score_t>(exp_tmp / ((1 + exp_tmp) * (1 + exp_tmp)) * weights_[i]);
        } else {
          const double exp_tmp = std::exp(score[i]);
          gradients[i] = static_cast<score_t>((exp_tmp - label_[i]) * weights_[i]);
          hessians[i] = static_cast<score_t>(exp_tmp * weights_[i]);
        }
126
127
128
129
130
      }
    }
  }

  const char* GetName() const override {
Guolin Ke's avatar
Guolin Ke committed
131
    return "cross_entropy";
132
133
134
135
136
137
138
139
140
141
142
143
144
  }

  // convert score to a probability
  void ConvertOutput(const double* input, double* output) const override {
    output[0] = 1.0f / (1.0f + std::exp(-input[0]));
  }

  std::string ToString() const override {
    std::stringstream str_buf;
    str_buf << GetName();
    return str_buf.str();
  }

145
  // implement custom average to boost from (if enabled among options)
146
  double BoostFromScore(int) const override {
147
148
    double suml = 0.0f;
    double sumw = 0.0f;
149
    if (weights_ != nullptr) {
150
      #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml, sumw) if (!deterministic_)
Guolin Ke's avatar
Guolin Ke committed
151

152
      for (data_size_t i = 0; i < num_data_; ++i) {
153
        suml += static_cast<double>(label_[i]) * weights_[i];
154
155
156
157
        sumw += weights_[i];
      }
    } else {
      sumw = static_cast<double>(num_data_);
158
      #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml) if (!deterministic_)
Guolin Ke's avatar
Guolin Ke committed
159

160
161
162
163
164
      for (data_size_t i = 0; i < num_data_; ++i) {
        suml += label_[i];
      }
    }
    double pavg = suml / sumw;
165
166
    pavg = std::min(pavg, 1.0 - kEpsilon);
    pavg = std::max<double>(pavg, kEpsilon);
167
    double initscore = std::log(pavg / (1.0f - pavg));
168
    Log::Info("[%s:%s]: pavg = %f -> initscore = %f",  GetName(), __func__, pavg, initscore);
169
    return initscore;
170
171
  }

Nikita Titov's avatar
Nikita Titov committed
172
 private:
173
174
175
  /*! \brief Number of data points */
  data_size_t num_data_;
  /*! \brief Pointer for label */
176
  const label_t* label_;
177
  /*! \brief Weights for data */
178
  const label_t* weights_;
Guolin Ke's avatar
Guolin Ke committed
179
  const bool deterministic_;
180
181
182
183
184
185
};

/*!
* \brief Objective function for alternative parameterization of cross-entropy (see top of file for explanation)
*/
class CrossEntropyLambda: public ObjectiveFunction {
Nikita Titov's avatar
Nikita Titov committed
186
 public:
Guolin Ke's avatar
Guolin Ke committed
187
188
  explicit CrossEntropyLambda(const Config& config)
      : deterministic_(config.deterministic) {
189
190
191
    min_weight_ = max_weight_ = 0.0f;
  }

Guolin Ke's avatar
Guolin Ke committed
192
193
  explicit CrossEntropyLambda(const std::vector<std::string>&)
      : deterministic_(false) {}
194
195
196
197
198
199
200
201
202

  ~CrossEntropyLambda() {}

  void Init(const Metadata& metadata, data_size_t num_data) override {
    num_data_ = num_data;
    label_ = metadata.label();
    weights_ = metadata.weights();

    CHECK_NOTNULL(label_);
203
    Common::CheckElementsIntervalClosed<label_t>(label_, 0.0f, 1.0f, num_data_, GetName());
204
205
206
    Log::Info("[%s:%s]: (objective) labels passed interval [0, 1] check",  GetName(), __func__);

    if (weights_ != nullptr) {
207
      Common::ObtainMinMaxSum(weights_, num_data_, &min_weight_, &max_weight_, static_cast<label_t*>(nullptr));
208
      if (min_weight_ <= 0.0f) {
209
        Log::Fatal("[%s]: at least one weight is non-positive", GetName());
210
211
212
213
      }

      // Issue an info statement about this ratio
      double weight_ratio = max_weight_ / min_weight_;
Tony-Y's avatar
Tony-Y committed
214
      Log::Info("[%s:%s]: min, max weights = %f, %f; ratio = %f",
215
216
217
218
                GetName(), __func__,
                min_weight_, max_weight_,
                weight_ratio);
    } else {
Tony-Y's avatar
Tony-Y committed
219
      // all weights are implied to be unity; no need to do anything
220
221
222
223
224
    }
  }

  void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override {
    if (weights_ == nullptr) {
Andrew Ziem's avatar
Andrew Ziem committed
225
      // compute pointwise gradients and Hessians with implied unit weights; exactly equivalent to CrossEntropy with unit weights
226
      #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
227
228
229
230
231
232
      for (data_size_t i = 0; i < num_data_; ++i) {
        const double z = 1.0f / (1.0f + std::exp(-score[i]));
        gradients[i] = static_cast<score_t>(z - label_[i]);
        hessians[i] = static_cast<score_t>(z * (1.0f - z));
      }
    } else {
Andrew Ziem's avatar
Andrew Ziem committed
233
      // compute pointwise gradients and Hessians with given weights
234
      #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
235
236
237
238
      for (data_size_t i = 0; i < num_data_; ++i) {
        const double w = weights_[i];
        const double y = label_[i];
        const double epf = std::exp(score[i]);
239
        const double hhat = std::log1p(epf);
240
        const double z = 1.0f - std::exp(-w*hhat);
241
        const double enf = 1.0f / epf;  // = std::exp(-score[i]);
242
243
244
245
246
247
248
249
250
251
252
253
        gradients[i] = static_cast<score_t>((1.0f - y / z) * w / (1.0f + enf));
        const double c = 1.0f / (1.0f - z);
        double d = 1.0f + epf;
        const double a = w * epf / (d * d);
        d = c - 1.0f;
        const double b = (c / (d * d) ) * (1.0f + w * epf - c);
        hessians[i] = static_cast<score_t>(a * (1.0f + y * b));
      }
    }
  }

  const char* GetName() const override {
Guolin Ke's avatar
Guolin Ke committed
254
    return "cross_entropy_lambda";
255
256
257
258
259
260
261
262
263
264
265
266
  }

  //
  // ATTENTION: the function output is the "normalized exponential parameter" lambda > 0, not the probability
  //
  // If this code would read: output[0] = 1.0f / (1.0f + std::exp(-input[0]));
  // The output would still not be the probability unless the weights are unity.
  //
  // Let z = 1 / (1 + exp(-f)), then prob(z) = 1-(1-z)^w, where w is the weight for the specific point.
  //

  void ConvertOutput(const double* input, double* output) const override {
267
    output[0] = std::log1p(std::exp(input[0]));
268
269
270
271
272
273
274
275
  }

  std::string ToString() const override {
    std::stringstream str_buf;
    str_buf << GetName();
    return str_buf.str();
  }

276
  double BoostFromScore(int) const override {
277
    double suml = 0.0f;
278
    double sumw = 0.0f;
Laurae's avatar
Laurae committed
279
    if (weights_ != nullptr) {
280
      #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml, sumw) if (!deterministic_)
Guolin Ke's avatar
Guolin Ke committed
281

282
      for (data_size_t i = 0; i < num_data_; ++i) {
283
        suml += static_cast<double>(label_[i]) * weights_[i];
284
285
        sumw += weights_[i];
      }
286
287
    } else {
      sumw = static_cast<double>(num_data_);
288
      #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml) if (!deterministic_)
Guolin Ke's avatar
Guolin Ke committed
289

290
291
292
      for (data_size_t i = 0; i < num_data_; ++i) {
        suml += label_[i];
      }
293
    }
294
    double havg = suml / sumw;
295
    double initscore = std::log(std::expm1(havg));
296
    Log::Info("[%s:%s]: havg = %f -> initscore = %f",  GetName(), __func__, havg, initscore);
297
    return initscore;
298
299
  }

300
 private:
301
302
303
  /*! \brief Number of data points */
  data_size_t num_data_;
  /*! \brief Pointer for label */
304
  const label_t* label_;
305
  /*! \brief Weights for data */
306
  const label_t* weights_;
307
  /*! \brief Minimum weight found during init */
308
  label_t min_weight_;
309
  /*! \brief Maximum weight found during init */
310
  label_t max_weight_;
Guolin Ke's avatar
Guolin Ke committed
311
  const bool deterministic_;
312
313
314
315
};

}  // end namespace LightGBM

#endif  // LIGHTGBM_SRC_OBJECTIVE_XENTROPY_OBJECTIVE_HPP_