"vscode:/vscode.git/clone" did not exist on "d517ba12f2e7862ac533908304dddbd770655d2b"
Commit f8597c93 authored by olofer, committed by Guolin Ke

cross entropy metrics and objective (#685)

* Created objectives and metrics xentropy and xentropy1

* Some comment and code cleanup.

* Added Kullback-Leibler version of metric. Changed some warning messages.

* Fixed sign error in KL-divergence calc.

* Removed __PRETTY_FUNCTION__.

* Chose a better name for the alternative xentropy parameterization.
Documented details on the objectives / metrics in code comments.

* Common code for label interval checks. Cleanups.

* Use common utility for various weight property checks.
parent c79d897c
@@ -579,6 +579,31 @@ static void ParallelSort(_RanIt _First, _RanIt _Last, _Pr _Pred) {
return ParallelSort(_First, _Last, _Pred, IteratorValType(_First));
}
// Check that all y[] are in interval [ymin, ymax] (end points included); throws error if not
inline void check_elements_interval_closed(const float *y, float ymin, float ymax, int ny, const char *callername) {
for (int i = 0; i < ny; ++i) {
if (y[i] < ymin || y[i] > ymax) {
Log::Fatal("[%s]: does not tolerate element [#%i = %f] outside [%f, %f]", callername, i, y[i], ymin, ymax);
}
}
}
// One-pass scan over array w with nw elements: find min, max and sum of elements;
// this is useful for checking weight requirements.
inline void obtain_min_max_sum(const float *w, int nw, float *mi, float *ma, double *su) {
float minw = w[0];
float maxw = w[0];
double sumw = static_cast<double>(w[0]);
for (int i = 1; i < nw; ++i) {
sumw += w[i];
if (w[i] < minw) minw = w[i];
if (w[i] > maxw) maxw = w[i];
}
if (mi != nullptr) *mi = minw;
if (ma != nullptr) *ma = maxw;
if (su != nullptr) *su = sumw;
}
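// Usage sketch (illustrative, not part of the original code): callers may pass
// nullptr for any output they do not need, e.g. to check only the minimum weight:
//   float minw;
//   Common::obtain_min_max_sum(weights, n, &minw, nullptr, nullptr);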
} // namespace Common
} // namespace LightGBM
@@ -4,6 +4,7 @@
#include "rank_metric.hpp"
#include "map_metric.hpp"
#include "multiclass_metric.hpp"
#include "xentropy_metric.hpp"
namespace LightGBM {
@@ -34,6 +35,12 @@ Metric* Metric::CreateMetric(const std::string& type, const MetricConfig& config
return new MultiSoftmaxLoglossMetric(config);
} else if (type == std::string("multi_error")) {
return new MultiErrorMetric(config);
} else if (type == std::string("xentropy") || type == std::string("cross_entropy")) {
return new CrossEntropyMetric(config);
} else if (type == std::string("xentlambda")) {
return new CrossEntropyLambdaMetric(config);
} else if (type == std::string("kldiv") || type == std::string("kullback_leibler")) {
return new KullbackLeiblerDivergence(config);
}
return nullptr;
}
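// With these registrations in place, the new metrics can be requested through
// the usual configuration mechanism, e.g. (illustrative config snippet):
//   objective = xentropy
//   metric = xentropy,kldiv
// The aliases cross_entropy and kullback_leibler map to the same classes.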
#ifndef LIGHTGBM_METRIC_XENTROPY_METRIC_HPP_
#define LIGHTGBM_METRIC_XENTROPY_METRIC_HPP_
#include <LightGBM/utils/log.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/metric.h>
#include <algorithm>
#include <vector>
#include <sstream>
/*
* Implements three related metrics:
*
* (1) standard cross-entropy that can be used for continuous labels in [0, 1]
* (2) "intensity-weighted" cross-entropy, also for continuous labels in [0, 1]
* (3) Kullback-Leibler divergence, also for continuous labels in [0, 1]
*
* (3) is (1) plus an offset term: the negative entropy of the label
*
* See xentropy_objective.hpp for further details.
*
*/
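// For reference, the identity relating (3) to (1):
//   KL(y || p) = XentLoss(y, p) + [ y*log(y) + (1-y)*log(1-y) ]
// i.e. KL divergence = cross-entropy + negative label entropy; the second term
// is what YentLoss() computes and what is presummed during Init().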
namespace LightGBM {
// label should be in interval [0, 1];
// prob should be in interval (0, 1); prob is clipped if needed
inline static double XentLoss(float label, double prob) {
const double log_arg_epsilon = 1.0e-12;
double a = label;
if (prob > log_arg_epsilon) {
a *= std::log(prob);
} else {
a *= std::log(log_arg_epsilon);
}
double b = 1.0f - label;
if (1.0f - prob > log_arg_epsilon) {
b *= std::log(1.0f - prob);
} else {
b *= std::log(log_arg_epsilon);
}
return - (a + b);
}
// hhat >= 0 is assumed and weight > 0 is required; neither is checked here
inline static double XentLambdaLoss(float label, float weight, double hhat) {
return XentLoss(label, 1.0f - std::exp(-weight * hhat));
}
// Computes the (negative) entropy for label p; p should be in interval [0, 1];
// This is used to presum the KL-divergence offset term (to be _added_ to the cross-entropy loss).
// NOTE: x*log(x) = 0 for x=0,1; so only add when in (0, 1); avoid log(0)*0
inline static double YentLoss(double p) {
double hp = 0.0;
if (p > 0) hp += p * std::log(p);
double q = 1.0f - p;
if (q > 0) hp += q * std::log(q);
return hp;
}
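// Example: YentLoss(0.5) = 0.5*log(0.5) + 0.5*log(0.5) = -log(2) ~= -0.6931,
// the negative entropy of a maximally uncertain label.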
//
// CrossEntropyMetric : "xentropy" : (optional) weights are used linearly
//
class CrossEntropyMetric : public Metric {
public:
explicit CrossEntropyMetric(const MetricConfig&) {}
virtual ~CrossEntropyMetric() {}
void Init(const Metadata& metadata, data_size_t num_data) override {
name_.emplace_back("xentropy");
num_data_ = num_data;
label_ = metadata.label();
weights_ = metadata.weights();
CHECK_NOTNULL(label_);
// ensure that labels are in interval [0, 1], interval ends included
Common::check_elements_interval_closed(label_, 0.0f, 1.0f, num_data_, GetName()[0].c_str());
Log::Info("[%s:%s]: (metric) labels passed interval [0, 1] check", GetName()[0].c_str(), __func__);
// check that weights are non-negative and sum is positive
if (weights_ == nullptr) {
sum_weights_ = static_cast<double>(num_data_);
} else {
float minw;
Common::obtain_min_max_sum(weights_, num_data_, &minw, nullptr, &sum_weights_);
if (minw < 0.0f) {
Log::Fatal("[%s:%s]: (metric) weights not allowed to be negative", GetName()[0].c_str(), __func__);
}
}
// check that the weight sum is positive
if (sum_weights_ <= 0.0f) {
Log::Fatal("[%s:%s]: sum-of-weights = %f is non-positive", GetName()[0].c_str(), __func__, sum_weights_);
}
Log::Info("[%s:%s]: sum-of-weights = %f", GetName()[0].c_str(), __func__, sum_weights_);
}
std::vector<double> Eval(const double* score, const ObjectiveFunction* objective) const override {
double sum_loss = 0.0f;
if (objective == nullptr) {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
sum_loss += XentLoss(label_[i], score[i]); // NOTE: does not work unless score is a probability
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
sum_loss += XentLoss(label_[i], score[i]) * weights_[i]; // NOTE: does not work unless score is a probability
}
}
} else {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
double p = 0;
objective->ConvertOutput(&score[i], &p);
sum_loss += XentLoss(label_[i], p);
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
double p = 0;
objective->ConvertOutput(&score[i], &p);
sum_loss += XentLoss(label_[i], p) * weights_[i];
}
}
}
double loss = sum_loss / sum_weights_;
return std::vector<double>(1, loss);
}
const std::vector<std::string>& GetName() const override {
return name_;
}
double factor_to_bigger_better() const override {
return -1.0f; // negative means smaller loss is better, positive means larger loss is better
}
private:
/*! \brief Number of data points */
data_size_t num_data_;
/*! \brief Pointer to label */
const float* label_;
/*! \brief Pointer to weights */
const float* weights_;
/*! \brief Sum of weights */
double sum_weights_;
/*! \brief Name of this metric */
std::vector<std::string> name_;
};
//
// CrossEntropyLambdaMetric : "xentlambda" : (optional) weights have a different meaning than for "xentropy"
// ATTENTION: Supposed to be used when the objective also is "xentlambda"
//
class CrossEntropyLambdaMetric : public Metric {
public:
explicit CrossEntropyLambdaMetric(const MetricConfig&) {}
virtual ~CrossEntropyLambdaMetric() {}
void Init(const Metadata& metadata, data_size_t num_data) override {
name_.emplace_back("xentlambda");
num_data_ = num_data;
label_ = metadata.label();
weights_ = metadata.weights();
CHECK_NOTNULL(label_);
Common::check_elements_interval_closed(label_, 0.0f, 1.0f, num_data_, GetName()[0].c_str());
Log::Info("[%s:%s]: (metric) labels passed interval [0, 1] check", GetName()[0].c_str(), __func__);
// check all weights are strictly positive; throw error if not
if (weights_ != nullptr) {
float minw;
Common::obtain_min_max_sum(weights_, num_data_, &minw, nullptr, nullptr);
if (minw <= 0.0f) {
Log::Fatal("[%s:%s]: (metric) all weights must be positive", GetName()[0].c_str(), __func__);
}
}
}
std::vector<double> Eval(const double* score, const ObjectiveFunction* objective) const override {
double sum_loss = 0.0f;
if (objective == nullptr) {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
double hhat = std::log(1.0f + std::exp(score[i])); // auto-convert
sum_loss += XentLambdaLoss(label_[i], 1.0f, hhat);
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
double hhat = std::log(1.0f + std::exp(score[i])); // auto-convert
sum_loss += XentLambdaLoss(label_[i], weights_[i], hhat);
}
}
} else {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
double hhat = 0;
objective->ConvertOutput(&score[i], &hhat); // NOTE: this only works if objective = "xentlambda"
sum_loss += XentLambdaLoss(label_[i], 1.0f, hhat);
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
double hhat = 0;
objective->ConvertOutput(&score[i], &hhat); // NOTE: this only works if objective = "xentlambda"
sum_loss += XentLambdaLoss(label_[i], weights_[i], hhat);
}
}
}
return std::vector<double>(1, sum_loss / static_cast<double>(num_data_));
}
const std::vector<std::string>& GetName() const override {
return name_;
}
double factor_to_bigger_better() const override {
return -1.0f;
}
private:
/*! \brief Number of data points */
data_size_t num_data_;
/*! \brief Pointer to label */
const float* label_;
/*! \brief Pointer to weights */
const float* weights_;
/*! \brief Name of this metric */
std::vector<std::string> name_;
};
//
// KullbackLeiblerDivergence : "kldiv" : (optional) weights are used linearly
//
class KullbackLeiblerDivergence : public Metric {
public:
explicit KullbackLeiblerDivergence(const MetricConfig&) {}
virtual ~KullbackLeiblerDivergence() {}
void Init(const Metadata& metadata, data_size_t num_data) override {
name_.emplace_back("kldiv");
num_data_ = num_data;
label_ = metadata.label();
weights_ = metadata.weights();
CHECK_NOTNULL(label_);
Common::check_elements_interval_closed(label_, 0.0f, 1.0f, num_data_, GetName()[0].c_str());
Log::Info("[%s:%s]: (metric) labels passed interval [0, 1] check", GetName()[0].c_str(), __func__);
if (weights_ == nullptr) {
sum_weights_ = static_cast<double>(num_data_);
} else {
float minw;
Common::obtain_min_max_sum(weights_, num_data_, &minw, nullptr, &sum_weights_);
if (minw < 0.0f) {
Log::Fatal("[%s:%s]: (metric) at least one weight is negative", GetName()[0].c_str(), __func__);
}
}
// check weight sum
if (sum_weights_ <= 0.0f) {
Log::Fatal("[%s:%s]: sum-of-weights = %f is non-positive", GetName()[0].c_str(), __func__, sum_weights_);
}
Log::Info("[%s:%s]: sum-of-weights = %f", GetName()[0].c_str(), __func__, sum_weights_);
// evaluate offset term
presum_label_entropy_ = 0.0f;
if (weights_ == nullptr) {
// #pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
presum_label_entropy_ += YentLoss(label_[i]);
}
} else {
// #pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
presum_label_entropy_ += YentLoss(label_[i]) * weights_[i];
}
}
presum_label_entropy_ /= sum_weights_;
// communicate the value of the offset term to be added
Log::Info("%s offset term = %f", GetName()[0].c_str(), presum_label_entropy_);
}
std::vector<double> Eval(const double* score, const ObjectiveFunction* objective) const override {
double sum_loss = 0.0f;
if (objective == nullptr) {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
sum_loss += XentLoss(label_[i], score[i]); // NOTE: does not work unless score is a probability
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
sum_loss += XentLoss(label_[i], score[i]) * weights_[i]; // NOTE: does not work unless score is a probability
}
}
} else {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
double p = 0;
objective->ConvertOutput(&score[i], &p);
sum_loss += XentLoss(label_[i], p);
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
double p = 0;
objective->ConvertOutput(&score[i], &p);
sum_loss += XentLoss(label_[i], p) * weights_[i];
}
}
}
double loss = presum_label_entropy_ + sum_loss / sum_weights_;
return std::vector<double>(1, loss);
}
const std::vector<std::string>& GetName() const override {
return name_;
}
double factor_to_bigger_better() const override {
return -1.0f;
}
private:
/*! \brief Number of data points */
data_size_t num_data_;
/*! \brief Pointer to label */
const float* label_;
/*! \brief Pointer to weights */
const float* weights_;
/*! \brief Sum of weights */
double sum_weights_;
/*! \brief Offset term to cross-entropy; precomputed during init */
double presum_label_entropy_;
/*! \brief Name of this metric */
std::vector<std::string> name_;
};
} // end namespace LightGBM
#endif // end #ifndef LIGHTGBM_METRIC_XENTROPY_METRIC_HPP_
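As a quick sanity check of the metric code above, the following minimal, self-contained sketch (not part of the commit) reproduces the weighted "xentropy" reduction on toy data; XentLossRef and the toy arrays are local to the snippet.
#include <cmath>
#include <cstdio>
// Local reimplementation of XentLoss() so the sketch compiles on its own.
static double XentLossRef(float label, double prob) {
  const double eps = 1.0e-12;
  const double a = label * std::log(prob > eps ? prob : eps);
  const double b = (1.0 - label) * std::log(1.0 - prob > eps ? 1.0 - prob : eps);
  return -(a + b);
}
int main() {
  const float label[] = {0.0f, 1.0f, 0.25f};
  const double prob[] = {0.1, 0.8, 0.3};  // scores already converted to probabilities
  const float weight[] = {1.0f, 2.0f, 0.5f};
  double sum_loss = 0.0;
  double sum_weights = 0.0;
  for (int i = 0; i < 3; ++i) {
    sum_loss += XentLossRef(label[i], prob[i]) * weight[i];  // weights enter linearly
    sum_weights += weight[i];
  }
  std::printf("xentropy = %f\n", sum_loss / sum_weights);  // same reduction as CrossEntropyMetric::Eval
  return 0;
}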
@@ -3,6 +3,7 @@
#include "binary_objective.hpp"
#include "rank_objective.hpp"
#include "multiclass_objective.hpp"
#include "xentropy_objective.hpp"
namespace LightGBM {
@@ -26,6 +27,10 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string&
return new MulticlassSoftmax(config);
} else if (type == std::string("multiclassova")) {
return new MulticlassOVA(config);
} else if (type == std::string("xentropy") || type == std::string("cross_entropy")) {
return new CrossEntropy(config);
} else if (type == std::string("xentlambda") || type == std::string("cross_entropy_lambda")) {
return new CrossEntropyLambda(config);
}
return nullptr;
}
@@ -51,6 +56,10 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string&
return new MulticlassSoftmax(strs);
} else if (type == std::string("multiclassova")) {
return new MulticlassOVA(strs);
} else if (type == std::string("xentropy") || type == std::string("cross_entropy")) {
return new CrossEntropy(strs);
} else if (type == std::string("xentlambda") || type == std::string("cross_entropy_lambda")) {
return new CrossEntropyLambda(strs);
}
return nullptr;
}
#ifndef LIGHTGBM_OBJECTIVE_XENTROPY_OBJECTIVE_HPP_
#define LIGHTGBM_OBJECTIVE_XENTROPY_OBJECTIVE_HPP_
#include <LightGBM/utils/common.h>
#include <LightGBM/objective_function.h>
#include <cstring>
#include <cmath>
#include <sstream>  // for std::stringstream used in ToString()
/*
* Implements gradients and hessians for the following point losses.
* Target y is anything in interval [0, 1].
*
* (1) CrossEntropy; "xentropy";
*
* loss(y, p, w) = { -(1-y)*log(1-p)-y*log(p) }*w,
* with probability p = 1/(1+exp(-f)), where f is being boosted
*
* ConvertToOutput: f -> p
*
* (2) CrossEntropyLambda; "xentlambda"
*
* loss(y, p, w) = -(1-y)*log(1-p)-y*log(p),
* with p = 1-exp(-lambda*w), lambda = log(1+exp(f)), f being boosted, and w > 0
*
* ConvertToOutput: f -> lambda
*
* (1) and (2) give the same loss when w=1, but their ConvertOutput mappings still differ (probability vs. lambda).
*
*/
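// Derivative summary for (1), stated here for reference (standard logistic
// regression algebra, matching CrossEntropy::GetGradients below):
//   with p = 1/(1+exp(-f)):  dloss/df = (p - y)*w,  d2loss/df2 = p*(1-p)*w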
namespace LightGBM {
/*!
* \brief Objective function for cross-entropy (with optional linear weights)
*/
class CrossEntropy: public ObjectiveFunction {
public:
explicit CrossEntropy(const ObjectiveConfig&) {
}
explicit CrossEntropy(const std::vector<std::string>&) {
}
~CrossEntropy() {}
void Init(const Metadata& metadata, data_size_t num_data) override {
num_data_ = num_data;
label_ = metadata.label();
weights_ = metadata.weights();
CHECK_NOTNULL(label_);
Common::check_elements_interval_closed(label_, 0.0f, 1.0f, num_data_, GetName());
Log::Info("[%s:%s]: (objective) labels passed interval [0, 1] check", GetName(), __func__);
if (weights_ != nullptr) {
float minw;
double sumw;
Common::obtain_min_max_sum(weights_, num_data_, &minw, nullptr, &sumw);
if (minw < 0.0f) {
Log::Fatal("[%s]: at least one weight is negative.", GetName());
}
if (sumw == 0.0f) {
Log::Fatal("[%s]: sum of weights is zero.", GetName());
}
}
}
void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override {
if (weights_ == nullptr) {
// compute pointwise gradients and hessians with implied unit weights
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double z = 1.0f / (1.0f + std::exp(-score[i]));
gradients[i] = static_cast<score_t>(z - label_[i]);
hessians[i] = static_cast<score_t>(z * (1.0f - z));
}
} else {
// compute pointwise gradients and hessians with given weights
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double z = 1.0f / (1.0f + std::exp(-score[i]));
gradients[i] = static_cast<score_t>((z - label_[i]) * weights_[i]);
hessians[i] = static_cast<score_t>(z * (1.0f - z) * weights_[i]);
}
}
}
const char* GetName() const override {
return "xentropy";
}
// convert score to a probability
void ConvertOutput(const double* input, double* output) const override {
output[0] = 1.0f / (1.0f + std::exp(-input[0]));
}
std::string ToString() const override {
std::stringstream str_buf;
str_buf << GetName();
return str_buf.str();
}
bool BoostFromAverage() const override { return true; }
private:
/*! \brief Number of data points */
data_size_t num_data_;
/*! \brief Pointer for label */
const float* label_;
/*! \brief Weights for data */
const float* weights_;
};
/*!
* \brief Objective function for alternative parameterization of cross-entropy (see top of file for explanation)
*/
class CrossEntropyLambda: public ObjectiveFunction {
public:
explicit CrossEntropyLambda(const ObjectiveConfig&) {
min_weight_ = max_weight_ = 0.0f;
}
explicit CrossEntropyLambda(const std::vector<std::string>&) {
min_weight_ = max_weight_ = 0.0f;
}
~CrossEntropyLambda() {}
void Init(const Metadata& metadata, data_size_t num_data) override {
num_data_ = num_data;
label_ = metadata.label();
weights_ = metadata.weights();
CHECK_NOTNULL(label_);
Common::check_elements_interval_closed(label_, 0.0f, 1.0f, num_data_, GetName());
Log::Info("[%s:%s]: (objective) labels passed interval [0, 1] check", GetName(), __func__);
if (weights_ != nullptr) {
Common::obtain_min_max_sum(weights_, num_data_, &min_weight_, &max_weight_, nullptr);
if (min_weight_ <= 0.0f) {
Log::Fatal("[%s]: at least one weight is non-positive.", GetName());
}
// Issue an info statement about this ratio
double weight_ratio = max_weight_ / min_weight_;
Log::Info("[%s:%s]: min, max weights = %f, %f; ratio = %f",
GetName(), __func__,
min_weight_, max_weight_,
weight_ratio);
} else {
// all weights are implied to be unity; no need to do anything
}
}
void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override {
if (weights_ == nullptr) {
// compute pointwise gradients and hessians with implied unit weights; exactly equivalent to CrossEntropy with unit weights
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double z = 1.0f / (1.0f + std::exp(-score[i]));
gradients[i] = static_cast<score_t>(z - label_[i]);
hessians[i] = static_cast<score_t>(z * (1.0f - z));
}
} else {
// compute pointwise gradients and hessians with given weights
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double w = weights_[i];
const double y = label_[i];
const double epf = std::exp(score[i]);
const double hhat = std::log(1.0f + epf);
const double z = 1.0f - std::exp(-w*hhat);
const double enf = 1.0f / epf; // = std::exp(-score[i]);
gradients[i] = static_cast<score_t>((1.0f - y / z) * w / (1.0f + enf));
const double c = 1.0f / (1.0f - z);
double d = 1.0f + epf;
const double a = w * epf / (d * d);
d = c - 1.0f;
const double b = (c / (d * d) ) * (1.0f + w * epf - c);
hessians[i] = static_cast<score_t>(a * (1.0f + y * b));
}
}
}
const char* GetName() const override {
return "xentlambda";
}
//
// ATTENTION: the function output is the "normalized exponential parameter" lambda > 0, not the probability
//
// Even if this code read: output[0] = 1.0f / (1.0f + std::exp(-input[0]));
// the output would still not be the probability unless the weights are unity.
//
// Let z = 1 / (1 + exp(-f)), then prob(z) = 1-(1-z)^w, where w is the weight for the specific point.
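// (Sketch: 1 - z = 1/(1+exp(f)), hence exp(-w*lambda) = (1+exp(f))^(-w) = (1-z)^w.)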
//
void ConvertOutput(const double* input, double* output) const override {
output[0] = std::log(1.0f + std::exp(input[0]));
}
std::string ToString() const override {
std::stringstream str_buf;
str_buf << GetName();
return str_buf.str();
}
// might want to boost from a weighted average in general, if possible
bool BoostFromAverage() const override { return true; }
private:
/*! \brief Number of data points */
data_size_t num_data_;
/*! \brief Pointer for label */
const float* label_;
/*! \brief Weights for data */
const float* weights_;
/*! \brief Minimum weight found during init */
float min_weight_;
/*! \brief Maximum weight found during init */
float max_weight_;
};
} // end namespace LightGBM
#endif // end #ifndef LIGHTGBM_OBJECTIVE_XENTROPY_OBJECTIVE_HPP_
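Finally, a hedged, self-contained sketch (not part of the commit) that checks the closed-form "xentlambda" gradient above against a central finite difference at a single point; all names are local to the snippet.
#include <cmath>
#include <cstdio>
// Point loss with p = 1 - exp(-w * log(1 + exp(f))), as defined in the header comment.
static double XentLambdaPointLoss(double y, double f, double w) {
  const double hhat = std::log(1.0 + std::exp(f));
  const double p = 1.0 - std::exp(-w * hhat);
  return -(y * std::log(p) + (1.0 - y) * std::log(1.0 - p));
}
int main() {
  const double y = 0.3, f = 0.7, w = 2.0, eps = 1.0e-6;
  // Closed form, mirroring CrossEntropyLambda::GetGradients.
  const double hhat = std::log(1.0 + std::exp(f));
  const double z = 1.0 - std::exp(-w * hhat);
  const double analytic = (1.0 - y / z) * w / (1.0 + std::exp(-f));
  // Central finite difference of the same point loss.
  const double numeric = (XentLambdaPointLoss(y, f + eps, w) -
                          XentLambdaPointLoss(y, f - eps, w)) / (2.0 * eps);
  std::printf("analytic = %.8f numeric = %.8f\n", analytic, numeric);
  return 0;
}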