Commit 3ef3a489 authored by Guolin Ke

change init_score to double type

parent 12a96334
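
Why widen init_score from float to double: a float carries only about seven significant decimal digits, so externally supplied base scores are silently rounded when stored in single precision. A minimal standalone sketch of the effect (an illustration, not part of the commit):

#include <cstdio>

int main() {
  // A hypothetical init_score value with more precision than float can hold.
  double score = 0.123456789012345;
  float narrowed = static_cast<float>(score);
  std::printf("double: %.15f\n", score);                          // 0.123456789012345
  std::printf("float : %.15f\n", static_cast<double>(narrowed));  // ~0.123456791043282
  return 0;
}

With the field stored as double end to end (R wrapper, Python bindings, C API, and the internal Metadata buffers), that rounding step disappears.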
@@ -204,6 +204,8 @@ SEXP LGBM_DatasetSetField_R(SEXP handle,
       vec[i] = static_cast<int32_t>(R_INT_PTR(field_data)[i]);
     }
     CHECK_CALL(LGBM_DatasetSetField(R_GET_PTR(handle), name, vec.data(), len, C_API_DTYPE_INT32));
+  } else if (!strcmp("init_score", name)) {
+    CHECK_CALL(LGBM_DatasetSetField(R_GET_PTR(handle), name, R_REAL_PTR(field_data), len, C_API_DTYPE_FLOAT64));
   } else {
     std::vector<float> vec(len);
 #pragma omp parallel for schedule(static)
@@ -234,6 +236,12 @@ SEXP LGBM_DatasetGetField_R(SEXP handle,
     for (int i = 0; i < out_len - 1; ++i) {
       R_INT_PTR(field_data)[i] = p_data[i + 1] - p_data[i];
     }
+  } else if (!strcmp("init_score", name)) {
+    auto p_data = reinterpret_cast<const double*>(res);
+#pragma omp parallel for schedule(static)
+    for (int i = 0; i < out_len; ++i) {
+      R_REAL_PTR(field_data)[i] = p_data[i];
+    }
   } else {
     auto p_data = reinterpret_cast<const float*>(res);
 #pragma omp parallel for schedule(static)

@@ -94,8 +94,6 @@ public:
   * \brief Set initial scores
   * \param init_score Initial scores, this class will manage memory for init_score.
   */
-  void SetInitScore(const float* init_score, data_size_t len);
   void SetInitScore(const double* init_score, data_size_t len);
@@ -195,7 +193,7 @@ public:
   * \brief Get initial scores, if not exists, will return nullptr
   * \return Pointer of initial scores
   */
-  inline const float* init_score() const {
+  inline const double* init_score() const {
     if (!init_score_.empty()) {
       return init_score_.data();
     } else {
@@ -206,7 +204,7 @@ public:
   /*!
   * \brief Get size of initial scores
   */
-  inline data_size_t num_init_score() const { return num_init_score_; }
+  inline int64_t num_init_score() const { return num_init_score_; }
   /*! \brief Disable copy */
   Metadata& operator=(const Metadata&) = delete;
@@ -239,9 +237,9 @@ private:
   /*! \brief Number of queries */
   data_size_t num_queries_;
   /*! \brief Number of initial scores, used to check correct weight file */
-  data_size_t num_init_score_;
+  int64_t num_init_score_;
   /*! \brief Initial score */
-  std::vector<float> init_score_;
+  std::vector<double> init_score_;
   /*! \brief Queries data */
   std::vector<data_size_t> queries_;
   /*! \brief Mutex for thread-safe calls */
@@ -336,10 +334,14 @@ public:
   bool SetFloatField(const char* field_name, const float* field_data, data_size_t num_element);
+  bool SetDoubleField(const char* field_name, const double* field_data, data_size_t num_element);
   bool SetIntField(const char* field_name, const int* field_data, data_size_t num_element);
   bool GetFloatField(const char* field_name, data_size_t* out_len, const float** out_ptr);
+  bool GetDoubleField(const char* field_name, data_size_t* out_len, const double** out_ptr);
   bool GetIntField(const char* field_name, data_size_t* out_len, const int** out_ptr);
   /*!

@@ -90,6 +90,14 @@ def cfloat32_array_to_numpy(cptr, length):
     else:
         raise RuntimeError('Expected float pointer')
 
+def cfloat64_array_to_numpy(cptr, length):
+    """Convert a ctypes double pointer array to a numpy array.
+    """
+    if isinstance(cptr, ctypes.POINTER(ctypes.c_double)):
+        return np.fromiter(cptr, dtype=np.float64, count=length)
+    else:
+        raise RuntimeError('Expected double pointer')
+
 def cint32_array_to_numpy(cptr, length):
     """Convert a ctypes int pointer array to a numpy array.
@@ -162,7 +170,7 @@ C_API_PREDICT_LEAF_INDEX = 2
 """data type of data field"""
 FIELD_TYPE_MAPPER = {"label": C_API_DTYPE_FLOAT32,
                      "weight": C_API_DTYPE_FLOAT32,
-                     "init_score": C_API_DTYPE_FLOAT32,
+                     "init_score": C_API_DTYPE_FLOAT64,
                      "group": C_API_DTYPE_INT32}
@@ -616,7 +624,6 @@ class Dataset(object):
                 for j in range_(self.predictor.num_class):
                     new_init_score[j * num_data + i] = init_score[i * self.predictor.num_class + j]
             init_score = new_init_score
-            init_score = init_score.astype(dtype=np.float32, copy=False)
             self.set_init_score(init_score)
         elif self.predictor is not None:
             raise TypeError('wrong predictor type {}'.format(type(self.predictor).__name__))
@@ -813,16 +820,23 @@ class Dataset(object):
                 ctypes.c_int(0),
                 ctypes.c_int(FIELD_TYPE_MAPPER[field_name])))
             return
-        dtype = np.int32 if field_name == 'group' else np.float32
+        dtype = np.float32
+        if field_name == 'group':
+            dtype = np.int32
+        elif field_name == 'init_score':
+            dtype = np.float64
         data = list_to_1d_numpy(data, dtype, name=field_name)
         if data.dtype == np.float32:
             ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
             type_data = C_API_DTYPE_FLOAT32
+        elif data.dtype == np.float64:
+            ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
+            type_data = C_API_DTYPE_FLOAT64
         elif data.dtype == np.int32:
             ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
             type_data = C_API_DTYPE_INT32
         else:
-            raise TypeError("Excepted np.float32 or np.int32, meet type({})".format(data.dtype))
+            raise TypeError("Expected np.float32/64 or np.int32, met type({})".format(data.dtype))
         if type_data != FIELD_TYPE_MAPPER[field_name]:
             raise TypeError("Input type error for set_field")
         _safe_call(_LIB.LGBM_DatasetSetField(
@@ -864,6 +878,8 @@ class Dataset(object):
             return cint32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_int32)), tmp_out_len.value)
         elif out_type.value == C_API_DTYPE_FLOAT32:
             return cfloat32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)), tmp_out_len.value)
+        elif out_type.value == C_API_DTYPE_FLOAT64:
+            return cfloat64_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_double)), tmp_out_len.value)
         else:
             raise TypeError("Unknown type")
@@ -976,7 +992,7 @@ class Dataset(object):
         """
         self.init_score = init_score
         if self.handle is not None and init_score is not None:
-            init_score = list_to_1d_numpy(init_score, name='init_score')
+            init_score = list_to_1d_numpy(init_score, np.float64, name='init_score')
             self.set_field('init_score', init_score)
 
     def set_group(self, group):

@@ -20,18 +20,22 @@ public:
   */
   ScoreUpdater(const Dataset* data, int num_class) : data_(data) {
     num_data_ = data->num_data();
-    size_t total_size = static_cast<size_t>(num_data_) * num_class;
+    int64_t total_size = static_cast<int64_t>(num_data_) * num_class;
     score_.resize(total_size);
     // default start score is zero
-    std::fill(score_.begin(), score_.end(), 0.0f);
-    const float* init_score = data->metadata().init_score();
+#pragma omp parallel for schedule(static)
+    for (int64_t i = 0; i < total_size; ++i) {
+      score_[i] = 0.0f;
+    }
+    const double* init_score = data->metadata().init_score();
     // if exists initial score, will start from it
     if (init_score != nullptr) {
       if ((data->metadata().num_init_score() % num_data_) != 0
         || (data->metadata().num_init_score() / num_data_) != num_class) {
         Log::Fatal("number of class for initial score error");
       }
-      for (size_t i = 0; i < total_size; ++i) {
+#pragma omp parallel for schedule(static)
+      for (int64_t i = 0; i < total_size; ++i) {
         score_[i] = init_score[i];
       }
     }
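
The ScoreUpdater hunk above replaces std::fill with an explicitly parallel loop and widens total_size to int64_t, since num_data * num_class can overflow 32 bits and OpenMP's canonical loop form wants a signed index. A standalone sketch of the pattern (an illustration, assuming an OpenMP-enabled compiler):

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Initialize a potentially huge score buffer in parallel instead of
  // with a serial std::fill. The signed 64-bit index avoids overflow
  // when the row count times the class count exceeds INT_MAX.
  std::vector<double> score(1 << 22);
  const int64_t total_size = static_cast<int64_t>(score.size());
#pragma omp parallel for schedule(static)
  for (int64_t i = 0; i < total_size; ++i) {
    score[i] = 0.0;
  }
  std::printf("initialized %lld values\n", static_cast<long long>(total_size));
  return 0;
}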
@@ -536,6 +536,8 @@ DllExport int LGBM_DatasetSetField(DatasetHandle handle,
     is_success = dataset->SetFloatField(field_name, reinterpret_cast<const float*>(field_data), static_cast<int32_t>(num_element));
   } else if (type == C_API_DTYPE_INT32) {
     is_success = dataset->SetIntField(field_name, reinterpret_cast<const int*>(field_data), static_cast<int32_t>(num_element));
+  } else if (type == C_API_DTYPE_FLOAT64) {
+    is_success = dataset->SetDoubleField(field_name, reinterpret_cast<const double*>(field_data), static_cast<int32_t>(num_element));
   }
   if (!is_success) { throw std::runtime_error("Input data type error or field not found"); }
   API_END();
@@ -555,6 +557,9 @@ DllExport int LGBM_DatasetGetField(DatasetHandle handle,
   } else if (dataset->GetIntField(field_name, out_len, reinterpret_cast<const int**>(out_ptr))) {
     *out_type = C_API_DTYPE_INT32;
     is_success = true;
+  } else if (dataset->GetDoubleField(field_name, out_len, reinterpret_cast<const double**>(out_ptr))) {
+    *out_type = C_API_DTYPE_FLOAT64;
+    is_success = true;
   }
   if (!is_success) { throw std::runtime_error("Field not found"); }
   if (*out_ptr == nullptr) { *out_len = 0; }
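
With the new FLOAT64 branches in LGBM_DatasetSetField and LGBM_DatasetGetField, C-API callers that read init_score back must now be prepared for C_API_DTYPE_FLOAT64 instead of FLOAT32. A hedged sketch of such a caller (error handling mostly elided; assumes the standard LightGBM c_api.h):

#include <LightGBM/c_api.h>
#include <cstdio>

// Read "init_score" back from a dataset and print it. After this commit
// the field is returned as C_API_DTYPE_FLOAT64, i.e. a double buffer.
void PrintInitScore(DatasetHandle handle) {
  int out_len = 0;
  int out_type = 0;
  const void* out_ptr = nullptr;
  if (LGBM_DatasetGetField(handle, "init_score", &out_len, &out_ptr, &out_type) != 0) {
    return;  // field not found or other API error
  }
  if (out_type == C_API_DTYPE_FLOAT64 && out_ptr != nullptr) {
    const double* scores = static_cast<const double*>(out_ptr);
    for (int i = 0; i < out_len; ++i) {
      std::printf("%f\n", scores[i]);
    }
  }
}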
@@ -77,7 +77,16 @@ bool Dataset::SetFloatField(const char* field_name, const float* field_data, data_size_t num_element) {
     metadata_.SetLabel(field_data, num_element);
   } else if (name == std::string("weight") || name == std::string("weights")) {
     metadata_.SetWeights(field_data, num_element);
-  } else if (name == std::string("init_score")) {
-    metadata_.SetInitScore(field_data, num_element);
   } else {
     return false;
   }
   return true;
 }
+
+bool Dataset::SetDoubleField(const char* field_name, const double* field_data, data_size_t num_element) {
+  std::string name(field_name);
+  name = Common::Trim(name);
+  if (name == std::string("init_score")) {
+    metadata_.SetInitScore(field_data, num_element);
+  } else {
+    return false;
@@ -107,9 +116,18 @@ bool Dataset::GetFloatField(const char* field_name, data_size_t* out_len, const float** out_ptr) {
   } else if (name == std::string("weight") || name == std::string("weights")) {
     *out_ptr = metadata_.weights();
     *out_len = num_data_;
-  } else if (name == std::string("init_score")) {
-    *out_ptr = metadata_.init_score();
-    *out_len = num_data_;
   } else {
     return false;
   }
   return true;
 }
+
+bool Dataset::GetDoubleField(const char* field_name, data_size_t* out_len, const double** out_ptr) {
+  std::string name(field_name);
+  name = Common::Trim(name);
+  if (name == std::string("init_score")) {
+    *out_ptr = metadata_.init_score();
+    *out_len = static_cast<data_size_t>(metadata_.num_init_score());
+  } else {
+    return false;
+  }

@@ -20,7 +20,10 @@ public:
     : num_data_(num_data) {
     data_.resize(num_data_);
     VAL_T default_bin_T = static_cast<VAL_T>(default_bin);
-    std::fill(data_.begin(), data_.end(), default_bin_T);
+#pragma omp parallel for schedule(static)
+    for (data_size_t i = 0; i < num_data_; ++i) {
+      data_[i] = default_bin_T;
+    }
   }
   ~DenseBin() {

@@ -36,7 +36,10 @@ void Metadata::Init(data_size_t num_data, int weight_idx, int query_idx) {
     }
     weights_ = std::vector<float>(num_data_);
     num_weights_ = num_data_;
-    std::fill(weights_.begin(), weights_.end(), 0.0f);
+#pragma omp parallel for schedule(static)
+    for (data_size_t i = 0; i < num_weights_; ++i) {
+      weights_[i] = 0.0f;
+    }
   }
   if (query_idx >= 0) {
     if (!query_boundaries_.empty()) {
@@ -45,7 +48,10 @@ void Metadata::Init(data_size_t num_data, int weight_idx, int query_idx) {
     }
     if (!query_weights_.empty()) { query_weights_.clear(); }
     queries_ = std::vector<data_size_t>(num_data_);
-    std::fill(queries_.begin(), queries_.end(), 0);
+#pragma omp parallel for schedule(static)
+    for (data_size_t i = 0; i < num_data_; ++i) {
+      queries_[i] = 0;
+    }
   }
 }
@@ -53,6 +59,7 @@ void Metadata::Init(const Metadata& fullset, const data_size_t* used_indices, data_size_t num_used_indices) {
   num_data_ = num_used_indices;
   label_ = std::vector<float>(num_used_indices);
+#pragma omp parallel for schedule(static)
   for (data_size_t i = 0; i < num_used_indices; i++) {
     label_[i] = fullset.label_[used_indices[i]];
   }
@@ -60,6 +67,7 @@ void Metadata::Init(const Metadata& fullset, const data_size_t* used_indices, data_size_t num_used_indices) {
   if (!fullset.weights_.empty()) {
     weights_ = std::vector<float>(num_used_indices);
     num_weights_ = num_used_indices;
+#pragma omp parallel for schedule(static)
     for (data_size_t i = 0; i < num_used_indices; i++) {
       weights_[i] = fullset.weights_[used_indices[i]];
     }
@@ -68,9 +76,10 @@ void Metadata::Init(const Metadata& fullset, const data_size_t* used_indices, data_size_t num_used_indices) {
   }
   if (!fullset.init_score_.empty()) {
-    int num_class = static_cast<int>(fullset.num_init_score_) / fullset.num_data_;
-    init_score_ = std::vector<float>(num_used_indices*num_class);
-    num_init_score_ = num_used_indices*num_class;
+    int num_class = static_cast<int>(fullset.num_init_score_ / fullset.num_data_);
+    init_score_ = std::vector<double>(num_used_indices*num_class);
+    num_init_score_ = static_cast<int64_t>(num_used_indices) * num_class;
+#pragma omp parallel for schedule(static)
     for (int k = 0; k < num_class; ++k) {
       for (data_size_t i = 0; i < num_used_indices; i++) {
         init_score_[k*num_data_ + i] = fullset.init_score_[k* fullset.num_data_ + used_indices[i]];
@@ -121,6 +130,7 @@ void Metadata::PartitionLabel(const std::vector<data_size_t>& used_indices) {
   auto old_label = label_;
   num_data_ = static_cast<data_size_t>(used_indices.size());
   label_ = std::vector<float>(num_data_);
+#pragma omp parallel for schedule(static)
   for (data_size_t i = 0; i < num_data_; ++i) {
     label_[i] = old_label[used_indices[i]];
   }
@@ -201,7 +211,8 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data_size_t>& used_data_indices) {
     auto old_weights = weights_;
     num_weights_ = num_data_;
     weights_ = std::vector<float>(num_data_);
-    for (size_t i = 0; i < used_data_indices.size(); ++i) {
+#pragma omp parallel for schedule(static)
+    for (int i = 0; i < static_cast<int>(used_data_indices.size()); ++i) {
       weights_[i] = old_weights[used_data_indices[i]];
     }
     old_weights.clear();
@@ -243,9 +254,10 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data_size_t>& used_data_indices) {
     // get local initial scores
     if (!init_score_.empty()) {
       auto old_scores = init_score_;
-      int num_class = num_init_score_ / num_all_data;
-      num_init_score_ = num_data_ * num_class;
-      init_score_ = std::vector<float>(num_init_score_);
+      int num_class = static_cast<int>(num_init_score_ / num_all_data);
+      num_init_score_ = static_cast<int64_t>(num_data_) * num_class;
+      init_score_ = std::vector<double>(num_init_score_);
+#pragma omp parallel for schedule(static)
       for (int k = 0; k < num_class; ++k) {
         for (size_t i = 0; i < used_data_indices.size(); ++i) {
           init_score_[k * num_data_ + i] = old_scores[k * num_all_data + used_data_indices[i]];
@@ -259,26 +271,6 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data_size_t>& used_data_indices) {
   }
 }
 
-void Metadata::SetInitScore(const float* init_score, data_size_t len) {
-  std::lock_guard<std::mutex> lock(mutex_);
-  // save to nullptr
-  if (init_score == nullptr || len == 0) {
-    init_score_.clear();
-    num_init_score_ = 0;
-    return;
-  }
-  if ((len % num_data_) != 0) {
-    Log::Fatal("Initial score size doesn't match data size");
-  }
-  if (!init_score_.empty()) { init_score_.clear(); }
-  num_init_score_ = len;
-  init_score_ = std::vector<float>(len);
-  for (data_size_t i = 0; i < len; ++i) {
-    init_score_[i] = init_score[i];
-  }
-}
-
 void Metadata::SetInitScore(const double* init_score, data_size_t len) {
   std::lock_guard<std::mutex> lock(mutex_);
   // save to nullptr
@@ -292,9 +284,10 @@ void Metadata::SetInitScore(const double* init_score, data_size_t len) {
   }
   if (!init_score_.empty()) { init_score_.clear(); }
   num_init_score_ = len;
-  init_score_ = std::vector<float>(len);
-  for (data_size_t i = 0; i < len; ++i) {
-    init_score_[i] = static_cast<float>(init_score[i]);
+  init_score_ = std::vector<double>(len);
+#pragma omp parallel for schedule(static)
+  for (int64_t i = 0; i < num_init_score_; ++i) {
+    init_score_[i] = init_score[i];
   }
 }
@@ -308,6 +301,7 @@ void Metadata::SetLabel(const float* label, data_size_t len) {
   }
   if (!label_.empty()) { label_.clear(); }
   label_ = std::vector<float>(num_data_);
+#pragma omp parallel for schedule(static)
   for (data_size_t i = 0; i < num_data_; ++i) {
     label_[i] = label[i];
   }
@@ -327,6 +321,7 @@ void Metadata::SetWeights(const float* weights, data_size_t len) {
   if (!weights_.empty()) { weights_.clear(); }
   num_weights_ = num_data_;
   weights_ = std::vector<float>(num_weights_);
+#pragma omp parallel for schedule(static)
   for (data_size_t i = 0; i < num_weights_; ++i) {
     weights_[i] = weights[i];
   }
@@ -342,6 +337,7 @@ void Metadata::SetQuery(const data_size_t* query, data_size_t len) {
     return;
   }
   data_size_t sum = 0;
+#pragma omp parallel for schedule(static) reduction(+:sum)
   for (data_size_t i = 0; i < len; ++i) {
     sum += query[i];
   }
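
The reduction(+:sum) clause added above gives every thread a private accumulator that OpenMP combines once the loop finishes; without it, the concurrent sum += query[i] updates would be a data race. A minimal standalone illustration:

#include <cstdio>

int main() {
  const int len = 100000;
  long long sum = 0;  // stands in for the query-count total
#pragma omp parallel for schedule(static) reduction(+:sum)
  for (int i = 0; i < len; ++i) {
    sum += i;  // stands in for query[i]
  }
  std::printf("%lld\n", sum);  // always 4999950000, with or without threads
  return 0;
}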
@@ -413,6 +409,7 @@ void Metadata::LoadWeights() {
   Log::Info("Loading weights...");
   num_weights_ = static_cast<data_size_t>(reader.Lines().size());
   weights_ = std::vector<float>(num_weights_);
+#pragma omp parallel for schedule(static)
   for (data_size_t i = 0; i < num_weights_; ++i) {
     double tmp_weight = 0.0f;
     Common::Atof(reader.Lines()[i].c_str(), &tmp_weight);
@@ -435,26 +432,28 @@ void Metadata::LoadInitialScore() {
   // use first line to count number class
   int num_class = static_cast<int>(Common::Split(reader.Lines()[0].c_str(), '\t').size());
   data_size_t num_line = static_cast<data_size_t>(reader.Lines().size());
-  num_init_score_ = static_cast<data_size_t>(num_line * num_class);
-  init_score_ = std::vector<float>(num_init_score_);
-  double tmp = 0.0f;
+  num_init_score_ = static_cast<int64_t>(num_line) * num_class;
+  init_score_ = std::vector<double>(num_init_score_);
   if (num_class == 1) {
+#pragma omp parallel for schedule(static)
     for (data_size_t i = 0; i < num_line; ++i) {
+      double tmp = 0.0f;
       Common::Atof(reader.Lines()[i].c_str(), &tmp);
-      init_score_[i] = static_cast<float>(tmp);
+      init_score_[i] = static_cast<double>(tmp);
     }
   } else {
     std::vector<std::string> oneline_init_score;
+#pragma omp parallel for schedule(static)
     for (data_size_t i = 0; i < num_line; ++i) {
+      double tmp = 0.0f;
       oneline_init_score = Common::Split(reader.Lines()[i].c_str(), '\t');
       if (static_cast<int>(oneline_init_score.size()) != num_class) {
         Log::Fatal("Invalid initial score file. Redundant or insufficient columns.");
       }
       for (int k = 0; k < num_class; ++k) {
         Common::Atof(oneline_init_score[k].c_str(), &tmp);
-        init_score_[k * num_line + i] = static_cast<float>(tmp);
+        init_score_[k * num_line + i] = static_cast<double>(tmp);
       }
     }
   }
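
Note the class-major layout used throughout the patch: the initial score of row i for class k lives at k * num_data + i (the k * num_line + i indexing above, and the matching transpose in the Python hunk near the top). A small sketch of that conversion from the row-major layout a caller typically supplies (names are illustrative, not from the patch):

#include <cstdio>
#include <vector>

// Convert row-major scores[i * num_class + k] (one row per data point)
// into the class-major layout kept internally: out[k * num_data + i].
std::vector<double> ToClassMajor(const std::vector<double>& row_major,
                                 int num_data, int num_class) {
  std::vector<double> out(row_major.size());
  for (int i = 0; i < num_data; ++i) {
    for (int k = 0; k < num_class; ++k) {
      out[k * num_data + i] = row_major[i * num_class + k];
    }
  }
  return out;
}

int main() {
  // Two data points, three classes.
  std::vector<double> row_major = {1, 2, 3, 4, 5, 6};
  std::vector<double> class_major = ToClassMajor(row_major, 2, 3);
  for (double v : class_major) std::printf("%g ", v);  // 1 4 2 5 3 6
  std::printf("\n");
  return 0;
}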