Commit 219c943d authored by remcob-gr's avatar remcob-gr Committed by Guolin Ke
Browse files

Add ability to move features from one data set to another in memory (#2006)

* Initial attempt to implement appending features in-memory to another data set

The intent is for this to enable munging files together easily, without needing to round-trip via numpy or write multiple copies to disk.
In turn, that enables working more efficiently with data sets that were written separately.

* Implement Dataset.dump_text, and fix small bug in appending of group bin boundaries.

Dumping to text enables us to compare results, without having to worry about issues like features being reordered.

* Add basic tests for validation logic for add_features_from.

* Remove various internal mapping items from dataset text dumps

These are too sensitive to the exact feature order chosen, which is not visible to the user.
Including them in tests appears unnecessary, as the data dumping code should provide enough coverage.

* Add test that add_features_from results in identical data sets according to dump_text.

* Add test that booster behaviour after using add_features_from matches that of training on the full data

This checks:
- That training after add_features_from works at all
- That add_features_from does not cause training to misbehave

* Expose feature_penalty and monotone_types/constraints via get_field

These getters allow us to check that add_features_from does the right thing with these vectors.

* Add tests that add_features correctly handles feature_penalty and monotone_constraints.

* Ensure add_features_from properly frees the added dataset and add unit test for this

Since add_features_from moves the feature group pointers from the added dataset to the dataset being added to, the added dataset is invalid after the call.
We must ensure we do not try and access this handle.

* Remove some obsolete TODOs

* Tidy up DumpTextFile by using a single iterator for each feature

These iterators were also being passed around as raw pointers without ever being freed; that leak is now fixed.

* Factor out offsetting logic in AddFeaturesFrom

* Remove obsolete TODO

* Remove another TODO

This one is debatable, test code can be a bit messy and duplicate-heavy, factoring it out tends to end badly.
Leaving this for now, will revisit if adding more tests later on becomes a mess.

* Add documentation for newly-added methods.

* Fix whitespace issues identified by pylint.

* Fix a few more whitespace issues.

* Fix doc comments

* Implement deep copying for feature groups.

* Replace awkward std::move usage by emplace_back, and reduce vector size to num_features rather than num_total_features.

* Copy feature groups in addFeaturesFrom, rather than moving them.

* Fix bugs in FeatureGroup copy constructor and ensure source dataset remains usable

* Add reserve to PushVector and PushOffset

* Move definition of Clone into class body

* Fix PR review issues

* Fix for loop increment style.

* Fix test failure

* Some more docstring fixes.

* Remove blank line
parent f9ab5f58
...@@ -447,6 +447,11 @@ class Bin { ...@@ -447,6 +447,11 @@ class Bin {
* \return The bin data object * \return The bin data object
*/ */
static Bin* CreateSparseBin(data_size_t num_data, int num_bin); static Bin* CreateSparseBin(data_size_t num_data, int num_bin);
/*!
* \brief Deep copy the bin
*/
virtual Bin* Clone() = 0;
}; };
inline uint32_t BinMapper::ValueToBin(double value) const { inline uint32_t BinMapper::ValueToBin(double value) const {
......
...@@ -21,6 +21,7 @@ typedef void* BoosterHandle; ...@@ -21,6 +21,7 @@ typedef void* BoosterHandle;
#define C_API_DTYPE_FLOAT64 (1) #define C_API_DTYPE_FLOAT64 (1)
#define C_API_DTYPE_INT32 (2) #define C_API_DTYPE_INT32 (2)
#define C_API_DTYPE_INT64 (3) #define C_API_DTYPE_INT64 (3)
#define C_API_DTYPE_INT8 (4)
#define C_API_PREDICT_NORMAL (0) #define C_API_PREDICT_NORMAL (0)
#define C_API_PREDICT_RAW_SCORE (1) #define C_API_PREDICT_RAW_SCORE (1)
...@@ -269,7 +270,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetGetFeatureNames( ...@@ -269,7 +270,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetGetFeatureNames(
LIGHTGBM_C_EXPORT int LGBM_DatasetFree(DatasetHandle handle); LIGHTGBM_C_EXPORT int LGBM_DatasetFree(DatasetHandle handle);
/*! /*!
* \brief save dateset to binary file * \brief save dataset to binary file
* \param handle a instance of dataset * \param handle a instance of dataset
* \param filename file name * \param filename file name
* \return 0 when succeed, -1 when failure happens * \return 0 when succeed, -1 when failure happens
...@@ -277,6 +278,15 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetFree(DatasetHandle handle); ...@@ -277,6 +278,15 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetFree(DatasetHandle handle);
LIGHTGBM_C_EXPORT int LGBM_DatasetSaveBinary(DatasetHandle handle, LIGHTGBM_C_EXPORT int LGBM_DatasetSaveBinary(DatasetHandle handle,
const char* filename); const char* filename);
/*!
* \brief save dataset to text file, intended for debugging use only
* \param handle a instance of dataset
* \param filename file name
* \return 0 when succeed, -1 when failure happens
*/
LIGHTGBM_C_EXPORT int LGBM_DatasetDumpText(DatasetHandle handle,
const char* filename);
/*! /*!
* \brief set vector to a content in info * \brief set vector to a content in info
* Note: group and group only work for C_API_DTYPE_INT32 * Note: group and group only work for C_API_DTYPE_INT32
...@@ -335,6 +345,15 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetGetNumData(DatasetHandle handle, ...@@ -335,6 +345,15 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetGetNumData(DatasetHandle handle,
LIGHTGBM_C_EXPORT int LGBM_DatasetGetNumFeature(DatasetHandle handle, LIGHTGBM_C_EXPORT int LGBM_DatasetGetNumFeature(DatasetHandle handle,
int* out); int* out);
/*!
* \brief Add features from source to target, then free source
* \param target The handle of the dataset to add features to
* \param source The handle of the dataset to take features from
* \return 0 when succeed, -1 when failure happens
*/
LIGHTGBM_C_EXPORT int LGBM_DatasetAddFeaturesFrom(DatasetHandle target,
DatasetHandle source);
// --- start Booster interfaces // --- start Booster interfaces
/*! /*!
......
...@@ -386,11 +386,15 @@ class Dataset { ...@@ -386,11 +386,15 @@ class Dataset {
LIGHTGBM_EXPORT bool GetIntField(const char* field_name, data_size_t* out_len, const int** out_ptr); LIGHTGBM_EXPORT bool GetIntField(const char* field_name, data_size_t* out_len, const int** out_ptr);
LIGHTGBM_EXPORT bool GetInt8Field(const char* field_name, data_size_t* out_len, const int8_t** out_ptr);
/*! /*!
* \brief Save current dataset into binary file, will save to "filename.bin" * \brief Save current dataset into binary file, will save to "filename.bin"
*/ */
LIGHTGBM_EXPORT void SaveBinaryFile(const char* bin_filename); LIGHTGBM_EXPORT void SaveBinaryFile(const char* bin_filename);
LIGHTGBM_EXPORT void DumpTextFile(const char* text_filename);
LIGHTGBM_EXPORT void CopyFeatureMapperFrom(const Dataset* dataset); LIGHTGBM_EXPORT void CopyFeatureMapperFrom(const Dataset* dataset);
LIGHTGBM_EXPORT void CreateValid(const Dataset* dataset); LIGHTGBM_EXPORT void CreateValid(const Dataset* dataset);
...@@ -581,6 +585,8 @@ class Dataset { ...@@ -581,6 +585,8 @@ class Dataset {
/*! \brief Disable copy */ /*! \brief Disable copy */
Dataset(const Dataset&) = delete; Dataset(const Dataset&) = delete;
void addFeaturesFrom(Dataset* other);
private: private:
std::string data_filename_; std::string data_filename_;
/*! \brief Store used features */ /*! \brief Store used features */
......
...@@ -211,8 +211,20 @@ class FeatureGroup { ...@@ -211,8 +211,20 @@ class FeatureGroup {
} }
/*! \brief Disable copy */ /*! \brief Disable copy */
FeatureGroup& operator=(const FeatureGroup&) = delete; FeatureGroup& operator=(const FeatureGroup&) = delete;
/*! \brief Disable copy */ /*! \brief Deep copy */
FeatureGroup(const FeatureGroup&) = delete; FeatureGroup(const FeatureGroup& other){
num_feature_ = other.num_feature_;
is_sparse_ = other.is_sparse_;
num_total_bin_ = other.num_total_bin_;
bin_offsets_ = other.bin_offsets_;
bin_mappers_.reserve(other.bin_mappers_.size());
for(auto& bin_mapper : other.bin_mappers_){
bin_mappers_.emplace_back(new BinMapper(*bin_mapper));
}
bin_data_.reset(other.bin_data_->Clone());
}
private: private:
/*! \brief Number of features */ /*! \brief Number of features */
......
...@@ -107,6 +107,14 @@ def cint32_array_to_numpy(cptr, length): ...@@ -107,6 +107,14 @@ def cint32_array_to_numpy(cptr, length):
raise RuntimeError('Expected int pointer') raise RuntimeError('Expected int pointer')
def cint8_array_to_numpy(cptr, length):
    """Convert a ctypes int8 pointer array to a numpy array.

    Parameters
    ----------
    cptr : ctypes.POINTER(ctypes.c_int8)
        Pointer to the first element to read.
    length : int
        Number of elements to read.

    Returns
    -------
    numpy array with dtype int8.
    """
    if isinstance(cptr, ctypes.POINTER(ctypes.c_int8)):
        return np.fromiter(cptr, dtype=np.int8, count=length)
    else:
        # Message previously said "int pointer" (copy-paste from the int32 helper).
        raise RuntimeError('Expected int8 pointer')
def c_str(string): def c_str(string):
"""Convert a Python string to C string.""" """Convert a Python string to C string."""
return ctypes.c_char_p(string.encode('utf-8')) return ctypes.c_char_p(string.encode('utf-8'))
...@@ -166,6 +174,7 @@ C_API_DTYPE_FLOAT32 = 0 ...@@ -166,6 +174,7 @@ C_API_DTYPE_FLOAT32 = 0
C_API_DTYPE_FLOAT64 = 1 C_API_DTYPE_FLOAT64 = 1
C_API_DTYPE_INT32 = 2 C_API_DTYPE_INT32 = 2
C_API_DTYPE_INT64 = 3 C_API_DTYPE_INT64 = 3
C_API_DTYPE_INT8 = 4
"""Matrix is row major in Python""" """Matrix is row major in Python"""
C_API_IS_ROW_MAJOR = 1 C_API_IS_ROW_MAJOR = 1
...@@ -180,7 +189,9 @@ C_API_PREDICT_CONTRIB = 3 ...@@ -180,7 +189,9 @@ C_API_PREDICT_CONTRIB = 3
FIELD_TYPE_MAPPER = {"label": C_API_DTYPE_FLOAT32, FIELD_TYPE_MAPPER = {"label": C_API_DTYPE_FLOAT32,
"weight": C_API_DTYPE_FLOAT32, "weight": C_API_DTYPE_FLOAT32,
"init_score": C_API_DTYPE_FLOAT64, "init_score": C_API_DTYPE_FLOAT64,
"group": C_API_DTYPE_INT32} "group": C_API_DTYPE_INT32,
"feature_penalty": C_API_DTYPE_FLOAT64,
"monotone_constraints": C_API_DTYPE_INT8}
PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int', PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
'int64': 'int', 'uint8': 'int', 'uint16': 'int', 'int64': 'int', 'uint8': 'int', 'uint16': 'int',
...@@ -700,6 +711,8 @@ class Dataset(object): ...@@ -700,6 +711,8 @@ class Dataset(object):
self._predictor = None self._predictor = None
self.pandas_categorical = None self.pandas_categorical = None
self.params_back_up = None self.params_back_up = None
self.feature_penalty = None
self.monotone_constraints = None
def __del__(self): def __del__(self):
try: try:
...@@ -1179,6 +1192,8 @@ class Dataset(object): ...@@ -1179,6 +1192,8 @@ class Dataset(object):
return cfloat32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)), tmp_out_len.value) return cfloat32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)), tmp_out_len.value)
elif out_type.value == C_API_DTYPE_FLOAT64: elif out_type.value == C_API_DTYPE_FLOAT64:
return cfloat64_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_double)), tmp_out_len.value) return cfloat64_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_double)), tmp_out_len.value)
elif out_type.value == C_API_DTYPE_INT8:
return cint8_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_int8)), tmp_out_len.value)
else: else:
raise TypeError("Unknown type") raise TypeError("Unknown type")
...@@ -1382,6 +1397,30 @@ class Dataset(object): ...@@ -1382,6 +1397,30 @@ class Dataset(object):
self.weight = self.get_field('weight') self.weight = self.get_field('weight')
return self.weight return self.weight
def get_feature_penalty(self):
    """Get the feature penalty of the Dataset.

    Returns
    -------
    feature_penalty : numpy array or None
        Feature penalty for each feature in the Dataset.
    """
    # Lazily fetch from the C API and cache on the instance.
    penalty = self.feature_penalty
    if penalty is None:
        penalty = self.get_field('feature_penalty')
        self.feature_penalty = penalty
    return penalty
def get_monotone_constraints(self):
    """Get the monotone constraints of the Dataset.

    Returns
    -------
    monotone_constraints : numpy array or None
        Monotone constraints: -1, 0 or 1, for each feature in the Dataset.
    """
    # Lazily fetch from the C API and cache on the instance.
    constraints = self.monotone_constraints
    if constraints is None:
        constraints = self.get_field('monotone_constraints')
        self.monotone_constraints = constraints
    return constraints
def get_init_score(self): def get_init_score(self):
"""Get the initial score of the Dataset. """Get the initial score of the Dataset.
...@@ -1494,6 +1533,46 @@ class Dataset(object): ...@@ -1494,6 +1533,46 @@ class Dataset(object):
break break
return ref_chain return ref_chain
def add_features_from(self, other):
    """Add features from other Dataset to the current Dataset.

    Both Datasets must be constructed before calling this method.

    Parameters
    ----------
    other : Dataset
        The Dataset to take features from.

    Returns
    -------
    self : Dataset
        Dataset with the new features added.
    """
    # Both handles must exist before the C API call is safe.
    for dataset in (self, other):
        if dataset.handle is None:
            raise ValueError('Both source and target Datasets must be constructed before adding features')
    _safe_call(_LIB.LGBM_DatasetAddFeaturesFrom(self.handle, other.handle))
    return self
def dump_text(self, filename):
    """Save Dataset to a text file.

    This format cannot be loaded back in by LightGBM, but is useful
    for debugging purposes.

    Parameters
    ----------
    filename : string
        Name of the output file.

    Returns
    -------
    self : Dataset
        Returns self.
    """
    # construct() ensures the handle exists before dumping.
    handle = self.construct().handle
    _safe_call(_LIB.LGBM_DatasetDumpText(handle, c_str(filename)))
    return self
class Booster(object): class Booster(object):
"""Booster in LightGBM.""" """Booster in LightGBM."""
......
...@@ -813,6 +813,14 @@ int LGBM_DatasetSaveBinary(DatasetHandle handle, ...@@ -813,6 +813,14 @@ int LGBM_DatasetSaveBinary(DatasetHandle handle,
API_END(); API_END();
} }
// C API entry point: dump the dataset behind `handle` to `filename`
// as debugging-oriented text. Returns 0 on success, -1 on failure
// (via the API_BEGIN/API_END error-trapping macros).
int LGBM_DatasetDumpText(DatasetHandle handle,
                         const char* filename) {
  API_BEGIN();
  auto* ds = reinterpret_cast<Dataset*>(handle);
  ds->DumpTextFile(filename);
  API_END();
}
int LGBM_DatasetSetField(DatasetHandle handle, int LGBM_DatasetSetField(DatasetHandle handle,
const char* field_name, const char* field_name,
const void* field_data, const void* field_data,
...@@ -849,6 +857,9 @@ int LGBM_DatasetGetField(DatasetHandle handle, ...@@ -849,6 +857,9 @@ int LGBM_DatasetGetField(DatasetHandle handle,
} else if (dataset->GetDoubleField(field_name, out_len, reinterpret_cast<const double**>(out_ptr))) { } else if (dataset->GetDoubleField(field_name, out_len, reinterpret_cast<const double**>(out_ptr))) {
*out_type = C_API_DTYPE_FLOAT64; *out_type = C_API_DTYPE_FLOAT64;
is_success = true; is_success = true;
} else if(dataset->GetInt8Field(field_name, out_len, reinterpret_cast<const int8_t**>(out_ptr))){
*out_type = C_API_DTYPE_INT8;
is_success = true;
} }
if (!is_success) { throw std::runtime_error("Field not found"); } if (!is_success) { throw std::runtime_error("Field not found"); }
if (*out_ptr == nullptr) { *out_len = 0; } if (*out_ptr == nullptr) { *out_len = 0; }
...@@ -878,6 +889,15 @@ int LGBM_DatasetGetNumFeature(DatasetHandle handle, ...@@ -878,6 +889,15 @@ int LGBM_DatasetGetNumFeature(DatasetHandle handle,
API_END(); API_END();
} }
// C API entry point: copy the features of `source` into `target`.
// Returns 0 on success, -1 on failure (via API_BEGIN/API_END).
int LGBM_DatasetAddFeaturesFrom(DatasetHandle target,
                                DatasetHandle source) {
  API_BEGIN();
  auto* dst = reinterpret_cast<Dataset*>(target);
  auto* src = reinterpret_cast<Dataset*>(source);
  dst->addFeaturesFrom(src);
  API_END();
}
// ---- start of booster // ---- start of booster
int LGBM_BoosterCreate(const DatasetHandle train_data, int LGBM_BoosterCreate(const DatasetHandle train_data,
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
#include <cstdio> #include <cstdio>
#include <unordered_map> #include <unordered_map>
#include <limits> #include <limits>
#include <memory>
#include <vector> #include <vector>
#include <utility> #include <utility>
#include <string> #include <string>
...@@ -577,7 +578,11 @@ bool Dataset::GetDoubleField(const char* field_name, data_size_t* out_len, const ...@@ -577,7 +578,11 @@ bool Dataset::GetDoubleField(const char* field_name, data_size_t* out_len, const
if (name == std::string("init_score")) { if (name == std::string("init_score")) {
*out_ptr = metadata_.init_score(); *out_ptr = metadata_.init_score();
*out_len = static_cast<data_size_t>(metadata_.num_init_score()); *out_len = static_cast<data_size_t>(metadata_.num_init_score());
} else { } else if (name == std::string("feature_penalty")){
*out_ptr = feature_penalty_.data();
*out_len = feature_penalty_.size();
}
else {
return false; return false;
} }
return true; return true;
...@@ -595,6 +600,18 @@ bool Dataset::GetIntField(const char* field_name, data_size_t* out_len, const in ...@@ -595,6 +600,18 @@ bool Dataset::GetIntField(const char* field_name, data_size_t* out_len, const in
return true; return true;
} }
bool Dataset::GetInt8Field(const char* field_name, data_size_t* out_len, const int8_t** out_ptr) {
std::string name(field_name);
name = Common::Trim(name);
if (name == std::string("monotone_constraints")) {
*out_ptr = monotone_types_.data();
*out_len = monotone_types_.size();
} else {
return false;
}
return true;
}
void Dataset::SaveBinaryFile(const char* bin_filename) { void Dataset::SaveBinaryFile(const char* bin_filename) {
if (bin_filename != nullptr if (bin_filename != nullptr
&& std::string(bin_filename) == data_filename_) { && std::string(bin_filename) == data_filename_) {
...@@ -690,6 +707,49 @@ void Dataset::SaveBinaryFile(const char* bin_filename) { ...@@ -690,6 +707,49 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
} }
} }
/*!
* \brief Dump the dataset to a human-readable text file, for debugging only.
*        The format cannot be loaded back by LightGBM.
* \param text_filename Path of the output file; overwritten if it exists.
*/
void Dataset::DumpTextFile(const char* text_filename) {
  FILE* file = fopen(text_filename, "wt");
  if (file == nullptr) {
    // fprintf on a null FILE* is undefined behavior; fail loudly instead
    // (the C API wrapper converts the exception into an error return).
    throw std::runtime_error(std::string("Cannot open file for writing: ") + text_filename);
  }
  fprintf(file, "num_features: %d\n", num_features_);
  fprintf(file, "num_total_features: %d\n", num_total_features_);
  fprintf(file, "num_groups: %d\n", num_groups_);
  fprintf(file, "num_data: %d\n", num_data_);
  fprintf(file, "feature_names: ");
  for (const auto& name : feature_names_) {  // const ref: avoid copying strings
    fprintf(file, "%s, ", name.c_str());
  }
  fprintf(file, "\nmonotone_constraints: ");
  for (auto constraint : monotone_types_) {
    fprintf(file, "%d, ", constraint);
  }
  fprintf(file, "\nfeature_penalty: ");
  for (auto penalty : feature_penalty_) {
    fprintf(file, "%lf, ", penalty);
  }
  fprintf(file, "\n");
  // Column header row: one entry per feature name.
  for (const auto& name : feature_names_) {
    fprintf(file, "%s, ", name.c_str());
  }
  // One iterator per used feature; unique_ptr frees them all on scope exit.
  std::vector<std::unique_ptr<BinIterator>> iterators;
  iterators.reserve(num_features_);
  for (int j = 0; j < num_features_; ++j) {
    auto group_idx = feature2group_[j];
    auto sub_idx = feature2subfeature_[j];
    iterators.emplace_back(feature_groups_[group_idx]->SubFeatureIterator(sub_idx));
  }
  // One row per data point; unused features print as NA.
  for (data_size_t i = 0; i < num_data_; ++i) {
    fprintf(file, "\n");
    for (int j = 0; j < num_total_features_; ++j) {
      auto inner_feature_idx = used_feature_map_[j];
      if (inner_feature_idx < 0) {
        fprintf(file, "NA, ");
      } else {
        // RawGet returns uint32_t; cast keeps the format specifier well-defined.
        fprintf(file, "%d, ", static_cast<int>(iterators[inner_feature_idx]->RawGet(i)));
      }
    }
  }
  fclose(file);
}
void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used, void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices, data_size_t num_data, const data_size_t* data_indices, data_size_t num_data,
int leaf_idx, int leaf_idx,
...@@ -881,4 +941,71 @@ void Dataset::FixHistogram(int feature_idx, double sum_gradient, double sum_hess ...@@ -881,4 +941,71 @@ void Dataset::FixHistogram(int feature_idx, double sum_gradient, double sum_hess
} }
} }
/*! \brief Append all elements of src to the end of dest. */
template<typename T>
void PushVector(std::vector<T>& dest, const std::vector<T>& src) {
  // Range insert reserves and copies in one call.
  dest.insert(dest.end(), src.begin(), src.end());
}
/*! \brief Append each element of src, shifted by offset, to the end of dest. */
template<typename T>
void PushOffset(std::vector<T>& dest, const std::vector<T>& src, const T& offset) {
  dest.reserve(dest.size() + src.size());
  std::transform(src.begin(), src.end(), std::back_inserter(dest),
                 [&offset](const T& value) { return value + offset; });
}
/*!
* \brief Append src to dest, treating an empty vector on either side as a
*        run of default values.
* \param dest Destination vector; dest_len is its logical length when empty.
* \param src Source vector; src_len is its logical length when empty.
* \param deflt Default value used to materialize an empty side.
* If both are empty, dest stays empty (both sides are "all default").
*/
template<typename T>
void PushClearIfEmpty(std::vector<T>& dest, const size_t dest_len, const std::vector<T>& src, const size_t src_len, const T& deflt) {
  if (!dest.empty() && !src.empty()) {
    dest.insert(dest.end(), src.begin(), src.end());
  } else if (!dest.empty() && src.empty()) {
    dest.reserve(dest.size() + src_len);  // reserve was missing: avoids repeated growth
    for (size_t i = 0; i < src_len; ++i) {
      dest.push_back(deflt);
    }
  } else if (dest.empty() && !src.empty()) {
    dest.reserve(dest_len + src.size());
    for (size_t i = 0; i < dest_len; ++i) {
      dest.push_back(deflt);
    }
    dest.insert(dest.end(), src.begin(), src.end());
  }
}
/*!
* \brief Append deep copies of all features of other to this dataset.
*        `other` is left intact. Both datasets must have the same number of rows.
* \param other Dataset to take features from.
*/
void Dataset::addFeaturesFrom(Dataset* other) {
  if (other->num_data_ != num_data_) {
    throw std::runtime_error("Cannot add features from other Dataset with a different number of rows");
  }
  PushVector(feature_names_, other->feature_names_);
  PushVector(feature2subfeature_, other->feature2subfeature_);
  PushVector(group_feature_cnt_, other->group_feature_cnt_);
  // Reserve must account for the groups already present, not just the new ones.
  feature_groups_.reserve(feature_groups_.size() + other->feature_groups_.size());
  for (const auto& fg : other->feature_groups_) {
    feature_groups_.emplace_back(new FeatureGroup(*fg));  // deep copy, source stays usable
  }
  used_feature_map_.reserve(used_feature_map_.size() + other->used_feature_map_.size());
  for (auto feature_idx : other->used_feature_map_) {
    if (feature_idx >= 0) {
      used_feature_map_.push_back(feature_idx + num_features_);
    } else {
      used_feature_map_.push_back(-1);  // Unused feature.
    }
  }
  PushOffset(real_feature_idx_, other->real_feature_idx_, num_total_features_);
  PushOffset(feature2group_, other->feature2group_, num_groups_);
  // Skip the leading 0 when copying group_bin_boundaries.
  auto bin_offset = group_bin_boundaries_.back();
  for (auto it = other->group_bin_boundaries_.begin() + 1; it != other->group_bin_boundaries_.end(); ++it) {
    group_bin_boundaries_.push_back(*it + bin_offset);
  }
  PushOffset(group_feature_start_, other->group_feature_start_, num_features_);
  // Empty vectors mean "all default": 0 (no constraint), penalty 1.0.
  PushClearIfEmpty(monotone_types_, num_total_features_, other->monotone_types_, other->num_total_features_, (int8_t)0);
  PushClearIfEmpty(feature_penalty_, num_total_features_, other->feature_penalty_, other->num_total_features_, 1.0);
  num_features_ += other->num_features_;
  num_total_features_ += other->num_total_features_;
  num_groups_ += other->num_groups_;
}
} // namespace LightGBM } // namespace LightGBM
...@@ -311,11 +311,21 @@ class DenseBin: public Bin { ...@@ -311,11 +311,21 @@ class DenseBin: public Bin {
return sizeof(VAL_T) * num_data_; return sizeof(VAL_T) * num_data_;
} }
protected: DenseBin<VAL_T>* Clone() override;
private:
data_size_t num_data_; data_size_t num_data_;
std::vector<VAL_T> data_; std::vector<VAL_T> data_;
DenseBin<VAL_T>(const DenseBin<VAL_T>& other)
: num_data_(other.num_data_), data_(other.data_){}
}; };
/*! \brief Deep copy via the private copy constructor; covariant return type. */
template<typename VAL_T>
DenseBin<VAL_T>* DenseBin<VAL_T>::Clone(){
return new DenseBin<VAL_T>(*this);
}
template <typename VAL_T> template <typename VAL_T>
uint32_t DenseBinIterator<VAL_T>::Get(data_size_t idx) { uint32_t DenseBinIterator<VAL_T>::Get(data_size_t idx) {
auto ret = bin_data_->data_[idx]; auto ret = bin_data_->data_[idx];
......
...@@ -363,7 +363,14 @@ class Dense4bitsBin : public Bin { ...@@ -363,7 +363,14 @@ class Dense4bitsBin : public Bin {
return sizeof(uint8_t) * data_.size(); return sizeof(uint8_t) * data_.size();
} }
Dense4bitsBin* Clone() override {
return new Dense4bitsBin(*this);
}
protected: protected:
Dense4bitsBin(const Dense4bitsBin& other)
: num_data_(other.num_data_), data_(other.data_), buf_(other.buf_){}
data_size_t num_data_; data_size_t num_data_;
std::vector<uint8_t> data_; std::vector<uint8_t> data_;
std::vector<uint8_t> buf_; std::vector<uint8_t> buf_;
......
...@@ -407,7 +407,14 @@ class SparseBin: public Bin { ...@@ -407,7 +407,14 @@ class SparseBin: public Bin {
GetFastIndex(); GetFastIndex();
} }
SparseBin<VAL_T>* Clone() override;
protected: protected:
SparseBin<VAL_T>(const SparseBin<VAL_T>& other)
: num_data_(other.num_data_), deltas_(other.deltas_), vals_(other.vals_),
num_vals_(other.num_vals_), push_buffers_(other.push_buffers_),
fast_index_(other.fast_index_), fast_index_shift_(other.fast_index_shift_){}
data_size_t num_data_; data_size_t num_data_;
std::vector<uint8_t> deltas_; std::vector<uint8_t> deltas_;
std::vector<VAL_T> vals_; std::vector<VAL_T> vals_;
...@@ -417,6 +424,11 @@ class SparseBin: public Bin { ...@@ -417,6 +424,11 @@ class SparseBin: public Bin {
data_size_t fast_index_shift_; data_size_t fast_index_shift_;
}; };
/*! \brief Deep copy via the protected copy constructor; covariant return type. */
template<typename VAL_T>
SparseBin<VAL_T>* SparseBin<VAL_T>::Clone(){
return new SparseBin(*this);
}
template <typename VAL_T> template <typename VAL_T>
inline uint32_t SparseBinIterator<VAL_T>::RawGet(data_size_t idx) { inline uint32_t SparseBinIterator<VAL_T>::RawGet(data_size_t idx) {
return InnerRawGet(idx); return InnerRawGet(idx);
......
...@@ -92,3 +92,143 @@ class TestBasic(unittest.TestCase): ...@@ -92,3 +92,143 @@ class TestBasic(unittest.TestCase):
self.assertEqual(len(subset_group), 2) self.assertEqual(len(subset_group), 2)
self.assertEqual(subset_group[0], 1) self.assertEqual(subset_group[0], 1)
self.assertEqual(subset_group[1], 9) self.assertEqual(subset_group[1], 9)
def test_add_features_throws_if_num_data_unequal(self):
# Datasets with different row counts cannot be merged; the C++ side
# raises, which surfaces in Python as a LightGBMError.
X1 = np.random.random((1000, 1))
X2 = np.random.random((100, 1))
d1 = lgb.Dataset(X1).construct()
d2 = lgb.Dataset(X2).construct()
with self.assertRaises(lgb.basic.LightGBMError):
d1.add_features_from(d2)
def test_add_features_throws_if_datasets_unconstructed(self):
# add_features_from requires both handles to exist; each of the three
# unconstructed combinations (neither / only target / only source)
# must raise ValueError before touching the C API.
X1 = np.random.random((1000, 1))
X2 = np.random.random((100, 1))
d1 = lgb.Dataset(X1)
d2 = lgb.Dataset(X2)
with self.assertRaises(ValueError):
d1 = lgb.Dataset(X1)
d2 = lgb.Dataset(X2)
d1.add_features_from(d2)
with self.assertRaises(ValueError):
d1 = lgb.Dataset(X1).construct()
d2 = lgb.Dataset(X2)
d1.add_features_from(d2)
with self.assertRaises(ValueError):
d1 = lgb.Dataset(X1)
d2 = lgb.Dataset(X2).construct()
d1.add_features_from(d2)
def test_add_features_equal_data_on_alternating_used_unused(self):
# Split the columns at every possible point, merge the two halves with
# add_features_from, and check the merged dataset's text dump is
# byte-identical to a dataset built from the full matrix.
X = np.random.random((1000, 5))
X[:, [1, 3]] = 0
# columns 1 and 3 are constant, so they become "unused" features
names = ['col_%d' % (i,) for i in range(5)]
for j in range(1, 5):
d1 = lgb.Dataset(X[:, :j], feature_name=names[:j]).construct()
d2 = lgb.Dataset(X[:, j:], feature_name=names[j:]).construct()
d1.add_features_from(d2)
with tempfile.NamedTemporaryFile() as f:
d1name = f.name
d1.dump_text(d1name)
d = lgb.Dataset(X, feature_name=names).construct()
with tempfile.NamedTemporaryFile() as f:
dname = f.name
d.dump_text(dname)
with open(d1name, 'rt') as d1f:
d1txt = d1f.read()
with open(dname, 'rt') as df:
dtxt = df.read()
os.remove(dname)
os.remove(d1name)
self.assertEqual(dtxt, d1txt)
def test_add_features_same_booster_behaviour(self):
# Training on a merged dataset must produce exactly the same model as
# training on the equivalent single dataset (compared via saved model text).
X = np.random.random((1000, 5))
X[:, [1, 3]] = 0
names = ['col_%d' % (i,) for i in range(5)]
for j in range(1, 5):
d1 = lgb.Dataset(X[:, :j], feature_name=names[:j]).construct()
d2 = lgb.Dataset(X[:, j:], feature_name=names[j:]).construct()
d1.add_features_from(d2)
d = lgb.Dataset(X, feature_name=names).construct()
y = np.random.random(1000)
d1.set_label(y)
d.set_label(y)
b1 = lgb.Booster(train_set=d1)
b = lgb.Booster(train_set=d)
for k in range(10):
b.update()
b1.update()
with tempfile.NamedTemporaryFile() as df:
dname = df.name
with tempfile.NamedTemporaryFile() as d1f:
d1name = d1f.name
b1.save_model(d1name)
b.save_model(dname)
with open(dname, 'rt') as df:
dtxt = df.read()
with open(d1name, 'rt') as d1f:
d1txt = d1f.read()
self.assertEqual(dtxt, d1txt)
def test_get_feature_penalty(self):
# feature_penalty set via params is readable back; absent penalty is None.
X = np.random.random((1000, 1))
d = lgb.Dataset(X, params={'feature_penalty': [0.5]}).construct()
self.assertEqual(np.asarray([0.5]), d.get_feature_penalty())
d = lgb.Dataset(X).construct()
self.assertEqual(None, d.get_feature_penalty())
def test_get_monotone_constraints(self):
# monotone_constraints set via params is readable back; absent is None.
X = np.random.random((1000, 1))
d = lgb.Dataset(X, params={'monotone_constraints': [1]}).construct()
self.assertEqual(np.asarray([1]), d.get_monotone_constraints())
d = lgb.Dataset(X).construct()
self.assertEqual(None, d.get_monotone_constraints())
def test_add_features_feature_penalty(self):
# Merging must combine penalties; a dataset without penalties contributes
# the default penalty 1 for each of its features.
X = np.random.random((1000, 2))
test_cases = [
(None, None, None),
([0.5], None, [0.5, 1]),
(None, [0.5], [1, 0.5]),
([0.5], [0.5], [0.5, 0.5])]
for (p1, p2, expected) in test_cases:
if p1 is not None:
params1 = {'feature_penalty': p1}
else:
params1 = {}
d1 = lgb.Dataset(X[:, 0].reshape((-1, 1)), params=params1).construct()
if p2 is not None:
params2 = {'feature_penalty': p2}
else:
params2 = {}
d2 = lgb.Dataset(X[:, 1].reshape((-1, 1)), params=params2).construct()
d1.add_features_from(d2)
actual = d1.get_feature_penalty()
if isinstance(actual, np.ndarray):
actual = list(actual)
self.assertEqual(expected, actual)
def test_add_features_monotone_types(self):
# Merging must combine monotone constraints; a dataset without them
# contributes the default 0 (unconstrained) for each of its features.
X = np.random.random((1000, 2))
test_cases = [
(None, None, None),
([1], None, [1, 0]),
(None, [1], [0, 1]),
([1], [-1], [1, -1])]
for (p1, p2, expected) in test_cases:
if p1 is not None:
params1 = {'monotone_constraints': p1}
else:
params1 = {}
d1 = lgb.Dataset(X[:, 0].reshape((-1, 1)), params=params1).construct()
if p2 is not None:
params2 = {'monotone_constraints': p2}
else:
params2 = {}
d2 = lgb.Dataset(X[:, 1].reshape((-1, 1)), params=params2).construct()
d1.add_features_from(d2)
actual = d1.get_monotone_constraints()
if isinstance(actual, np.ndarray):
actual = list(actual)
self.assertEqual(expected, actual)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment