Commit 83a14174 authored by Guolin Ke's avatar Guolin Ke
Browse files

some bugs fixed

parent 0612dcc0
......@@ -149,7 +149,7 @@ DllExport int LGBM_DatasetCreateFromMat(const void* data,
DllExport int LGBM_DatasetGetSubset(
const DatesetHandle* full_data,
const int32_t* used_row_indices,
const int32_t num_used_row_indices,
int32_t num_used_row_indices,
const char* parameters,
DatesetHandle* out);
......
......@@ -47,6 +47,13 @@ public:
*/
void Init(const char* data_filename, const int num_class);
/*!
* \brief Initialize this metadata as a row subset of an existing metadata object
* \param metadata Metadata of the full dataset to copy the subset from (not a filename)
* \param used_indices Row indices (into the full data) to keep
* \param num_used_indices Number of entries in used_indices
*/
void Init(const Metadata& metadata, const data_size_t* used_indices, data_size_t num_used_indices);
/*!
* \brief Initial with binary memory
* \param memory Pointer to memory
*/
......@@ -77,7 +84,6 @@ public:
void CheckOrPartition(data_size_t num_all_data,
const std::vector<data_size_t>& used_data_indices);
void SetLabel(const float* label, data_size_t len);
void SetWeights(const float* weights, data_size_t len);
......
......@@ -410,6 +410,10 @@ class Dataset(object):
params: dict, optional
other parameters
"""
self.__label = None
self.__weight = None
self.__init_score = None
self.__group = None
if data is None:
self.handle = None
return
......@@ -453,10 +457,6 @@ class Dataset(object):
self.__init_from_csr(csr, params_str, ref_dataset)
except:
raise TypeError('can not initialize Dataset from {}'.format(type(data).__name__))
self.__label = None
self.__weight = None
self.__init_score = None
self.__group = None
if label is not None:
self.set_label(label)
if self.get_label() is None:
......@@ -505,6 +505,22 @@ class Dataset(object):
return Dataset(data, label=label, max_bin=self.max_bin, reference=self,
weight=weight, group_id=group_id, predictor=self.predictor,
silent=silent, params=params)
def subset(self, used_indices, params=None):
    """
    Get a new Dataset containing only the specified rows of this Dataset.

    Parameters
    ----------
    used_indices : list of int
        Row indices (into this dataset) to keep in the subset.
    params : dict, optional
        Other parameters for the subset dataset.

    Returns
    -------
    subset : Dataset
        The subset dataset; raises ValueError if it has no label.
    """
    used_indices = list_to_1d_numpy(used_indices, np.int32)
    ret = Dataset(None)
    ret.handle = ctypes.c_void_p()
    params_str = dict_to_str(params)
    _safe_call(_LIB.LGBM_DatasetGetSubset(
        ctypes.byref(self.handle),
        # BUG FIX: numpy exposes data_as via the ndarray.ctypes attribute;
        # the ndarray itself has no data_as method.
        used_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
        ctypes.c_int32(used_indices.shape[0]),
        c_str(params_str),
        ctypes.byref(ret.handle)))
    # carry over construction settings from the parent dataset
    ret.max_bin = self.max_bin
    ret.predictor = self.predictor
    if ret.get_label() is None:
        raise ValueError("label should not be None")
    return ret
def __init_from_np2d(self, mat, params_str, ref_dataset):
"""
......@@ -1102,7 +1118,7 @@ class Booster(object):
def __inner_eval(self, data_name, data_idx, feval=None):
"""
Evaulate traning or validation data
Evaluate training or validation data
"""
if data_idx >= self.__num_dataset:
raise ValueError("data_idx should be smaller than number of dataset")
......
......@@ -387,7 +387,7 @@ DllExport int LGBM_DatasetCreateFromCSC(const void* col_ptr,
DllExport int LGBM_DatasetGetSubset(
const DatesetHandle* full_data,
const int32_t* used_row_indices,
const int32_t num_used_row_indices,
int32_t num_used_row_indices,
const char* parameters,
DatesetHandle* out) {
API_BEGIN();
......
......@@ -55,6 +55,7 @@ void Dataset::CopyFeatureMapperFrom(const Dataset* dataset, bool is_enable_spars
num_features_ = static_cast<int>(features_.size());
num_total_features_ = dataset->num_total_features_;
feature_names_ = dataset->feature_names_;
label_idx_ = dataset->label_idx_;
}
Dataset* Dataset::Subset(const data_size_t* used_indices, data_size_t num_used_indices, bool is_enable_sparse) const {
......@@ -67,6 +68,8 @@ Dataset* Dataset::Subset(const data_size_t* used_indices, data_size_t num_used_i
ret->features_[fidx]->PushBin(0, i, iterator->Get(used_indices[i]));
}
}
ret->metadata_.Init(metadata_, used_indices, num_used_indices);
return ret.release();
}
bool Dataset::SetFloatField(const char* field_name, const float* field_data, data_size_t num_element) {
......
......@@ -50,6 +50,69 @@ void Metadata::Init(data_size_t num_data, int num_class, int weight_idx, int que
}
}
void Metadata::Init(const Metadata& fullset, const data_size_t* used_indices, data_size_t num_used_indices) {
num_data_ = num_used_indices;
num_class_ = fullset.num_class_;
label_ = std::vector<float>(num_used_indices);
for (data_size_t i = 0; i < num_used_indices; i++) {
label_[i] = fullset.label_[used_indices[i]];
}
if (fullset.weights_.size() > 0) {
weights_ = std::vector<float>(num_used_indices);
num_weights_ = num_used_indices;
for (data_size_t i = 0; i < num_used_indices; i++) {
weights_[i] = fullset.weights_[used_indices[i]];
}
} else {
num_weights_ = 0;
}
if (fullset.init_score_.size() > 0) {
init_score_ = std::vector<float>(num_used_indices);
num_init_score_ = num_used_indices;
for (data_size_t i = 0; i < num_used_indices; i++) {
init_score_[i] = fullset.init_score_[used_indices[i]];
}
} else {
num_init_score_ = 0;
}
if (fullset.query_boundaries_.size() > 0) {
std::vector<data_size_t> used_query;
data_size_t data_idx = 0;
for (data_size_t qid = 0; qid < num_queries_ && data_idx < num_used_indices; ++qid) {
data_size_t start = fullset.query_boundaries_[qid];
data_size_t end = fullset.query_boundaries_[qid + 1];
data_size_t len = end - start;
if (used_indices[data_idx] > start) {
continue;
} else if (used_indices[data_idx] == start) {
if (num_used_indices >= data_idx + len && used_indices[data_idx + len - 1] == end - 1) {
used_query.push_back(qid);
data_idx += len;
} else {
Log::Fatal("Data partition error, data didn't match queries");
}
} else {
Log::Fatal("Data partition error, data didn't match queries");
}
}
query_boundaries_ = std::vector<data_size_t>(used_query.size() + 1);
num_queries_ = static_cast<data_size_t>(used_query.size());
query_boundaries_[0] = 0;
for (data_size_t i = 0; i < num_queries_; ++i) {
data_size_t qid = used_query[i];
data_size_t len = fullset.query_boundaries_[qid + 1] - fullset.query_boundaries_[qid];
query_boundaries_[i + 1] = query_boundaries_[i] + len;
}
} else {
num_queries_ = 0;
}
}
void Metadata::PartitionLabel(const std::vector<data_size_t>& used_indices) {
if (used_indices.size() <= 0) {
return;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment