"python-package/lightgbm/callback.py" did not exist on "19e085c9925ac039cf2ce17649013b1ef69385ee"
Unverified Commit 7360cff9 authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

fix init_model with subset (#2252)

* fix init_model with subset

* Update basic.py

* added test

* fix predictor naming issue

* Update basic.py

* fix bug

* fix pylint

* fix comments

* Update basic.py

* Update basic.py

* updated test

* fixed bug

* fixed lint

* fix warning

* add get_data before initial prediction

* refine the warning in get_data

* refine warning

* Update basic.py
parent ebc831bc
......@@ -725,8 +725,40 @@ class Dataset(object):
if self.handle is not None:
_safe_call(_LIB.LGBM_DatasetFree(self.handle))
self.handle = None
self.need_slice = True
if self.used_indices is not None:
self.data = None
return self
def _set_init_score_by_predictor(self, predictor, data, used_indices=None):
    """Set init_score of this Dataset to the raw predictions of ``predictor``.

    Parameters
    ----------
    predictor : _InnerPredictor
        Predictor (built from ``init_model``) whose raw scores are used
        as the initial score of this Dataset.
    data : string, numpy array, pandas DataFrame, scipy.sparse or None
        Raw data the predictions are computed on. When it is a filename
        (string), predictions are computed for the whole file and then
        sliced by ``used_indices``.
    used_indices : numpy array, list of int or None, optional (default=None)
        Row indices of ``data`` that belong to this (subset) Dataset.
    """
    data_has_header = False
    if isinstance(data, string_type):
        # check data has header or not
        if self.params.get("has_header", False) or self.params.get("header", False):
            data_has_header = True
    init_score = predictor.predict(data,
                                   raw_score=True,
                                   data_has_header=data_has_header,
                                   is_reshape=False)
    num_data = self.num_data()
    if used_indices is not None:
        # raw data must already be sliced to this subset
        assert not self.need_slice
        if isinstance(data, string_type):
            # predictions cover the whole file; keep only the subset rows
            sub_init_score = np.zeros(num_data * predictor.num_class, dtype=np.float32)
            assert num_data == len(used_indices)
            for i in range_(len(used_indices)):
                for j in range_(predictor.num_class):
                    # fixed: was ``redictor.num_class`` (typo) -> NameError on this path
                    sub_init_score[i * predictor.num_class + j] = init_score[used_indices[i] * predictor.num_class + j]
            init_score = sub_init_score
    if predictor.num_class > 1:
        # regroup init_score from row-major (data, class) to
        # column-major (class, data) layout expected by LightGBM
        new_init_score = np.zeros(init_score.size, dtype=np.float32)
        for i in range_(num_data):
            for j in range_(predictor.num_class):
                new_init_score[j * num_data + i] = init_score[i * predictor.num_class + j]
        init_score = new_init_score
    self.set_init_score(init_score)
def _lazy_init(self, data, label=None, reference=None,
weight=None, group=None, init_score=None, predictor=None,
silent=False, feature_name='auto',
......@@ -742,7 +774,7 @@ class Dataset(object):
categorical_feature,
self.pandas_categorical)
label = _label_from_pandas(label)
self.data_has_header = False
# process for args
params = {} if params is None else params
args_names = (getattr(self.__class__, '_lazy_init')
......@@ -753,7 +785,6 @@ class Dataset(object):
warnings.warn('{0} keyword has been found in `params` and will be ignored.\n'
'Please use {0} argument of the Dataset constructor to pass this parameter.'
.format(key))
self.predictor = predictor
# user can set verbose with params, it has higher priority
if not any(verbose_alias in params for verbose_alias in ('verbose', 'verbosity')) and silent:
params["verbose"] = -1
......@@ -787,10 +818,6 @@ class Dataset(object):
raise TypeError('Reference dataset should be None or dataset instance')
# start construct data
if isinstance(data, string_type):
# check data has header or not
if str(params.get("has_header", "")).lower() == "true" \
or str(params.get("header", "")).lower() == "true":
self.data_has_header = True
self.handle = ctypes.c_void_p()
_safe_call(_LIB.LGBM_DatasetCreateFromFile(
c_str(data),
......@@ -824,24 +851,12 @@ class Dataset(object):
# load init score
if init_score is not None:
self.set_init_score(init_score)
if self.predictor is not None:
if predictor is not None:
warnings.warn("The prediction of init_model will be overridden by init_score.")
elif isinstance(self.predictor, _InnerPredictor):
init_score = self.predictor.predict(data,
raw_score=True,
data_has_header=self.data_has_header,
is_reshape=False)
if self.predictor.num_class > 1:
# need re group init score
new_init_score = np.zeros(init_score.size, dtype=np.float32)
num_data = self.num_data()
for i in range_(num_data):
for j in range_(self.predictor.num_class):
new_init_score[j * num_data + i] = init_score[i * self.predictor.num_class + j]
init_score = new_init_score
self.set_init_score(init_score)
elif self.predictor is not None:
raise TypeError('wrong predictor type {}'.format(type(self.predictor).__name__))
elif isinstance(predictor, _InnerPredictor):
self._set_init_score_by_predictor(predictor, data)
elif predictor is not None:
raise TypeError('Wrong predictor type {}'.format(type(predictor).__name__))
# set feature names
return self.set_feature_name(feature_name)
......@@ -1000,12 +1015,15 @@ class Dataset(object):
ctypes.c_int(used_indices.shape[0]),
c_str(params_str),
ctypes.byref(self.handle)))
self.data = self.reference.data
self.get_data()
if not self.free_raw_data:
self.get_data()
if self.group is not None:
self.set_group(self.group)
if self.get_label() is None:
raise ValueError("Label should not be None.")
if isinstance(self._predictor, _InnerPredictor) and self._predictor is not self.reference._predictor:
self.get_data()
self._set_init_score_by_predictor(self._predictor, self.data, used_indices)
else:
# create train
self._lazy_init(self.data, label=self.label,
......@@ -1237,7 +1255,7 @@ class Dataset(object):
"""
if predictor is self._predictor:
return self
if self.data is not None:
if self.data is not None or (self.used_indices is not None and self.reference is not None and self.reference.data is not None):
self._predictor = predictor
return self._free_handle()
else:
......@@ -1444,17 +1462,22 @@ class Dataset(object):
"""
if self.handle is None:
raise Exception("Cannot get data before construct Dataset")
if self.data is not None and self.used_indices is not None and self.need_slice:
if isinstance(self.data, np.ndarray) or scipy.sparse.issparse(self.data):
self.data = self.data[self.used_indices, :]
elif isinstance(self.data, DataFrame):
self.data = self.data.iloc[self.used_indices].copy()
elif isinstance(self.data, DataTable):
self.data = self.data[self.used_indices, :]
else:
warnings.warn("Cannot subset {} type of raw data.\n"
"Returning original raw data".format(type(self.data).__name__))
if self.need_slice and self.used_indices is not None and self.reference is not None:
self.data = self.reference.data
if self.data is not None:
if isinstance(self.data, np.ndarray) or scipy.sparse.issparse(self.data):
self.data = self.data[self.used_indices, :]
elif isinstance(self.data, DataFrame):
self.data = self.data.iloc[self.used_indices].copy()
elif isinstance(self.data, DataTable):
self.data = self.data[self.used_indices, :]
else:
warnings.warn("Cannot subset {} type of raw data.\n"
"Returning original raw data".format(type(self.data).__name__))
self.need_slice = False
if self.data is None:
raise LightGBMError("Cannot call `get_data` after freed raw data, "
"set free_raw_data=False when construct Dataset to avoid this.")
return self.data
def get_group(self):
......
......@@ -798,6 +798,46 @@ class TestEngine(unittest.TestCase):
sliced_pred = train_and_get_predictions(sliced_csr, sliced_labels)
np.testing.assert_allclose(origin_pred, sliced_pred)
def test_init_with_subset(self):
    """Continue training (``init_model``) on Dataset subsets.

    Checks that ``get_data()`` returns each Dataset's own view of the raw
    data, and that the file-backed Dataset path raises as expected.
    """
    full_data = np.random.random((500, 2))
    labels = [1] * 250 + [0] * 250
    full_train_set = lgb.Dataset(full_data, labels, free_raw_data=False)
    first_indices = np.random.choice(np.arange(500), 300, replace=False)
    first_subset = full_train_set.subset(first_indices)
    second_indices = np.random.choice(np.arange(500), 200, replace=False)
    second_subset = full_train_set.subset(second_indices)
    params = {'objective': 'binary', 'verbose': -1}
    # booster trained on the first subset serves as init_model for the second
    init_gbm = lgb.train(params=params, train_set=first_subset,
                         num_boost_round=10, keep_training_booster=True)
    gbm = lgb.train(params=params, train_set=second_subset,
                    num_boost_round=10, init_model=init_gbm)
    # each Dataset reports its own (sliced) number of rows
    self.assertEqual(full_train_set.get_data().shape[0], 500)
    self.assertEqual(first_subset.get_data().shape[0], 300)
    self.assertEqual(second_subset.get_data().shape[0], 200)
    # same flow, but the parent Dataset is a saved binary file:
    # continuing training there is unsupported and must raise
    full_train_set.save_binary("lgb_train_data.bin")
    train_set_from_file = lgb.Dataset('lgb_train_data.bin', free_raw_data=False)
    file_subset_1 = train_set_from_file.subset(first_indices)
    file_subset_2 = train_set_from_file.subset(second_indices)
    init_gbm_from_file = lgb.train(params=params, train_set=file_subset_1,
                                   num_boost_round=10, keep_training_booster=True)
    with np.testing.assert_raises_regex(lgb.basic.LightGBMError, "Unknown format of training data"):
        gbm = lgb.train(params=params, train_set=file_subset_2,
                        num_boost_round=10, init_model=init_gbm_from_file)
    # file-backed Datasets expose the filename as their raw data
    self.assertEqual(train_set_from_file.get_data(), "lgb_train_data.bin")
    self.assertEqual(file_subset_1.get_data(), "lgb_train_data.bin")
    self.assertEqual(file_subset_2.get_data(), "lgb_train_data.bin")
def test_monotone_constraint(self):
def is_increasing(y):
return (np.diff(y) >= 0.0).all()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment