Commit 49178def authored by wxchan, committed by Guolin Ke

add dart parameter to sklearn interface (#145)

* remove other parameters in sklearn fit

* add dart parameters to sklearn init
parent 616388e0
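
In effect, the dart-specific options become first-class constructor arguments of the scikit-learn wrappers, replacing the removed `other_params` dict. A minimal sketch of the new call pattern (synthetic data; the parameter values are illustrative, not recommendations):

```python
import numpy as np
import lightgbm as lgb

X, y = np.random.rand(200, 4), np.random.rand(200)

# dart options are now ordinary __init__ arguments
gbm = lgb.LGBMRegressor(boosting_type='dart', n_estimators=50,
                        drop_rate=0.15, skip_drop=0.5, max_drop=50)
gbm.fit(X, y)
```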
@@ -61,11 +61,6 @@ The methods of each Class is in alphabetical order.
     True if need to free raw data after construct inner dataset
-####construct()
-    Lazy init
 ####create_valid(data, label=None, weight=None, group=None, silent=False, params=None)
     Create validation data aligned with the current dataset.
@@ -628,6 +623,16 @@ The methods of each Class is in alphabetical order.
     Is unbalance for binary classification
 seed : int
     Random number seed.
+drop_rate : float
+    Only used when boosting_type='dart'. Probability of dropping trees.
+skip_drop : float
+    Only used when boosting_type='dart'. Probability of skipping the dropout procedure.
+max_drop : int
+    Only used when boosting_type='dart'. Max number of dropped trees in one iteration.
+uniform_drop : bool
+    Only used when boosting_type='dart'. If true, drop trees uniformly, else drop according to weights.
+xgboost_dart_mode : bool
+    Only used when boosting_type='dart'. Whether to use xgboost dart mode.
 Note
 ----
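
For reference, the five options documented above map one-to-one onto the parameter dict of the native training API; a hedged sketch, assuming the usual `lgb.Dataset` construction:

```python
import numpy as np
import lightgbm as lgb

X, y = np.random.rand(200, 4), np.random.rand(200)
params = {
    'boosting_type': 'dart',
    'drop_rate': 0.1,            # probability of dropping trees
    'skip_drop': 0.5,            # probability of skipping the dropout procedure
    'max_drop': 50,              # cap on trees dropped in one iteration
    'uniform_drop': False,       # drop uniformly rather than by tree weight
    'xgboost_dart_mode': False,  # use xgboost's dart normalization instead
}
booster = lgb.train(params, lgb.Dataset(X, y), num_boost_round=50)
```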
@@ -698,7 +703,7 @@ The methods of each Class is in alphabetical order.
     Array of normalized feature importances
-####fit(X, y, sample_weight=None, init_score=None, group=None, eval_set=None, eval_sample_weight=None, eval_init_score=None, eval_group=None, eval_metric=None, early_stopping_rounds=None, verbose=True, feature_name=None, categorical_feature=None, other_params=None)
+####fit(X, y, sample_weight=None, init_score=None, group=None, eval_set=None, eval_sample_weight=None, eval_init_score=None, eval_group=None, eval_metric=None, early_stopping_rounds=None, verbose=True, feature_name=None, categorical_feature=None)
     Fit the gradient boosting model.
@@ -735,8 +740,6 @@ The methods of each Class is in alphabetical order.
     Categorical features,
     type int represents index,
     type str represents feature names (need to specify feature_name as well)
-other_params: dict
-    Other parameters
 Note
 ----
@@ -762,12 +765,7 @@ The methods of each Class is in alphabetical order.
     whether a bigger eval result is better, e.g. AUC is bigger_better.
     for multi-class tasks, y_pred is grouped by class_id first, then by row_id;
     to get the i-th row of y_pred for the j-th class, access y_pred[j*num_data+i]
-####get_params(deep=False)
-    Get parameters.
 ####predict(data, raw_score=False, num_iteration=0)
@@ -809,14 +807,14 @@ The methods of each Class is in alphabetical order.
 ###LGBMRanker
-####fit(X, y, sample_weight=None, init_score=None, group=None, eval_set=None, eval_sample_weight=None, eval_init_score=None, eval_group=None, eval_metric='ndcg', eval_at=1, early_stopping_rounds=None, verbose=True, feature_name=None, categorical_feature=None, other_params=None)
+####fit(X, y, sample_weight=None, init_score=None, group=None, eval_set=None, eval_sample_weight=None, eval_init_score=None, eval_group=None, eval_metric='ndcg', eval_at=1, early_stopping_rounds=None, verbose=True, feature_name=None, categorical_feature=None)
     Most arguments are the same as in Common Methods, except:
 eval_at : int or list of int, default=1
     The evaluation positions of NDCG
-## Callbacks
+##Callbacks
 ###Before iteration
...
@@ -138,7 +138,9 @@ class LGBMModel(LGBMModelBase):
                  nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
                  subsample=1, subsample_freq=1, colsample_bytree=1,
                  reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
-                 is_unbalance=False, seed=0):
+                 is_unbalance=False, seed=0,
+                 drop_rate=0.1, skip_drop=0.5, max_drop=50,
+                 uniform_drop=False, xgboost_dart_mode=False):
         """
         Implementation of the Scikit-Learn API for LightGBM.
@@ -187,6 +189,16 @@ class LGBMModel(LGBMModelBase):
             Is unbalance for binary classification
         seed : int
             Random number seed.
+        drop_rate : float
+            Only used when boosting_type='dart'. Probability of dropping trees.
+        skip_drop : float
+            Only used when boosting_type='dart'. Probability of skipping the dropout procedure.
+        max_drop : int
+            Only used when boosting_type='dart'. Max number of dropped trees in one iteration.
+        uniform_drop : bool
+            Only used when boosting_type='dart'. If true, drop trees uniformly, else drop according to weights.
+        xgboost_dart_mode : bool
+            Only used when boosting_type='dart'. Whether to use xgboost dart mode.
         Note
         ----
@@ -233,6 +245,11 @@ class LGBMModel(LGBMModelBase):
         self.scale_pos_weight = scale_pos_weight
         self.is_unbalance = is_unbalance
         self.seed = seed
+        self.drop_rate = drop_rate
+        self.skip_drop = skip_drop
+        self.max_drop = max_drop
+        self.uniform_drop = uniform_drop
+        self.xgboost_dart_mode = xgboost_dart_mode
         self._Booster = None
         self.best_iteration = -1
         if callable(self.objective):
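
Storing every argument unmodified on `self` follows the scikit-learn estimator contract: `clone()` rebuilds an estimator purely from `get_params()`, so each dart option has to surface there. A hedged sketch:

```python
from sklearn.base import clone

import lightgbm as lgb

gbm = lgb.LGBMRegressor(boosting_type='dart', skip_drop=0.4)
gbm2 = clone(gbm)  # reconstructs the estimator from get_params()
assert gbm2.get_params()['skip_drop'] == 0.4
```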
@@ -253,23 +270,13 @@ class LGBMModel(LGBMModelBase):
             raise LightGBMError('Need to call fit beforehand')
         return self._Booster

-    def get_params(self, deep=False):
-        """
-        Get parameters
-        """
-        params = super(LGBMModel, self).get_params(deep=deep)
-        if self.nthread <= 0:
-            params.pop('nthread', None)
-        return params
-
     def fit(self, X, y,
             sample_weight=None, init_score=None, group=None,
             eval_set=None, eval_sample_weight=None,
             eval_init_score=None, eval_group=None,
             eval_metric=None,
             early_stopping_rounds=None, verbose=True,
-            feature_name=None, categorical_feature=None,
-            other_params=None):
+            feature_name=None, categorical_feature=None):
         """
         Fit the gradient boosting model
@@ -305,8 +312,6 @@ class LGBMModel(LGBMModelBase):
             Categorical features,
             type int represents index,
             type str represents feature names (need to specify feature_name as well)
-        other_params: dict
-            Other parameters
         Note
         ----
@@ -335,10 +340,11 @@ class LGBMModel(LGBMModelBase):
         """
         evals_result = {}
         params = self.get_params()
-        params['verbose'] = 0 if self.silent else 1
-        if other_params is not None:
-            params.update(other_params)
+        params['verbose'] = -1 if self.silent else 1
+        if hasattr(self, 'n_classes_') and self.n_classes_ > 2:
+            params['num_class'] = self.n_classes_
+        if hasattr(self, 'eval_at'):
+            params['ndcg_eval_at'] = self.eval_at

         if self.fobj:
             params["objective"] = "None"
@@ -408,9 +414,9 @@ class LGBMModel(LGBMModelBase):
         -------
         predicted_result : array_like, shape=[n_samples] or [n_samples, n_classes]
         """
-        return self.booster().predict(data,
-                                      raw_score=raw_score,
-                                      num_iteration=num_iteration)
+        return self._Booster.predict(data,
+                                     raw_score=raw_score,
+                                     num_iteration=num_iteration)

     def apply(self, X, num_iteration=0):
         """
@@ -428,9 +434,9 @@ class LGBMModel(LGBMModelBase):
         -------
         X_leaves : array_like, shape=[n_samples, n_trees]
         """
-        return self.booster().predict(X,
-                                      pred_leaf=True,
-                                      num_iteration=num_iteration)
+        return self._Booster.predict(X,
+                                     pred_leaf=True,
+                                     num_iteration=num_iteration)

     def evals_result(self):
         """
@@ -466,14 +472,16 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
             eval_init_score=None,
             eval_metric="l2",
             early_stopping_rounds=None, verbose=True,
-            feature_name=None, categorical_feature=None,
-            other_params=None):
+            feature_name=None, categorical_feature=None):

-        super(LGBMRegressor, self).fit(X, y, sample_weight, init_score, None,
-                                       eval_set, eval_sample_weight, eval_init_score, None,
-                                       eval_metric, early_stopping_rounds,
-                                       verbose, feature_name, categorical_feature,
-                                       other_params)
+        super(LGBMRegressor, self).fit(X, y, sample_weight=sample_weight,
+                                       init_score=init_score, eval_set=eval_set,
+                                       eval_sample_weight=eval_sample_weight,
+                                       eval_init_score=eval_init_score,
+                                       eval_metric=eval_metric,
+                                       early_stopping_rounds=early_stopping_rounds,
+                                       verbose=verbose, feature_name=feature_name,
+                                       categorical_feature=categorical_feature)
         return self

 class LGBMClassifier(LGBMModel, LGBMClassifierBase):
@@ -484,14 +492,20 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
                  nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
                  subsample=1, subsample_freq=1, colsample_bytree=1,
                  reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
-                 is_unbalance=False, seed=0):
-        super(LGBMClassifier, self).__init__(boosting_type, num_leaves, max_depth,
-                                             learning_rate, n_estimators, max_bin,
-                                             silent, objective, nthread,
-                                             min_split_gain, min_child_weight, min_child_samples,
-                                             subsample, subsample_freq, colsample_bytree,
-                                             reg_alpha, reg_lambda, scale_pos_weight,
-                                             is_unbalance, seed)
+                 is_unbalance=False, seed=0,
+                 drop_rate=0.1, skip_drop=0.5, max_drop=50,
+                 uniform_drop=False, xgboost_dart_mode=False):
+        super(LGBMClassifier, self).__init__(boosting_type=boosting_type, num_leaves=num_leaves,
+                                             max_depth=max_depth, learning_rate=learning_rate,
+                                             n_estimators=n_estimators, max_bin=max_bin,
+                                             silent=silent, objective=objective, nthread=nthread,
+                                             min_split_gain=min_split_gain, min_child_weight=min_child_weight,
+                                             min_child_samples=min_child_samples, subsample=subsample,
+                                             subsample_freq=subsample_freq, colsample_bytree=colsample_bytree,
+                                             reg_alpha=reg_alpha, reg_lambda=reg_lambda,
+                                             scale_pos_weight=scale_pos_weight, is_unbalance=is_unbalance, seed=seed,
+                                             drop_rate=drop_rate, skip_drop=skip_drop, max_drop=max_drop,
+                                             uniform_drop=uniform_drop, xgboost_dart_mode=xgboost_dart_mode)

     def fit(self, X, y,
             sample_weight=None, init_score=None,
@@ -499,37 +513,35 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
             eval_init_score=None,
             eval_metric="binary_logloss",
             early_stopping_rounds=None, verbose=True,
-            feature_name=None, categorical_feature=None,
-            other_params=None):
-        self.classes_ = np.unique(y)
-        self.n_classes_ = len(self.classes_)
-        if other_params is None:
-            other_params = {}
+            feature_name=None, categorical_feature=None):
+        self._le = LGBMLabelEncoder().fit(y)
+        y = self._le.transform(y)
+
+        self.n_classes_ = len(self._le.classes_)
         if self.n_classes_ > 2:
             # Switch to using a multiclass objective in the underlying LGBM instance
             self.objective = "multiclass"
-            other_params['num_class'] = self.n_classes_
             if eval_set is not None and eval_metric == "binary_logloss":
                 eval_metric = "multi_logloss"

-        self._le = LGBMLabelEncoder().fit(y)
-        training_labels = self._le.transform(y)
         if eval_set is not None:
-            eval_set = list((x[0], self._le.transform(x[1])) for x in eval_set)
+            eval_set = [(x[0], self._le.transform(x[1])) for x in eval_set]

-        super(LGBMClassifier, self).fit(X, training_labels, sample_weight, init_score, None,
-                                        eval_set, eval_sample_weight, eval_init_score, None,
-                                        eval_metric, early_stopping_rounds,
-                                        verbose, feature_name, categorical_feature,
-                                        other_params)
+        super(LGBMClassifier, self).fit(X, y, sample_weight=sample_weight,
+                                        init_score=init_score, eval_set=eval_set,
+                                        eval_sample_weight=eval_sample_weight,
+                                        eval_init_score=eval_init_score,
+                                        eval_metric=eval_metric,
+                                        early_stopping_rounds=early_stopping_rounds,
+                                        verbose=verbose, feature_name=feature_name,
+                                        categorical_feature=categorical_feature)
         return self

     def predict(self, data, raw_score=False, num_iteration=0):
-        class_probs = self.booster().predict(data,
-                                             raw_score=raw_score,
-                                             num_iteration=num_iteration)
+        class_probs = self._Booster.predict(data,
+                                            raw_score=raw_score,
+                                            num_iteration=num_iteration)
         if len(class_probs.shape) > 1:
             column_indexes = np.argmax(class_probs, axis=1)
         else:
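
Since the label encoder is now fitted at the top of `fit` and the raw labels are transformed up front, `predict` can map probability rows back to the caller's original labels; the elided tail of the method presumably inverse-transforms the argmax indices through `self._le`. A hedged sketch of that equivalence (treating `_le` as the private attribute the diff introduces):

```python
import numpy as np
import lightgbm as lgb
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
clf = lgb.LGBMClassifier(n_estimators=10).fit(X, y)

probs = clf.predict_proba(X)     # shape [n_samples, n_classes]
labels = clf._le.inverse_transform(np.argmax(probs, axis=1))
assert (labels == clf.predict(X)).all()
```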
@@ -553,9 +565,9 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
         -------
         predicted_probability : array_like, shape=[n_samples, n_classes]
         """
-        class_probs = self.booster().predict(data,
-                                             raw_score=raw_score,
-                                             num_iteration=num_iteration)
+        class_probs = self._Booster.predict(data,
+                                            raw_score=raw_score,
+                                            num_iteration=num_iteration)
         if self.n_classes_ > 2:
             return class_probs
         else:
@@ -571,14 +583,20 @@ class LGBMRanker(LGBMModel):
                  nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
                  subsample=1, subsample_freq=1, colsample_bytree=1,
                  reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
-                 is_unbalance=False, seed=0):
-        super(LGBMRanker, self).__init__(boosting_type, num_leaves, max_depth,
-                                         learning_rate, n_estimators, max_bin,
-                                         silent, objective, nthread,
-                                         min_split_gain, min_child_weight, min_child_samples,
-                                         subsample, subsample_freq, colsample_bytree,
-                                         reg_alpha, reg_lambda, scale_pos_weight,
-                                         is_unbalance, seed)
+                 is_unbalance=False, seed=0,
+                 drop_rate=0.1, skip_drop=0.5, max_drop=50,
+                 uniform_drop=False, xgboost_dart_mode=False):
+        super(LGBMRanker, self).__init__(boosting_type=boosting_type, num_leaves=num_leaves,
+                                         max_depth=max_depth, learning_rate=learning_rate,
+                                         n_estimators=n_estimators, max_bin=max_bin,
+                                         silent=silent, objective=objective, nthread=nthread,
+                                         min_split_gain=min_split_gain, min_child_weight=min_child_weight,
+                                         min_child_samples=min_child_samples, subsample=subsample,
+                                         subsample_freq=subsample_freq, colsample_bytree=colsample_bytree,
+                                         reg_alpha=reg_alpha, reg_lambda=reg_lambda,
+                                         scale_pos_weight=scale_pos_weight, is_unbalance=is_unbalance, seed=seed,
+                                         drop_rate=drop_rate, skip_drop=skip_drop, max_drop=max_drop,
+                                         uniform_drop=uniform_drop, xgboost_dart_mode=xgboost_dart_mode)

     def fit(self, X, y,
             sample_weight=None, init_score=None, group=None,
@@ -586,8 +604,7 @@ class LGBMRanker(LGBMModel):
             eval_init_score=None, eval_group=None,
             eval_metric='ndcg', eval_at=1,
             early_stopping_rounds=None, verbose=True,
-            feature_name=None, categorical_feature=None,
-            other_params=None):
+            feature_name=None, categorical_feature=None):
         """
         Most arguments are the same as for the common methods, except the following:
@@ -610,13 +627,13 @@ class LGBMRanker(LGBMModel):
                 raise ValueError("Should set group for all eval dataset for ranking task")
         if eval_at is not None:
-            other_params = {} if other_params is None else other_params
-            if isinstance(eval_at, int):
-                eval_at = [eval_at]
-            other_params['ndcg_eval_at'] = list(eval_at)
-        super(LGBMRanker, self).fit(X, y, sample_weight, init_score, group,
-                                    eval_set, eval_sample_weight, eval_init_score, eval_group,
-                                    eval_metric, early_stopping_rounds,
-                                    verbose, feature_name, categorical_feature,
-                                    other_params)
+            self.eval_at = eval_at
+        super(LGBMRanker, self).fit(X, y, sample_weight=sample_weight,
+                                    init_score=init_score, group=group,
+                                    eval_set=eval_set, eval_sample_weight=eval_sample_weight,
+                                    eval_init_score=eval_init_score, eval_group=eval_group,
+                                    eval_metric=eval_metric,
+                                    early_stopping_rounds=early_stopping_rounds,
+                                    verbose=verbose, feature_name=feature_name,
+                                    categorical_feature=categorical_feature)
         return self
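
Usage-wise, `eval_at` is simply stashed on the estimator and forwarded as `ndcg_eval_at` by the base class (the `hasattr` check in `LGBMModel.fit`). An illustrative ranking call with made-up query groups:

```python
import numpy as np
import lightgbm as lgb

# two queries of 50 documents each, with integer relevance labels
X = np.random.rand(100, 4)
y = np.random.randint(0, 3, size=100)

ranker = lgb.LGBMRanker(n_estimators=20)
ranker.fit(X, y, group=[50, 50],
           eval_set=[(X, y)], eval_group=[[50, 50]],
           eval_at=[1, 3], verbose=False)
```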
@@ -16,7 +16,8 @@ def test_template(X_y=load_boston(True), model=lgb.LGBMRegressor,
                                                         stratify=stratify,
                                                         random_state=42)
     if return_data: return X_train, X_test, y_train, y_test
-    gbm = model(n_estimators=num_round, objective=custom_obj) if custom_obj else model(n_estimators=num_round)
+    if not custom_obj: gbm = model(n_estimators=num_round, silent=True)
+    else: gbm = model(n_estimators=num_round, objective=custom_obj, silent=True)
     gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False)
     if return_model: return gbm
     else: return feval(y_test, gbm.predict_proba(X_test) if proba else gbm.predict(X_test))
@@ -64,12 +65,19 @@ class TestSklearn(unittest.TestCase):
         ret = test_template(X_y, lgb.LGBMClassifier, feval=binary_error, custom_obj=logregobj)
         self.assertLess(ret, 0.1)

+    def test_dart(self):
+        X_train, X_test, y_train, y_test = test_template(return_data=True)
+        gbm = lgb.LGBMRegressor(boosting_type='dart')
+        gbm.fit(X_train, y_train)
+        self.assertLessEqual(gbm.score(X_train, y_train), 1.)
+
     def test_grid_search(self):
         X_train, X_test, y_train, y_test = test_template(return_data=True)
-        params = {'n_estimators': [10, 15, 20]}
-        gbm = GridSearchCV(lgb.LGBMRegressor(), params, cv=5)
+        params = {'boosting_type': ['dart', 'gbdt'],
+                  'n_estimators': [15, 20], 'drop_rate': [0.1, 0.2]}
+        gbm = GridSearchCV(lgb.LGBMRegressor(), params, cv=3)
         gbm.fit(X_train, y_train)
-        self.assertIn(gbm.best_params_['n_estimators'], [10, 15, 20])
+        self.assertIn(gbm.best_params_['n_estimators'], [15, 20])

     def test_clone(self):
         gbm = test_template(return_model=True)
...