Commit b51c7be4 authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

refine Dataset class (#113)

Provide a high level Dataset class for easy use.
parent f3d33582
...@@ -14,7 +14,7 @@ before_install: ...@@ -14,7 +14,7 @@ before_install:
install: install:
- sudo apt-get install -y libopenmpi-dev openmpi-bin build-essential - sudo apt-get install -y libopenmpi-dev openmpi-bin build-essential
- conda install --yes atlas numpy scipy scikit-learn - conda install --yes atlas numpy scipy scikit-learn pandas
script: script:
...@@ -22,12 +22,12 @@ script: ...@@ -22,12 +22,12 @@ script:
- mkdir build && cd build && cmake .. && make -j - mkdir build && cd build && cmake .. && make -j
- cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py - cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
- cd $TRAVIS_BUILD_DIR/python-package && python setup.py install - cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
- cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_sklearn.py - cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py
- cd $TRAVIS_BUILD_DIR - cd $TRAVIS_BUILD_DIR
- rm -rf build && mkdir build && cd build && cmake -DUSE_MPI=ON ..&& make -j - rm -rf build && mkdir build && cd build && cmake -DUSE_MPI=ON ..&& make -j
- cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py - cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
- cd $TRAVIS_BUILD_DIR/python-package && python setup.py install - cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
- cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_sklearn.py - cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py
notifications: notifications:
email: false email: false
......
...@@ -76,7 +76,7 @@ add_executable(lightgbm src/main.cpp ${SOURCES}) ...@@ -76,7 +76,7 @@ add_executable(lightgbm src/main.cpp ${SOURCES})
add_library(_lightgbm SHARED src/c_api.cpp ${SOURCES}) add_library(_lightgbm SHARED src/c_api.cpp ${SOURCES})
if(MSVC) if(MSVC)
set_target_properties(_lightgbm PROPERTIES OUTPUT_NAME "lightgbm") set_target_properties(_lightgbm PROPERTIES OUTPUT_NAME "lib_lightgbm")
endif(MSVC) endif(MSVC)
if(USE_MPI) if(USE_MPI)
......
...@@ -17,13 +17,7 @@ X_test = df_test.drop(0, axis=1) ...@@ -17,13 +17,7 @@ X_test = df_test.drop(0, axis=1)
# create dataset for lightgbm # create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train) lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# ATTENTION: you should carefully use lightgbm.Dataset
# it requires setting up categorical_feature when you init it
# rather than passing from lightgbm.train
# instead, you can simply use a tuple of length=2 like below
# it will help you construct Datasets with parameters in lightgbm.train
lgb_train = (X_train, y_train)
lgb_eval = (X_test, y_test)
# specify your configurations as a dict # specify your configurations as a dict
params = { params = {
...@@ -43,9 +37,7 @@ params = { ...@@ -43,9 +37,7 @@ params = {
gbm = lgb.train(params, gbm = lgb.train(params,
lgb_train, lgb_train,
num_boost_round=100, num_boost_round=100,
valid_datas=lgb_eval, valid_sets=lgb_eval,
# you can use a list to represent multiple valid_datas/valid_names
# don't use tuple, tuple is used to represent one dataset
early_stopping_rounds=10) early_stopping_rounds=10)
# save model to file # save model to file
......
...@@ -230,6 +230,7 @@ struct OverallConfig: public ConfigBase { ...@@ -230,6 +230,7 @@ struct OverallConfig: public ConfigBase {
public: public:
TaskType task_type = TaskType::kTrain; TaskType task_type = TaskType::kTrain;
NetworkConfig network_config; NetworkConfig network_config;
int seed = 0;
int num_threads = 0; int num_threads = 0;
bool is_parallel = false; bool is_parallel = false;
bool is_parallel_find_bin = false; bool is_parallel_find_bin = false;
...@@ -317,6 +318,7 @@ struct ParameterAlias { ...@@ -317,6 +318,7 @@ struct ParameterAlias {
{ {
{ "config", "config_file" }, { "config", "config_file" },
{ "nthread", "num_threads" }, { "nthread", "num_threads" },
{ "random_seed", "seed" },
{ "num_thread", "num_threads" }, { "num_thread", "num_threads" },
{ "boosting", "boosting_type" }, { "boosting", "boosting_type" },
{ "boost", "boosting_type" }, { "boost", "boosting_type" },
......
...@@ -8,7 +8,7 @@ from __future__ import absolute_import ...@@ -8,7 +8,7 @@ from __future__ import absolute_import
import os import os
from .basic import Predictor, Dataset, Booster from .basic import Dataset, Booster
from .engine import train, cv from .engine import train, cv
try: try:
from .sklearn import LGBMModel, LGBMRegressor, LGBMClassifier, LGBMRanker from .sklearn import LGBMModel, LGBMRegressor, LGBMClassifier, LGBMRanker
......
This diff is collapsed.
...@@ -6,52 +6,12 @@ from __future__ import absolute_import ...@@ -6,52 +6,12 @@ from __future__ import absolute_import
import collections import collections
from operator import attrgetter from operator import attrgetter
import numpy as np import numpy as np
from .basic import LightGBMError, Predictor, Dataset, Booster, is_str from .basic import LightGBMError, _InnerPredictor, Dataset, Booster, is_str
from . import callback from . import callback
def _construct_dataset(X_y, reference=None, def train(params, train_set, num_boost_round=100,
params=None, other_fields=None, valid_sets=None, valid_names=None,
feature_name=None, categorical_feature=None,
predictor=None):
if 'max_bin' in params:
max_bin = int(params['max_bin'])
else:
max_bin = 255
weight = None
group = None
init_score = None
if other_fields is not None:
if not isinstance(other_fields, dict):
raise TypeError("type of other filed data should be dict")
weight = other_fields.get('weight', None)
group = other_fields.get('group', None)
init_score = other_fields.get('init_score', None)
if is_str(X_y):
data = X_y
label = None
else:
if len(X_y) != 2:
raise TypeError("should pass (data, label) tuple for dataset")
data = X_y[0]
label = X_y[1]
if reference is None:
ret = Dataset(data, label=label, max_bin=max_bin,
weight=weight, group=group,
predictor=predictor,
feature_name=feature_name,
categorical_feature=categorical_feature,
params=params)
else:
ret = reference.create_valid(data, label=label, weight=weight,
group=group, params=params)
if init_score is not None:
ret.set_init_score(init_score)
return ret
def train(params, train_data, num_boost_round=100,
valid_datas=None, valid_names=None,
fobj=None, feval=None, init_model=None, fobj=None, feval=None, init_model=None,
train_fields=None, valid_fields=None,
feature_name=None, categorical_feature=None, feature_name=None, categorical_feature=None,
early_stopping_rounds=None, evals_result=None, early_stopping_rounds=None, evals_result=None,
verbose_eval=True, learning_rates=None, callbacks=None): verbose_eval=True, learning_rates=None, callbacks=None):
...@@ -61,14 +21,14 @@ def train(params, train_data, num_boost_round=100, ...@@ -61,14 +21,14 @@ def train(params, train_data, num_boost_round=100,
---------- ----------
params : dict params : dict
Parameters for training. Parameters for training.
train_data : Dataset, tuple (X, y) or filename of data train_set : Dataset
Data to be trained. Data to be trained.
num_boost_round: int num_boost_round: int
Number of boosting iterations. Number of boosting iterations.
valid_datas: list of Datasets, tuples (valid_X, valid_y) or filenames of data valid_sets: list of Datasets
List of data to be evaluated during training List of data to be evaluated during training
valid_names: list of string valid_names: list of string
Names of valid_datas Names of valid_sets
fobj : function fobj : function
Customized objective function. Customized objective function.
feval : function feval : function
...@@ -76,13 +36,6 @@ def train(params, train_data, num_boost_round=100, ...@@ -76,13 +36,6 @@ def train(params, train_data, num_boost_round=100,
Note: should return (eval_name, eval_result, is_higher_better) of list of this Note: should return (eval_name, eval_result, is_higher_better) of list of this
init_model : file name of lightgbm model or 'Booster' instance init_model : file name of lightgbm model or 'Booster' instance
model used for continued train model used for continued train
train_fields : dict
Other data file in training data. e.g. train_fields['weight'] is weight data
Support fields: weight, group, init_score
valid_fields : dict
Other data file in training data. \
e.g. valid_fields[0]['weight'] is weight data for first valid data
Support fields: weight, group, init_score
feature_name : list of str feature_name : list of str
Feature names Feature names
categorical_feature : list of str or int categorical_feature : list of str or int
...@@ -95,8 +48,8 @@ def train(params, train_data, num_boost_round=100, ...@@ -95,8 +48,8 @@ def train(params, train_data, num_boost_round=100,
Returns the model with (best_iter + early_stopping_rounds) Returns the model with (best_iter + early_stopping_rounds)
If early stopping occurs, the model will add 'best_iteration' field If early stopping occurs, the model will add 'best_iteration' field
evals_result: dict or None evals_result: dict or None
This dictionary used to store all evaluation results of all the items in valid_datas. This dictionary used to store all evaluation results of all the items in valid_sets.
Example: with a valid_datas containing [valid_set, train_set] \ Example: with a valid_sets containing [valid_set, train_set] \
and valid_names containing ['eval', 'train'] and a parameter containing ('metric':'logloss') and valid_names containing ['eval', 'train'] and a parameter containing ('metric':'logloss')
Returns: {'train': {'logloss': ['0.48253', '0.35953', ...]}, Returns: {'train': {'logloss': ['0.48253', '0.35953', ...]},
'eval': {'logloss': ['0.480385', '0.357756', ...]}} 'eval': {'logloss': ['0.480385', '0.357756', ...]}}
...@@ -127,58 +80,40 @@ def train(params, train_data, num_boost_round=100, ...@@ -127,58 +80,40 @@ def train(params, train_data, num_boost_round=100,
""" """
"""create predictor first""" """create predictor first"""
if is_str(init_model): if is_str(init_model):
predictor = Predictor(model_file=init_model) predictor = _InnerPredictor(model_file=init_model)
elif isinstance(init_model, Booster): elif isinstance(init_model, Booster):
predictor = init_model.to_predictor() predictor = init_model._to_predictor()
elif isinstance(init_model, Predictor):
predictor = init_model
else: else:
predictor = None predictor = None
init_iteration = predictor.num_total_iteration if predictor else 0 init_iteration = predictor.num_total_iteration if predictor else 0
"""create dataset""" """check dataset"""
if isinstance(train_data, Dataset): if not isinstance(train_set, Dataset):
train_set = train_data raise TypeError("only can accept Dataset instance for traninig")
if train_fields is not None:
for field, data in train_fields.items(): train_set._set_predictor(predictor)
train_set.set_field(field, data) train_set.set_feature_name(feature_name)
else: train_set.set_categorical_feature(categorical_feature)
train_set = _construct_dataset(train_data, None, params,
other_fields=train_fields,
feature_name=feature_name,
categorical_feature=categorical_feature,
predictor=predictor)
is_valid_contain_train = False is_valid_contain_train = False
train_data_name = "training" train_data_name = "training"
valid_sets = [] reduced_valid_sets = []
name_valid_sets = [] name_valid_sets = []
if valid_datas: if valid_sets:
if isinstance(valid_datas, (Dataset, tuple)): if isinstance(valid_sets, Dataset):
valid_datas = [valid_datas] valid_sets = [valid_sets]
if isinstance(valid_names, str): if isinstance(valid_names, str):
valid_names = [valid_names] valid_names = [valid_names]
for i, valid_data in enumerate(valid_datas): for i, valid_data in enumerate(valid_sets):
other_fields = None if valid_fields is None else valid_fields.get(i, None)
"""reduce cost for prediction training data""" """reduce cost for prediction training data"""
if valid_data[0] is train_data[0] and valid_data[1] is train_data[1]: if valid_data is train_set:
is_valid_contain_train = True is_valid_contain_train = True
if valid_names is not None: if valid_names is not None:
train_data_name = valid_names[i] train_data_name = valid_names[i]
continue continue
if isinstance(valid_data, Dataset): if not isinstance(valid_data, Dataset):
valid_set = valid_data raise TypeError("only can accept Dataset instance for traninig")
if other_fields is not None: valid_data.set_reference(train_set)
for field, data in other_fields.items(): reduced_valid_sets.append(valid_data)
valid_set.set_field(field, data)
else:
valid_set = _construct_dataset(
valid_data,
train_set,
params,
other_fields=other_fields,
feature_name=feature_name,
categorical_feature=categorical_feature,
predictor=predictor)
valid_sets.append(valid_set)
if valid_names is not None and len(valid_names) > i: if valid_names is not None and len(valid_names) > i:
name_valid_sets.append(valid_names[i]) name_valid_sets.append(valid_names[i])
else: else:
...@@ -217,7 +152,7 @@ def train(params, train_data, num_boost_round=100, ...@@ -217,7 +152,7 @@ def train(params, train_data, num_boost_round=100,
booster = Booster(params=params, train_set=train_set) booster = Booster(params=params, train_set=train_set)
if is_valid_contain_train: if is_valid_contain_train:
booster.set_train_data_name(train_data_name) booster.set_train_data_name(train_data_name)
for valid_set, name_valid_set in zip(valid_sets, name_valid_sets): for valid_set, name_valid_set in zip(reduced_valid_sets, name_valid_sets):
booster.add_valid(valid_set, name_valid_set) booster.add_valid(valid_set, name_valid_set)
"""start training""" """start training"""
...@@ -294,6 +229,7 @@ def _make_n_folds(full_data, nfold, params, seed, fpreproc=None, stratified=Fals ...@@ -294,6 +229,7 @@ def _make_n_folds(full_data, nfold, params, seed, fpreproc=None, stratified=Fals
else: else:
raise LightGBMError('sklearn needs to be installed in order to use stratified cv') raise LightGBMError('sklearn needs to be installed in order to use stratified cv')
else: else:
full_data.construct()
randidx = np.random.permutation(full_data.num_data()) randidx = np.random.permutation(full_data.num_data())
kstep = int(len(randidx) / nfold) kstep = int(len(randidx) / nfold)
idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)] idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
...@@ -322,8 +258,8 @@ def _agg_cv_result(raw_results): ...@@ -322,8 +258,8 @@ def _agg_cv_result(raw_results):
cvmap[one_line[1]].append(one_line[2]) cvmap[one_line[1]].append(one_line[2])
return [('cv_agg', k, np.mean(v), metric_type[k], np.std(v)) for k, v in cvmap.items()] return [('cv_agg', k, np.mean(v), metric_type[k], np.std(v)) for k, v in cvmap.items()]
def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False, def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
metrics=(), fobj=None, feval=None, train_fields=None, metrics=(), fobj=None, feval=None, init_model=None,
feature_name=None, categorical_feature=None, feature_name=None, categorical_feature=None,
early_stopping_rounds=None, fpreproc=None, early_stopping_rounds=None, fpreproc=None,
verbose_eval=None, show_stdv=True, seed=0, verbose_eval=None, show_stdv=True, seed=0,
...@@ -334,7 +270,7 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False, ...@@ -334,7 +270,7 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
---------- ----------
params : dict params : dict
Booster params. Booster params.
train_data : tuple (X, y) or filename of data train_set : Dataset
Data to be trained. Data to be trained.
num_boost_round : int num_boost_round : int
Number of boosting iterations. Number of boosting iterations.
...@@ -350,9 +286,8 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False, ...@@ -350,9 +286,8 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
Custom objective function. Custom objective function.
feval : function feval : function
Custom evaluation function. Custom evaluation function.
train_fields : dict init_model : file name of lightgbm model or 'Booster' instance
Other data file in training data. e.g. train_fields['weight'] is weight data model used for continued train
Support fields: weight, group, init_score
feature_name : list of str feature_name : list of str
Feature names Feature names
categorical_feature : list of str or int categorical_feature : list of str or int
...@@ -382,6 +317,20 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False, ...@@ -382,6 +317,20 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
------- -------
evaluation history : list(string) evaluation history : list(string)
""" """
if not isinstance(train_set, Dataset):
raise TypeError("only can accept Dataset instance for traninig")
if is_str(init_model):
predictor = _InnerPredictor(model_file=init_model)
elif isinstance(init_model, Booster):
predictor = init_model._to_predictor()
else:
predictor = None
train_set._set_predictor(predictor)
train_set.set_feature_name(feature_name)
train_set.set_categorical_feature(categorical_feature)
if metrics: if metrics:
params.setdefault('metric', []) params.setdefault('metric', [])
if is_str(metrics): if is_str(metrics):
...@@ -389,11 +338,6 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False, ...@@ -389,11 +338,6 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
else: else:
params['metric'].extend(metrics) params['metric'].extend(metrics)
train_set = _construct_dataset(train_data, None, params,
other_fields=train_fields,
feature_name=feature_name,
categorical_feature=categorical_feature)
results = collections.defaultdict(list) results = collections.defaultdict(list)
cvfolds = _make_n_folds(train_set, nfold, params, seed, fpreproc, stratified) cvfolds = _make_n_folds(train_set, nfold, params, seed, fpreproc, stratified)
......
...@@ -19,6 +19,7 @@ def find_lib_path(): ...@@ -19,6 +19,7 @@ def find_lib_path():
if os.name == 'nt': if os.name == 'nt':
dll_path.append(os.path.join(curr_path, '../../windows/x64/Dll/')) dll_path.append(os.path.join(curr_path, '../../windows/x64/Dll/'))
dll_path.append(os.path.join(curr_path, './windows/x64/Dll/')) dll_path.append(os.path.join(curr_path, './windows/x64/Dll/'))
dll_path.append(os.path.join(curr_path, '../../Release/'))
dll_path = [os.path.join(p, 'lib_lightgbm.dll') for p in dll_path] dll_path = [os.path.join(p, 'lib_lightgbm.dll') for p in dll_path]
else: else:
dll_path = [os.path.join(p, 'lib_lightgbm.so') for p in dll_path] dll_path = [os.path.join(p, 'lib_lightgbm.so') for p in dll_path]
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
from __future__ import absolute_import from __future__ import absolute_import
import numpy as np import numpy as np
from .basic import LightGBMError, is_str from .basic import LightGBMError, Dataset, is_str
from .engine import train from .engine import train
# sklearn # sklearn
try: try:
...@@ -195,9 +195,12 @@ class LGBMModel(LGBMModelBase): ...@@ -195,9 +195,12 @@ class LGBMModel(LGBMModelBase):
params.pop('nthread', None) params.pop('nthread', None)
return params return params
def fit(self, X, y, eval_set=None, eval_metric=None, def fit(self, X, y,
sample_weight=None, init_score=None, group=None,
eval_set=None, eval_sample_weight=None,
eval_init_score=None, eval_group=None,
eval_metric=None,
early_stopping_rounds=None, verbose=True, early_stopping_rounds=None, verbose=True,
train_fields=None, valid_fields=None,
feature_name=None, categorical_feature=None, feature_name=None, categorical_feature=None,
other_params=None): other_params=None):
""" """
...@@ -209,24 +212,29 @@ class LGBMModel(LGBMModelBase): ...@@ -209,24 +212,29 @@ class LGBMModel(LGBMModelBase):
Feature matrix Feature matrix
y : array_like y : array_like
Labels Labels
sample_weight : array_like
weight of training data
init_score : array_like
init score of training data
group : array_like
group data of training data
eval_set : list, optional eval_set : list, optional
A list of (X, y) tuple pairs to use as a validation set for early-stopping A list of (X, y) tuple pairs to use as a validation set for early-stopping
eval_sample_weight : List of array
weight of eval data
eval_init_score : List of array
init score of eval data
eval_group : List of array
group data of eval data
eval_metric : str, list of str, callable, optional eval_metric : str, list of str, callable, optional
If a str, should be a built-in evaluation metric to use. If a str, should be a built-in evaluation metric to use.
If callable, a custom evaluation metric. The call \ If callable, a custom evaluation metric. The call \
signature is func(y_predicted, dataset) where dataset will be a \ signature is func(y_predicted, dataset) where dataset will be a \
Dataset object such that you may need to call the get_label Dataset object such that you may need to call the get_label
method. And it must return (eval_name->str, eval_result->float, is_bigger_better->Bool) method. And it must return (eval_name->str, eval_result->float, is_bigger_better->Bool)
early_stopping_rounds : int early_stopping_rounds : int
verbose : bool verbose : bool
If `verbose` and an evaluation set is used, writes the evaluation If `verbose` and an evaluation set is used, writes the evaluation
train_fields : dict
Other data file in training data. e.g. train_fields['weight'] is weight data
Support fields: weight, group, init_score
valid_fields : dict
Other data file in training data. \
e.g. valid_fields[0]['weight'] is weight data for first valid data
Support fields: weight, group, init_score
feature_name : list of str feature_name : list of str
Feature names Feature names
categorical_feature : list of str or int categorical_feature : list of str or int
...@@ -263,12 +271,33 @@ class LGBMModel(LGBMModelBase): ...@@ -263,12 +271,33 @@ class LGBMModel(LGBMModelBase):
feval = None feval = None
feval = eval_metric if callable(eval_metric) else None feval = eval_metric if callable(eval_metric) else None
self._Booster = train(params, (X, y), def _construct_dataset(X, y, sample_weight, init_score, group):
self.n_estimators, valid_datas=eval_set, ret = Dataset(X, label=y, weight=sample_weight, group=group)
ret.set_init_score(init_score)
return ret
train_set = _construct_dataset(X, y, sample_weight, init_score, group)
valid_sets = []
if eval_set is not None:
if isinstance(eval_set, tuple):
eval_set = [eval_set]
for i, valid_data in enumerate(eval_set):
"""reduce cost for prediction training data"""
if valid_data[0] is X and valid_data[1] is y:
valid_set = train_set
else:
valid_weight = None if eval_sample_weight is None else eval_sample_weight.get(i, None)
valid_init_score = None if eval_init_score is None else eval_init_score.get(i, None)
valid_group = None if eval_group is None else eval_group.get(i, None)
valid_set = _construct_dataset(valid_data[0], valid_data[1], valid_weight, valid_init_score, valid_group)
valid_sets.append(valid_set)
self._Booster = train(params, train_set,
self.n_estimators, valid_sets=valid_sets,
early_stopping_rounds=early_stopping_rounds, early_stopping_rounds=early_stopping_rounds,
evals_result=evals_result, fobj=self.fobj, feval=feval, evals_result=evals_result, fobj=self.fobj, feval=feval,
verbose_eval=verbose, train_fields=train_fields, verbose_eval=verbose, feature_name=feature_name,
valid_fields=valid_fields, feature_name=feature_name,
categorical_feature=categorical_feature) categorical_feature=categorical_feature)
if evals_result: if evals_result:
...@@ -331,14 +360,48 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase): ...@@ -331,14 +360,48 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
__doc__ = """Implementation of the scikit-learn API for LightGBM regression. __doc__ = """Implementation of the scikit-learn API for LightGBM regression.
""" + '\n'.join(LGBMModel.__doc__.split('\n')[2:]) """ + '\n'.join(LGBMModel.__doc__.split('\n')[2:])
def fit(self, X, y,
sample_weight=None, init_score=None,
eval_set=None, eval_sample_weight=None,
eval_init_score=None,
eval_metric=None,
early_stopping_rounds=None, verbose=True,
feature_name=None, categorical_feature=None,
other_params=None):
super(LGBMRegressor, self).fit(X, y, sample_weight, init_score, None,
eval_set, eval_sample_weight, eval_init_score, None,
eval_metric, early_stopping_rounds,
verbose, feature_name, categorical_feature,
other_params)
return self
class LGBMClassifier(LGBMModel, LGBMClassifierBase): class LGBMClassifier(LGBMModel, LGBMClassifierBase):
__doc__ = """Implementation of the scikit-learn API for LightGBM classification. __doc__ = """Implementation of the scikit-learn API for LightGBM classification.
""" + '\n'.join(LGBMModel.__doc__.split('\n')[2:]) """ + '\n'.join(LGBMModel.__doc__.split('\n')[2:])
def fit(self, X, y, eval_set=None, eval_metric=None, def __init__(self, num_leaves=31, max_depth=-1,
learning_rate=0.1, n_estimators=10, max_bin=255,
silent=True, objective="binary",
nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
is_unbalance=False, seed=0):
super(LGBMClassifier, self).__init__(num_leaves, max_depth,
learning_rate, n_estimators, max_bin,
silent, objective,
nthread, min_split_gain, min_child_weight, min_child_samples,
subsample, subsample_freq, colsample_bytree,
reg_alpha, reg_lambda, scale_pos_weight,
is_unbalance, seed)
def fit(self, X, y,
sample_weight=None, init_score=None,
eval_set=None, eval_sample_weight=None,
eval_init_score=None,
eval_metric=None,
early_stopping_rounds=None, verbose=True, early_stopping_rounds=None, verbose=True,
train_fields=None, valid_fields=None,
feature_name=None, categorical_feature=None, feature_name=None, categorical_feature=None,
other_params=None): other_params=None):
...@@ -350,12 +413,6 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase): ...@@ -350,12 +413,6 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
# Switch to using a multiclass objective in the underlying LGBM instance # Switch to using a multiclass objective in the underlying LGBM instance
self.objective = "multiclass" self.objective = "multiclass"
other_params['num_class'] = self.n_classes_ other_params['num_class'] = self.n_classes_
if eval_metric is None and eval_set is not None:
eval_metric = "multi_logloss"
else:
self.objective = "binary"
if eval_metric is None and eval_set is not None:
eval_metric = "binary_logloss"
self._le = LGBMLabelEncoder().fit(y) self._le = LGBMLabelEncoder().fit(y)
training_labels = self._le.transform(y) training_labels = self._le.transform(y)
...@@ -363,10 +420,10 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase): ...@@ -363,10 +420,10 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
if eval_set is not None: if eval_set is not None:
eval_set = list((x[0], self._le.transform(x[1])) for x in eval_set) eval_set = list((x[0], self._le.transform(x[1])) for x in eval_set)
super(LGBMClassifier, self).fit(X, training_labels, eval_set, super(LGBMClassifier, self).fit(X, training_labels, sample_weight, init_score, None,
eval_set, eval_sample_weight, eval_init_score, None,
eval_metric, early_stopping_rounds, eval_metric, early_stopping_rounds,
verbose, train_fields, valid_fields, verbose, feature_name, categorical_feature,
feature_name, categorical_feature,
other_params) other_params)
return self return self
...@@ -442,34 +499,59 @@ class LGBMRanker(LGBMModel): ...@@ -442,34 +499,59 @@ class LGBMRanker(LGBMModel):
""" + '\n'.join(LGBMModel.__doc__.split('\n')[2:]) """ + '\n'.join(LGBMModel.__doc__.split('\n')[2:])
def fit(self, X, y, eval_set=None, eval_metric=None, def __init__(self, num_leaves=31, max_depth=-1,
learning_rate=0.1, n_estimators=10, max_bin=255,
silent=True, objective="lambdarank",
nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
is_unbalance=False, seed=0):
super(LGBMRanker, self).__init__(num_leaves, max_depth,
learning_rate, n_estimators, max_bin,
silent, objective,
nthread, min_split_gain, min_child_weight, min_child_samples,
subsample, subsample_freq, colsample_bytree,
reg_alpha, reg_lambda, scale_pos_weight,
is_unbalance, seed)
if callable(self.objective):
self.fobj = _group_wise_objective(self.objective)
else:
self.fobj = None
def fit(self, X, y,
sample_weight=None, init_score=None, group=None,
eval_set=None, eval_sample_weight=None,
eval_init_score=None, eval_group=None,
eval_metric=None, eval_at=None,
early_stopping_rounds=None, verbose=True, early_stopping_rounds=None, verbose=True,
train_fields=None, valid_fields=None, other_params=None): feature_name=None, categorical_feature=None,
other_params=None):
"""
Most arguments like LGBMModel.fit except following:
eval_at : list of int
The evaluation positions of NDCG
"""
"""check group data""" """check group data"""
if "group" not in train_fields: if group is None:
raise ValueError("should set group in train_fields for ranking task") raise ValueError("should use group for ranking task")
if eval_set is not None: if eval_set is not None:
if valid_fields is None: if eval_group is None:
raise ValueError("valid_fields cannot be None when eval_set is not None") raise ValueError("eval_group cannot be None when eval_set is not None")
elif len(valid_fields) != len(eval_set): elif len(eval_group) != len(eval_set):
raise ValueError("lenght of valid_fields should equal with eval_set") raise ValueError("length of eval_group should equal with eval_set")
else: else:
for inner in valid_fields: for inner_group in eval_group:
if "group" not in inner: if inner_group is None:
raise ValueError("should set group in valid_fields for ranking task") raise ValueError("should set group for all eval data for ranking task")
if eval_at is not None:
if callable(self.objective): other_params = {} if other_params is None else other_params
self.fobj = _group_wise_objective(self.objective) other_params['ndcg_eval_at'] = list(eval_at)
else: super(LGBMRanker, self).fit(X, y, sample_weight, init_score, group,
self.objective = "lambdarank" eval_set, eval_sample_weight, eval_init_score, eval_group,
self.fobj = None eval_metric, early_stopping_rounds,
if eval_metric is None and eval_set is not None: verbose, feature_name, categorical_feature,
eval_metric = "ndcg"
super(LGBMRanker, self).fit(X, y, eval_set, eval_metric,
early_stopping_rounds, verbose,
train_fields, valid_fields,
other_params) other_params)
return self return self
#include <LightGBM/config.h> #include <LightGBM/config.h>
#include <LightGBM/utils/common.h> #include <LightGBM/utils/common.h>
#include <LightGBM/utils/random.h>
#include <LightGBM/utils/log.h> #include <LightGBM/utils/log.h>
#include <vector> #include <vector>
#include <string> #include <string>
#include <unordered_set> #include <unordered_set>
#include <algorithm> #include <algorithm>
#include <limits>
namespace LightGBM { namespace LightGBM {
...@@ -22,7 +24,7 @@ std::unordered_map<std::string, std::string> ConfigBase::Str2Map(const char* par ...@@ -22,7 +24,7 @@ std::unordered_map<std::string, std::string> ConfigBase::Str2Map(const char* par
continue; continue;
} }
params[key] = value; params[key] = value;
} else { } else if(Common::Trim(arg).size() > 0){
Log::Warning("Unknown parameter %s", arg.c_str()); Log::Warning("Unknown parameter %s", arg.c_str());
} }
} }
...@@ -33,12 +35,21 @@ std::unordered_map<std::string, std::string> ConfigBase::Str2Map(const char* par ...@@ -33,12 +35,21 @@ std::unordered_map<std::string, std::string> ConfigBase::Str2Map(const char* par
void OverallConfig::Set(const std::unordered_map<std::string, std::string>& params) { void OverallConfig::Set(const std::unordered_map<std::string, std::string>& params) {
// load main config types // load main config types
GetInt(params, "num_threads", &num_threads); GetInt(params, "num_threads", &num_threads);
// generate seeds by seed.
if (GetInt(params, "seed", &seed)) {
Random rand(seed);
int int_max = std::numeric_limits<int>::max();
io_config.data_random_seed = static_cast<int>(rand.NextInt(0, int_max));
boosting_config.bagging_seed = static_cast<int>(rand.NextInt(0, int_max));
boosting_config.drop_seed = static_cast<int>(rand.NextInt(0, int_max));
boosting_config.tree_config.feature_fraction_seed = static_cast<int>(rand.NextInt(0, int_max));
}
GetTaskType(params); GetTaskType(params);
GetBoostingType(params); GetBoostingType(params);
GetObjectiveType(params); GetObjectiveType(params);
GetMetricType(params); GetMetricType(params);
// sub-config setup // sub-config setup
network_config.Set(params); network_config.Set(params);
io_config.Set(params); io_config.Set(params);
......
...@@ -8,10 +8,6 @@ x_train, x_test, y_train, y_test = model_selection.train_test_split(X, Y, test_s ...@@ -8,10 +8,6 @@ x_train, x_test, y_train, y_test = model_selection.train_test_split(X, Y, test_s
train_data = lgb.Dataset(x_train, max_bin=255, label=y_train) train_data = lgb.Dataset(x_train, max_bin=255, label=y_train)
num_features = train_data.num_feature()
names = ["name_%d" %(i) for i in range(num_features)]
train_data.set_feature_name(names)
valid_data = train_data.create_valid(x_test, label=y_test) valid_data = train_data.create_valid(x_test, label=y_test)
config={"objective":"binary","metric":"auc", "min_data":1, "num_leaves":15} config={"objective":"binary","metric":"auc", "min_data":1, "num_leaves":15}
......
# coding: utf-8
# pylint: disable = invalid-name, C0111
"""Exercise lightgbm.train continued training, model export and lgb.cv."""
import json
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error

# load or create your dataset
df_train = pd.read_csv('../../examples/regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../../examples/regression/regression.test', header=None, sep='\t')
# column 0 holds the label; every other column is a feature
y_train, X_train = df_train[0], df_train.drop(0, axis=1)
y_test, X_test = df_test[0], df_test.drop(0, axis=1)

# create dataset for lightgbm; keep raw data around so the Datasets can be reused
lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, free_raw_data=False)

# specify your configurations as a dict
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'auc'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
}

# short warm-up run whose model seeds the continued-training run below
init_gbm = lgb.train(params,
                     lgb_train,
                     num_boost_round=5,
                     valid_sets=lgb_eval)

print('Start continue train')
# resume boosting from the warm-up model via init_model
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=lgb_eval,
                early_stopping_rounds=10,
                init_model=init_gbm)

# save model to file
gbm.save_model('model.txt')

# predict with the best iteration found by early stopping
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

# dump model to json (and save to file)
with open('model.json', 'w+') as f:
    json.dump(gbm.dump_model(), f, indent=4)

# feature importances (split counts, then total gain)
print('Feature importances:', gbm.feature_importance())
print('Feature importances:', gbm.feature_importance("gain"))

print('Start test cv')
# cross-validation can also continue from an existing model
lgb.cv(params,
       lgb_train,
       num_boost_round=100,
       nfold=5,
       verbose_eval=5,
       init_model=init_gbm)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment