Commit b51c7be4 authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

refine Dataset class (#113)

Provide a high level Dataset class for easy use.
parent f3d33582
......@@ -14,7 +14,7 @@ before_install:
install:
- sudo apt-get install -y libopenmpi-dev openmpi-bin build-essential
- conda install --yes atlas numpy scipy scikit-learn
- conda install --yes atlas numpy scipy scikit-learn pandas
script:
......@@ -22,12 +22,12 @@ script:
- mkdir build && cd build && cmake .. && make -j
- cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
- cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
- cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_sklearn.py
- cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py
- cd $TRAVIS_BUILD_DIR
- rm -rf build && mkdir build && cd build && cmake -DUSE_MPI=ON ..&& make -j
- cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
- cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
- cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_sklearn.py
- cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py
notifications:
email: false
......
......@@ -76,7 +76,7 @@ add_executable(lightgbm src/main.cpp ${SOURCES})
add_library(_lightgbm SHARED src/c_api.cpp ${SOURCES})
if(MSVC)
set_target_properties(_lightgbm PROPERTIES OUTPUT_NAME "lightgbm")
set_target_properties(_lightgbm PROPERTIES OUTPUT_NAME "lib_lightgbm")
endif(MSVC)
if(USE_MPI)
......
......@@ -17,13 +17,7 @@ X_test = df_test.drop(0, axis=1)
# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# ATTENTION: you should carefully use lightgbm.Dataset
# it requires setting up categorical_feature when you init it
# rather than passing from lightgbm.train
# instead, you can simply use a tuple of length=2 like below
# it will help you construct Datasets with parameters in lightgbm.train
lgb_train = (X_train, y_train)
lgb_eval = (X_test, y_test)
# specify your configurations as a dict
params = {
......@@ -43,9 +37,7 @@ params = {
gbm = lgb.train(params,
lgb_train,
num_boost_round=100,
valid_datas=lgb_eval,
# you can use a list to represent multiple valid_datas/valid_names
# don't use tuple, tuple is used to represent one dataset
valid_sets=lgb_eval,
early_stopping_rounds=10)
# save model to file
......
......@@ -230,6 +230,7 @@ struct OverallConfig: public ConfigBase {
public:
TaskType task_type = TaskType::kTrain;
NetworkConfig network_config;
int seed = 0;
int num_threads = 0;
bool is_parallel = false;
bool is_parallel_find_bin = false;
......@@ -317,6 +318,7 @@ struct ParameterAlias {
{
{ "config", "config_file" },
{ "nthread", "num_threads" },
{ "random_seed", "seed" },
{ "num_thread", "num_threads" },
{ "boosting", "boosting_type" },
{ "boost", "boosting_type" },
......
......@@ -8,7 +8,7 @@ from __future__ import absolute_import
import os
from .basic import Predictor, Dataset, Booster
from .basic import Dataset, Booster
from .engine import train, cv
try:
from .sklearn import LGBMModel, LGBMRegressor, LGBMClassifier, LGBMRanker
......
This diff is collapsed.
......@@ -6,52 +6,12 @@ from __future__ import absolute_import
import collections
from operator import attrgetter
import numpy as np
from .basic import LightGBMError, Predictor, Dataset, Booster, is_str
from .basic import LightGBMError, _InnerPredictor, Dataset, Booster, is_str
from . import callback
def _construct_dataset(X_y, reference=None,
                       params=None, other_fields=None,
                       feature_name=None, categorical_feature=None,
                       predictor=None):
    """Build a LightGBM Dataset (or a validation Dataset) from raw input.

    Parameters
    ----------
    X_y : str or tuple of length 2
        Filename of the data, or a ``(data, label)`` pair.
    reference : Dataset or None
        If given, create a validation set aligned with this training set
        (shares its bin mappers via ``create_valid``).
    params : dict or None
        Dataset parameters; ``max_bin`` is read from here (default 255).
    other_fields : dict or None
        Optional extra fields: 'weight', 'group', 'init_score'.
    feature_name : list of str or None
        Feature names (training set only).
    categorical_feature : list of str or int, or None
        Categorical features (training set only).
    predictor : object or None
        Predictor used for continued training (training set only).

    Returns
    -------
    Dataset
    """
    # Fix: the declared default params=None used to crash on
    # `'max_bin' in params` (TypeError: argument of type 'NoneType').
    if params is None:
        params = {}
    max_bin = int(params.get('max_bin', 255))
    weight = None
    group = None
    init_score = None
    if other_fields is not None:
        if not isinstance(other_fields, dict):
            raise TypeError("type of other field data should be dict")
        weight = other_fields.get('weight', None)
        group = other_fields.get('group', None)
        init_score = other_fields.get('init_score', None)
    if is_str(X_y):
        # A plain string is a data filename; the label comes from the file.
        data = X_y
        label = None
    else:
        if len(X_y) != 2:
            raise TypeError("should pass (data, label) tuple for dataset")
        data, label = X_y
    if reference is None:
        ret = Dataset(data, label=label, max_bin=max_bin,
                      weight=weight, group=group,
                      predictor=predictor,
                      feature_name=feature_name,
                      categorical_feature=categorical_feature,
                      params=params)
    else:
        # Validation data must be binned with the reference training set.
        ret = reference.create_valid(data, label=label, weight=weight,
                                     group=group, params=params)
    if init_score is not None:
        ret.set_init_score(init_score)
    return ret
def train(params, train_data, num_boost_round=100,
valid_datas=None, valid_names=None,
def train(params, train_set, num_boost_round=100,
valid_sets=None, valid_names=None,
fobj=None, feval=None, init_model=None,
train_fields=None, valid_fields=None,
feature_name=None, categorical_feature=None,
early_stopping_rounds=None, evals_result=None,
verbose_eval=True, learning_rates=None, callbacks=None):
......@@ -61,14 +21,14 @@ def train(params, train_data, num_boost_round=100,
----------
params : dict
Parameters for training.
train_data : Dataset, tuple (X, y) or filename of data
train_set : Dataset
Data to be trained.
num_boost_round: int
Number of boosting iterations.
valid_datas: list of Datasets, tuples (valid_X, valid_y) or filenames of data
valid_sets: list of Datasets
List of data to be evaluated during training
valid_names: list of string
Names of valid_datas
Names of valid_sets
fobj : function
Customized objective function.
feval : function
......@@ -76,13 +36,6 @@ def train(params, train_data, num_boost_round=100,
Note: should return (eval_name, eval_result, is_higher_better) of list of this
init_model : file name of lightgbm model or 'Booster' instance
model used for continued train
train_fields : dict
Other data file in training data. e.g. train_fields['weight'] is weight data
Support fields: weight, group, init_score
valid_fields : dict
Other data file in training data. \
e.g. valid_fields[0]['weight'] is weight data for first valid data
Support fields: weight, group, init_score
feature_name : list of str
Feature names
categorical_feature : list of str or int
......@@ -95,8 +48,8 @@ def train(params, train_data, num_boost_round=100,
Returns the model with (best_iter + early_stopping_rounds)
If early stopping occurs, the model will add 'best_iteration' field
evals_result: dict or None
This dictionary used to store all evaluation results of all the items in valid_datas.
Example: with a valid_datas containing [valid_set, train_set] \
This dictionary used to store all evaluation results of all the items in valid_sets.
Example: with a valid_sets containing [valid_set, train_set] \
and valid_names containing ['eval', 'train'] and a paramater containing ('metric':'logloss')
Returns: {'train': {'logloss': ['0.48253', '0.35953', ...]},
'eval': {'logloss': ['0.480385', '0.357756', ...]}}
......@@ -127,58 +80,40 @@ def train(params, train_data, num_boost_round=100,
"""
"""create predictor first"""
if is_str(init_model):
predictor = Predictor(model_file=init_model)
predictor = _InnerPredictor(model_file=init_model)
elif isinstance(init_model, Booster):
predictor = init_model.to_predictor()
elif isinstance(init_model, Predictor):
predictor = init_model
predictor = init_model._to_predictor()
else:
predictor = None
init_iteration = predictor.num_total_iteration if predictor else 0
"""create dataset"""
if isinstance(train_data, Dataset):
train_set = train_data
if train_fields is not None:
for field, data in train_fields.items():
train_set.set_field(field, data)
else:
train_set = _construct_dataset(train_data, None, params,
other_fields=train_fields,
feature_name=feature_name,
categorical_feature=categorical_feature,
predictor=predictor)
"""check dataset"""
if not isinstance(train_set, Dataset):
raise TypeError("only can accept Dataset instance for traninig")
train_set._set_predictor(predictor)
train_set.set_feature_name(feature_name)
train_set.set_categorical_feature(categorical_feature)
is_valid_contain_train = False
train_data_name = "training"
valid_sets = []
reduced_valid_sets = []
name_valid_sets = []
if valid_datas:
if isinstance(valid_datas, (Dataset, tuple)):
valid_datas = [valid_datas]
if valid_sets:
if isinstance(valid_sets, Dataset):
valid_sets = [valid_sets]
if isinstance(valid_names, str):
valid_names = [valid_names]
for i, valid_data in enumerate(valid_datas):
other_fields = None if valid_fields is None else valid_fields.get(i, None)
for i, valid_data in enumerate(valid_sets):
"""reduce cost for prediction training data"""
if valid_data[0] is train_data[0] and valid_data[1] is train_data[1]:
if valid_data is train_set:
is_valid_contain_train = True
if valid_names is not None:
train_data_name = valid_names[i]
continue
if isinstance(valid_data, Dataset):
valid_set = valid_data
if other_fields is not None:
for field, data in other_fields.items():
valid_set.set_field(field, data)
else:
valid_set = _construct_dataset(
valid_data,
train_set,
params,
other_fields=other_fields,
feature_name=feature_name,
categorical_feature=categorical_feature,
predictor=predictor)
valid_sets.append(valid_set)
if not isinstance(valid_data, Dataset):
raise TypeError("only can accept Dataset instance for traninig")
valid_data.set_reference(train_set)
reduced_valid_sets.append(valid_data)
if valid_names is not None and len(valid_names) > i:
name_valid_sets.append(valid_names[i])
else:
......@@ -217,7 +152,7 @@ def train(params, train_data, num_boost_round=100,
booster = Booster(params=params, train_set=train_set)
if is_valid_contain_train:
booster.set_train_data_name(train_data_name)
for valid_set, name_valid_set in zip(valid_sets, name_valid_sets):
for valid_set, name_valid_set in zip(reduced_valid_sets, name_valid_sets):
booster.add_valid(valid_set, name_valid_set)
"""start training"""
......@@ -294,6 +229,7 @@ def _make_n_folds(full_data, nfold, params, seed, fpreproc=None, stratified=Fals
else:
raise LightGBMError('sklearn needs to be installed in order to use stratified cv')
else:
full_data.construct()
randidx = np.random.permutation(full_data.num_data())
kstep = int(len(randidx) / nfold)
idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
......@@ -322,8 +258,8 @@ def _agg_cv_result(raw_results):
cvmap[one_line[1]].append(one_line[2])
return [('cv_agg', k, np.mean(v), metric_type[k], np.std(v)) for k, v in cvmap.items()]
def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
metrics=(), fobj=None, feval=None, train_fields=None,
def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
metrics=(), fobj=None, feval=None, init_model=None,
feature_name=None, categorical_feature=None,
early_stopping_rounds=None, fpreproc=None,
verbose_eval=None, show_stdv=True, seed=0,
......@@ -334,7 +270,7 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
----------
params : dict
Booster params.
train_data : tuple (X, y) or filename of data
train_set : Dataset
Data to be trained.
num_boost_round : int
Number of boosting iterations.
......@@ -350,9 +286,8 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
Custom objective function.
feval : function
Custom evaluation function.
train_fields : dict
Other data file in training data. e.g. train_fields['weight'] is weight data
Support fields: weight, group, init_score
init_model : file name of lightgbm model or 'Booster' instance
model used for continued train
feature_name : list of str
Feature names
categorical_feature : list of str or int
......@@ -382,6 +317,20 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
-------
evaluation history : list(string)
"""
if not isinstance(train_set, Dataset):
raise TypeError("only can accept Dataset instance for traninig")
if is_str(init_model):
predictor = _InnerPredictor(model_file=init_model)
elif isinstance(init_model, Booster):
predictor = init_model._to_predictor()
else:
predictor = None
train_set._set_predictor(predictor)
train_set.set_feature_name(feature_name)
train_set.set_categorical_feature(categorical_feature)
if metrics:
params.setdefault('metric', [])
if is_str(metrics):
......@@ -389,11 +338,6 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
else:
params['metric'].extend(metrics)
train_set = _construct_dataset(train_data, None, params,
other_fields=train_fields,
feature_name=feature_name,
categorical_feature=categorical_feature)
results = collections.defaultdict(list)
cvfolds = _make_n_folds(train_set, nfold, params, seed, fpreproc, stratified)
......
......@@ -19,6 +19,7 @@ def find_lib_path():
if os.name == 'nt':
dll_path.append(os.path.join(curr_path, '../../windows/x64/Dll/'))
dll_path.append(os.path.join(curr_path, './windows/x64/Dll/'))
dll_path.append(os.path.join(curr_path, '../../Release/'))
dll_path = [os.path.join(p, 'lib_lightgbm.dll') for p in dll_path]
else:
dll_path = [os.path.join(p, 'lib_lightgbm.so') for p in dll_path]
......
......@@ -4,7 +4,7 @@
from __future__ import absolute_import
import numpy as np
from .basic import LightGBMError, is_str
from .basic import LightGBMError, Dataset, is_str
from .engine import train
# sklearn
try:
......@@ -195,9 +195,12 @@ class LGBMModel(LGBMModelBase):
params.pop('nthread', None)
return params
def fit(self, X, y, eval_set=None, eval_metric=None,
def fit(self, X, y,
sample_weight=None, init_score=None, group=None,
eval_set=None, eval_sample_weight=None,
eval_init_score=None, eval_group=None,
eval_metric=None,
early_stopping_rounds=None, verbose=True,
train_fields=None, valid_fields=None,
feature_name=None, categorical_feature=None,
other_params=None):
"""
......@@ -209,24 +212,29 @@ class LGBMModel(LGBMModelBase):
Feature matrix
y : array_like
Labels
sample_weight : array_like
weight of training data
init_score : array_like
init score of training data
group : array_like
group data of training data
eval_set : list, optional
A list of (X, y) tuple pairs to use as a validation set for early-stopping
eval_sample_weight : List of array
weight of eval data
eval_init_score : List of array
init score of eval data
eval_group : List of array
group data of eval data
eval_metric : str, list of str, callable, optional
If a str, should be a built-in evaluation metric to use.
If callable, a custom evaluation metric. The call \
signature is func(y_predicted, dataset) where dataset will be a \
Dataset fobject such that you may need to call the get_label \
Dateset object such that you may need to call the get_label \
method. And it must return (eval_name->str, eval_result->float, is_bigger_better->Bool)
early_stopping_rounds : int
verbose : bool
If `verbose` and an evaluation set is used, writes the evaluation
train_fields : dict
Other data file in training data. e.g. train_fields['weight'] is weight data
Support fields: weight, group, init_score
valid_fields : dict
Other data file in training data. \
e.g. valid_fields[0]['weight'] is weight data for first valid data
Support fields: weight, group, init_score
feature_name : list of str
Feature names
categorical_feature : list of str or int
......@@ -263,12 +271,33 @@ class LGBMModel(LGBMModelBase):
feval = None
feval = eval_metric if callable(eval_metric) else None
self._Booster = train(params, (X, y),
self.n_estimators, valid_datas=eval_set,
def _construct_dataset(X, y, sample_weight, init_score, group):
    """Wrap raw arrays into a Dataset with optional weight/group/init_score."""
    dataset = Dataset(X, label=y, weight=sample_weight, group=group)
    dataset.set_init_score(init_score)
    return dataset
train_set = _construct_dataset(X, y, sample_weight, init_score, group)
valid_sets = []
if eval_set is not None:
if isinstance(eval_set, tuple):
eval_set = [eval_set]
for i, valid_data in enumerate(eval_set):
"""reduce cost for prediction training data"""
if valid_data[0] is X and valid_data[1] is y:
valid_set = train_set
else:
valid_weight = None if eval_sample_weight is None else eval_sample_weight.get(i, None)
valid_init_score = None if eval_init_score is None else eval_init_score.get(i, None)
valid_group = None if eval_group is None else eval_group.get(i, None)
valid_set = _construct_dataset(valid_data[0], valid_data[1], valid_weight, valid_init_score, valid_group)
valid_sets.append(valid_set)
self._Booster = train(params, train_set,
self.n_estimators, valid_sets=valid_sets,
early_stopping_rounds=early_stopping_rounds,
evals_result=evals_result, fobj=self.fobj, feval=feval,
verbose_eval=verbose, train_fields=train_fields,
valid_fields=valid_fields, feature_name=feature_name,
verbose_eval=verbose, feature_name=feature_name,
categorical_feature=categorical_feature)
if evals_result:
......@@ -331,14 +360,48 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
__doc__ = """Implementation of the scikit-learn API for LightGBM regression.
""" + '\n'.join(LGBMModel.__doc__.split('\n')[2:])
def fit(self, X, y,
sample_weight=None, init_score=None,
eval_set=None, eval_sample_weight=None,
eval_init_score=None,
eval_metric=None,
early_stopping_rounds=None, verbose=True,
feature_name=None, categorical_feature=None,
other_params=None):
super(LGBMRegressor, self).fit(X, y, sample_weight, init_score, None,
eval_set, eval_sample_weight, eval_init_score, None,
eval_metric, early_stopping_rounds,
verbose, feature_name, categorical_feature,
other_params)
return self
class LGBMClassifier(LGBMModel, LGBMClassifierBase):
__doc__ = """Implementation of the scikit-learn API for LightGBM classification.
""" + '\n'.join(LGBMModel.__doc__.split('\n')[2:])
def fit(self, X, y, eval_set=None, eval_metric=None,
def __init__(self, num_leaves=31, max_depth=-1,
learning_rate=0.1, n_estimators=10, max_bin=255,
silent=True, objective="binary",
nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
is_unbalance=False, seed=0):
super(LGBMClassifier, self).__init__(num_leaves, max_depth,
learning_rate, n_estimators, max_bin,
silent, objective,
nthread, min_split_gain, min_child_weight, min_child_samples,
subsample, subsample_freq, colsample_bytree,
reg_alpha, reg_lambda, scale_pos_weight,
is_unbalance, seed)
def fit(self, X, y,
sample_weight=None, init_score=None,
eval_set=None, eval_sample_weight=None,
eval_init_score=None,
eval_metric=None,
early_stopping_rounds=None, verbose=True,
train_fields=None, valid_fields=None,
feature_name=None, categorical_feature=None,
other_params=None):
......@@ -350,12 +413,6 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
# Switch to using a multiclass objective in the underlying LGBM instance
self.objective = "multiclass"
other_params['num_class'] = self.n_classes_
if eval_metric is None and eval_set is not None:
eval_metric = "multi_logloss"
else:
self.objective = "binary"
if eval_metric is None and eval_set is not None:
eval_metric = "binary_logloss"
self._le = LGBMLabelEncoder().fit(y)
training_labels = self._le.transform(y)
......@@ -363,10 +420,10 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
if eval_set is not None:
eval_set = list((x[0], self._le.transform(x[1])) for x in eval_set)
super(LGBMClassifier, self).fit(X, training_labels, eval_set,
super(LGBMClassifier, self).fit(X, training_labels, sample_weight, init_score, None,
eval_set, eval_sample_weight, eval_init_score, None,
eval_metric, early_stopping_rounds,
verbose, train_fields, valid_fields,
feature_name, categorical_feature,
verbose, feature_name, categorical_feature,
other_params)
return self
......@@ -442,34 +499,59 @@ class LGBMRanker(LGBMModel):
""" + '\n'.join(LGBMModel.__doc__.split('\n')[2:])
def fit(self, X, y, eval_set=None, eval_metric=None,
def __init__(self, num_leaves=31, max_depth=-1,
learning_rate=0.1, n_estimators=10, max_bin=255,
silent=True, objective="lambdarank",
nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
is_unbalance=False, seed=0):
super(LGBMRanker, self).__init__(num_leaves, max_depth,
learning_rate, n_estimators, max_bin,
silent, objective,
nthread, min_split_gain, min_child_weight, min_child_samples,
subsample, subsample_freq, colsample_bytree,
reg_alpha, reg_lambda, scale_pos_weight,
is_unbalance, seed)
if callable(self.objective):
self.fobj = _group_wise_objective(self.objective)
else:
self.fobj = None
def fit(self, X, y,
sample_weight=None, init_score=None, group=None,
eval_set=None, eval_sample_weight=None,
eval_init_score=None, eval_group=None,
eval_metric=None, eval_at=None,
early_stopping_rounds=None, verbose=True,
train_fields=None, valid_fields=None, other_params=None):
feature_name=None, categorical_feature=None,
other_params=None):
"""
Most arguments like LGBMModel.fit except following:
eval_at : list of int
The evaulation positions of NDCG
"""
"""check group data"""
if "group" not in train_fields:
raise ValueError("should set group in train_fields for ranking task")
if group is None:
raise ValueError("should use group for ranking task")
if eval_set is not None:
if valid_fields is None:
raise ValueError("valid_fields cannot be None when eval_set is not None")
elif len(valid_fields) != len(eval_set):
raise ValueError("lenght of valid_fields should equal with eval_set")
if eval_group is None:
raise ValueError("eval_group cannot be None when eval_set is not None")
elif len(eval_group) != len(eval_set):
raise ValueError("length of eval_group should equal with eval_set")
else:
for inner in valid_fields:
if "group" not in inner:
raise ValueError("should set group in valid_fields for ranking task")
if callable(self.objective):
self.fobj = _group_wise_objective(self.objective)
else:
self.objective = "lambdarank"
self.fobj = None
if eval_metric is None and eval_set is not None:
eval_metric = "ndcg"
super(LGBMRanker, self).fit(X, y, eval_set, eval_metric,
early_stopping_rounds, verbose,
train_fields, valid_fields,
for inner_group in eval_group:
if inner_group is None:
raise ValueError("should set group for all eval data for ranking task")
if eval_at is not None:
other_params = {} if other_params is None else other_params
other_params['ndcg_eval_at'] = list(eval_at)
super(LGBMRanker, self).fit(X, y, sample_weight, init_score, group,
eval_set, eval_sample_weight, eval_init_score, eval_group,
eval_metric, early_stopping_rounds,
verbose, feature_name, categorical_feature,
other_params)
return self
#include <LightGBM/config.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/random.h>
#include <LightGBM/utils/log.h>
#include <vector>
#include <string>
#include <unordered_set>
#include <algorithm>
#include <limits>
namespace LightGBM {
......@@ -22,7 +24,7 @@ std::unordered_map<std::string, std::string> ConfigBase::Str2Map(const char* par
continue;
}
params[key] = value;
} else {
} else if(Common::Trim(arg).size() > 0){
Log::Warning("Unknown parameter %s", arg.c_str());
}
}
......@@ -33,12 +35,21 @@ std::unordered_map<std::string, std::string> ConfigBase::Str2Map(const char* par
void OverallConfig::Set(const std::unordered_map<std::string, std::string>& params) {
// load main config types
GetInt(params, "num_threads", &num_threads);
// generate seeds by seed.
if (GetInt(params, "seed", &seed)) {
Random rand(seed);
int int_max = std::numeric_limits<int>::max();
io_config.data_random_seed = static_cast<int>(rand.NextInt(0, int_max));
boosting_config.bagging_seed = static_cast<int>(rand.NextInt(0, int_max));
boosting_config.drop_seed = static_cast<int>(rand.NextInt(0, int_max));
boosting_config.tree_config.feature_fraction_seed = static_cast<int>(rand.NextInt(0, int_max));
}
GetTaskType(params);
GetBoostingType(params);
GetObjectiveType(params);
GetMetricType(params);
// sub-config setup
network_config.Set(params);
io_config.Set(params);
......
......@@ -8,10 +8,6 @@ x_train, x_test, y_train, y_test = model_selection.train_test_split(X, Y, test_s
train_data = lgb.Dataset(x_train, max_bin=255, label=y_train)
num_features = train_data.num_feature()
names = ["name_%d" %(i) for i in range(num_features)]
train_data.set_feature_name(names)
valid_data = train_data.create_valid(x_test, label=y_test)
config={"objective":"binary","metric":"auc", "min_data":1, "num_leaves":15}
......
# coding: utf-8
# pylint: disable = invalid-name, C0111
"""End-to-end smoke test of the lightgbm training engine.

Exercises: Dataset creation from pandas frames, training with a validation
set, continued training from an initial model, early stopping, model
save/dump, prediction, feature importance, and cross-validation.
Expects the example regression data files to exist relative to this script.
"""
import json
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error
# load or create your dataset
# tab-separated, no header; column 0 is the target
df_train = pd.read_csv('../../examples/regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../../examples/regression/regression.test', header=None, sep='\t')
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)
# create dataset for lightgbm
# free_raw_data=False keeps the raw data so the same Dataset objects
# can be re-used by the later train() and cv() calls below
lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,free_raw_data=False)
# specify your configurations as a dict
params = {
'task' : 'train',
'boosting_type' : 'gbdt',
'objective' : 'regression',
'metric' : {'l2', 'auc'},
'num_leaves' : 31,
'learning_rate' : 0.05,
'feature_fraction' : 0.9,
'bagging_fraction' : 0.8,
'bagging_freq': 5,
'verbose' : 0
}
# train
# short initial run used as the starting model for continued training
init_gbm = lgb.train(params,
lgb_train,
num_boost_round=5,
valid_sets=lgb_eval)
print('Start continue train')
# continue training from init_gbm with early stopping on the eval set
gbm = lgb.train(params,
lgb_train,
num_boost_round=100,
valid_sets=lgb_eval,
early_stopping_rounds=10,
init_model=init_gbm)
# save model to file
gbm.save_model('model.txt')
# predict
# num_iteration=best_iteration uses the early-stopped model size
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
# dump model to json (and save to file)
model_json = gbm.dump_model()
with open('model.json', 'w+') as f:
json.dump(model_json, f, indent=4)
# feature importances
# default importance type is split count; "gain" sums split gains
print('Feature importances:', gbm.feature_importance())
print('Feature importances:', gbm.feature_importance("gain"))
print('Start test cv')
# cross-validation, also starting from the initial model
lgb.cv(params,
lgb_train,
num_boost_round=100,
nfold=5,
verbose_eval=5,
init_model=init_gbm)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment