Commit 76c44d78 authored by wxchan's avatar wxchan Committed by Guolin Ke
Browse files

refine compatibility (#186)

* add compat.py

* unify types

* range/xrange -> range_

* move argc_ to compat.py; add is_numeric error type

* use simplejson for json

* move json to compat.py

* move pandas to compat.py

* move sklearn to compat.py

* remove unused function

* fix 'unify types'

* argc_ (lambda -> def)
parent aa333925
......@@ -5,28 +5,16 @@
from __future__ import absolute_import
import ctypes
import json
import os
import sys
from tempfile import NamedTemporaryFile
import numpy as np
import scipy.sparse
from .compat import (DataFrame, Series, integer_types, json, numeric_types,
range_, string_type)
from .libpath import find_lib_path
"""pandas"""
try:
from pandas import Series, DataFrame
except ImportError:
class Series(object):
pass
class DataFrame(object):
pass
IS_PY3 = (sys.version_info[0] == 3)
def _load_lib():
"""Load LightGBM Library."""
......@@ -57,28 +45,17 @@ def _safe_call(ret):
raise LightGBMError(_LIB.LGBM_GetLastError())
def is_str(s):
    """Check whether ``s`` is a string (``str``, or ``basestring`` on Python 2)."""
    # Pick the string base class lazily so ``basestring`` is only ever
    # looked up when actually running under Python 2.
    string_base = str if IS_PY3 else basestring
    return isinstance(s, string_base)
def is_numeric(obj):
    """Check whether ``obj`` can be interpreted as a number.

    Accepts anything ``float()`` accepts: builtin ints/floats/bools,
    numeric strings, and numpy scalar types.

    Returns
    -------
    bool
        True if ``obj`` converts to float, False otherwise.
    """
    try:
        float(obj)
        return True
    # Catch only the two conversion failures instead of a bare ``except:``,
    # which would also swallow KeyboardInterrupt/SystemExit.
    except (TypeError, ValueError):
        # TypeError: obj is not a string or a number
        # ValueError: invalid literal
        return False
def is_numpy_object(data):
    """Check whether ``data`` is an object whose type is defined by numpy."""
    # An object belongs to numpy iff its type was defined in the numpy module.
    module_of_type = type(data).__module__
    return module_of_type == np.__name__
def is_numpy_1d_array(data):
    """Check whether ``data`` is a one-dimensional numpy array."""
    if not isinstance(data, np.ndarray):
        return False
    return len(data.shape) == 1
......@@ -87,7 +64,7 @@ def is_numpy_1d_array(data):
def is_1d_list(data):
"""Check is 1d list"""
return isinstance(data, list) and \
(not data or isinstance(data[0], (int, float, bool)))
(not data or isinstance(data[0], numeric_types))
def list_to_1d_numpy(data, dtype=np.float32, name='list'):
......@@ -140,7 +117,7 @@ def param_dict_to_str(data):
for key, val in data.items():
if isinstance(val, (list, tuple, set)) or is_numpy_1d_array(val):
pairs.append(str(key) + '=' + ','.join(map(str, val)))
elif is_str(val) or isinstance(val, (int, float, bool)) or is_numeric(val):
elif isinstance(val, string_type) or isinstance(val, numeric_types) or is_numeric(val):
pairs.append(str(key) + '=' + str(val))
else:
raise TypeError('Unknown type of parameter:%s, got:%s'
......@@ -314,7 +291,7 @@ class _InnerPredictor(object):
int_data_has_header = 1 if data_has_header else 0
if num_iteration > self.num_total_iteration:
num_iteration = self.num_total_iteration
if is_str(data):
if isinstance(data, string_type):
with _temp_file() as f:
_safe_call(_LIB.LGBM_BoosterPredictForFile(
self.handle,
......@@ -576,9 +553,9 @@ class Dataset(object):
if feature_name is not None:
feature_dict = {name: i for i, name in enumerate(feature_name)}
for name in categorical_feature:
if is_str(name) and name in feature_dict:
if isinstance(name, string_type) and name in feature_dict:
categorical_indices.add(feature_dict[name])
elif isinstance(name, int):
elif isinstance(name, integer_types):
categorical_indices.add(name)
else:
raise TypeError("Wrong type({}) or unknown name({}) in categorical_feature"
......@@ -594,7 +571,7 @@ class Dataset(object):
elif reference is not None:
raise TypeError('Reference dataset should be None or dataset instance')
"""start construct data"""
if is_str(data):
if isinstance(data, string_type):
"""check data has header or not"""
if str(params.get("has_header", "")).lower() == "true" \
or str(params.get("header", "")).lower() == "true":
......@@ -635,8 +612,8 @@ class Dataset(object):
# need re group init score
new_init_score = np.zeros(init_score.size, dtype=np.float32)
num_data = self.num_data()
for i in range(num_data):
for j in range(self.predictor.num_class):
for i in range_(num_data):
for j in range_(self.predictor.num_class):
new_init_score[j * num_data + i] = init_score[i * self.predictor.num_class + j]
init_score = new_init_score
init_score = init_score.astype(dtype=np.float32, copy=False)
......@@ -1065,7 +1042,7 @@ class Dataset(object):
if self.group is not None:
# group data from LightGBM is boundaries data, need to convert to group size
new_group = []
for i in range(len(self.group) - 1):
for i in range_(len(self.group) - 1):
new_group.append(self.group[i + 1] - self.group[i])
self.group = new_group
return self.group
......@@ -1292,7 +1269,7 @@ class Booster(object):
_safe_call(_LIB.LGBM_BoosterUpdateOneIter(
self.handle,
ctypes.byref(is_finished)))
self.__is_predicted_cur_iter = [False for _ in range(self.__num_dataset)]
self.__is_predicted_cur_iter = [False for _ in range_(self.__num_dataset)]
return is_finished.value == 1
else:
grad, hess = fobj(self.__inner_predict(0), self.train_set)
......@@ -1326,7 +1303,7 @@ class Booster(object):
grad.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
hess.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
ctypes.byref(is_finished)))
self.__is_predicted_cur_iter = [False for _ in range(self.__num_dataset)]
self.__is_predicted_cur_iter = [False for _ in range_(self.__num_dataset)]
return is_finished.value == 1
def rollback_one_iter(self):
......@@ -1335,7 +1312,7 @@ class Booster(object):
"""
_safe_call(_LIB.LGBM_BoosterRollbackOneIter(
self.handle))
self.__is_predicted_cur_iter = [False for _ in range(self.__num_dataset)]
self.__is_predicted_cur_iter = [False for _ in range_(self.__num_dataset)]
def current_iteration(self):
out_cur_iter = ctypes.c_int(0)
......@@ -1366,7 +1343,7 @@ class Booster(object):
if data is self.train_set:
data_idx = 0
else:
for i in range(len(self.valid_sets)):
for i in range_(len(self.valid_sets)):
if data is self.valid_sets[i]:
data_idx = i + 1
break
......@@ -1407,7 +1384,7 @@ class Booster(object):
result: str
Evaluation result list.
"""
return [item for i in range(1, self.__num_dataset)
return [item for i in range_(1, self.__num_dataset)
for item in self.__inner_eval(self.name_valid_sets[i - 1], i, feval)]
def save_model(self, filename, num_iteration=-1):
......@@ -1535,7 +1512,7 @@ class Booster(object):
self.__get_eval_info()
ret = []
if self.__num_inner_eval > 0:
result = np.array([0.0 for _ in range(self.__num_inner_eval)], dtype=np.float64)
result = np.array([0.0 for _ in range_(self.__num_inner_eval)], dtype=np.float64)
tmp_out_len = ctypes.c_int(0)
_safe_call(_LIB.LGBM_BoosterGetEval(
self.handle,
......@@ -1544,7 +1521,7 @@ class Booster(object):
result.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
if tmp_out_len.value != self.__num_inner_eval:
raise ValueError("Wrong length of eval results")
for i in range(self.__num_inner_eval):
for i in range_(self.__num_inner_eval):
ret.append((data_name, self.__name_inner_eval[i], result[i], self.__higher_better_inner_eval[i]))
if feval is not None:
if data_idx == 0:
......@@ -1572,7 +1549,7 @@ class Booster(object):
else:
n_preds = self.valid_sets[data_idx - 1].num_data() * self.__num_class
self.__inner_predict_buffer[data_idx] = \
np.array([0.0 for _ in range(n_preds)], dtype=np.float64, copy=False)
np.array([0.0 for _ in range_(n_preds)], dtype=np.float64, copy=False)
"""avoid to predict many time in one iteration"""
if not self.__is_predicted_cur_iter[data_idx]:
tmp_out_len = ctypes.c_int64(0)
......@@ -1602,7 +1579,7 @@ class Booster(object):
if self.__num_inner_eval > 0:
"""Get name of evals"""
tmp_out_len = ctypes.c_int(0)
string_buffers = [ctypes.create_string_buffer(255) for i in range(self.__num_inner_eval)]
string_buffers = [ctypes.create_string_buffer(255) for i in range_(self.__num_inner_eval)]
ptr_string_buffers = (ctypes.c_char_p * self.__num_inner_eval)(*map(ctypes.addressof, string_buffers))
_safe_call(_LIB.LGBM_BoosterGetEvalNames(
self.handle,
......@@ -1611,7 +1588,7 @@ class Booster(object):
if self.__num_inner_eval != tmp_out_len.value:
raise ValueError("Length of eval names doesn't equal with num_evals")
self.__name_inner_eval = \
[string_buffers[i].value.decode() for i in range(self.__num_inner_eval)]
[string_buffers[i].value.decode() for i in range_(self.__num_inner_eval)]
self.__higher_better_inner_eval = \
[name.startswith(('auc', 'ndcg')) for name in self.__name_inner_eval]
......@@ -1642,7 +1619,7 @@ class Booster(object):
"""
for key, value in kwargs.items():
if value is not None:
if not is_str(value):
if not isinstance(value, string_type):
raise ValueError("Set attr only accepts strings")
self.__attr[key] = value
else:
......
......@@ -4,6 +4,8 @@ from __future__ import absolute_import
import collections
from .compat import range_
class EarlyStopException(Exception):
"""Exception of early stopping.
......@@ -171,7 +173,7 @@ def early_stopping(stopping_rounds, verbose=True):
msg = "Train until valid scores didn't improve in {} rounds."
print(msg.format(stopping_rounds))
for i in range(len(env.evaluation_result_list)):
for i in range_(len(env.evaluation_result_list)):
best_score[i] = float('-inf')
best_iter[i] = 0
if verbose:
......@@ -182,7 +184,7 @@ def early_stopping(stopping_rounds, verbose=True):
"""internal function"""
if not best_score:
init(env)
for i in range(len(env.evaluation_result_list)):
for i in range_(len(env.evaluation_result_list)):
score = env.evaluation_result_list[i][2] * factor_to_bigger_better[i]
if score > best_score[i]:
best_score[i] = score
......
# coding: utf-8
# pylint: disable = C0103
"""Compatibility layer smoothing over Python 2 / Python 3 differences."""
from __future__ import absolute_import
import inspect
import sys

# True when running under Python 3.
is_py3 = (sys.version_info[0] == 3)

"""compatibility between python2 and python3"""
if is_py3:
    # Python 3: str covers all text, int covers all integers, range is lazy.
    string_type = str
    numeric_types = (int, float, bool)
    integer_types = int
    range_ = range

    def argc_(func):
        """return number of arguments of a function"""
        # inspect.signature is the Python 3 replacement for getargspec.
        return len(inspect.signature(func).parameters)
else:
    # Python 2: basestring covers str+unicode, long exists, xrange is lazy.
    string_type = basestring
    numeric_types = (int, long, float, bool)
    integer_types = (int, long)
    range_ = xrange

    def argc_(func):
        """return number of arguments of a function"""
        return len(inspect.getargspec(func).args)
"""json"""
try:
import simplejson as json
except (ImportError, SyntaxError):
# simplejson does not support Python 3.2, it throws a SyntaxError
# because of u'...' Unicode literals.
import json
"""pandas"""
try:
from pandas import Series, DataFrame
except ImportError:
class Series(object):
pass
class DataFrame(object):
pass
"""sklearn"""
try:
from sklearn.base import BaseEstimator
from sklearn.base import RegressorMixin, ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import deprecated
try:
from sklearn.model_selection import StratifiedKFold
except ImportError:
from sklearn.cross_validation import StratifiedKFold
SKLEARN_INSTALLED = True
LGBMModelBase = BaseEstimator
LGBMRegressorBase = RegressorMixin
LGBMClassifierBase = ClassifierMixin
LGBMLabelEncoder = LabelEncoder
LGBMDeprecated = deprecated
LGBMStratifiedKFold = StratifiedKFold
except ImportError:
SKLEARN_INSTALLED = False
LGBMModelBase = object
LGBMClassifierBase = object
LGBMRegressorBase = object
LGBMLabelEncoder = None
LGBMDeprecated = None
LGBMStratifiedKFold = None
......@@ -9,7 +9,9 @@ from operator import attrgetter
import numpy as np
from . import callback
from .basic import Booster, Dataset, LightGBMError, _InnerPredictor, is_str
from .basic import Booster, Dataset, LightGBMError, _InnerPredictor
from .compat import (SKLEARN_INSTALLED, LGBMStratifiedKFold, integer_types,
range_, string_type)
def train(params, train_set, num_boost_round=100,
......@@ -85,7 +87,7 @@ def train(params, train_set, num_boost_round=100,
booster : a trained booster model
"""
"""create predictor first"""
if is_str(init_model):
if isinstance(init_model, string_type):
predictor = _InnerPredictor(model_file=init_model)
elif isinstance(init_model, Booster):
predictor = init_model._to_predictor()
......@@ -108,7 +110,7 @@ def train(params, train_set, num_boost_round=100,
if valid_sets is not None:
if isinstance(valid_sets, Dataset):
valid_sets = [valid_sets]
if isinstance(valid_names, str):
if isinstance(valid_names, string_type):
valid_names = [valid_names]
for i, valid_data in enumerate(valid_sets):
"""reduce cost for prediction training data"""
......@@ -138,7 +140,7 @@ def train(params, train_set, num_boost_round=100,
# Most of legacy advanced options becomes callbacks
if verbose_eval is True:
callbacks.add(callback.print_evaluation())
elif isinstance(verbose_eval, int):
elif isinstance(verbose_eval, integer_types):
callbacks.add(callback.print_evaluation(verbose_eval))
if early_stopping_rounds is not None:
......@@ -163,7 +165,7 @@ def train(params, train_set, num_boost_round=100,
booster.add_valid(valid_set, name_valid_set)
"""start training"""
for i in range(init_iteration, init_iteration + num_boost_round):
for i in range_(init_iteration, init_iteration + num_boost_round):
for cb in callbacks_before_iter:
cb(callback.CallbackEnv(model=booster,
params=params,
......@@ -217,25 +219,14 @@ class CVBooster(object):
return handlerFunction
# Optional StratifiedKFold import: the class moved from
# sklearn.cross_validation to sklearn.model_selection in sklearn 0.18,
# so try the new location first, then the old one.
try:
    from sklearn.model_selection import StratifiedKFold
    SKLEARN_StratifiedKFold = True
except ImportError:
    try:
        from sklearn.cross_validation import StratifiedKFold
        SKLEARN_StratifiedKFold = True
    except ImportError:
        # scikit-learn not installed at all; stratified cv is unavailable.
        SKLEARN_StratifiedKFold = False
def _make_n_folds(full_data, nfold, params, seed, fpreproc=None, stratified=False, shuffle=True):
"""
Make an n-fold list of Booster from random indices.
"""
np.random.seed(seed)
if stratified:
if SKLEARN_StratifiedKFold:
sfk = StratifiedKFold(n_splits=nfold, shuffle=shuffle, random_state=seed)
if SKLEARN_INSTALLED:
sfk = LGBMStratifiedKFold(n_splits=nfold, shuffle=shuffle, random_state=seed)
idset = [x[1] for x in sfk.split(X=full_data.get_label(), y=full_data.get_label())]
else:
raise LightGBMError('Scikit-learn is required for stratified cv')
......@@ -244,11 +235,11 @@ def _make_n_folds(full_data, nfold, params, seed, fpreproc=None, stratified=Fals
if shuffle:
randidx = np.random.permutation(full_data.num_data())
kstep = int(len(randidx) / nfold)
idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range_(nfold)]
ret = CVBooster()
for k in range(nfold):
train_set = full_data.subset(np.concatenate([idset[i] for i in range(nfold) if k != i]))
for k in range_(nfold):
train_set = full_data.subset(np.concatenate([idset[i] for i in range_(nfold) if k != i]))
valid_set = full_data.subset(idset[k])
# run preprocessing on the data set if needed
if fpreproc is not None:
......@@ -341,7 +332,7 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
if not isinstance(train_set, Dataset):
raise TypeError("Traninig only accepts Dataset object")
if is_str(init_model):
if isinstance(init_model, string_type):
predictor = _InnerPredictor(model_file=init_model)
elif isinstance(init_model, Booster):
predictor = init_model._to_predictor()
......@@ -354,7 +345,7 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
if metrics:
params.setdefault('metric', [])
if is_str(metrics):
if isinstance(metrics, string_type):
params['metric'].append(metrics)
else:
params['metric'].extend(metrics)
......@@ -373,7 +364,7 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
callbacks.add(callback.early_stopping(early_stopping_rounds, verbose=False))
if verbose_eval is True:
callbacks.add(callback.print_evaluation(show_stdv=show_stdv))
elif isinstance(verbose_eval, int):
elif isinstance(verbose_eval, integer_types):
callbacks.add(callback.print_evaluation(verbose_eval, show_stdv=show_stdv))
callbacks_before_iter = {cb for cb in callbacks if getattr(cb, 'before_iteration', False)}
......@@ -381,7 +372,7 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
callbacks_before_iter = sorted(callbacks_before_iter, key=attrgetter('order'))
callbacks_after_iter = sorted(callbacks_after_iter, key=attrgetter('order'))
for i in range(num_boost_round):
for i in range_(num_boost_round):
for cb in callbacks_before_iter:
cb(callback.CallbackEnv(model=cvfolds,
params=params,
......
......@@ -3,38 +3,14 @@
"""Scikit-Learn Wrapper interface for LightGBM."""
from __future__ import absolute_import
import inspect
import numpy as np
from .basic import IS_PY3, Dataset, LightGBMError
from .basic import Dataset, LightGBMError
from .compat import (SKLEARN_INSTALLED, LGBMClassifierBase, LGBMDeprecated,
LGBMLabelEncoder, LGBMModelBase, LGBMRegressorBase, argc_,
range_)
from .engine import train
'''sklearn'''
# Optional scikit-learn dependency for the sklearn wrapper API: alias its
# estimator bases under LGBM* names, with harmless fallbacks when it is
# not installed.
try:
    from sklearn.base import BaseEstimator
    from sklearn.base import RegressorMixin, ClassifierMixin
    from sklearn.preprocessing import LabelEncoder
    from sklearn.utils import deprecated
    SKLEARN_INSTALLED = True
    LGBMModelBase = BaseEstimator
    LGBMRegressorBase = RegressorMixin
    LGBMClassifierBase = ClassifierMixin
    LGBMLabelEncoder = LabelEncoder
except ImportError:
    # Fall back to plain ``object`` bases so the classes below still define.
    SKLEARN_INSTALLED = False
    LGBMModelBase = object
    LGBMClassifierBase = object
    LGBMRegressorBase = object
    LGBMLabelEncoder = None
def _argc(func):
    """Return the number of arguments ``func`` accepts."""
    # Python 3 replaced inspect.getargspec with inspect.signature.
    if IS_PY3:
        params = inspect.signature(func).parameters
    else:
        params = inspect.getargspec(func).args
    return len(params)
def _objective_function_wrapper(func):
"""Decorate an objective function
......@@ -67,7 +43,7 @@ def _objective_function_wrapper(func):
def inner(preds, dataset):
"""internal function"""
labels = dataset.get_label()
argc = _argc(func)
argc = argc_(func)
if argc == 2:
grad, hess = func(labels, preds)
elif argc == 3:
......@@ -86,8 +62,8 @@ def _objective_function_wrapper(func):
num_class = len(grad) // num_data
if num_class * num_data != len(grad):
raise ValueError("Length of grad and hess should equal to num_class * num_data")
for k in range(num_class):
for i in range(num_data):
for k in range_(num_class):
for i in range_(num_data):
idx = k * num_data + i
grad[idx] *= weight[i]
hess[idx] *= weight[i]
......@@ -132,7 +108,7 @@ def _eval_function_wrapper(func):
def inner(preds, dataset):
"""internal function"""
labels = dataset.get_label()
argc = _argc(func)
argc = argc_(func)
if argc == 2:
return func(labels, preds)
elif argc == 3:
......@@ -490,11 +466,11 @@ class LGBMModel(LGBMModelBase):
importace_array = self.booster_.feature_importance().astype(np.float32)
return importace_array / importace_array.sum()
@deprecated('Use attribute booster_ instead.')
@LGBMDeprecated('Use attribute booster_ instead.')
def booster(self):
return self.booster_
@deprecated('Use attribute feature_importance_ instead.')
@LGBMDeprecated('Use attribute feature_importance_ instead.')
def feature_importance(self):
return self.feature_importance_
......@@ -695,7 +671,7 @@ class LGBMRanker(LGBMModel):
raise ValueError("Eval_group cannot be None when eval_set is not None")
elif len(eval_group) != len(eval_set):
raise ValueError("Length of eval_group should equal to eval_set")
elif (isinstance(eval_group, dict) and any(i not in eval_group or eval_group[i] is None for i in range(len(eval_group)))) \
elif (isinstance(eval_group, dict) and any(i not in eval_group or eval_group[i] is None for i in range_(len(eval_group)))) \
or (isinstance(eval_group, list) and any(group is None for group in eval_group)):
raise ValueError("Should set group for all eval dataset for ranking task; if you use dict, the index should start from 0")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment