Unverified commit 45ac271b, authored by Nikita Titov and committed by GitHub
Browse files

[python] replace numpy.zeros with numpy.empty for the speedup (#4410)

parent db3915c2
...@@ -780,7 +780,7 @@ class _InnerPredictor: ...@@ -780,7 +780,7 @@ class _InnerPredictor:
ptr_data, type_ptr_data, _ = c_float_array(data) ptr_data, type_ptr_data, _ = c_float_array(data)
n_preds = self.__get_num_preds(start_iteration, num_iteration, mat.shape[0], predict_type) n_preds = self.__get_num_preds(start_iteration, num_iteration, mat.shape[0], predict_type)
if preds is None: if preds is None:
preds = np.zeros(n_preds, dtype=np.float64) preds = np.empty(n_preds, dtype=np.float64)
elif len(preds.shape) != 1 or len(preds) != n_preds: elif len(preds.shape) != 1 or len(preds) != n_preds:
raise ValueError("Wrong length of pre-allocated predict array") raise ValueError("Wrong length of pre-allocated predict array")
out_num_preds = ctypes.c_int64(0) out_num_preds = ctypes.c_int64(0)
...@@ -807,7 +807,7 @@ class _InnerPredictor: ...@@ -807,7 +807,7 @@ class _InnerPredictor:
# __get_num_preds() cannot work with nrow > MAX_INT32, so calculate overall number of predictions piecemeal # __get_num_preds() cannot work with nrow > MAX_INT32, so calculate overall number of predictions piecemeal
n_preds = [self.__get_num_preds(start_iteration, num_iteration, i, predict_type) for i in np.diff([0] + list(sections) + [nrow])] n_preds = [self.__get_num_preds(start_iteration, num_iteration, i, predict_type) for i in np.diff([0] + list(sections) + [nrow])]
n_preds_sections = np.array([0] + n_preds, dtype=np.intp).cumsum() n_preds_sections = np.array([0] + n_preds, dtype=np.intp).cumsum()
preds = np.zeros(sum(n_preds), dtype=np.float64) preds = np.empty(sum(n_preds), dtype=np.float64)
for chunk, (start_idx_pred, end_idx_pred) in zip(np.array_split(mat, sections), for chunk, (start_idx_pred, end_idx_pred) in zip(np.array_split(mat, sections),
zip(n_preds_sections, n_preds_sections[1:])): zip(n_preds_sections, n_preds_sections[1:])):
# avoid memory consumption by arrays concatenation operations # avoid memory consumption by arrays concatenation operations
...@@ -868,7 +868,7 @@ class _InnerPredictor: ...@@ -868,7 +868,7 @@ class _InnerPredictor:
nrow = len(csr.indptr) - 1 nrow = len(csr.indptr) - 1
n_preds = self.__get_num_preds(start_iteration, num_iteration, nrow, predict_type) n_preds = self.__get_num_preds(start_iteration, num_iteration, nrow, predict_type)
if preds is None: if preds is None:
preds = np.zeros(n_preds, dtype=np.float64) preds = np.empty(n_preds, dtype=np.float64)
elif len(preds.shape) != 1 or len(preds) != n_preds: elif len(preds.shape) != 1 or len(preds) != n_preds:
raise ValueError("Wrong length of pre-allocated predict array") raise ValueError("Wrong length of pre-allocated predict array")
out_num_preds = ctypes.c_int64(0) out_num_preds = ctypes.c_int64(0)
...@@ -913,7 +913,7 @@ class _InnerPredictor: ...@@ -913,7 +913,7 @@ class _InnerPredictor:
out_ptr_data = ctypes.POINTER(ctypes.c_float)() out_ptr_data = ctypes.POINTER(ctypes.c_float)()
else: else:
out_ptr_data = ctypes.POINTER(ctypes.c_double)() out_ptr_data = ctypes.POINTER(ctypes.c_double)()
out_shape = np.zeros(2, dtype=np.int64) out_shape = np.empty(2, dtype=np.int64)
_safe_call(_LIB.LGBM_BoosterPredictSparseOutput( _safe_call(_LIB.LGBM_BoosterPredictSparseOutput(
self.handle, self.handle,
ptr_indptr, ptr_indptr,
...@@ -946,7 +946,7 @@ class _InnerPredictor: ...@@ -946,7 +946,7 @@ class _InnerPredictor:
# __get_num_preds() cannot work with nrow > MAX_INT32, so calculate overall number of predictions piecemeal # __get_num_preds() cannot work with nrow > MAX_INT32, so calculate overall number of predictions piecemeal
n_preds = [self.__get_num_preds(start_iteration, num_iteration, i, predict_type) for i in np.diff(sections)] n_preds = [self.__get_num_preds(start_iteration, num_iteration, i, predict_type) for i in np.diff(sections)]
n_preds_sections = np.array([0] + n_preds, dtype=np.intp).cumsum() n_preds_sections = np.array([0] + n_preds, dtype=np.intp).cumsum()
preds = np.zeros(sum(n_preds), dtype=np.float64) preds = np.empty(sum(n_preds), dtype=np.float64)
for (start_idx, end_idx), (start_idx_pred, end_idx_pred) in zip(zip(sections, sections[1:]), for (start_idx, end_idx), (start_idx_pred, end_idx_pred) in zip(zip(sections, sections[1:]),
zip(n_preds_sections, n_preds_sections[1:])): zip(n_preds_sections, n_preds_sections[1:])):
# avoid memory consumption by arrays concatenation operations # avoid memory consumption by arrays concatenation operations
...@@ -971,7 +971,7 @@ class _InnerPredictor: ...@@ -971,7 +971,7 @@ class _InnerPredictor:
out_ptr_data = ctypes.POINTER(ctypes.c_float)() out_ptr_data = ctypes.POINTER(ctypes.c_float)()
else: else:
out_ptr_data = ctypes.POINTER(ctypes.c_double)() out_ptr_data = ctypes.POINTER(ctypes.c_double)()
out_shape = np.zeros(2, dtype=np.int64) out_shape = np.empty(2, dtype=np.int64)
_safe_call(_LIB.LGBM_BoosterPredictSparseOutput( _safe_call(_LIB.LGBM_BoosterPredictSparseOutput(
self.handle, self.handle,
ptr_indptr, ptr_indptr,
...@@ -1002,7 +1002,7 @@ class _InnerPredictor: ...@@ -1002,7 +1002,7 @@ class _InnerPredictor:
if predict_type == C_API_PREDICT_CONTRIB: if predict_type == C_API_PREDICT_CONTRIB:
return inner_predict_sparse(csc, start_iteration, num_iteration, predict_type) return inner_predict_sparse(csc, start_iteration, num_iteration, predict_type)
n_preds = self.__get_num_preds(start_iteration, num_iteration, nrow, predict_type) n_preds = self.__get_num_preds(start_iteration, num_iteration, nrow, predict_type)
preds = np.zeros(n_preds, dtype=np.float64) preds = np.empty(n_preds, dtype=np.float64)
out_num_preds = ctypes.c_int64(0) out_num_preds = ctypes.c_int64(0)
ptr_indptr, type_ptr_indptr, __ = c_int_array(csc.indptr) ptr_indptr, type_ptr_indptr, __ = c_int_array(csc.indptr)
...@@ -1176,7 +1176,7 @@ class Dataset: ...@@ -1176,7 +1176,7 @@ class Dataset:
if used_indices is not None: if used_indices is not None:
assert not self.need_slice assert not self.need_slice
if isinstance(data, str): if isinstance(data, str):
sub_init_score = np.zeros(num_data * predictor.num_class, dtype=np.float32) sub_init_score = np.empty(num_data * predictor.num_class, dtype=np.float32)
assert num_data == len(used_indices) assert num_data == len(used_indices)
for i in range(len(used_indices)): for i in range(len(used_indices)):
for j in range(predictor.num_class): for j in range(predictor.num_class):
...@@ -1184,7 +1184,7 @@ class Dataset: ...@@ -1184,7 +1184,7 @@ class Dataset:
init_score = sub_init_score init_score = sub_init_score
if predictor.num_class > 1: if predictor.num_class > 1:
# need to regroup init_score # need to regroup init_score
new_init_score = np.zeros(init_score.size, dtype=np.float32) new_init_score = np.empty(init_score.size, dtype=np.float32)
for i in range(num_data): for i in range(num_data):
for j in range(predictor.num_class): for j in range(predictor.num_class):
new_init_score[j * num_data + i] = init_score[i * predictor.num_class + j] new_init_score[j * num_data + i] = init_score[i * predictor.num_class + j]
...@@ -1320,7 +1320,7 @@ class Dataset: ...@@ -1320,7 +1320,7 @@ class Dataset:
def __init_from_list_np2d(self, mats, params_str, ref_dataset): def __init_from_list_np2d(self, mats, params_str, ref_dataset):
"""Initialize data from a list of 2-D numpy matrices.""" """Initialize data from a list of 2-D numpy matrices."""
ncol = mats[0].shape[1] ncol = mats[0].shape[1]
nrow = np.zeros((len(mats),), np.int32) nrow = np.empty((len(mats),), np.int32)
if mats[0].dtype == np.float64: if mats[0].dtype == np.float64:
ptr_data = (ctypes.POINTER(ctypes.c_double) * len(mats))() ptr_data = (ctypes.POINTER(ctypes.c_double) * len(mats))()
else: else:
...@@ -3310,7 +3310,7 @@ class Booster: ...@@ -3310,7 +3310,7 @@ class Booster:
if iteration is None: if iteration is None:
iteration = self.best_iteration iteration = self.best_iteration
importance_type_int = FEATURE_IMPORTANCE_TYPE_MAPPER[importance_type] importance_type_int = FEATURE_IMPORTANCE_TYPE_MAPPER[importance_type]
result = np.zeros(self.num_feature(), dtype=np.float64) result = np.empty(self.num_feature(), dtype=np.float64)
_safe_call(_LIB.LGBM_BoosterFeatureImportance( _safe_call(_LIB.LGBM_BoosterFeatureImportance(
self.handle, self.handle,
ctypes.c_int(iteration), ctypes.c_int(iteration),
...@@ -3397,7 +3397,7 @@ class Booster: ...@@ -3397,7 +3397,7 @@ class Booster:
self.__get_eval_info() self.__get_eval_info()
ret = [] ret = []
if self.__num_inner_eval > 0: if self.__num_inner_eval > 0:
result = np.zeros(self.__num_inner_eval, dtype=np.float64) result = np.empty(self.__num_inner_eval, dtype=np.float64)
tmp_out_len = ctypes.c_int(0) tmp_out_len = ctypes.c_int(0)
_safe_call(_LIB.LGBM_BoosterGetEval( _safe_call(_LIB.LGBM_BoosterGetEval(
self.handle, self.handle,
...@@ -3437,7 +3437,7 @@ class Booster: ...@@ -3437,7 +3437,7 @@ class Booster:
n_preds = self.train_set.num_data() * self.__num_class n_preds = self.train_set.num_data() * self.__num_class
else: else:
n_preds = self.valid_sets[data_idx - 1].num_data() * self.__num_class n_preds = self.valid_sets[data_idx - 1].num_data() * self.__num_class
self.__inner_predict_buffer[data_idx] = np.zeros(n_preds, dtype=np.float64) self.__inner_predict_buffer[data_idx] = np.empty(n_preds, dtype=np.float64)
# avoid to predict many time in one iteration # avoid to predict many time in one iteration
if not self.__is_predicted_cur_iter[data_idx]: if not self.__is_predicted_cur_iter[data_idx]:
tmp_out_len = ctypes.c_int64(0) tmp_out_len = ctypes.c_int64(0)
......
...@@ -333,7 +333,7 @@ def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratifi ...@@ -333,7 +333,7 @@ def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratifi
flatted_group = np.repeat(range(len(group_info)), repeats=group_info) flatted_group = np.repeat(range(len(group_info)), repeats=group_info)
else: else:
flatted_group = np.zeros(num_data, dtype=np.int32) flatted_group = np.zeros(num_data, dtype=np.int32)
folds = folds.split(X=np.zeros(num_data), y=full_data.get_label(), groups=flatted_group) folds = folds.split(X=np.empty(num_data), y=full_data.get_label(), groups=flatted_group)
else: else:
if any(params.get(obj_alias, "") in {"lambdarank", "rank_xendcg", "xendcg", if any(params.get(obj_alias, "") in {"lambdarank", "rank_xendcg", "xendcg",
"xe_ndcg", "xe_ndcg_mart", "xendcg_mart"} "xe_ndcg", "xe_ndcg_mart", "xendcg_mart"}
...@@ -344,12 +344,12 @@ def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratifi ...@@ -344,12 +344,12 @@ def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratifi
group_info = np.array(full_data.get_group(), dtype=np.int32, copy=False) group_info = np.array(full_data.get_group(), dtype=np.int32, copy=False)
flatted_group = np.repeat(range(len(group_info)), repeats=group_info) flatted_group = np.repeat(range(len(group_info)), repeats=group_info)
group_kfold = _LGBMGroupKFold(n_splits=nfold) group_kfold = _LGBMGroupKFold(n_splits=nfold)
folds = group_kfold.split(X=np.zeros(num_data), groups=flatted_group) folds = group_kfold.split(X=np.empty(num_data), groups=flatted_group)
elif stratified: elif stratified:
if not SKLEARN_INSTALLED: if not SKLEARN_INSTALLED:
raise LightGBMError('scikit-learn is required for stratified cv') raise LightGBMError('scikit-learn is required for stratified cv')
skf = _LGBMStratifiedKFold(n_splits=nfold, shuffle=shuffle, random_state=seed) skf = _LGBMStratifiedKFold(n_splits=nfold, shuffle=shuffle, random_state=seed)
folds = skf.split(X=np.zeros(num_data), y=full_data.get_label()) folds = skf.split(X=np.empty(num_data), y=full_data.get_label())
else: else:
if shuffle: if shuffle:
randidx = np.random.RandomState(seed).permutation(num_data) randidx = np.random.RandomState(seed).permutation(num_data)
......
...@@ -268,7 +268,7 @@ def test_booster(): ...@@ -268,7 +268,7 @@ def test_booster():
for line in inp.readlines(): for line in inp.readlines():
data.append([float(x) for x in line.split('\t')[1:]]) data.append([float(x) for x in line.split('\t')[1:]])
mat = np.array(data, dtype=np.float64) mat = np.array(data, dtype=np.float64)
preb = np.zeros(mat.shape[0], dtype=np.float64) preb = np.empty(mat.shape[0], dtype=np.float64)
num_preb = ctypes.c_int64(0) num_preb = ctypes.c_int64(0)
data = np.array(mat.reshape(mat.size), dtype=np.float64, copy=False) data = np.array(mat.reshape(mat.size), dtype=np.float64, copy=False)
LIB.LGBM_BoosterPredictForMat( LIB.LGBM_BoosterPredictForMat(
......
...@@ -1441,9 +1441,8 @@ def test_max_bin_by_feature(): ...@@ -1441,9 +1441,8 @@ def test_max_bin_by_feature():
def test_small_max_bin(): def test_small_max_bin():
np.random.seed(0) np.random.seed(0)
y = np.random.choice([0, 1], 100) y = np.random.choice([0, 1], 100)
x = np.zeros((100, 1)) x = np.ones((100, 1))
x[:30, 0] = -1 x[:30, 0] = -1
x[30:60, 0] = 1
x[60:, 0] = 2 x[60:, 0] = 2
params = {'objective': 'binary', params = {'objective': 'binary',
'seed': 0, 'seed': 0,
...@@ -2259,7 +2258,7 @@ def test_node_level_subcol(): ...@@ -2259,7 +2258,7 @@ def test_node_level_subcol():
def test_forced_bins(): def test_forced_bins():
x = np.zeros((100, 2)) x = np.empty((100, 2))
x[:, 0] = np.arange(0, 1, 0.01) x[:, 0] = np.arange(0, 1, 0.01)
x[:, 1] = -np.arange(0, 1, 0.01) x[:, 1] = -np.arange(0, 1, 0.01)
y = np.arange(0, 1, 0.01) y = np.arange(0, 1, 0.01)
...@@ -2275,7 +2274,6 @@ def test_forced_bins(): ...@@ -2275,7 +2274,6 @@ def test_forced_bins():
est = lgb.train(params, lgb_x, num_boost_round=20) est = lgb.train(params, lgb_x, num_boost_round=20)
new_x = np.zeros((3, x.shape[1])) new_x = np.zeros((3, x.shape[1]))
new_x[:, 0] = [0.31, 0.37, 0.41] new_x[:, 0] = [0.31, 0.37, 0.41]
new_x[:, 1] = [0, 0, 0]
predicted = est.predict(new_x) predicted = est.predict(new_x)
assert len(np.unique(predicted)) == 3 assert len(np.unique(predicted)) == 3
new_x[:, 0] = [0, 0, 0] new_x[:, 0] = [0, 0, 0]
...@@ -2300,7 +2298,7 @@ def test_forced_bins(): ...@@ -2300,7 +2298,7 @@ def test_forced_bins():
def test_binning_same_sign(): def test_binning_same_sign():
# test that binning works properly for features with only positive or only negative values # test that binning works properly for features with only positive or only negative values
x = np.zeros((99, 2)) x = np.empty((99, 2))
x[:, 0] = np.arange(0.01, 1, 0.01) x[:, 0] = np.arange(0.01, 1, 0.01)
x[:, 1] = -np.arange(0.01, 1, 0.01) x[:, 1] = -np.arange(0.01, 1, 0.01)
y = np.arange(0.01, 1, 0.01) y = np.arange(0.01, 1, 0.01)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment