Unverified Commit d064019f authored by Nikita Titov's avatar Nikita Titov Committed by GitHub
Browse files

[python] avoid data copy where possible (#2383)

* avoid copy where possible

* use precise type for importance type

* removed pointless code

* simplify sparse pandas Series conversion

* more memory savings

* always force type conversion for 1-D arrays

* one more copy=False
parent 7a8c4e52
...@@ -80,10 +80,7 @@ def list_to_1d_numpy(data, dtype=np.float32, name='list'): ...@@ -80,10 +80,7 @@ def list_to_1d_numpy(data, dtype=np.float32, name='list'):
elif isinstance(data, Series): elif isinstance(data, Series):
if _get_bad_pandas_dtypes([data.dtypes]): if _get_bad_pandas_dtypes([data.dtypes]):
raise ValueError('Series.dtypes must be int, float or bool') raise ValueError('Series.dtypes must be int, float or bool')
if hasattr(data.values, 'values'): # SparseArray return np.array(data, dtype=dtype, copy=False) # SparseArray should be supported as well
return data.values.values.astype(dtype)
else:
return data.values.astype(dtype)
else: else:
raise TypeError("Wrong type({0}) for {1}.\n" raise TypeError("Wrong type({0}) for {1}.\n"
"It should be list, numpy 1-D array or pandas Series".format(type(data).__name__, name)) "It should be list, numpy 1-D array or pandas Series".format(type(data).__name__, name))
...@@ -296,7 +293,9 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica ...@@ -296,7 +293,9 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica
raise ValueError("DataFrame.dtypes for data must be int, float or bool.\n" raise ValueError("DataFrame.dtypes for data must be int, float or bool.\n"
"Did not expect the data types in the following fields: " "Did not expect the data types in the following fields: "
+ ', '.join(data.columns[bad_indices])) + ', '.join(data.columns[bad_indices]))
data = data.values.astype('float') data = data.values
if data.dtype != np.float32 and data.dtype != np.float64:
data = data.astype(np.float32)
else: else:
if feature_name == 'auto': if feature_name == 'auto':
feature_name = None feature_name = None
...@@ -311,7 +310,7 @@ def _label_from_pandas(label): ...@@ -311,7 +310,7 @@ def _label_from_pandas(label):
raise ValueError('DataFrame for label cannot have multiple columns') raise ValueError('DataFrame for label cannot have multiple columns')
if _get_bad_pandas_dtypes(label.dtypes): if _get_bad_pandas_dtypes(label.dtypes):
raise ValueError('DataFrame.dtypes for label must be int, float or bool') raise ValueError('DataFrame.dtypes for label must be int, float or bool')
label = label.values.astype('float').flatten() label = np.ravel(label.values.astype(np.float32, copy=False))
return label return label
...@@ -534,8 +533,7 @@ class _InnerPredictor(object): ...@@ -534,8 +533,7 @@ class _InnerPredictor(object):
def inner_predict(mat, num_iteration, predict_type, preds=None): def inner_predict(mat, num_iteration, predict_type, preds=None):
if mat.dtype == np.float32 or mat.dtype == np.float64: if mat.dtype == np.float32 or mat.dtype == np.float64:
data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False) data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
else: else: # change non-float data to float data, need to copy
"""change non-float data to float data, need to copy"""
data = np.array(mat.reshape(mat.size), dtype=np.float32) data = np.array(mat.reshape(mat.size), dtype=np.float32)
ptr_data, type_ptr_data, _ = c_float_array(data) ptr_data, type_ptr_data, _ = c_float_array(data)
n_preds = self.__get_num_preds(num_iteration, mat.shape[0], predict_type) n_preds = self.__get_num_preds(num_iteration, mat.shape[0], predict_type)
...@@ -876,8 +874,7 @@ class Dataset(object): ...@@ -876,8 +874,7 @@ class Dataset(object):
self.handle = ctypes.c_void_p() self.handle = ctypes.c_void_p()
if mat.dtype == np.float32 or mat.dtype == np.float64: if mat.dtype == np.float32 or mat.dtype == np.float64:
data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False) data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
else: else: # change non-float data to float data, need to copy
# change non-float data to float data, need to copy
data = np.array(mat.reshape(mat.size), dtype=np.float32) data = np.array(mat.reshape(mat.size), dtype=np.float32)
ptr_data, type_ptr_data, _ = c_float_array(data) ptr_data, type_ptr_data, _ = c_float_array(data)
...@@ -915,8 +912,7 @@ class Dataset(object): ...@@ -915,8 +912,7 @@ class Dataset(object):
if mat.dtype == np.float32 or mat.dtype == np.float64: if mat.dtype == np.float32 or mat.dtype == np.float64:
mats[i] = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False) mats[i] = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
else: else: # change non-float data to float data, need to copy
# change non-float data to float data, need to copy
mats[i] = np.array(mat.reshape(mat.size), dtype=np.float32) mats[i] = np.array(mat.reshape(mat.size), dtype=np.float32)
chunk_ptr_data, chunk_type_ptr_data, holder = c_float_array(mats[i]) chunk_ptr_data, chunk_type_ptr_data, holder = c_float_array(mats[i])
...@@ -1012,7 +1008,7 @@ class Dataset(object): ...@@ -1012,7 +1008,7 @@ class Dataset(object):
used_indices = list_to_1d_numpy(self.used_indices, np.int32, name='used_indices') used_indices = list_to_1d_numpy(self.used_indices, np.int32, name='used_indices')
assert used_indices.flags.c_contiguous assert used_indices.flags.c_contiguous
if self.reference.group is not None: if self.reference.group is not None:
group_info = np.array(self.reference.group).astype(int) group_info = np.array(self.reference.group).astype(np.int32, copy=False)
_, self.group = np.unique(np.repeat(range_(len(group_info)), repeats=group_info)[self.used_indices], _, self.group = np.unique(np.repeat(range_(len(group_info)), repeats=group_info)[self.used_indices],
return_counts=True) return_counts=True)
self.handle = ctypes.c_void_p() self.handle = ctypes.c_void_p()
...@@ -2512,7 +2508,7 @@ class Booster(object): ...@@ -2512,7 +2508,7 @@ class Booster(object):
ctypes.c_int(importance_type_int), ctypes.c_int(importance_type_int),
result.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))) result.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
if importance_type_int == 0: if importance_type_int == 0:
return result.astype(int) return result.astype(np.int32)
else: else:
return result return result
......
...@@ -308,17 +308,17 @@ def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratifi ...@@ -308,17 +308,17 @@ def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratifi
if hasattr(folds, 'split'): if hasattr(folds, 'split'):
group_info = full_data.get_group() group_info = full_data.get_group()
if group_info is not None: if group_info is not None:
group_info = np.array(group_info, dtype=int) group_info = np.array(group_info, dtype=np.int32, copy=False)
flatted_group = np.repeat(range_(len(group_info)), repeats=group_info) flatted_group = np.repeat(range_(len(group_info)), repeats=group_info)
else: else:
flatted_group = np.zeros(num_data, dtype=int) flatted_group = np.zeros(num_data, dtype=np.int32)
folds = folds.split(X=np.zeros(num_data), y=full_data.get_label(), groups=flatted_group) folds = folds.split(X=np.zeros(num_data), y=full_data.get_label(), groups=flatted_group)
else: else:
if 'objective' in params and params['objective'] == 'lambdarank': if 'objective' in params and params['objective'] == 'lambdarank':
if not SKLEARN_INSTALLED: if not SKLEARN_INSTALLED:
raise LightGBMError('Scikit-learn is required for lambdarank cv.') raise LightGBMError('Scikit-learn is required for lambdarank cv.')
# lambdarank task, split according to groups # lambdarank task, split according to groups
group_info = np.array(full_data.get_group(), dtype=int) group_info = np.array(full_data.get_group(), dtype=np.int32, copy=False)
flatted_group = np.repeat(range_(len(group_info)), repeats=group_info) flatted_group = np.repeat(range_(len(group_info)), repeats=group_info)
group_kfold = _LGBMGroupKFold(n_splits=nfold) group_kfold = _LGBMGroupKFold(n_splits=nfold)
folds = group_kfold.split(X=np.zeros(num_data), groups=flatted_group) folds = group_kfold.split(X=np.zeros(num_data), groups=flatted_group)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment