Unverified Commit 0f0dd9d5 authored by José Morales's avatar José Morales Committed by GitHub
Browse files

[python-package] do not copy column-major numpy arrays when creating Dataset...


[python-package] do not copy column-major numpy arrays when creating Dataset from list of arrays (#6773)
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
parent 335d1688
...@@ -422,7 +422,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMat(const void* data, ...@@ -422,7 +422,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMat(const void* data,
* \param data_type Type of ``data`` pointer, can be ``C_API_DTYPE_FLOAT32`` or ``C_API_DTYPE_FLOAT64`` * \param data_type Type of ``data`` pointer, can be ``C_API_DTYPE_FLOAT32`` or ``C_API_DTYPE_FLOAT64``
* \param nrow Number of rows * \param nrow Number of rows
* \param ncol Number of columns * \param ncol Number of columns
* \param is_row_major 1 for row-major, 0 for column-major * \param is_row_major Pointer to the data layouts. 1 for row-major, 0 for column-major
* \param parameters Additional parameters * \param parameters Additional parameters
* \param reference Used to align bin mapper with other dataset, nullptr means isn't used * \param reference Used to align bin mapper with other dataset, nullptr means isn't used
* \param[out] out Created dataset * \param[out] out Created dataset
...@@ -433,7 +433,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMats(int32_t nmat, ...@@ -433,7 +433,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMats(int32_t nmat,
int data_type, int data_type,
int32_t* nrow, int32_t* nrow,
int32_t ncol, int32_t ncol,
int is_row_major, int* is_row_major,
const char* parameters, const char* parameters,
const DatasetHandle reference, const DatasetHandle reference,
DatasetHandle* out); DatasetHandle* out);
......
...@@ -2343,6 +2343,7 @@ class Dataset: ...@@ -2343,6 +2343,7 @@ class Dataset:
ptr_data = (ctypes.POINTER(ctypes.c_double) * len(mats))() ptr_data = (ctypes.POINTER(ctypes.c_double) * len(mats))()
else: else:
ptr_data = (ctypes.POINTER(ctypes.c_float) * len(mats))() ptr_data = (ctypes.POINTER(ctypes.c_float) * len(mats))()
layouts = (ctypes.c_int * len(mats))()
holders = [] holders = []
type_ptr_data = -1 type_ptr_data = -1
...@@ -2356,15 +2357,13 @@ class Dataset: ...@@ -2356,15 +2357,13 @@ class Dataset:
nrow[i] = mat.shape[0] nrow[i] = mat.shape[0]
if mat.dtype == np.float32 or mat.dtype == np.float64: mat, layout = _np2d_to_np1d(mat)
mats[i] = np.asarray(mat.reshape(mat.size), dtype=mat.dtype)
else: # change non-float data to float data, need to copy
mats[i] = np.array(mat.reshape(mat.size), dtype=np.float32)
chunk_ptr_data, chunk_type_ptr_data, holder = _c_float_array(mats[i]) chunk_ptr_data, chunk_type_ptr_data, holder = _c_float_array(mat)
if type_ptr_data != -1 and chunk_type_ptr_data != type_ptr_data: if type_ptr_data != -1 and chunk_type_ptr_data != type_ptr_data:
raise ValueError("Input chunks must have same type") raise ValueError("Input chunks must have same type")
ptr_data[i] = chunk_ptr_data ptr_data[i] = chunk_ptr_data
layouts[i] = layout
type_ptr_data = chunk_type_ptr_data type_ptr_data = chunk_type_ptr_data
holders.append(holder) holders.append(holder)
...@@ -2376,7 +2375,7 @@ class Dataset: ...@@ -2376,7 +2375,7 @@ class Dataset:
ctypes.c_int(type_ptr_data), ctypes.c_int(type_ptr_data),
nrow.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), nrow.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
ctypes.c_int32(ncol), ctypes.c_int32(ncol),
ctypes.c_int(_C_API_IS_ROW_MAJOR), layouts,
_c_str(params_str), _c_str(params_str),
ref_dataset, ref_dataset,
ctypes.byref(self._handle), ctypes.byref(self._handle),
......
...@@ -1309,7 +1309,7 @@ int LGBM_DatasetCreateFromMat(const void* data, ...@@ -1309,7 +1309,7 @@ int LGBM_DatasetCreateFromMat(const void* data,
data_type, data_type,
&nrow, &nrow,
ncol, ncol,
is_row_major, &is_row_major,
parameters, parameters,
reference, reference,
out); out);
...@@ -1320,7 +1320,7 @@ int LGBM_DatasetCreateFromMats(int32_t nmat, ...@@ -1320,7 +1320,7 @@ int LGBM_DatasetCreateFromMats(int32_t nmat,
int data_type, int data_type,
int32_t* nrow, int32_t* nrow,
int32_t ncol, int32_t ncol,
int is_row_major, int* is_row_major,
const char* parameters, const char* parameters,
const DatasetHandle reference, const DatasetHandle reference,
DatasetHandle* out) { DatasetHandle* out) {
...@@ -1337,7 +1337,7 @@ int LGBM_DatasetCreateFromMats(int32_t nmat, ...@@ -1337,7 +1337,7 @@ int LGBM_DatasetCreateFromMats(int32_t nmat,
std::vector<std::function<std::vector<double>(int row_idx)>> get_row_fun; std::vector<std::function<std::vector<double>(int row_idx)>> get_row_fun;
for (int j = 0; j < nmat; ++j) { for (int j = 0; j < nmat; ++j) {
get_row_fun.push_back(RowFunctionFromDenseMatric(data[j], nrow[j], ncol, data_type, is_row_major)); get_row_fun.push_back(RowFunctionFromDenseMatric(data[j], nrow[j], ncol, data_type, is_row_major[j]));
} }
if (reference == nullptr) { if (reference == nullptr) {
......
...@@ -983,3 +983,17 @@ def test_equal_datasets_from_row_major_and_col_major_data(tmp_path): ...@@ -983,3 +983,17 @@ def test_equal_datasets_from_row_major_and_col_major_data(tmp_path):
# check datasets are equal # check datasets are equal
assert filecmp.cmp(ds_row_path, ds_col_path) assert filecmp.cmp(ds_row_path, ds_col_path)
def test_equal_datasets_from_one_and_several_matrices_w_different_layouts(rng, tmp_path):
    # Build four (100, 2) matrices, alternating row-major ("C") and
    # column-major ("F") memory layouts, to exercise the per-matrix
    # layout handling in Dataset construction.
    orders = ("C", "F", "F", "C")
    matrices = [np.require(rng.random(size=(100, 2)), requirements=order) for order in orders]

    # Dataset built from the list of mixed-layout matrices.
    several_path = tmp_path / "several.txt"
    lgb.Dataset(matrices)._dump_text(several_path)

    # Dataset built from a single row-major matrix holding the same rows.
    one_path = tmp_path / "one.txt"
    lgb.Dataset(np.vstack(matrices))._dump_text(one_path)

    # Both constructions must produce byte-identical dumps.
    assert filecmp.cmp(one_path, several_path)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment