Unverified Commit 0f0dd9d5 authored by José Morales's avatar José Morales Committed by GitHub
Browse files

[python-package] do not copy column-major numpy arrays when creating Dataset from list of arrays (#6773)


[python-package] do not copy column-major numpy arrays when creating Dataset from list of arrays (#6773)
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
parent 335d1688
......@@ -422,7 +422,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMat(const void* data,
* \param data_type Type of ``data`` pointer, can be ``C_API_DTYPE_FLOAT32`` or ``C_API_DTYPE_FLOAT64``
* \param nrow Number of rows
* \param ncol Number of columns
* \param is_row_major 1 for row-major, 0 for column-major
* \param is_row_major Pointer to the data layouts. 1 for row-major, 0 for column-major
* \param parameters Additional parameters
* \param reference Used to align bin mapper with other dataset, nullptr means isn't used
* \param[out] out Created dataset
......@@ -433,7 +433,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMats(int32_t nmat,
int data_type,
int32_t* nrow,
int32_t ncol,
int is_row_major,
int* is_row_major,
const char* parameters,
const DatasetHandle reference,
DatasetHandle* out);
......
......@@ -2343,6 +2343,7 @@ class Dataset:
ptr_data = (ctypes.POINTER(ctypes.c_double) * len(mats))()
else:
ptr_data = (ctypes.POINTER(ctypes.c_float) * len(mats))()
layouts = (ctypes.c_int * len(mats))()
holders = []
type_ptr_data = -1
......@@ -2356,15 +2357,13 @@ class Dataset:
nrow[i] = mat.shape[0]
if mat.dtype == np.float32 or mat.dtype == np.float64:
mats[i] = np.asarray(mat.reshape(mat.size), dtype=mat.dtype)
else: # change non-float data to float data, need to copy
mats[i] = np.array(mat.reshape(mat.size), dtype=np.float32)
mat, layout = _np2d_to_np1d(mat)
chunk_ptr_data, chunk_type_ptr_data, holder = _c_float_array(mats[i])
chunk_ptr_data, chunk_type_ptr_data, holder = _c_float_array(mat)
if type_ptr_data != -1 and chunk_type_ptr_data != type_ptr_data:
raise ValueError("Input chunks must have same type")
ptr_data[i] = chunk_ptr_data
layouts[i] = layout
type_ptr_data = chunk_type_ptr_data
holders.append(holder)
......@@ -2376,7 +2375,7 @@ class Dataset:
ctypes.c_int(type_ptr_data),
nrow.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
ctypes.c_int32(ncol),
ctypes.c_int(_C_API_IS_ROW_MAJOR),
layouts,
_c_str(params_str),
ref_dataset,
ctypes.byref(self._handle),
......
......@@ -1309,7 +1309,7 @@ int LGBM_DatasetCreateFromMat(const void* data,
data_type,
&nrow,
ncol,
is_row_major,
&is_row_major,
parameters,
reference,
out);
......@@ -1320,7 +1320,7 @@ int LGBM_DatasetCreateFromMats(int32_t nmat,
int data_type,
int32_t* nrow,
int32_t ncol,
int is_row_major,
int* is_row_major,
const char* parameters,
const DatasetHandle reference,
DatasetHandle* out) {
......@@ -1337,7 +1337,7 @@ int LGBM_DatasetCreateFromMats(int32_t nmat,
std::vector<std::function<std::vector<double>(int row_idx)>> get_row_fun;
for (int j = 0; j < nmat; ++j) {
get_row_fun.push_back(RowFunctionFromDenseMatric(data[j], nrow[j], ncol, data_type, is_row_major));
get_row_fun.push_back(RowFunctionFromDenseMatric(data[j], nrow[j], ncol, data_type, is_row_major[j]));
}
if (reference == nullptr) {
......
......@@ -983,3 +983,17 @@ def test_equal_datasets_from_row_major_and_col_major_data(tmp_path):
# check datasets are equal
assert filecmp.cmp(ds_row_path, ds_col_path)
def test_equal_datasets_from_one_and_several_matrices_w_different_layouts(rng, tmp_path):
    """A Dataset built from a list of mixed-layout arrays must equal one built from their vstack.

    Exercises the multi-matrix construction path with chunks that alternate
    between C (row-major) and F (column-major) memory layouts, then compares
    the text dumps byte-for-byte.
    """
    # Build chunks whose memory layouts alternate between row- and column-major.
    chunks = [np.require(rng.random(size=(100, 2)), requirements=layout) for layout in ("C", "F", "F", "C")]
    chunked_path = tmp_path / "several.txt"
    lgb.Dataset(chunks)._dump_text(chunked_path)
    # Stack the same chunks into one contiguous matrix and dump it too.
    stacked = np.vstack(chunks)
    stacked_path = tmp_path / "one.txt"
    lgb.Dataset(stacked)._dump_text(stacked_path)
    # The two dumps must be identical files.
    assert filecmp.cmp(stacked_path, chunked_path)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment