Commit 5df9584f authored by Fedor Korotkiy's avatar Fedor Korotkiy Committed by Guolin Ke
Browse files

Support creating Dataset from list of matrices (#1474)

parent 2e93cdab
...@@ -204,6 +204,27 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMat(const void* data, ...@@ -204,6 +204,27 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMat(const void* data,
const DatasetHandle reference, const DatasetHandle reference,
DatasetHandle* out); DatasetHandle* out);
/*!
* \brief create dataset from an array of dense matrices, stacked row-wise in order
* \param nmat number of matrices in the data array
* \param data array of nmat pointers, one per matrix's data space
* \param data_type type of data pointer, can be C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param nrow array of nmat entries, the number of rows of each matrix
* \param ncol number of columns (must be the same for every matrix)
* \param is_row_major 1 when each matrix is row-major, 0 when column-major
* \param parameters additional parameters
* \param reference used to align bin mapper with other dataset, nullptr means don't used
* \param out created dataset
* \return 0 when succeed, -1 when failure happens
*/
LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMats(int32_t nmat,
const void** data,
int data_type,
int32_t* nrow,
int32_t ncol,
int is_row_major,
const char* parameters,
const DatasetHandle reference,
DatasetHandle* out);
/*! /*!
* \brief Create subset of a data * \brief Create subset of a data
* \param handle handle of full dataset * \param handle handle of full dataset
......
...@@ -712,6 +712,8 @@ class Dataset(object): ...@@ -712,6 +712,8 @@ class Dataset(object):
self.__init_from_csc(data, params_str, ref_dataset) self.__init_from_csc(data, params_str, ref_dataset)
elif isinstance(data, np.ndarray): elif isinstance(data, np.ndarray):
self.__init_from_np2d(data, params_str, ref_dataset) self.__init_from_np2d(data, params_str, ref_dataset)
elif isinstance(data, list) and len(data) > 0 and all(isinstance(x, np.ndarray) for x in data):
self.__init_from_list_np2d(data, params_str, ref_dataset)
else: else:
try: try:
csr = scipy.sparse.csr_matrix(data) csr = scipy.sparse.csr_matrix(data)
...@@ -775,6 +777,54 @@ class Dataset(object): ...@@ -775,6 +777,54 @@ class Dataset(object):
ref_dataset, ref_dataset,
ctypes.byref(self.handle))) ctypes.byref(self.handle)))
def __init_from_list_np2d(self, mats, params_str, ref_dataset):
    """
    Initialize data from a list of 2-D numpy matrices.

    The matrices are concatenated row-wise on the native side, so they must
    all have the same number of columns and a floating-point-compatible dtype.

    Parameters
    ----------
    mats : list of 2-D numpy.ndarray
        Row chunks of the dataset; every chunk must have the same width.
    params_str : string
        Serialized dataset parameters.
    ref_dataset : handle or None
        Reference dataset used to align the bin mappers, or None.

    Raises
    ------
    ValueError
        If the list is empty, a chunk is not 2-D, chunk widths differ,
        or chunk dtypes mix float32 and float64.
    """
    if not mats:
        raise ValueError('Input list of numpy.ndarray must be non-empty')
    ncol = mats[0].shape[1]
    nrow = np.zeros((len(mats),), np.int32)
    # The pointer element type only has to match the (uniform) chunk dtype;
    # non-float chunks are converted to float32 below.
    if mats[0].dtype == np.float64:
        ptr_data = (ctypes.POINTER(ctypes.c_double) * len(mats))()
    else:
        ptr_data = (ctypes.POINTER(ctypes.c_float) * len(mats))()
    # Keep the flattened buffers alive locally for the duration of the C call
    # instead of mutating the caller's list in place.
    flat_mats = []
    holders = []
    type_ptr_data = None
    for i, mat in enumerate(mats):
        if len(mat.shape) != 2:
            raise ValueError('Input numpy.ndarray must be 2 dimensional')
        if mat.shape[1] != ncol:
            raise ValueError('Input arrays must have same number of columns')
        nrow[i] = mat.shape[0]
        if mat.dtype == np.float32 or mat.dtype == np.float64:
            flat = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
        else:
            # change non-float data to float data, need to copy
            flat = np.array(mat.reshape(mat.size), dtype=np.float32)
        flat_mats.append(flat)
        chunk_ptr_data, chunk_type_ptr_data, holder = c_float_array(flat)
        if type_ptr_data is not None and chunk_type_ptr_data != type_ptr_data:
            raise ValueError('Input chunks must have same type')
        ptr_data[i] = chunk_ptr_data
        type_ptr_data = chunk_type_ptr_data
        holders.append(holder)
    self.handle = ctypes.c_void_p()
    _safe_call(_LIB.LGBM_DatasetCreateFromMats(
        ctypes.c_int(len(mats)),
        ctypes.cast(ptr_data, ctypes.POINTER(ctypes.POINTER(ctypes.c_double))),
        ctypes.c_int(type_ptr_data),
        nrow.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
        ctypes.c_int(ncol),
        ctypes.c_int(C_API_IS_ROW_MAJOR),
        c_str(params_str),
        ref_dataset,
        ctypes.byref(self.handle)))
def __init_from_csr(self, csr, params_str, ref_dataset): def __init_from_csr(self, csr, params_str, ref_dataset):
""" """
Initialize data from a CSR matrix. Initialize data from a CSR matrix.
......
...@@ -475,6 +475,27 @@ int LGBM_DatasetCreateFromMat(const void* data, ...@@ -475,6 +475,27 @@ int LGBM_DatasetCreateFromMat(const void* data,
const char* parameters, const char* parameters,
const DatasetHandle reference, const DatasetHandle reference,
DatasetHandle* out) { DatasetHandle* out) {
// Single-matrix creation is the nmat == 1 special case of the multi-matrix
// entry point: pass one-element arrays for the data pointer and row count.
return LGBM_DatasetCreateFromMats(1,
&data,
data_type,
&nrow,
ncol,
is_row_major,
parameters,
reference,
out);
}
int LGBM_DatasetCreateFromMats(int32_t nmat,
const void** data,
int data_type,
int32_t* nrow,
int32_t ncol,
int is_row_major,
const char* parameters,
const DatasetHandle reference,
DatasetHandle* out) {
API_BEGIN(); API_BEGIN();
auto param = Config::Str2Map(parameters); auto param = Config::Str2Map(parameters);
Config config; Config config;
...@@ -483,22 +504,39 @@ int LGBM_DatasetCreateFromMat(const void* data, ...@@ -483,22 +504,39 @@ int LGBM_DatasetCreateFromMat(const void* data,
omp_set_num_threads(config.num_threads); omp_set_num_threads(config.num_threads);
} }
std::unique_ptr<Dataset> ret; std::unique_ptr<Dataset> ret;
auto get_row_fun = RowFunctionFromDenseMatric(data, nrow, ncol, data_type, is_row_major); int32_t total_nrow = 0;
for (int j = 0; j < nmat; ++j) {
total_nrow += nrow[j];
}
std::vector<std::function<std::vector<double>(int row_idx)>> get_row_fun;
for (int j = 0; j < nmat; ++j) {
get_row_fun.push_back(RowFunctionFromDenseMatric(data[j], nrow[j], ncol, data_type, is_row_major));
}
if (reference == nullptr) { if (reference == nullptr) {
// sample data first // sample data first
Random rand(config.data_random_seed); Random rand(config.data_random_seed);
int sample_cnt = static_cast<int>(nrow < config.bin_construct_sample_cnt ? nrow : config.bin_construct_sample_cnt); int sample_cnt = static_cast<int>(total_nrow < config.bin_construct_sample_cnt ? total_nrow : config.bin_construct_sample_cnt);
auto sample_indices = rand.Sample(nrow, sample_cnt); auto sample_indices = rand.Sample(total_nrow, sample_cnt);
sample_cnt = static_cast<int>(sample_indices.size()); sample_cnt = static_cast<int>(sample_indices.size());
std::vector<std::vector<double>> sample_values(ncol); std::vector<std::vector<double>> sample_values(ncol);
std::vector<std::vector<int>> sample_idx(ncol); std::vector<std::vector<int>> sample_idx(ncol);
int offset = 0;
int j = 0;
for (size_t i = 0; i < sample_indices.size(); ++i) { for (size_t i = 0; i < sample_indices.size(); ++i) {
auto idx = sample_indices[i]; auto idx = sample_indices[i];
auto row = get_row_fun(static_cast<int>(idx)); while ((idx - offset) >= nrow[j]) {
for (size_t j = 0; j < row.size(); ++j) { offset += nrow[j];
if (std::fabs(row[j]) > kZeroThreshold || std::isnan(row[j])) { ++j;
sample_values[j].emplace_back(row[j]); }
sample_idx[j].emplace_back(static_cast<int>(i));
auto row = get_row_fun[j](static_cast<int>(idx - offset));
for (size_t k = 0; k < row.size(); ++k) {
if (std::fabs(row[k]) > kZeroThreshold || std::isnan(row[k])) {
sample_values[k].emplace_back(row[k]);
sample_idx[k].emplace_back(static_cast<int>(i));
} }
} }
} }
...@@ -507,22 +545,27 @@ int LGBM_DatasetCreateFromMat(const void* data, ...@@ -507,22 +545,27 @@ int LGBM_DatasetCreateFromMat(const void* data,
Common::Vector2Ptr<int>(sample_idx).data(), Common::Vector2Ptr<int>(sample_idx).data(),
static_cast<int>(sample_values.size()), static_cast<int>(sample_values.size()),
Common::VectorSize<double>(sample_values).data(), Common::VectorSize<double>(sample_values).data(),
sample_cnt, nrow)); sample_cnt, total_nrow));
} else { } else {
ret.reset(new Dataset(nrow)); ret.reset(new Dataset(total_nrow));
ret->CreateValid( ret->CreateValid(
reinterpret_cast<const Dataset*>(reference)); reinterpret_cast<const Dataset*>(reference));
} }
OMP_INIT_EX(); int32_t start_row = 0;
#pragma omp parallel for schedule(static) for (int j = 0; j < nmat; ++j) {
for (int i = 0; i < nrow; ++i) { OMP_INIT_EX();
OMP_LOOP_EX_BEGIN(); #pragma omp parallel for schedule(static)
const int tid = omp_get_thread_num(); for (int i = 0; i < nrow[j]; ++i) {
auto one_row = get_row_fun(i); OMP_LOOP_EX_BEGIN();
ret->PushOneRow(tid, i, one_row); const int tid = omp_get_thread_num();
OMP_LOOP_EX_END(); auto one_row = get_row_fun[j](i);
ret->PushOneRow(tid, start_row + i, one_row);
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
start_row += nrow[j];
} }
OMP_THROW_EX();
ret->FinishLoad(); ret->FinishLoad();
*out = ret.release(); *out = ret.release();
API_END(); API_END();
......
...@@ -60,3 +60,16 @@ class TestBasic(unittest.TestCase): ...@@ -60,3 +60,16 @@ class TestBasic(unittest.TestCase):
for preds in zip(pred_early_stopping, pred_from_matr): for preds in zip(pred_early_stopping, pred_from_matr):
# scores likely to be different, but prediction should still be the same # scores likely to be different, but prediction should still be the same
self.assertEqual(preds[0] > 0, preds[1] > 0) self.assertEqual(preds[0] > 0, preds[1] > 0)
def test_chunked_dataset(self):
    # Build train/valid Datasets from lists of row chunks rather than single
    # matrices, and check that construction succeeds end to end.
    X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(True), test_size=0.1, random_state=2)

    chunk_size = X_train.shape[0] // 10 + 1

    def split_into_chunks(mat):
        # Consecutive row blocks of at most chunk_size rows covering mat.
        n_chunks = mat.shape[0] // chunk_size + 1
        return [mat[i * chunk_size:(i + 1) * chunk_size, :] for i in range(n_chunks)]

    X_train = split_into_chunks(X_train)
    X_test = split_into_chunks(X_test)

    train_data = lgb.Dataset(X_train, label=y_train, params={"bin_construct_sample_cnt": 100})
    valid_data = train_data.create_valid(X_test, label=y_test, params={"bin_construct_sample_cnt": 100})

    train_data.construct()
    valid_data.construct()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment