Unverified commit d10372e2, authored by José Morales and committed by GitHub
Browse files

[c-api][python-package][R-package] expose feature num bin (#5048)



* expose FeatureNumBin in C api

* parametrize min_data_in_bin and add test with max_bin_by_feature

* include feature_num_bin in R package

* add suggestion from review
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>

* update error message and lint

* lint

* add call method

* minor improvements in tests

* add suggestions from review

* lint

* rename argument to feature in python and r packages
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
parent c043be1d
......@@ -376,6 +376,21 @@ Dataset <- R6::R6Class(
},
# Return the number of bins built for a single feature.
# Stops with an error when the Dataset handle is still null. `feature` is
# decremented by one before it is handed to the native routine.
get_feature_num_bin = function(feature) {
  handle_is_null <- lgb.is.null.handle(x = private$handle)
  if (handle_is_null) {
    stop("Cannot get number of bins in feature before constructing Dataset.")
  }
  # one-element integer buffer; the native routine writes its result into it
  bin_count <- integer(1L)
  native_feature <- feature - 1L
  .Call(
    LGBM_DatasetGetFeatureNumBin_R
    , private$handle
    , native_feature
    , bin_count
  )
  return(bin_count)
},
# Get column names
get_colnames = function() {
......
......@@ -428,6 +428,17 @@ SEXP LGBM_DatasetGetNumFeature_R(SEXP handle,
R_API_END();
}
// R entry point for LGBM_DatasetGetFeatureNumBin.
// Coerces `feature_idx` to a C int, asks the C API for the feature's bin
// count, stores it in the first element of `out`, and returns R NULL.
SEXP LGBM_DatasetGetFeatureNumBin_R(SEXP handle, SEXP feature_idx, SEXP out) {
  R_API_BEGIN();
  _AssertDatasetHandleNotNull(handle);
  const int feature_index = Rf_asInteger(feature_idx);
  void* dataset_ptr = R_ExternalPtrAddr(handle);
  int bin_count = 0;
  CHECK_CALL(LGBM_DatasetGetFeatureNumBin(dataset_ptr, feature_index, &bin_count));
  // the result travels back to R through the caller-provided integer vector
  INTEGER(out)[0] = bin_count;
  return R_NilValue;
  R_API_END();
}
// --- start Booster interfaces
void _BoosterFinalizer(SEXP handle) {
......@@ -939,6 +950,7 @@ static const R_CallMethodDef CallEntries[] = {
{"LGBM_DatasetUpdateParamChecking_R", (DL_FUNC) &LGBM_DatasetUpdateParamChecking_R, 2},
{"LGBM_DatasetGetNumData_R" , (DL_FUNC) &LGBM_DatasetGetNumData_R , 2},
{"LGBM_DatasetGetNumFeature_R" , (DL_FUNC) &LGBM_DatasetGetNumFeature_R , 2},
{"LGBM_DatasetGetFeatureNumBin_R" , (DL_FUNC) &LGBM_DatasetGetFeatureNumBin_R , 3},
{"LGBM_BoosterCreate_R" , (DL_FUNC) &LGBM_BoosterCreate_R , 2},
{"LGBM_BoosterFree_R" , (DL_FUNC) &LGBM_BoosterFree_R , 1},
{"LGBM_BoosterCreateFromModelfile_R", (DL_FUNC) &LGBM_BoosterCreateFromModelfile_R, 1},
......
......@@ -213,6 +213,19 @@ LIGHTGBM_C_EXPORT SEXP LGBM_DatasetGetNumFeature_R(
SEXP out
);
/*!
* \brief get number of bins for feature
* \param handle the handle to the Dataset (an R external pointer; its address is passed to LGBM_DatasetGetFeatureNumBin)
* \param feature the index of the feature, coerced to a C int and forwarded unchanged to LGBM_DatasetGetFeatureNumBin
*        (the R Dataset method passes its own ``feature`` argument minus one)
* \param out The output of number of bins: an integer vector whose first element is overwritten with the result
* \return R NULL value
*/
LIGHTGBM_C_EXPORT SEXP LGBM_DatasetGetFeatureNumBin_R(
SEXP handle,
SEXP feature,
SEXP out
);
// --- start Booster interfaces
/*!
......
......@@ -498,6 +498,9 @@ test_that("Dataset: method calls on a Dataset with a null handle should raise an
expect_error({
dtrain$get_colnames()
}, regexp = "cannot get column names before dataset has been constructed")
expect_error({
dtrain$get_feature_num_bin(1L)
}, regexp = "Cannot get number of bins in feature before constructing Dataset.")
expect_error({
dtrain$save_binary(fname = tempfile(fileext = ".bin"))
}, regexp = "Attempting to create a Dataset without any raw data")
......@@ -522,3 +525,26 @@ test_that("Dataset: method calls on a Dataset with a null handle should raise an
dtrain$set_reference(reference = dvalid)
}, regexp = "cannot get column names before dataset has been constructed")
})
test_that("lgb.Dataset$get_feature_num_bin() works", {
  # each column has a known number of distinct values, so the number of bins
  # the Dataset builds for it can be predicted
  min_data_in_bin <- 2L
  feature_mat <- data.matrix(
    data.frame(
      all_random = runif(100L)
      , two_vals = rep(c(1.0, 2.0), 50L)
      , three_vals = c(rep(c(0.0, 1.0, 2.0), 33L), 0.0)
      , two_vals_plus_missing = c(rep(c(1.0, 2.0), 49L), NA_real_, NA_real_)
      , all_zero = rep(0.0, 100L)
    )
  )
  dataset <- lgb.Dataset(feature_mat, params = list(min_data_in_bin = min_data_in_bin))
  dataset$construct()
  expected_num_bins <- c(
    100L %/% min_data_in_bin + 1L  # extra bin for zero
    , 3L  # 0, 1, 2
    , 3L  # 0, 1, 2
    , 4L  # 0, 1, 2 + NA
    , 0L  # unused
  )
  actual_num_bins <- vapply(
    X = seq_len(5L)
    , FUN = function(feature_idx) dataset$get_feature_num_bin(feature_idx)
    , FUN.VALUE = integer(1L)
  )
  expect_identical(actual_num_bins, expected_num_bins)
})
......@@ -432,6 +432,17 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetGetNumData(DatasetHandle handle,
LIGHTGBM_C_EXPORT int LGBM_DatasetGetNumFeature(DatasetHandle handle,
int* out);
/*!
* \brief Get number of bins for feature.
* \param handle Handle of dataset
* \param feature Index of the feature (the Python tests pass 0 .. num_columns - 1)
* \param[out] out The address to hold number of bins; set to 0 when the feature has no
*                 non-negative inner index in the dataset (e.g. the all-zero "unused" column in the tests)
* \return 0 when succeed, -1 when failure happens
*/
LIGHTGBM_C_EXPORT int LGBM_DatasetGetFeatureNumBin(DatasetHandle handle,
int feature,
int* out);
/*!
* \brief Add features from ``source`` to ``target``.
* \param target The handle of the dataset to add features to
......
......@@ -2370,6 +2370,28 @@ class Dataset:
else:
raise LightGBMError("Cannot get num_feature before construct dataset")
def feature_num_bin(self, feature: int) -> int:
    """Get the number of bins for a feature.

    Parameters
    ----------
    feature : int
        Index of the feature.

    Returns
    -------
    number_of_bins : int
        The number of constructed bins for the feature in the Dataset.

    Raises
    ------
    LightGBMError
        If the Dataset handle is ``None``, i.e. the Dataset was not constructed.
    """
    if self.handle is None:
        raise LightGBMError("Cannot get feature_num_bin before construct dataset")
    # C int the native library fills through the pointer passed below
    num_bin = ctypes.c_int(0)
    status = _LIB.LGBM_DatasetGetFeatureNumBin(
        self.handle,
        ctypes.c_int(feature),
        ctypes.byref(num_bin),
    )
    _safe_call(status)
    return num_bin.value
def get_ref_chain(self, ref_limit=100):
"""Get a chain of Dataset objects.
......
......@@ -1550,6 +1550,20 @@ int LGBM_DatasetGetNumFeature(DatasetHandle handle,
API_END();
}
// Get the number of bins constructed for one feature of a Dataset.
//
// handle  : Dataset handle, reinterpreted as a Dataset*.
// feature : index of the feature; must lie in [0, num_total_features - 1].
// out     : receives the bin count, or 0 when the feature has a negative inner
//           index (the tests label such a column "unused").
//
// An out-of-range `feature` is rejected before any lookup, so a bad index from
// a C, Python or R caller cannot reach InnerFeatureIndex().
// NOTE(review): InnerFeatureIndex() is assumed to perform no range check of
// its own, and Log::Fatal is assumed to raise an error that API_END() turns
// into a -1 return code, as elsewhere in the C API. Confirm both.
int LGBM_DatasetGetFeatureNumBin(DatasetHandle handle,
                                 int feature,
                                 int* out) {
  API_BEGIN();
  auto dataset = reinterpret_cast<Dataset*>(handle);
  const int num_features = dataset->num_total_features();
  if (feature < 0 || feature >= num_features) {
    Log::Fatal("Tried to retrieve number of bins for feature index %d, "
               "but the valid feature indices are [0, %d].", feature, num_features - 1);
  }
  const int inner_idx = dataset->InnerFeatureIndex(feature);
  if (inner_idx >= 0) {
    *out = dataset->FeatureNumBin(inner_idx);
  } else {
    // no inner index for this feature: report zero bins
    *out = 0;
  }
  API_END();
}
int LGBM_DatasetAddFeaturesFrom(DatasetHandle target,
DatasetHandle source) {
API_BEGIN();
......
......@@ -621,3 +621,32 @@ def test_no_copy_when_single_float_dtype_dataframe(dtype):
built_data = lgb.basic._data_from_pandas(df, feature_name, None, None)[0]
assert built_data.dtype == dtype
assert np.shares_memory(X, built_data)
@pytest.mark.parametrize('min_data_in_bin', [2, 10])
def test_feature_num_bin(min_data_in_bin):
    # one row per feature here; transposed below into (100 samples, 5 features)
    feature_rows = [
        np.random.rand(100),
        np.array([1, 2] * 50),
        np.array([0, 1, 2] * 33 + [0]),
        np.array([1, 2] * 49 + 2 * [np.nan]),
        np.zeros(100),
    ]
    X = np.vstack(feature_rows).T
    params = {'min_data_in_bin': min_data_in_bin}
    ds = lgb.Dataset(X, params=params).construct()
    expected_num_bins = [
        100 // min_data_in_bin + 1,  # extra bin for zero
        3,  # 0, 1, 2
        3,  # 0, 1, 2
        4,  # 0, 1, 2 + nan
        0,  # unused
    ]
    n_features = X.shape[1]
    actual_num_bins = [ds.feature_num_bin(idx) for idx in range(n_features)]
    assert actual_num_bins == expected_num_bins
def test_feature_num_bin_with_max_bin_by_feature():
    # with 100 random values per column, each feature is expected to end up
    # with exactly the number of bins requested for it
    X = np.random.rand(100, 3)
    n_features = X.shape[1]
    max_bin_by_feature = np.random.randint(3, 30, size=n_features)
    params = {'max_bin_by_feature': max_bin_by_feature}
    ds = lgb.Dataset(X, params=params).construct()
    actual_num_bins = [ds.feature_num_bin(idx) for idx in range(n_features)]
    np.testing.assert_equal(actual_num_bins, max_bin_by_feature)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment