Unverified Commit 5b664b67 authored by José Morales's avatar José Morales Committed by GitHub
Browse files

[python-package][R-package] allow using feature names when retrieving number of bins (#5116)

* allow using feature names when retrieving number of bins

* unname vector

* use default feature names when not defined

* lint

* apply suggestions

* remove extra comma

* add test with categorical feature

* make feature names sync more transparent
parent 53218c11
...@@ -289,6 +289,13 @@ Dataset <- R6::R6Class( ...@@ -289,6 +289,13 @@ Dataset <- R6::R6Class(
self$set_colnames(colnames = private$colnames) self$set_colnames(colnames = private$colnames)
} }
# Ensure that private$colnames matches the feature names on the C++ side. This line is necessary
# in cases like constructing from a file or from a matrix with no column names.
private$colnames <- .Call(
LGBM_DatasetGetFeatureNames_R
, private$handle
)
# Load init score if requested # Load init score if requested
if (!is.null(private$predictor) && is.null(private$used_indices)) { if (!is.null(private$predictor) && is.null(private$used_indices)) {
...@@ -381,6 +388,13 @@ Dataset <- R6::R6Class( ...@@ -381,6 +388,13 @@ Dataset <- R6::R6Class(
if (lgb.is.null.handle(x = private$handle)) { if (lgb.is.null.handle(x = private$handle)) {
stop("Cannot get number of bins in feature before constructing Dataset.") stop("Cannot get number of bins in feature before constructing Dataset.")
} }
if (is.character(feature)) {
feature_name <- feature
feature <- which(private$colnames == feature_name)
if (length(feature) == 0L) {
stop(sprintf("feature '%s' not found", feature_name))
}
}
num_bin <- integer(1L) num_bin <- integer(1L)
.Call( .Call(
LGBM_DatasetGetFeatureNumBin_R LGBM_DatasetGetFeatureNumBin_R
......
...@@ -533,10 +533,16 @@ test_that("lgb.Dataset$get_feature_num_bin() works", { ...@@ -533,10 +533,16 @@ test_that("lgb.Dataset$get_feature_num_bin() works", {
, three_vals = c(rep(c(0.0, 1.0, 2.0), 33L), 0.0) , three_vals = c(rep(c(0.0, 1.0, 2.0), 33L), 0.0)
, two_vals_plus_missing = c(rep(c(1.0, 2.0), 49L), NA_real_, NA_real_) , two_vals_plus_missing = c(rep(c(1.0, 2.0), 49L), NA_real_, NA_real_)
, all_zero = rep(0.0, 100L) , all_zero = rep(0.0, 100L)
, categorical = sample.int(2L, 100L, replace = TRUE)
) )
n_features <- ncol(raw_df)
raw_mat <- data.matrix(raw_df) raw_mat <- data.matrix(raw_df)
min_data_in_bin <- 2L min_data_in_bin <- 2L
ds <- lgb.Dataset(raw_mat, params = list(min_data_in_bin = min_data_in_bin)) ds <- lgb.Dataset(
raw_mat
, params = list(min_data_in_bin = min_data_in_bin)
, categorical_feature = n_features
)
ds$construct() ds$construct()
expected_num_bins <- c( expected_num_bins <- c(
100L %/% min_data_in_bin + 1L # extra bin for zero 100L %/% min_data_in_bin + 1L # extra bin for zero
...@@ -544,9 +550,30 @@ test_that("lgb.Dataset$get_feature_num_bin() works", { ...@@ -544,9 +550,30 @@ test_that("lgb.Dataset$get_feature_num_bin() works", {
, 3L # 0, 1, 2 , 3L # 0, 1, 2
, 4L # 0, 1, 2 + NA , 4L # 0, 1, 2 + NA
, 0L # unused , 0L # unused
, 3L # 1, 2 + NA
) )
actual_num_bins <- sapply(1L:5L, ds$get_feature_num_bin) actual_num_bins <- sapply(1L:n_features, ds$get_feature_num_bin)
expect_identical(actual_num_bins, expected_num_bins) expect_identical(actual_num_bins, expected_num_bins)
# test using defined feature names
bins_by_name <- sapply(colnames(raw_mat), ds$get_feature_num_bin)
expect_identical(unname(bins_by_name), expected_num_bins)
# test using default feature names
no_names_mat <- raw_mat
colnames(no_names_mat) <- NULL
ds_no_names <- lgb.Dataset(
no_names_mat
, params = list(min_data_in_bin = min_data_in_bin)
, categorical_feature = n_features
)
ds_no_names$construct()
default_names <- lapply(
X = seq(1L, ncol(raw_mat))
, FUN = function(i) {
sprintf("Column_%d", i - 1L)
}
)
bins_by_default_name <- sapply(default_names, ds_no_names$get_feature_num_bin)
expect_identical(bins_by_default_name, expected_num_bins)
}) })
test_that("lgb.Dataset can be constructed with categorical features and without colnames", { test_that("lgb.Dataset can be constructed with categorical features and without colnames", {
...@@ -555,9 +582,9 @@ test_that("lgb.Dataset can be constructed with categorical features and without ...@@ -555,9 +582,9 @@ test_that("lgb.Dataset can be constructed with categorical features and without
ds <- lgb.Dataset(raw_mat, categorical_feature = 1L)$construct() ds <- lgb.Dataset(raw_mat, categorical_feature = 1L)$construct()
sparse_mat <- as(raw_mat, "dgCMatrix") sparse_mat <- as(raw_mat, "dgCMatrix")
ds2 <- lgb.Dataset(sparse_mat, categorical_feature = 1L)$construct() ds2 <- lgb.Dataset(sparse_mat, categorical_feature = 1L)$construct()
# check that the column names are NULL # check that the column names are the default ones
expect_null(ds$.__enclos_env__$private$colnames) expect_equal(ds$.__enclos_env__$private$colnames, "Column_0")
expect_null(ds2$.__enclos_env__$private$colnames) expect_equal(ds2$.__enclos_env__$private$colnames, "Column_0")
# check for error when index is greater than the number of columns # check for error when index is greater than the number of columns
expect_error({ expect_error({
lgb.Dataset(raw_mat, categorical_feature = 2L)$construct() lgb.Dataset(raw_mat, categorical_feature = 2L)$construct()
......
...@@ -1817,6 +1817,7 @@ class Dataset: ...@@ -1817,6 +1817,7 @@ class Dataset:
feature_name=self.feature_name, categorical_feature=self.categorical_feature, params=self.params) feature_name=self.feature_name, categorical_feature=self.categorical_feature, params=self.params)
if self.free_raw_data: if self.free_raw_data:
self.data = None self.data = None
self.feature_name = self.get_feature_name()
return self return self
def create_valid(self, data, label=None, weight=None, group=None, init_score=None, params=None): def create_valid(self, data, label=None, weight=None, group=None, init_score=None, params=None):
...@@ -2382,13 +2383,13 @@ class Dataset: ...@@ -2382,13 +2383,13 @@ class Dataset:
else: else:
raise LightGBMError("Cannot get num_feature before construct dataset") raise LightGBMError("Cannot get num_feature before construct dataset")
def feature_num_bin(self, feature: int) -> int: def feature_num_bin(self, feature: Union[int, str]) -> int:
"""Get the number of bins for a feature. """Get the number of bins for a feature.
Parameters Parameters
---------- ----------
feature : int feature : int or str
Index of the feature. Index or name of the feature.
Returns Returns
------- -------
...@@ -2396,6 +2397,8 @@ class Dataset: ...@@ -2396,6 +2397,8 @@ class Dataset:
The number of constructed bins for the feature in the Dataset. The number of constructed bins for the feature in the Dataset.
""" """
if self.handle is not None: if self.handle is not None:
if isinstance(feature, str):
feature = self.feature_name.index(feature)
ret = ctypes.c_int(0) ret = ctypes.c_int(0)
_safe_call(_LIB.LGBM_DatasetGetFeatureNumBin(self.handle, _safe_call(_LIB.LGBM_DatasetGetFeatureNumBin(self.handle,
ctypes.c_int(feature), ctypes.c_int(feature),
......
...@@ -663,17 +663,33 @@ def test_feature_num_bin(min_data_in_bin): ...@@ -663,17 +663,33 @@ def test_feature_num_bin(min_data_in_bin):
np.array([0, 1, 2] * 33 + [0]), np.array([0, 1, 2] * 33 + [0]),
np.array([1, 2] * 49 + 2 * [np.nan]), np.array([1, 2] * 49 + 2 * [np.nan]),
np.zeros(100), np.zeros(100),
np.random.choice([0, 1], 100),
]).T ]).T
ds = lgb.Dataset(X, params={'min_data_in_bin': min_data_in_bin}).construct() n_continuous = X.shape[1] - 1
feature_name = [f'x{i}' for i in range(n_continuous)] + ['cat1']
ds_kwargs = dict(
params={'min_data_in_bin': min_data_in_bin},
categorical_feature=[n_continuous], # last feature
)
ds = lgb.Dataset(X, feature_name=feature_name, **ds_kwargs).construct()
expected_num_bins = [ expected_num_bins = [
100 // min_data_in_bin + 1, # extra bin for zero 100 // min_data_in_bin + 1, # extra bin for zero
3, # 0, 1, 2 3, # 0, 1, 2
3, # 0, 1, 2 3, # 0, 1, 2
4, # 0, 1, 2 + nan 4, # 0, 1, 2 + nan
0, # unused 0, # unused
3, # 0, 1 + nan
] ]
actual_num_bins = [ds.feature_num_bin(i) for i in range(X.shape[1])] actual_num_bins = [ds.feature_num_bin(i) for i in range(X.shape[1])]
assert actual_num_bins == expected_num_bins assert actual_num_bins == expected_num_bins
# test using defined feature names
bins_by_name = [ds.feature_num_bin(name) for name in feature_name]
assert bins_by_name == expected_num_bins
# test using default feature names
ds_no_names = lgb.Dataset(X, **ds_kwargs).construct()
default_names = [f'Column_{i}' for i in range(X.shape[1])]
bins_by_default_name = [ds_no_names.feature_num_bin(name) for name in default_names]
assert bins_by_default_name == expected_num_bins
# check for feature indices outside of range # check for feature indices outside of range
num_features = X.shape[1] num_features = X.shape[1]
with pytest.raises( with pytest.raises(
......
Markdown is supported
0% — or attach files by dragging and dropping.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment