Commit ef778069 authored by Guolin Ke

Add categorical feature support back.

parent d93eb338
@@ -12,6 +12,7 @@ export(lgb.Dataset)
export(lgb.Dataset.construct)
export(lgb.Dataset.create.valid)
export(lgb.Dataset.save)
export(lgb.Dataset.set.categorical)
export(lgb.Dataset.set.reference)
export(lgb.cv)
export(lgb.dump)
......
@@ -12,6 +12,7 @@ Dataset <- R6Class(
params = list(),
reference = NULL,
colnames = NULL,
categorical_feature = NULL,
predictor = NULL,
free_raw_data = TRUE,
used_indices = NULL,
@@ -41,6 +42,7 @@ Dataset <- R6Class(
private$reference <- reference
private$colnames <- colnames
private$categorical_feature <- categorical_feature
private$predictor <- predictor
private$free_raw_data <- free_raw_data
private$used_indices <- used_indices
@@ -52,6 +54,7 @@ Dataset <- R6Class(
private$params,
self,
private$colnames,
private$categorical_feature,
private$predictor,
private$free_raw_data,
NULL,
@@ -73,6 +76,21 @@ Dataset <- R6Class(
if (is.null(private$colnames) && !is.null(cnames)) {
private$colnames <- as.character(cnames)
}
# Get categorical feature index
if (!is.null(private$categorical_feature)) {
if (typeof(private$categorical_feature) == "character") {
cate_indices <- as.list(match(private$categorical_feature, private$colnames) - 1)
if (sum(is.na(cate_indices)) > 0) {
stop("lgb.self.get.handle: supplied an unknown feature in categorical_feature: ", sQuote(private$categorical_feature[is.na(cate_indices)]))
}
} else {
if (max(private$categorical_feature) > length(private$colnames)) {
stop("lgb.self.get.handle: supplied a too large value in categorical_feature: ", max(private$categorical_feature), " but only ", length(private$colnames), " features")
}
cate_indices <- as.list(private$categorical_feature - 1)
}
private$params$categorical_feature <- cate_indices
}
# Check has header or not
has_header <- FALSE
if (!is.null(private$params$has_header) ||
@@ -271,6 +289,7 @@ Dataset <- R6Class(
private$params,
self,
private$colnames,
private$categorical_feature,
private$predictor,
private$free_raw_data,
idxset,
@@ -282,7 +301,20 @@ Dataset <- R6Class(
private$params <- modifyList(private$params, params)
self
},
set_categorical_feature = function(categorical_feature) {
if (identical(private$categorical_feature, categorical_feature)) { return(self) }
if (is.null(private$raw_data)) {
stop(
"set_categorical_feature: cannot set categorical feature after freeing raw data, please set ",
sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset"
)
}
private$categorical_feature <- categorical_feature
self$finalize()
self
},
set_reference = function(reference) {
self$set_categorical_feature(reference$.__enclos_env__$private$categorical_feature)
self$set_colnames(reference$get_colnames())
private$set_predictor(reference$.__enclos_env__$private$predictor)
if (identical(private$reference, reference)) { return(self) }
@@ -316,6 +348,7 @@ Dataset <- R6Class(
params = list(),
reference = NULL,
colnames = NULL,
categorical_feature = NULL,
predictor = NULL,
free_raw_data = TRUE,
used_indices = NULL,
@@ -353,6 +386,7 @@ Dataset <- R6Class(
#' @param params a list of parameters
#' @param reference reference dataset
#' @param colnames names of columns
#' @param categorical_feature categorical features
#' @param free_raw_data TRUE to free raw data after constructing the Dataset
#' @param info a list of information of the lgb.Dataset object
#' @param ... other information to pass to \code{info} or parameters to pass to \code{params}
@@ -371,6 +405,7 @@ lgb.Dataset <- function(data,
params = list(),
reference = NULL,
colnames = NULL,
categorical_feature = NULL,
free_raw_data = TRUE,
info = list(),
...) {
@@ -379,6 +414,7 @@ lgb.Dataset <- function(data,
params,
reference,
colnames,
categorical_feature,
NULL,
free_raw_data,
NULL,
@@ -628,6 +664,29 @@ setinfo.lgb.Dataset <- function(dataset, name, info, ...) {
dataset$setinfo(name, info)
}
#' Set categorical feature of \code{lgb.Dataset}
#'
#' @param dataset object of class \code{lgb.Dataset}
#' @param categorical_feature categorical features
#' @return passed dataset
#' @examples
#' \dontrun{
#' data(agaricus.train, package='lightgbm')
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label=train$label)
#' lgb.Dataset.save(dtrain, 'lgb.Dataset.data')
#' dtrain <- lgb.Dataset('lgb.Dataset.data')
#' lgb.Dataset.set.categorical(dtrain, 1:2)
#' }
#' @rdname lgb.Dataset.set.categorical
#' @export
lgb.Dataset.set.categorical <- function(dataset, categorical_feature) {
if (!lgb.is.Dataset(dataset)) {
stop("lgb.Dataset.set.categorical: input dataset should be an lgb.Dataset object")
}
dataset$set_categorical_feature(categorical_feature)
}
#' Set reference of \code{lgb.Dataset}
#'
#' If you want to use validation data, you should set reference to training data
......
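To make the new R entry points concrete, here is a minimal usage sketch based on the signatures in this diff (`lgb.Dataset`, `lgb.Dataset.set.categorical`); the chosen columns are illustrative only:

```r
library(lightgbm)
data(agaricus.train, package = "lightgbm")
train <- agaricus.train

# Declare categorical features at construction time, either by 1-based column
# index (the wrapper converts them to LightGBM's 0-based indices) or by column
# name. free_raw_data = FALSE keeps the raw matrix around so the setting can
# still be changed later.
dtrain <- lgb.Dataset(train$data, label = train$label,
                      categorical_feature = c(1L, 2L),
                      free_raw_data = FALSE)

# Equivalent post-hoc setter added by this commit
lgb.Dataset.set.categorical(dtrain, c(1L, 2L))
```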
@@ -46,6 +46,9 @@ CVBooster <- R6Class(
#' the \code{nfold} and \code{stratified} parameters are ignored.
#' @param init_model path of model file of \code{lgb.Booster} object, will continue training from this model
#' @param colnames feature names, if not null, will use this to overwrite the names in dataset
#' @param categorical_feature list of str or int; an int is interpreted as a column index,
#' a str as a feature name
#' @param early_stopping_rounds int
#' Activates early stopping.
#' Requires at least one validation data and one metric
@@ -81,6 +84,7 @@ lgb.cv <- function(params=list(), data, nrounds = 10,
folds = NULL,
init_model = NULL,
colnames = NULL,
categorical_feature = NULL,
early_stopping_rounds = NULL,
callbacks = list(), ...) {
addiction_params <- list(...)
@@ -118,6 +122,7 @@ lgb.cv <- function(params=list(), data, nrounds = 10,
data$update_params(params)
data$.__enclos_env__$private$set_predictor(predictor)
if (!is.null(colnames)) { data$set_colnames(colnames) }
if (!is.null(categorical_feature)) { data$set_categorical_feature(categorical_feature) }
data$construct()
if (!is.null(folds)) {
......
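A corresponding `lgb.cv` call might look like the following sketch (objective, metric, and indices are illustrative; `dtrain` is the Dataset from the sketch above):

```r
# categorical_feature is forwarded to the Dataset via set_categorical_feature()
# before construction, as in the diff above.
cv_result <- lgb.cv(params = list(objective = "binary", metric = "binary_logloss"),
                    data = dtrain,
                    nrounds = 10,
                    nfold = 3,
                    categorical_feature = c(1L, 2L))
```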
@@ -19,6 +19,7 @@
#' \item \code{leaf_parent}: ID of the parent node for current leaf (integer)
#' \item \code{split_gain}: Split gain of a node
#' \item \code{threshold}: Splitting threshold value of a node
#' \item \code{decision_type}: Decision type of a node
#' \item \code{internal_value}: Node value
#' \item \code{internal_count}: The number of observations collected by a node
#' \item \code{leaf_value}: Leaf value
@@ -62,14 +63,14 @@ single.tree.parse <- function(lgb_tree) {
single_tree_dt <- data.table::data.table(tree_index = integer(0),
split_index = integer(0), split_feature = integer(0), node_parent = integer(0),
leaf_index = integer(0), leaf_parent = integer(0),
split_gain = numeric(0), threshold = numeric(0), decision_type = character(0),
internal_value = integer(0), internal_count = integer(0),
leaf_value = integer(0), leaf_count = integer(0))
pre_order_traversal <- function(tree_node_leaf, parent_index = NA) {
if (!is.null(tree_node_leaf$split_index)) {
single_tree_dt <<- data.table::rbindlist(l = list(single_tree_dt,
c(tree_node_leaf[c("split_index", "split_feature",
"split_gain", "threshold", "decision_type",
"internal_value", "internal_count")],
"node_parent" = parent_index)),
use.names = TRUE, fill = TRUE)
......
@@ -18,6 +18,9 @@
#' @param eval_freq evaluation output frequency, only effective when verbose > 0
#' @param init_model path of model file of \code{lgb.Booster} object, will continue training from this model
#' @param colnames feature names, if not null, will use this to overwrite the names in dataset
#' @param categorical_feature list of str or int; an int is interpreted as a column index,
#' a str as a feature name
#' @param early_stopping_rounds int
#' Activates early stopping.
#' Requires at least one validation data and one metric
@@ -52,6 +55,7 @@ lgb.train <- function(params = list(), data, nrounds = 10,
eval_freq = 1L,
init_model = NULL,
colnames = NULL,
categorical_feature = NULL,
early_stopping_rounds = NULL,
callbacks = list(), ...) {
additional_params <- list(...)
@@ -96,6 +100,7 @@ lgb.train <- function(params = list(), data, nrounds = 10,
data$update_params(params)
data$.__enclos_env__$private$set_predictor(predictor)
if (!is.null(colnames)) { data$set_colnames(colnames) }
if (!is.null(categorical_feature)) { data$set_categorical_feature(categorical_feature) }
data$construct()
vaild_contain_train <- FALSE
train_data_name <- "train"
......
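And the matching `lgb.train` call, again as a sketch under the same assumptions:

```r
# As in lgb.cv, the categorical setting is applied to the Dataset before construction.
model <- lgb.train(params = list(objective = "binary"),
                   data = dtrain,
                   nrounds = 10,
                   categorical_feature = c(1L, 2L))
```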
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/lgb.Dataset.R
\name{lgb.Dataset.set.categorical}
\alias{lgb.Dataset.set.categorical}
\title{Set categorical feature of \code{lgb.Dataset}}
\usage{
lgb.Dataset.set.categorical(dataset, categorical_feature)
}
\arguments{
\item{dataset}{object of class \code{lgb.Dataset}}
\item{categorical_feature}{categorical features}
}
\value{
passed dataset
}
\description{
Set categorical feature of \code{lgb.Dataset}
}
\examples{
\dontrun{
data(agaricus.train, package='lightgbm')
train <- agaricus.train
dtrain <- lgb.Dataset(train$data, label=train$label)
lgb.Dataset.save(dtrain, 'lgb.Dataset.data')
dtrain <- lgb.Dataset('lgb.Dataset.data')
lgb.Dataset.set.categorical(dtrain, 1:2)
}
}
@@ -24,6 +24,7 @@ The columns of the \code{data.table} are:
\item \code{leaf_parent}: ID of the parent node for current leaf (integer)
\item \code{split_gain}: Split gain of a node
\item \code{threshold}: Splitting threshold value of a node
\item \code{decision_type}: Decision type of a node
\item \code{internal_value}: Node value
\item \code{internal_count}: The number of observations collected by a node
\item \code{leaf_value}: Leaf value
......
@@ -9,12 +9,12 @@
lgb.cv(params = list(), data, nrounds = 10, nfold = 3, label = NULL,
weight = NULL, obj = NULL, eval = NULL, verbose = 1, record = TRUE,
eval_freq = 1L, showsd = TRUE, stratified = TRUE, folds = NULL,
init_model = NULL, colnames = NULL, categorical_feature = NULL,
early_stopping_rounds = NULL, callbacks = list(), ...)
lgb.train(params = list(), data, nrounds = 10, valids = list(),
obj = NULL, eval = NULL, verbose = 1, record = TRUE, eval_freq = 1L,
init_model = NULL, colnames = NULL, categorical_feature = NULL,
early_stopping_rounds = NULL, callbacks = list(), ...)
lightgbm(data, label = NULL, weight = NULL, params = list(),
@@ -60,6 +60,10 @@ the \code{nfold} and \code{stratified} parameters are ignored.}
\item{colnames}{feature names, if not null, will use this to overwrite the names in dataset}
\item{categorical_feature}{list of str or int; an int is interpreted as a column index,
a str as a feature name}
\item{early_stopping_rounds}{int
Activates early stopping.
Requires at least one validation data and one metric
@@ -114,6 +118,10 @@ Tree still grow by leaf-wise.}
\item{colnames}{feature names, if not null, will use this to overwrite the names in dataset}
\item{categorical_feature}{list of str or int; an int is interpreted as a column index,
a str as a feature name}
\item{early_stopping_rounds}{int
Activates early stopping.
Requires at least one validation data and one metric
......
@@ -20,8 +20,7 @@ News
01/08/2017 : Release [**R-package**](./R-package) beta version, welcome to have a try and provide feedback.
12/05/2016 : **Categorical Features as input directly** (without one-hot coding). Experiment on [Expo data](http://stat-computing.org/dataexpo/2009/) shows about 8x speed-up with same accuracy compared with one-hot coding.
For the setting details, please refer to [IO Parameters](./docs/Parameters.md#io-parameters).
12/02/2016 : Release [**python-package**](./python-package) beta version, welcome to have a try and provide feedback.
......
@@ -20,11 +20,11 @@ LightGBM FAQ
- **Solution 1**: this error should be solved in the latest version. If you still meet this error, try to remove the lightgbm.egg-info folder in your python-package and reinstall, or check [this thread on stackoverflow](http://stackoverflow.com/questions/18085571/pip-install-error-setup-script-specifies-an-absolute-path).
- **Question 2**: I see error messages like `Cannot get/set label/weight/init_score/group/num_data/num_feature before construct dataset`, but I have already constructed the dataset by code like `train = lightgbm.Dataset(X_train, y_train)`, or error messages like `Cannot set predictor/reference/categorical feature after freed raw data, set free_raw_data=False when construct Dataset to avoid this.`.
- **Solution 2**: Because LightGBM constructs bin mappers to build trees, and train and valid Datasets within one Booster share the same bin mappers, categorical features and feature names etc., the Dataset objects are constructed when constructing a Booster. And if you set free_raw_data=True (default), the raw data (with python data struct) will be freed. So, if you want to:
+ get label (or weight/init_score/group) before constructing the dataset, it's the same as getting `self.label`
+ set label (or weight/init_score/group) before constructing the dataset, it's the same as `self.label=some_label_array`
+ get num_data (or num_feature) before constructing the dataset, you can get the data with `self.data`; then, if your data is `numpy.ndarray`, use code like `self.data.shape`
+ set predictor (or reference/categorical feature) after constructing the dataset, you should set free_raw_data=False or init a Dataset object with the same raw data (the R-package analogue is sketched below)
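The R setter added in this commit enforces the same rule; a sketch of the failure mode with the default `free_raw_data = TRUE` (`train` as in the earlier sketch):

```r
dtrain <- lgb.Dataset(train$data, label = train$label)  # free_raw_data defaults to TRUE
dtrain$construct()                                      # raw data is freed here
# This stops with "cannot set categorical feature after freeing raw data,
# please set 'free_raw_data = FALSE' when you construct lgb.Dataset":
lgb.Dataset.set.categorical(dtrain, c(1L, 2L))
```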
@@ -150,6 +150,11 @@ The parameter format is ```key1=value1 key2=value2 ... ```. And parameters can
* Use number for index, e.g. ```ignore_column=0,1,2``` means column_0, column_1 and column_2 will be ignored.
* Add a prefix ```name:``` for column name, e.g. ```ignore_column=name:c1,c2,c3``` means c1, c2 and c3 will be ignored.
* Note: Index starts from ```0```. And it doesn't count the label column.
* ```categorical_feature```, default=```""```, type=string, alias=```categorical_column```,```cat_feature```,```cat_column```
* used to specify categorical features
* Use number for index, e.g. ```categorical_feature=0,1,2``` means column_0, column_1 and column_2 are categorical features.
* Add a prefix ```name:``` for column name, e.g. ```categorical_feature=name:c1,c2,c3``` means c1, c2 and c3 are categorical features.
* Note: Only categorical features of ```int``` type are supported. Index starts from ```0```. And it doesn't count the label column (see the index sketch after this list).
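To illustrate the index convention (0-based, label column not counted): when names are supplied, the R wrapper in this commit derives these indices with `match(categorical_feature, colnames) - 1`:

```r
feature_names <- c("c1", "c2", "c3")     # illustrative column names
match(c("c1", "c3"), feature_names) - 1  # 0 2, i.e. categorical_feature=0,2
```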
* ```predict_raw_score```, default=```false```, type=bool, alias=```raw_score```,```is_predict_raw_score```
* only used in prediction task
* Set to ```true``` to predict only the raw scores.
......
@@ -5,8 +5,8 @@
- [Booster](Python-API.md#booster)
* [Training API](Python-API.md#training-api)
- [train](Python-API.md#trainparams-train_set-num_boost_round100-valid_setsnone-valid_namesnone-fobjnone-fevalnone-init_modelnone-feature_nameauto-categorical_featureauto-early_stopping_roundsnone-evals_resultnone-verbose_evaltrue-learning_ratesnone-callbacksnone)
- [cv](Python-API.md#cvparams-train_set-num_boost_round10-data_splitternone-nfold5-stratifiedfalse-shuffletrue-metricsnone-fobjnone-fevalnone-init_modelnone-feature_nameauto-categorical_featureauto-early_stopping_roundsnone-fpreprocnone-verbose_evalnone-show_stdvtrue-seed0-callbacksnone)
* [Scikit-learn API](Python-API.md#scikit-learn-api)
- [Common Methods](Python-API.md#common-methods)
@@ -33,7 +33,7 @@ The methods of each Class is in alphabetical order.
###Dataset
####__init__(data, label=None, max_bin=255, reference=None, weight=None, group=None, silent=False, feature_name='auto', categorical_feature='auto', params=None, free_raw_data=True)
Parameters
----------
@@ -55,6 +55,11 @@ The methods of each Class is in alphabetical order.
feature_name : list of str, or 'auto'
Feature names
If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features; an int is interpreted as a column index,
a str as a feature name (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
params : dict, optional
Other parameters
free_raw_data : Bool
@@ -146,6 +151,17 @@ The methods of each Class is in alphabetical order.
Name of the output file.
####set_categorical_feature(categorical_feature)
Set categorical features.
Parameters
----------
categorical_feature : list of str or list of int
Name (str) or index (int) of categorical features
####set_feature_name(feature_name)
Set feature name.
@@ -450,7 +466,7 @@ The methods of each Class is in alphabetical order.
##Training API
####train(params, train_set, num_boost_round=100, valid_sets=None, valid_names=None, fobj=None, feval=None, init_model=None, feature_name='auto', categorical_feature='auto', early_stopping_rounds=None, evals_result=None, verbose_eval=True, learning_rates=None, callbacks=None)
Train with given parameters.
@@ -476,6 +492,11 @@ The methods of each Class is in alphabetical order.
feature_name : list of str, or 'auto'
Feature names
If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features; an int is interpreted as a column index,
a str as a feature name (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int
Activates early stopping.
Requires at least one validation data and one metric
@@ -515,7 +536,7 @@ The methods of each Class is in alphabetical order.
booster : a trained booster model
####cv(params, train_set, num_boost_round=10, data_splitter=None, nfold=5, stratified=False, shuffle=True, metrics=None, fobj=None, feval=None, init_model=None, feature_name='auto', categorical_feature='auto', early_stopping_rounds=None, fpreproc=None, verbose_eval=None, show_stdv=True, seed=0, callbacks=None)
Cross-validation with given parameters.
@@ -546,6 +567,11 @@ The methods of each Class is in alphabetical order.
feature_name : list of str, or 'auto'
Feature names
If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features; an int is interpreted as a column index,
a str as a feature name (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int
Activates early stopping. CV error needs to decrease at least
every <early_stopping_rounds> round(s) to continue.
@@ -695,7 +721,7 @@ The methods of each Class is in alphabetical order.
X_leaves : array_like, shape=[n_samples, n_trees]
####fit(X, y, sample_weight=None, init_score=None, group=None, eval_set=None, eval_sample_weight=None, eval_init_score=None, eval_group=None, eval_metric=None, early_stopping_rounds=None, verbose=True, feature_name='auto', categorical_feature='auto', callbacks=None)
Fit the gradient boosting model.
@@ -729,6 +755,11 @@ The methods of each Class is in alphabetical order.
feature_name : list of str, or 'auto'
Feature names
If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features; an int is interpreted as a column index,
a str as a feature name (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
callbacks : list of callback functions
List of callback functions that are applied at each iteration.
See Callbacks in Python-API.md for more information.
......
@@ -68,6 +68,13 @@ test_data = lgb.Dataset('test.svm', reference=train_data)
In LightGBM, the validation data should be aligned with training data.
#### Specifying feature names and categorical features
```python
train_data = lgb.Dataset(data, label=label, feature_name=['c1', 'c2', 'c3'], categorical_feature=['c3'])
```
LightGBM can use categorical features as input directly. There is no need to convert them to one-hot coding, and it is much faster than one-hot coding (about 8x speed-up).
**Note: You should convert your categorical features to int type before you construct `Dataset`.**
#### Weights can be set when needed:
```python
......
@@ -14,11 +14,11 @@ LightGBM supports input data file with [CSV](https://en.wikipedia.org/wiki/Comma
Label is the data of first column, and there is no header in the file.
### Categorical feature support
update 12/5/2016: update 12/5/2016:
LightGBM can use categorical features directly (without one-hot coding). The experiment on [Expo data](http://stat-computing.org/dataexpo/2009/) shows about 8x speed-up compared with one-hot coding.
For the setting details, please refer to [Parameters](./Parameters.md#io-parameters).
......
@@ -29,6 +29,7 @@ Examples including:
- Feature importances with sklearn interface
- [advanced_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/advanced_example.py)
- Set feature names
- Directly use categorical features without one-hot encoding
- Load model file to continue training
- Change learning rates during training
- Self-defined objective function
......
@@ -42,11 +42,13 @@ params = {
feature_name = ['feature_' + str(col) for col in range(num_feature)]
print('Start training...')
# feature_name and categorical_feature
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
valid_sets=lgb_train,  # eval training data
feature_name=feature_name,
categorical_feature=[21])
# check feature name
print('Finish first 10 rounds...')
......
@@ -38,6 +38,7 @@ gbm = lgb.train(params,
num_boost_round=100,
valid_sets=[lgb_train, lgb_test],
feature_name=['f' + str(i + 1) for i in range(28)],
categorical_feature=[21],
evals_result=evals_result,
verbose_eval=10)
@@ -49,6 +50,6 @@ print('Plot feature importances...')
ax = lgb.plot_importance(gbm, max_num_features=10)
plt.show()
print('Plot 84th tree...')  # one tree uses a categorical feature to split
ax = lgb.plot_tree(gbm, tree_index=83, figsize=(20, 8), show_info=['split_gain'])
plt.show()
@@ -12,6 +12,12 @@
namespace LightGBM {
enum BinType {
NumericalBin,
CategoricalBin
};
/*! \brief Store data for one histogram bin */
struct HistogramBinEntry {
public:
@@ -58,11 +64,19 @@ public:
if (num_bin_ != other.num_bin_) {
return false;
}
if (bin_type_ == BinType::NumericalBin) {
for (int i = 0; i < num_bin_; ++i) {
if (bin_upper_bound_[i] != other.bin_upper_bound_[i]) {
return false;
}
}
} else {
for (int i = 0; i < num_bin_; i++) {
if (bin_2_categorical_[i] != other.bin_2_categorical_[i]) {
return false;
}
}
}
return true;
}
@@ -83,7 +97,11 @@ public:
* \return Feature value of this bin
*/
inline double BinToValue(uint32_t bin) const {
if (bin_type_ == BinType::NumericalBin) {
return bin_upper_bound_[bin];
} else {
return bin_2_categorical_[bin];
}
}
/*!
* \brief Get sizes in byte of this object
@@ -110,8 +128,9 @@ public:
* \param max_bin The maximal number of bin
* \param min_data_in_bin min number of data in one bin
* \param min_split_data
* \param bin_type Type of this bin
*/
void FindBin(std::vector<double>& values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type);
/*!
* \brief Use specific number of bin to calculate the size of this class
@@ -131,15 +150,25 @@ public:
* \param buffer The source
*/
void CopyFrom(const char* buffer);
/*!
* \brief Get bin type
*/
inline BinType bin_type() const { return bin_type_; }
/*!
* \brief Get bin info
*/
inline std::string bin_info() const {
if (bin_type_ == BinType::CategoricalBin) {
return Common::Join(bin_2_categorical_, ":");
} else {
std::stringstream str_buf;
str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
str_buf << '[' << min_val_ << ':' << max_val_ << ']';
return str_buf.str();
}
}
private:
/*! \brief Number of bins */
@@ -150,6 +179,12 @@ private:
bool is_trival_;
/*! \brief Sparse rate of this bins( num_bin0/num_data ) */
double sparse_rate_;
/*! \brief Type of this bin */
BinType bin_type_;
/*! \brief Mapper from categorical to bin */
std::unordered_map<int, unsigned int> categorical_2_bin_;
/*! \brief Mapper from bin to categorical */
std::vector<int> bin_2_categorical_;
/*! \brief minimal feature value */
double min_val_;
/*! \brief maximum feature value */
@@ -297,12 +332,13 @@ public:
* \param num_data Number of used data
* \param lte_indices After calling this function, the less-or-equal data indices will be stored in this object.
* \param gt_indices After calling this function, the greater data indices will be stored in this object.
* \param bin_type type of bin
* \return The number of less than or equal data.
*/
virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin,
uint32_t default_bin, uint32_t threshold,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const = 0;
/*!
* \brief Create the ordered bin for this bin
@@ -346,6 +382,7 @@ public:
};
inline uint32_t BinMapper::ValueToBin(double value) const {
if (bin_type_ == BinType::NumericalBin) {
// binary search to find bin
int l = 0;
int r = num_bin_ - 1;
@@ -358,6 +395,14 @@ inline uint32_t BinMapper::ValueToBin(double value) const {
}
}
return l;
} else {
int int_value = static_cast<int>(value);
if (categorical_2_bin_.count(int_value)) {
return categorical_2_bin_.at(int_value);
} else {
return num_bin_ - 1;
}
}
}
} // namespace LightGBM
......
@@ -124,6 +124,10 @@ public:
* And add a prefix "name:" while using column name
* Note: when using an index, it doesn't count the label index */
std::string ignore_column = "";
/*! \brief specify categorical columns; Note: only integer-type categoricals are supported
* And add a prefix "name:" while using column name
* Note: when using an index, it doesn't count the label index */
std::string categorical_column = "";
LIGHTGBM_EXPORT void Set(const std::unordered_map<std::string, std::string>& params) override;
};
@@ -381,6 +385,9 @@ struct ParameterAlias {
{ "query_column", "group_column" },
{ "ignore_feature", "ignore_column" },
{ "blacklist", "ignore_column" },
{ "categorical_feature", "categorical_column" },
{ "cat_column", "categorical_column" },
{ "cat_feature", "categorical_column" },
{ "predict_raw_score", "is_predict_raw_score" }, { "predict_raw_score", "is_predict_raw_score" },
{ "predict_leaf_index", "is_predict_leaf_index" }, { "predict_leaf_index", "is_predict_leaf_index" },
{ "raw_score", "is_predict_raw_score" }, { "raw_score", "is_predict_raw_score" },
......
@@ -71,7 +71,8 @@ private:
std::unordered_set<int> ignore_features_;
/*! \brief store feature names */
std::vector<std::string> feature_names_;
/*! \brief indices of categorical features */
std::unordered_set<int> categorical_features_;
};
}
......