"src/vscode:/vscode.git/clone" did not exist on "548cec829e232ac388154758b59ca696b2147a08"
Commit 4f77bd28 authored by Guolin Ke's avatar Guolin Ke
Browse files

update to v2.

parent 13d4581b
......@@ -12,7 +12,6 @@ export(lgb.Dataset)
export(lgb.Dataset.construct)
export(lgb.Dataset.create.valid)
export(lgb.Dataset.save)
export(lgb.Dataset.set.categorical)
export(lgb.Dataset.set.reference)
export(lgb.cv)
export(lgb.dump)
......
......@@ -12,7 +12,6 @@ Dataset <- R6Class(
params = list(),
reference = NULL,
colnames = NULL,
categorical_feature = NULL,
predictor = NULL,
free_raw_data = TRUE,
used_indices = NULL,
......@@ -42,7 +41,6 @@ Dataset <- R6Class(
private$reference <- reference
private$colnames <- colnames
private$categorical_feature <- categorical_feature
private$predictor <- predictor
private$free_raw_data <- free_raw_data
private$used_indices <- used_indices
......@@ -54,7 +52,6 @@ Dataset <- R6Class(
private$params,
self,
private$colnames,
private$categorical_feature,
private$predictor,
private$free_raw_data,
NULL,
......@@ -76,21 +73,6 @@ Dataset <- R6Class(
if (is.null(private$colnames) && !is.null(cnames)) {
private$colnames <- as.character(cnames)
}
# Get categorical feature index
if (!is.null(private$categorical_feature)) {
if (typeof(private$categorical_feature) == "character") {
cate_indices <- as.list(match(private$categorical_feature, private$colnames) - 1)
if (sum(is.na(cate_indices)) > 0) {
stop("lgb.self.get.handle: supplied an unknown feature in categorical_feature: ", sQuote(private$categorical_feature[is.na(cate_indices)]))
}
} else {
if (max(private$categorical_feature) > length(private$colnames)) {
stop("lgb.self.get.handle: supplied a too large value in categorical_feature: ", max(private$categorical_feature), " but only ", length(private$colnames), " features")
}
cate_indices <- as.list(private$categorical_feature - 1)
}
private$params$categorical_feature <- cate_indices
}
# Check whether the data has a header
has_header <- FALSE
if (!is.null(private$params$has_header) ||
......@@ -289,7 +271,6 @@ Dataset <- R6Class(
private$params,
self,
private$colnames,
private$categorical_feature,
private$predictor,
private$free_raw_data,
idxset,
......@@ -301,20 +282,7 @@ Dataset <- R6Class(
private$params <- modifyList(private$params, params)
self
},
set_categorical_feature = function(categorical_feature) {
if (identical(private$categorical_feature, categorical_feature)) { return(self) }
if (is.null(private$raw_data)) {
stop(
"set_categorical_feature: cannot set categorical feature after freeing raw data,
please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset"
)
}
private$categorical_feature <- categorical_feature
self$finalize()
self
},
set_reference = function(reference) {
self$set_categorical_feature(reference$.__enclos_env__$private$categorical_feature)
self$set_colnames(reference$get_colnames())
private$set_predictor(reference$.__enclos_env__$private$predictor)
if (identical(private$reference, reference)) { return(self) }
......@@ -348,7 +316,6 @@ Dataset <- R6Class(
params = list(),
reference = NULL,
colnames = NULL,
categorical_feature = NULL,
predictor = NULL,
free_raw_data = TRUE,
used_indices = NULL,
......@@ -386,7 +353,6 @@ Dataset <- R6Class(
#' @param params a list of parameters
#' @param reference reference dataset
#' @param colnames names of columns
#' @param categorical_feature categorical features
#' @param free_raw_data TRUE to free the raw data after constructing the Dataset
#' @param info a list of information of the lgb.Dataset object
#' @param ... other information to pass to \code{info} or parameters to pass to \code{params}
......@@ -405,7 +371,6 @@ lgb.Dataset <- function(data,
params = list(),
reference = NULL,
colnames = NULL,
categorical_feature = NULL,
free_raw_data = TRUE,
info = list(),
...) {
......@@ -414,7 +379,6 @@ lgb.Dataset <- function(data,
params,
reference,
colnames,
categorical_feature,
NULL,
free_raw_data,
NULL,
......@@ -664,29 +628,6 @@ setinfo.lgb.Dataset <- function(dataset, name, info, ...) {
dataset$setinfo(name, info)
}
#' Set categorical feature of \code{lgb.Dataset}
#'
#' @param dataset object of class \code{lgb.Dataset}
#' @param categorical_feature categorical features
#' @return passed dataset
#' @examples
#' \dontrun{
#' data(agaricus.train, package='lightgbm')
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label=train$label)
#' lgb.Dataset.save(dtrain, 'lgb.Dataset.data')
#' dtrain <- lgb.Dataset('lgb.Dataset.data')
#' lgb.Dataset.set.categorical(dtrain, 1:2)
#' }
#' @rdname lgb.Dataset.set.categorical
#' @export
lgb.Dataset.set.categorical <- function(dataset, categorical_feature) {
if (!lgb.is.Dataset(dataset)) {
stop("lgb.Dataset.set.categorical: input dataset should be an lgb.Dataset object")
}
dataset$set_categorical_feature(categorical_feature)
}
#' Set reference of \code{lgb.Dataset}
#'
#' If you want to use validation data, you should set reference to training data
......
......@@ -46,9 +46,6 @@ CVBooster <- R6Class(
#' the \code{nfold} and \code{stratified} parameters are ignored.
#' @param init_model path to a model file of an \code{lgb.Booster} object; training will continue from this model
#' @param colnames feature names; if not NULL, these overwrite the column names in the dataset
#' @param categorical_feature list of str or int;
#' int values are feature indices,
#' str values are feature names
#' @param early_stopping_rounds int
#' Activates early stopping.
#' Requires at least one validation dataset and one metric
......@@ -84,7 +81,6 @@ lgb.cv <- function(params=list(), data, nrounds = 10,
folds = NULL,
init_model = NULL,
colnames = NULL,
categorical_feature = NULL,
early_stopping_rounds = NULL,
callbacks = list(), ...) {
additional_params <- list(...)
......@@ -122,7 +118,6 @@ lgb.cv <- function(params=list(), data, nrounds = 10,
data$update_params(params)
data$.__enclos_env__$private$set_predictor(predictor)
if (!is.null(colnames)) { data$set_colnames(colnames) }
if (!is.null(categorical_feature)) { data$set_categorical_feature(categorical_feature) }
data$construct()
if (!is.null(folds)) {
......
......@@ -19,7 +19,6 @@
#' \item \code{leaf_parent}: ID of the parent node for current leaf (integer)
#' \item \code{split_gain}: Split gain of a node
#' \item \code{threshold}: Splitting threshold value of a node
#' \item \code{decision_type}: Decision type of a node
#' \item \code{internal_value}: Node value
#' \item \code{internal_count}: The number of observation collected by a node
#' \item \code{leaf_value}: Leaf value
......@@ -63,14 +62,14 @@ single.tree.parse <- function(lgb_tree) {
single_tree_dt <- data.table::data.table(tree_index = integer(0),
split_index = integer(0), split_feature = integer(0), node_parent = integer(0),
leaf_index = integer(0), leaf_parent = integer(0),
split_gain = numeric(0), threshold = numeric(0), decision_type = character(0),
split_gain = numeric(0), threshold = numeric(0),
internal_value = integer(0), internal_count = integer(0),
leaf_value = integer(0), leaf_count = integer(0))
pre_order_traversal <- function(tree_node_leaf, parent_index = NA) {
if (!is.null(tree_node_leaf$split_index)) {
single_tree_dt <<- data.table::rbindlist(l = list(single_tree_dt,
c(tree_node_leaf[c("split_index", "split_feature",
"split_gain", "threshold", "decision_type",
"split_gain", "threshold",
"internal_value", "internal_count")],
"node_parent" = parent_index)),
use.names = TRUE, fill = TRUE)
......
......@@ -18,9 +18,6 @@
#' @param eval_freq evaluation output frequency; only effective when verbose > 0
#' @param init_model path to a model file of an \code{lgb.Booster} object; training will continue from this model
#' @param colnames feature names; if not NULL, these overwrite the column names in the dataset
#' @param categorical_feature list of str or int;
#' int values are feature indices,
#' str values are feature names
#' @param early_stopping_rounds int
#' Activates early stopping.
Requires at least one validation dataset and one metric
......@@ -55,7 +52,6 @@ lgb.train <- function(params = list(), data, nrounds = 10,
eval_freq = 1L,
init_model = NULL,
colnames = NULL,
categorical_feature = NULL,
early_stopping_rounds = NULL,
callbacks = list(), ...) {
additional_params <- list(...)
......@@ -100,7 +96,6 @@ lgb.train <- function(params = list(), data, nrounds = 10,
data$update_params(params)
data$.__enclos_env__$private$set_predictor(predictor)
if (!is.null(colnames)) { data$set_colnames(colnames) }
if (!is.null(categorical_feature)) { data$set_categorical_feature(categorical_feature) }
data$construct()
valid_contain_train <- FALSE
train_data_name <- "train"
......
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/lgb.Dataset.R
\name{lgb.Dataset.set.categorical}
\alias{lgb.Dataset.set.categorical}
\title{Set categorical feature of \code{lgb.Dataset}}
\usage{
lgb.Dataset.set.categorical(dataset, categorical_feature)
}
\arguments{
\item{dataset}{object of class \code{lgb.Dataset}}
\item{categorical_feature}{categorical features}
}
\value{
passed dataset
}
\description{
Set categorical feature of \code{lgb.Dataset}
}
\examples{
\dontrun{
data(agaricus.train, package='lightgbm')
train <- agaricus.train
dtrain <- lgb.Dataset(train$data, label=train$label)
lgb.Dataset.save(dtrain, 'lgb.Dataset.data')
dtrain <- lgb.Dataset('lgb.Dataset.data')
lgb.Dataset.set.categorical(dtrain, 1:2)
}
}
......@@ -24,7 +24,6 @@ The columns of the \code{data.table} are:
\item \code{leaf_parent}: ID of the parent node for current leaf (integer)
\item \code{split_gain}: Split gain of a node
\item \code{threshold}: Splitting threshold value of a node
\item \code{decision_type}: Decision type of a node
\item \code{internal_value}: Node value
\item \code{internal_count}: The number of observations collected by a node
\item \code{leaf_value}: Leaf value
......
......@@ -9,12 +9,12 @@
lgb.cv(params = list(), data, nrounds = 10, nfold = 3, label = NULL,
weight = NULL, obj = NULL, eval = NULL, verbose = 1, record = TRUE,
eval_freq = 1L, showsd = TRUE, stratified = TRUE, folds = NULL,
init_model = NULL, colnames = NULL, categorical_feature = NULL,
init_model = NULL, colnames = NULL,
early_stopping_rounds = NULL, callbacks = list(), ...)
lgb.train(params = list(), data, nrounds = 10, valids = list(),
obj = NULL, eval = NULL, verbose = 1, record = TRUE, eval_freq = 1L,
init_model = NULL, colnames = NULL, categorical_feature = NULL,
init_model = NULL, colnames = NULL,
early_stopping_rounds = NULL, callbacks = list(), ...)
lightgbm(data, label = NULL, weight = NULL, params = list(),
......@@ -60,10 +60,6 @@ the \code{nfold} and \code{stratified} parameters are ignored.}
\item{colnames}{feature names; if not NULL, these overwrite the column names in the dataset}
\item{categorical_feature}{list of str or int;
int values are feature indices,
str values are feature names}
\item{early_stopping_rounds}{int
Activates early stopping.
Requires at least one validation dataset and one metric
......@@ -118,10 +114,6 @@ Tree still grow by leaf-wise.}
\item{colnames}{feature names; if not NULL, these overwrite the column names in the dataset}
\item{categorical_feature}{list of str or int;
int values are feature indices,
str values are feature names}
\item{early_stopping_rounds}{int
Activates early stopping.
Requires at least one validation dataset and one metric
......
......@@ -18,7 +18,7 @@ News
----
01/08/2017 : Released the [**R-package**](./R-package) beta version; you are welcome to try it and provide feedback.
12/05/2016 : **Categorical Features as input directly** (without one-hot encoding). An experiment on [Expo data](http://stat-computing.org/dataexpo/2009/) shows about an 8x speed-up with the same accuracy compared with one-hot encoding (refer to [categorical log](https://github.com/guolinke/boosting_tree_benchmarks/blob/master/lightgbm/lightgbm_dataexpo_speed.log) and [one-hot log](https://github.com/guolinke/boosting_tree_benchmarks/blob/master/lightgbm/lightgbm_dataexpo_onehot_speed.log)).
12/05/2016 : [deprecated in v2] **Categorical Features as input directly** (without one-hot encoding). An experiment on [Expo data](http://stat-computing.org/dataexpo/2009/) shows about an 8x speed-up with the same accuracy compared with one-hot encoding (refer to [categorical log](https://github.com/guolinke/boosting_tree_benchmarks/blob/master/lightgbm/lightgbm_dataexpo_speed.log) and [one-hot log](https://github.com/guolinke/boosting_tree_benchmarks/blob/master/lightgbm/lightgbm_dataexpo_onehot_speed.log)).
For the setting details, please refer to [IO Parameters](./docs/Parameters.md#io-parameters).
12/02/2016 : Released the [**python-package**](./python-package) beta version; you are welcome to try it and provide feedback.
......
......@@ -20,11 +20,11 @@ LightGBM FAQ
- **Solution 1**: this error should be fixed in the latest version. If you still encounter it, try removing the lightgbm.egg-info folder from your python-package directory and reinstalling, or check [this thread on stackoverflow](http://stackoverflow.com/questions/18085571/pip-install-error-setup-script-specifies-an-absolute-path).
- **Question 2**: I see error messages like `Cannot get/set label/weight/init_score/group/num_data/num_feature before construct dataset`, but I have already constructed the dataset with code like `train = lightgbm.Dataset(X_train, y_train)`, or error messages like `Cannot set predictor/reference/categorical feature after freed raw data, set free_raw_data=False when construct Dataset to avoid this.`.
- **Question 2**: I see error messages like `Cannot get/set label/weight/init_score/group/num_data/num_feature before construct dataset`, but I have already constructed the dataset with code like `train = lightgbm.Dataset(X_train, y_train)`, or error messages like `Cannot set predictor/reference after freed raw data, set free_raw_data=False when construct Dataset to avoid this.`.
- **Solution 2**: Because LightGBM constructs bin mappers to build trees, and the train and valid Datasets within one Booster share the same bin mappers, categorical features, feature names, etc., the Dataset objects are only constructed when a Booster is constructed. If you set free_raw_data=True (the default), the raw data (the Python data structure) will be freed at that point. So, if you want to:
+ get the label (or weight/init_score/group) before constructing the dataset, it is the same as getting `self.label`
+ set the label (or weight/init_score/group) before constructing the dataset, it is the same as `self.label=some_label_array`
+ get num_data (or num_feature) before constructing the dataset, you can get the data with `self.data`; then, if your data is a `numpy.ndarray`, use code like `self.data.shape`
+ set the predictor (or reference/categorical feature) after constructing the dataset, you should set free_raw_data=False or init a Dataset object with the same raw data
+ set the predictor (or reference) after constructing the dataset, you should set free_raw_data=False or init a Dataset object with the same raw data
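As a minimal sketch of the last point (assuming the standard Python API; the array names here are hypothetical):

```python
import numpy as np
import lightgbm as lgb

X_train = np.random.rand(500, 10)          # hypothetical feature matrix
y_train = np.random.randint(2, size=500)   # hypothetical binary labels

# free_raw_data=False keeps the raw numpy data alive, so predictor and
# reference can still be set after the Dataset has been constructed.
train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=False)
valid_data = lgb.Dataset(X_train[:100], label=y_train[:100],
                         reference=train_data, free_raw_data=False)
```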
......@@ -145,11 +145,6 @@ The parameter format is ```key1=value1 key2=value2 ... ``` . And parameters can
* Use a number for the index, e.g. ```ignore_column=0,1,2``` means column_0, column_1 and column_2 will be ignored.
* Add a prefix ```name:``` for a column name, e.g. ```ignore_column=name:c1,c2,c3``` means c1, c2 and c3 will be ignored.
* Note: Indices start from ```0``` and do not count the label column.
* ```categorical_feature```, default=```""```, type=string, alias=```categorical_column```,```cat_feature```,```cat_column```
* specifies categorical features
* Use a number for the index, e.g. ```categorical_feature=0,1,2``` means column_0, column_1 and column_2 are categorical features.
* Add a prefix ```name:``` for a column name, e.g. ```categorical_feature=name:c1,c2,c3``` means c1, c2 and c3 are categorical features.
* Note: Only categorical features of ```int``` type are supported. Indices start from ```0``` and do not count the label column.
* ```predict_raw_score```, default=```false```, type=bool, alias=```raw_score```,```is_predict_raw_score```
* only used in the prediction task
* Set to ```true``` to predict only the raw scores.
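For concreteness, a minimal sketch of a config file using the `key = value` form of the format above (the data file and column names are hypothetical):

```
task = train
data = train.csv
has_header = true
label = name:target
ignore_column = name:id
predict_raw_score = false
```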
......
......@@ -5,8 +5,8 @@
- [Booster](Python-API.md#booster)
* [Training API](Python-API.md#training-api)
- [train](Python-API.md#trainparams-train_set-num_boost_round100-valid_setsnone-valid_namesnone-fobjnone-fevalnone-init_modelnone-feature_nameauto-categorical_featureauto-early_stopping_roundsnone-evals_resultnone-verbose_evaltrue-learning_ratesnone-callbacksnone)
- [cv](Python-API.md#cvparams-train_set-num_boost_round10-data_splitternone-nfold5-stratifiedfalse-shuffletrue-metricsnone-fobjnone-fevalnone-init_modelnone-feature_nameauto-categorical_featureauto-early_stopping_roundsnone-fpreprocnone-verbose_evalnone-show_stdvtrue-seed0-callbacksnone)
- [train](Python-API.md#trainparams-train_set-num_boost_round100-valid_setsnone-valid_namesnone-fobjnone-fevalnone-init_modelnone-feature_nameauto-early_stopping_roundsnone-evals_resultnone-verbose_evaltrue-learning_ratesnone-callbacksnone)
- [cv](Python-API.md#cvparams-train_set-num_boost_round10-nfold5-stratifiedfalse-shuffletrue-metricsnone-fobjnone-fevalnone-init_modelnone-feature_nameauto-early_stopping_roundsnone-fpreprocnone-verbose_evalnone-show_stdvtrue-seed0-callbacksnone)
* [Scikit-learn API](Python-API.md#scikit-learn-api)
- [Common Methods](Python-API.md#common-methods)
......@@ -33,7 +33,7 @@ The methods of each Class is in alphabetical order.
###Dataset
####__init__(data, label=None, max_bin=255, reference=None, weight=None, group=None, silent=False, feature_name='auto', categorical_feature='auto', params=None, free_raw_data=True)
####__init__(data, label=None, max_bin=255, reference=None, weight=None, group=None, silent=False, feature_name='auto', params=None, free_raw_data=True)
Parameters
----------
......@@ -55,11 +55,6 @@ The methods of each Class is in alphabetical order.
feature_name : list of str, or 'auto'
Feature names
If 'auto' and data is a pandas DataFrame, data column names are used
categorical_feature : list of str or int, or 'auto'
Categorical features;
int values are feature indices,
str values are feature names (feature_name needs to be specified as well)
If 'auto' and data is a pandas DataFrame, pandas categorical columns are used
params : dict, optional
Other parameters
free_raw_data : bool
......@@ -151,17 +146,6 @@ The methods of each Class is in alphabetical order.
Name of the output file.
####set_categorical_feature(categorical_feature)
Set categorical features.
Parameters
----------
categorical_feature : list of str or list of int
Name (str) or index (int) of categorical features
####set_feature_name(feature_name)
Set feature name.
......@@ -466,7 +450,7 @@ The methods of each Class is in alphabetical order.
##Training API
####train(params, train_set, num_boost_round=100, valid_sets=None, valid_names=None, fobj=None, feval=None, init_model=None, feature_name='auto', categorical_feature='auto', early_stopping_rounds=None, evals_result=None, verbose_eval=True, learning_rates=None, callbacks=None)
####train(params, train_set, num_boost_round=100, valid_sets=None, valid_names=None, fobj=None, feval=None, init_model=None, feature_name='auto', early_stopping_rounds=None, evals_result=None, verbose_eval=True, learning_rates=None, callbacks=None)
Train with given parameters.
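A minimal usage sketch of `train` (assuming `train_data` and `valid_data` are constructed `Dataset` objects; the parameter values are illustrative only):

```python
import lightgbm as lgb

params = {'objective': 'binary', 'metric': 'binary_logloss', 'num_leaves': 31}

# keyword arguments follow the signature shown above
booster = lgb.train(params,
                    train_data,
                    num_boost_round=100,
                    valid_sets=[valid_data],
                    early_stopping_rounds=10,
                    verbose_eval=10)
```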
......@@ -492,11 +476,6 @@ The methods of each Class is in alphabetical order.
feature_name : list of str, or 'auto'
Feature names
If 'auto' and data is a pandas DataFrame, data column names are used
categorical_feature : list of str or int, or 'auto'
Categorical features;
int values are feature indices,
str values are feature names (feature_name needs to be specified as well)
If 'auto' and data is a pandas DataFrame, pandas categorical columns are used
early_stopping_rounds: int
Activates early stopping.
Requires at least one validation dataset and one metric
......@@ -536,7 +515,7 @@ The methods of each Class is in alphabetical order.
booster : a trained booster model
####cv(params, train_set, num_boost_round=10, data_splitter=None, nfold=5, stratified=False, shuffle=True, metrics=None, fobj=None, feval=None, init_model=None, feature_name='auto', categorical_feature='auto', early_stopping_rounds=None, fpreproc=None, verbose_eval=None, show_stdv=True, seed=0, callbacks=None)
####cv(params, train_set, num_boost_round=10, nfold=5, stratified=False, shuffle=True, metrics=None, fobj=None, feval=None, init_model=None, feature_name='auto', early_stopping_rounds=None, fpreproc=None, verbose_eval=None, show_stdv=True, seed=0, callbacks=None)
Cross-validation with given parameters.
......@@ -567,11 +546,6 @@ The methods of each Class is in alphabetical order.
feature_name : list of str, or 'auto'
Feature names
If 'auto' and data is a pandas DataFrame, data column names are used
categorical_feature : list of str or int, or 'auto'
Categorical features;
int values are feature indices,
str values are feature names (feature_name needs to be specified as well)
If 'auto' and data is a pandas DataFrame, pandas categorical columns are used
early_stopping_rounds: int
Activates early stopping. CV error needs to decrease at least
every <early_stopping_rounds> round(s) to continue.
......@@ -721,7 +695,7 @@ The methods of each Class is in alphabetical order.
X_leaves : array_like, shape=[n_samples, n_trees]
####fit(X, y, sample_weight=None, init_score=None, group=None, eval_set=None, eval_sample_weight=None, eval_init_score=None, eval_group=None, eval_metric=None, early_stopping_rounds=None, verbose=True, feature_name='auto', categorical_feature='auto', callbacks=None)
####fit(X, y, sample_weight=None, init_score=None, group=None, eval_set=None, eval_sample_weight=None, eval_init_score=None, eval_group=None, eval_metric=None, early_stopping_rounds=None, verbose=True, feature_name='auto', callbacks=None)
Fit the gradient boosting model.
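A short sketch of this scikit-learn style `fit` (assuming `LGBMClassifier` from the scikit-learn API section; `X` and `y` are hypothetical numpy arrays):

```python
import numpy as np
from lightgbm import LGBMClassifier

X = np.random.rand(200, 4)
y = np.random.randint(2, size=200)

clf = LGBMClassifier(n_estimators=50)
# eval_set/eval_metric/early_stopping_rounds as in the signature above
clf.fit(X, y,
        eval_set=[(X, y)],
        eval_metric='binary_logloss',
        early_stopping_rounds=5,
        feature_name=['f1', 'f2', 'f3', 'f4'])
clf.predict(X[:5])
```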
......@@ -755,11 +729,6 @@ The methods of each Class is in alphabetical order.
feature_name : list of str, or 'auto'
Feature names
If 'auto' and data is a pandas DataFrame, data column names are used
categorical_feature : list of str or int, or 'auto'
Categorical features;
int values are feature indices,
str values are feature names (feature_name needs to be specified as well)
If 'auto' and data is a pandas DataFrame, pandas categorical columns are used
callbacks : list of callback functions
List of callback functions that are applied at each iteration.
See Callbacks in Python-API.md for more information.
......
......@@ -68,13 +68,6 @@ test_data = lgb.Dataset('test.svm', reference=train_data)
In LightGBM, the validation data should be aligned with the training data.
#### Specific feature names and categorical features
```python
train_data = lgb.Dataset(data, label=label, feature_name=['c1', 'c2', 'c3'], categorical_feature=['c3'])
```
LightGBM can use categorical features as input directly; they do not need to be converted to one-hot encoding, which makes training much faster than with one-hot encoding (about an 8x speed-up).
**Note: You should convert your categorical features to int type before you construct `Dataset`.**
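A minimal sketch of the int conversion the note asks for (assuming a hypothetical pandas DataFrame `df` whose column `c3` holds strings):

```python
import pandas as pd
import lightgbm as lgb

df = pd.DataFrame({'c1': [1.0, 2.0], 'c2': [0.5, 0.1], 'c3': ['red', 'blue']})

# encode the string category as integer codes before constructing the Dataset
df['c3'] = df['c3'].astype('category').cat.codes

train_data = lgb.Dataset(df.values, label=[0, 1],
                         feature_name=['c1', 'c2', 'c3'],
                         categorical_feature=['c3'])
```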
#### Weights can be set when needed:
```python
......
......@@ -14,7 +14,7 @@ LightGBM supports input data file with [CSV](https://en.wikipedia.org/wiki/Comma
The label is the first column, and there is no header in the file.
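For illustration, a tiny CSV in this layout (label first, no header; the values are made up):

```
1,0.12,3.4,5
0,0.98,1.1,2
1,0.55,2.7,8
```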
### Categorical feature support
### [deprecated in v2] Categorical feature support
update 12/5/2016:
......
......@@ -29,7 +29,6 @@ Examples including:
- Feature importances with sklearn interface
- [advanced_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/advanced_example.py)
- Set feature names
- Directly use categorical features without one-hot encoding
- Load model file to continue training
- Change learning rates during training
- Self-defined objective function
......
......@@ -42,13 +42,11 @@ params = {
feature_name = ['feature_' + str(col) for col in range(num_feature)]
print('Start training...')
# feature_name
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
valid_sets=lgb_train, # eval training data
feature_name=feature_name,
categorical_feature=[21])
feature_name=feature_name)
# check feature name
print('Finish first 10 rounds...')
......
......@@ -38,7 +38,6 @@ gbm = lgb.train(params,
num_boost_round=100,
valid_sets=[lgb_train, lgb_test],
feature_name=['f' + str(i + 1) for i in range(28)],
categorical_feature=[21],
evals_result=evals_result,
verbose_eval=10)
......@@ -50,6 +49,6 @@ print('Plot feature importances...')
ax = lgb.plot_importance(gbm, max_num_features=10)
plt.show()
print('Plot 84th tree...') # one tree uses a categorical feature to split
print('Plot 84th tree...')
ax = lgb.plot_tree(gbm, tree_index=83, figsize=(20, 8), show_info=['split_gain'])
plt.show()
......@@ -12,11 +12,6 @@
namespace LightGBM {
enum BinType {
NumericalBin,
CategoricalBin
};
/*! \brief Store data for one histogram bin */
struct HistogramBinEntry {
public:
......@@ -26,7 +21,6 @@ public:
double sum_hessians = 0.0f;
/*! \brief Number of data on this bin */
data_size_t cnt = 0;
/*!
* \brief Sum up (reducers) functions for histogram bin
*/
......@@ -59,24 +53,14 @@ public:
explicit BinMapper(const void* memory);
~BinMapper();
static double kSparseThreshold;
bool CheckAlign(const BinMapper& other) const {
if (num_bin_ != other.num_bin_) {
return false;
}
if (bin_type_ != other.bin_type_) {
return false;
}
if (bin_type_ == BinType::NumericalBin) {
for (int i = 0; i < num_bin_; ++i) {
if (bin_upper_bound_[i] != other.bin_upper_bound_[i]) {
return false;
}
}
} else {
for (int i = 0; i < num_bin_; i++) {
if (bin_2_categorical_[i] != other.bin_2_categorical_[i]) {
return false;
}
for (int i = 0; i < num_bin_; ++i) {
if (bin_upper_bound_[i] != other.bin_upper_bound_[i]) {
return false;
}
}
return true;
......@@ -98,12 +82,8 @@ public:
* \param bin
* \return Feature value of this bin
*/
inline double BinToValue(unsigned int bin) const {
if (bin_type_ == BinType::NumericalBin) {
return bin_upper_bound_[bin];
} else {
return bin_2_categorical_[bin];
}
inline double BinToValue(uint32_t bin) const {
return bin_upper_bound_[bin];
}
/*!
* \brief Get size in bytes of this object
......@@ -114,27 +94,24 @@ public:
* \param value
* \return bin for this feature value
*/
inline unsigned int ValueToBin(double value) const;
inline uint32_t ValueToBin(double value) const;
/*!
* \brief Get the default bin when value is 0 or is the first categorical
* \brief Get the default bin when value is 0
* \return default bin
*/
inline uint32_t GetDefaultBin() const {
if (bin_type_ == BinType::NumericalBin) {
return ValueToBin(0);
} else {
return 0;
}
return default_bin_;
}
/*!
* \brief Construct feature value to bin mapper according to feature values
* \param values (Sampled) values of this feature; note: zeros are not included
* \param total_sample_cnt total number of samples, equal to values.size() + num_zeros
* \param max_bin The maximum number of bins
* \param min_data_in_bin minimum number of data points in one bin
* \param min_split_data minimum number of data points required for a split
*/
void FindBin(std::vector<double>* values, size_t total_sample_cnt, int max_bin, BinType bin_type);
void FindBin(std::vector<double>& values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data);
/*!
* \brief Use a specific number of bins to calculate the size of this class
......@@ -155,21 +132,14 @@ public:
*/
void CopyFrom(const char* buffer);
/*!
* \brief Get bin types
*/
inline BinType bin_type() const { return bin_type_; }
/*!
* \brief Get bin info
*/
inline std::string bin_info() const {
if (bin_type_ == BinType::CategoricalBin) {
return Common::Join(bin_2_categorical_, ",");
} else {
std::stringstream str_buf;
str_buf << '[' << min_val_ << ',' << max_val_ << ']';
return str_buf.str();
}
std::stringstream str_buf;
str_buf << '[' << min_val_ << ',' << max_val_ << ']';
return str_buf.str();
}
private:
/*! \brief Number of bins */
int num_bin_;
......@@ -179,16 +149,12 @@ private:
bool is_trival_;
/*! \brief Sparse rate of this bin (num_bin0 / num_data) */
double sparse_rate_;
/*! \brief Type of this bin */
BinType bin_type_;
/*! \brief Mapper from categorical to bin */
std::unordered_map<int, unsigned int> categorical_2_bin_;
/*! \brief Mapper from bin to categorical */
std::vector<int> bin_2_categorical_;
/*! \brief minimal feature value */
double min_val_;
/*! \brief maximum feature value */
double max_val_;
/*! \brief bin value of feature value 0 */
uint32_t default_bin_;
};
/*!
......@@ -228,9 +194,12 @@ public:
* \brief Split the current bin and re-order by leaf
* \param leaf The leaf whose data is used to split
* \param right_leaf The new leaf index after performing this split
* \param left_indices left_indices[i] == true means the i-th data will be in the left leaf after the split
* \param is_in_leaf is_in_leaf[i] == mark means the i-th data will be in the left leaf after the split
* \param mark is_in_leaf[i] == mark means the i-th data will be in the left leaf after the split
*/
virtual void Split(int leaf, int right_leaf, const char* left_indices) = 0;
virtual void Split(int leaf, int right_leaf, const char* is_in_leaf, char mark) = 0;
virtual data_size_t NonZeroCount(int leaf) const = 0;
};
/*! \brief Iterator for one bin column */
......@@ -238,10 +207,12 @@ class BinIterator {
public:
/*!
* \brief Get bin data at a specific row index
* \param idx Index of this data
* \return Bin data
*/
virtual uint32_t Get(data_size_t idx) = 0;
virtual void Reset(data_size_t idx) = 0;
virtual ~BinIterator() = default;
};
......@@ -266,11 +237,14 @@ public:
virtual void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) = 0;
/*!
* \brief Get bin iterator of this bin
* \brief Get bin iterator of this bin for a specific feature
* \param min_bin min_bin of the currently used feature
* \param max_bin max_bin of the currently used feature
* \param default_bin default bin if the bin is not in [min_bin, max_bin]
* \return Iterator of this bin
*/
virtual BinIterator* GetIterator(data_size_t start_idx) const = 0;
virtual BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const = 0;
/*!
* \brief Save binary data to file
......@@ -315,6 +289,10 @@ public:
/*!
* \brief Split data according to threshold: if bin <= threshold, the data goes into the left (lte_indices), otherwise into the right (gt_indices)
* \param min_bin min_bin of the currently used feature
* \param max_bin max_bin of the currently used feature
* \param default_bin default bin if the bin is not in [min_bin, max_bin]
* \param threshold The split threshold.
* \param data_indices Used data indices. After this function is called, the less-than-or-equal data indices are stored in this object.
* \param num_data Number of used data
......@@ -322,8 +300,8 @@ public:
* \param gt_indices After this function is called, the greater data indices are stored in this object.
* \return The number of less-than-or-equal data points.
*/
virtual data_size_t Split(
unsigned int threshold,
virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin,
uint32_t default_bin, uint32_t threshold,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const = 0;
......@@ -351,8 +329,7 @@ public:
* \return The bin data object
*/
static Bin* CreateBin(data_size_t num_data, int num_bin,
double sparse_rate, bool is_enable_sparse,
bool* is_sparse, uint32_t default_bin, BinType bin_type);
double sparse_rate, bool is_enable_sparse, bool* is_sparse);
/*!
* \brief Create object for bin data of one feature, used for dense features
......@@ -362,8 +339,7 @@ public:
* \return The bin data object
*/
static Bin* CreateDenseBin(data_size_t num_data, int num_bin,
uint32_t default_bin, BinType bin_type);
static Bin* CreateDenseBin(data_size_t num_data, int num_bin);
/*!
* \brief Create object for bin data of one feature, used for sparse features
......@@ -373,32 +349,22 @@ public:
* \return The bin data object
*/
static Bin* CreateSparseBin(data_size_t num_data,
int num_bin, uint32_t default_bin, BinType bin_type);
static Bin* CreateSparseBin(data_size_t num_data, int num_bin);
};
inline unsigned int BinMapper::ValueToBin(double value) const {
inline uint32_t BinMapper::ValueToBin(double value) const {
// binary search to find bin
if (bin_type_ == BinType::NumericalBin) {
int l = 0;
int r = num_bin_ - 1;
while (l < r) {
int m = (r + l - 1) / 2;
if (value <= bin_upper_bound_[m]) {
r = m;
} else {
l = m + 1;
}
}
return l;
} else {
int int_value = static_cast<int>(value);
if (categorical_2_bin_.count(int_value)) {
return categorical_2_bin_.at(int_value);
int l = 0;
int r = num_bin_ - 1;
while (l < r) {
int m = (r + l - 1) / 2;
if (value <= bin_upper_bound_[m]) {
r = m;
} else {
return num_bin_ - 1;
l = m + 1;
}
}
return l;
}
} // namespace LightGBM
......
......@@ -100,10 +100,14 @@ public:
bool use_two_round_loading = false;
bool is_save_binary_file = false;
bool enable_load_from_binary_file = true;
int bin_construct_sample_cnt = 50000;
int bin_construct_sample_cnt = 200000;
bool is_predict_leaf_index = false;
bool is_predict_raw_score = false;
int min_data_in_leaf = 100;
int min_data_in_bin = 5;
double max_conflict_rate = 0.0000f;
bool enable_bundle = true;
bool adjacent_bundle = false;
bool has_header = false;
/*! \brief Index or column name of label, default is the first column
* Add a prefix "name:" when using a column name */
......@@ -120,10 +124,6 @@ public:
* Add a prefix "name:" when using a column name
* Note: when using an index, it doesn't count the label index */
std::string ignore_column = "";
/*! \brief specifies categorical columns; note: only integer-type categorical features are supported
* Add a prefix "name:" when using a column name
* Note: when using an index, it doesn't count the label index */
std::string categorical_column = "";
LIGHTGBM_EXPORT void Set(const std::unordered_map<std::string, std::string>& params) override;
};
......@@ -207,6 +207,8 @@ public:
bool xgboost_dart_mode = false;
bool uniform_drop = false;
int drop_seed = 4;
double top_rate = 0.2f;
double other_rate = 0.1f;
std::string tree_learner_type = "serial";
TreeConfig tree_config;
LIGHTGBM_EXPORT void Set(const std::unordered_map<std::string, std::string>& params) override;
......@@ -379,9 +381,6 @@ struct ParameterAlias {
{ "query_column", "group_column" },
{ "ignore_feature", "ignore_column" },
{ "blacklist", "ignore_column" },
{ "categorical_feature", "categorical_column" },
{ "cat_column", "categorical_column" },
{ "cat_feature", "categorical_column" },
{ "predict_raw_score", "is_predict_raw_score" },
{ "predict_leaf_index", "is_predict_leaf_index" },
{ "raw_score", "is_predict_raw_score" },
......
......@@ -6,7 +6,7 @@
#include <LightGBM/meta.h>
#include <LightGBM/config.h>
#include <LightGBM/feature.h>
#include <LightGBM/feature_group.h>
#include <vector>
#include <utility>
......@@ -19,7 +19,6 @@ namespace LightGBM {
/*! \brief forward declaration */
class DatasetLoader;
/*!
* \brief This class is used to store some meta (non-feature) data for training data,
* e.g. labels, weights, initial scores, query-level information.
......@@ -285,6 +284,12 @@ public:
LIGHTGBM_EXPORT Dataset(data_size_t num_data);
void Construct(
std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
const std::vector<std::vector<int>>& sample_non_zero_indices,
size_t total_sample_cnt,
const IOConfig& io_config);
/*! \brief Destructor */
LIGHTGBM_EXPORT ~Dataset();
......@@ -299,7 +304,7 @@ public:
return false;
}
for (int i = 0; i < num_features_; ++i) {
if (!features_[i]->CheckAlign(*(other.features_[i].get()))) {
if (!FeatureBinMapper(i)->CheckAlign(*(other.FeatureBinMapper(i)))) {
return false;
}
}
......@@ -310,7 +315,9 @@ public:
for (size_t i = 0; i < feature_values.size() && i < static_cast<size_t>(num_total_features_); ++i) {
int feature_idx = used_feature_map_[i];
if (feature_idx >= 0) {
features_[feature_idx]->PushData(tid, row_idx, feature_values[i]);
const int group = feature2group_[feature_idx];
const int sub_feature = feature2subfeature_[feature_idx];
feature_groups_[group]->PushData(tid, sub_feature, row_idx, feature_values[i]);
}
}
}
......@@ -320,14 +327,33 @@ public:
if (inner_data.first >= num_total_features_) { continue; }
int feature_idx = used_feature_map_[inner_data.first];
if (feature_idx >= 0) {
features_[feature_idx]->PushData(tid, row_idx, inner_data.second);
const int group = feature2group_[feature_idx];
const int sub_feature = feature2subfeature_[feature_idx];
feature_groups_[group]->PushData(tid, sub_feature, row_idx, inner_data.second);
}
}
}
inline int GetInnerFeatureIndex(int col_idx) const {
inline void PushOneData(int tid, data_size_t row_idx, int group, int sub_feature, double value) {
feature_groups_[group]->PushData(tid, sub_feature, row_idx, value);
}
inline int RealFeatureIndex(int fidx) const {
return real_feature_idx_[fidx];
}
inline int InnerFeatureIndex(int col_idx) const {
return used_feature_map_[col_idx];
}
inline int Feature2Group(int feature_idx) const {
return feature2group_[feature_idx];
}
inline int Feature2SubFeature(int feature_idx) const {
return feature2subfeature_[feature_idx];
}
inline uint64_t NumTotalBin() const {
return group_bin_boundaries_.back();
}
void ReSize(data_size_t num_data);
......@@ -354,12 +380,70 @@ public:
LIGHTGBM_EXPORT void CopyFeatureMapperFrom(const Dataset* dataset);
/*!
* \brief Get a feature pointer for specific index
* \param i Index for feature
* \return Pointer of feature
*/
inline Feature* FeatureAt(int i) const { return features_[i].get(); }
LIGHTGBM_EXPORT void CreateValid(const Dataset* dataset);
void ConstructHistograms(
const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices, data_size_t num_data,
int leaf_idx,
std::vector<std::unique_ptr<OrderedBin>>& ordered_bins,
const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
HistogramBinEntry* histogram_data) const;
void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, data_size_t num_data,
HistogramBinEntry* data) const;
inline data_size_t Split(
int feature,
uint32_t threshold,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const {
const int group = feature2group_[feature];
const int sub_feature = feature2subfeature_[feature];
return feature_groups_[group]->Split(sub_feature, threshold, data_indices, num_data, lte_indices, gt_indices);
}
inline int SubFeatureBinOffset(int i) const {
const int sub_feature = feature2subfeature_[i];
if (sub_feature == 0) {
return 1;
} else {
return 0;
}
}
inline int FeatureNumBin(int i) const {
const int group = feature2group_[i];
const int sub_feature = feature2subfeature_[i];
return feature_groups_[group]->bin_mappers_[sub_feature]->num_bin();
}
inline const BinMapper* FeatureBinMapper(int i) const {
const int group = feature2group_[i];
const int sub_feature = feature2subfeature_[i];
return feature_groups_[group]->bin_mappers_[sub_feature].get();
}
inline BinIterator* FeatureIterator(int i) const {
const int group = feature2group_[i];
const int sub_feature = feature2subfeature_[i];
return feature_groups_[group]->SubFeatureIterator(sub_feature);
}
inline double RealThreshold(int i, uint32_t threshold) const {
const int group = feature2group_[i];
const int sub_feature = feature2subfeature_[i];
return feature_groups_[group]->bin_mappers_[sub_feature]->BinToValue(threshold);
}
inline void CreateOrderedBins(std::vector<std::unique_ptr<OrderedBin>>* ordered_bins) const {
ordered_bins->resize(num_groups_);
#pragma omp parallel for schedule(guided)
for (int i = 0; i < num_groups_; ++i) {
ordered_bins->at(i).reset(feature_groups_[i]->bin_data_->CreateOrderedBin());
}
}
/*!
* \brief Get meta data pointer
......@@ -398,7 +482,7 @@ public:
private:
const char* data_filename_;
/*! \brief Store used features */
std::vector<std::unique_ptr<Feature>> features_;
std::vector<std::unique_ptr<FeatureGroup>> feature_groups_;
/*! \brief Mapper from real feature index to used index*/
std::vector<int> used_feature_map_;
/*! \brief Number of used features*/
......@@ -415,6 +499,13 @@ private:
std::vector<std::string> feature_names_;
/*! \brief store feature names */
static const char* binary_file_token;
int num_groups_;
std::vector<int> real_feature_idx_;
std::vector<int> feature2group_;
std::vector<int> feature2subfeature_;
std::vector<uint64_t> group_bin_boundaries_;
std::vector<int> group_feature_start_;
std::vector<int> group_feature_cnt_;
};
} // namespace LightGBM
......