Add categorical feature support back.

ef778069 · Guolin Ke · d93eb338 · ef778069 · ef778069 · ef778069
Commit ef778069 authored Mar 01, 2017 by Guolin Ke
20 changed files
--- a/R-package/NAMESPACE
+++ b/R-package/NAMESPACE
@@ -12,6 +12,7 @@ export(lgb.Dataset)
 export(lgb.Dataset.construct)
 export(lgb.Dataset.create.valid)
 export(lgb.Dataset.save)
+export(lgb.Dataset.set.categorical)
 export(lgb.Dataset.set.reference)
 export(lgb.cv)
 export(lgb.dump)

--- a/R-package/R/lgb.Dataset.R
+++ b/R-package/R/lgb.Dataset.R
@@ -12,6 +12,7 @@ Dataset <- R6Class(
                          params              = list(),
                          reference           = NULL,
                          colnames            = NULL,
+                          categorical_feature = NULL,
                          predictor           = NULL,
                          free_raw_data       = TRUE,
                          used_indices        = NULL,
@@ -41,6 +42,7 @@ Dataset <- R6Class(
      private$reference <- reference
      private$colnames  <- colnames

+      private$categorical_feature <- categorical_feature
      private$predictor           <- predictor
      private$free_raw_data       <- free_raw_data
      private$used_indices        <- used_indices
@@ -52,6 +54,7 @@ Dataset <- R6Class(
        private$params,
        self,
        private$colnames,
+        private$categorical_feature,
        private$predictor,
        private$free_raw_data,
        NULL,
@@ -73,6 +76,21 @@ Dataset <- R6Class(
      if (is.null(private$colnames) && !is.null(cnames)) {
        private$colnames <- as.character(cnames)
      }
+      # Get categorical feature index
+      if (!is.null(private$categorical_feature)) {
+        if (typeof(private$categorical_feature) == "character") {
+            cate_indices <- as.list(match(private$categorical_feature, private$colnames) - 1)
+            if (sum(is.na(cate_indices)) > 0) {
+              stop("lgb.self.get.handle: supplied an unknown feature in categorical_feature: ", sQuote(private$categorical_feature[is.na(cate_indices)]))
+            }
+          } else {
+            if (max(private$categorical_feature) > length(private$colnames)) {
+              stop("lgb.self.get.handle: supplied a too large value in categorical_feature: ", max(private$categorical_feature), " but only ", length(private$colnames), " features")
+            }
+            cate_indices <- as.list(private$categorical_feature - 1)
+          }
+        private$params$categorical_feature <- cate_indices
+      }
      # Check has header or not
      has_header <- FALSE
      if (!is.null(private$params$has_header) ||
@@ -271,6 +289,7 @@ Dataset <- R6Class(
        private$params,
        self,
        private$colnames,
+        private$categorical_feature,
        private$predictor,
        private$free_raw_data,
        idxset,
@@ -282,7 +301,20 @@ Dataset <- R6Class(
      private$params <- modifyList(private$params, params)
      self
    },
+    # Replace the categorical feature specification stored on this dataset.
+    # categorical_feature: the new specification; it is stored exactly as given.
+    # Returns the dataset itself (self) so that calls can be chained.
+    # Stops with an error when the specification changes but no raw data is held.
+    set_categorical_feature = function(categorical_feature) {
+      # Nothing to do when the specification is unchanged
+      if (identical(private$categorical_feature, categorical_feature)) { return(self) }
+      if (is.null(private$raw_data)) {
+        # Keep each literal on a single source line: a string broken across lines
+        # would carry the line break and the indentation into the error message
+        stop(
+          "set_categorical_feature: cannot set categorical feature after freeing raw data, please set ",
+          sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset"
+        )
+      }
+      private$categorical_feature <- categorical_feature
+      # NOTE(review): finalize() presumably drops the current handle so that the new
+      # setting is applied on the next construct - confirm against finalize()
+      self$finalize()
+      self
+    },
    set_reference = function(reference) {
+      self$set_categorical_feature(reference$.__enclos_env__$private$categorical_feature)
      self$set_colnames(reference$get_colnames())
      private$set_predictor(reference$.__enclos_env__$private$predictor)
      if (identical(private$reference, reference)) { return(self) }
@@ -316,6 +348,7 @@ Dataset <- R6Class(
    params              = list(),
    reference           = NULL,
    colnames            = NULL,
+    categorical_feature = NULL,
    predictor           = NULL,
    free_raw_data       = TRUE,
    used_indices        = NULL,
@@ -353,6 +386,7 @@ Dataset <- R6Class(
 #' @param params a list of parameters
 #' @param reference reference dataset
 #' @param colnames names of columns
+#' @param categorical_feature categorical features
 #' @param free_raw_data TRUE for need to free raw data after construct
 #' @param info a list of information of the lgb.Dataset object
 #' @param ... other information to pass to \code{info} or parameters pass to \code{params}
@@ -371,6 +405,7 @@ lgb.Dataset <- function(data,
                        params              = list(),
                        reference           = NULL,
                        colnames            = NULL,
+                        categorical_feature = NULL,
                        free_raw_data       = TRUE,
                        info                = list(),
                        ...) {
@@ -379,6 +414,7 @@ lgb.Dataset <- function(data,
    params,
    reference,
    colnames,
+    categorical_feature,
    NULL,
    free_raw_data,
    NULL,
@@ -628,6 +664,29 @@ setinfo.lgb.Dataset <- function(dataset, name, info, ...) {
  dataset$setinfo(name, info)
 }

+#' Set categorical feature of \code{lgb.Dataset}
+#'
+#' Checks that \code{dataset} is an \code{lgb.Dataset} and forwards
+#' \code{categorical_feature} to its \code{set_categorical_feature} method.
+#'
+#' @param dataset object of class \code{lgb.Dataset}
+#' @param categorical_feature categorical features
+#' @return passed dataset
+#' @examples
+#' \dontrun{
+#'   data(agaricus.train, package='lightgbm')
+#'   train <- agaricus.train
+#'   dtrain <- lgb.Dataset(train$data, label=train$label)
+#'   lgb.Dataset.save(dtrain, 'lgb.Dataset.data')
+#'   dtrain <- lgb.Dataset('lgb.Dataset.data')
+#'   lgb.Dataset.set.categorical(dtrain, 1:2)
+#' }
+#' @rdname lgb.Dataset.set.categorical
+#' @export
+lgb.Dataset.set.categorical <- function(dataset, categorical_feature) {
+  # Happy path first: delegate to the dataset's own setter
+  if (lgb.is.Dataset(dataset)) {
+    return(dataset$set_categorical_feature(categorical_feature))
+  }
+  # Anything else is rejected
+  stop("lgb.Dataset.set.categorical: input dataset should be an lgb.Dataset object")
+}
+
 #' Set reference of \code{lgb.Dataset}
 #'
 #' If you want to use validation data, you should set reference to training data

--- a/R-package/R/lgb.cv.R
+++ b/R-package/R/lgb.cv.R
@@ -46,6 +46,9 @@ CVBooster <- R6Class(
 #'        the \code{nfold} and \code{stratified} parameters are ignored.
 #' @param init_model path of model file of \code{lgb.Booster} object, will continue train from this model
 #' @param colnames feature names, if not null, will use this to overwrite the names in dataset
+#' @param categorical_feature list of str or int
+#'        type int represents index,
+#'        type str represents feature names
 #' @param early_stopping_rounds int
 #'        Activates early stopping.
 #'        Requires at least one validation data and one metric
@@ -81,6 +84,7 @@ lgb.cv <- function(params=list(), data, nrounds = 10,
                   folds                 = NULL,
                   init_model            = NULL,
                   colnames              = NULL,
+                   categorical_feature   = NULL,
                   early_stopping_rounds = NULL,
                   callbacks             = list(), ...) {
  addiction_params <- list(...)
@@ -118,6 +122,7 @@ lgb.cv <- function(params=list(), data, nrounds = 10,
  data$update_params(params)
  data$.__enclos_env__$private$set_predictor(predictor)
  if (!is.null(colnames)) { data$set_colnames(colnames) }
+  if (!is.null(categorical_feature)) { data$set_categorical_feature(categorical_feature) }
  data$construct()

  if (!is.null(folds)) {

--- a/R-package/R/lgb.model.dt.tree.R
+++ b/R-package/R/lgb.model.dt.tree.R
@@ -19,6 +19,7 @@
 #'  \item \code{leaf_parent}: ID of the parent node for current leaf (integer)
 #'  \item \code{split_gain}: Split gain of a node
 #'  \item \code{threshold}: Spliting threshold value of a node
+#'  \item \code{decision_type}: Decision type of a node
 #'  \item \code{internal_value}: Node value
 #'  \item \code{internal_count}: The number of observation collected by a node
 #'  \item \code{leaf_value}: Leaf value
@@ -62,14 +63,14 @@ single.tree.parse <- function(lgb_tree) {
  single_tree_dt <- data.table::data.table(tree_index = integer(0),
                                           split_index = integer(0), split_feature = integer(0), node_parent = integer(0),
                                           leaf_index = integer(0), leaf_parent = integer(0),
-                                           split_gain = numeric(0), threshold = numeric(0),
+                                           split_gain = numeric(0), threshold = numeric(0), decision_type = character(0),
                                           internal_value = integer(0), internal_count = integer(0),
                                           leaf_value = integer(0), leaf_count = integer(0))
  pre_order_traversal <- function(tree_node_leaf, parent_index = NA) {
    if (!is.null(tree_node_leaf$split_index)) {
      single_tree_dt <<- data.table::rbindlist(l = list(single_tree_dt,
                                                        c(tree_node_leaf[c("split_index", "split_feature",
-                                                                           "split_gain", "threshold",
+                                                                           "split_gain", "threshold", "decision_type",
                                                                           "internal_value", "internal_count")],
                                                          "node_parent" = parent_index)),
                                               use.names = TRUE, fill = TRUE)

--- a/R-package/R/lgb.train.R
+++ b/R-package/R/lgb.train.R
@@ -18,6 +18,9 @@
 #' @param eval_freq evalutaion output frequency, only effect when verbose > 0
 #' @param init_model path of model file of \code{lgb.Booster} object, will continue training from this model
 #' @param colnames feature names, if not null, will use this to overwrite the names in dataset
+#' @param categorical_feature list of str or int
+#'        type int represents index,
+#'        type str represents feature names
 #' @param early_stopping_rounds int
 #'        Activates early stopping.
 #'        Requires at least one validation data and one metric
@@ -52,6 +55,7 @@ lgb.train <- function(params = list(), data, nrounds = 10,
                      eval_freq             = 1L,
                      init_model            = NULL,
                      colnames              = NULL,
+                      categorical_feature   = NULL,
                      early_stopping_rounds = NULL,
                      callbacks             = list(), ...) {
  additional_params <- list(...)
@@ -96,6 +100,7 @@ lgb.train <- function(params = list(), data, nrounds = 10,
  data$update_params(params)
  data$.__enclos_env__$private$set_predictor(predictor)
  if (!is.null(colnames)) { data$set_colnames(colnames) }
+  if (!is.null(categorical_feature)) { data$set_categorical_feature(categorical_feature) }
  data$construct()
  vaild_contain_train <- FALSE
  train_data_name     <- "train"

--- a/R-package/man/lgb.Dataset.set.categorical.Rd
+++ b/R-package/man/lgb.Dataset.set.categorical.Rd
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lgb.Dataset.R
+\name{lgb.Dataset.set.categorical}
+\alias{lgb.Dataset.set.categorical}
+\title{Set categorical feature of \code{lgb.Dataset}}
+\usage{
+lgb.Dataset.set.categorical(dataset, categorical_feature)
+}
+\arguments{
+\item{dataset}{object of class \code{lgb.Dataset}}
+
+\item{categorical_feature}{categorical features}
+}
+\value{
+passed dataset
+}
+\description{
+Set categorical feature of \code{lgb.Dataset}
+}
+\examples{
+\dontrun{
+  data(agaricus.train, package='lightgbm')
+  train <- agaricus.train
+  dtrain <- lgb.Dataset(train$data, label=train$label)
+  lgb.Dataset.save(dtrain, 'lgb.Dataset.data')
+  dtrain <- lgb.Dataset('lgb.Dataset.data')
+  lgb.Dataset.set.categorical(dtrain, 1:2)
+}
+}
+
--- a/R-package/man/lgb.model.dt.tree.Rd
+++ b/R-package/man/lgb.model.dt.tree.Rd
@@ -24,6 +24,7 @@ The columns of the \code{data.table} are:
 \item \code{leaf_parent}: ID of the parent node for current leaf (integer)
 \item \code{split_gain}: Split gain of a node
 \item \code{threshold}: Spliting threshold value of a node
+ \item \code{decision_type}: Decision type of a node
 \item \code{internal_value}: Node value
 \item \code{internal_count}: The number of observation collected by a node
 \item \code{leaf_value}: Leaf value

--- a/R-package/man/lgb.train.Rd
+++ b/R-package/man/lgb.train.Rd
@@ -9,12 +9,12 @@
 lgb.cv(params = list(), data, nrounds = 10, nfold = 3, label = NULL,
  weight = NULL, obj = NULL, eval = NULL, verbose = 1, record = TRUE,
  eval_freq = 1L, showsd = TRUE, stratified = TRUE, folds = NULL,
-  init_model = NULL, colnames = NULL, 
+  init_model = NULL, colnames = NULL, categorical_feature = NULL,
  early_stopping_rounds = NULL, callbacks = list(), ...)

 lgb.train(params = list(), data, nrounds = 10, valids = list(),
  obj = NULL, eval = NULL, verbose = 1, record = TRUE, eval_freq = 1L,
-  init_model = NULL, colnames = NULL, 
+  init_model = NULL, colnames = NULL, categorical_feature = NULL,
  early_stopping_rounds = NULL, callbacks = list(), ...)

 lightgbm(data, label = NULL, weight = NULL, params = list(),
@@ -60,6 +60,10 @@ the \code{nfold} and \code{stratified} parameters are ignored.}

 \item{colnames}{feature names, if not null, will use this to overwrite the names in dataset}

+\item{categorical_feature}{list of str or int
+type int represents index,
+type str represents feature names}
+
 \item{early_stopping_rounds}{int
 Activates early stopping.
 Requires at least one validation data and one metric
@@ -114,6 +118,10 @@ Tree still grow by leaf-wise.}

 \item{colnames}{feature names, if not null, will use this to overwrite the names in dataset}

+\item{categorical_feature}{list of str or int
+type int represents index,
+type str represents feature names}
+
 \item{early_stopping_rounds}{int
 Activates early stopping.
 Requires at least one validation data and one metric

--- a/README.md
+++ b/README.md
@@ -20,8 +20,7 @@ News

 01/08/2017 : Release [**R-package**](./R-package) beta version, welcome to have a try and provide feedback.

-12/05/2016 : [deprecated in v2]**Categorical Features as input directly**(without one-hot coding). Experiment on [Expo data](http://stat-computing.org/dataexpo/2009/) shows about 8x speed-up with same accuracy compared with one-hot coding (refer to [categorical log]( https://github.com/guolinke/boosting_tree_benchmarks/blob/master/lightgbm/lightgbm_dataexpo_speed.log) and [one-hot log]( https://github.com/guolinke/boosting_tree_benchmarks/blob/master/lightgbm/lightgbm_dataexpo_onehot_speed.log)).
-For the setting details, please refer to [IO Parameters](./docs/Parameters.md#io-parameters).
+12/05/2016 : **Categorical Features as input directly**(without one-hot coding). Experiment on [Expo data](http://stat-computing.org/dataexpo/2009/) shows about 8x speed-up with same accuracy compared with one-hot coding.

 12/02/2016 : Release [**python-package**](./python-package) beta version, welcome to have a try and provide feedback.


--- a/docs/FAQ.md
+++ b/docs/FAQ.md
@@ -20,11 +20,11 @@ LightGBM FAQ

 - **Solution 1**: this error should be solved in latest version. If you still meet this error, try to remove lightgbm.egg-info folder in your python-package and reinstall, or check [this thread on stackoverflow](http://stackoverflow.com/questions/18085571/pip-install-error-setup-script-specifies-an-absolute-path).

- **Question 2**: I see error messages like `Cannot get/set label/weight/init_score/group/num_data/num_feature before construct dataset`, but I already contruct dataset by some code like `train = lightgbm.Dataset(X_train, y_train)`, or error messages like `Cannot set predictor/reference after freed raw data, set free_raw_data=False when construct Dataset to avoid this.`.
+- **Question 2**: I see error messages like `Cannot get/set label/weight/init_score/group/num_data/num_feature before construct dataset`, but I already contruct dataset by some code like `train = lightgbm.Dataset(X_train, y_train)`, or error messages like `Cannot set predictor/reference/categorical feature after freed raw data, set free_raw_data=False when construct Dataset to avoid this.`.

 - **Solution 2**: Because LightGBM contructs bin mappers to build trees, and train and valid Datasets within one Booster share the same bin mappers, categorical features and feature names etc., the Dataset objects are constructed when contruct a Booster. And if you set free_raw_data=True (default), the raw data (with python data struct) will be freed. So, if you want to:

  + get label(or weight/init_score/group) before contruct dataset, it's same as get `self.label`
  + set label(or weight/init_score/group) before contruct dataset, it's same as `self.label=some_label_array`
  + get num_data(or num_feature) before contruct dataset, you can get data with `self.data`, then if your data is `numpy.ndarray`, use some code like `self.data.shape`
-  + set predictor(or reference) after contruct dataset, you should set free_raw_data=False or init a Dataset object with the same raw data
+  + set predictor(or reference/categorical feature) after contruct dataset, you should set free_raw_data=False or init a Dataset object with the same raw data
--- a/docs/Parameters.md
+++ b/docs/Parameters.md
@@ -150,6 +150,11 @@ The parameter format is ```key1=value1 key2=value2 ... ``` . And parameters can
  * Use number for index, e.g. ```ignore_column=0,1,2``` means column_0, column_1 and column_2 will be ignored.
  * Add a prefix ```name:``` for column name, e.g. ```ignore_column=name:c1,c2,c3``` means c1, c2 and c3 will be ignored.
  * Note: Index start from ```0```. And it doesn't count the label column.
+* ```categorical_feature```, default=```""```, type=string, alias=```categorical_column```,```cat_feature```,```cat_column```
+  * specify categorical features
+  * Use number for index, e.g. ```categorical_feature=0,1,2``` means column_0, column_1 and column_2 are categorical features.
+  * Add a prefix ```name:``` for column name, e.g. ```categorical_feature=name:c1,c2,c3``` means c1, c2 and c3 are categorical features.
+  * Note: Only categorical features of ```int``` type are supported. Indices start from ```0``` and do not count the label column.
 * ```predict_raw_score```, default=```false```, type=bool, alias=```raw_score```,```is_predict_raw_score```
  * only used in prediction task
  * Set to ```true``` will only predict the raw scores.

--- a/docs/Python-API.md
+++ b/docs/Python-API.md
@@ -5,8 +5,8 @@
    - [Booster](Python-API.md#booster)

 * [Training API](Python-API.md#training-api)
-    - [train](Python-API.md#trainparams-train_set-num_boost_round100-valid_setsnone-valid_namesnone-fobjnone-fevalnone-init_modelnone-feature_nameauto-early_stopping_roundsnone-evals_resultnone-verbose_evaltrue-learning_ratesnone-callbacksnone)
-    - [cv](Python-API.md#cvparams-train_set-num_boost_round10-data_splitternone-nfold5-stratifiedfalse-shuffletrue-metricsnone-fobjnone-fevalnone-init_modelnone-feature_nameauto-early_stopping_roundsnone-fpreprocnone-verbose_evalnone-show_stdvtrue-seed0-callbacksnone)
+    - [train](Python-API.md#trainparams-train_set-num_boost_round100-valid_setsnone-valid_namesnone-fobjnone-fevalnone-init_modelnone-feature_nameauto-categorical_featureauto-early_stopping_roundsnone-evals_resultnone-verbose_evaltrue-learning_ratesnone-callbacksnone)
+    - [cv](Python-API.md#cvparams-train_set-num_boost_round10-data_splitternone-nfold5-stratifiedfalse-shuffletrue-metricsnone-fobjnone-fevalnone-init_modelnone-feature_nameauto-categorical_featureauto-early_stopping_roundsnone-fpreprocnone-verbose_evalnone-show_stdvtrue-seed0-callbacksnone)

 * [Scikit-learn API](Python-API.md#scikit-learn-api)
    - [Common Methods](Python-API.md#common-methods)
@@ -33,7 +33,7 @@ The methods of each Class is in alphabetical order.

 ###Dataset

-####__init__(data, label=None, max_bin=255, reference=None, weight=None, group=None, silent=False, feature_name='auto', params=None, free_raw_data=True)
+####__init__(data, label=None, max_bin=255, reference=None, weight=None, group=None, silent=False, feature_name='auto', categorical_feature='auto', params=None, free_raw_data=True)

    Parameters
    ----------
@@ -55,6 +55,11 @@ The methods of each Class is in alphabetical order.
    feature_name : list of str, or 'auto'
        Feature names
        If 'auto' and data is pandas DataFrame, use data columns name
+    categorical_feature : list of str or int, or 'auto'
+        Categorical features,
+        type int represents index,
+        type str represents feature names (need to specify feature_name as well)
+        If 'auto' and data is pandas DataFrame, use pandas categorical columns
    params : dict, optional
        Other parameters
    free_raw_data : Bool
@@ -146,6 +151,17 @@ The methods of each Class is in alphabetical order.
        Name of the output file.


+####set_categorical_feature(categorical_feature)
+
+    Set categorical features.
+
+    Parameters
+    ----------
+    categorical_feature : list of str or list of int
+        Name (str) or index (int) of categorical features
+
+
+
 ####set_feature_name(feature_name)

    Set feature name.
@@ -450,7 +466,7 @@ The methods of each Class is in alphabetical order.

 ##Training API

-####train(params, train_set, num_boost_round=100, valid_sets=None, valid_names=None, fobj=None, feval=None, init_model=None, feature_name='auto', early_stopping_rounds=None, evals_result=None, verbose_eval=True, learning_rates=None, callbacks=None)
+####train(params, train_set, num_boost_round=100, valid_sets=None, valid_names=None, fobj=None, feval=None, init_model=None, feature_name='auto', categorical_feature='auto', early_stopping_rounds=None, evals_result=None, verbose_eval=True, learning_rates=None, callbacks=None)

    Train with given parameters.

@@ -476,6 +492,11 @@ The methods of each Class is in alphabetical order.
    feature_name : list of str, or 'auto'
        Feature names
        If 'auto' and data is pandas DataFrame, use data columns name
+    categorical_feature : list of str or int, or 'auto'
+        Categorical features,
+        type int represents index,
+        type str represents feature names (need to specify feature_name as well)
+        If 'auto' and data is pandas DataFrame, use pandas categorical columns
    early_stopping_rounds: int
        Activates early stopping.
        Requires at least one validation data and one metric
@@ -515,7 +536,7 @@ The methods of each Class is in alphabetical order.
    booster : a trained booster model


-####cv(params, train_set, num_boost_round=10, data_splitter=None, nfold=5, stratified=False, shuffle=True, metrics=None, fobj=None, feval=None, init_model=None, feature_name='auto', early_stopping_rounds=None, fpreproc=None, verbose_eval=None, show_stdv=True, seed=0, callbacks=None)
+####cv(params, train_set, num_boost_round=10, data_splitter=None, nfold=5, stratified=False, shuffle=True, metrics=None, fobj=None, feval=None, init_model=None, feature_name='auto', categorical_feature='auto', early_stopping_rounds=None, fpreproc=None, verbose_eval=None, show_stdv=True, seed=0, callbacks=None)

    Cross-validation with given paramaters.

@@ -546,6 +567,11 @@ The methods of each Class is in alphabetical order.
    feature_name : list of str, or 'auto'
        Feature names
        If 'auto' and data is pandas DataFrame, use data columns name
+    categorical_feature : list of str or int, or 'auto'
+        Categorical features,
+        type int represents index,
+        type str represents feature names (need to specify feature_name as well)
+        If 'auto' and data is pandas DataFrame, use pandas categorical columns
    early_stopping_rounds: int
        Activates early stopping. CV error needs to decrease at least
        every <early_stopping_rounds> round(s) to continue.
@@ -695,7 +721,7 @@ The methods of each Class is in alphabetical order.
    X_leaves : array_like, shape=[n_samples, n_trees]


-####fit(X, y, sample_weight=None, init_score=None, group=None, eval_set=None, eval_sample_weight=None, eval_init_score=None, eval_group=None, eval_metric=None, early_stopping_rounds=None, verbose=True, feature_name='auto', callbacks=None)
+####fit(X, y, sample_weight=None, init_score=None, group=None, eval_set=None, eval_sample_weight=None, eval_init_score=None, eval_group=None, eval_metric=None, early_stopping_rounds=None, verbose=True, feature_name='auto', categorical_feature='auto', callbacks=None)

    Fit the gradient boosting model.

@@ -729,6 +755,11 @@ The methods of each Class is in alphabetical order.
    feature_name : list of str, or 'auto'
        Feature names
        If 'auto' and data is pandas DataFrame, use data columns name
+    categorical_feature : list of str or int, or 'auto'
+        Categorical features,
+        type int represents index,
+        type str represents feature names (need to specify feature_name as well)
+        If 'auto' and data is pandas DataFrame, use pandas categorical columns
    callbacks : list of callback functions
        List of callback functions that are applied at each iteration.
        See Callbacks in Python-API.md for more information.

--- a/docs/Python-intro.md
+++ b/docs/Python-intro.md
@@ -68,6 +68,13 @@ test_data = lgb.Dataset('test.svm', reference=train_data)

 In LightGBM, the validation data should be aligned with training data.

+#### Specify feature names and categorical features
+
+```python
+train_data = lgb.Dataset(data, label=label, feature_name=['c1', 'c2', 'c3'], categorical_feature=['c3'])
+```
+LightGBM can use categorical features as input directly. They do not need to be converted to one-hot coding, and this is much faster than one-hot coding (about 8x speed-up).
+**Note: You should convert your categorical features to int type before you construct `Dataset`.**

 #### Weights can be set when needed:
 ```python

--- a/docs/Quick-Start.md
+++ b/docs/Quick-Start.md
@@ -14,11 +14,11 @@ LightGBM supports input data file with [CSV](https://en.wikipedia.org/wiki/Comma

 Label is the data of first column, and there is no header in the file.

-### [deprecated in v2] Categorical feature support
+### Categorical feature support

 update 12/5/2016:

-LightGBM can use categorical feature directly (without one-hot coding). The experiment on [Expo data](http://stat-computing.org/dataexpo/2009/) shows about 8x speed-up compared with one-hot coding (refer to [categorical log]( https://github.com/guolinke/boosting_tree_benchmarks/blob/master/lightgbm/lightgbm_dataexpo_speed.log) and [one-hot log]( https://github.com/guolinke/boosting_tree_benchmarks/blob/master/lightgbm/lightgbm_dataexpo_onehot_speed.log)).
+LightGBM can use categorical feature directly (without one-hot coding). The experiment on [Expo data](http://stat-computing.org/dataexpo/2009/) shows about 8x speed-up compared with one-hot coding.

 For the setting details, please refer to [Parameters](./Parameters.md#io-parameters).


--- a/examples/python-guide/README.md
+++ b/examples/python-guide/README.md
@@ -29,6 +29,7 @@ Examples including:
    - Feature importances with sklearn interface
 - [advanced_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/advanced_example.py)
    - Set feature names
+    - Directly use categorical features without one-hot encoding
    - Load model file to continue training
    - Change learning rates during training
    - Self-defined objective function

--- a/examples/python-guide/advanced_example.py
+++ b/examples/python-guide/advanced_example.py
@@ -42,11 +42,13 @@ params = {
 feature_name = ['feature_' + str(col) for col in range(num_feature)]

 print('Start training...')
+# feature_name and categorical_feature
 gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                valid_sets=lgb_train,  # eval training data
-                feature_name=feature_name)
+                feature_name=feature_name,
+                categorical_feature=[21])

 # check feature name
 print('Finish first 10 rounds...')

--- a/examples/python-guide/plot_example.py
+++ b/examples/python-guide/plot_example.py
@@ -38,6 +38,7 @@ gbm = lgb.train(params,
                num_boost_round=100,
                valid_sets=[lgb_train, lgb_test],
                feature_name=['f' + str(i + 1) for i in range(28)],
+                categorical_feature=[21],
                evals_result=evals_result,
                verbose_eval=10)

@@ -49,6 +50,6 @@ print('Plot feature importances...')
 ax = lgb.plot_importance(gbm, max_num_features=10)
 plt.show()

-print('Plot 84th tree...')
+print('Plot 84th tree...')  # one tree use categorical feature to split
 ax = lgb.plot_tree(gbm, tree_index=83, figsize=(20, 8), show_info=['split_gain'])
 plt.show()
--- a/include/LightGBM/bin.h
+++ b/include/LightGBM/bin.h
@@ -12,6 +12,12 @@

 namespace LightGBM {

+/*! \brief Type of a bin */
+enum BinType {
+  // bins described by upper bounds (bin_upper_bound_)
+  NumericalBin,
+  // bins mapped to integer category values (bin_2_categorical_)
+  CategoricalBin
+};
+
+
 /*! \brief Store data for one histogram bin */
 struct HistogramBinEntry {
 public:
@@ -58,11 +64,19 @@ public:
    if (num_bin_ != other.num_bin_) {
      return false;
    }
+    if (bin_type_ == BinType::NumericalBin) {
      for (int i = 0; i < num_bin_; ++i) {
        if (bin_upper_bound_[i] != other.bin_upper_bound_[i]) {
          return false;
        }
      }
+    } else {
+      for (int i = 0; i < num_bin_; i++) {
+        if (bin_2_categorical_[i] != other.bin_2_categorical_[i]) {
+          return false;
+        }
+      }
+    }
    return true;
  }

@@ -83,7 +97,11 @@ public:
  * \return Feature value of this bin
  */
  inline double BinToValue(uint32_t bin) const {
+    if (bin_type_ == BinType::NumericalBin) {
      return bin_upper_bound_[bin];
+    } else {
+      return bin_2_categorical_[bin];
+    }
  }
  /*!
  * \brief Get sizes in byte of this object
@@ -110,8 +128,9 @@ public:
  * \param max_bin The maximal number of bin
  * \param min_data_in_bin min number of data in one bin
  * \param min_split_data
+  * \param bin_type Type of this bin
  */
-  void FindBin(std::vector<double>& values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data);
+  void FindBin(std::vector<double>& values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type);

  /*!
  * \brief Use specific number of bin to calculate the size of this class
@@ -131,15 +150,25 @@ public:
  * \param buffer The source
  */
  void CopyFrom(const char* buffer);
+
+  /*!
+  * \brief Get bin types
+  */
+  inline BinType bin_type() const { return bin_type_; }
+
  /*!
  * \brief Get bin info
  */
  inline std::string bin_info() const {
+    if (bin_type_ == BinType::CategoricalBin) {
+      return Common::Join(bin_2_categorical_, ":");
+    } else {
      std::stringstream str_buf;
      str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
      str_buf << '[' << min_val_ << ':' << max_val_ << ']';
      return str_buf.str();
    }
+  }

 private:
  /*! \brief Number of bins */
@@ -150,6 +179,12 @@ private:
  bool is_trival_;
  /*! \brief Sparse rate of this bins( num_bin0/num_data ) */
  double sparse_rate_;
+  /*! \brief Type of this bin */
+  BinType bin_type_;
+  /*! \brief Mapper from categorical to bin */
+  std::unordered_map<int, unsigned int> categorical_2_bin_;
+  /*! \brief Mapper from bin to categorical */
+  std::vector<int> bin_2_categorical_;
  /*! \brief minimal feature vaule */
  double min_val_;
  /*! \brief maximum feature value */
@@ -297,12 +332,13 @@ public:
  * \param num_data Number of used data
  * \param lte_indices After called this function. The less or equal data indices will store on this object.
  * \param gt_indices After called this function. The greater data indices will store on this object.
+  * \param bin_type type of bin
  * \return The number of less than or equal data.
  */
  virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin, 
    uint32_t default_bin, uint32_t threshold,
    data_size_t* data_indices, data_size_t num_data,
-    data_size_t* lte_indices, data_size_t* gt_indices) const = 0;
+    data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const = 0;

  /*!
  * \brief Create the ordered bin for this bin
@@ -346,6 +382,7 @@ public:
 };

 inline uint32_t BinMapper::ValueToBin(double value) const {
+  if (bin_type_ == BinType::NumericalBin) {
    // binary search to find bin
    int l = 0;
    int r = num_bin_ - 1;
@@ -358,6 +395,14 @@ inline uint32_t BinMapper::ValueToBin(double value) const {
      }
    }
    return l;
+  } else {
+    int int_value = static_cast<int>(value);
+    if (categorical_2_bin_.count(int_value)) {
+      return categorical_2_bin_.at(int_value);
+    } else {
+      return num_bin_ - 1;
+    }
+  }
 }

 }  // namespace LightGBM

--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -124,6 +124,10 @@ public:
  * And add an prefix "name:" while using column name
  * Note: when using Index, it dosen't count the label index */
  std::string ignore_column = "";
+  /*! \brief Specify categorical columns. Note: only integer-typed categorical features are supported.
+  * Add a prefix "name:" when using column names.
+  * Note: when using indices, the label column is not counted */
+  std::string categorical_column = "";
  LIGHTGBM_EXPORT void Set(const std::unordered_map<std::string, std::string>& params) override;
 };

@@ -381,6 +385,9 @@ struct ParameterAlias {
      { "query_column", "group_column" },
      { "ignore_feature", "ignore_column" },
      { "blacklist", "ignore_column" },
+      { "categorical_feature", "categorical_column" },
+      { "cat_column", "categorical_column" },
+      { "cat_feature", "categorical_column" },
      { "predict_raw_score", "is_predict_raw_score" },
      { "predict_leaf_index", "is_predict_leaf_index" }, 
      { "raw_score", "is_predict_raw_score" },

--- a/include/LightGBM/dataset_loader.h
+++ b/include/LightGBM/dataset_loader.h
@@ -71,7 +71,8 @@ private:
  std::unordered_set<int> ignore_features_;
  /*! \brief store feature names */
  std::vector<std::string> feature_names_;
-
+  /*! \brief Set of categorical feature indices */
+  std::unordered_set<int> categorical_features_;
 };

 }