Commit eade219e authored by Qiwei Ye's avatar Qiwei Ye

merge conflict

parents f23e6083 060bd316
[](For bugs and unexpected issues, please provide the following information, so that we can reproduce them on our system)
Please search for your question in previous issues, on Stack Overflow, or with other search engines before you open a new one.
For bugs and unexpected issues, please provide the following information so that we can reproduce the problem on our system.
## Environment info
Operating System:
CPU:
C++/Python/R version:
## Error Message:
## Reproducible examples
## Steps to reproduce
......
......@@ -272,6 +272,8 @@ lightgbm
# Created by https://www.gitignore.io/api/python
### Python ###
!/python-package/lightgbm/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
......@@ -366,5 +368,9 @@ ENV/
# R testing artefact
lightgbm.model
# saved or dumped model
*.model
*.pkl
# macOS
.DS_Store
......@@ -14,7 +14,7 @@ before_install:
install:
- sudo apt-get install -y libopenmpi-dev openmpi-bin build-essential
- conda install --yes atlas numpy scipy scikit-learn
- conda install --yes atlas numpy scipy scikit-learn pandas matplotlib
- pip install pep8
......@@ -23,12 +23,12 @@ script:
- mkdir build && cd build && cmake .. && make -j
- cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
- cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
- cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py
- cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py && python test_plotting.py
- cd $TRAVIS_BUILD_DIR && pep8 --ignore=E501 .
- rm -rf build && mkdir build && cd build && cmake -DUSE_MPI=ON ..&& make -j
- cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
- cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
- cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py
- cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py && python test_plotting.py
notifications:
email: false
......
......@@ -8,6 +8,11 @@ endif()
PROJECT(lightgbm)
OPTION(USE_MPI "MPI based parallel learning" OFF)
OPTION(USE_OPENMP "Enable OpenMP" ON)
if(APPLE)
OPTION(APPLE_OUTPUT_DYLIB "Output dylib shared library" OFF)
endif()
if(USE_MPI)
find_package(MPI REQUIRED)
......@@ -18,10 +23,16 @@ else()
ADD_DEFINITIONS(-DUSE_SOCKET)
endif(USE_MPI)
find_package(OpenMP REQUIRED)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
if(USE_OPENMP)
find_package(OpenMP REQUIRED)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
else()
# Ignore unknown #pragma warning
if( (CMAKE_CXX_COMPILER_ID MATCHES "[cC][lL][aA][nN][gG]")
OR (CMAKE_CXX_COMPILER_ID MATCHES "[gG][nN][uU]"))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-pragmas")
endif()
endif(USE_OPENMP)
if(UNIX OR MINGW OR CYGWIN)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -O3 -Wall -std=c++11")
......@@ -61,7 +72,11 @@ SET(LIBRARY_OUTPUT_PATH ${PROJECT_SOURCE_DIR})
include_directories (${LightGBM_HEADER_DIR})
if(APPLE)
if (APPLE_OUTPUT_DYLIB)
SET(CMAKE_SHARED_LIBRARY_SUFFIX ".dylib")
else()
SET(CMAKE_SHARED_LIBRARY_SUFFIX ".so")
endif()
endif(APPLE)
if(USE_MPI)
......@@ -99,5 +114,7 @@ endif()
install(TARGETS lightgbm _lightgbm
RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin
LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib)
LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib
ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib)
install(DIRECTORY ${LightGBM_HEADER_DIR}/LightGBM DESTINATION ${CMAKE_INSTALL_PREFIX}/include)
......@@ -10,8 +10,8 @@ Description: LightGBM is a gradient boosting framework that uses tree based lear
1.Faster training speed and higher efficiency.
2.Lower memory usage.
3.Better accuracy.
4.Parallel learning supported
5. Capable of handling large-scale data
4.Parallel learning supported.
5. Capable of handling large-scale data.
License: The MIT License (MIT) | file LICENSE
URL: https://github.com/Microsoft/LightGBM
BugReports: https://github.com/Microsoft/LightGBM/issues
......@@ -25,14 +25,14 @@ Suggests:
vcd (>= 1.3),
testthat,
igraph (>= 1.0.1),
methods,
data.table (>= 1.9.6),
magrittr (>= 1.5),
stringi (>= 0.5.2)
Depends:
R (>= 3.0),
R6
Imports:
methods,
Matrix (>= 1.1-0),
methods
data.table (>= 1.9.6),
magrittr (>= 1.5),
jsonlite
RoxygenNote: 5.0.1
......@@ -17,12 +17,22 @@ export(lgb.Dataset.set.reference)
export(lgb.cv)
export(lgb.dump)
export(lgb.get.eval.result)
export(lgb.importance)
export(lgb.interprete)
export(lgb.load)
export(lgb.model.dt.tree)
export(lgb.plot.importance)
export(lgb.plot.interpretation)
export(lgb.save)
export(lgb.train)
export(lightgbm)
export(readRDS.lgb.Booster)
export(saveRDS.lgb.Booster)
export(setinfo)
export(slice)
import(methods)
importFrom(R6,R6Class)
importFrom(data.table,":=")
importFrom(magrittr,"%>%")
importFrom(magrittr,"%T>%")
useDynLib(lightgbm)
......@@ -92,7 +92,8 @@ cb.print.evaluation <- function(period = 1){
if ( (i - 1) %% period == 0
| i == env$begin_iteration
| i == env$end_iteration ) {
cat(merge.eval.string(env), "\n")
msg <- merge.eval.string(env)
if (nchar(msg) > 0) { cat(merge.eval.string(env), "\n") }
}
}
}
......
......@@ -6,7 +6,6 @@ Booster <- R6Class(
record_evals = list(),
finalize = function() {
if (!lgb.is.null.handle(private$handle)) {
cat("freeing booster handle\n")
lgb.call("LGBM_BoosterFree_R", ret = NULL, private$handle)
private$handle <- NULL
}
......@@ -50,7 +49,7 @@ Booster <- R6Class(
}
class(handle) <- "lgb.Booster.handle"
private$handle <- handle
private$num_class <- as.integer(1)
private$num_class <- 1L
private$num_class <-
lgb.call("LGBM_BoosterGetNumClasses_R", ret = private$num_class, private$handle)
},
......@@ -107,6 +106,10 @@ Booster <- R6Class(
} else {
if (!is.function(fobj)) { stop("lgb.Booster.update: fobj should be a function") }
gpair <- fobj(private$inner_predict(1), private$train_set)
if(is.null(gpair$grad) | is.null(gpair$hess)){
stop("lgb.Booster.update: custom objective should
return a list with attributes (hess, grad)")
}
ret <- lgb.call(
"LGBM_BoosterUpdateOneIterCustom_R", ret = NULL,
private$handle,
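For reference, a minimal sketch (not part of this commit) of a custom objective that satisfies the new check: `fobj` is called with raw predictions and the training `lgb.Dataset`, and must return a list carrying both `grad` and `hess`. The function name `logregobj` is hypothetical.

```r
# Hypothetical sketch: binary logistic-loss objective for use as fobj.
# The check above stops unless the result has both `grad` and `hess`.
logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  p <- 1 / (1 + exp(-preds))    # sigmoid of the raw score
  list(grad = p - labels,       # first derivative of the loss
       hess = p * (1 - p))      # second derivative of the loss
}
```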
......@@ -128,7 +131,7 @@ Booster <- R6Class(
self
},
current_iter = function() {
cur_iter <- as.integer(0)
cur_iter <- 0L
lgb.call("LGBM_BoosterGetCurrentIteration_R", ret = cur_iter, private$handle)
},
eval = function(data, name, feval = NULL) {
......@@ -192,7 +195,14 @@ Booster <- R6Class(
predictor <- Predictor$new(private$handle)
predictor$predict(data, num_iteration, rawscore, predleaf, header, reshape)
},
to_predictor = function() { Predictor$new(private$handle) }
to_predictor = function() { Predictor$new(private$handle) },
raw = NA,
save = function() {
temp <- tempfile()
lgb.save(self, temp)
self$raw <- readChar(temp, file.info(temp)$size)
file.remove(temp)
}
),
private = list(
handle = NULL,
......@@ -214,7 +224,7 @@ Booster <- R6Class(
stop("data_idx should not be greater than num_dataset")
}
if (is.null(private$predict_buffer[[data_name]])) {
npred <- as.integer(0)
npred <- 0L
npred <- lgb.call("LGBM_BoosterGetNumPredict_R",
ret = npred,
private$handle,
......@@ -240,8 +250,7 @@ Booster <- R6Class(
private$eval_names <- names
private$higher_better_inner_eval <- rep(FALSE, length(names))
for (i in seq_along(names)) {
if (names[i] == "auc" |
grepl("^ndcg", names[i])) {
if ((names[i] == "auc") | grepl("^ndcg", names[i])) {
private$higher_better_inner_eval[i] <- TRUE
}
}
......@@ -276,6 +285,11 @@ Booster <- R6Class(
data <- private$train_set
if (data_idx > 1) { data <- private$valid_sets[[data_idx - 1]] }
res <- feval(private$inner_predict(data_idx), data)
if(is.null(res$name) | is.null(res$value) |
is.null(res$higher_better)) {
stop("lgb.Booster.eval: custom eval function should return a
list with attribute (name, value, higher_better)");
}
res$data_name <- data_name
ret <- append(ret, list(res))
}
......
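Similarly, a minimal sketch (not part of this commit) of a custom eval function that passes the check above; the name `evalerror` is hypothetical.

```r
# Hypothetical sketch: classification error rate for use as feval.
# feval receives predictions and an lgb.Dataset; the check above stops
# unless the result carries `name`, `value` and `higher_better`.
evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  err <- mean(as.numeric(preds > 0.5) != labels)
  list(name = "error", value = err, higher_better = FALSE)
}
```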
......@@ -4,7 +4,6 @@ Dataset <- R6Class(
public = list(
finalize = function() {
if (!lgb.is.null.handle(private$handle)) {
cat("free dataset handle\n")
lgb.call("LGBM_DatasetFree_R", ret = NULL, private$handle)
private$handle <- NULL
}
......@@ -79,26 +78,16 @@ Dataset <- R6Class(
}
# Get categorical feature index
if (!is.null(private$categorical_feature)) {
fname_dict <- list()
if (!is.null(private$colnames)) {
fname_dict <- `names<-`(
list((seq_along(private$colnames) - 1)),
private$colnames
)
}
cate_indices <- list()
for (key in private$categorical_feature) {
if (is.character(key)) {
idx <- fname_dict[[key]]
if (is.null(idx)) {
stop("lgb.self.get.handle: cannot find feature name ", sQuote(key))
if (typeof(private$categorical_feature) == "character") {
cate_indices <- as.list(match(private$categorical_feature, private$colnames) - 1)
if (sum(is.na(cate_indices)) > 0) {
stop("lgb.self.get.handle: supplied an unknown feature in categorical_feature: ", sQuote(private$categorical_feature[is.na(cate_indices)]))
}
cate_indices <- c(cate_indices, idx)
} else {
# one-based indices to zero-based
idx <- as.integer(key - 1)
cate_indices <- c(cate_indices, idx)
if (max(private$categorical_feature) > length(private$colnames)) {
stop("lgb.self.get.handle: supplied a too large value in categorical_feature: ", max(private$categorical_feature), " but only ", length(private$colnames), " features")
}
cate_indices <- as.list(private$categorical_feature - 1)
}
private$params$categorical_feature <- cate_indices
}
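Both branches above can be exercised from user code; a hedged usage sketch, where `data_matrix`, `label_vector` and the column names are hypothetical:

```r
# Hypothetical sketch: categorical features can be given by column name
# (matched against colnames, first branch) or by one-based index (second branch).
dtrain <- lgb.Dataset(data_matrix, label = label_vector,
                      categorical_feature = c("city", "gender"))
# equivalent, assuming "city" and "gender" are columns 1 and 2:
dtrain <- lgb.Dataset(data_matrix, label = label_vector,
                      categorical_feature = c(1L, 2L))
```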
......@@ -200,8 +189,8 @@ Dataset <- R6Class(
},
dim = function() {
if (!lgb.is.null.handle(private$handle)) {
num_row <- as.integer(0)
num_col <- as.integer(0)
num_row <- 0L
num_col <- 0L
c(
lgb.call("LGBM_DatasetGetNumData_R", ret = num_row, private$handle),
......@@ -252,7 +241,7 @@ Dataset <- R6Class(
)
}
if (is.null(private$info[[name]]) && !lgb.is.null.handle(private$handle)) {
info_len <- as.integer(0)
info_len <- 0L
info_len <- lgb.call("LGBM_DatasetGetFieldSize_R",
ret = info_len,
private$handle,
......@@ -388,9 +377,9 @@ Dataset <- R6Class(
)
)
#' Contruct lgb.Dataset object
#' Construct lgb.Dataset object
#'
#' Contruct lgb.Dataset object from dense matrix, sparse matrix
#' Construct lgb.Dataset object from dense matrix, sparse matrix
#' or local file (that was created previously by saving an \code{lgb.Dataset}).
#'
#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename
......@@ -435,9 +424,9 @@ lgb.Dataset <- function(data,
}
#' Contruct validation data
#' Construct validation data
#'
#' Contruct validation data according to training data
#' Construct validation data according to training data
#'
#' @param dataset \code{lgb.Dataset} object, training data
#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename
......
......@@ -31,7 +31,7 @@ Predictor <- R6Class(
predleaf = FALSE, header = FALSE, reshape = FALSE) {
if (is.null(num_iteration)) { num_iteration <- -1 }
num_row <- 0
num_row <- 0L
if (is.character(data)) {
tmp_filename <- tempfile(pattern = "lightgbm_")
on.exit(unlink(tmp_filename), add = TRUE)
......@@ -46,7 +46,7 @@ Predictor <- R6Class(
preds <- as.vector(t(preds))
} else {
num_row <- nrow(data)
npred <- as.integer(0)
npred <- 0L
npred <- lgb.call("LGBM_BoosterCalcNumPredict_R", ret = npred,
private$handle,
as.integer(num_row),
......@@ -85,7 +85,11 @@ Predictor <- R6Class(
stop("predict: prediction length ", sQuote(length(preds))," is not a multiple of nrows(data): ", sQuote(num_row))
}
npred_per_case <- length(preds) / num_row
if (reshape && npred_per_case > 1) { preds <- matrix(preds, ncol = npred_per_case) }
if (predleaf) {
preds <- matrix(preds, ncol = npred_per_case, byrow = TRUE)
} else if (reshape && npred_per_case > 1) {
preds <- matrix(preds, ncol = npred_per_case, byrow = TRUE)
}
preds
}
),
......
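With this change, leaf-index predictions always come back as one row per observation; a hedged sketch, where `model` and `test_matrix` are hypothetical:

```r
# Hypothetical sketch: predleaf = TRUE now yields a row-major matrix with
# one row per observation and one column per tree (single-class model).
leaf_idx <- model$predict(test_matrix, predleaf = TRUE)
dim(leaf_idx)  # c(nrow(test_matrix), number_of_trees)
```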
......@@ -26,11 +26,18 @@ CVBooster <- R6Class(
#' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples.
#' @param label vector of response values. Should be provided only when data is an R-matrix.
#' @param weight vector of weights. If not NULL, will be set on the dataset
#' @param obj objective function, can be character or custom objective function
#' @param obj objective function, can be character or custom objective function. Examples include
#' \code{regression}, \code{regression_l1}, \code{huber},
#' \code{binary}, \code{lambdarank}, \code{multiclass}
#' @param boosting boosting type. \code{gbdt}, \code{dart}
#' @param num_leaves number of leaves in one tree, defaults to 127
#' @param max_depth Limit the max depth of the tree model. This is used to deal with overfitting when #data is small.
#' Trees still grow leaf-wise.
#' @param num_threads Number of threads for LightGBM. For the best speed, set this to the number of real CPU cores, not the number of threads (most CPUs use hyper-threading to generate 2 threads per CPU core).
#' @param eval evaluation function, can be (list of) character or custom eval function
#' @param verbose verbosity for output
#' if verbose > 0, also will record iteration message to booster$record_evals
#' @param eval_freq evaluation output frequency
#' @param verbose verbosity for output, if <= 0, will also disable printing of evaluation during training
#' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals}
#' @param eval_freq evaluation output frequency, only effective when verbose > 0
#' @param showsd \code{boolean}, whether to show standard deviation of cross validation
#' @param stratified a \code{boolean} indicating whether sampling of folds should be stratified
#' by the values of outcome labels.
......@@ -51,7 +58,7 @@ CVBooster <- R6Class(
#' @param callbacks list of callback functions
#' List of callback functions that are applied at each iteration.
#' @param ... other parameters, see parameters.md for more information
#' @return a trained booster model \code{lgb.Booster}.
#' @return a trained model \code{lgb.CVBooster}.
#' @examples
#' \dontrun{
#' library(lightgbm)
......@@ -63,13 +70,20 @@ CVBooster <- R6Class(
#' }
#' @rdname lgb.train
#' @export
lgb.cv <- function(params=list(), data, nrounds = 10, nfold = 3,
label = NULL, weight = NULL,
obj = NULL, eval = NULL,
verbose = 1, eval_freq = 1L, showsd = TRUE,
stratified = TRUE, folds = NULL,
lgb.cv <- function(params=list(), data, nrounds = 10,
nfold = 3,
label = NULL,
weight = NULL,
obj = NULL,
eval = NULL,
verbose = 1,
record = TRUE,
eval_freq = 1L,
showsd = TRUE,
stratified = TRUE,
folds = NULL,
init_model = NULL,
colnames= NULL,
colnames = NULL,
categorical_feature = NULL,
early_stopping_rounds = NULL,
callbacks = list(), ...) {
......@@ -108,11 +122,11 @@ lgb.cv <- function(params=list(), data, nrounds = 10, nfold = 3,
data$update_params(params)
data$.__enclos_env__$private$set_predictor(predictor)
if (!is.null(colnames)) { data$set_colnames(colnames) }
data$set_categorical_feature(categorical_feature)
if (!is.null(categorical_feature)) { data$set_categorical_feature(categorical_feature) }
data$construct()
if (!is.null(folds)) {
if (!is.list(folds) || length(folds) < 2)
if (!is.list(folds) | length(folds) < 2)
stop(sQuote("folds"), " must be a list with 2 or more elements that are vectors of indices for each CV-fold")
nfold <- length(folds)
} else {
......@@ -120,11 +134,11 @@ lgb.cv <- function(params=list(), data, nrounds = 10, nfold = 3,
folds <- generate.cv.folds(nfold, nrow(data), stratified, getinfo(data, 'label'), params)
}
if (eval_freq > 0) {
if (verbose > 0 & eval_freq > 0) {
callbacks <- add.cb(callbacks, cb.print.evaluation(eval_freq))
}
if (verbose > 0) { callbacks <- add.cb(callbacks, cb.record.evaluation()) }
if (record) { callbacks <- add.cb(callbacks, cb.record.evaluation()) }
if (!is.null(early_stopping_rounds)) {
if (early_stopping_rounds > 0) {
......
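A hedged sketch of how the new `record` argument interacts with `verbose` in `lgb.cv`, reusing the dataset from the package examples; it assumes the returned CVBooster exposes `record_evals` the same way a Booster does.

```r
# Sketch: suppress per-iteration printing (verbose = 0) while still keeping
# the evaluation history via the new `record` argument.
library(lightgbm)
data(agaricus.train, package = "lightgbm")
dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
cv_model <- lgb.cv(list(objective = "binary"), dtrain, nrounds = 10,
                   nfold = 3, verbose = 0, record = TRUE)
cv_model$record_evals  # populated even though nothing was printed
```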
#' Compute feature importance in a model
#'
#' Creates a \code{data.table} of feature importances in a model.
#'
#' @param model object of class \code{lgb.Booster}.
#' @param percentage whether to show importance in relative percentage.
#'
#' @return
#'
#' For a tree model, a \code{data.table} with the following columns:
#' \itemize{
#' \item \code{Feature} Feature names in the model.
#' \item \code{Gain} The total gain of this feature's splits.
#' \item \code{Cover} The number of observations related to this feature.
#' \item \code{Frequency} The number of times a feature is used in splits.
#' }
#'
#' @examples
#'
#' data(agaricus.train, package = 'lightgbm')
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#'
#' params = list(objective = "binary",
#' learning_rate = 0.01, num_leaves = 63, max_depth = -1,
#' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
#' model <- lgb.train(params, dtrain, 20)
#'
#' tree_imp1 <- lgb.importance(model, percentage = TRUE)
#' tree_imp2 <- lgb.importance(model, percentage = FALSE)
#'
#' @importFrom magrittr %>% %T>%
#' @importFrom data.table :=
#' @export
lgb.importance <- function(model, percentage = TRUE) {
if (!any(class(model) == "lgb.Booster")) {
stop("'model' has to be an object of class lgb.Booster")
}
tree_dt <- lgb.model.dt.tree(model)
tree_imp <- tree_dt %>%
magrittr::extract(.,
i = is.na(split_index) == FALSE,
j = .(Gain = sum(split_gain), Cover = sum(internal_count), Frequency = .N),
by = "split_feature") %T>%
data.table::setnames(., old = "split_feature", new = "Feature") %>%
magrittr::extract(., i = order(Gain, decreasing = TRUE))
if (percentage) {
tree_imp[, ":="(Gain = Gain / sum(Gain),
Cover = Cover / sum(Cover),
Frequency = Frequency / sum(Frequency))]
}
return(tree_imp)
}
#' Compute feature contribution of prediction
#'
#' Computes feature contribution components of a raw-score prediction.
#'
#' @param model object of class \code{lgb.Booster}.
#' @param data a matrix object or a dgCMatrix object.
#' @param idxset an integer vector of indices of the rows needed.
#' @param num_iteration number of iterations to predict with, NULL or <= 0 means use the best iteration.
#'
#' @return
#'
#' For regression, binary classification and lambdarank models, a \code{list} of \code{data.table} objects with the following columns:
#' \itemize{
#' \item \code{Feature} Feature names in the model.
#' \item \code{Contribution} The total contribution of this feature's splits.
#' }
#' For multiclass classification, a \code{list} of \code{data.table} objects, each with a \code{Feature} column and one \code{Contribution} column per class.
#'
#' @examples
#'
#' Sigmoid <- function(x) 1 / (1 + exp(-x))
#' Logit <- function(x) log(x / (1 - x))
#' data(agaricus.train, package = 'lightgbm')
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' setinfo(dtrain, "init_score", rep(Logit(mean(train$label)), length(train$label)))
#' data(agaricus.test, package = 'lightgbm')
#' test <- agaricus.test
#'
#' params = list(objective = "binary",
#' learning_rate = 0.01, num_leaves = 63, max_depth = -1,
#' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
#' model <- lgb.train(params, dtrain, 20)
#'
#' tree_interpretation <- lgb.interprete(model, test$data, 1:5)
#'
#' @importFrom magrittr %>% %T>%
#' @export
lgb.interprete <- function(model, data, idxset, num_iteration = NULL) {
tree_dt <- lgb.model.dt.tree(model, num_iteration)
num_class <- model$.__enclos_env__$private$num_class
tree_interpretation_dt_list <- vector(mode = "list", length = length(idxset))
leaf_index_mat_list <- model$predict(data[idxset, , drop = FALSE],
num_iteration = num_iteration,
predleaf = TRUE) %>%
t(.) %>%
data.table::as.data.table(.) %>%
lapply(., FUN = function(x) matrix(x, ncol = num_class, byrow = TRUE))
tree_index_mat_list <- lapply(leaf_index_mat_list,
FUN = function(x) matrix(seq_len(length(x)) - 1, ncol = num_class, byrow = TRUE))
for (i in seq_along(idxset)) {
tree_interpretation_dt_list[[i]] <- single.row.interprete(tree_dt, num_class, tree_index_mat_list[[i]], leaf_index_mat_list[[i]])
}
return(tree_interpretation_dt_list)
}
single.tree.interprete <- function(tree_dt, tree_id, leaf_id) {
single_tree_dt <- tree_dt[tree_index == tree_id, ]
leaf_dt <- single_tree_dt[leaf_index == leaf_id, .(leaf_index, leaf_parent, leaf_value)]
node_dt <- single_tree_dt[!is.na(split_index), .(split_index, split_feature, node_parent, internal_value)]
feature_seq <- character(0)
value_seq <- numeric(0)
leaf_to_root <- function(parent_id, current_value) {
value_seq <<- c(current_value, value_seq)
if (!is.na(parent_id)) {
this_node <- node_dt[split_index == parent_id, ]
feature_seq <<- c(this_node[["split_feature"]], feature_seq)
leaf_to_root(this_node[["node_parent"]], this_node[["internal_value"]])
}
}
leaf_to_root(leaf_dt[["leaf_parent"]], leaf_dt[["leaf_value"]])
data.table::data.table(Feature = feature_seq, Contribution = diff.default(value_seq))
}
multiple.tree.interprete <- function(tree_dt, tree_index, leaf_index) {
mapply(single.tree.interprete,
tree_id = tree_index, leaf_id = leaf_index,
MoreArgs = list(tree_dt = tree_dt),
SIMPLIFY = FALSE, USE.NAMES = TRUE) %>%
data.table::rbindlist(., use.names = TRUE) %>%
magrittr::extract(., j = .(Contribution = sum(Contribution)), by = "Feature") %>%
magrittr::extract(., i = order(abs(Contribution), decreasing = TRUE))
}
single.row.interprete <- function(tree_dt, num_class, tree_index_mat, leaf_index_mat) {
tree_interpretation <- vector(mode = "list", length = num_class)
for (i in seq_len(num_class)) {
tree_interpretation[[i]] <- multiple.tree.interprete(tree_dt, tree_index_mat[,i], leaf_index_mat[,i]) %T>%
{
if (num_class > 1) {
data.table::setnames(., old = "Contribution", new = paste("Class", i - 1))
}
}
}
if (num_class == 1) {
tree_interpretation_dt <- tree_interpretation[[1]]
} else {
tree_interpretation_dt <- Reduce(f = function(x, y) merge(x, y, by = "Feature", all = TRUE),
x = tree_interpretation)
for (j in 2:ncol(tree_interpretation_dt)) {
data.table::set(tree_interpretation_dt,
i = which(is.na(tree_interpretation_dt[[j]])),
j = j,
value = 0)
}
}
return(tree_interpretation_dt)
}
#' Parse a LightGBM model json dump
#'
#' Parse a LightGBM model json dump into a \code{data.table} structure.
#'
#' @param model object of class \code{lgb.Booster}
#'
#' @return
#' A \code{data.table} with detailed information about the model's tree nodes and leaves.
#'
#' The columns of the \code{data.table} are:
#'
#' \itemize{
#' \item \code{tree_index}: ID of a tree in a model (integer)
#' \item \code{split_index}: ID of a node in a tree (integer)
#' \item \code{split_feature}: for a node, it's a feature name (character);
#' for a leaf, it simply labels it as \code{'NA'}
#' \item \code{node_parent}: ID of the parent node for the current node (integer)
#' \item \code{leaf_index}: ID of a leaf in a tree (integer)
#' \item \code{leaf_parent}: ID of the parent node for the current leaf (integer)
#' \item \code{split_gain}: Split gain of a node
#' \item \code{threshold}: Splitting threshold value of a node
#' \item \code{decision_type}: Decision type of a node
#' \item \code{internal_value}: Node value
#' \item \code{internal_count}: The number of observations collected by a node
#' \item \code{leaf_value}: Leaf value
#' \item \code{leaf_count}: The number of observations collected by a leaf
#' }
#'
#' @examples
#'
#' data(agaricus.train, package = 'lightgbm')
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#'
#' params = list(objective = "binary",
#' learning_rate = 0.01, num_leaves = 63, max_depth = -1,
#' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
#' model <- lgb.train(params, dtrain, 20)
#'
#' tree_dt <- lgb.model.dt.tree(model)
#'
#' @importFrom magrittr %>%
#' @importFrom data.table :=
#' @export
lgb.model.dt.tree <- function(model, num_iteration = NULL) {
json_model <- lgb.dump(model, num_iteration = num_iteration)
parsed_json_model <- jsonlite::fromJSON(json_model,
simplifyVector = TRUE,
simplifyDataFrame = FALSE,
simplifyMatrix = FALSE,
flatten = FALSE)
tree_list <- lapply(parsed_json_model$tree_info, single.tree.parse)
tree_dt <- data.table::rbindlist(tree_list, use.names = TRUE)
tree_dt[, split_feature := Lookup(split_feature,
seq(0, parsed_json_model$max_feature_idx, by = 1),
parsed_json_model$feature_names)]
return(tree_dt)
}
single.tree.parse <- function(lgb_tree) {
single_tree_dt <- data.table::data.table(tree_index = integer(0),
split_index = integer(0), split_feature = integer(0), node_parent = integer(0),
leaf_index = integer(0), leaf_parent = integer(0),
split_gain = numeric(0), threshold = numeric(0), decision_type = character(0),
internal_value = integer(0), internal_count = integer(0),
leaf_value = integer(0), leaf_count = integer(0))
pre_order_traversal <- function(tree_node_leaf, parent_index = NA) {
if (!is.null(tree_node_leaf$split_index)) {
single_tree_dt <<- data.table::rbindlist(l = list(single_tree_dt,
c(tree_node_leaf[c("split_index", "split_feature",
"split_gain", "threshold", "decision_type",
"internal_value", "internal_count")],
"node_parent" = parent_index)),
use.names = TRUE, fill = TRUE)
pre_order_traversal(tree_node_leaf$left_child, parent_index = tree_node_leaf$split_index)
pre_order_traversal(tree_node_leaf$right_child, parent_index = tree_node_leaf$split_index)
} else if (!is.null(tree_node_leaf$leaf_index)) {
single_tree_dt <<- data.table::rbindlist(l = list(single_tree_dt,
tree_node_leaf[c("leaf_index", "leaf_parent",
"leaf_value", "leaf_count")]),
use.names = TRUE, fill = TRUE)
}
}
pre_order_traversal(lgb_tree$tree_structure)
single_tree_dt[, tree_index := lgb_tree$tree_index]
return(single_tree_dt)
}
Lookup <- function(key, key_lookup, value_lookup, missing = NA) {
match(key, key_lookup) %>%
magrittr::extract(value_lookup, .) %>%
magrittr::inset(. , is.na(.), missing)
}
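A short illustration of what `Lookup` does, using made-up values; it assumes magrittr is attached, since the function body above pipes with `%>%`.

```r
# Map zero-based feature indices to names; unmatched keys become `missing`.
library(magrittr)
Lookup(key = c(0, 2, 5),
       key_lookup = seq(0, 3),
       value_lookup = c("f0", "f1", "f2", "f3"))
#> [1] "f0" "f2" NA
```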
#' Plot feature importance as a bar graph
#'
#' Plot previously calculated feature importance: Gain, Cover and Frequency, as a bar graph.
#'
#' @param tree_imp a \code{data.table} returned by \code{\link{lgb.importance}}.
#' @param top_n maximum number of top features to include in the plot.
#' @param measure the name of the importance measure to plot, can be "Gain", "Cover" or "Frequency".
#' @param left_margin (base R barplot) allows adjusting the left margin size to fit feature names.
#' @param cex (base R barplot) passed as \code{cex.names} parameter to \code{barplot}.
#'
#' @details
#' The graph represents each feature as a horizontal bar of length proportional to the defined importance of a feature.
#' Features are shown ranked in a decreasing importance order.
#'
#' @return
#' The \code{lgb.plot.importance} function creates a \code{barplot}
#' and silently returns a processed data.table with \code{top_n} features sorted by defined importance.
#'
#' @examples
#'
#' data(agaricus.train, package = 'lightgbm')
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#'
#' params = list(objective = "binary",
#' learning_rate = 0.01, num_leaves = 63, max_depth = -1,
#' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
#' model <- lgb.train(params, dtrain, 20)
#'
#' tree_imp <- lgb.importance(model, percentage = TRUE)
#' lgb.plot.importance(tree_imp, top_n = 10, measure = "Gain")
#'
#' @export
lgb.plot.importance <- function(tree_imp, top_n = 10, measure = "Gain", left_margin = 10, cex = NULL) {
measure <- match.arg(measure, choices = c("Gain", "Cover", "Frequency"), several.ok = FALSE)
top_n <- min(top_n, nrow(tree_imp))
tree_imp <- tree_imp[order(abs(get(measure)), decreasing = TRUE),][1:top_n,]
if (is.null(cex)) {
cex <- 2.5 / log2(1 + top_n)
}
op <- par(no.readonly = TRUE)
on.exit(par(op))
par(mar = op$mar %>% magrittr::inset(., 2, left_margin))
tree_imp[.N:1,
barplot(height = get(measure), names.arg = Feature, horiz = TRUE, border = NA,
main = "Feature Importance", xlab = measure, cex.names = cex, las = 1)]
invisible(tree_imp)
}
#' Plot feature contribution as a bar graph
#'
#' Plot previously calculated feature contribution as a bar graph.
#'
#' @param tree_interpretation_dt a \code{data.table} returned by \code{\link{lgb.interprete}}.
#' @param top_n maximum number of top features to include in the plot.
#' @param cols the number of columns in the layout, used only for multiclass classification feature contributions.
#' @param left_margin (base R barplot) allows adjusting the left margin size to fit feature names.
#' @param cex (base R barplot) passed as \code{cex.names} parameter to \code{barplot}.
#'
#' @details
#' The graph represents each feature as a horizontal bar of length proportional to the defined contribution of a feature.
#' Features are shown ranked in a decreasing contribution order.
#'
#' @return
#' The \code{lgb.plot.interpretation} function creates a \code{barplot}
#'
#' @examples
#'
#' Sigmoid <- function(x) 1 / (1 + exp(-x))
#' Logit <- function(x) log(x / (1 - x))
#' data(agaricus.train, package = 'lightgbm')
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' setinfo(dtrain, "init_score", rep(Logit(mean(train$label)), length(train$label)))
#' data(agaricus.test, package = 'lightgbm')
#' test <- agaricus.test
#'
#' params = list(objective = "binary",
#' learning_rate = 0.01, num_leaves = 63, max_depth = -1,
#' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
#' model <- lgb.train(params, dtrain, 20)
#'
#' tree_interpretation <- lgb.interprete(model, test$data, 1:5)
#' lgb.plot.interpretation(tree_interpretation[[1]], top_n = 10)
#'
#' @export
lgb.plot.interpretation <- function(tree_interpretation_dt, top_n = 10, cols = 1, left_margin = 10, cex = NULL) {
num_class <- ncol(tree_interpretation_dt) - 1
op <- par(no.readonly = TRUE)
on.exit(par(op))
par(mar = op$mar %>% magrittr::inset(., 1:3, c(3, left_margin, 2)))
if (num_class == 1) {
multiple.tree.plot.interpretation(tree_interpretation_dt, top_n = top_n, title = NULL, cex = cex)
} else {
layout_mat <- matrix(seq(1, cols * ceiling(num_class / cols)),
ncol = cols, nrow = ceiling(num_class / cols))
par(mfcol = c(nrow(layout_mat), ncol(layout_mat)))
for (i in seq_len(num_class)) {
tree_interpretation_dt[, c(1, i + 1), with = FALSE] %T>%
data.table::setnames(., old = names(.), new = c("Feature", "Contribution")) %>%
multiple.tree.plot.interpretation(., top_n = top_n, title = paste("Class", i - 1), cex = cex)
}
}
}
multiple.tree.plot.interpretation <- function(tree_interpretation, top_n, title, cex) {
tree_interpretation <- tree_interpretation[order(abs(Contribution), decreasing = TRUE),][1:min(top_n, .N),]
if (is.null(cex)) {
cex <- 2.5 / log2(1 + top_n)
}
tree_interpretation[.N:1,
barplot(height = Contribution, names.arg = Feature, horiz = TRUE,
col = ifelse(Contribution > 0, "firebrick", "steelblue"),
border = NA, main = title, cex.names = cex, las = 1)]
invisible(NULL)
}
......@@ -4,11 +4,18 @@
#' @param data a \code{lgb.Dataset} object, used for training
#' @param nrounds number of training rounds
#' @param valids a list of \code{lgb.Dataset} objects, used for validation
#' @param obj objective function, can be character or custom objective function
#' @param obj objective function, can be character or custom objective function. Examples include
#' \code{regression}, \code{regression_l1}, \code{huber},
#' \code{binary}, \code{lambdarank}, \code{multiclass}
#' @param boosting boosting type. \code{gbdt}, \code{dart}
#' @param num_leaves number of leaves in one tree, defaults to 127
#' @param max_depth Limit the max depth of the tree model. This is used to deal with overfitting when #data is small.
#' Trees still grow leaf-wise.
#' @param num_threads Number of threads for LightGBM. For the best speed, set this to the number of real CPU cores, not the number of threads (most CPUs use hyper-threading to generate 2 threads per CPU core).
#' @param eval evaluation function, can be (a list of) character or custom eval function
#' @param verbose verbosity for output
#' if \code{verbose > 0}, also will record iteration message to \code{booster$record_evals}
#' @param eval_freq evaluation output frequency
#' @param verbose verbosity for output, if <= 0, will also disable printing of evaluation during training
#' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals}
#' @param eval_freq evaluation output frequency, only effective when verbose > 0
#' @param init_model path of model file of \code{lgb.Booster} object, will continue training from this model
#' @param colnames feature names, if not null, will use this to overwrite the names in dataset
#' @param categorical_feature list of str or int, indicating categorical features by name or by one-based index
......@@ -44,6 +51,7 @@ lgb.train <- function(params = list(), data, nrounds = 10,
obj = NULL,
eval = NULL,
verbose = 1,
record = TRUE,
eval_freq = 1L,
init_model = NULL,
colnames = NULL,
......@@ -92,7 +100,7 @@ lgb.train <- function(params = list(), data, nrounds = 10,
data$update_params(params)
data$.__enclos_env__$private$set_predictor(predictor)
if (!is.null(colnames)) { data$set_colnames(colnames) }
data$set_categorical_feature(categorical_feature)
if (!is.null(categorical_feature)) { data$set_categorical_feature(categorical_feature) }
data$construct()
vaild_contain_train <- FALSE
train_data_name <- "train"
......@@ -111,11 +119,11 @@ lgb.train <- function(params = list(), data, nrounds = 10,
}
}
# process callbacks
if (eval_freq > 0) {
if (verbose > 0 & eval_freq > 0) {
callbacks <- add.cb(callbacks, cb.print.evaluation(eval_freq))
}
if (verbose > 0 && length(valids) > 0) {
if (record & length(valids) > 0) {
callbacks <- add.cb(callbacks, cb.record.evaluation())
}
......
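The analogous hedged sketch for `lgb.train`: print evaluation every 5 iterations and keep the history through the new `record` flag (dataset from the package examples).

```r
# Sketch: eval_freq only takes effect when verbose > 0; record = TRUE stores
# the per-iteration results in model$record_evals when valids is non-empty.
library(lightgbm)
data(agaricus.train, package = "lightgbm")
dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
model <- lgb.train(list(objective = "binary"), dtrain, nrounds = 10,
                   valids = list(train = dtrain),
                   verbose = 1, record = TRUE, eval_freq = 5L)
model$record_evals$train
```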
#' readRDS for lgb.Booster models
#'
#' Attempts to load a model using RDS.
#'
#' @param file a connection or the name of the file where the R object is saved to or read from.
#' @param refhook a hook function for handling reference objects.
#'
#' @return an R object.
#'
#' @examples
#' \dontrun{
#' library(lightgbm)
#' data(agaricus.train, package='lightgbm')
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label=train$label)
#' data(agaricus.test, package='lightgbm')
#' test <- agaricus.test
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label=test$label)
#' params <- list(objective="regression", metric="l2")
#' valids <- list(test=dtest)
#' model <- lgb.train(params, dtrain, 100, valids, min_data=1, learning_rate=1, early_stopping_rounds=10)
#' saveRDS.lgb.Booster(model, "model.rds")
#' new_model <- readRDS.lgb.Booster("model.rds")
#' }
#' @export
readRDS.lgb.Booster <- function(file = "", refhook = NULL) {
object <- readRDS(file = file, refhook = refhook)
if (!is.na(object$raw)) {
temp <- tempfile()
write(object$raw, temp)
object2 <- lgb.load(temp)
file.remove(temp)
object2$best_iter <- object$best_iter
object2$record_evals <- object$record_evals
return(object2)
} else {
return(object)
}
}
#' saveRDS for lgb.Booster models
#'
#' Attempts to save a model using RDS. Has an additional parameter (\code{raw}) which decides whether to save the raw model or not.
#'
#' @param object R object to serialize.
#' @param file a connection or the name of the file where the R object is saved to or read from.
#' @param ascii a logical. If TRUE or NA, an ASCII representation is written; otherwise (default), a binary one is used. See the comments in the help for save.
#' @param version the workspace format version to use. \code{NULL} specifies the current default version (2). Versions prior to 2 are not supported, so this will only be relevant when there are later versions.
#' @param compress a logical specifying whether saving to a named file is to use "gzip" compression, or one of \code{"gzip"}, \code{"bzip2"} or \code{"xz"} to indicate the type of compression to be used. Ignored if file is a connection.
#' @param refhook a hook function for handling reference objects.
#' @param raw whether to save the model in a raw variable or not; it is recommended to leave it set to \code{TRUE}.
#'
#' @return NULL invisibly.
#'
#' @examples
#' \dontrun{
#' library(lightgbm)
#' data(agaricus.train, package='lightgbm')
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label=train$label)
#' data(agaricus.test, package='lightgbm')
#' test <- agaricus.test
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label=test$label)
#' params <- list(objective="regression", metric="l2")
#' valids <- list(test=dtest)
#' model <- lgb.train(params, dtrain, 100, valids, min_data=1, learning_rate=1, early_stopping_rounds=10)
#' saveRDS.lgb.Booster(model, "model.rds")
#' }
#' @export
saveRDS.lgb.Booster <- function(object, file = "", ascii = FALSE, version = NULL, compress = TRUE, refhook = NULL, raw = TRUE) {
if (is.na(object$raw) & (raw)) {
object$save()
saveRDS(object, file = file, ascii = ascii, version = version, compress = compress, refhook = refhook)
object$raw <- NA
} else {
saveRDS(object, file = file, ascii = ascii, version = version, compress = compress, refhook = refhook)
}
}
......@@ -19,11 +19,8 @@ You can also install directly from R using the repository with `devtools`:
devtools::install_github("Microsoft/LightGBM", subdir = "R-package")
```
To install LightGBM from a specific commit, you can specify the reference, such as the following to install the first release of the R package for LightGBM:
If you want to build a self-contained R package, you can run ```unix_build_package.sh``` (for UNIX) or ```win_build_package.cmd``` (for Windows), then use ```R CMD INSTALL lightgbm_0.1.tar.gz``` to install.
```r
devtools::install_github("Microsoft/LightGBM", ref = "1b7643b", subdir = "R-package")
```
You can check quickly if your LightGBM R package is working by running the following:
......@@ -35,6 +32,28 @@ dtrain <- lgb.Dataset(train$data, label=train$label)
params <- list(objective="regression", metric="l2")
model <- lgb.cv(params, dtrain, 10, nfold=5, min_data=1, learning_rate=1, early_stopping_rounds=10)
```
### OSX installation
The default installation cannot succeed on OSX because the clang shipped with OSX does not support OpenMP.
You can use the following script to change the default compiler to gcc and then compile the LightGBM R-package:
```
brew install gcc --without-multilib
mkdir -p ~/.R
touch ~/.R/Makevars
cat <<EOF >>~/.R/Makevars
CC=gcc-6
CXX=g++-6
CXX1X=g++-6
LDFLAGS=-L/usr/local/Cellar/gcc/6.3.0/lib
CPPFLAGS=-I/usr/local/Cellar/gcc/6.3.0/include
SHLIB_OPENMP_CFLAGS = -fopenmp
SHLIB_OPENMP_CXXFLAGS = -fopenmp
SHLIB_OPENMP_FCFLAGS = -fopenmp
SHLIB_OPENMP_FFLAGS = -fopenmp
EOF
```
Note: for ``LDFLAGS=-L/usr/local/Cellar/gcc/6.3.0/lib`` and ``CPPFLAGS=-I/usr/local/Cellar/gcc/6.3.0/include``, you may need to change ``6.3.0`` to match your installed gcc version.
Examples
------------
......