# Here we are going to train a model that uses categorical features

# Load libraries
library(data.table)
library(lightgbm)

# Load the data and look at its structure
#
# Classes 'data.table' and 'data.frame':	4521 obs. of  17 variables:
# $ age      : int  30 33 35 30 59 35 36 39 41 43 ...
# $ job      : chr  "unemployed" "services" "management" "management" ...
# $ marital  : chr  "married" "married" "single" "married" ...
# $ education: chr  "primary" "secondary" "tertiary" "tertiary" ...
# $ default  : chr  "no" "no" "no" "no" ...
# $ balance  : int  1787 4789 1350 1476 0 747 307 147 221 -88 ...
# $ housing  : chr  "no" "yes" "yes" "yes" ...
# $ loan     : chr  "no" "yes" "no" "yes" ...
# $ contact  : chr  "cellular" "cellular" "cellular" "unknown" ...
# $ day      : int  19 11 16 3 5 23 14 6 14 17 ...
# $ month    : chr  "oct" "may" "apr" "jun" ...
# $ duration : int  79 220 185 199 226 141 341 151 57 313 ...
# $ campaign : int  1 1 1 4 1 2 1 2 2 1 ...
# $ pdays    : int  -1 339 330 -1 -1 176 330 -1 -1 147 ...
# $ previous : int  0 4 1 0 0 3 2 0 0 2 ...
# $ poutcome : chr  "unknown" "failure" "failure" "unknown" ...
# $ y        : chr  "no" "no" "no" "no" ...
# Attach the `bank` dataset shipped in the lightgbm package and print its
# structure
data(list = "bank", package = "lightgbm")
str(object = bank)

# We must now transform the data into a form that LightGBM can be fitted on.
# For this task, we use lgb.prepare.
# After the call, the character columns hold numeric codes instead:
#
# Classes 'data.table' and 'data.frame':	4521 obs. of  17 variables:
# $ age      : int  30 33 35 30 59 35 36 39 41 43 ...
# $ job      : num  11 8 5 5 2 5 7 10 3 8 ...
# $ marital  : num  2 2 3 2 2 3 2 2 2 2 ...
# $ education: num  1 2 3 3 2 3 3 2 3 1 ...
# $ default  : num  1 1 1 1 1 1 1 1 1 1 ...
# $ balance  : int  1787 4789 1350 1476 0 747 307 147 221 -88 ...
# $ housing  : num  1 2 2 2 2 1 2 2 2 2 ...
# $ loan     : num  1 2 1 2 1 1 1 1 1 2 ...
# $ contact  : num  1 1 1 3 3 1 1 1 3 1 ...
# $ day      : int  19 11 16 3 5 23 14 6 14 17 ...
# $ month    : num  11 9 1 7 9 4 9 9 9 1 ...
# $ duration : int  79 220 185 199 226 141 341 151 57 313 ...
# $ campaign : int  1 1 1 4 1 2 1 2 2 1 ...
# $ pdays    : int  -1 339 330 -1 -1 176 330 -1 -1 147 ...
# $ previous : int  0 4 1 0 0 3 2 0 0 2 ...
# $ poutcome : num  4 1 1 4 4 1 2 4 4 1 ...
# $ y        : num  1 1 1 1 1 1 1 1 1 1 ...
# Overwrite `bank` with the output of lgb.prepare and inspect the result
bank <- lgb.prepare(bank)
str(object = bank)

# The prepared label `y` starts at 1 (see the str() output above).
# Subtract 1 so that it lies between 0 and 1, as the label must.
bank$y <- bank[["y"]] - 1L

# LightGBM takes its input data as a matrix that does not contain the label,
# so keep only the first 16 of the 17 columns (`y` is the last one)
my_data <- as.matrix(bank[, seq_len(16L), with = FALSE])

# Build the LightGBM dataset and declare which columns are categorical.
# Categorical features are given as R-style positions (1-indexed, not
# 0-indexed). Going by the str() listing, the positions below are:
# job, marital, education, default, housing, loan, contact, month, poutcome
lgb_data <- lgb.Dataset(
    data = my_data
    , label = bank[["y"]]
    , categorical_feature = c(2L:5L, 7L:9L, 11L, 16L)
)

# We can now train a model
#
# Training parameters, passed to lgb.train through its `params` argument.
# Each name appears exactly once: the list previously carried `min_data`
# twice (1L, then 0L), which left the intended value ambiguous. The first
# value (1L) is kept because it is also what `params$min_data` resolved to
# in R before the duplicate was removed.
params <- list(
    objective = "binary"
    , metric = "l2"
    , min_data = 1L
    , learning_rate = 0.1
    , min_hessian = 1.0
    , max_depth = 2L
)
# Train for 100 rounds on `lgb_data` with the parameters defined above.
# The training set is also supplied through `valids`, under the name "train".
model <- lgb.train(
    data = lgb_data
    , params = params
    , valids = list(train = lgb_data)
    , nrounds = 100L
)

# Dump the model for the first iteration only and look for `split_feature: 2`
# in the output. If it is there, the first tree used a categorical feature.
lgb.dump(booster = model, num_iteration = 1L)