# leaf_stability.R
# We are going to look at how iterating too much might generate observation instability.
# Obviously, we are in a controlled environment, without issues (real rules).
# Do not do this in a real scenario.

library(lightgbm)

# define helper functions for creating plots

# output of `RColorBrewer::brewer.pal(10, "RdYlGn")`, hard-coded here to avoid
# a dependency. Colour i + 1 is used for quantile bin i (bins run 0-9).
.diverging_palette <- c(
  "#A50026", "#D73027", "#F46D43", "#FDAE61", "#FEE08B",
  "#D9EF8B", "#A6D96A", "#66BD63", "#1A9850", "#006837"
)

# Scatter the average leaf (df$X) against the prediction probability (df$Y),
# colouring each point by its quantile bin (df$binned), and add a bin legend.
# Expects columns X, Y and binned; binned values index .diverging_palette
# after an offset of +1 (bin 0 -> first colour).
.prediction_depth_plot <- function(df) {
  bins_present <- sort(unique(df$binned))
  plot(
    x = df$X
    , y = df$Y
    , type = "p"
    , main = "Prediction Depth"
    , xlab = "Leaf Bin"
    , ylab = "Prediction Probability"
    , pch = 19L
    , col = .diverging_palette[df$binned + 1L]
  )
  legend(
    "topright"
    , title = "bin"
    , legend = bins_present
    , pch = 19L
    , col = .diverging_palette[bins_present + 1L]
    , cex = 0.7
  )
}

# Scatter the quantile bin (df$binned) against the logloss (df$Z), colouring
# each point by its bin, with a bin legend. The x-axis is pinned to the full
# bin range 0-9 so plots from different models are directly comparable.
.prediction_depth_spread_plot <- function(df) {
  bins_present <- sort(unique(df$binned))
  plot(
    x = df$binned
    , xlim = c(0L, 9L)
    , y = df$Z
    , type = "p"
    , main = "Prediction Depth Spread"
    , xlab = "Leaf Bin"
    , ylab = "Logloss"
    , pch = 19L
    , col = .diverging_palette[df$binned + 1L]
  )
  legend(
    "topright"
    , title = "bin"
    , legend = bins_present
    , pch = 19L
    , col = .diverging_palette[bins_present + 1L]
    , cex = 0.7
  )
}

# Plot the kernel density of the prediction probabilities (df$Y) as points,
# coloured (with recycling) by the quantile bin, plus a bin legend.
.depth_density_plot <- function(df) {
  bins_present <- sort(unique(df$binned))
  plot(
    x = density(df$Y)
    , xlim = range(df$Y)
    , type = "p"
    , main = "Depth Density"
    , xlab = "Prediction Probability"
    , ylab = "Bin Density"
    , pch = 19L
    , col = .diverging_palette[df$binned + 1L]
  )
  legend(
    "topright"
    , title = "bin"
    , legend = bins_present
    , pch = 19L
    , col = .diverging_palette[bins_present + 1L]
    , cex = 0.7
  )
}

# load some data
# `agaricus` is the binary mushroom dataset shipped with {lightgbm}; both
# splits arrive as a sparse matrix (`$data`) plus a 0/1 label (`$label`)
data(agaricus.train, package = "lightgbm")
train <- agaricus.train
dtrain <- lgb.Dataset(train$data, label = train$label)
data(agaricus.test, package = "lightgbm")
test <- agaricus.test
# the validation set must be created from the training Dataset so that both
# share the same feature binning
dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)

# setup parameters and we train a model
# heavy bagging (10% of the rows per iteration) together with a small
# learning rate keeps this first model conservative and stable
params <- list(
  objective = "regression"
  , metric = "l2"
  , min_data = 1L
  , learning_rate = 0.1
  , bagging_fraction = 0.1
  , bagging_freq = 1L
  , bagging_seed = 1L
)

valids <- list(test = dtest)

model <- lgb.train(
    params
    , dtrain
    , nrounds = 50L
    , valids = valids
)

# We create a data.frame with the following structure:
# X = average leaf of the observation throughout all trees
# Y = prediction probability (clamped to [1e-15, 1-1e-15] so logloss is finite)
# Z = logloss
# binned = binned quantile of average leaf
new_data <- data.frame(
    X = rowMeans(predict(
        model
        , agaricus.test$data
        , type = "leaf"
    ))
    , Y = pmin(
        pmax(
            predict(model, agaricus.test$data)
            , 1e-15
        )
        , 1.0 - 1e-15
    )
)
new_data$Z <- -1.0 * (agaricus.test$label * log(new_data$Y) + (1L - agaricus.test$label) * log(1L - new_data$Y))
# bin X by its inner deciles (10% .. 90%); observations outside those breaks
# come back from .bincode() as NA and are deliberately assigned to bin 0
new_data$binned <- .bincode(
    x = new_data$X
    , breaks = quantile(
        x = new_data$X
        , probs = seq_len(9L) / 10.0
    )
    , right = TRUE
    , include.lowest = TRUE
)
new_data$binned[is.na(new_data$binned)] <- 0L

# We can check the binned content
table(new_data$binned)

# We can plot the binned content
# On the second plot, we clearly notice the lower the bin (the lower the leaf value), the higher the loss
# On the third plot, it is smooth!
.prediction_depth_plot(df = new_data)
.prediction_depth_spread_plot(df = new_data)
.depth_density_plot(df = new_data)

# Now, let's show with other parameters
# no bagging and a full-strength learning rate: every tree fits the residuals
# aggressively, so the model starts to overfit
params <- list(
  objective = "regression"
  , metric = "l2"
  , min_data = 1L
  , learning_rate = 1.0
)

model2 <- lgb.train(
    params
    , dtrain
    , nrounds = 100L
    , valids = valids
)

# We create the data structure, but for model2
# (same X / Y / Z / binned layout as new_data above)
new_data2 <- data.frame(
    X = rowMeans(predict(
        model2
        , agaricus.test$data
        , type = "leaf"
    ))
    , Y = pmin(
        pmax(
            predict(
                model2
                , agaricus.test$data
            )
            , 1e-15
        )
        , 1.0 - 1e-15
    )
)
new_data2$Z <- -1.0 * (agaricus.test$label * log(new_data2$Y) + (1L - agaricus.test$label) * log(1L - new_data2$Y))
# bin X by its inner deciles; out-of-range observations (NA) become bin 0
new_data2$binned <- .bincode(
    x = new_data2$X
    , breaks = quantile(
        x = new_data2$X
        , probs = seq_len(9L) / 10.0
    )
    , right = TRUE
    , include.lowest = TRUE
)
new_data2$binned[is.na(new_data2$binned)] <- 0L

# We can check the binned content
table(new_data2$binned)

# We can plot the binned content
# On the second plot, we clearly notice the lower the bin (the lower the leaf value), the higher the loss
# On the third plot, it is clearly not smooth! We are severely overfitting the data, but the rules are
# real thus it is not an issue
# However, if the rules were not true, the loss would explode.
.prediction_depth_plot(df = new_data2)
.prediction_depth_spread_plot(df = new_data2)
.depth_density_plot(df = new_data2)

# Now, try with very severe overfitting
# same aggressive parameters as model2, but with 10x the boosting rounds
params <- list(
  objective = "regression"
  , metric = "l2"
  , min_data = 1L
  , learning_rate = 1.0
)

model3 <- lgb.train(
    params
    , dtrain
    , nrounds = 1000L
    , valids = valids
)

# We create the data structure, but for model3
# (same X / Y / Z / binned layout as new_data above)
new_data3 <- data.frame(
    X = rowMeans(predict(
        model3
        , agaricus.test$data
        , type = "leaf"
    ))
    , Y = pmin(
        pmax(
            predict(
                model3
                , agaricus.test$data
            )
            , 1e-15
        )
        , 1.0 - 1e-15
    )
)
new_data3$Z <- -1.0 * (agaricus.test$label * log(new_data3$Y) + (1L - agaricus.test$label) * log(1L - new_data3$Y))
# bin X by its inner deciles; out-of-range observations (NA) become bin 0
new_data3$binned <- .bincode(
    x = new_data3$X
    , breaks = quantile(
        x = new_data3$X
        , probs = seq_len(9L) / 10.0
    )
    , right = TRUE
    , include.lowest = TRUE
)
new_data3$binned[is.na(new_data3$binned)] <- 0L

# We can check the binned content
table(new_data3$binned)

# We can plot the binned content
# On the third plot, it is clearly not smooth! We are severely overfitting the data, but the rules
# are real thus it is not an issue.
# However, if the rules were not true, the loss would explode. See the sudden spikes?
.depth_density_plot(df = new_data3)

# Compare with our second model, the difference is severe. This is smooth.
.depth_density_plot(df = new_data2)