# This demo R code is to provide a demonstration of hyperparameter adjustment
# when scaling weights for appropriate learning
# As with any optimizers, bad parameters can impair performance

# Load library
library(lightgbm)

# We will train a model with the following scenarios:
# - Run 1: sum of weights equal to 6513 (x 1e-5) without adjusted regularization (not learning)
# - Run 2: sum of weights equal to 6513 (x 1e-5) with adjusted regularization (learning)
# - Run 3: sum of weights equal to 6513 with adjusted regularization (learning)

# Setup small weights
# Each observation gets weight 1e-5, so the total hessian mass is tiny
# (6513 * 1e-5 ~ 0.065 for the training set). This is what makes the
# default regularization in Run 1 prohibitive.
weights1 <- rep(1e-5, 6513L)  # one weight per agaricus.train row
weights2 <- rep(1e-5, 1611L)  # one weight per agaricus.test row

# Load the agaricus demo data bundled with lightgbm and build the
# weighted train/validation Datasets used by Runs 1 and 2.
data(agaricus.train, package = "lightgbm")
data(agaricus.test, package = "lightgbm")
train <- agaricus.train
test <- agaricus.test
dtrain <- lgb.Dataset(
    train$data
    , label = train$label
    , weight = weights1
)
dtest <- lgb.Dataset.create.valid(
    dtrain
    , test$data
    , label = test$label
    , weight = weights2
)
valids <- list(test = dtest)

# Run 1: sum of weights equal to 6513 (x 1e-5) without adjusted regularization (not learning)
# It cannot learn because regularization is too large!
# min_sum_hessian alone is bigger than the sum of weights, thus you will never learn anything
# Parameters for Run 1: min_sum_hessian = 10 dwarfs the total hessian mass
# (sum of weights ~ 0.065), so no split can ever satisfy the constraint.
params <- list(
    objective = "regression"
    , metric = "l2"
    , device = "cpu"
    , min_sum_hessian = 10.0
    , num_leaves = 7L
    , max_depth = 3L
    , nthread = 1L
    , min_data = 1L
    , learning_rate = 1.0
)
model <- lgb.train(
    params = params
    , data = dtrain
    , nrounds = 50L
    , valids = valids
    , early_stopping_rounds = 10L
)

# Validation l2 history: a flat line confirms the model never learned.
weight_loss <- as.numeric(model$record_evals$test$l2$eval)
plot(weight_loss) # Shows how poor the learning was: a straight line!

# Run 2: sum of weights equal to 6513 (x 1e-5) with adjusted regularization (learning)
# The adjusted regularization simply consists of scaling min_sum_hessian down
# to match the shrunken weights (equivalent to multiplying by 1e-4).
# Notice how it learns: there is no issue once we adjust the regularization ourselves.
# Parameters for Run 2: identical to Run 1 except min_sum_hessian is scaled
# down to 1e-4, which is small relative to the total weight mass (~0.065),
# so splits are accepted again.
params <- list(
    objective = "regression"
    , metric = "l2"
    , device = "cpu"
    , min_sum_hessian = 1e-4
    , num_leaves = 7L
    , max_depth = 3L
    , nthread = 1L
    , min_data = 1L
    , learning_rate = 1.0
)
model <- lgb.train(
    params = params
    , data = dtrain
    , nrounds = 50L
    , valids = valids
    , early_stopping_rounds = 10L
)

# Validation l2 history: decreasing values show the model is learning.
small_weight_loss <- as.numeric(model$record_evals$test$l2$eval)
plot(small_weight_loss) # It learns!

# Run 3: sum of weights equal to 6513 with adjusted regularization (learning)
# Rebuild the Datasets with unit weights (the default), so the sum of
# weights equals the number of rows (6513 for train, 1611 for test).
dtrain <- lgb.Dataset(train$data, label = train$label)
dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
valids <- list(test = dtest)

# Setup parameters and run model...
# Parameters for Run 3: same min_sum_hessian = 10 as Run 1, but with unit
# weights the total hessian mass is ~6513, so the constraint is harmless.
params <- list(
    objective = "regression"
    , metric = "l2"
    , device = "cpu"
    , min_sum_hessian = 10.0
    , num_leaves = 7L
    , max_depth = 3L
    , nthread = 1L
    , min_data = 1L
    , learning_rate = 1.0
)
model <- lgb.train(
    params = params
    , data = dtrain
    , nrounds = 50L
    , valids = valids
    , early_stopping_rounds = 10L
)

# Validation l2 history: decreasing values show the model is learning.
large_weight_loss <- as.numeric(model$record_evals$test$l2$eval)
plot(large_weight_loss) # It learns!


# Do you want to compare the learning? They both converge.
# Points near the identity line mean the two runs reached the same losses.
plot(small_weight_loss, large_weight_loss)
curve(1.0 * x, from = 0L, to = 0.02, add = TRUE) # identity reference line