import json
import os

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.nn import ELU, Dropout, Linear, ReLU, Sequential

import dgl
from dgl.data.utils import load_graphs
from dgl.nn.pytorch import AGNNConv as AGNNConvDGL
from dgl.nn.pytorch import APPNPConv
from dgl.nn.pytorch import ChebConv as ChebConvDGL
from dgl.nn.pytorch import GATConv as GATConvDGL
from dgl.nn.pytorch import GraphConv

from category_encoders import CatBoostEncoder
from sklearn import preprocessing

from BGNN import BGNNPredictor

class GNNModelDGL(torch.nn.Module):
    """Configurable GNN head (used as the neural part of BGNN), built from
    DGL convolution layers selected by ``name``.

    Parameters
    ----------
    in_dim : int
        Input feature dimension.
    hidden_dim : int
        Hidden width. For ``name="gat"`` it must be divisible by 8: the
        first layer uses 8 attention heads of size ``hidden_dim // 8``.
    out_dim : int
        Output dimension (number of regression targets / classes).
    dropout : float
        Dropout probability applied inside/around the conv layers.
    name : str
        Architecture: one of ``"gat"``, ``"gcn"``, ``"cheb"``, ``"agnn"``,
        ``"appnp"``.
    residual : bool
        Whether the second GAT layer uses a residual connection.
    use_mlp, join_with_mlp : bool
        If ``use_mlp`` is True, ``forward`` routes features through
        ``self.mlp`` (concatenated with the raw features when
        ``join_with_mlp`` is True).
        NOTE(review): ``self.mlp`` is never created in ``__init__``, so
        ``use_mlp=True`` would raise AttributeError — confirm intended
        usage upstream before enabling it.
    """

    def __init__(
        self,
        in_dim,
        hidden_dim,
        out_dim,
        dropout=0.0,
        name="gat",
        residual=True,
        use_mlp=False,
        join_with_mlp=False,
    ):
        super(GNNModelDGL, self).__init__()
        self.name = name
        self.use_mlp = use_mlp
        self.join_with_mlp = join_with_mlp
        self.normalize_input_columns = True
        if name == "gat":
            # 8 heads of size hidden_dim // 8; flattened to hidden_dim in forward.
            self.l1 = GATConvDGL(
                in_dim,
                hidden_dim // 8,
                8,
                feat_drop=dropout,
                attn_drop=dropout,
                residual=False,
                activation=F.elu,
            )
            self.l2 = GATConvDGL(
                hidden_dim,
                out_dim,
                1,
                feat_drop=dropout,
                attn_drop=dropout,
                residual=residual,
                activation=None,
            )
        elif name == "gcn":
            self.l1 = GraphConv(in_dim, hidden_dim, activation=F.elu)
            self.l2 = GraphConv(hidden_dim, out_dim, activation=F.elu)
            self.drop = Dropout(p=dropout)
        elif name == "cheb":
            self.l1 = ChebConvDGL(in_dim, hidden_dim, k=3)
            self.l2 = ChebConvDGL(hidden_dim, out_dim, k=3)
            self.drop = Dropout(p=dropout)
        elif name == "agnn":
            # AGNN convs keep the feature width; MLPs handle dim changes.
            self.lin1 = Sequential(
                Dropout(p=dropout), Linear(in_dim, hidden_dim), ELU()
            )
            self.l1 = AGNNConvDGL(learn_beta=False)
            self.l2 = AGNNConvDGL(learn_beta=True)
            self.lin2 = Sequential(
                Dropout(p=dropout), Linear(hidden_dim, out_dim), ELU()
            )
        elif name == "appnp":
            # APPNP: MLP predictor followed by personalized-PageRank propagation.
            self.lin1 = Sequential(
                Dropout(p=dropout),
                Linear(in_dim, hidden_dim),
                ReLU(),
                Dropout(p=dropout),
                Linear(hidden_dim, out_dim),
            )
            self.l1 = APPNPConv(k=10, alpha=0.1, edge_drop=0.0)

    def forward(self, graph, features):
        """Run the selected architecture over ``graph`` and return logits.

        Parameters
        ----------
        graph : dgl.DGLGraph
            The graph to convolve over.
        features : torch.Tensor
            Node feature matrix of shape (num_nodes, in_dim).
        """
        h = features
        if self.use_mlp:
            # NOTE(review): self.mlp is not defined in __init__; this path
            # raises AttributeError if use_mlp=True — confirm upstream.
            if self.join_with_mlp:
                h = torch.cat((h, self.mlp(features)), 1)
            else:
                h = self.mlp(features)
        if self.name == "gat":
            # Flatten the 8 heads into one hidden_dim vector, then average
            # the single-head second layer.
            h = self.l1(graph, h).flatten(1)
            logits = self.l2(graph, h).mean(1)
        elif self.name == "appnp":
            h = self.lin1(h)
            logits = self.l1(graph, h)
        elif self.name == "agnn":
            h = self.lin1(h)
            h = self.l1(graph, h)
            h = self.l2(graph, h)
            logits = self.lin2(h)
        elif self.name == "cheb":
            # BUG FIX: this branch previously tested for "che3b" (a typo),
            # so a model constructed with name="cheb" never reached it and
            # forward() raised UnboundLocalError on `logits`.
            lambda_max = dgl.laplacian_lambda_max(graph)
            h = self.drop(h)
            h = self.l1(graph, h, lambda_max)
            logits = self.l2(graph, h, lambda_max)
        elif self.name == "gcn":
            h = self.drop(h)
            h = self.l1(graph, h)
            logits = self.l2(graph, h)

        return logits

def read_input(input_folder):
    """Load features, targets, graph and split masks from ``input_folder``.

    Expects the files ``X.csv``, ``y.csv``, ``graph.dgl``, ``masks.json``
    and, optionally, ``cat_features.txt`` (one categorical column name per
    line).

    Returns
    -------
    tuple
        ``(graph, X, y, cat_features, masks)`` where ``cat_features`` is an
        integer array of categorical column positions in ``X`` (empty when
        the dataset has none) and ``masks`` is the parsed masks.json dict.
    """
    X = pd.read_csv(f"{input_folder}/X.csv")
    y = pd.read_csv(f"{input_folder}/y.csv")

    categorical_columns = []
    if os.path.exists(f"{input_folder}/cat_features.txt"):
        with open(f"{input_folder}/cat_features.txt") as f:
            for line in f:
                if line.strip():
                    categorical_columns.append(line.strip())

    # BUG FIX: this previously returned None when there were no categorical
    # columns, which made the caller's `len(cat_features)` raise TypeError.
    # An empty index array is falsy under len() and backward-compatible.
    cat_features = np.array([], dtype=int)
    if categorical_columns:
        columns = X.columns
        cat_features = np.where(columns.isin(categorical_columns))[0]

        # CatBoost-style encoders expect string-typed categorical values.
        for col in list(columns[cat_features]):
            X[col] = X[col].astype(str)

    gs, _ = load_graphs(f"{input_folder}/graph.dgl")
    graph = gs[0]

    with open(f"{input_folder}/masks.json") as f:
        masks = json.load(f)

    return graph, X, y, cat_features, masks

def normalize_features(X, train_mask, val_mask, test_mask):
    """Min-max scale every column of ``X``, fitting on training rows only.

    The scaler is fit on ``train_mask`` rows and then applied to the
    validation and test rows, so no statistics leak from the held-out
    splits. Returns a new float DataFrame with the same columns.
    """
    scaler = preprocessing.MinMaxScaler()
    values = X.to_numpy(copy=True)
    holdout_rows = val_mask + test_mask
    values[train_mask] = scaler.fit_transform(values[train_mask])
    values[holdout_rows] = scaler.transform(values[holdout_rows])
    return pd.DataFrame(values, columns=X.columns).astype(float)

def replace_na(X, train_mask):
    """Fill NaNs with (per-column training minimum - 1).

    Using a value strictly below the observed training range keeps missing
    entries distinguishable from real ones. When ``X`` has no NaNs it is
    returned unchanged (same object, no copy).
    """
    if not X.isna().any().any():
        return X
    fill_values = X.iloc[train_mask].min() - 1
    return X.fillna(fill_values)

def encode_cat_features(X, y, cat_features, train_mask, val_mask, test_mask):
    """Target-encode categorical columns with a CatBoost encoder.

    The encoder is fit only on the training rows (using ``y`` as the
    target) and then applied to the validation/test rows, avoiding target
    leakage. Returns a new all-float DataFrame with the same columns.
    """
    encoder = CatBoostEncoder()
    features = X.to_numpy(copy=True)
    targets = y.to_numpy(copy=True)
    holdout_rows = val_mask + test_mask

    train_cells = np.ix_(train_mask, cat_features)
    holdout_cells = np.ix_(holdout_rows, cat_features)
    features[train_cells] = encoder.fit_transform(
        features[train_cells], targets[train_mask]
    )
    features[holdout_cells] = encoder.transform(features[holdout_cells])

    return pd.DataFrame(features.astype(float), columns=X.columns)

if __name__ == "__main__":
    # datasets can be found here: https://www.dropbox.com/s/verx1evkykzli88/datasets.zip
    # Read dataset
    input_folder = "datasets/avazu"
    graph, X, y, cat_features, masks = read_input(input_folder)

    # Use the first stored train/val/test split.
    train_mask, val_mask, test_mask = (
        masks["0"]["train"],
        masks["0"]["val"],
        masks["0"]["test"],
    )

    encoded_X = X.copy()
    normalizeFeatures = False
    replaceNa = True

    # BUG FIX: read_input may return None for cat_features when the dataset
    # has no cat_features.txt; the old `if len(cat_features):` then raised
    # TypeError. Guard against None explicitly.
    if cat_features is not None and len(cat_features):
        encoded_X = encode_cat_features(
            encoded_X, y, cat_features, train_mask, val_mask, test_mask
        )
    if normalizeFeatures:
        encoded_X = normalize_features(
            encoded_X, train_mask, val_mask, test_mask
        )
    if replaceNa:
        encoded_X = replace_na(encoded_X, train_mask)

    # specify parameters
    task = "regression"
    hidden_dim = 128
    trees_per_epoch = 5  # 5-10 are good values to try
    backprop_per_epoch = 5  # 5-10 are good values to try
    lr = 0.1  # 0.01-0.1 are good values to try
    append_gbdt_pred = (
        False  # this can be important for performance (try True and False)
    )
    train_input_features = False
    gbdt_depth = 6
    gbdt_lr = 0.1

    # Output width: one column per regression target, else one per class
    # observed in the test split.
    out_dim = (
        y.shape[1] if task == "regression" else len(set(y.iloc[test_mask, 0]))
    )
    # GNN input: GBDT predictions, optionally concatenated with raw features.
    in_dim = out_dim + X.shape[1] if append_gbdt_pred else out_dim

    # specify GNN model
    gnn_model = GNNModelDGL(in_dim, hidden_dim, out_dim)

    # initialize BGNN model
    bgnn = BGNNPredictor(
        gnn_model,
        task=task,
        loss_fn=None,
        trees_per_epoch=trees_per_epoch,
        backprop_per_epoch=backprop_per_epoch,
        lr=lr,
        append_gbdt_pred=append_gbdt_pred,
        train_input_features=train_input_features,
        gbdt_depth=gbdt_depth,
        gbdt_lr=gbdt_lr,
    )

    # train
    metrics = bgnn.fit(
        graph,
        encoded_X,
        y,
        train_mask,
        val_mask,
        test_mask,
        original_X=X,
        cat_features=cat_features,
        num_epochs=100,
        patience=10,
        metric_name="loss",
    )

    bgnn.plot_interactive(
        metrics,
        legend=["train", "valid", "test"],
        title="Avazu",
        metric_name="loss",
    )