import json
import os

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

import dgl
from BGNN import BGNNPredictor
from category_encoders import CatBoostEncoder
from dgl.data.utils import load_graphs
from dgl.nn.pytorch import (
    AGNNConv as AGNNConvDGL,
    APPNPConv,
    ChebConv as ChebConvDGL,
    GATConv as GATConvDGL,
    GraphConv,
)
from sklearn import preprocessing
from torch.nn import Dropout, ELU, Linear, ReLU, Sequential

class GNNModelDGL(torch.nn.Module):
    """Two-layer GNN built from one of several DGL convolution types.

    Supported architectures, selected via ``name``: ``"gat"``, ``"gcn"``,
    ``"cheb"``, ``"agnn"`` and ``"appnp"``. All variants map ``in_dim``
    input node features to ``out_dim`` node logits (for ``"appnp"`` the
    MLP produces ``out_dim`` directly and APPNP only propagates it).
    """

    def __init__(
        self,
        in_dim,
        hidden_dim,
        out_dim,
        dropout=0.0,
        name="gat",
        residual=True,
        use_mlp=False,
        join_with_mlp=False,
    ):
        """Build the layers for the architecture selected by ``name``.

        Parameters
        ----------
        in_dim : int
            Input node feature size.
        hidden_dim : int
            Hidden size; for ``"gat"`` it should be divisible by 8 because
            the first layer uses 8 attention heads of ``hidden_dim // 8``.
        out_dim : int
            Output size (number of targets/classes).
        dropout : float
            Dropout probability, applied where the architecture supports it.
        name : str
            Architecture key: "gat", "gcn", "cheb", "agnn" or "appnp".
        residual : bool
            Whether the second GAT layer uses a residual connection
            (GAT only).
        use_mlp, join_with_mlp : bool
            If ``use_mlp`` is set, ``forward`` calls ``self.mlp``, which is
            NOT created here — it must be attached externally before the
            forward pass. NOTE(review): confirm against BGNNPredictor.
        """
        super(GNNModelDGL, self).__init__()
        self.name = name
        self.use_mlp = use_mlp
        self.join_with_mlp = join_with_mlp
        # NOTE(review): flag is set but never read inside this class.
        self.normalize_input_columns = True
        if name == "gat":
            # Layer 1: 8 heads of hidden_dim // 8 (concatenated in forward);
            # layer 2: a single head averaged down to out_dim.
            self.l1 = GATConvDGL(
                in_dim,
                hidden_dim // 8,
                8,
                feat_drop=dropout,
                attn_drop=dropout,
                residual=False,
                activation=F.elu,
            )
            self.l2 = GATConvDGL(
                hidden_dim,
                out_dim,
                1,
                feat_drop=dropout,
                attn_drop=dropout,
                residual=residual,
                activation=None,
            )
        elif name == "gcn":
            self.l1 = GraphConv(in_dim, hidden_dim, activation=F.elu)
            self.l2 = GraphConv(hidden_dim, out_dim, activation=F.elu)
            self.drop = Dropout(p=dropout)
        elif name == "cheb":
            self.l1 = ChebConvDGL(in_dim, hidden_dim, k=3)
            self.l2 = ChebConvDGL(hidden_dim, out_dim, k=3)
            self.drop = Dropout(p=dropout)
        elif name == "agnn":
            # Linear projections sandwich two attention-based propagation
            # layers (beta fixed in the first, learned in the second).
            self.lin1 = Sequential(
                Dropout(p=dropout), Linear(in_dim, hidden_dim), ELU()
            )
            self.l1 = AGNNConvDGL(learn_beta=False)
            self.l2 = AGNNConvDGL(learn_beta=True)
            self.lin2 = Sequential(
                Dropout(p=dropout), Linear(hidden_dim, out_dim), ELU()
            )
        elif name == "appnp":
            # The MLP predicts out_dim directly; APPNP propagates it.
            self.lin1 = Sequential(
                Dropout(p=dropout),
                Linear(in_dim, hidden_dim),
                ReLU(),
                Dropout(p=dropout),
                Linear(hidden_dim, out_dim),
            )
            self.l1 = APPNPConv(k=10, alpha=0.1, edge_drop=0.0)

    def forward(self, graph, features):
        """Run the selected two-layer GNN on ``graph``; return node logits.

        Parameters
        ----------
        graph : dgl.DGLGraph
            Graph whose nodes carry ``features``.
        features : torch.Tensor
            Node feature matrix of shape (num_nodes, in_dim).
        """
        h = features
        if self.use_mlp:
            # self.mlp must have been attached externally (see __init__ doc).
            if self.join_with_mlp:
                h = torch.cat((h, self.mlp(features)), 1)
            else:
                h = self.mlp(features)
        if self.name == "gat":
            # Concatenate the 8 heads of layer 1, average layer 2's head.
            h = self.l1(graph, h).flatten(1)
            logits = self.l2(graph, h).mean(1)
        elif self.name in ["appnp"]:
            h = self.lin1(h)
            logits = self.l1(graph, h)
        elif self.name == "agnn":
            h = self.lin1(h)
            h = self.l1(graph, h)
            h = self.l2(graph, h)
            logits = self.lin2(h)
        elif self.name == "cheb":
            # BUGFIX: this branch previously tested `== "che3b"` (typo), so
            # a "cheb" model matched no branch and raised UnboundLocalError
            # on `logits` below.
            lambda_max = dgl.laplacian_lambda_max(graph)
            h = self.drop(h)
            h = self.l1(graph, h, lambda_max)
            logits = self.l2(graph, h, lambda_max)
        elif self.name == "gcn":
            h = self.drop(h)
            h = self.l1(graph, h)
            logits = self.l2(graph, h)

        return logits

def read_input(input_folder):
    """Load a dataset directory: features, targets, graph and split masks.

    Expects ``X.csv``, ``y.csv``, ``graph.dgl`` and ``masks.json`` inside
    ``input_folder``; ``cat_features.txt`` (one column name per line) is
    optional.

    Returns
    -------
    (graph, X, y, cat_features, masks)
        ``cat_features`` is an array of column indices, or ``None`` when
        no categorical columns are declared.
    """
    X = pd.read_csv(f"{input_folder}/X.csv")
    y = pd.read_csv(f"{input_folder}/y.csv")

    categorical_columns = []
    cat_list_path = f"{input_folder}/cat_features.txt"
    if os.path.exists(cat_list_path):
        with open(cat_list_path) as f:
            categorical_columns = [line.strip() for line in f if line.strip()]

    cat_features = None
    if categorical_columns:
        cat_features = np.where(X.columns.isin(categorical_columns))[0]
        # Categorical columns are consumed as strings by the encoder.
        for col in list(X.columns[cat_features]):
            X[col] = X[col].astype(str)

    gs, _ = load_graphs(f"{input_folder}/graph.dgl")
    graph = gs[0]

    with open(f"{input_folder}/masks.json") as f:
        masks = json.load(f)

    return graph, X, y, cat_features, masks

def normalize_features(X, train_mask, val_mask, test_mask):
    """Min-max scale all columns, fitting the scaler on training rows only.

    Validation/test rows are transformed with the train-fitted scaler to
    avoid leaking their statistics into the fit.
    """
    scaler = preprocessing.MinMaxScaler()
    values = X.to_numpy(copy=True)
    values[train_mask] = scaler.fit_transform(values[train_mask])
    holdout = val_mask + test_mask  # list concatenation of row indices
    values[holdout] = scaler.transform(values[holdout])
    return pd.DataFrame(values, columns=X.columns).astype(float)

def replace_na(X, train_mask):
    """Fill NaNs with a per-column sentinel below the training data range.

    The sentinel is (train-rows column minimum - 1), so imputed values are
    distinguishable from any observed training value. Returns ``X``
    unchanged (same object) when it contains no NaNs.
    """
    if not X.isna().any().any():
        return X
    sentinel = X.iloc[train_mask].min() - 1
    return X.fillna(sentinel)

def encode_cat_features(X, y, cat_features, train_mask, val_mask, test_mask):
    """Target-encode categorical columns, fitting on training rows only.

    Uses CatBoostEncoder so that validation/test rows are encoded with
    statistics learned exclusively from the training split.

    Parameters
    ----------
    cat_features : array-like of int
        Column indices of the categorical features within ``X``.
    """
    encoder = CatBoostEncoder()
    values = X.to_numpy(copy=True)
    targets = y.to_numpy(copy=True)

    train_cells = np.ix_(train_mask, cat_features)
    values[train_cells] = encoder.fit_transform(
        values[train_cells], targets[train_mask]
    )

    holdout_cells = np.ix_(val_mask + test_mask, cat_features)
    values[holdout_cells] = encoder.transform(values[holdout_cells])

    return pd.DataFrame(values.astype(float), columns=X.columns)

if __name__ == "__main__":
    # datasets can be found here: https://www.dropbox.com/s/verx1evkykzli88/datasets.zip
    # Read dataset
    input_folder = "datasets/avazu"
    graph, X, y, cat_features, masks = read_input(input_folder)

    # Use the first of the precomputed train/val/test splits.
    train_mask, val_mask, test_mask = (
        masks["0"]["train"],
        masks["0"]["val"],
        masks["0"]["test"],
    )

    encoded_X = X.copy()
    normalizeFeatures = False
    replaceNa = True

    # BUGFIX: read_input returns cat_features=None when no categorical
    # columns are declared; guard before len() to avoid a TypeError.
    if cat_features is not None and len(cat_features):
        encoded_X = encode_cat_features(
            encoded_X, y, cat_features, train_mask, val_mask, test_mask
        )
    if normalizeFeatures:
        encoded_X = normalize_features(
            encoded_X, train_mask, val_mask, test_mask
        )
    if replaceNa:
        encoded_X = replace_na(encoded_X, train_mask)

    # specify parameters
    task = "regression"
    hidden_dim = 128
    trees_per_epoch = 5  # 5-10 are good values to try
    backprop_per_epoch = 5  # 5-10 are good values to try
    lr = 0.1  # 0.01-0.1 are good values to try
    append_gbdt_pred = (
        False  # this can be important for performance (try True and False)
    )
    train_input_features = False
    gbdt_depth = 6
    gbdt_lr = 0.1

    # Output dim: one per target column for regression, otherwise the
    # number of distinct classes observed in the test split.
    out_dim = (
        y.shape[1] if task == "regression" else len(set(y.iloc[test_mask, 0]))
    )
    # The GNN consumes the GBDT prediction, optionally concatenated with
    # the raw input features.
    in_dim = out_dim + X.shape[1] if append_gbdt_pred else out_dim

    # specify GNN model
    gnn_model = GNNModelDGL(in_dim, hidden_dim, out_dim)

    # initialize BGNN model
    bgnn = BGNNPredictor(
        gnn_model,
        task=task,
        loss_fn=None,
        trees_per_epoch=trees_per_epoch,
        backprop_per_epoch=backprop_per_epoch,
        lr=lr,
        append_gbdt_pred=append_gbdt_pred,
        train_input_features=train_input_features,
        gbdt_depth=gbdt_depth,
        gbdt_lr=gbdt_lr,
    )

    # train
    metrics = bgnn.fit(
        graph,
        encoded_X,
        y,
        train_mask,
        val_mask,
        test_mask,
        original_X=X,
        cat_features=cat_features,
        num_epochs=100,
        patience=10,
        metric_name="loss",
    )

    bgnn.plot_interactive(
        metrics,
        legend=["train", "valid", "test"],
        title="Avazu",
        metric_name="loss",
    )