# run.py: example script that trains a BGNNPredictor with a DGL GNN model on a dataset read from disk.
import torch
from BGNN import BGNNPredictor
import pandas as pd
import numpy as np
import json
import os
from dgl.data.utils import load_graphs
from dgl.nn.pytorch import GATConv as GATConvDGL, GraphConv, ChebConv as ChebConvDGL, \
    AGNNConv as AGNNConvDGL, APPNPConv
from torch.nn import Dropout, ELU, Sequential, Linear, ReLU
import torch.nn.functional as F
from category_encoders import CatBoostEncoder
from sklearn import preprocessing

class GNNModelDGL(torch.nn.Module):
    """Two-stage DGL graph neural network whose architecture is chosen by ``name``.

    Supported values of ``name`` are listed in ``SUPPORTED_NAMES``:

    * ``'gat'``   -- two ``GATConv`` layers (8 heads, then 1 head).
    * ``'gcn'``   -- dropout followed by two ``GraphConv`` layers.
    * ``'cheb'``  -- dropout followed by two ``ChebConv`` layers (``k=3``).
    * ``'agnn'``  -- linear block, two ``AGNNConv`` layers, linear block.
    * ``'appnp'`` -- MLP followed by ``APPNPConv`` propagation.

    Args:
        in_dim: Number of input features per node.
        hidden_dim: Hidden width. For ``'gat'`` the first layer emits
            ``8 * (hidden_dim // 8)`` features, so ``hidden_dim`` should be
            divisible by 8 for the second layer's input size to match.
        out_dim: Number of outputs per node.
        dropout: Dropout probability used by the dropout-capable layers.
        name: Architecture identifier (see above).
        residual: Passed to the second GAT layer only.
        use_mlp: If true, node features are first passed through an MLP.
        join_with_mlp: With ``use_mlp``, concatenate the MLP output to the raw
            features instead of replacing them.

    Raises:
        ValueError: If ``name`` is not one of ``SUPPORTED_NAMES``.
    """

    SUPPORTED_NAMES = ('gat', 'gcn', 'cheb', 'agnn', 'appnp')

    def __init__(self, in_dim, hidden_dim, out_dim,
                 dropout=0., name='gat', residual=True, use_mlp=False, join_with_mlp=False):
        super(GNNModelDGL, self).__init__()
        # Fail at construction time: an unknown name would otherwise build no
        # layers and only surface later as an unbound `logits` in forward().
        if name not in self.SUPPORTED_NAMES:
            raise ValueError(
                f"Unknown GNN name {name!r}; expected one of {self.SUPPORTED_NAMES}")

        self.name = name
        self.use_mlp = use_mlp
        self.join_with_mlp = join_with_mlp
        self.normalize_input_columns = True

        if use_mlp:
            # forward() reads self.mlp, so it has to exist whenever use_mlp is
            # set. Same layout as the MLP used by the 'appnp' branch.
            self.mlp = Sequential(Dropout(p=dropout), Linear(in_dim, hidden_dim),
                                  ReLU(), Dropout(p=dropout), Linear(hidden_dim, out_dim))
            # The graph layers see either [features, mlp(features)] or mlp(features).
            in_dim = in_dim + out_dim if join_with_mlp else out_dim

        if name == 'gat':
            self.l1 = GATConvDGL(in_dim, hidden_dim // 8, 8, feat_drop=dropout, attn_drop=dropout,
                                 residual=False, activation=F.elu)
            self.l2 = GATConvDGL(hidden_dim, out_dim, 1, feat_drop=dropout, attn_drop=dropout,
                                 residual=residual, activation=None)
        elif name == 'gcn':
            self.l1 = GraphConv(in_dim, hidden_dim, activation=F.elu)
            self.l2 = GraphConv(hidden_dim, out_dim, activation=F.elu)
            self.drop = Dropout(p=dropout)
        elif name == 'cheb':
            self.l1 = ChebConvDGL(in_dim, hidden_dim, k=3)
            self.l2 = ChebConvDGL(hidden_dim, out_dim, k=3)
            self.drop = Dropout(p=dropout)
        elif name == 'agnn':
            self.lin1 = Sequential(Dropout(p=dropout), Linear(in_dim, hidden_dim), ELU())
            self.l1 = AGNNConvDGL(learn_beta=False)
            self.l2 = AGNNConvDGL(learn_beta=True)
            self.lin2 = Sequential(Dropout(p=dropout), Linear(hidden_dim, out_dim), ELU())
        elif name == 'appnp':
            self.lin1 = Sequential(Dropout(p=dropout), Linear(in_dim, hidden_dim),
                                   ReLU(), Dropout(p=dropout), Linear(hidden_dim, out_dim))
            self.l1 = APPNPConv(k=10, alpha=0.1, edge_drop=0.)

    def forward(self, graph, features):
        """Run the selected architecture on ``graph`` with node ``features``.

        Args:
            graph: DGL graph passed to every graph convolution.
            features: Node feature tensor; concatenation along dim 1 is used
                when ``join_with_mlp`` is set, so it is treated as 2-D.

        Returns:
            The output tensor of the last layer of the chosen architecture.

        Raises:
            ValueError: If ``self.name`` was changed to an unsupported value
                after construction.
        """
        h = features
        if self.use_mlp:
            mlp_out = self.mlp(features)
            h = torch.cat((h, mlp_out), 1) if self.join_with_mlp else mlp_out

        if self.name == 'gat':
            # flatten(1) merges the (heads, hidden) axes; mean(1) averages the heads.
            h = self.l1(graph, h).flatten(1)
            logits = self.l2(graph, h).mean(1)
        elif self.name == 'appnp':
            h = self.lin1(h)
            logits = self.l1(graph, h)
        elif self.name == 'agnn':
            h = self.lin1(h)
            h = self.l1(graph, h)
            h = self.l2(graph, h)
            logits = self.lin2(h)
        elif self.name == 'cheb':
            # The file imports only dgl submodules, so bring the top-level
            # package into scope here for laplacian_lambda_max.
            import dgl
            lambda_max = dgl.laplacian_lambda_max(graph)
            h = self.drop(h)
            h = self.l1(graph, h, lambda_max)
            logits = self.l2(graph, h, lambda_max)
        elif self.name == 'gcn':
            h = self.drop(h)
            h = self.l1(graph, h)
            logits = self.l2(graph, h)
        else:
            raise ValueError(f"Unknown GNN name {self.name!r}")

        return logits

def read_input(input_folder):
    """Load one dataset folder.

    Reads ``X.csv``, ``y.csv``, the optional ``cat_features.txt`` (one column
    name per line, blank lines ignored), ``graph.dgl`` and ``masks.json`` from
    ``input_folder``.

    Returns:
        ``(graph, X, y, cat_features, masks)`` where ``cat_features`` holds the
        positional indices of the categorical columns of ``X``, or ``None``
        when no categorical column names were listed. Columns of ``X`` at
        those positions are converted to ``str``.
    """
    X = pd.read_csv(f'{input_folder}/X.csv')
    y = pd.read_csv(f'{input_folder}/y.csv')

    names_path = f'{input_folder}/cat_features.txt'
    categorical_columns = []
    if os.path.exists(names_path):
        with open(names_path) as names_file:
            stripped = (raw.strip() for raw in names_file)
            categorical_columns = [name for name in stripped if name]

    if categorical_columns:
        all_columns = X.columns
        cat_features = np.where(all_columns.isin(categorical_columns))[0]
        # Categorical values are handled as strings downstream.
        for column in all_columns[cat_features].tolist():
            X[column] = X[column].astype(str)
    else:
        cat_features = None

    # Only the first graph stored in the file is used.
    graphs, _ = load_graphs(f'{input_folder}/graph.dgl')
    graph = graphs[0]

    with open(f'{input_folder}/masks.json') as masks_file:
        masks = json.load(masks_file)

    return graph, X, y, cat_features, masks

def normalize_features(X, train_mask, val_mask, test_mask):
    """Min-max scale ``X``, fitting the scaler on the training rows only.

    The scaler fitted on ``train_mask`` rows is then applied to the rows
    selected by ``val_mask + test_mask``. Returns a new float DataFrame with
    the columns of ``X``; ``X`` itself is not modified.
    """
    scaler = preprocessing.MinMaxScaler()
    values = X.to_numpy(copy=True)

    values[train_mask] = scaler.fit_transform(values[train_mask])

    # NOTE(review): `+` concatenates when the masks are Python lists (as when
    # loaded from JSON); confirm before passing other mask types.
    held_out = val_mask + test_mask
    values[held_out] = scaler.transform(values[held_out])

    scaled = pd.DataFrame(values, columns=X.columns)
    return scaled.astype(float)

def replace_na(X, train_mask):
    """Fill missing values with a per-column value below the training minimum.

    Each NaN is replaced by ``min(column over train_mask rows) - 1``. When
    ``X`` contains no missing values it is returned unchanged (same object).
    """
    has_missing = X.isna().to_numpy().any()
    if not has_missing:
        return X

    fill_values = X.iloc[train_mask].min() - 1
    return X.fillna(fill_values)

def encode_cat_features(X, y, cat_features, train_mask, val_mask, test_mask):
    """Encode the categorical columns of ``X`` with a ``CatBoostEncoder``.

    The encoder is fitted on the ``train_mask`` rows (using the matching rows
    of ``y`` as target) and then applied to the ``val_mask + test_mask`` rows.
    ``cat_features`` are positional column indices. Returns a new float
    DataFrame with the columns of ``X``.
    """
    encoder = CatBoostEncoder()
    features = X.to_numpy(copy=True)
    targets = y.to_numpy(copy=True)

    # Open-mesh index selecting (rows, categorical columns) of the train split.
    train_cells = np.ix_(train_mask, cat_features)
    features[train_cells] = encoder.fit_transform(features[train_cells], targets[train_mask])

    # NOTE(review): `+` concatenates when the masks are Python lists; confirm
    # before passing other mask types.
    held_out_cells = np.ix_(val_mask + test_mask, cat_features)
    features[held_out_cells] = encoder.transform(features[held_out_cells])

    return pd.DataFrame(features.astype(float), columns=X.columns)

if __name__ == '__main__':
    # datasets can be found here: https://www.dropbox.com/s/verx1evkykzli88/datasets.zip
    # Read dataset
    input_folder = 'datasets/avazu'
    graph, X, y, cat_features, masks = read_input(input_folder)
    split = masks['0']
    train_mask, val_mask, test_mask = split['train'], split['val'], split['test']

    # Preprocessing switches
    encoded_X = X.copy()
    normalize_inputs = False
    fill_missing = True

    # read_input returns None (not an empty array) when the dataset declares
    # no categorical columns, so guard before calling len().
    if cat_features is not None and len(cat_features):
        encoded_X = encode_cat_features(encoded_X, y, cat_features, train_mask, val_mask, test_mask)
    if normalize_inputs:
        encoded_X = normalize_features(encoded_X, train_mask, val_mask, test_mask)
    if fill_missing:
        encoded_X = replace_na(encoded_X, train_mask)

    # specify parameters
    task = 'regression'
    hidden_dim = 128
    trees_per_epoch = 5  # 5-10 are good values to try
    backprop_per_epoch = 5  # 5-10 are good values to try
    lr = 0.1  # 0.01-0.1 are good values to try
    append_gbdt_pred = False  # this can be important for performance (try True and False)
    train_input_features = False
    gbdt_depth = 6
    gbdt_lr = 0.1

    # NOTE(review): for classification the number of classes is counted on the
    # test rows only; a class absent from the test split would be missed.
    out_dim = y.shape[1] if task == 'regression' else len(set(y.iloc[test_mask, 0]))
    in_dim = out_dim + X.shape[1] if append_gbdt_pred else out_dim

    # specify GNN model
    gnn_model = GNNModelDGL(in_dim, hidden_dim, out_dim)

    # initialize BGNN model
    bgnn = BGNNPredictor(gnn_model, task=task,
                         loss_fn=None,
                         trees_per_epoch=trees_per_epoch,
                         backprop_per_epoch=backprop_per_epoch,
                         lr=lr,
                         append_gbdt_pred=append_gbdt_pred,
                         train_input_features=train_input_features,
                         gbdt_depth=gbdt_depth,
                         gbdt_lr=gbdt_lr)

    # train
    metrics = bgnn.fit(graph, encoded_X, y, train_mask, val_mask, test_mask,
                       original_X=X, cat_features=cat_features,
                       num_epochs=100, patience=10, metric_name='loss')

    # plot the train/valid/test loss curves
    bgnn.plot_interactive(metrics, legend=['train', 'valid', 'test'], title='Avazu', metric_name='loss')