import dgl import torch as th import torch.optim as optim from torch.utils.data import DataLoader from sklearn import metrics import utils from model import EGES from sampler import Sampler def train(args, train_g, sku_info, num_skus, num_brands, num_shops, num_cates): sampler = Sampler( train_g, args.walk_length, args.num_walks, args.window_size, args.num_negative ) # for each node in the graph, we sample pos and neg # pairs for it, and feed these sampled pairs into the model. # (nodes in the graph are of course batched before sampling) dataloader = DataLoader( th.arange(train_g.num_nodes()), # this is the batch_size of input nodes batch_size=args.batch_size, shuffle=True, collate_fn=lambda x: sampler.sample(x, sku_info) ) model = EGES(args.dim, num_skus, num_brands, num_shops, num_cates) optimizer = optim.Adam(model.parameters(), lr=args.lr) for epoch in range(args.epochs): epoch_total_loss = 0 for step, (srcs, dsts, labels) in enumerate(dataloader): # the batch size of output pairs is unfixed # TODO: shuffle the triples? srcs_embeds, dsts_embeds = model(srcs, dsts) loss = model.loss(srcs_embeds, dsts_embeds, labels) optimizer.zero_grad() loss.backward() optimizer.step() epoch_total_loss += loss.item() if step % args.log_every == 0: print('Epoch {:05d} | Step {:05d} | Step Loss {:.4f} | Epoch Avg Loss: {:.4f}'.format( epoch, step, loss.item(), epoch_total_loss / (step + 1))) eval(model, test_g, sku_info) return model def eval(model, test_graph, sku_info): preds, labels = [], [] for edge in test_graph: src = th.tensor(sku_info[edge.src.numpy()[0]]).view(1, 4) dst = th.tensor(sku_info[edge.dst.numpy()[0]]).view(1, 4) # (1, dim) src = model.query_node_embed(src) dst = model.query_node_embed(dst) # (1, dim) -> (1, dim) -> (1, ) logit = th.sigmoid(th.sum(src * dst)) preds.append(logit.detach().numpy().tolist()) labels.append(edge.label) fpr, tpr, thresholds = metrics.roc_curve(labels, preds, pos_label=1) print("Evaluate link prediction AUC: {:.4f}".format(metrics.auc(fpr, tpr))) if __name__ == "__main__": args = utils.init_args() valid_sku_raw_ids = utils.get_valid_sku_set(args.item_info_data) g, sku_encoder, sku_decoder = utils.construct_graph( args.action_data, args.session_interval_sec, valid_sku_raw_ids ) train_g, test_g = utils.split_train_test_graph(g) sku_info_encoder, sku_info_decoder, sku_info = \ utils.encode_sku_fields(args.item_info_data, sku_encoder, sku_decoder) num_skus = len(sku_encoder) num_brands = len(sku_info_encoder["brand"]) num_shops = len(sku_info_encoder["shop"]) num_cates = len(sku_info_encoder["cate"]) print( "Num skus: {}, num brands: {}, num shops: {}, num cates: {}".\ format(num_skus, num_brands, num_shops, num_cates) ) model = train(args, train_g, sku_info, num_skus, num_brands, num_shops, num_cates)