Unverified Commit f19f05ce authored by Hongzhi (Steve), Chen, committed by GitHub

[Misc] Black auto fix. (#4651)


Co-authored-by: Steve <ubuntu@ip-172-31-34-29.ap-northeast-1.compute.internal>
parent 977b1ba4
import torch as th

import dgl


class NegativeSampler(object):
    def __init__(self, g, k, neg_share=False, device=None):
        if device is None:
@@ -16,6 +18,6 @@ class NegativeSampler(object):
            dst = self.weights.multinomial(n, replacement=True)
            dst = dst.view(-1, 1, self.k).expand(-1, self.k, -1).flatten()
        else:
            dst = self.weights.multinomial(n * self.k, replacement=True)
        src = src.repeat_interleave(self.k)
        return src, dst
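For context, a sampler like this is normally plugged into DGL's edge-prediction pipeline. The following is a minimal sketch, not part of this commit; it assumes a graph `g`, training edge IDs `train_eids`, and DGL's 0.8-style `as_edge_prediction_sampler` API.

# Hypothetical wiring of the NegativeSampler above into link-prediction
# minibatching; `g` and `train_eids` are assumed to exist.
neg_sampler = NegativeSampler(g, k=5)  # 5 negative destinations per positive edge
sampler = dgl.dataloading.NeighborSampler([15, 10, 5])
sampler = dgl.dataloading.as_edge_prediction_sampler(
    sampler, negative_sampler=neg_sampler
)
dataloader = dgl.dataloading.DataLoader(
    g,
    train_eids,
    sampler,
    batch_size=1024,
    shuffle=True,
    drop_last=False,
    num_workers=0,
)
for input_nodes, pos_graph, neg_graph, blocks in dataloader:
    # score the edges in pos_graph and neg_graph with a predictor and
    # minimize a binary cross-entropy or margin loss
    pass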
import argparse
import time

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchmetrics.functional as MF
import tqdm
from ogb.nodeproppred import DglNodePropPredDataset

import dgl
import dgl.nn as dglnn


class SAGE(nn.Module):
    def __init__(self, in_feats, n_hidden, n_classes):
        super().__init__()
        self.layers = nn.ModuleList()
        self.layers.append(dglnn.SAGEConv(in_feats, n_hidden, "mean"))
        self.layers.append(dglnn.SAGEConv(n_hidden, n_hidden, "mean"))
        self.layers.append(dglnn.SAGEConv(n_hidden, n_classes, "mean"))
        self.dropout = nn.Dropout(0.5)
        self.n_hidden = n_hidden
        self.n_classes = n_classes
@@ -33,20 +36,31 @@ class SAGE(nn.Module):
    def inference(self, g, device, batch_size, num_workers, buffer_device=None):
        # The difference between this inference function and the one in the official
        # example is that the intermediate results can also benefit from prefetching.
        feat = g.ndata["feat"]
        sampler = dgl.dataloading.MultiLayerFullNeighborSampler(
            1, prefetch_node_feats=["feat"]
        )
        dataloader = dgl.dataloading.DataLoader(
            g,
            torch.arange(g.num_nodes()).to(g.device),
            sampler,
            device=device,
            batch_size=batch_size,
            shuffle=False,
            drop_last=False,
            num_workers=num_workers,
        )
        if buffer_device is None:
            buffer_device = device
        for l, layer in enumerate(self.layers):
            y = torch.empty(
                g.num_nodes(),
                self.n_hidden if l != len(self.layers) - 1 else self.n_classes,
                device=buffer_device,
                pin_memory=True,
            )
            feat = feat.to(device)
            for input_nodes, output_nodes, blocks in tqdm.tqdm(dataloader):
                # use an explicitly contiguous slice
@@ -57,44 +71,64 @@ class SAGE(nn.Module):
                    h = self.dropout(h)
                # by design, our output nodes are contiguous so we can take
                # advantage of that here
                y[output_nodes[0] : output_nodes[-1] + 1] = h.to(buffer_device)
            feat = y
        return y

dataset = DglNodePropPredDataset("ogbn-products")
graph, labels = dataset[0]
graph.ndata["label"] = labels.squeeze()
split_idx = dataset.get_idx_split()
train_idx, valid_idx, test_idx = (
    split_idx["train"],
    split_idx["valid"],
    split_idx["test"],
)

device = "cuda"
train_idx = train_idx.to(device)
valid_idx = valid_idx.to(device)
test_idx = test_idx.to(device)
graph = graph.to(device)

model = SAGE(graph.ndata["feat"].shape[1], 256, dataset.num_classes).to(device)
opt = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)

sampler = dgl.dataloading.NeighborSampler(
    [15, 10, 5], prefetch_node_feats=["feat"], prefetch_labels=["label"]
)
train_dataloader = dgl.dataloading.DataLoader(
    graph,
    train_idx,
    sampler,
    device=device,
    batch_size=1024,
    shuffle=True,
    drop_last=False,
    num_workers=0,
    use_uva=False,
)
valid_dataloader = dgl.dataloading.DataLoader(
    graph,
    valid_idx,
    sampler,
    device=device,
    batch_size=1024,
    shuffle=True,
    drop_last=False,
    num_workers=0,
    use_uva=False,
)
durations = []
for _ in range(10):
    model.train()
    t0 = time.time()
    for it, (input_nodes, output_nodes, blocks) in enumerate(train_dataloader):
        x = blocks[0].srcdata["feat"]
        y = blocks[-1].dstdata["label"]
        y_hat = model(blocks, x)
        loss = F.cross_entropy(y_hat, y)
        opt.zero_grad()
@@ -103,7 +137,7 @@ for _ in range(10):
        if it % 20 == 0:
            acc = MF.accuracy(torch.argmax(y_hat, dim=1), y)
            mem = torch.cuda.max_memory_allocated() / 1000000
            print("Loss", loss.item(), "Acc", acc.item(), "GPU Mem", mem, "MB")
    tt = time.time()
    print(tt - t0)
    durations.append(tt - t0)
@@ -113,19 +147,19 @@ for _ in range(10):
    y_hats = []
    for it, (input_nodes, output_nodes, blocks) in enumerate(valid_dataloader):
        with torch.no_grad():
            x = blocks[0].srcdata["feat"]
            ys.append(blocks[-1].dstdata["label"])
            y_hats.append(torch.argmax(model(blocks, x), dim=1))
    acc = MF.accuracy(torch.cat(y_hats), torch.cat(ys))
    print("Validation acc:", acc.item())
print(np.mean(durations[4:]), np.std(durations[4:]))

# Test accuracy and offline inference of all nodes
model.eval()
with torch.no_grad():
    pred = model.inference(graph, device, 4096, 0, "cpu")
    pred = pred[test_idx].to(device)
    label = graph.ndata["label"][test_idx]
    acc = MF.accuracy(torch.argmax(pred, dim=1), label)
    print("Test acc:", acc.item())
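As a side note (not part of this commit), the `use_uva` flag that the script sets to False enables unified-virtual-addressing sampling when turned on. A hedged sketch of that variant, assuming DGL 0.8+ and keeping the graph on the CPU so the dataloader can pin it:

# Hypothetical UVA variant: leave the full graph on the CPU and let the
# dataloader gather features over the bus, trading speed for GPU memory.
graph = graph.to("cpu")
train_dataloader = dgl.dataloading.DataLoader(
    graph,
    train_idx,  # indices stay on the GPU
    sampler,
    device=device,
    batch_size=1024,
    shuffle=True,
    drop_last=False,
    num_workers=0,
    use_uva=True,
)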
import argparse
import os
import sys
import time

import numpy as np
import torch as th

import dgl

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from load_graph import load_ogb, load_reddit
if __name__ == "__main__":
    argparser = argparse.ArgumentParser("Partition builtin graphs")
    argparser.add_argument(
        "--dataset",
        type=str,
        default="reddit",
        help="datasets: reddit, ogb-product, ogb-paper100M",
    )
    argparser.add_argument(
        "--num_parts", type=int, default=4, help="number of partitions"
    )
    argparser.add_argument(
        "--part_method", type=str, default="metis", help="the partition method"
    )
    argparser.add_argument(
        "--balance_train",
        action="store_true",
        help="balance the training size in each partition.",
    )
    argparser.add_argument(
        "--undirected",
        action="store_true",
        help="turn the graph into an undirected graph.",
    )
    argparser.add_argument(
        "--balance_edges",
        action="store_true",
        help="balance the number of edges in each partition.",
    )
    argparser.add_argument(
        "--num_trainers_per_machine",
        type=int,
        default=1,
        help="the number of trainers per machine. The trainer ids are stored\
            in the node feature 'trainer_id'",
    )
    argparser.add_argument(
        "--output",
        type=str,
        default="data",
        help="Output path of partitioned graph.",
    )
    args = argparser.parse_args()

    start = time.time()
    if args.dataset == "reddit":
        g, _ = load_reddit()
    elif args.dataset == "ogb-product":
        g, _ = load_ogb("ogbn-products")
    elif args.dataset == "ogb-paper100M":
        g, _ = load_ogb("ogbn-papers100M")
    print(
        "load {} takes {:.3f} seconds".format(args.dataset, time.time() - start)
    )
    print("|V|={}, |E|={}".format(g.number_of_nodes(), g.number_of_edges()))
    print(
        "train: {}, valid: {}, test: {}".format(
            th.sum(g.ndata["train_mask"]),
            th.sum(g.ndata["val_mask"]),
            th.sum(g.ndata["test_mask"]),
        )
    )

    if args.balance_train:
        balance_ntypes = g.ndata["train_mask"]
    else:
        balance_ntypes = None
@@ -52,8 +84,13 @@ if __name__ == '__main__':
            sym_g.ndata[key] = g.ndata[key]
        g = sym_g

    dgl.distributed.partition_graph(
        g,
        args.dataset,
        args.num_parts,
        args.output,
        part_method=args.part_method,
        balance_ntypes=balance_ntypes,
        balance_edges=args.balance_edges,
        num_trainers_per_machine=args.num_trainers_per_machine,
    )
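For reference, the call the script ultimately makes is `dgl.distributed.partition_graph`. A minimal stand-alone sketch with a hypothetical toy graph in place of Reddit/OGB, reusing the same keyword arguments as above:

# Hypothetical toy example of the partition call used by this script.
import torch as th

import dgl

g = dgl.rand_graph(1000, 5000)  # small random homogeneous graph
g.ndata["train_mask"] = th.zeros(g.num_nodes(), dtype=th.bool)
g.ndata["train_mask"][:100] = True  # pretend the first 100 nodes are training nodes

dgl.distributed.partition_graph(
    g,
    "toy",  # graph name used for the partition files
    2,  # number of partitions
    "data",  # output directory
    part_method="metis",
    balance_ntypes=g.ndata["train_mask"],
    balance_edges=True,
    num_trainers_per_machine=1,
)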
@@ -11,47 +11,49 @@ Copyright (c) 2021 Intel Corporation
"""
import argparse
import csv
import os
import random
import sys
import time
from statistics import mean

import numpy as np

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from load_graph import load_ogb

import dgl
from dgl.data import load_data
from dgl.distgnn.partition import partition_graph
from dgl.distgnn.tools import load_proteins
from dgl.base import DGLError

if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument("--dataset", type=str, default="cora")
    argparser.add_argument("--num-parts", type=int, default=2)
    argparser.add_argument("--out-dir", type=str, default="./")
    args = argparser.parse_args()

    dataset = args.dataset
    num_community = args.num_parts
    out_dir = "Libra_result_" + dataset  ## "Libra_result_" prefix is mandatory
    resultdir = os.path.join(args.out_dir, out_dir)

    print("Input dataset for partitioning: ", dataset)
    if args.dataset == "ogbn-products":
        print("Loading ogbn-products")
        G, _ = load_ogb("ogbn-products")
    elif args.dataset == "ogbn-papers100M":
        print("Loading ogbn-papers100M")
        G, _ = load_ogb("ogbn-papers100M")
    elif args.dataset == "proteins":
        G = load_proteins("proteins")
    elif args.dataset == "ogbn-arxiv":
        print("Loading ogbn-arxiv")
        G, _ = load_ogb("ogbn-arxiv")
    else:
        try:
            G = load_data(args)[0]
...
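The final `else` branch above falls back to DGL's built-in datasets through `dgl.data.load_data`, which dispatches on `args.dataset`. A minimal sketch of that path, under the assumption that the legacy `load_data(args)` helper only needs the `dataset` attribute for "cora":

# Hypothetical stand-alone use of the fallback branch.
from types import SimpleNamespace

from dgl.data import load_data

args = SimpleNamespace(dataset="cora")
G = load_data(args)[0]  # load_data returns a dataset object; [0] is the graph
print(G)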
import torch as th

import dgl


def load_reddit(self_loop=True):
    from dgl.data import RedditDataset

    # load reddit data
    data = RedditDataset(self_loop=self_loop)
    g = data[0]
    g.ndata["features"] = g.ndata.pop("feat")
    g.ndata["labels"] = g.ndata.pop("label")
    return g, data.num_classes


def load_ogb(name, root="dataset"):
    from ogb.nodeproppred import DglNodePropPredDataset

    print("load", name)
    data = DglNodePropPredDataset(name=name, root=root)
    print("finish loading", name)
    splitted_idx = data.get_idx_split()
    graph, labels = data[0]
    labels = labels[:, 0]

    graph.ndata["features"] = graph.ndata.pop("feat")
    graph.ndata["labels"] = labels
    in_feats = graph.ndata["features"].shape[1]
    num_labels = len(th.unique(labels[th.logical_not(th.isnan(labels))]))

    # Find the node IDs in the training, validation, and test set.
    train_nid, val_nid, test_nid = (
        splitted_idx["train"],
        splitted_idx["valid"],
        splitted_idx["test"],
    )
    train_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
    train_mask[train_nid] = True
    val_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
    val_mask[val_nid] = True
    test_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
    test_mask[test_nid] = True
    graph.ndata["train_mask"] = train_mask
    graph.ndata["val_mask"] = val_mask
    graph.ndata["test_mask"] = test_mask
    print("finish constructing", name)
    return graph, num_labels


def inductive_split(g):
    """Split the graph into training graph, validation graph, and test graph by training
    and validation masks. Suitable for inductive models."""
    train_g = g.subgraph(g.ndata["train_mask"])
    val_g = g.subgraph(g.ndata["train_mask"] | g.ndata["val_mask"])
    test_g = g
    return train_g, val_g, test_g
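A short usage sketch for the helpers above (assuming ogbn-arxiv has been downloaded to the default "dataset" root):

# Hypothetical usage of load_ogb and inductive_split.
g, num_classes = load_ogb("ogbn-arxiv")
train_g, val_g, test_g = inductive_split(g)
print("classes:", num_classes)
print(
    "train/val/test nodes:",
    train_g.num_nodes(),
    val_g.num_nodes(),
    test_g.num_nodes(),
)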