Unverified Commit f19f05ce authored by Hongzhi (Steve), Chen, committed by GitHub

[Misc] Black auto fix. (#4651)


Co-authored-by: Steve <ubuntu@ip-172-31-34-29.ap-northeast-1.compute.internal>
parent 977b1ba4
import torch as th
import dgl
class NegativeSampler(object):
def __init__(self, g, k, neg_share=False, device=None):
if device is None:
@@ -16,6 +18,6 @@ class NegativeSampler(object):
dst = self.weights.multinomial(n, replacement=True)
dst = dst.view(-1, 1, self.k).expand(-1, self.k, -1).flatten()
else:
dst = self.weights.multinomial(n * self.k, replacement=True)
src = src.repeat_interleave(self.k)
return src, dst
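# A hedged usage sketch (not part of this commit): assuming the method shown above is
# the sampler's __call__, a NegativeSampler is typically attached to a neighbor sampler
# for link prediction through DGL's edge-prediction wrapper (DGL >= 0.8 API assumed).
toy_g = dgl.rand_graph(1000, 5000)  # toy graph standing in for the real training graph
neg_sampler = NegativeSampler(toy_g, k=5)
sampler = dgl.dataloading.as_edge_prediction_sampler(
    dgl.dataloading.NeighborSampler([15, 10, 5]),
    negative_sampler=neg_sampler,
)
dataloader = dgl.dataloading.DataLoader(
    toy_g,
    th.arange(toy_g.num_edges()),  # sample over all edge IDs
    sampler,
    batch_size=1024,
    shuffle=True,
    drop_last=False,
    num_workers=0,
)
# Each mini-batch yields (input_nodes, positive_pair_graph, negative_pair_graph, blocks).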
import argparse
import time
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchmetrics.functional as MF
import tqdm
from ogb.nodeproppred import DglNodePropPredDataset
import dgl
import dgl.nn as dglnn
import time
import numpy as np
from ogb.nodeproppred import DglNodePropPredDataset
import tqdm
import argparse
class SAGE(nn.Module):
def __init__(self, in_feats, n_hidden, n_classes):
super().__init__()
self.layers = nn.ModuleList()
self.layers.append(dglnn.SAGEConv(in_feats, n_hidden, "mean"))
self.layers.append(dglnn.SAGEConv(n_hidden, n_hidden, "mean"))
self.layers.append(dglnn.SAGEConv(n_hidden, n_classes, "mean"))
self.dropout = nn.Dropout(0.5)
self.n_hidden = n_hidden
self.n_classes = n_classes
@@ -33,20 +36,31 @@ class SAGE(nn.Module):
def inference(self, g, device, batch_size, num_workers, buffer_device=None):
# The difference between this inference function and the one in the official
# example is that the intermediate results can also benefit from prefetching.
feat = g.ndata["feat"]
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(
1, prefetch_node_feats=["feat"]
)
dataloader = dgl.dataloading.DataLoader(
g,
torch.arange(g.num_nodes()).to(g.device),
sampler,
device=device,
batch_size=batch_size,
shuffle=False,
drop_last=False,
num_workers=num_workers,
)
if buffer_device is None:
buffer_device = device
for l, layer in enumerate(self.layers):
y = torch.empty(
g.num_nodes(),
self.n_hidden if l != len(self.layers) - 1 else self.n_classes,
device=buffer_device,
pin_memory=True,
)
feat = feat.to(device)
for input_nodes, output_nodes, blocks in tqdm.tqdm(dataloader):
# use an explicitly contiguous slice
@@ -57,44 +71,64 @@ class SAGE(nn.Module):
h = self.dropout(h)
# by design, our output nodes are contiguous so we can take
# advantage of that here
y[output_nodes[0] : output_nodes[-1] + 1] = h.to(buffer_device)
feat = y
return y
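# Hypothetical sketch of the forward pass that the collapsed hunk above hides
# (a typical three-layer mini-batch forward over DGL blocks; not part of this commit):
#
# def forward(self, blocks, x):
#     h = x
#     for l, (layer, block) in enumerate(zip(self.layers, blocks)):
#         h = layer(block, h)
#         if l != len(self.layers) - 1:
#             h = F.relu(h)
#             h = self.dropout(h)
#     return h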
dataset = DglNodePropPredDataset("ogbn-products")
graph, labels = dataset[0]
graph.ndata["label"] = labels.squeeze()
split_idx = dataset.get_idx_split()
train_idx, valid_idx, test_idx = (
split_idx["train"],
split_idx["valid"],
split_idx["test"],
)
device = "cuda"
train_idx = train_idx.to(device)
valid_idx = valid_idx.to(device)
test_idx = test_idx.to(device)
graph = graph.to(device)
model = SAGE(graph.ndata["feat"].shape[1], 256, dataset.num_classes).to(device)
opt = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
sampler = dgl.dataloading.NeighborSampler(
[15, 10, 5], prefetch_node_feats=["feat"], prefetch_labels=["label"]
)
train_dataloader = dgl.dataloading.DataLoader(
graph,
train_idx,
sampler,
device=device,
batch_size=1024,
shuffle=True,
drop_last=False,
num_workers=0,
use_uva=False,
)
valid_dataloader = dgl.dataloading.DataLoader(
graph,
valid_idx,
sampler,
device=device,
batch_size=1024,
shuffle=True,
drop_last=False,
num_workers=0,
use_uva=False,
)
durations = []
for _ in range(10):
model.train()
t0 = time.time()
for it, (input_nodes, output_nodes, blocks) in enumerate(train_dataloader):
x = blocks[0].srcdata["feat"]
y = blocks[-1].dstdata["label"]
y_hat = model(blocks, x)
loss = F.cross_entropy(y_hat, y)
opt.zero_grad()
@@ -103,7 +137,7 @@ for _ in range(10):
if it % 20 == 0:
acc = MF.accuracy(torch.argmax(y_hat, dim=1), y)
mem = torch.cuda.max_memory_allocated() / 1000000
print("Loss", loss.item(), "Acc", acc.item(), "GPU Mem", mem, "MB")
tt = time.time()
print(tt - t0)
durations.append(tt - t0)
@@ -113,19 +147,19 @@ for _ in range(10):
y_hats = []
for it, (input_nodes, output_nodes, blocks) in enumerate(valid_dataloader):
with torch.no_grad():
x = blocks[0].srcdata["feat"]
ys.append(blocks[-1].dstdata["label"])
y_hats.append(torch.argmax(model(blocks, x), dim=1))
acc = MF.accuracy(torch.cat(y_hats), torch.cat(ys))
print("Validation acc:", acc.item())
print(np.mean(durations[4:]), np.std(durations[4:]))
# Test accuracy and offline inference of all nodes
model.eval()
with torch.no_grad():
pred = model.inference(graph, device, 4096, 0, "cpu")
pred = pred[test_idx].to(device)
label = graph.ndata["label"][test_idx]
acc = MF.accuracy(torch.argmax(pred, dim=1), label)
print("Test acc:", acc.item())
import argparse
import os
import sys
import time

import numpy as np
import torch as th

import dgl

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from load_graph import load_ogb, load_reddit
if __name__ == "__main__":
argparser = argparse.ArgumentParser("Partition builtin graphs")
argparser.add_argument(
"--dataset",
type=str,
default="reddit",
help="datasets: reddit, ogb-product, ogb-paper100M",
)
argparser.add_argument(
"--num_parts", type=int, default=4, help="number of partitions"
)
argparser.add_argument(
"--part_method", type=str, default="metis", help="the partition method"
)
argparser.add_argument(
"--balance_train",
action="store_true",
help="balance the training size in each partition.",
)
argparser.add_argument(
"--undirected",
action="store_true",
help="turn the graph into an undirected graph.",
)
argparser.add_argument(
"--balance_edges",
action="store_true",
help="balance the number of edges in each partition.",
)
argparser.add_argument(
"--num_trainers_per_machine",
type=int,
default=1,
help="the number of trainers per machine. The trainer ids are stored\
in the node feature 'trainer_id'",
)
argparser.add_argument(
"--output",
type=str,
default="data",
help="Output path of partitioned graph.",
)
args = argparser.parse_args()
start = time.time()
if args.dataset == "reddit":
g, _ = load_reddit()
elif args.dataset == "ogb-product":
g, _ = load_ogb("ogbn-products")
elif args.dataset == "ogb-paper100M":
g, _ = load_ogb("ogbn-papers100M")
print(
"load {} takes {:.3f} seconds".format(args.dataset, time.time() - start)
)
print("|V|={}, |E|={}".format(g.number_of_nodes(), g.number_of_edges()))
print(
"train: {}, valid: {}, test: {}".format(
th.sum(g.ndata["train_mask"]),
th.sum(g.ndata["val_mask"]),
th.sum(g.ndata["test_mask"]),
)
)
if args.balance_train:
balance_ntypes = g.ndata["train_mask"]
else:
balance_ntypes = None
@@ -52,8 +84,13 @@ if __name__ == '__main__':
sym_g.ndata[key] = g.ndata[key]
g = sym_g
dgl.distributed.partition_graph(
g,
args.dataset,
args.num_parts,
args.output,
part_method=args.part_method,
balance_ntypes=balance_ntypes,
balance_edges=args.balance_edges,
num_trainers_per_machine=args.num_trainers_per_machine,
)
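# A minimal follow-up sketch (assumed DGL API, hypothetical paths): partition_graph
# writes a "<graph name>.json" config under --output, and each part can be loaded
# back for inspection with dgl.distributed.load_partition. The exact return tuple
# varies across DGL versions, so only the first element (the partition's graph
# structure) is printed here.
part_config = os.path.join(args.output, args.dataset + ".json")
part_data = dgl.distributed.load_partition(part_config, part_id=0)
print(part_data[0])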
@@ -11,47 +11,49 @@ Copyright (c) 2021 Intel Corporation
"""
import argparse
import csv
import os
import random
import sys
import time
from statistics import mean

import numpy as np

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from load_graph import load_ogb

import dgl
from dgl.base import DGLError
from dgl.data import load_data
from dgl.distgnn.partition import partition_graph
from dgl.distgnn.tools import load_proteins
if __name__ == "__main__":
argparser = argparse.ArgumentParser()
argparser.add_argument("--dataset", type=str, default="cora")
argparser.add_argument("--num-parts", type=int, default=2)
argparser.add_argument("--out-dir", type=str, default="./")
args = argparser.parse_args()
dataset = args.dataset
num_community = args.num_parts
out_dir = "Libra_result_" + dataset ## "Libra_result_" prefix is mandatory
resultdir = os.path.join(args.out_dir, out_dir)
print("Input dataset for partitioning: ", dataset)
if args.dataset == "ogbn-products":
print("Loading ogbn-products")
G, _ = load_ogb("ogbn-products")
elif args.dataset == "ogbn-papers100M":
print("Loading ogbn-papers100M")
G, _ = load_ogb("ogbn-papers100M")
elif args.dataset == "proteins":
G = load_proteins("proteins")
elif args.dataset == "ogbn-arxiv":
print("Loading ogbn-arxiv")
G, _ = load_ogb("ogbn-arxiv")
else:
try:
G = load_data(args)[0]
...
import torch as th
import dgl
def load_reddit(self_loop=True):
from dgl.data import RedditDataset
# load reddit data
data = RedditDataset(self_loop=self_loop)
g = data[0]
g.ndata["features"] = g.ndata.pop("feat")
g.ndata["labels"] = g.ndata.pop("label")
return g, data.num_classes
def load_ogb(name, root="dataset"):
from ogb.nodeproppred import DglNodePropPredDataset
print("load", name)
data = DglNodePropPredDataset(name=name, root=root)
print("finish loading", name)
splitted_idx = data.get_idx_split()
graph, labels = data[0]
labels = labels[:, 0]
graph.ndata["features"] = graph.ndata.pop("feat")
graph.ndata["labels"] = labels
in_feats = graph.ndata["features"].shape[1]
num_labels = len(th.unique(labels[th.logical_not(th.isnan(labels))]))
# Find the node IDs in the training, validation, and test set.
train_nid, val_nid, test_nid = (
splitted_idx["train"],
splitted_idx["valid"],
splitted_idx["test"],
)
train_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
train_mask[train_nid] = True
val_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
val_mask[val_nid] = True
test_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
test_mask[test_nid] = True
graph.ndata["train_mask"] = train_mask
graph.ndata["val_mask"] = val_mask
graph.ndata["test_mask"] = test_mask
print("finish constructing", name)
return graph, num_labels
def inductive_split(g):
"""Split the graph into training graph, validation graph, and test graph by training
and validation masks. Suitable for inductive models."""
train_g = g.subgraph(g.ndata["train_mask"])
val_g = g.subgraph(g.ndata["train_mask"] | g.ndata["val_mask"])
test_g = g
return train_g, val_g, test_g
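# A small usage sketch (not part of this commit) for the helpers above, assuming the
# file is importable as load_graph and run directly:
if __name__ == "__main__":
    g, n_classes = load_reddit()  # or: load_ogb("ogbn-products")
    train_g, val_g, test_g = inductive_split(g)
    print(g.ndata["features"].shape, n_classes)
    print(train_g.num_nodes(), val_g.num_nodes(), test_g.num_nodes())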