Unverified Commit a664b0c4 authored by Rhett Ying, committed by GitHub

[Doc] update large node classification (#6594)

parent 09a1a2f8
@@ -22,49 +22,37 @@ Sampling for GNN Training <L0_neighbor_sampling_overview>`.
# Loading Dataset
# ---------------
#
# ``ogbn-arxiv`` is already prepared as ``BuiltinDataset`` in GraphBolt.
#
import os

os.environ["DGLBACKEND"] = "pytorch"
import dgl
import dgl.graphbolt as gb
import numpy as np
import torch

dataset = gb.BuiltinDataset("ogbn-arxiv").load()
device = "cpu"  # change to 'cuda' for GPU
######################################################################
# The dataset consists of a graph, features and tasks. You can get the
# training-validation-test sets from the tasks. Seed nodes and their
# corresponding labels are already stored in each of these sets, and
# other metadata such as the number of classes is also stored in the
# tasks. In this dataset, there is only one task: ``node classification``.
#
graph = dataset.graph
feature = dataset.feature
train_set = dataset.tasks[0].train_set
valid_set = dataset.tasks[0].validation_set
test_set = dataset.tasks[0].test_set
task_name = dataset.tasks[0].metadata["name"]
num_classes = dataset.tasks[0].metadata["num_classes"]
print(f"Task: {task_name}. Number of classes: {num_classes}")
######################################################################

@@ -88,93 +76,58 @@ test_nids = idx_split["test"]
# DGL provides tools to iterate over the dataset in minibatches
# while generating the computation dependencies to compute their outputs
# with the MFGs above. For node classification, you can use
# ``dgl.graphbolt.MultiProcessDataLoader`` for iterating over the dataset.
# It accepts a data pipe that generates minibatches of nodes and their
# labels, samples neighbors for each node, and generates the computation
# dependencies in the form of MFGs. Feature fetching, block creation and
# copying to the target device are also supported. All these operations are
# split into separate stages in the data pipe, so that you can customize
# the data pipeline by inserting your own operations (a sketch of such a
# custom stage follows the data loader code below).
#
# .. note::
#
#    To write your own neighbor sampler, please refer to :ref:`this user
#    guide section <guide-minibatch-customizing-neighborhood-sampler>`.
#
# Let’s say that each node will gather messages from 4 neighbors on each
# layer. The code defining the data loader and neighbor sampler will look
# like the following.
#
# Sample seed nodes from the training set in minibatches of 1024.
datapipe = gb.ItemSampler(train_set, batch_size=1024, shuffle=True)
# Sample 4 neighbors for each node on each of the two layers.
datapipe = datapipe.sample_neighbor(graph, [4, 4])
# Fetch the input features of the sampled subgraphs.
datapipe = datapipe.fetch_feature(feature, node_feature_keys=["feat"])
# Convert the sampled subgraphs into DGL MFGs (blocks).
datapipe = datapipe.to_dgl()
# Copy the minibatch to the target device.
datapipe = datapipe.copy_to(device)
train_dataloader = gb.MultiProcessDataLoader(datapipe, num_workers=0)
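######################################################################
# The stages above are ordinary data pipe stages, so an extra one can be
# spliced in between them. The sketch below is an assumption rather than a
# documented GraphBolt API: it assumes the data pipe behaves like a
# standard torchdata ``IterDataPipe``, so the generic ``.map()`` hook can
# wrap a user-defined function around every minibatch (here a hypothetical
# pass-through that only logs the minibatch type). If used, it would be
# inserted before constructing the ``MultiProcessDataLoader``.
#


def inspect_minibatch(minibatch):
    # Hypothetical pass-through stage: log the minibatch type and return it
    # unchanged so downstream stages are unaffected.
    print(type(minibatch))
    return minibatch


# Relies on the ``IterDataPipe.map`` assumption above; not required for
# the rest of this tutorial.
# datapipe = datapipe.map(inspect_minibatch)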
######################################################################
# .. note::
#
#    In this example, neighborhood sampling runs on the CPU. If you are
#    interested in running it on the GPU, please refer to
#    :ref:`guide-minibatch-gpu-sampling`.
#
######################################################################
# You can iterate over the data loader; each iteration yields a
# ``DGLMiniBatch`` object.
#
data = next(iter(train_dataloader))
print(data)
######################################################################
# You can get the input node IDs from the MFGs.
#
mfgs = data.blocks
input_nodes = mfgs[0].srcdata[dgl.NID]
print(f"Input nodes: {input_nodes}.")
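######################################################################
# As the :doc:`overview <L0_neighbor_sampling_overview>` explains, the
# first few source nodes of an MFG are the destination nodes themselves,
# because destination nodes' own features from the previous layer may be
# needed to compute their new features. The check below is carried over
# from the previous version of this tutorial and assumes the blocks
# produced by GraphBolt keep this layout.
#
mfg_0_src = mfgs[0].srcdata[dgl.NID]
mfg_0_dst = mfgs[0].dstdata[dgl.NID]
print(torch.equal(mfg_0_src[: mfgs[0].num_dst_nodes()], mfg_0_dst))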
######################################################################
# Defining Model

@@ -207,7 +160,8 @@ class Model(nn.Module):
        return h

in_size = feature.size("node", None, "feat")[0]
model = Model(in_size, 64, num_classes).to(device)
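######################################################################
# The body of the ``Model`` class is elided by the diff hunk above. For
# reference, here is a minimal sketch of a compatible two-layer model,
# assuming ``dgl.nn.SAGEConv`` layers that each consume one MFG; the
# actual class in the tutorial may differ.
#
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn import SAGEConv


class Model(nn.Module):
    def __init__(self, in_feats, h_feats, num_classes):
        super().__init__()
        # One GraphSAGE layer per MFG produced by the [4, 4] sampler.
        self.conv1 = SAGEConv(in_feats, h_feats, aggregator_type="mean")
        self.conv2 = SAGEConv(h_feats, num_classes, aggregator_type="mean")

    def forward(self, mfgs, x):
        # Destination nodes come first among the source nodes of each MFG.
        h_dst = x[: mfgs[0].num_dst_nodes()]
        h = F.relu(self.conv1(mfgs[0], (x, h_dst)))
        h_dst = h[: mfgs[1].num_dst_nodes()]
        h = self.conv2(mfgs[1], (h, h_dst))
        return h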
######################################################################

@@ -273,16 +227,12 @@ opt = torch.optim.Adam(model.parameters())
# loader.
#
datapipe = gb.ItemSampler(valid_set, batch_size=1024, shuffle=False)
datapipe = datapipe.sample_neighbor(graph, [4, 4])
datapipe = datapipe.fetch_feature(feature, node_feature_keys=["feat"])
datapipe = datapipe.to_dgl()
datapipe = datapipe.copy_to(device)
valid_dataloader = gb.MultiProcessDataLoader(datapipe, num_workers=0)
import sklearn.metrics

@@ -300,12 +250,11 @@ for epoch in range(10):
    model.train()

    with tqdm.tqdm(train_dataloader) as tq:
        for step, data in enumerate(tq):
            x = data.node_features["feat"]
            labels = data.labels

            predictions = model(data.blocks, x)
            loss = F.cross_entropy(predictions, labels)

            opt.zero_grad()
@@ -327,10 +276,10 @@ for epoch in range(10):
    predictions = []
    labels = []
    with tqdm.tqdm(valid_dataloader) as tq, torch.no_grad():
        for data in tq:
            x = data.node_features["feat"]
            labels.append(data.labels.cpu().numpy())
            predictions.append(model(data.blocks, x).argmax(1).cpu().numpy())
        predictions = np.concatenate(predictions)
        labels = np.concatenate(labels)
        accuracy = sklearn.metrics.accuracy_score(labels, predictions)
@@ -361,7 +310,3 @@ for epoch in range(10):
# please refer to the :ref:`user guide on exact offline
# inference <guide-minibatch-inference>`.
#