"git@developer.sourcefind.cn:OpenDAS/dlib.git" did not exist on "979a22c51d9656b47c58b3dd171968f6a6d3151b"
Commit 362f72cb authored by Minjie Wang, committed by GitHub

[Test] Performance benchmarks for DGL kernels (#2582)

* add initial kernel benchmarks

* finished kernel benchmarks

* add desc
parent 12f64296
@@ -23,10 +23,10 @@
to disk. It does not support specifying branches and commits either. They are only
available under ASV's managed environment.**

To change the device for benchmarking, set the `DGL_BENCH_DEVICE` environment variable.
-Any valid PyTorch device strings are allowed.
+Allowed values are `"cpu"` or `"gpu"`.

```bash
-export DGL_BENCH_DEVICE=cuda:0
+export DGL_BENCH_DEVICE=gpu
```

To select which benchmark to run, use the `--bench` flag. For example,
@@ -49,7 +49,7 @@
DGL runs all benchmarks automatically in a Docker container. To run benchmarks in d…
* Use the `publish.sh` script. It accepts two arguments: a name specifying the identity of
  the test machine and a device name. For example,
```bash
-bash publish.sh dev-machine cuda:0
+bash publish.sh dev-machine gpu
```
The script will output two folders, `results` and `html`. The `html` folder contains the
…
import time

import dgl
import torch

from .. import utils


def calc_gflops(graph, feat_size, num_heads, time):
    return round(2 * graph.num_edges() * feat_size / 1000000000 / time, 2)  # count both mul and add
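
# Worked example (hypothetical numbers, not from the benchmark): a graph with
# 1e6 edges and feat_size = 256 performs 2 * 1e6 * 256 = 5.12e8 FLOPs per call;
# if one call takes 0.01 s, this reports 5.12e8 / 1e9 / 0.01 = 51.2 GFLOPS.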
# The benchmarks include broadcasting cases.
# Given feat_size = D and num_heads = H, the node feature shape will be (H, D // H)
# while the per-edge feature shape will be (H, 1), so the tested operations
# broadcast along the last dimension. The total FLOP count is controlled by
# feat_size no matter how many heads there are.
# If num_heads = 0, the benchmark falls back to the normal element-wise operation
# without broadcasting.
@utils.benchmark('flops', timeout=600)
@utils.parametrize('graph', ['ogbn-arxiv', 'reddit', 'ogbn-proteins'])
@utils.parametrize('feat_size', [4, 32, 256])
@utils.parametrize('num_heads', [0, 1, 4])
def track_flops(graph, feat_size, num_heads):
    device = utils.get_bench_device()
    graph = utils.get_graph(graph, format='coo').to(device)
    if num_heads == 0:
        x = torch.randn(graph.num_nodes(), feat_size, device=device)
    else:
        x = torch.randn(graph.num_nodes(), num_heads, feat_size // num_heads, device=device)

    # dry run
    for i in range(3):
        y = dgl.ops.u_dot_v(graph, x, x)

    # timing
    accum = 0.
    for i in range(10):
        with utils.TorchOpTimer(device) as timer:
            y = dgl.ops.u_dot_v(graph, x, x)
        accum += timer.time

    return calc_gflops(graph, feat_size, num_heads, accum / 10)
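
# For reference (illustrative sketch, not part of the benchmark): for every
# edge (u, v), dgl.ops.u_dot_v takes the dot product of the source and
# destination features along the last dimension, so the result matches:
#
#     u, v = graph.edges()
#     y_ref = (x[u] * x[v]).sum(-1, keepdim=True)  # same shape as y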
import time

import dgl
import torch

from .. import utils


def calc_gflops(graph, feat_size, time):
    # copy-reduce performs one accumulation per edge element (no multiply),
    # hence no factor of 2 here
    return round(graph.num_edges() * feat_size / 1000000000 / time, 2)
@utils.benchmark('flops', timeout=600)
@utils.parametrize('graph', ['ogbn-arxiv', 'reddit', 'ogbn-proteins'])
@utils.parametrize('feat_size', [4, 32, 256])
@utils.parametrize('reducer', ['sum', 'max'])
def track_flops(graph, feat_size, reducer):
    device = utils.get_bench_device()
    graph = utils.get_graph(graph, format='csc').to(device)
    x = torch.randn(graph.num_nodes(), feat_size, device=device)
    if reducer == 'sum':
        op = dgl.ops.copy_u_sum
    elif reducer == 'max':
        op = dgl.ops.copy_u_max
    else:
        raise ValueError('Invalid reducer', reducer)

    # dry run
    for i in range(3):
        y = op(graph, x)

    # timing
    accum = 0.
    for i in range(10):
        with utils.TorchOpTimer(device) as timer:
            y = op(graph, x)
        accum += timer.time

    return calc_gflops(graph, feat_size, accum / 10)
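
# For reference (illustrative sketch, not part of the benchmark): copy_u_sum
# aggregates the source feature of every incoming edge into its destination
# node, i.e. an SpMM. A dense equivalent:
#
#     u, v = graph.edges()
#     y_ref = torch.zeros(graph.num_nodes(), feat_size, device=device)
#     y_ref.index_add_(0, v, x[u])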
import time

import dgl
import torch

from .. import utils


def calc_gflops(graph, feat_size, num_heads, time):
    return round(2 * graph.num_edges() * feat_size / 1000000000 / time, 2)  # count both mul and add
# The benchmarks include broadcasting cases.
# Given feat_size = D and num_heads = H, the node feature shape will be (H, D // H)
# while the per-edge feature shape will be (H, 1), so the tested operations
# broadcast along the last dimension. The total FLOP count is controlled by
# feat_size no matter how many heads there are.
# If num_heads = 0, the benchmark falls back to the normal element-wise operation
# without broadcasting.
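
# Shape sketch of the broadcast described above (assuming D = 32, H = 4):
#     x[u] per edge: (4, 8)   node feature split into heads
#     w per edge:    (4, 1)   one scalar per head
# PyTorch broadcasts the trailing 1 across the last dimension, e.g.:
#     torch.randn(4, 8) * torch.randn(4, 1)  # -> shape (4, 8)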
@utils.benchmark('flops', timeout=600)
@utils.parametrize('graph', ['ogbn-arxiv', 'reddit', 'ogbn-proteins'])
@utils.parametrize('feat_size', [4, 32, 256])
@utils.parametrize('num_heads', [0, 1, 4])
def track_flops(graph, feat_size, num_heads):
    device = utils.get_bench_device()
    graph = utils.get_graph(graph, format='csc').to(device)
    if num_heads == 0:
        x = torch.randn(graph.num_nodes(), feat_size, device=device)
        w = torch.randn(graph.num_edges(), feat_size, device=device)
    else:
        x = torch.randn(graph.num_nodes(), num_heads, feat_size // num_heads, device=device)
        w = torch.randn(graph.num_edges(), num_heads, 1, device=device)

    # dry run
    for i in range(3):
        y = dgl.ops.u_mul_e_sum(graph, x, w)

    # timing
    accum = 0.
    for i in range(10):
        with utils.TorchOpTimer(device) as timer:
            y = dgl.ops.u_mul_e_sum(graph, x, w)
        accum += timer.time

    return calc_gflops(graph, feat_size, num_heads, accum / 10)
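
# For reference (illustrative sketch, not part of the benchmark): u_mul_e_sum
# multiplies each source feature by its edge feature and sums the products
# into the destination node. A dense equivalent:
#
#     u, v = graph.edges()
#     y_ref = torch.zeros_like(x)
#     y_ref.index_add_(0, v, x[u] * w)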
@@ -9,7 +9,8 @@ import numpy as np
import pandas
import dgl
import torch
+import time
+from ogb.nodeproppred import DglNodePropPredDataset

def _download(url, path, filename):
    fn = os.path.join(path, filename)
@@ -54,11 +55,17 @@ def get_graph(name, format):
        else:
            g = dgl.data.RedditDataset(self_loop=True)[0].formats([format])
            dgl.save_graphs(bin_path, [g])
+    elif name.startswith("ogb"):
+        g = get_ogb_graph(name)
    else:
        raise Exception("Unknown dataset")
    g = g.formats([format])
    return g

+def get_ogb_graph(name):
+    os.symlink('/tmp/dataset/', os.path.join(os.getcwd(), 'dataset'))
+    data = DglNodePropPredDataset(name=name)
+    return data[0][0]
+
def get_livejournal():
    # Same as https://snap.stanford.edu/data/soc-LiveJournal1.txt.gz
@@ -84,14 +91,6 @@ def get_friendster():
    return dgl.graph((src, dst))

-# def get_graph(name):
-#     if name == 'livejournal':
-#         return get_livejournal()
-#     else:
-#         print(name + " doesn't exist")
-#         return None

class OGBDataset(object):
    def __init__(self, g, num_labels, predict_category=None):
        self._g = g
@@ -116,8 +115,6 @@ class OGBDataset(object):

def load_ogb_product():
    name = 'ogbn-products'
-    from ogb.nodeproppred import DglNodePropPredDataset

    os.symlink('/tmp/dataset/', os.path.join(os.getcwd(), 'dataset'))
    print('load', name)
@@ -149,8 +146,6 @@ def load_ogb_product():

def load_ogb_mag():
    name = 'ogbn-mag'
-    from ogb.nodeproppred import DglNodePropPredDataset

    os.symlink('/tmp/dataset/', os.path.join(os.getcwd(), 'dataset'))
    print('load', name)
@@ -296,15 +291,21 @@ def setup_track_acc(*args, **kwargs):
    np.random.seed(42)
    torch.random.manual_seed(42)

+def setup_track_flops(*args, **kwargs):
+    # fix random seed
+    np.random.seed(42)
+    torch.random.manual_seed(42)
+
TRACK_UNITS = {
    'time': 's',
    'acc': '%',
+    'flops': 'GFLOPS',
}

TRACK_SETUP = {
    'time': setup_track_time,
    'acc': setup_track_acc,
+    'flops': setup_track_flops,
}
@@ -421,7 +422,7 @@ elif device == "gpu":
    parametrize_cpu = noop_decorator
    parametrize_gpu = parametrize
else:
-    raise Exception("Unknown device")
+    raise Exception("Unknown device. Must be one of ['cpu', 'gpu'], but got {}".format(device))

def skip_if_gpu():
@@ -447,6 +448,7 @@ def benchmark(track_type, timeout=60):
        - 'time' : For timing. Unit: second.
        - 'acc' : For accuracy. Unit: percentage, value between 0 and 100.
+        - 'flops' : For throughput. Unit: GFLOPS, billions of floating point operations per second.
    timeout : int
        Timeout threshold in seconds.
@@ -458,7 +460,7 @@ def benchmark(track_type, timeout=60):
        def foo():
            pass
    """
-    assert track_type in ['time', 'acc']
+    assert track_type in ['time', 'acc', 'flops']
    def _wrapper(func):
        func.unit = TRACK_UNITS[track_type]
@@ -469,3 +471,28 @@ def benchmark(track_type, timeout=60):
            func.benchmark_name = "skip_" + func.__name__
        return func
    return _wrapper
+
+#####################################
+# Timer
+#####################################
+
+class TorchOpTimer:
+    def __init__(self, device):
+        self.device = device
+
+    def __enter__(self):
+        if self.device == 'cuda:0':
+            self.start_event = torch.cuda.Event(enable_timing=True)
+            self.end_event = torch.cuda.Event(enable_timing=True)
+            self.start_event.record()
+        else:
+            self.tic = time.time()
+        return self
+
+    def __exit__(self, type, value, traceback):
+        if self.device == 'cuda:0':
+            self.end_event.record()
+            torch.cuda.synchronize()  # Wait for the events to be recorded!
+            self.time = self.start_event.elapsed_time(self.end_event) / 1e3  # milliseconds -> seconds
+        else:
+            self.time = time.time() - self.tic
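
For reference, a minimal usage sketch of the timer outside the harness (hypothetical tensors; the `'cuda:0'` branch needs a GPU, and any other device string falls back to wall-clock timing):

```python
import torch

a = torch.randn(1024, 1024, device='cuda:0')
with TorchOpTimer('cuda:0') as timer:
    b = a @ a  # the operation being timed
print(timer.time)  # elapsed seconds
```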