"git@developer.sourcefind.cn:OpenDAS/dlib.git" did not exist on "979a22c51d9656b47c58b3dd171968f6a6d3151b"
Commit 362f72cb authored by Minjie Wang, committed by GitHub

[Test] Performance benchmarks for DGL kernels (#2582)

* add initial kernel benchmarks

* finished kernel benchmarks

* add desc
parent 12f64296
@@ -23,10 +23,10 @@
to disk. It does not support specifying branches and commits either. They are only
available under ASV's managed environment.**

To change the device for benchmarking, set the `DGL_BENCH_DEVICE` environment variable.
-Any valid PyTorch device strings are allowed.
+Allowed values are `"cpu"` or `"gpu"`.

```bash
-export DGL_BENCH_DEVICE=cuda:0
+export DGL_BENCH_DEVICE=gpu
```

To select which benchmark to run, use the `--bench` flag. For example,
@@ -49,7 +49,7 @@
DGL runs all benchmarks automatically in a Docker container. To run benchmarks in d…
* Use the `publish.sh` script. It accepts two arguments: a name specifying the identity of
  the test machine and a device name. For example,
```bash
-bash publish.sh dev-machine cuda:0
+bash publish.sh dev-machine gpu
```
The script will output two folders, `results` and `html`. The `html` folder contains the
…
import time

import dgl
import torch

from .. import utils


def calc_gflops(graph, feat_size, num_heads, time):
    return round(2 * graph.num_edges() * feat_size / 1000000000 / time, 2)  # count both mul and add
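
# Worked example (hypothetical numbers, not from the benchmark): a graph with
# 1e6 edges and feat_size = 256 performs 2 * 1e6 * 256 = 5.12e8 FLOPs per call;
# if one call takes 0.01 s, this reports 5.12e8 / 1e9 / 0.01 = 51.2 GFLOPS.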
# The benchmarks include broadcasting cases.
# Given feat_size = D and num_heads = H, the node feature shape will be (H, D // H)
# while the per-edge feature shape will be (H, 1), so the tested operations
# broadcast along the last dimension. The total FLOP count is controlled by
# feat_size no matter how many heads there are.
# If num_heads = 0, the benchmark falls back to the normal element-wise operation
# without broadcasting.
@utils.benchmark('flops', timeout=600)
@utils.parametrize('graph', ['ogbn-arxiv', 'reddit', 'ogbn-proteins'])
@utils.parametrize('feat_size', [4, 32, 256])
@utils.parametrize('num_heads', [0, 1, 4])
def track_flops(graph, feat_size, num_heads):
    device = utils.get_bench_device()
    graph = utils.get_graph(graph, format='coo').to(device)
    if num_heads == 0:
        x = torch.randn(graph.num_nodes(), feat_size, device=device)
    else:
        x = torch.randn(graph.num_nodes(), num_heads, feat_size // num_heads, device=device)

    # dry run
    for i in range(3):
        y = dgl.ops.u_dot_v(graph, x, x)

    # timing
    accum = 0.
    for i in range(10):
        with utils.TorchOpTimer(device) as timer:
            y = dgl.ops.u_dot_v(graph, x, x)
        accum += timer.time

    return calc_gflops(graph, feat_size, num_heads, accum / 10)
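
# For reference (illustrative sketch, not part of the benchmark): for every
# edge (u, v), dgl.ops.u_dot_v takes the dot product of the source and
# destination features along the last dimension, so the result matches:
#
#     u, v = graph.edges()
#     y_ref = (x[u] * x[v]).sum(-1, keepdim=True)  # same shape as y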
import time

import dgl
import torch

from .. import utils


def calc_gflops(graph, feat_size, time):
    # copy-reduce performs one accumulation per edge element (no multiply),
    # hence no factor of 2 here
    return round(graph.num_edges() * feat_size / 1000000000 / time, 2)
@utils.benchmark('flops', timeout=600)
@utils.parametrize('graph', ['ogbn-arxiv', 'reddit', 'ogbn-proteins'])
@utils.parametrize('feat_size', [4, 32, 256])
@utils.parametrize('reducer', ['sum', 'max'])
def track_flops(graph, feat_size, reducer):
    device = utils.get_bench_device()
    graph = utils.get_graph(graph, format='csc').to(device)
    x = torch.randn(graph.num_nodes(), feat_size, device=device)
    if reducer == 'sum':
        op = dgl.ops.copy_u_sum
    elif reducer == 'max':
        op = dgl.ops.copy_u_max
    else:
        raise ValueError('Invalid reducer', reducer)

    # dry run
    for i in range(3):
        y = op(graph, x)

    # timing
    accum = 0.
    for i in range(10):
        with utils.TorchOpTimer(device) as timer:
            y = op(graph, x)
        accum += timer.time

    return calc_gflops(graph, feat_size, accum / 10)
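
# For reference (illustrative sketch, not part of the benchmark): copy_u_sum
# aggregates the source feature of every incoming edge into its destination
# node, i.e. an SpMM. A dense equivalent:
#
#     u, v = graph.edges()
#     y_ref = torch.zeros(graph.num_nodes(), feat_size, device=device)
#     y_ref.index_add_(0, v, x[u])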
import time

import dgl
import torch

from .. import utils


def calc_gflops(graph, feat_size, num_heads, time):
    return round(2 * graph.num_edges() * feat_size / 1000000000 / time, 2)  # count both mul and add
# The benchmarks include broadcasting cases.
# Given feat_size = D and num_heads = H, the node feature shape will be (H, D // H)
# while the per-edge feature shape will be (H, 1), so the tested operations
# broadcast along the last dimension. The total FLOP count is controlled by
# feat_size no matter how many heads there are.
# If num_heads = 0, the benchmark falls back to the normal element-wise operation
# without broadcasting.
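
# Shape sketch of the broadcast described above (assuming D = 32, H = 4):
#     x[u] per edge: (4, 8)   node feature split into heads
#     w per edge:    (4, 1)   one scalar per head
# PyTorch broadcasts the trailing 1 across the last dimension, e.g.:
#     torch.randn(4, 8) * torch.randn(4, 1)  # -> shape (4, 8)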
@utils.benchmark('flops', timeout=600)
@utils.parametrize('graph', ['ogbn-arxiv', 'reddit', 'ogbn-proteins'])
@utils.parametrize('feat_size', [4, 32, 256])
@utils.parametrize('num_heads', [0, 1, 4])
def track_flops(graph, feat_size, num_heads):
    device = utils.get_bench_device()
    graph = utils.get_graph(graph, format='csc').to(device)
    if num_heads == 0:
        x = torch.randn(graph.num_nodes(), feat_size, device=device)
        w = torch.randn(graph.num_edges(), feat_size, device=device)
    else:
        x = torch.randn(graph.num_nodes(), num_heads, feat_size // num_heads, device=device)
        w = torch.randn(graph.num_edges(), num_heads, 1, device=device)

    # dry run
    for i in range(3):
        y = dgl.ops.u_mul_e_sum(graph, x, w)

    # timing
    accum = 0.
    for i in range(10):
        with utils.TorchOpTimer(device) as timer:
            y = dgl.ops.u_mul_e_sum(graph, x, w)
        accum += timer.time

    return calc_gflops(graph, feat_size, num_heads, accum / 10)
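
# For reference (illustrative sketch, not part of the benchmark): u_mul_e_sum
# multiplies each source feature by its edge feature and sums the products
# into the destination node. A dense equivalent:
#
#     u, v = graph.edges()
#     y_ref = torch.zeros_like(x)
#     y_ref.index_add_(0, v, x[u] * w)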
@@ -9,7 +9,8 @@ import numpy as np
import pandas
import dgl
import torch
+import time
+from ogb.nodeproppred import DglNodePropPredDataset

def _download(url, path, filename):
    fn = os.path.join(path, filename)
@@ -54,11 +55,17 @@ def get_graph(name, format):
        else:
            g = dgl.data.RedditDataset(self_loop=True)[0].formats([format])
            dgl.save_graphs(bin_path, [g])
+    elif name.startswith("ogb"):
+        g = get_ogb_graph(name)
    else:
        raise Exception("Unknown dataset")
    g = g.formats([format])
    return g

+def get_ogb_graph(name):
+    os.symlink('/tmp/dataset/', os.path.join(os.getcwd(), 'dataset'))
+    data = DglNodePropPredDataset(name=name)
+    return data[0][0]
+
def get_livejournal():
    # Same as https://snap.stanford.edu/data/soc-LiveJournal1.txt.gz
@@ -84,14 +91,6 @@ def get_friendster():
    return dgl.graph((src, dst))

-# def get_graph(name):
-#     if name == 'livejournal':
-#         return get_livejournal()
-#     else:
-#         print(name + " doesn't exist")
-#         return None

class OGBDataset(object):
    def __init__(self, g, num_labels, predict_category=None):
        self._g = g
@@ -116,8 +115,6 @@ class OGBDataset(object):

def load_ogb_product():
    name = 'ogbn-products'
-    from ogb.nodeproppred import DglNodePropPredDataset

    os.symlink('/tmp/dataset/', os.path.join(os.getcwd(), 'dataset'))
    print('load', name)
@@ -149,8 +146,6 @@ def load_ogb_product():

def load_ogb_mag():
    name = 'ogbn-mag'
-    from ogb.nodeproppred import DglNodePropPredDataset

    os.symlink('/tmp/dataset/', os.path.join(os.getcwd(), 'dataset'))
    print('load', name)
@@ -296,15 +291,21 @@ def setup_track_acc(*args, **kwargs):
    np.random.seed(42)
    torch.random.manual_seed(42)

+def setup_track_flops(*args, **kwargs):
+    # fix random seed
+    np.random.seed(42)
+    torch.random.manual_seed(42)
+
TRACK_UNITS = {
    'time': 's',
    'acc': '%',
+    'flops': 'GFLOPS',
}

TRACK_SETUP = {
    'time': setup_track_time,
    'acc': setup_track_acc,
+    'flops': setup_track_flops,
}
@@ -421,7 +422,7 @@ elif device == "gpu":
    parametrize_cpu = noop_decorator
    parametrize_gpu = parametrize
else:
-    raise Exception("Unknown device")
+    raise Exception("Unknown device. Must be one of ['cpu', 'gpu'], but got {}".format(device))

def skip_if_gpu():
@@ -447,6 +448,7 @@ def benchmark(track_type, timeout=60):
        - 'time' : For timing. Unit: second.
        - 'acc' : For accuracy. Unit: percentage, value between 0 and 100.
+        - 'flops' : For throughput. Unit: GFLOPS, billions of floating point operations per second.
    timeout : int
        Timeout threshold in seconds.
@@ -458,7 +460,7 @@ def benchmark(track_type, timeout=60):
        def foo():
            pass
    """
-    assert track_type in ['time', 'acc']
+    assert track_type in ['time', 'acc', 'flops']
    def _wrapper(func):
        func.unit = TRACK_UNITS[track_type]
@@ -469,3 +471,28 @@ def benchmark(track_type, timeout=60):
            func.benchmark_name = "skip_" + func.__name__
        return func
    return _wrapper
+
+#####################################
+# Timer
+#####################################
+
+class TorchOpTimer:
+    def __init__(self, device):
+        self.device = device
+
+    def __enter__(self):
+        if self.device == 'cuda:0':
+            self.start_event = torch.cuda.Event(enable_timing=True)
+            self.end_event = torch.cuda.Event(enable_timing=True)
+            self.start_event.record()
+        else:
+            self.tic = time.time()
+        return self
+
+    def __exit__(self, type, value, traceback):
+        if self.device == 'cuda:0':
+            self.end_event.record()
+            torch.cuda.synchronize()  # Wait for the events to be recorded!
+            self.time = self.start_event.elapsed_time(self.end_event) / 1e3  # milliseconds -> seconds
+        else:
+            self.time = time.time() - self.tic
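
For reference, a minimal usage sketch of the timer outside the harness (hypothetical tensors; the `'cuda:0'` branch needs a GPU, and any other device string falls back to wall-clock timing):

```python
import torch

a = torch.randn(1024, 1024, device='cuda:0')
with TorchOpTimer('cuda:0') as timer:
    b = a @ a  # the operation being timed
print(timer.time)  # elapsed seconds
```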