[CI] fix bugs for multigpu benchmarks (#5140)

46455328 · Rhett Ying · GitHub · d8370299 · 46455328 · 46455328
Unverified commit 46455328, authored Jan 11, 2023 by Rhett Ying; committed by GitHub on Jan 11, 2023
5 changed files
--- a/benchmarks/benchmarks/multigpu/bench_multigpu_rgcn.py
+++ b/benchmarks/benchmarks/multigpu/bench_multigpu_rgcn.py
@@ -48,7 +48,6 @@ class EntityClassify(nn.Module):
        num_hidden_layers=1,
        dropout=0,
        use_self_loop=False,
-        low_mem=True,
        layer_norm=False,
    ):
        super(EntityClassify, self).__init__()
@@ -61,7 +60,6 @@ class EntityClassify(nn.Module):
        self.num_hidden_layers = num_hidden_layers
        self.dropout = dropout
        self.use_self_loop = use_self_loop
-        self.low_mem = low_mem
        self.layer_norm = layer_norm
        self.layers = nn.ModuleList()
@@ -75,7 +73,6 @@ class EntityClassify(nn.Module):
                self.num_bases,
                activation=F.relu,
                self_loop=self.use_self_loop,
-                low_mem=self.low_mem,
                dropout=self.dropout,
                layer_norm=layer_norm,
            )
@@ -91,7 +88,6 @@ class EntityClassify(nn.Module):
                    self.num_bases,
                    activation=F.relu,
                    self_loop=self.use_self_loop,
-                    low_mem=self.low_mem,
                    dropout=self.dropout,
                    layer_norm=layer_norm,
                )
@@ -106,7 +102,6 @@ class EntityClassify(nn.Module):
                self.num_bases,
                activation=None,
                self_loop=self.use_self_loop,
-                low_mem=self.low_mem,
                layer_norm=layer_norm,
            )
        )
@@ -236,7 +231,6 @@ def run(proc_id, n_gpus, n_cpus, args, devices, dataset, split, queue=None):
        num_hidden_layers=args.n_layers - 2,
        dropout=args.dropout,
        use_self_loop=args.use_self_loop,
-        low_mem=args.low_mem,
        layer_norm=args.layer_norm,
    )
@@ -373,14 +367,12 @@ def run(proc_id, n_gpus, n_cpus, args, devices, dataset, split, queue=None):
 @utils.skip_if_not_4gpu()
 @utils.benchmark("time", timeout=600)
 @utils.parametrize("data", ["am", "ogbn-mag"])
-@utils.parametrize("low_mem", [True, False])
 @utils.parametrize("dgl_sparse", [True, False])
-def track_time(data, low_mem, dgl_sparse):
+def track_time(data, dgl_sparse):
    # load graph data
    dataset = utils.process_data(data)
    args = config()
    devices = [0, 1, 2, 3]
-    args.low_mem = low_mem
    args.dgl_sparse = dgl_sparse
    args.dataset = dataset
    ogb_dataset = False
@@ -572,49 +564,8 @@ def config():
        node_feats=False,
        num_workers=0,
        dgl_sparse=False,
-        low_mem=False,
    )
-    # parser.add_argument("--dropout", type=float, default=0,
-    #         help="dropout probability")
-    # parser.add_argument("--n-hidden", type=int, default=16,
-    #         help="number of hidden units")
-    # parser.add_argument("--gpu", type=str, default='0',
-    #         help="gpu")
-    # parser.add_argument("--lr", type=float, default=1e-2,
-    #         help="learning rate")
-    # parser.add_argument("--sparse-lr", type=float, default=2e-2,
-    #         help="sparse embedding learning rate")
-    # parser.add_argument("--n-bases", type=int, default=-1,
-    #         help="number of filter weight matrices, default: -1 [use all]")
-    # parser.add_argument("--n-layers", type=int, default=2,
-    #         help="number of propagation rounds")
-    # parser.add_argument("-e", "--n-epochs", type=int, default=50,
-    #         help="number of training epochs")
-    # parser.add_argument("-d", "--dataset", type=str, required=True,
-    #         help="dataset to use")
-    # parser.add_argument("--l2norm", type=float, default=0,
-    #         help="l2 norm coef")
-    # parser.add_argument("--fanout", type=str, default="4, 4",
-    #         help="Fan-out of neighbor sampling.")
-    # parser.add_argument("--use-self-loop", default=False, action='store_true',
-    #         help="include self feature as a special relation")
-    # fp = parser.add_mutually_exclusive_group(required=False)
-    # parser.add_argument("--batch-size", type=int, default=100,
-    #         help="Mini-batch size. ")
-    # parser.add_argument("--eval-batch-size", type=int, default=32,
-    #         help="Mini-batch size. ")
-    # parser.add_argument("--num-workers", type=int, default=0,
-    #         help="Number of workers for dataloader.")
-    # parser.add_argument("--low-mem", default=False, action='store_true',
-    #         help="Whether use low mem RelGraphCov")
-    # parser.add_argument("--dgl-sparse", default=False, action='store_true',
-    #         help='Use sparse embedding for node embeddings.')
-    # parser.add_argument('--node-feats', default=False, action='store_true',
-    #         help='Whether use node features')
-    # parser.add_argument('--layer-norm', default=False, action='store_true',
-    #         help='Use layer norm')
-    # parser.set_defaults(validation=True)
-    # args = parser.parse_args()
    return args

--- a/benchmarks/benchmarks/utils.py
+++ b/benchmarks/benchmarks/utils.py
@@ -534,7 +534,7 @@ def skip_if_not_4gpu():
    """skip if DGL_BENCH_DEVICE is gpu"""
    def _wrapper(func):
-        if GPU_COUNT != 4:
+        if GPU_COUNT < 4:
            # skip if not enabled
            print("Skip {}".format(func.__name__))
            func.benchmark_name = "skip_" + func.__name__

--- a/benchmarks/scripts/build_dgl_asv.sh
+++ b/benchmarks/scripts/build_dgl_asv.sh
@@ -10,7 +10,7 @@ pip install -r /asv/torch_gpu_pip.txt
 # build
 CMAKE_VARS="-DUSE_OPENMP=ON -DBUILD_TORCH=ON -DBUILD_SPARSE=ON -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda"
 if [[ $DEVICE == "gpu" ]]; then
-    CMAKE_VARS="-DUSE_CUDA=ON $CMAKE_VARS"
+    CMAKE_VARS="-DUSE_CUDA=ON -DUSE_NCCL=ON $CMAKE_VARS"
 fi
 arch=`uname -m`
 if [[ $arch == *"x86"* ]]; then

--- a/benchmarks/scripts/publish.sh
+++ b/benchmarks/scripts/publish.sh
@@ -26,7 +26,7 @@ else
 fi
 WS_ROOT=/asv/dgl
-docker pull public.ecr.aws/s1o7b3d9/benchmark_test:cu116
+docker pull public.ecr.aws/s1o7b3d9/benchmark_test:cu116_v230110
 if [ -z "$DGL_REG_CONF" ]; then
    DOCKER_ENV_OPT="$DOCKER_ENV_OPT"
 else
@@ -56,14 +56,14 @@ if [[ $DEVICE == "cpu" ]]; then
        $DOCKER_MOUNT_OPT \
        $DOCKER_ENV_OPT \
        --shm-size="16g" \
-        --hostname=$MACHINE -dit public.ecr.aws/s1o7b3d9/benchmark_test:cu116 /bin/bash
+        --hostname=$MACHINE -dit public.ecr.aws/s1o7b3d9/benchmark_test:cu116_v230110 /bin/bash
 else
    docker run --name dgl-reg \
        --rm --gpus all \
        $DOCKER_MOUNT_OPT \
        $DOCKER_ENV_OPT \
        --shm-size="16g" \
-        --hostname=$MACHINE -dit public.ecr.aws/s1o7b3d9/benchmark_test:cu116 /bin/bash
+        --hostname=$MACHINE -dit public.ecr.aws/s1o7b3d9/benchmark_test:cu116_v230110 /bin/bash
 fi
 pwd

--- a/docker/Dockerfile.ci_benchmark
+++ b/docker/Dockerfile.ci_benchmark
@@ -27,5 +27,4 @@ ENV CPLUS_INCLUDE_PATH=/usr/local/cuda/include:${CPLUS_INCLUDE_PATH}
 ENV C_INCLUDE_PATH=/usr/local/cuda/include:${C_INCLUDE_PATH}
 ENV LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/nvidia/lib64:${LIBRARY_PATH}
 ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}
-ENV CUDA_VISIBLE_DEVICES=0
 ENV TF_FORCE_GPU_ALLOW_GROWTH=true
\ No newline at end of file