[CI] upgrade pytorch version for benchmark CI (#5102)

* [CI] upgrade pytorch version for benchmark CI * update build arguments * update * update * upgrade torch to 1.13 * update docker image * update cmake args * try with cu116_torch112 * update build * update * update * update * update docker image * update * update * update * update * final update * fix continue running * update * update * update

[CI] upgrade pytorch version for benchmark CI (#5102)
* [CI] upgrade pytorch version for benchmark CI * update build arguments * update * update * upgrade torch to 1.13 * update docker image * update cmake args * try with cu116_torch112 * update build * update * update * update * update docker image * update * update * update * update * final update * fix continue running * update * update * update
46a3fc2b · Rhett Ying · GitHub · dde5cf5d · 46a3fc2b · 46a3fc2b
Unverified Commit 46a3fc2b authored Jan 06, 2023 by Rhett Ying Committed by GitHub Jan 06, 2023
9 changed files
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -134,6 +134,8 @@ def is_admin(name) {
  return (name in admins)
 }

+def regression_test_done = false
+
 pipeline {
  agent any
  triggers {
@@ -196,7 +198,6 @@ pipeline {
      }
      when { triggeredBy 'IssueCommentCause' }
      steps {
-        // container('dgl-ci-lint') {
          checkout scm
          script {
              def comment = env.GITHUB_COMMENT
@@ -229,12 +230,12 @@ pipeline {
              }
              pullRequest.comment("Finished the Regression test. Result table is at https://dgl-asv-data.s3-us-west-2.amazonaws.com/${env.GIT_COMMIT}_${instance_type}/results/result.csv. Jenkins job link is ${RUN_DISPLAY_URL}. ")
              currentBuild.result = 'SUCCESS'
-              return
+              regression_test_done = true
          }
-        // }
      }
    }
    stage('CI') {
+      when { expression { !regression_test_done } }
      stages {
        stage('Lint Check') {
          agent {

--- a/benchmarks/run.sh
+++ b/benchmarks/run.sh
@@ -9,7 +9,9 @@ ROOT=/asv/dgl

 conda activate base
 pip install --upgrade pip
-pip install asv
+# Newer asv version like 0.5.1 has different result format,
+# so we fix the version here. Or `generate_excel.py` has to be changed.
+pip install asv==0.4.2
 pip uninstall -y dgl

 export DGL_BENCH_DEVICE=$DEVICE

--- a/benchmarks/scripts/build_dgl_asv.sh
+++ b/benchmarks/scripts/build_dgl_asv.sh
@@ -2,19 +2,15 @@

 set -e

-# . /opt/conda/etc/profile.d/conda.sh
-# conda activate pytorch-ci
 # Default building only with cpu
 DEVICE=${DGL_BENCH_DEVICE:-cpu}

 pip install -r /asv/torch_gpu_pip.txt
-pip install pandas rdflib ogb

 # build
-if [[ $DEVICE == "cpu" ]]; then
-    CMAKE_VARS=""
-else
-    CMAKE_VARS="-DUSE_CUDA=ON"
+CMAKE_VARS="-DUSE_OPENMP=ON -DBUILD_TORCH=ON -DBUILD_SPARSE=ON -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda"
+if [[ $DEVICE == "gpu" ]]; then
+    CMAKE_VARS="-DUSE_CUDA=ON $CMAKE_VARS"
 fi
 arch=`uname -m`
 if [[ $arch == *"x86"* ]]; then
@@ -22,8 +18,6 @@ if [[ $arch == *"x86"* ]]; then
 fi
 mkdir -p build
 pushd build
-cmake -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda -DBUILD_TORCH=ON $CMAKE_VARS ..
-make -j
+cmake $CMAKE_VARS ..
+make -j8
 popd
-
-# conda deactivate
--- a/benchmarks/scripts/generate_excel.py
+++ b/benchmarks/scripts/generate_excel.py
@@ -23,13 +23,9 @@ def get_branch_name_from_hash(hash):
 def main():
    results_path = Path("../results")
    results_path.is_dir()
-    benchmark_json_path = results_path / "benchmarks.json"
-    with benchmark_json_path.open() as f:
-        benchmark_json = json.load(f)
    machines = [f for f in results_path.glob("*") if f.is_dir()]
    output_results_dict = {}
    for machine in machines:
-        # commit_results_dict = {}
        per_machine_result = {}
        commit_results_json_paths = [
            f for f in machine.glob("*") if f.name != "machine.json"

--- a/benchmarks/scripts/install_dgl_asv.sh
+++ b/benchmarks/scripts/install_dgl_asv.sh
@@ -2,8 +2,6 @@

 set -e

-# . /opt/conda/etc/profile.d/conda.sh
-
 # install
 pushd python
 rm -rf build *.egg-info dist

--- a/benchmarks/scripts/publish.sh
+++ b/benchmarks/scripts/publish.sh
@@ -26,20 +26,20 @@ else
 fi

 WS_ROOT=/asv/dgl
-docker pull public.ecr.aws/s1o7b3d9/benchmakrk_pyg_dgl:cu111_torch181_pyg170
-if [ -z "$DGL_REG_CONF"]; then
+docker pull public.ecr.aws/s1o7b3d9/benchmark_test:cu116
+if [ -z "$DGL_REG_CONF" ]; then
    DOCKER_ENV_OPT="$DOCKER_ENV_OPT"
 else
    DOCKER_ENV_OPT=" -e DGL_REG_CONF=$DGL_REG_CONF $DOCKER_ENV_OPT"
 fi

-if [ -z "$INSTANCE_TYPE"]; then
+if [ -z "$INSTANCE_TYPE" ]; then
    DOCKER_ENV_OPT="$DOCKER_ENV_OPT"
 else
    DOCKER_ENV_OPT=" -e INSTANCE_TYPE=$INSTANCE_TYPE $DOCKER_ENV_OPT"
 fi

-if [ -z "$MOUNT_PATH"]; then
+if [ -z "$MOUNT_PATH" ]; then
    DOCKER_MOUNT_OPT=""
 else
    DOCKER_MOUNT_OPT="-v ${MOUNT_PATH}:/tmp/dataset -v ${MOUNT_PATH}/dgl_home/:/root/.dgl/"
@@ -56,16 +56,18 @@ if [[ $DEVICE == "cpu" ]]; then
        $DOCKER_MOUNT_OPT \
        $DOCKER_ENV_OPT \
        --shm-size="16g" \
-        --hostname=$MACHINE -dit public.ecr.aws/s1o7b3d9/benchmakrk_pyg_dgl:cu111_torch181_pyg170 /bin/bash
+        --hostname=$MACHINE -dit public.ecr.aws/s1o7b3d9/benchmark_test:cu116 /bin/bash
 else
    docker run --name dgl-reg \
        --rm --gpus all \
        $DOCKER_MOUNT_OPT \
        $DOCKER_ENV_OPT \
        --shm-size="16g" \
-        --hostname=$MACHINE -dit public.ecr.aws/s1o7b3d9/benchmakrk_pyg_dgl:cu111_torch181_pyg170 /bin/bash
+        --hostname=$MACHINE -dit public.ecr.aws/s1o7b3d9/benchmark_test:cu116 /bin/bash
 fi

+pwd
+
 docker exec dgl-reg mkdir -p $WS_ROOT
 docker cp ../../.git dgl-reg:$WS_ROOT
 docker cp ../ dgl-reg:$WS_ROOT/benchmarks/

--- a/benchmarks/scripts/torch_gpu_pip.txt
+++ b/benchmarks/scripts/torch_gpu_pip.txt
--find-links https://download.pytorch.org/whl/torch
-torch==1.9.0+cu111
-torchvision
+--find-links https://download.pytorch.org/whl/torch_stable.html
+torch==1.13.1+cu116
+torchvision==0.14.1+cu116
+torchmetrics
 pytest
 nose
 numpy
 cython
 scipy
-networkx==2.5.1
+networkx
 matplotlib
 nltk
 requests[security]
@@ -15,5 +16,4 @@ awscli
 torchtext
 pandas
 rdflib
-ogb==1.3.1
-torchmetrics
\ No newline at end of file
+ogb
--- a/docker/Dockerfile.ci_gpu_cu11
+++ b/docker/Dockerfile.ci_gpu_cu11
 # CI docker GPU env
-FROM nvidia/cuda:11.2.1-cudnn8-devel-ubuntu16.04
+FROM nvidia/cuda:11.6.0-cudnn8-devel-ubuntu20.04
+
+ENV TZ=US
+RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

 RUN apt-get update --fix-missing

@@ -15,16 +18,6 @@ RUN bash /install/ubuntu_install_conda.sh

 ENV CONDA_ALWAYS_YES="true"

-COPY install/conda_env/torch_gpu.yml /install/conda_env/torch_gpu.yml
-COPY install/conda_env/torch_gpu_pip_latest.txt /install/conda_env/torch_gpu_pip.txt
-RUN ["/bin/bash", "-i", "-c", "conda env create -f /install/conda_env/torch_gpu.yml"]
-
-# COPY install/conda_env/tensorflow_gpu.yml /install/conda_env/tensorflow_gpu.yml
-# RUN ["/bin/bash", "-i", "-c", "conda env create -f /install/conda_env/tensorflow_gpu.yml"]
-
-# COPY install/conda_env/mxnet_gpu.yml /install/conda_env/mxnet_gpu.yml
-# RUN ["/bin/bash", "-i", "-c", "conda env create -f /install/conda_env/mxnet_gpu.yml"]
-
 ENV CONDA_ALWAYS_YES=

 # Environment variables

--- a/docker/install/ubuntu_install_conda.sh
+++ b/docker/install/ubuntu_install_conda.sh
@@ -7,7 +7,7 @@ apt-get update --fix-missing && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

-wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-4.5.11-Linux-x86_64.sh -O ~/miniconda.sh && \
+wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \
    /bin/bash ~/miniconda.sh -b -p /opt/conda && \
    rm ~/miniconda.sh && \
    /opt/conda/bin/conda clean -tipsy && \