Commit 21d47d0e authored by yuguo's avatar yuguo
Browse files

Oneflow 0.8 for DCU

parents
# This file lists libraries that we will assume to be present on the host system and hence
# should NOT be bundled inside AppImages. This is a working document; expect it to change
# over time. File format: one filename per line. Each entry should have a justification comment.
# See the useful tool at https://abi-laboratory.pro/index.php?view=navigator&symbol=hb_buffer_set_cluster_level#result
# to investigate issues with missing symbols.
ld-linux.so.2
ld-linux-x86-64.so.2
libanl.so.1
libBrokenLocale.so.1
libcidn.so.1
# libcrypt.so.1 # Not part of glibc anymore as of Fedora 30. See https://github.com/slic3r/Slic3r/issues/4798 and https://pagure.io/fedora-docs/release-notes/c/01d74b33564faa42959c035e1eee286940e9170e?branch=f28
libc.so.6
libdl.so.2
libm.so.6
libmvec.so.1
# libnsl.so.1 # Not part of glibc anymore as of Fedora 28. See https://github.com/RPCS3/rpcs3/issues/5224#issuecomment-434930594
libnss_compat.so.2
# libnss_db.so.2 # Not part of neon-useredition-20190321-0530-amd64.iso
libnss_dns.so.2
libnss_files.so.2
libnss_hesiod.so.2
libnss_nisplus.so.2
libnss_nis.so.2
libpthread.so.0
libresolv.so.2
librt.so.1
libthread_db.so.1
libutil.so.1
# These files are all part of the GNU C Library which should never be bundled.
# List was generated from a fresh build of glibc 2.25.
libstdc++.so.6
# Workaround for:
# usr/lib/libstdc++.so.6: version `GLIBCXX_3.4.21' not found
libGL.so.1
# The above may be missing on Chrome OS, https://www.reddit.com/r/Crostini/comments/d1lp67/ultimaker_cura_no_longer_running_as_an_appimage/
libEGL.so.1
# Part of the video driver (OpenGL); present on any regular
# desktop system, may also be provided by proprietary drivers.
# Known to cause issues if it's bundled.
libGLdispatch.so.0
libGLX.so.0
# reported to be superfluous and conflicting with system libraries (graphics driver)
# see https://github.com/linuxdeploy/linuxdeploy/issues/89
libOpenGL.so.0
# Qt installed via install-qt.sh apparently links to this library
# part of OpenGL like libGL/libEGL, so excluding it should not cause any problems
# https://github.com/linuxdeploy/linuxdeploy/issues/152
libdrm.so.2
# Workaround for:
# Antergos Linux release 2015.11 (ISO-Rolling)
# /usr/lib/libdrm_amdgpu.so.1: error: symbol lookup error: undefined symbol: drmGetNodeTypeFromFd (fatal)
# libGL error: unable to load driver: swrast_dri.so
# libGL error: failed to load driver: swrast
# Unrecognized OpenGL version
libglapi.so.0
# Part of mesa
# known to cause problems with graphics, see https://github.com/RPCS3/rpcs3/issues/4427#issuecomment-381674910
libgbm.so.1
# Part of mesa
# https://github.com/probonopd/linuxdeployqt/issues/390#issuecomment-529036305
libxcb.so.1
# Workaround for:
# Fedora 23
# symbol lookup error: /lib64/libxcb-dri3.so.0: undefined symbol: xcb_send_fd
# Uncertain if this is required to be bundled for some distributions - if so we need to write a version check script and use LD_PRELOAD to load the system version if it is newer
# Fedora 25:
# undefined symbol: xcb_send_request_with_fds
# https://github.com/AppImage/AppImages/issues/128
libX11.so.6
# Workaround for:
# Fedora 23
# symbol lookup error: ./lib/libX11.so.6: undefined symbol: xcb_wait_for_reply64
# Uncertain if this is required to be bundled for some distributions - if so we need to write a version check script and use LD_PRELOAD to load the system version if it is newer
libgio-2.0.so.0
# Workaround for:
# On Ubuntu, "symbol lookup error: /usr/lib/x86_64-linux-gnu/gtk-2.0/modules/liboverlay-scrollbar.so: undefined symbol: g_settings_new"
# libgdk-x11-2.0.so.0 # Missing on openSUSE-Tumbleweed-KDE-Live-x86_64-Snapshot20170601-Media.iso
# libgtk-x11-2.0.so.0 # Missing on openSUSE-Tumbleweed-KDE-Live-x86_64-Snapshot20170601-Media.iso
libasound.so.2
# Workaround for:
# No sound, e.g., in VLC.AppImage (does not find sound cards)
# https://github.com/AppImage/pkg2appimage/issues/475
# libgdk_pixbuf-2.0.so.0
# Was: Workaround for:
# On Ubuntu, get (inkscape:25621): GdkPixbuf-WARNING **: Error loading XPM image loader: Image type 'xpm' is not supported
libfontconfig.so.1
# Workaround for:
# Application stalls when loading fonts during application launch; e.g., KiCad on ubuntu-mate
libthai.so.0
# Workaround for:
# audacity: /tmp/.mount_AudaciUsFbON/usr/lib/libthai.so.0: version `LIBTHAI_0.1.25' not found (required by /usr/lib64/libpango-1.0.so.0)
# on openSUSE Tumbleweed
# other "low-level" font rendering libraries
# should fix https://github.com/probonopd/linuxdeployqt/issues/261#issuecomment-377522251
# and https://github.com/probonopd/linuxdeployqt/issues/157#issuecomment-320755694
libfreetype.so.6
libharfbuzz.so.0
# Note, after discussion we do not exclude this, but we can use a dummy library that just does nothing
# libselinux.so.1
# Workaround for:
# sed: error while loading shared libraries: libpcre.so.3: cannot open shared object file: No such file or directory
# Some distributions, such as Arch Linux, do not come with libselinux.so.1 by default.
# The solution is to bundle a dummy mock library:
# echo "extern int is_selinux_enabled(void){return 0;}" >> selinux-mock.c
# gcc -s -shared -o libselinux.so.1 -Wl,-soname,libselinux.so.1 selinux-mock.c
# strip libselinux.so.1
# More information: https://github.com/AppImage/AppImages/issues/83
# and https://github.com/AppImage/AppImageKit/issues/775#issuecomment-614954821
# https://gitlab.com/sulinos/devel/libselinux-dummy
# The following are assumed to be part of the base system
# Removing these has worked e.g., for Krita. Feel free to report if
# you think that some of these should go into AppImages and why.
libcom_err.so.2
libexpat.so.1
libgcc_s.so.1
libglib-2.0.so.0
libgpg-error.so.0
# libgssapi_krb5.so.2 # Disputed, seemingly needed by Arch Linux since Kerberos is named differently there
# libgssapi.so.3 # Seemingly needed when running Ubuntu 14.04 binaries on Fedora 23
# libhcrypto.so.4 # Missing on openSUSE LEAP 42.0
# libheimbase.so.1 # Seemingly needed when running Ubuntu 14.04 binaries on Fedora 23
# libheimntlm.so.0 # Seemingly needed when running Ubuntu 14.04 binaries on Fedora 23
# libhx509.so.5 # Missing on openSUSE LEAP 42.0
libICE.so.6
# libidn.so.11 # Does not come with Solus by default
# libk5crypto.so.3 # Running AppImage built on Debian 9 or Ubuntu 16.04 on an Archlinux fails otherwise; https://github.com/AppImage/AppImages/issues/301
# libkeyutils.so.1 # Does not come with Void Linux by default; https://github.com/Subsurface-divelog/subsurface/issues/1971#issuecomment-466606834
# libkrb5.so.26 # Disputed, seemingly needed by Arch Linux since Kerberos is named differently there. Missing on openSUSE LEAP 42.0
# libkrb5.so.3 # Disputed, seemingly needed by Arch Linux since Kerberos is named differently there
# libkrb5support.so.0 # Disputed, seemingly needed by Arch Linux since Kerberos is named differently there
libp11-kit.so.0
# libpcre.so.3 # Missing on Fedora 24, SLED 12 SP1, and openSUSE Leap 42.2
# libroken.so.18 # Missing on openSUSE LEAP 42.0
# libsasl2.so.2 # Seemingly needed when running Ubuntu 14.04 binaries on Fedora 23
libSM.so.6
libusb-1.0.so.0
libuuid.so.1
# libwind.so.0 # Missing on openSUSE LEAP 42.0
# Potentially dangerous libraries
libgobject-2.0.so.0
# Workaround for:
# Rectangles instead of fonts
# https://github.com/AppImage/AppImages/issues/240
libpangoft2-1.0.so.0
libpangocairo-1.0.so.0
libpango-1.0.so.0
# FIXME:
# Can get symbol lookup error: /lib64/libpango-1.0.so.0: undefined symbol: g_log_structured_standard
# if libcairo is bundled but libpango is not
# Workaround for:
# e.g., Spotify
# relocation error: /lib/x86_64-linux-gnu/libgcrypt.so.20:
# symbol gpgrt_lock_lock, version GPG_ERROR_1.0 not defined
# in file libgpg-error.so.0 with link time reference
libgpg-error.so.0
libjack.so.0
# it must match the ABI of the JACK server which is installed in the base system
# rncbc confirmed this
# However, this library is missing on Fedora-WS-Live-31-1-9
# which means that we should avoid using JACK altogether if possible
# Unsolved issue:
# https://github.com/probonopd/linuxdeployqt/issues/35
# Error initializing NSS with a persistent database (sql:/home/me/.pki/nssdb): libsoftokn3.so: cannot open shared object file: No such file or directory
# Error initializing NSS without a persistent database: NSS error code: -5925
# nss_error=-5925, os_error=0
# libnss3.so should not be removed from the bundles, as this causes other issues, e.g.,
# https://github.com/probonopd/linuxdeployqt/issues/35#issuecomment-256213517
# and https://github.com/AppImage/AppImages/pull/114
# libnss3.so
# The following cannot be excluded, see
# https://github.com/AppImage/AppImages/commit/6c7473d8cdaaa2572248dcc53d7f617a577ade6b
# http://stackoverflow.com/questions/32644157/forcing-a-binary-to-use-a-specific-newer-version-of-a-shared-library-so
# libssl.so.1
# libssl.so.1.0.0
# libcrypto.so.1
# libcrypto.so.1.0.0
# According to https://github.com/RicardoEPRodrigues/3Engine/issues/4#issuecomment-511598362
# libGLEW is not tied to a specific GPU. It's linked against libGL.so.1
# and that one is different depending on the installed driver.
# In fact libGLEW is changing its soversion very often, so you should always bundle libGLEW.so.2.0
# libglut.so.3 # to be confirmed
libxcb-dri3.so.0 # https://github.com/AppImage/AppImages/issues/348
libxcb-dri2.so.0 # https://github.com/probonopd/linuxdeployqt/issues/331#issuecomment-442276277
# If the next line turns out to cause issues, we will have to remove it again and find another solution
libfribidi.so.0 # https://github.com/olive-editor/olive/issues/221 and https://github.com/knapsu/plex-media-player-appimage/issues/14
# Workaround for:
# symbol lookup error: /lib/x86_64-linux-gnu/libgnutls.so.30: undefined symbol: __gmpz_limbs_write
# https://github.com/ONLYOFFICE/appimage-desktopeditors/issues/3
# Apparently coreutils depends on it, so it should be safe to assume that it comes with every target system
libgmp.so.10
#!/bin/bash
# CI runner: execute the OneFlow "modules" test suite with pytest.
set -xe
export PYTHONUNBUFFERED=1
src_dir=${ONEFLOW_SRC_DIR:-"$PWD"}
ONEFLOW_TEST_DIR=${ONEFLOW_TEST_DIR:-"$PWD/python/oneflow/test/modules"}
cd $ONEFLOW_TEST_DIR
# With GPUs: one pytest-xdist worker per visible GPU, each pinned through
# CUDA_VISIBLE_DEVICES. CPU-only: let xdist size the worker pool itself.
if [ -z "$ONEFLOW_TEST_CPU_ONLY" ]
then
gpu_num=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
for ((i=0;i<gpu_num;i++)); do
parallel_spec="$parallel_spec --tx popen//env:CUDA_VISIBLE_DEVICES=${i}"
done
else
parallel_spec="-n auto"
fi
# Proxies can interfere with the local RPC traffic of distributed tests.
unset HTTP_PROXY
unset HTTPS_PROXY
unset http_proxy
unset https_proxy
export ONEFLOW_TEST_DEVICE_NUM=1
COMMON_PYTEST_ARGS="--max-worker-restart=0 -x --durations=50 --capture=sys"
# Single-device pass, distributing whole files across workers (--dist loadfile).
python3 -m pytest ${COMMON_PYTEST_ARGS} --failed-first --dist loadfile ${parallel_spec} ${PWD}
# Multi-device passes require the gRPC RPC backend; otherwise just assert it is absent.
if [[ "$(python3 -c 'import oneflow.sysconfig;print(oneflow.sysconfig.has_rpc_backend_grpc())')" == *"True"* ]]; then
export ONEFLOW_TEST_DEVICE_NUM=2
python3 -m oneflow.distributed.launch --nproc_per_node 2 -m pytest ${COMMON_PYTEST_ARGS} ${PWD}
export ONEFLOW_TEST_DEVICE_NUM=4
python3 -m oneflow.distributed.launch --nproc_per_node 4 -m pytest ${COMMON_PYTEST_ARGS} ${PWD}
else
python3 -c 'import oneflow.sysconfig;assert(oneflow.sysconfig.has_rpc_backend_grpc() == False)'
fi
#!/bin/bash
# CI runner: execute the legacy "ops" test suite, copying tests into a scratch
# directory first so generated artifacts stay out of the source tree.
set -xe
export TF_CPP_MIN_LOG_LEVEL=3
export PYTHONUNBUFFERED=1
src_dir=${ONEFLOW_SRC_DIR:-"$PWD"}
test_dir=${ONEFLOW_TEST_DIR:-"$PWD/python/oneflow/test/ops"}
test_tmp_dir=${ONEFLOW_TEST_TMP_DIR:-"./test_tmp_dir"}
export ONEFLOW_TEST_UTILS_DIR=$src_dir/python/oneflow/test_utils
rm -rf $test_tmp_dir
mkdir -p $test_tmp_dir
cp -r $test_dir $test_tmp_dir
cd ${test_tmp_dir}/$(basename $test_dir)
gpu_num=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
# Single-device pass scheduled across GPUs by the custom parallel runner.
export ONEFLOW_TEST_DEVICE_NUM=1
python3 $src_dir/ci/test/parallel_run.py \
--gpu_num=${gpu_num} \
--dir=${PWD} \
--timeout=1 \
--verbose \
--chunk=1
# Multi-device passes run sequentially via plain unittest discovery.
export ONEFLOW_TEST_DEVICE_NUM=2
python3 -m unittest discover ${PWD} --failfast --verbose
export ONEFLOW_TEST_DEVICE_NUM=4
python3 -m unittest discover ${PWD} --failfast --verbose
#!/bin/bash
# CI runner: module tests with higher parallelism — several pytest-xdist
# workers per GPU for the 1-device pass, and multiple concurrent
# oneflow.distributed.launch instances (via multi_launch.py) for the
# 2- and 4-device passes.
set -xe
export PYTHONUNBUFFERED=1
src_dir=${ONEFLOW_SRC_DIR:-"$PWD"}
ONEFLOW_TEST_DIR=${ONEFLOW_TEST_DIR:-"$PWD/python/oneflow/test/modules"}
ONEFLOW_TEST_TASKS_PER_GPU=${ONEFLOW_TEST_TASKS_PER_GPU:-"4"}
if [ -z "$ONEFLOW_TEST_CPU_ONLY" ]
then
gpu_num=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
# Spawn ONEFLOW_TEST_TASKS_PER_GPU xdist workers pinned to each GPU.
for ((i=0;i<gpu_num;i++)); do
for ((j=0;j<ONEFLOW_TEST_TASKS_PER_GPU;j++)); do
parallel_spec="$parallel_spec --tx popen//env:CUDA_VISIBLE_DEVICES=${i}"
done
done
multi_launch_device_num=${gpu_num}
else
parallel_spec="-n auto"
# CPU-only: pretend there are 8 devices so multi_launch still forms groups.
multi_launch_device_num=8
fi
# Proxies can interfere with the local RPC traffic of distributed tests.
unset HTTP_PROXY
unset HTTPS_PROXY
unset http_proxy
unset https_proxy
export ONEFLOW_TEST_DEVICE_NUM=1
COMMON_PYTEST_ARGS="-p no:warnings -p no:randomly -p no:cacheprovider --max-worker-restart=0 -x --durations=50 --capture=sys --ignore=log"
time python3 -m pytest ${COMMON_PYTEST_ARGS} --dist loadfile ${parallel_spec} ${ONEFLOW_TEST_DIR}
# Multi-device passes require the gRPC RPC backend.
if [[ "$(python3 -c 'import oneflow.sysconfig;print(oneflow.sysconfig.has_rpc_backend_grpc())')" == *"True"* ]]; then
export ONEFLOW_TEST_DEVICE_NUM=2
# 2-device pass: one launch instance per listed master port (4 in total),
# rotating over device groups of size 2.
time python3 ${src_dir}/ci/test/multi_launch.py \
--files "${ONEFLOW_TEST_DIR}/**/test_*.py" \
--master_port 29500 \
--master_port 29501 \
--master_port 29502 \
--master_port 29503 \
-n master_port \
--group_size 2 \
--auto_cuda_visible_devices \
--device_num $multi_launch_device_num \
-m oneflow.distributed.launch --nproc_per_node 2 -m pytest ${COMMON_PYTEST_ARGS}
export ONEFLOW_TEST_DEVICE_NUM=4
# 4-device pass: 4 launch instances with auto-generated ports, groups of 4.
time python3 ${src_dir}/ci/test/multi_launch.py \
--files "${ONEFLOW_TEST_DIR}/**/test_*.py" \
-n 4 \
--group_size 4 \
--device_num $multi_launch_device_num \
--auto_cuda_visible_devices \
-m oneflow.distributed.launch --nproc_per_node 4 -m pytest ${COMMON_PYTEST_ARGS}
else
python3 -c 'import oneflow.sysconfig;assert(oneflow.sysconfig.has_rpc_backend_grpc() == False)'
fi
#!/bin/bash
# CI runner: exception-message tests. Each test_*.py is run as a standalone
# unittest script from a scratch copy of the test directory.
set -xe
export PYTHONUNBUFFERED=1
src_dir=${ONEFLOW_SRC_DIR:-"$PWD"}
test_dir="$PWD/python/oneflow/test/exceptions"
test_tmp_dir=${ONEFLOW_TEST_TMP_DIR:-"./test_tmp_dir"}
export ONEFLOW_TEST_UTILS_DIR=$src_dir/python/oneflow/test_utils
rm -rf $test_tmp_dir
mkdir -p $test_tmp_dir
cp -r $test_dir $test_tmp_dir
cd ${test_tmp_dir}/$(basename $test_dir)
# Debug mode makes OneFlow emit the full error messages under test.
export ONEFLOW_DEBUG_MODE=1
for file in $(ls ${PWD}/test_*.py)
do
if test -f $file
then
export ONEFLOW_TEST_DEVICE_NUM=1
python3 $file --failfast --verbose
# Repeat on 2 and 4 devices when the gRPC RPC backend is available.
if [[ "$(python3 -c 'import oneflow.sysconfig;print(oneflow.sysconfig.has_rpc_backend_grpc())')" == *"True"* ]]; then
export ONEFLOW_TEST_DEVICE_NUM=2
python3 -m oneflow.distributed.launch --nproc_per_node 2 $file --failfast --verbose
export ONEFLOW_TEST_DEVICE_NUM=4
python3 -m oneflow.distributed.launch --nproc_per_node 4 $file --failfast --verbose
else
python3 -c 'import oneflow.sysconfig;assert(oneflow.sysconfig.has_rpc_backend_grpc() == False)'
fi
fi
done
unset ONEFLOW_DEBUG_MODE
"""
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
"""
This file is mostly copied from PyTorch v1.8.1 torch/distributed/launch.py
"""
import asyncio
import os
import random
import sys
from argparse import REMAINDER, ArgumentParser
from typing import IO, Any, List, Optional
import glob
import hashlib
from math import ceil
stdout_filename = "stdout"
stderr_filename = "stderr"

# Progress counters shared by the coroutines below. `global` declarations at
# module scope are no-ops in Python, so plain assignments suffice here (the
# original had redundant `global PARALLEL_NUM` / `global SUCCESS_NUM` lines).
PARALLEL_NUM = 0
SUCCESS_NUM = 0
def parse_args():
    """
    Helper function parsing the command line options.

    Builds the CLI of multi_launch: a tool that starts several
    `oneflow.distributed.launch` instances in parallel, each with its own
    master port and (optionally) its own CUDA_VISIBLE_DEVICES group.

    @retval ArgumentParser
    """
    parser = ArgumentParser(
        description="helper to start multiple distributed launches in parallel"
    )
    # Glob patterns (e.g. "dir/**/test_*.py"); expanded with glob(recursive=True).
    parser.add_argument(
        "--files",
        type=str,
        help="files to run, support pattern",
        required=True,
        nargs="+",
    )
    # Number of consecutive device ids assigned to each launch instance.
    parser.add_argument(
        "--group_size",
        type=int,
        help="for one command, how many duplications to run",
        required=True,
    )
    parser.add_argument(
        "--device_num", type=int, help="how many devices to run on", required=True,
    )
    # Either an integer, or the literal "master_port" to derive the count from
    # the number of --master_port flags given.
    parser.add_argument(
        "-n",
        "--parallel_num",
        type=str,
        help="how many launches, could be a number, or 'master_port'",
        required=True,
    )
    # When set, each child process gets CUDA_VISIBLE_DEVICES limited to its group.
    parser.add_argument(
        "--auto_cuda_visible_devices",
        action="store_true",
        required=False,
        default=False,
    )
    parser.add_argument(
        "--shuffle", action="store_true", required=False, default=False,
    )
    parser.add_argument(
        "--verbose", action="store_true", required=False, default=False,
    )
    parser.add_argument(
        "--master_port",
        default=[],
        action="append",
        help="Master node (rank 0)'s free port, pass this multiple `--master_port` to launch more instances",
    )
    parser.add_argument(
        "-m",
        "--module",
        default=False,
        action="store_true",
        help="Changes each process to interpret the launch script as a python module, executing with the same behavior as'python -m'.",
    )
    # Everything after the script path is forwarded verbatim (REMAINDER).
    parser.add_argument(
        "training_script",
        type=str,
        help="The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script",
    )
    parser.add_argument("training_script_args", nargs=REMAINDER)
    return parser.parse_args()
async def run_and_capture(cmd=None, prefix=None, **kwargs):
    """Run `cmd` (argv list), streaming its combined stdout/stderr line by
    line with `prefix` prepended, and assert that it exits with code 0.

    Increments the module-level SUCCESS_NUM counter on success so overall
    progress can be reported as (SUCCESS_NUM/PARALLEL_NUM).
    """
    proc = await asyncio.create_subprocess_exec(
        *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.STDOUT, **kwargs
    )
    while True:
        line = await proc.stdout.readline()
        if not line:
            # EOF: stop before printing. The original printed first and then
            # checked for EOF, emitting a spurious bare-prefix line at the end.
            break
        print(prefix, line.decode(), end="")
    await proc.wait()
    assert proc.returncode == 0, prefix
    global SUCCESS_NUM
    SUCCESS_NUM += 1
    print(f"{prefix} succeed ({SUCCESS_NUM}/{PARALLEL_NUM})")
async def launch_multiple(
    cmds=None, group_size=None, auto_cuda_env=False, device_num=None
):
    """Spawn every command in `cmds` concurrently, rotating them over device
    groups of `group_size` consecutive ids (e.g. ["0","1"], ["2","3"], ...).

    When `auto_cuda_env` is true each child gets CUDA_VISIBLE_DEVICES
    restricted to its assigned group; otherwise the parent environment is
    passed through unchanged.
    """
    # Partition [0, device_num) into consecutive groups of group_size ids.
    visible_groups = [
        [str(x) for x in range(device_num)[i : i + group_size]]  # to get ["0", "1"]
        for i in range(0, device_num, group_size)
    ]
    spawns = []
    for i, cmd in enumerate(cmds):
        # Round-robin assignment of commands to device groups.
        group_idx = i % len(visible_groups)
        cuda_visible_devices = ",".join(visible_groups[group_idx])
        print(cuda_visible_devices, cmd, "\n")
        env = os.environ
        if auto_cuda_env:
            env = dict(env, CUDA_VISIBLE_DEVICES=cuda_visible_devices)
        process = run_and_capture(
            cmd=cmd, prefix=f"[wg={i}][device={cuda_visible_devices}]", env=env,
        )
        spawns.append(process)
    # Run all launches concurrently; run_and_capture asserts each exit code.
    await asyncio.gather(*spawns)
def main():
    """Entry point: glob the test files, split them into one chunk per launch
    instance, and run all `oneflow.distributed.launch` instances concurrently.
    """
    args = parse_args()
    # Find files and chunk them (fixed typo: was "chuck").
    files = []
    for f in args.files:
        files += list(glob.glob(f, recursive=True))
    print("total files:", len(files))
    # Deterministic pseudo-random order: sort by md5 of the basename. Fixed:
    # encode the basename string, rather than taking basename of encoded bytes.
    files = sorted(
        files,
        key=lambda x: hashlib.md5(os.path.basename(x).encode("ascii")).hexdigest(),
    )
    if args.shuffle:
        random.shuffle(files)
    # Short fingerprint of the file set, surfaced in CI logs for debugging.
    files_hash = hashlib.md5(
        "".join([os.path.basename(x) for x in files]).encode()
    ).hexdigest()[:8]
    if args.verbose:
        print(
            f"::warning file=testFilesHash,line={len(files)},col=0,endColumn=0::shuffle-{args.shuffle}-group_size-{args.group_size}-md5-{files_hash}"
        )
    # Instance count: either one per --master_port flag, or an explicit count
    # with consecutive ports auto-generated from 29500.
    if args.parallel_num == "master_port":
        parallel_num = len(args.master_port)
        master_ports = args.master_port
    else:
        parallel_num = int(args.parallel_num)
        if parallel_num != len(args.master_port):
            print(
                "warning", "parallel_num != len(args.master_port)", "will auto generate"
            )
        default_master_port = 29500
        master_ports = list(
            range(default_master_port, default_master_port + parallel_num)
        )
    assert parallel_num > 0
    assert len(master_ports) == parallel_num
    chunk_size = ceil(len(files) / parallel_num)
    global PARALLEL_NUM
    PARALLEL_NUM = parallel_num
    chunks = [files[i : i + chunk_size] for i in range(0, len(files), chunk_size)]
    # Check args: only oneflow.distributed.launch is supported as the module.
    assert args.training_script == "oneflow.distributed.launch"
    # Generate one command per (master_port, chunk) pair.
    cmds = [
        [sys.executable, "-m", args.training_script, "--master_port", str(master_port)]
        + args.training_script_args
        + chunk
        for (master_port, chunk) in zip(master_ports, chunks)
    ]
    loop = asyncio.get_event_loop()
    processes = launch_multiple(
        cmds=cmds,
        auto_cuda_env=args.auto_cuda_visible_devices,
        group_size=args.group_size,
        device_num=args.device_num,
    )
    loop.run_until_complete(processes)


if __name__ == "__main__":
    main()
import asyncio
import os
import argparse
from subprocess import PIPE, STDOUT
import glob
import sys
import time
import socket
from contextlib import closing
import uuid
def gen_cmds(cmd=None, dir=None, doctest=False):
    """Build the list of shell commands the runner will execute.

    With doctest=True: every .py file under `dir` (recursively) containing
    "import doctest" — excluding compatible/single_client/unittest.py paths —
    is run verbosely. Otherwise: each top-level test_*.py in `dir` is run as
    a unittest script with --failfast --verbose.
    """
    if doctest:
        candidates = glob.glob(os.path.join(dir, "**/*.py"), recursive=True)
        skip_tokens = ("compatible", "single_client", "unittest.py")
        candidates = [
            path
            for path in candidates
            if not any(token in path for token in skip_tokens)
        ]
        with_doctest = []
        for path in candidates:
            with open(path) as src:
                if "import doctest" in src.read():
                    with_doctest.append("{} {} -v".format(cmd, path))
        print(with_doctest)
        return with_doctest
    scripts = glob.glob(os.path.join(dir, "test_*.py"), recursive=False)
    return ["{} {} --failfast --verbose".format(cmd, script) for script in scripts]
def find_free_port():
    """Ask the OS for a free ephemeral TCP port on localhost and return it.

    Fixed: SO_REUSEADDR is now set BEFORE bind — the original set it after
    binding, where it has no effect on the bind itself. Note the usual race
    remains: the port may be reclaimed between this call and its later use.
    """
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        s.bind(("localhost", 0))
        return s.getsockname()[1]
def split_and_print(prefix, text):
    """Print `text` with `prefix` prepended to every line, in a single
    flushed print call (keeps multi-line output contiguous in the log)."""
    decorated = "".join(
        f"{prefix} {line}" for line in text.splitlines(keepends=True)
    )
    print(decorated, flush=True)
def everyN(l: list, n: int):
    """Yield consecutive slices of `l` of length `n` (last may be shorter)."""
    start = 0
    while start < len(l):
        yield l[start : start + n]
        start += n
def contains_oom_info(txt: str):
    """Heuristic: does `txt` look like an out-of-memory / allocation failure?"""
    markers = ("memory", "Memory", "CUDNN", "ALLOC")
    return any(marker in txt for marker in markers)
def should_retry(txt: str):
    """A failed command is only worth retrying (at lower concurrency) when
    its output looks OOM-related; see contains_oom_info."""
    verdict = contains_oom_info(txt)
    return verdict
def print_out(prefix: str = "", content: str = ""):
    """Print every line of `content`, each tagged with `[prefix]`."""
    for line in content.split("\n"):
        print(f"[{prefix}]", line)
async def spawn_shell_and_check(cmd: str = None, gpu_id: int = -1, check: bool = False):
    """Run `cmd` in a shell pinned to `gpu_id` (or to no GPU when CPU-only),
    capturing its combined stdout/stderr.

    Each child gets a fresh free master port and a unique log directory so
    concurrent runs do not collide. When `check` is true — or when the output
    does NOT look OOM-related (see should_retry) — a non-zero exit dumps the
    captured output and raises RuntimeError; OOM-looking failures are instead
    reported through the returned dict so the caller can retry them later.

    Returns {"returncode", "cmd", "stdout"}.
    """
    is_cpu_only = os.getenv("ONEFLOW_TEST_CPU_ONLY")
    print(f"[gpu={gpu_id}]", cmd)
    p = await asyncio.create_subprocess_shell(
        cmd,
        stdout=PIPE,
        stderr=STDOUT,
        env=dict(
            os.environ,
            # CUDA_VISIBLE_DEVICES=-1 hides all GPUs on CPU-only runs.
            CUDA_VISIBLE_DEVICES=("-1" if is_cpu_only else ",".join([str(gpu_id)])),
            ONEFLOW_TEST_MASTER_PORT=str(find_free_port()),
            ONEFLOW_TEST_LOG_DIR=("./unittest-log-" + str(uuid.uuid4())),
        ),
    )
    (stdout_data, stderr_data) = await p.communicate()
    decoded = stdout_data.decode()
    if check or should_retry(decoded) == False:
        if p.returncode != 0:
            print_out(prefix=cmd, content=decoded)
            raise RuntimeError(cmd)
    return {"returncode": p.returncode, "cmd": cmd, "stdout": decoded}
async def run_cmds(
    cmds, gpu_num=0, timeout=10, chunk=1, verbose=False, per_gpu_process_num=1
):
    """Run every command in `cmds`, at most per_gpu_process_num * gpu_num at
    a time, assigning each concurrent command a gpu_id in [0, gpu_num).

    Returns the list of commands whose process exited non-zero — candidates
    for a retry at lower concurrency (see the __main__ retry ladder).
    NOTE(review): `timeout`, `chunk` and `verbose` are accepted but unused here.
    """
    is_cpu_only = os.getenv("ONEFLOW_TEST_CPU_ONLY")
    if is_cpu_only:
        # CPU-only: reuse the "gpu" slots purely as a concurrency limit.
        gpu_num = os.cpu_count()
    fails = []
    assert gpu_num > 0
    for cmdN in everyN(cmds, per_gpu_process_num * gpu_num):
        # Within one wave, commands are grouped per sub-batch of gpu_num;
        # the index inside each sub-batch becomes the gpu_id.
        results = await asyncio.gather(
            *[
                spawn_shell_and_check(
                    cmd=cmd, gpu_id=i, check=(per_gpu_process_num == 1)
                )
                for cmd_gpu_num in everyN(cmdN, gpu_num)
                for (i, cmd) in enumerate(cmd_gpu_num)
            ],
        )
        for r in list(results):
            if r["returncode"] != 0:
                fails.append(r["cmd"])
            else:
                print_out(prefix=r["cmd"], content=r["stdout"])
    return fails
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--gpu_num", type=int, required=True, default=0)
    parser.add_argument("--dir", type=str, required=True, default=".")
    parser.add_argument("--cmd", type=str, required=False, default=sys.executable)
    parser.add_argument("--timeout", type=int, required=False, default=2)
    parser.add_argument("--chunk", type=int, required=True)
    parser.add_argument("--verbose", action="store_true", required=False, default=False)
    parser.add_argument("--doctest", action="store_true", required=False, default=False)
    args = parser.parse_args()
    cmds = gen_cmds(cmd=args.cmd, dir=args.dir, doctest=args.doctest)
    start = time.time()
    loop = asyncio.get_event_loop()
    # Retry ladder: begin at high per-GPU concurrency; commands that fail
    # (typically OOM under high concurrency) are retried with fewer processes
    # per GPU, ending in a strict single-process pass that raises on failure.
    PER_GPU_PROCESS_NUMS = [12, 8, 2, 1]
    is_cpu_only = os.getenv("ONEFLOW_TEST_CPU_ONLY")
    if is_cpu_only:
        PER_GPU_PROCESS_NUMS = [1]
    for per_gpu_process_num in PER_GPU_PROCESS_NUMS:
        print("[per_gpu_process_num]", per_gpu_process_num)
        # run_cmds returns the failed commands; feed them into the next round.
        cmds = loop.run_until_complete(
            run_cmds(
                cmds,
                gpu_num=args.gpu_num,
                timeout=args.timeout,
                chunk=args.chunk,
                verbose=args.verbose,
                per_gpu_process_num=per_gpu_process_num,
            )
        )
    elapsed = time.time() - start
    elapsed_time_txt = time.strftime("elapsed: %H:%M:%S", time.gmtime(elapsed))
    print(elapsed_time_txt)
# Print a full backtrace for every core dump found in directory $2,
# using $1 as the executable that produced the cores.
set -ex
if compgen -G "$2/core.*" > /dev/null; then
gdb --batch --quiet -ex "thread apply all bt full" -ex "quit" $1 $2/core.*
fi
# Print a full backtrace for every core file under the current tree
# (assumes python3 was the crashing executable).
set -ex
find . -type f -name "core.*" -exec gdb --batch --quiet -ex "thread apply all bt full" -ex "quit" python3 {} \;
{
"version": {
"major": 1,
"minor": 0
},
"local": [
{
"vram": [
{
"id": "0",
"slots": 8117
}
]
}
]
}
{
"version": {
"major": 1,
"minor": 0
},
"local": [
{
"vram": [
{
"id": "0",
"slots": 7982
},
{
"id": "1",
"slots": 7982
}
]
}
]
}
{
"version": {
"major": 1,
"minor": 0
},
"local": [
{
"vram": [
{
"id": "0",
"slots": 11019
},
{
"id": "1",
"slots": 11019
},
{
"id": "2",
"slots": 11019
},
{
"id": "3",
"slots": 11019
}
]
}
]
}
#!/usr/bin/env bash
# CI smoke test: train resnet50 from oneflow/models for a batch-limited run —
# first DDP on 1 process, then graph mode with AMP/fusions on 2 processes.
set -ex
cd $ONEFLOW_MODELS_DIR
OFRECORD_PATH=/dataset/imagenette/ofrecord
# If the dataset lacks a train/ subdirectory, build one locally via symlink.
if [ ! -d "/dataset/imagenette/ofrecord/train" ];then
mkdir -p ./dataset/ofrecord
ln -s /dataset/imagenette/ofrecord ./dataset/ofrecord/train
OFRECORD_PATH=./dataset/ofrecord
fi
python3 -m oneflow.distributed.launch --nproc_per_node 1 --nnodes 1 --node_rank 0 --master_addr 127.0.0.1 Vision/classification/image/resnet50/train.py --ofrecord-path $OFRECORD_PATH --ofrecord-part-num 1 --num-devices-per-node 1 --lr 0.004 --momentum 0.875 --num-epochs 1 --train-batch-size 4 --val-batch-size 50 --print-interval 10 --exit-num 1 --ddp
python3 -m oneflow.distributed.launch --nproc_per_node 2 --nnodes 1 --node_rank 0 --master_addr 127.0.0.1 Vision/classification/image/resnet50/train.py --ofrecord-path $OFRECORD_PATH --ofrecord-part-num 2 --num-devices-per-node 1 --lr 0.004 --momentum 0.875 --num-epochs 1 --train-batch-size 4 --val-batch-size 50 --print-interval 10 --exit-num 1 --use-fp16 --channel-last --scale-grad --graph --fuse-bn-relu --fuse-bn-add-relu --use-gpu-decode
#!/usr/bin/env bash
# CI benchmark: compare OneFlow vs PyTorch speed on resnet50 (several batch
# sizes, single-process and 2-process DDP) and the swin dataloader. Each
# failed check contributes a score; the job only fails when the accumulated
# score reaches 2, so a single minor regression is tolerated.
set -uxo pipefail
rc=0
# accumulate the score of every test
trap 'rc=$(($rc + $?))' ERR
cd $ONEFLOW_MODELS_DIR
# Pass when the "Relative speed" value on stdin is >= $1; on failure the
# pipeline stage exits with score $2 (default 1), picked up by the ERR trap.
function check_relative_speed {
# Default score is 1
SCORE=${2:-1}
awk -F'[:(]' -v threshold=$1 -v score=$SCORE 'BEGIN { ret=2 } /Relative speed/{ if ($2 >= threshold) { printf "✔️ "; ret=0 } else { printf "❌ "; ret=score }} {print $0} END { exit ret }'
}
# Pass when OneFlow's reported time in ms (parsed from the "OneFlow" line)
# is <= $1; on failure exit with score $2 (default 1).
function check_millisecond_time {
# Default score is 1
SCORE=${2:-1}
awk -F'[:(]' -v threshold=$1 -v score=$SCORE 'BEGIN { ret=2 } /OneFlow/{ if (substr($2, 2, length($2) - 4) <= threshold) { printf "✔️ "; ret=0 } else { printf "❌ "; ret=score }} { print $0 } END { exit ret }'
}
# Append stdin to the `result` file while echoing it to the log.
function write_to_file_and_print {
tee -a result
printf "\n" >> result
}
python3 scripts/compare_speed_with_pytorch.py Vision/classification/image/resnet50/models/resnet50.py resnet50 16x3x224x224 --no-show-memory --times 100 | check_relative_speed 1.05 | check_millisecond_time 129.0 2 | write_to_file_and_print
python3 scripts/compare_speed_with_pytorch.py Vision/classification/image/resnet50/models/resnet50.py resnet50 8x3x224x224 --no-show-memory --times 100 | check_relative_speed 1.04 | write_to_file_and_print
python3 scripts/compare_speed_with_pytorch.py Vision/classification/image/resnet50/models/resnet50.py resnet50 4x3x224x224 --no-show-memory --times 200 | check_relative_speed 1.01 | write_to_file_and_print
python3 scripts/compare_speed_with_pytorch.py Vision/classification/image/resnet50/models/resnet50.py resnet50 2x3x224x224 --no-show-memory --times 200 | check_relative_speed 0.99 | write_to_file_and_print
python3 scripts/compare_speed_with_pytorch.py Vision/classification/image/resnet50/models/resnet50.py resnet50 1x3x224x224 --no-show-memory --times 200 | check_relative_speed 0.95 | write_to_file_and_print
python3 scripts/swin_dataloader_compare_speed_with_pytorch.py --batch_size 32 --num_workers 1 | write_to_file_and_print
python3 scripts/swin_dataloader_compare_speed_with_pytorch.py --batch_size 32 --num_workers 4 | write_to_file_and_print
python3 scripts/swin_dataloader_compare_speed_with_pytorch.py --batch_size 32 --num_workers 8 | write_to_file_and_print
export OMP_NUM_THREADS=1
python3 -m oneflow.distributed.launch --nproc_per_node 2 scripts/compare_speed_with_pytorch.py Vision/classification/image/resnet50/models/resnet50.py resnet50 16x3x224x224 --no-show-memory --times 100 --ddp | check_relative_speed 1.12 | check_millisecond_time 136.3 2 | write_to_file_and_print
python3 -m oneflow.distributed.launch --nproc_per_node 2 scripts/compare_speed_with_pytorch.py Vision/classification/image/resnet50/models/resnet50.py resnet50 8x3x224x224 --no-show-memory --times 100 --ddp | check_relative_speed 1.1 | write_to_file_and_print
python3 -m oneflow.distributed.launch --nproc_per_node 2 scripts/compare_speed_with_pytorch.py Vision/classification/image/resnet50/models/resnet50.py resnet50 4x3x224x224 --no-show-memory --times 200 --ddp | check_relative_speed 1.18 | write_to_file_and_print
python3 -m oneflow.distributed.launch --nproc_per_node 2 scripts/compare_speed_with_pytorch.py Vision/classification/image/resnet50/models/resnet50.py resnet50 2x3x224x224 --no-show-memory --times 200 --ddp | check_relative_speed 1.18 | write_to_file_and_print
python3 -m oneflow.distributed.launch --nproc_per_node 2 scripts/compare_speed_with_pytorch.py Vision/classification/image/resnet50/models/resnet50.py resnet50 1x3x224x224 --no-show-memory --times 200 --ddp | check_relative_speed 1.15 | write_to_file_and_print
# Prepend the GPU model to the accumulated results before exporting them.
result="GPU Name: `nvidia-smi --query-gpu=name --format=csv,noheader -i 0` \n\n `cat result`"
# escape newline for github actions: https://github.community/t/set-output-truncates-multiline-strings/16852/2
# note that we escape \n and \r to \\n and \\r (i.e. raw string "\n" and "\r") instead of %0A and %0D,
# so that they can be correctly handled in javascript code
result="${result//'%'/'%25'}"
result="${result//$'\n'/'\\n'}"
result="${result//$'\r'/'\\r'}"
echo "::set-output name=stats::$result"
# Only fail when the sum of score >= 2
if (( $rc >= 2 ))
then
exit 1
else
exit 0
fi
#!/bin/bash
# Install the OneFlow package for CI, trying in order: a pip index/links URL,
# a local wheelhouse directory, a single wheel file, then an editable source
# install. Missing everything is logged but tolerated.
set -xe
src_dir=${ONEFLOW_SRC_DIR:-"$PWD"}
wheel_path=${ONEFLOW_WHEEL_PATH:-"$PWD/wheelhouse"}
index=${ONEFLOW_PIP_INDEX}
pkg_name=${ONEFLOW_PACKAGE_NAME:-"oneflow"}
if [ -n "$index" ]; then
python3 -m pip install --find-links ${index} ${pkg_name}
elif [ -d "$wheel_path" ]; then
ls -la $wheel_path
export PATH=/root/.local/bin:$PATH
# pipindex generates a PEP 503 "simple" index over the local wheelhouse so
# pip can resolve the wheel through --extra-index-url.
python3 -m pip install https://oneflow-static.oss-cn-beijing.aliyuncs.com/pipindex/pipindex-0.1.3-py2.py3-none-any.whl --user
pipindex build $wheel_path
python3 -m pip install -U --user --extra-index-url file://${wheel_path}/simple ${pkg_name}
elif [ -e "$wheel_path" ]; then
python3 -m pip install --user "$wheel_path"
elif [ -d "$src_dir" ]; then
python3 -m pip install -e "$src_dir" --user
else
echo "wheel not found: $wheel_path, src dir not found: $src_dir, continue anyway..."
fi
# Cache-initialization preset for the CUDA CI build (use with `cmake -C <file>`).
set(BUILD_CUDA YES CACHE BOOL "")
set(BUILD_GIT_VERSION YES CACHE BOOL "")
set(BUILD_TESTING OFF CACHE BOOL "")
set(BUILD_RDMA YES CACHE BOOL "")
set(TREAT_WARNINGS_AS_ERRORS YES CACHE BOOL "")
set(THIRD_PARTY_MIRROR aliyun CACHE STRING "")
set(PIP_INDEX_MIRROR "https://pypi.tuna.tsinghua.edu.cn/simple" CACHE STRING "")
set(CMAKE_BUILD_TYPE Release CACHE STRING "")
set(CMAKE_GENERATOR Ninja CACHE STRING "")
# Real (non-PTX) SASS for Pascal through Ampere.
set(CMAKE_CUDA_ARCHITECTURES "61-real;70-real;75-real;80-real;86-real" CACHE STRING "")
set(CUDNN_STATIC OFF CACHE BOOL "")
set(WITH_MLIR ON CACHE BOOL "")
set(BUILD_CPP_API OFF CACHE BOOL "")
set(CUDA_NVCC_THREADS_NUMBER 8 CACHE STRING "")
# Compiler launchers: cache compilations with ccache.
set(CMAKE_C_COMPILER_LAUNCHER ccache CACHE STRING "")
set(CMAKE_CXX_COMPILER_LAUNCHER ccache CACHE STRING "")
set(CMAKE_CUDA_COMPILER_LAUNCHER ccache CACHE STRING "")
# Cache-initialization preset for the CPU-only CI build (use with `cmake -C <file>`).
set(BUILD_CUDA NO CACHE BOOL "")
set(BUILD_GIT_VERSION YES CACHE BOOL "")
set(BUILD_TESTING YES CACHE BOOL "")
set(WITH_ONEDNN YES CACHE BOOL "")
set(TREAT_WARNINGS_AS_ERRORS YES CACHE BOOL "")
set(THIRD_PARTY_MIRROR aliyun CACHE STRING "")
set(PIP_INDEX_MIRROR "https://pypi.tuna.tsinghua.edu.cn/simple" CACHE STRING "")
set(CMAKE_BUILD_TYPE Release CACHE STRING "")
set(CMAKE_GENERATOR Ninja CACHE STRING "")
set(BUILD_CPP_API ON CACHE BOOL "")
set(WITH_MLIR ON CACHE BOOL "")
set(BUILD_FOR_CI ON CACHE BOOL "")
set(BUILD_SHARED_LIBS ON CACHE BOOL "")
# Compiler launchers: cache compilations with ccache.
set(CMAKE_C_COMPILER_LAUNCHER ccache CACHE STRING "")
set(CMAKE_CXX_COMPILER_LAUNCHER ccache CACHE STRING "")
# CPU_THREADING_RUNTIME holds a string value ("TBB"/"SEQ"/...), so its cache
# entry type must be STRING — the original declared it as BOOL.
set(CPU_THREADING_RUNTIME "TBB" CACHE STRING "")
# Cache-initialization preset for the CUDA CI build with the LOCAL RPC backend
# (no gRPC, so no multi-node tests). Use with `cmake -C <file>`.
set(BUILD_CUDA YES CACHE BOOL "")
set(BUILD_GIT_VERSION YES CACHE BOOL "")
set(BUILD_TESTING YES CACHE BOOL "")
set(BUILD_RDMA YES CACHE BOOL "")
set(TREAT_WARNINGS_AS_ERRORS YES CACHE BOOL "")
set(THIRD_PARTY_MIRROR aliyun CACHE STRING "")
set(PIP_INDEX_MIRROR "https://pypi.tuna.tsinghua.edu.cn/simple" CACHE STRING "")
set(CMAKE_BUILD_TYPE Release CACHE STRING "")
set(CMAKE_GENERATOR Ninja CACHE STRING "")
# Compiler launchers: cache compilations with ccache.
set(CMAKE_C_COMPILER_LAUNCHER ccache CACHE STRING "")
set(CMAKE_CXX_COMPILER_LAUNCHER ccache CACHE STRING "")
set(CMAKE_CUDA_COMPILER_LAUNCHER ccache CACHE STRING "")
set(CMAKE_CUDA_ARCHITECTURES "61;75" CACHE STRING "")
set(CUDNN_STATIC OFF CACHE BOOL "")
set(RPC_BACKEND "LOCAL" CACHE STRING "")
set(CUDA_NVCC_THREADS_NUMBER 8 CACHE STRING "")
# Cache-initialization preset for the CUDA CI build with statically linked
# cuDNN, MLIR and the C++ API enabled. Use with `cmake -C <file>`.
set(BUILD_CUDA YES CACHE BOOL "")
set(BUILD_GIT_VERSION YES CACHE BOOL "")
set(BUILD_TESTING YES CACHE BOOL "")
set(BUILD_RDMA YES CACHE BOOL "")
set(TREAT_WARNINGS_AS_ERRORS YES CACHE BOOL "")
set(THIRD_PARTY_MIRROR aliyun CACHE STRING "")
set(PIP_INDEX_MIRROR "https://pypi.tuna.tsinghua.edu.cn/simple" CACHE STRING "")
set(CMAKE_BUILD_TYPE Release CACHE STRING "")
set(CMAKE_GENERATOR Ninja CACHE STRING "")
# Compiler launchers: cache compilations with ccache.
set(CMAKE_C_COMPILER_LAUNCHER ccache CACHE STRING "")
set(CMAKE_CXX_COMPILER_LAUNCHER ccache CACHE STRING "")
set(CMAKE_CUDA_COMPILER_LAUNCHER ccache CACHE STRING "")
set(CMAKE_CUDA_ARCHITECTURES "61;75" CACHE STRING "")
set(CUDNN_STATIC ON CACHE BOOL "")
set(WITH_MLIR ON CACHE BOOL "")
set(BUILD_CPP_API ON CACHE BOOL "")
set(CUDA_NVCC_THREADS_NUMBER 8 CACHE STRING "")
# Cache-initialization preset for the clang + lld CPU-only CI build
# (use with `cmake -C <file>`).
set(BUILD_FOR_CI ON CACHE BOOL "")
# CPU_THREADING_RUNTIME holds a string value ("SEQ"/"TBB"/...), so its cache
# entry type must be STRING — the original declared it as BOOL.
set(CPU_THREADING_RUNTIME "SEQ" CACHE STRING "")
set(CMAKE_C_COMPILER "clang" CACHE STRING "")
set(CMAKE_CXX_COMPILER "clang++" CACHE STRING "")
# Link with LLVM lld for faster link times.
set(CMAKE_EXE_LINKER_FLAGS_INIT "-fuse-ld=lld" CACHE STRING "")
set(CMAKE_MODULE_LINKER_FLAGS_INIT "-fuse-ld=lld" CACHE STRING "")
set(CMAKE_SHARED_LINKER_FLAGS_INIT "-fuse-ld=lld" CACHE STRING "")
set(BUILD_SHARED_LIBS YES CACHE BOOL "")
set(BUILD_CUDA NO CACHE BOOL "")
set(BUILD_TESTING YES CACHE BOOL "")
set(CMAKE_BUILD_TYPE RelWithDebInfo CACHE STRING "")
set(CMAKE_GENERATOR Ninja CACHE STRING "")
# Compiler launchers: cache compilations with ccache.
set(CMAKE_C_COMPILER_LAUNCHER ccache CACHE STRING "")
set(CMAKE_CXX_COMPILER_LAUNCHER ccache CACHE STRING "")
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION OFF CACHE BOOL "")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment