Commit 21d47d0e authored by yuguo's avatar yuguo
Browse files

Oneflow 0.8 for DCU

parents
# This file lists libraries that we will assume to be present on the host system and hence
# should NOT be bundled inside AppImages. This is a working document; expect it to change
# over time. File format: one filename per line. Each entry should have a justification comment.
# See the useful tool at https://abi-laboratory.pro/index.php?view=navigator&symbol=hb_buffer_set_cluster_level#result
# to investigate issues with missing symbols.
ld-linux.so.2
ld-linux-x86-64.so.2
libanl.so.1
libBrokenLocale.so.1
libcidn.so.1
# libcrypt.so.1 # Not part of glibc anymore as of Fedora 30. See https://github.com/slic3r/Slic3r/issues/4798 and https://pagure.io/fedora-docs/release-notes/c/01d74b33564faa42959c035e1eee286940e9170e?branch=f28
libc.so.6
libdl.so.2
libm.so.6
libmvec.so.1
# libnsl.so.1 # Not part of glibc anymore as of Fedora 28. See https://github.com/RPCS3/rpcs3/issues/5224#issuecomment-434930594
libnss_compat.so.2
# libnss_db.so.2 # Not part of neon-useredition-20190321-0530-amd64.iso
libnss_dns.so.2
libnss_files.so.2
libnss_hesiod.so.2
libnss_nisplus.so.2
libnss_nis.so.2
libpthread.so.0
libresolv.so.2
librt.so.1
libthread_db.so.1
libutil.so.1
# These files are all part of the GNU C Library which should never be bundled.
# List was generated from a fresh build of glibc 2.25.
libstdc++.so.6
# Workaround for:
# usr/lib/libstdc++.so.6: version `GLIBCXX_3.4.21' not found
libGL.so.1
# The above may be missing on Chrome OS, https://www.reddit.com/r/Crostini/comments/d1lp67/ultimaker_cura_no_longer_running_as_an_appimage/
libEGL.so.1
# Part of the video driver (OpenGL); present on any regular
# desktop system, may also be provided by proprietary drivers.
# Known to cause issues if it's bundled.
libGLdispatch.so.0
libGLX.so.0
# reported to be superfluous and conflicting with system libraries (graphics driver)
# see https://github.com/linuxdeploy/linuxdeploy/issues/89
libOpenGL.so.0
# Qt installed via install-qt.sh apparently links to this library
# part of OpenGL like libGL/libEGL, so excluding it should not cause any problems
# https://github.com/linuxdeploy/linuxdeploy/issues/152
libdrm.so.2
# Workaround for:
# Antergos Linux release 2015.11 (ISO-Rolling)
# /usr/lib/libdrm_amdgpu.so.1: error: symbol lookup error: undefined symbol: drmGetNodeTypeFromFd (fatal)
# libGL error: unable to load driver: swrast_dri.so
# libGL error: failed to load driver: swrast
# Unrecognized OpenGL version
libglapi.so.0
# Part of mesa
# known to cause problems with graphics, see https://github.com/RPCS3/rpcs3/issues/4427#issuecomment-381674910
libgbm.so.1
# Part of mesa
# https://github.com/probonopd/linuxdeployqt/issues/390#issuecomment-529036305
libxcb.so.1
# Workaround for:
# Fedora 23
# symbol lookup error: /lib64/libxcb-dri3.so.0: undefined symbol: xcb_send_fd
# Uncertain if this is required to be bundled for some distributions - if so we need to write a version check script and use LD_PRELOAD to load the system version if it is newer
# Fedora 25:
# undefined symbol: xcb_send_request_with_fds
# https://github.com/AppImage/AppImages/issues/128
libX11.so.6
# Workaround for:
# Fedora 23
# symbol lookup error: ./lib/libX11.so.6: undefined symbol: xcb_wait_for_reply64
# Uncertain if this is required to be bundled for some distributions - if so we need to write a version check script and use LD_PRELOAD to load the system version if it is newer
libgio-2.0.so.0
# Workaround for:
# On Ubuntu, "symbol lookup error: /usr/lib/x86_64-linux-gnu/gtk-2.0/modules/liboverlay-scrollbar.so: undefined symbol: g_settings_new"
# libgdk-x11-2.0.so.0 # Missing on openSUSE-Tumbleweed-KDE-Live-x86_64-Snapshot20170601-Media.iso
# libgtk-x11-2.0.so.0 # Missing on openSUSE-Tumbleweed-KDE-Live-x86_64-Snapshot20170601-Media.iso
libasound.so.2
# Workaround for:
# No sound, e.g., in VLC.AppImage (does not find sound cards)
# https://github.com/AppImage/pkg2appimage/issues/475
# libgdk_pixbuf-2.0.so.0
# Was: Workaround for:
# On Ubuntu, get (inkscape:25621): GdkPixbuf-WARNING **: Error loading XPM image loader: Image type 'xpm' is not supported
libfontconfig.so.1
# Workaround for:
# Application stalls when loading fonts during application launch; e.g., KiCad on ubuntu-mate
libthai.so.0
# Workaround for:
# audacity: /tmp/.mount_AudaciUsFbON/usr/lib/libthai.so.0: version `LIBTHAI_0.1.25' not found (required by /usr/lib64/libpango-1.0.so.0)
# on openSUSE Tumbleweed
# other "low-level" font rendering libraries
# should fix https://github.com/probonopd/linuxdeployqt/issues/261#issuecomment-377522251
# and https://github.com/probonopd/linuxdeployqt/issues/157#issuecomment-320755694
libfreetype.so.6
libharfbuzz.so.0
# Note, after discussion we do not exclude this, but we can use a dummy library that just does nothing
# libselinux.so.1
# Workaround for:
# sed: error while loading shared libraries: libpcre.so.3: cannot open shared object file: No such file or directory
# Some distributions, such as Arch Linux, do not come with libselinux.so.1 by default.
# The solution is to bundle a dummy mock library:
# echo "extern int is_selinux_enabled(void){return 0;}" >> selinux-mock.c
# gcc -s -shared -o libselinux.so.1 -Wl,-soname,libselinux.so.1 selinux-mock.c
# strip libselinux.so.1
# More information: https://github.com/AppImage/AppImages/issues/83
# and https://github.com/AppImage/AppImageKit/issues/775#issuecomment-614954821
# https://gitlab.com/sulinos/devel/libselinux-dummy
# The following are assumed to be part of the base system
# Removing these has worked e.g., for Krita. Feel free to report if
# you think that some of these should go into AppImages and why.
libcom_err.so.2
libexpat.so.1
libgcc_s.so.1
libglib-2.0.so.0
libgpg-error.so.0
# libgssapi_krb5.so.2 # Disputed, seemingly needed by Arch Linux since Kerberos is named differently there
# libgssapi.so.3 # Seemingly needed when running Ubuntu 14.04 binaries on Fedora 23
# libhcrypto.so.4 # Missing on openSUSE LEAP 42.0
# libheimbase.so.1 # Seemingly needed when running Ubuntu 14.04 binaries on Fedora 23
# libheimntlm.so.0 # Seemingly needed when running Ubuntu 14.04 binaries on Fedora 23
# libhx509.so.5 # Missing on openSUSE LEAP 42.0
libICE.so.6
# libidn.so.11 # Does not come with Solus by default
# libk5crypto.so.3 # Running AppImage built on Debian 9 or Ubuntu 16.04 on an Archlinux fails otherwise; https://github.com/AppImage/AppImages/issues/301
# libkeyutils.so.1 # Does not come with Void Linux by default; https://github.com/Subsurface-divelog/subsurface/issues/1971#issuecomment-466606834
# libkrb5.so.26 # Disputed, seemingly needed by Arch Linux since Kerberos is named differently there. Missing on openSUSE LEAP 42.0
# libkrb5.so.3 # Disputed, seemingly needed by Arch Linux since Kerberos is named differently there
# libkrb5support.so.0 # Disputed, seemingly needed by Arch Linux since Kerberos is named differently there
libp11-kit.so.0
# libpcre.so.3 # Missing on Fedora 24, SLED 12 SP1, and openSUSE Leap 42.2
# libroken.so.18 # Missing on openSUSE LEAP 42.0
# libsasl2.so.2 # Seemingly needed when running Ubuntu 14.04 binaries on Fedora 23
libSM.so.6
libusb-1.0.so.0
libuuid.so.1
# libwind.so.0 # Missing on openSUSE LEAP 42.0
# Potentially dangerous libraries
libgobject-2.0.so.0
# Workaround for:
# Rectangles instead of fonts
# https://github.com/AppImage/AppImages/issues/240
libpangoft2-1.0.so.0
libpangocairo-1.0.so.0
libpango-1.0.so.0
# FIXME:
# Can get symbol lookup error: /lib64/libpango-1.0.so.0: undefined symbol: g_log_structured_standard
# if libcairo is bundled but libpango is not
# Workaround for:
# e.g., Spotify
# relocation error: /lib/x86_64-linux-gnu/libgcrypt.so.20:
# symbol gpgrt_lock_lock, version GPG_ERROR_1.0 not defined
# in file libgpg-error.so.0 with link time reference
libgpg-error.so.0
libjack.so.0
# it must match the ABI of the JACK server which is installed in the base system
# rncbc confirmed this
# However, this library is missing on Fedora-WS-Live-31-1-9
# which means that we should avoid using JACK altogether if possible
# Unsolved issue:
# https://github.com/probonopd/linuxdeployqt/issues/35
# Error initializing NSS with a persistent database (sql:/home/me/.pki/nssdb): libsoftokn3.so: cannot open shared object file: No such file or directory
# Error initializing NSS without a persistent database: NSS error code: -5925
# nss_error=-5925, os_error=0
# libnss3.so should not be removed from the bundles, as this causes other issues, e.g.,
# https://github.com/probonopd/linuxdeployqt/issues/35#issuecomment-256213517
# and https://github.com/AppImage/AppImages/pull/114
# libnss3.so
# The following cannot be excluded, see
# https://github.com/AppImage/AppImages/commit/6c7473d8cdaaa2572248dcc53d7f617a577ade6b
# http://stackoverflow.com/questions/32644157/forcing-a-binary-to-use-a-specific-newer-version-of-a-shared-library-so
# libssl.so.1
# libssl.so.1.0.0
# libcrypto.so.1
# libcrypto.so.1.0.0
# According to https://github.com/RicardoEPRodrigues/3Engine/issues/4#issuecomment-511598362
# libGLEW is not tied to a specific GPU. It's linked against libGL.so.1
# and that one is different depending on the installed driver.
# In fact libGLEW is changing its soversion very often, so you should always bundle libGLEW.so.2.0
# libglut.so.3 # to be confirmed
libxcb-dri3.so.0 # https://github.com/AppImage/AppImages/issues/348
libxcb-dri2.so.0 # https://github.com/probonopd/linuxdeployqt/issues/331#issuecomment-442276277
# If the next line turns out to cause issues, we will have to remove it again and find another solution
libfribidi.so.0 # https://github.com/olive-editor/olive/issues/221 and https://github.com/knapsu/plex-media-player-appimage/issues/14
# Workaround for:
# symbol lookup error: /lib/x86_64-linux-gnu/libgnutls.so.30: undefined symbol: __gmpz_limbs_write
# https://github.com/ONLYOFFICE/appimage-desktopeditors/issues/3
# Apparently coreutils depends on it, so it should be safe to assume that it comes with every target system
libgmp.so.10
#!/bin/bash
# CI runner: execute the OneFlow "modules" test suite with pytest.
set -xe
export PYTHONUNBUFFERED=1
src_dir=${ONEFLOW_SRC_DIR:-"$PWD"}
ONEFLOW_TEST_DIR=${ONEFLOW_TEST_DIR:-"$PWD/python/oneflow/test/modules"}
cd $ONEFLOW_TEST_DIR
# With GPUs: one pytest-xdist worker per visible GPU, each pinned through
# CUDA_VISIBLE_DEVICES. CPU-only: let xdist size the worker pool itself.
if [ -z "$ONEFLOW_TEST_CPU_ONLY" ]
then
gpu_num=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
for ((i=0;i<gpu_num;i++)); do
parallel_spec="$parallel_spec --tx popen//env:CUDA_VISIBLE_DEVICES=${i}"
done
else
parallel_spec="-n auto"
fi
# Proxies can interfere with the local RPC traffic of distributed tests.
unset HTTP_PROXY
unset HTTPS_PROXY
unset http_proxy
unset https_proxy
export ONEFLOW_TEST_DEVICE_NUM=1
COMMON_PYTEST_ARGS="--max-worker-restart=0 -x --durations=50 --capture=sys"
# Single-device pass, distributing whole files across workers (--dist loadfile).
python3 -m pytest ${COMMON_PYTEST_ARGS} --failed-first --dist loadfile ${parallel_spec} ${PWD}
# Multi-device passes require the gRPC RPC backend; otherwise just assert it is absent.
if [[ "$(python3 -c 'import oneflow.sysconfig;print(oneflow.sysconfig.has_rpc_backend_grpc())')" == *"True"* ]]; then
export ONEFLOW_TEST_DEVICE_NUM=2
python3 -m oneflow.distributed.launch --nproc_per_node 2 -m pytest ${COMMON_PYTEST_ARGS} ${PWD}
export ONEFLOW_TEST_DEVICE_NUM=4
python3 -m oneflow.distributed.launch --nproc_per_node 4 -m pytest ${COMMON_PYTEST_ARGS} ${PWD}
else
python3 -c 'import oneflow.sysconfig;assert(oneflow.sysconfig.has_rpc_backend_grpc() == False)'
fi
#!/bin/bash
# CI runner: execute the legacy "ops" test suite, copying tests into a scratch
# directory first so generated artifacts stay out of the source tree.
set -xe
export TF_CPP_MIN_LOG_LEVEL=3
export PYTHONUNBUFFERED=1
src_dir=${ONEFLOW_SRC_DIR:-"$PWD"}
test_dir=${ONEFLOW_TEST_DIR:-"$PWD/python/oneflow/test/ops"}
test_tmp_dir=${ONEFLOW_TEST_TMP_DIR:-"./test_tmp_dir"}
export ONEFLOW_TEST_UTILS_DIR=$src_dir/python/oneflow/test_utils
rm -rf $test_tmp_dir
mkdir -p $test_tmp_dir
cp -r $test_dir $test_tmp_dir
cd ${test_tmp_dir}/$(basename $test_dir)
gpu_num=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
# Single-device pass scheduled across GPUs by the custom parallel runner.
export ONEFLOW_TEST_DEVICE_NUM=1
python3 $src_dir/ci/test/parallel_run.py \
--gpu_num=${gpu_num} \
--dir=${PWD} \
--timeout=1 \
--verbose \
--chunk=1
# Multi-device passes run sequentially via plain unittest discovery.
export ONEFLOW_TEST_DEVICE_NUM=2
python3 -m unittest discover ${PWD} --failfast --verbose
export ONEFLOW_TEST_DEVICE_NUM=4
python3 -m unittest discover ${PWD} --failfast --verbose
#!/bin/bash
# CI runner: module tests with higher parallelism — several pytest-xdist
# workers per GPU for the 1-device pass, and multiple concurrent
# oneflow.distributed.launch instances (via multi_launch.py) for the
# 2- and 4-device passes.
set -xe
export PYTHONUNBUFFERED=1
src_dir=${ONEFLOW_SRC_DIR:-"$PWD"}
ONEFLOW_TEST_DIR=${ONEFLOW_TEST_DIR:-"$PWD/python/oneflow/test/modules"}
ONEFLOW_TEST_TASKS_PER_GPU=${ONEFLOW_TEST_TASKS_PER_GPU:-"4"}
if [ -z "$ONEFLOW_TEST_CPU_ONLY" ]
then
gpu_num=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
# Spawn ONEFLOW_TEST_TASKS_PER_GPU xdist workers pinned to each GPU.
for ((i=0;i<gpu_num;i++)); do
for ((j=0;j<ONEFLOW_TEST_TASKS_PER_GPU;j++)); do
parallel_spec="$parallel_spec --tx popen//env:CUDA_VISIBLE_DEVICES=${i}"
done
done
multi_launch_device_num=${gpu_num}
else
parallel_spec="-n auto"
# CPU-only: pretend there are 8 devices so multi_launch still forms groups.
multi_launch_device_num=8
fi
# Proxies can interfere with the local RPC traffic of distributed tests.
unset HTTP_PROXY
unset HTTPS_PROXY
unset http_proxy
unset https_proxy
export ONEFLOW_TEST_DEVICE_NUM=1
COMMON_PYTEST_ARGS="-p no:warnings -p no:randomly -p no:cacheprovider --max-worker-restart=0 -x --durations=50 --capture=sys --ignore=log"
time python3 -m pytest ${COMMON_PYTEST_ARGS} --dist loadfile ${parallel_spec} ${ONEFLOW_TEST_DIR}
# Multi-device passes require the gRPC RPC backend.
if [[ "$(python3 -c 'import oneflow.sysconfig;print(oneflow.sysconfig.has_rpc_backend_grpc())')" == *"True"* ]]; then
export ONEFLOW_TEST_DEVICE_NUM=2
# 2-device pass: one launch instance per listed master port (4 in total),
# rotating over device groups of size 2.
time python3 ${src_dir}/ci/test/multi_launch.py \
--files "${ONEFLOW_TEST_DIR}/**/test_*.py" \
--master_port 29500 \
--master_port 29501 \
--master_port 29502 \
--master_port 29503 \
-n master_port \
--group_size 2 \
--auto_cuda_visible_devices \
--device_num $multi_launch_device_num \
-m oneflow.distributed.launch --nproc_per_node 2 -m pytest ${COMMON_PYTEST_ARGS}
export ONEFLOW_TEST_DEVICE_NUM=4
# 4-device pass: 4 launch instances with auto-generated ports, groups of 4.
time python3 ${src_dir}/ci/test/multi_launch.py \
--files "${ONEFLOW_TEST_DIR}/**/test_*.py" \
-n 4 \
--group_size 4 \
--device_num $multi_launch_device_num \
--auto_cuda_visible_devices \
-m oneflow.distributed.launch --nproc_per_node 4 -m pytest ${COMMON_PYTEST_ARGS}
else
python3 -c 'import oneflow.sysconfig;assert(oneflow.sysconfig.has_rpc_backend_grpc() == False)'
fi
#!/bin/bash
# CI runner: exception-message tests. Each test_*.py is run as a standalone
# unittest script from a scratch copy of the test directory.
set -xe
export PYTHONUNBUFFERED=1
src_dir=${ONEFLOW_SRC_DIR:-"$PWD"}
test_dir="$PWD/python/oneflow/test/exceptions"
test_tmp_dir=${ONEFLOW_TEST_TMP_DIR:-"./test_tmp_dir"}
export ONEFLOW_TEST_UTILS_DIR=$src_dir/python/oneflow/test_utils
rm -rf $test_tmp_dir
mkdir -p $test_tmp_dir
cp -r $test_dir $test_tmp_dir
cd ${test_tmp_dir}/$(basename $test_dir)
# Debug mode makes OneFlow emit the full error messages under test.
export ONEFLOW_DEBUG_MODE=1
for file in $(ls ${PWD}/test_*.py)
do
if test -f $file
then
export ONEFLOW_TEST_DEVICE_NUM=1
python3 $file --failfast --verbose
# Repeat on 2 and 4 devices when the gRPC RPC backend is available.
if [[ "$(python3 -c 'import oneflow.sysconfig;print(oneflow.sysconfig.has_rpc_backend_grpc())')" == *"True"* ]]; then
export ONEFLOW_TEST_DEVICE_NUM=2
python3 -m oneflow.distributed.launch --nproc_per_node 2 $file --failfast --verbose
export ONEFLOW_TEST_DEVICE_NUM=4
python3 -m oneflow.distributed.launch --nproc_per_node 4 $file --failfast --verbose
else
python3 -c 'import oneflow.sysconfig;assert(oneflow.sysconfig.has_rpc_backend_grpc() == False)'
fi
fi
done
unset ONEFLOW_DEBUG_MODE
"""
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
"""
This file is mostly copied from PyTorch v1.8.1 torch/distributed/launch.py
"""
import asyncio
import os
import random
import sys
from argparse import REMAINDER, ArgumentParser
from typing import IO, Any, List, Optional
import glob
import hashlib
from math import ceil
stdout_filename = "stdout"
stderr_filename = "stderr"

# Progress counters shared by the coroutines below. `global` declarations at
# module scope are no-ops in Python, so plain assignments suffice here (the
# original had redundant `global PARALLEL_NUM` / `global SUCCESS_NUM` lines).
PARALLEL_NUM = 0
SUCCESS_NUM = 0
def parse_args():
    """
    Helper function parsing the command line options.

    Builds the CLI of multi_launch: a tool that starts several
    `oneflow.distributed.launch` instances in parallel, each with its own
    master port and (optionally) its own CUDA_VISIBLE_DEVICES group.

    @retval ArgumentParser
    """
    parser = ArgumentParser(
        description="helper to start multiple distributed launches in parallel"
    )
    # Glob patterns (e.g. "dir/**/test_*.py"); expanded with glob(recursive=True).
    parser.add_argument(
        "--files",
        type=str,
        help="files to run, support pattern",
        required=True,
        nargs="+",
    )
    # Number of consecutive device ids assigned to each launch instance.
    parser.add_argument(
        "--group_size",
        type=int,
        help="for one command, how many duplications to run",
        required=True,
    )
    parser.add_argument(
        "--device_num", type=int, help="how many devices to run on", required=True,
    )
    # Either an integer, or the literal "master_port" to derive the count from
    # the number of --master_port flags given.
    parser.add_argument(
        "-n",
        "--parallel_num",
        type=str,
        help="how many launches, could be a number, or 'master_port'",
        required=True,
    )
    # When set, each child process gets CUDA_VISIBLE_DEVICES limited to its group.
    parser.add_argument(
        "--auto_cuda_visible_devices",
        action="store_true",
        required=False,
        default=False,
    )
    parser.add_argument(
        "--shuffle", action="store_true", required=False, default=False,
    )
    parser.add_argument(
        "--verbose", action="store_true", required=False, default=False,
    )
    parser.add_argument(
        "--master_port",
        default=[],
        action="append",
        help="Master node (rank 0)'s free port, pass this multiple `--master_port` to launch more instances",
    )
    parser.add_argument(
        "-m",
        "--module",
        default=False,
        action="store_true",
        help="Changes each process to interpret the launch script as a python module, executing with the same behavior as'python -m'.",
    )
    # Everything after the script path is forwarded verbatim (REMAINDER).
    parser.add_argument(
        "training_script",
        type=str,
        help="The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script",
    )
    parser.add_argument("training_script_args", nargs=REMAINDER)
    return parser.parse_args()
async def run_and_capture(cmd=None, prefix=None, **kwargs):
    """Run `cmd` (argv list), streaming its combined stdout/stderr line by
    line with `prefix` prepended, and assert that it exits with code 0.

    Increments the module-level SUCCESS_NUM counter on success so overall
    progress can be reported as (SUCCESS_NUM/PARALLEL_NUM).
    """
    proc = await asyncio.create_subprocess_exec(
        *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.STDOUT, **kwargs
    )
    while True:
        line = await proc.stdout.readline()
        if not line:
            # EOF: stop before printing. The original printed first and then
            # checked for EOF, emitting a spurious bare-prefix line at the end.
            break
        print(prefix, line.decode(), end="")
    await proc.wait()
    assert proc.returncode == 0, prefix
    global SUCCESS_NUM
    SUCCESS_NUM += 1
    print(f"{prefix} succeed ({SUCCESS_NUM}/{PARALLEL_NUM})")
async def launch_multiple(
    cmds=None, group_size=None, auto_cuda_env=False, device_num=None
):
    """Spawn every command in `cmds` concurrently, rotating them over device
    groups of `group_size` consecutive ids (e.g. ["0","1"], ["2","3"], ...).

    When `auto_cuda_env` is true each child gets CUDA_VISIBLE_DEVICES
    restricted to its assigned group; otherwise the parent environment is
    passed through unchanged.
    """
    # Partition [0, device_num) into consecutive groups of group_size ids.
    visible_groups = [
        [str(x) for x in range(device_num)[i : i + group_size]]  # to get ["0", "1"]
        for i in range(0, device_num, group_size)
    ]
    spawns = []
    for i, cmd in enumerate(cmds):
        # Round-robin assignment of commands to device groups.
        group_idx = i % len(visible_groups)
        cuda_visible_devices = ",".join(visible_groups[group_idx])
        print(cuda_visible_devices, cmd, "\n")
        env = os.environ
        if auto_cuda_env:
            env = dict(env, CUDA_VISIBLE_DEVICES=cuda_visible_devices)
        process = run_and_capture(
            cmd=cmd, prefix=f"[wg={i}][device={cuda_visible_devices}]", env=env,
        )
        spawns.append(process)
    # Run all launches concurrently; run_and_capture asserts each exit code.
    await asyncio.gather(*spawns)
def main():
    """Entry point: glob the test files, split them into one chunk per launch
    instance, and run all `oneflow.distributed.launch` instances concurrently.
    """
    args = parse_args()
    # Find files and chunk them (fixed typo: was "chuck").
    files = []
    for f in args.files:
        files += list(glob.glob(f, recursive=True))
    print("total files:", len(files))
    # Deterministic pseudo-random order: sort by md5 of the basename. Fixed:
    # encode the basename string, rather than taking basename of encoded bytes.
    files = sorted(
        files,
        key=lambda x: hashlib.md5(os.path.basename(x).encode("ascii")).hexdigest(),
    )
    if args.shuffle:
        random.shuffle(files)
    # Short fingerprint of the file set, surfaced in CI logs for debugging.
    files_hash = hashlib.md5(
        "".join([os.path.basename(x) for x in files]).encode()
    ).hexdigest()[:8]
    if args.verbose:
        print(
            f"::warning file=testFilesHash,line={len(files)},col=0,endColumn=0::shuffle-{args.shuffle}-group_size-{args.group_size}-md5-{files_hash}"
        )
    # Instance count: either one per --master_port flag, or an explicit count
    # with consecutive ports auto-generated from 29500.
    if args.parallel_num == "master_port":
        parallel_num = len(args.master_port)
        master_ports = args.master_port
    else:
        parallel_num = int(args.parallel_num)
        if parallel_num != len(args.master_port):
            print(
                "warning", "parallel_num != len(args.master_port)", "will auto generate"
            )
        default_master_port = 29500
        master_ports = list(
            range(default_master_port, default_master_port + parallel_num)
        )
    assert parallel_num > 0
    assert len(master_ports) == parallel_num
    chunk_size = ceil(len(files) / parallel_num)
    global PARALLEL_NUM
    PARALLEL_NUM = parallel_num
    chunks = [files[i : i + chunk_size] for i in range(0, len(files), chunk_size)]
    # Check args: only oneflow.distributed.launch is supported as the module.
    assert args.training_script == "oneflow.distributed.launch"
    # Generate one command per (master_port, chunk) pair.
    cmds = [
        [sys.executable, "-m", args.training_script, "--master_port", str(master_port)]
        + args.training_script_args
        + chunk
        for (master_port, chunk) in zip(master_ports, chunks)
    ]
    loop = asyncio.get_event_loop()
    processes = launch_multiple(
        cmds=cmds,
        auto_cuda_env=args.auto_cuda_visible_devices,
        group_size=args.group_size,
        device_num=args.device_num,
    )
    loop.run_until_complete(processes)


if __name__ == "__main__":
    main()
import asyncio
import os
import argparse
from subprocess import PIPE, STDOUT
import glob
import sys
import time
import socket
from contextlib import closing
import uuid
def gen_cmds(cmd=None, dir=None, doctest=False):
    """Build the list of shell commands the runner will execute.

    With doctest=True: every .py file under `dir` (recursively) containing
    "import doctest" — excluding compatible/single_client/unittest.py paths —
    is run verbosely. Otherwise: each top-level test_*.py in `dir` is run as
    a unittest script with --failfast --verbose.
    """
    if doctest:
        candidates = glob.glob(os.path.join(dir, "**/*.py"), recursive=True)
        skip_tokens = ("compatible", "single_client", "unittest.py")
        candidates = [
            path
            for path in candidates
            if not any(token in path for token in skip_tokens)
        ]
        with_doctest = []
        for path in candidates:
            with open(path) as src:
                if "import doctest" in src.read():
                    with_doctest.append("{} {} -v".format(cmd, path))
        print(with_doctest)
        return with_doctest
    scripts = glob.glob(os.path.join(dir, "test_*.py"), recursive=False)
    return ["{} {} --failfast --verbose".format(cmd, script) for script in scripts]
def find_free_port():
    """Ask the OS for a free ephemeral TCP port on localhost and return it.

    Fixed: SO_REUSEADDR is now set BEFORE bind — the original set it after
    binding, where it has no effect on the bind itself. Note the usual race
    remains: the port may be reclaimed between this call and its later use.
    """
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        s.bind(("localhost", 0))
        return s.getsockname()[1]
def split_and_print(prefix, text):
    """Print `text` with `prefix` prepended to every line, in a single
    flushed print call (keeps multi-line output contiguous in the log)."""
    decorated = "".join(
        f"{prefix} {line}" for line in text.splitlines(keepends=True)
    )
    print(decorated, flush=True)
def everyN(l: list, n: int):
    """Yield consecutive slices of `l` of length `n` (last may be shorter)."""
    start = 0
    while start < len(l):
        yield l[start : start + n]
        start += n
def contains_oom_info(txt: str):
    """Heuristic: does `txt` look like an out-of-memory / allocation failure?"""
    markers = ("memory", "Memory", "CUDNN", "ALLOC")
    return any(marker in txt for marker in markers)
def should_retry(txt: str):
    """A failed command is only worth retrying (at lower concurrency) when
    its output looks OOM-related; see contains_oom_info."""
    verdict = contains_oom_info(txt)
    return verdict
def print_out(prefix: str = "", content: str = ""):
    """Print every line of `content`, each tagged with `[prefix]`."""
    for line in content.split("\n"):
        print(f"[{prefix}]", line)
async def spawn_shell_and_check(cmd: str = None, gpu_id: int = -1, check: bool = False):
    """Run `cmd` in a shell pinned to `gpu_id` (or to no GPU when CPU-only),
    capturing its combined stdout/stderr.

    Each child gets a fresh free master port and a unique log directory so
    concurrent runs do not collide. When `check` is true — or when the output
    does NOT look OOM-related (see should_retry) — a non-zero exit dumps the
    captured output and raises RuntimeError; OOM-looking failures are instead
    reported through the returned dict so the caller can retry them later.

    Returns {"returncode", "cmd", "stdout"}.
    """
    is_cpu_only = os.getenv("ONEFLOW_TEST_CPU_ONLY")
    print(f"[gpu={gpu_id}]", cmd)
    p = await asyncio.create_subprocess_shell(
        cmd,
        stdout=PIPE,
        stderr=STDOUT,
        env=dict(
            os.environ,
            # CUDA_VISIBLE_DEVICES=-1 hides all GPUs on CPU-only runs.
            CUDA_VISIBLE_DEVICES=("-1" if is_cpu_only else ",".join([str(gpu_id)])),
            ONEFLOW_TEST_MASTER_PORT=str(find_free_port()),
            ONEFLOW_TEST_LOG_DIR=("./unittest-log-" + str(uuid.uuid4())),
        ),
    )
    (stdout_data, stderr_data) = await p.communicate()
    decoded = stdout_data.decode()
    if check or should_retry(decoded) == False:
        if p.returncode != 0:
            print_out(prefix=cmd, content=decoded)
            raise RuntimeError(cmd)
    return {"returncode": p.returncode, "cmd": cmd, "stdout": decoded}
async def run_cmds(
    cmds, gpu_num=0, timeout=10, chunk=1, verbose=False, per_gpu_process_num=1
):
    """Run every command in `cmds`, at most per_gpu_process_num * gpu_num at
    a time, assigning each concurrent command a gpu_id in [0, gpu_num).

    Returns the list of commands whose process exited non-zero — candidates
    for a retry at lower concurrency (see the __main__ retry ladder).
    NOTE(review): `timeout`, `chunk` and `verbose` are accepted but unused here.
    """
    is_cpu_only = os.getenv("ONEFLOW_TEST_CPU_ONLY")
    if is_cpu_only:
        # CPU-only: reuse the "gpu" slots purely as a concurrency limit.
        gpu_num = os.cpu_count()
    fails = []
    assert gpu_num > 0
    for cmdN in everyN(cmds, per_gpu_process_num * gpu_num):
        # Within one wave, commands are grouped per sub-batch of gpu_num;
        # the index inside each sub-batch becomes the gpu_id.
        results = await asyncio.gather(
            *[
                spawn_shell_and_check(
                    cmd=cmd, gpu_id=i, check=(per_gpu_process_num == 1)
                )
                for cmd_gpu_num in everyN(cmdN, gpu_num)
                for (i, cmd) in enumerate(cmd_gpu_num)
            ],
        )
        for r in list(results):
            if r["returncode"] != 0:
                fails.append(r["cmd"])
            else:
                print_out(prefix=r["cmd"], content=r["stdout"])
    return fails
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--gpu_num", type=int, required=True, default=0)
    parser.add_argument("--dir", type=str, required=True, default=".")
    parser.add_argument("--cmd", type=str, required=False, default=sys.executable)
    parser.add_argument("--timeout", type=int, required=False, default=2)
    parser.add_argument("--chunk", type=int, required=True)
    parser.add_argument("--verbose", action="store_true", required=False, default=False)
    parser.add_argument("--doctest", action="store_true", required=False, default=False)
    args = parser.parse_args()
    cmds = gen_cmds(cmd=args.cmd, dir=args.dir, doctest=args.doctest)
    start = time.time()
    loop = asyncio.get_event_loop()
    # Retry ladder: begin at high per-GPU concurrency; commands that fail
    # (typically OOM under high concurrency) are retried with fewer processes
    # per GPU, ending in a strict single-process pass that raises on failure.
    PER_GPU_PROCESS_NUMS = [12, 8, 2, 1]
    is_cpu_only = os.getenv("ONEFLOW_TEST_CPU_ONLY")
    if is_cpu_only:
        PER_GPU_PROCESS_NUMS = [1]
    for per_gpu_process_num in PER_GPU_PROCESS_NUMS:
        print("[per_gpu_process_num]", per_gpu_process_num)
        # run_cmds returns the failed commands; feed them into the next round.
        cmds = loop.run_until_complete(
            run_cmds(
                cmds,
                gpu_num=args.gpu_num,
                timeout=args.timeout,
                chunk=args.chunk,
                verbose=args.verbose,
                per_gpu_process_num=per_gpu_process_num,
            )
        )
    elapsed = time.time() - start
    elapsed_time_txt = time.strftime("elapsed: %H:%M:%S", time.gmtime(elapsed))
    print(elapsed_time_txt)
# Print a full backtrace for every core dump found in directory $2,
# using $1 as the executable that produced the cores.
set -ex
if compgen -G "$2/core.*" > /dev/null; then
gdb --batch --quiet -ex "thread apply all bt full" -ex "quit" $1 $2/core.*
fi
# Print a full backtrace for every core file under the current tree
# (assumes python3 was the crashing executable).
set -ex
find . -type f -name "core.*" -exec gdb --batch --quiet -ex "thread apply all bt full" -ex "quit" python3 {} \;
{
"version": {
"major": 1,
"minor": 0
},
"local": [
{
"vram": [
{
"id": "0",
"slots": 8117
}
]
}
]
}
{
"version": {
"major": 1,
"minor": 0
},
"local": [
{
"vram": [
{
"id": "0",
"slots": 7982
},
{
"id": "1",
"slots": 7982
}
]
}
]
}
{
"version": {
"major": 1,
"minor": 0
},
"local": [
{
"vram": [
{
"id": "0",
"slots": 11019
},
{
"id": "1",
"slots": 11019
},
{
"id": "2",
"slots": 11019
},
{
"id": "3",
"slots": 11019
}
]
}
]
}
#!/usr/bin/env bash
# CI smoke test: train resnet50 from oneflow/models for a batch-limited run —
# first DDP on 1 process, then graph mode with AMP/fusions on 2 processes.
set -ex
cd $ONEFLOW_MODELS_DIR
OFRECORD_PATH=/dataset/imagenette/ofrecord
# If the dataset lacks a train/ subdirectory, build one locally via symlink.
if [ ! -d "/dataset/imagenette/ofrecord/train" ];then
mkdir -p ./dataset/ofrecord
ln -s /dataset/imagenette/ofrecord ./dataset/ofrecord/train
OFRECORD_PATH=./dataset/ofrecord
fi
python3 -m oneflow.distributed.launch --nproc_per_node 1 --nnodes 1 --node_rank 0 --master_addr 127.0.0.1 Vision/classification/image/resnet50/train.py --ofrecord-path $OFRECORD_PATH --ofrecord-part-num 1 --num-devices-per-node 1 --lr 0.004 --momentum 0.875 --num-epochs 1 --train-batch-size 4 --val-batch-size 50 --print-interval 10 --exit-num 1 --ddp
python3 -m oneflow.distributed.launch --nproc_per_node 2 --nnodes 1 --node_rank 0 --master_addr 127.0.0.1 Vision/classification/image/resnet50/train.py --ofrecord-path $OFRECORD_PATH --ofrecord-part-num 2 --num-devices-per-node 1 --lr 0.004 --momentum 0.875 --num-epochs 1 --train-batch-size 4 --val-batch-size 50 --print-interval 10 --exit-num 1 --use-fp16 --channel-last --scale-grad --graph --fuse-bn-relu --fuse-bn-add-relu --use-gpu-decode
#!/usr/bin/env bash
# CI benchmark: compare OneFlow vs PyTorch speed on resnet50 (several batch
# sizes, single-process and 2-process DDP) and the swin dataloader. Each
# failed check contributes a score; the job only fails when the accumulated
# score reaches 2, so a single minor regression is tolerated.
set -uxo pipefail
rc=0
# accumulate the score of every test
trap 'rc=$(($rc + $?))' ERR
cd $ONEFLOW_MODELS_DIR
# Pass when the "Relative speed" value on stdin is >= $1; on failure the
# pipeline stage exits with score $2 (default 1), picked up by the ERR trap.
function check_relative_speed {
# Default score is 1
SCORE=${2:-1}
awk -F'[:(]' -v threshold=$1 -v score=$SCORE 'BEGIN { ret=2 } /Relative speed/{ if ($2 >= threshold) { printf "✔️ "; ret=0 } else { printf "❌ "; ret=score }} {print $0} END { exit ret }'
}
# Pass when OneFlow's reported time in ms (parsed from the "OneFlow" line)
# is <= $1; on failure exit with score $2 (default 1).
function check_millisecond_time {
# Default score is 1
SCORE=${2:-1}
awk -F'[:(]' -v threshold=$1 -v score=$SCORE 'BEGIN { ret=2 } /OneFlow/{ if (substr($2, 2, length($2) - 4) <= threshold) { printf "✔️ "; ret=0 } else { printf "❌ "; ret=score }} { print $0 } END { exit ret }'
}
# Append stdin to the `result` file while echoing it to the log.
function write_to_file_and_print {
tee -a result
printf "\n" >> result
}
python3 scripts/compare_speed_with_pytorch.py Vision/classification/image/resnet50/models/resnet50.py resnet50 16x3x224x224 --no-show-memory --times 100 | check_relative_speed 1.05 | check_millisecond_time 129.0 2 | write_to_file_and_print
python3 scripts/compare_speed_with_pytorch.py Vision/classification/image/resnet50/models/resnet50.py resnet50 8x3x224x224 --no-show-memory --times 100 | check_relative_speed 1.04 | write_to_file_and_print
python3 scripts/compare_speed_with_pytorch.py Vision/classification/image/resnet50/models/resnet50.py resnet50 4x3x224x224 --no-show-memory --times 200 | check_relative_speed 1.01 | write_to_file_and_print
python3 scripts/compare_speed_with_pytorch.py Vision/classification/image/resnet50/models/resnet50.py resnet50 2x3x224x224 --no-show-memory --times 200 | check_relative_speed 0.99 | write_to_file_and_print
python3 scripts/compare_speed_with_pytorch.py Vision/classification/image/resnet50/models/resnet50.py resnet50 1x3x224x224 --no-show-memory --times 200 | check_relative_speed 0.95 | write_to_file_and_print
python3 scripts/swin_dataloader_compare_speed_with_pytorch.py --batch_size 32 --num_workers 1 | write_to_file_and_print
python3 scripts/swin_dataloader_compare_speed_with_pytorch.py --batch_size 32 --num_workers 4 | write_to_file_and_print
python3 scripts/swin_dataloader_compare_speed_with_pytorch.py --batch_size 32 --num_workers 8 | write_to_file_and_print
export OMP_NUM_THREADS=1
python3 -m oneflow.distributed.launch --nproc_per_node 2 scripts/compare_speed_with_pytorch.py Vision/classification/image/resnet50/models/resnet50.py resnet50 16x3x224x224 --no-show-memory --times 100 --ddp | check_relative_speed 1.12 | check_millisecond_time 136.3 2 | write_to_file_and_print
python3 -m oneflow.distributed.launch --nproc_per_node 2 scripts/compare_speed_with_pytorch.py Vision/classification/image/resnet50/models/resnet50.py resnet50 8x3x224x224 --no-show-memory --times 100 --ddp | check_relative_speed 1.1 | write_to_file_and_print
python3 -m oneflow.distributed.launch --nproc_per_node 2 scripts/compare_speed_with_pytorch.py Vision/classification/image/resnet50/models/resnet50.py resnet50 4x3x224x224 --no-show-memory --times 200 --ddp | check_relative_speed 1.18 | write_to_file_and_print
python3 -m oneflow.distributed.launch --nproc_per_node 2 scripts/compare_speed_with_pytorch.py Vision/classification/image/resnet50/models/resnet50.py resnet50 2x3x224x224 --no-show-memory --times 200 --ddp | check_relative_speed 1.18 | write_to_file_and_print
python3 -m oneflow.distributed.launch --nproc_per_node 2 scripts/compare_speed_with_pytorch.py Vision/classification/image/resnet50/models/resnet50.py resnet50 1x3x224x224 --no-show-memory --times 200 --ddp | check_relative_speed 1.15 | write_to_file_and_print
# Prepend the GPU model to the accumulated results before exporting them.
result="GPU Name: `nvidia-smi --query-gpu=name --format=csv,noheader -i 0` \n\n `cat result`"
# escape newline for github actions: https://github.community/t/set-output-truncates-multiline-strings/16852/2
# note that we escape \n and \r to \\n and \\r (i.e. raw string "\n" and "\r") instead of %0A and %0D,
# so that they can be correctly handled in javascript code
result="${result//'%'/'%25'}"
result="${result//$'\n'/'\\n'}"
result="${result//$'\r'/'\\r'}"
echo "::set-output name=stats::$result"
# Only fail when the sum of score >= 2
if (( $rc >= 2 ))
then
exit 1
else
exit 0
fi
#!/bin/bash
# Install the OneFlow package for CI, trying in order: a pip index/links URL,
# a local wheelhouse directory, a single wheel file, then an editable source
# install. Missing everything is logged but tolerated.
set -xe
src_dir=${ONEFLOW_SRC_DIR:-"$PWD"}
wheel_path=${ONEFLOW_WHEEL_PATH:-"$PWD/wheelhouse"}
index=${ONEFLOW_PIP_INDEX}
pkg_name=${ONEFLOW_PACKAGE_NAME:-"oneflow"}
if [ -n "$index" ]; then
python3 -m pip install --find-links ${index} ${pkg_name}
elif [ -d "$wheel_path" ]; then
ls -la $wheel_path
export PATH=/root/.local/bin:$PATH
# pipindex generates a PEP 503 "simple" index over the local wheelhouse so
# pip can resolve the wheel through --extra-index-url.
python3 -m pip install https://oneflow-static.oss-cn-beijing.aliyuncs.com/pipindex/pipindex-0.1.3-py2.py3-none-any.whl --user
pipindex build $wheel_path
python3 -m pip install -U --user --extra-index-url file://${wheel_path}/simple ${pkg_name}
elif [ -e "$wheel_path" ]; then
python3 -m pip install --user "$wheel_path"
elif [ -d "$src_dir" ]; then
python3 -m pip install -e "$src_dir" --user
else
echo "wheel not found: $wheel_path, src dir not found: $src_dir, continue anyway..."
fi
# Cache-initialization preset for the CUDA CI build (use with `cmake -C <file>`).
set(BUILD_CUDA YES CACHE BOOL "")
set(BUILD_GIT_VERSION YES CACHE BOOL "")
set(BUILD_TESTING OFF CACHE BOOL "")
set(BUILD_RDMA YES CACHE BOOL "")
set(TREAT_WARNINGS_AS_ERRORS YES CACHE BOOL "")
set(THIRD_PARTY_MIRROR aliyun CACHE STRING "")
set(PIP_INDEX_MIRROR "https://pypi.tuna.tsinghua.edu.cn/simple" CACHE STRING "")
set(CMAKE_BUILD_TYPE Release CACHE STRING "")
set(CMAKE_GENERATOR Ninja CACHE STRING "")
# Real (non-PTX) SASS for Pascal through Ampere.
set(CMAKE_CUDA_ARCHITECTURES "61-real;70-real;75-real;80-real;86-real" CACHE STRING "")
set(CUDNN_STATIC OFF CACHE BOOL "")
set(WITH_MLIR ON CACHE BOOL "")
set(BUILD_CPP_API OFF CACHE BOOL "")
set(CUDA_NVCC_THREADS_NUMBER 8 CACHE STRING "")
# Compiler launchers: cache compilations with ccache.
set(CMAKE_C_COMPILER_LAUNCHER ccache CACHE STRING "")
set(CMAKE_CXX_COMPILER_LAUNCHER ccache CACHE STRING "")
set(CMAKE_CUDA_COMPILER_LAUNCHER ccache CACHE STRING "")
# Cache-initialization preset for the CPU-only CI build (use with `cmake -C <file>`).
set(BUILD_CUDA NO CACHE BOOL "")
set(BUILD_GIT_VERSION YES CACHE BOOL "")
set(BUILD_TESTING YES CACHE BOOL "")
set(WITH_ONEDNN YES CACHE BOOL "")
set(TREAT_WARNINGS_AS_ERRORS YES CACHE BOOL "")
set(THIRD_PARTY_MIRROR aliyun CACHE STRING "")
set(PIP_INDEX_MIRROR "https://pypi.tuna.tsinghua.edu.cn/simple" CACHE STRING "")
set(CMAKE_BUILD_TYPE Release CACHE STRING "")
set(CMAKE_GENERATOR Ninja CACHE STRING "")
set(BUILD_CPP_API ON CACHE BOOL "")
set(WITH_MLIR ON CACHE BOOL "")
set(BUILD_FOR_CI ON CACHE BOOL "")
set(BUILD_SHARED_LIBS ON CACHE BOOL "")
# Compiler launchers: cache compilations with ccache.
set(CMAKE_C_COMPILER_LAUNCHER ccache CACHE STRING "")
set(CMAKE_CXX_COMPILER_LAUNCHER ccache CACHE STRING "")
# CPU_THREADING_RUNTIME holds a string value ("TBB"/"SEQ"/...), so its cache
# entry type must be STRING — the original declared it as BOOL.
set(CPU_THREADING_RUNTIME "TBB" CACHE STRING "")
# Cache-initialization preset for the CUDA CI build with the LOCAL RPC backend
# (no gRPC, so no multi-node tests). Use with `cmake -C <file>`.
set(BUILD_CUDA YES CACHE BOOL "")
set(BUILD_GIT_VERSION YES CACHE BOOL "")
set(BUILD_TESTING YES CACHE BOOL "")
set(BUILD_RDMA YES CACHE BOOL "")
set(TREAT_WARNINGS_AS_ERRORS YES CACHE BOOL "")
set(THIRD_PARTY_MIRROR aliyun CACHE STRING "")
set(PIP_INDEX_MIRROR "https://pypi.tuna.tsinghua.edu.cn/simple" CACHE STRING "")
set(CMAKE_BUILD_TYPE Release CACHE STRING "")
set(CMAKE_GENERATOR Ninja CACHE STRING "")
# Compiler launchers: cache compilations with ccache.
set(CMAKE_C_COMPILER_LAUNCHER ccache CACHE STRING "")
set(CMAKE_CXX_COMPILER_LAUNCHER ccache CACHE STRING "")
set(CMAKE_CUDA_COMPILER_LAUNCHER ccache CACHE STRING "")
set(CMAKE_CUDA_ARCHITECTURES "61;75" CACHE STRING "")
set(CUDNN_STATIC OFF CACHE BOOL "")
set(RPC_BACKEND "LOCAL" CACHE STRING "")
set(CUDA_NVCC_THREADS_NUMBER 8 CACHE STRING "")
# Cache-initialization preset for the CUDA CI build with statically linked
# cuDNN, MLIR and the C++ API enabled. Use with `cmake -C <file>`.
set(BUILD_CUDA YES CACHE BOOL "")
set(BUILD_GIT_VERSION YES CACHE BOOL "")
set(BUILD_TESTING YES CACHE BOOL "")
set(BUILD_RDMA YES CACHE BOOL "")
set(TREAT_WARNINGS_AS_ERRORS YES CACHE BOOL "")
set(THIRD_PARTY_MIRROR aliyun CACHE STRING "")
set(PIP_INDEX_MIRROR "https://pypi.tuna.tsinghua.edu.cn/simple" CACHE STRING "")
set(CMAKE_BUILD_TYPE Release CACHE STRING "")
set(CMAKE_GENERATOR Ninja CACHE STRING "")
# Compiler launchers: cache compilations with ccache.
set(CMAKE_C_COMPILER_LAUNCHER ccache CACHE STRING "")
set(CMAKE_CXX_COMPILER_LAUNCHER ccache CACHE STRING "")
set(CMAKE_CUDA_COMPILER_LAUNCHER ccache CACHE STRING "")
set(CMAKE_CUDA_ARCHITECTURES "61;75" CACHE STRING "")
set(CUDNN_STATIC ON CACHE BOOL "")
set(WITH_MLIR ON CACHE BOOL "")
set(BUILD_CPP_API ON CACHE BOOL "")
set(CUDA_NVCC_THREADS_NUMBER 8 CACHE STRING "")
# Cache-initialization preset for the clang + lld CPU-only CI build
# (use with `cmake -C <file>`).
set(BUILD_FOR_CI ON CACHE BOOL "")
# CPU_THREADING_RUNTIME holds a string value ("SEQ"/"TBB"/...), so its cache
# entry type must be STRING — the original declared it as BOOL.
set(CPU_THREADING_RUNTIME "SEQ" CACHE STRING "")
set(CMAKE_C_COMPILER "clang" CACHE STRING "")
set(CMAKE_CXX_COMPILER "clang++" CACHE STRING "")
# Link with LLVM lld for faster link times.
set(CMAKE_EXE_LINKER_FLAGS_INIT "-fuse-ld=lld" CACHE STRING "")
set(CMAKE_MODULE_LINKER_FLAGS_INIT "-fuse-ld=lld" CACHE STRING "")
set(CMAKE_SHARED_LINKER_FLAGS_INIT "-fuse-ld=lld" CACHE STRING "")
set(BUILD_SHARED_LIBS YES CACHE BOOL "")
set(BUILD_CUDA NO CACHE BOOL "")
set(BUILD_TESTING YES CACHE BOOL "")
set(CMAKE_BUILD_TYPE RelWithDebInfo CACHE STRING "")
set(CMAKE_GENERATOR Ninja CACHE STRING "")
# Compiler launchers: cache compilations with ccache.
set(CMAKE_C_COMPILER_LAUNCHER ccache CACHE STRING "")
set(CMAKE_CXX_COMPILER_LAUNCHER ccache CACHE STRING "")
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION OFF CACHE BOOL "")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment