Oneflow 0.8 for DCU

21d47d0e · yuguo · 21d47d0e · 21d47d0e · 21d47d0e · 21d47d0e
Commit 21d47d0e authored Oct 24, 2022 by yuguo
20 changed files
--- a/ci/manylinux/build-gcc7.sh
+++ b/ci/manylinux/build-gcc7.sh
+source scl_source enable devtoolset-7
+set -ex
+ONEFLOW_CI_BUILD_PARALLEL=${ONEFLOW_CI_BUILD_PARALLEL:-$(nproc)}
+gcc --version
+ld --version
+# clean python dir
+cd ${ONEFLOW_CI_SRC_DIR}
+${ONEFLOW_CI_PYTHON_EXE} -m pip install -i https://mirrors.aliyun.com/pypi/simple --user -r ci/fixed-dev-requirements.txt
+cd python
+function clean_artifacts {
+    git clean -nXd -e \!dist -e \!dist/**
+    git clean -fXd -e \!dist -e \!dist/**
+}
+clean_artifacts
+# cmake config
+mkdir -p ${ONEFLOW_CI_BUILD_DIR}
+cd ${ONEFLOW_CI_BUILD_DIR}
+find ${ONEFLOW_CI_BUILD_DIR} -name CMakeCache.txt
+find ${ONEFLOW_CI_BUILD_DIR} -name CMakeCache.txt -delete
+if [ ! -f "$ONEFLOW_CI_CMAKE_INIT_CACHE" ]; then
+    echo "$ONEFLOW_CI_CMAKE_INIT_CACHE does not exist."
+    exit 1
+fi
+export PATH="${PATH}:$(dirname ${ONEFLOW_CI_PYTHON_EXE})"
+export PYTHON_BIN_PATH=${ONEFLOW_CI_PYTHON_EXE}
+cmake -S ${ONEFLOW_CI_SRC_DIR} -C ${ONEFLOW_CI_CMAKE_INIT_CACHE} -DPython3_EXECUTABLE=${ONEFLOW_CI_PYTHON_EXE}
+# cmake build
+cd ${ONEFLOW_CI_BUILD_DIR}
+cmake --build . --parallel ${ONEFLOW_CI_BUILD_PARALLEL}
+if [ ! -z "$ONEFLOW_CI_BUILD_RUN_LIT" ]; then
+    ${ONEFLOW_CI_PYTHON_EXE} -m pip install -i https://mirrors.aliyun.com/pypi/simple --user flowvision==0.1.0
+    export PATH=$PATH:$(dirname $ONEFLOW_CI_PYTHON_EXE)
+    cmake --build . -t c1
+fi
+# build pip
+cd ${ONEFLOW_CI_SRC_DIR}
+cd python
+${ONEFLOW_CI_PYTHON_EXE} setup.py bdist_wheel
--- a/ci/manylinux/build.sh
+++ b/ci/manylinux/build.sh
+set -ex
+ONEFLOW_CI_BUILD_PARALLEL=${ONEFLOW_CI_BUILD_PARALLEL:-$(nproc)}
+gcc --version
+ld --version
+# clean python dir
+cd ${ONEFLOW_CI_SRC_DIR}
+${ONEFLOW_CI_PYTHON_EXE} -m pip install -i https://mirrors.aliyun.com/pypi/simple --user -r ci/fixed-dev-requirements.txt
+cd python
+function clean_artifacts {
+    git clean -nXd -e \!dist -e \!dist/**
+    git clean -fXd -e \!dist -e \!dist/**
+}
+clean_artifacts
+# cmake config
+mkdir -p ${ONEFLOW_CI_BUILD_DIR}
+cd ${ONEFLOW_CI_BUILD_DIR}
+find ${ONEFLOW_CI_BUILD_DIR} -name CMakeCache.txt
+find ${ONEFLOW_CI_BUILD_DIR} -name CMakeCache.txt -delete
+if [ ! -f "$ONEFLOW_CI_CMAKE_INIT_CACHE" ]; then
+    echo "$ONEFLOW_CI_CMAKE_INIT_CACHE does not exist."
+    exit 1
+fi
+cmake -S ${ONEFLOW_CI_SRC_DIR} -C ${ONEFLOW_CI_CMAKE_INIT_CACHE} -DPython3_EXECUTABLE=${ONEFLOW_CI_PYTHON_EXE}
+# cmake build
+cd ${ONEFLOW_CI_BUILD_DIR}
+cmake --build . --parallel ${ONEFLOW_CI_BUILD_PARALLEL}
+if [ ! -z "$ONEFLOW_CI_BUILD_RUN_LIT" ]; then
+    ${ONEFLOW_CI_PYTHON_EXE} -m pip install -i https://mirrors.aliyun.com/pypi/simple --user flowvision==0.1.0
+    export PATH=$PATH:$(dirname $ONEFLOW_CI_PYTHON_EXE)
+    cmake --build . -t c1
+fi
+# build pip
+cd ${ONEFLOW_CI_SRC_DIR}
+cd python
+${ONEFLOW_CI_PYTHON_EXE} setup.py bdist_wheel
--- a/ci/requirements.txt
+++ b/ci/requirements.txt
+# Python package requirements kept under ci/ (presumably installed by the CI test jobs — confirm).
+pycocotools
+# opencv-python is pinned per platform: 4.3.0.38 where sys_platform is 'darwin',
+# 4.2.0.34 on every other platform.
+opencv-python==4.3.0.38; sys_platform == 'darwin'
+opencv-python==4.2.0.34; sys_platform != 'darwin'
+scipy
+pillow
+# tensorflow-addons and tensorflow are pinned to exact versions.
+tensorflow-addons==0.13.0
+tensorflow==2.5.0
--- a/ci/reset_submodule.sh
+++ b/ci/reset_submodule.sh
+set -x
+set -e
+git reset --hard
+git submodule deinit -f .
+rm -rf .git/modules/*
--- a/ci/setup_submodule.py
+++ b/ci/setup_submodule.py
+import configparser
+import argparse
+import os
+parser = argparse.ArgumentParser()
+parser.add_argument("-s", "--oneflow_src_local_path", type=str, required=False)
+parser.add_argument("-r", "--oneflow_src_remote_url", type=str, required=False)
+args = parser.parse_args()
+assert (
+    args.oneflow_src_local_path or args.oneflow_src_remote_url
+), "require one of oneflow_src_local_path or oneflow_src_remote_url"
+config = configparser.ConfigParser()
+config.read(".gitmodules")
+for s in config.sections():
+    path = config[s]["path"]
+    if args.oneflow_src_local_path:
+        src_path = os.path.join(args.oneflow_src_local_path, path)
+        assert os.path.exists("{}/.git".format(src_path)), src_path
+        config[s]["url"] = "file://{}".format(src_path)
+    else:
+        src_path = os.path.join(args.oneflow_src_remote_url, path)
+        config[s]["url"] = src_path
+with open(".gitmodules", "w") as configfile:
+    config.write(configfile)
--- a/ci/setup_submodule.sh
+++ b/ci/setup_submodule.sh
+set -x
+set -e
+src_dir=${ONEFLOW_CI_SRC_DIR:-"$HOME/oneflow"}
+python3 ci/setup_submodule.py --oneflow_src_local_path=$src_dir
+git submodule sync
+git submodule update --init --recursive
--- a/ci/test/1node_benchmark_test.sh
+++ b/ci/test/1node_benchmark_test.sh
+# Run the single-client CNN and BERT benchmarks on one node with one GPU,
+# five iterations each.
+set -xe
+# Start from a fresh copy of the benchmark sources under /benchmarks.
+rm -rf /benchmarks
+cp -r python/oneflow/compatible/single_client/benchmarks /benchmarks
+cd /benchmarks
+# vgg16 and alexnet are launched with exactly the same settings.
+for model in "vgg16" "alexnet"
+do
+    python3 cnn_benchmark/of_cnn_benchmarks.py \
+        --gpu_num_per_node=1 \
+        --model="$model" \
+        --batch_size_per_device=8 \
+        --iter_num=5 \
+        --learning_rate=0.01 \
+        --optimizer="sgd" \
+        --loss_print_every_n_iter=1 \
+        --data_dir="/dataset/imagenet_227/train/32"
+done
+# resnet50 reading the dataset, with --gpu_image_decoder=True.
+python3 cnn_benchmark/of_cnn_benchmarks.py \
+    --gpu_num_per_node=1 \
+    --model="resnet50" \
+    --batch_size_per_device=8 \
+    --iter_num=5 \
+    --gpu_image_decoder=True \
+    --learning_rate=0.01 \
+    --optimizer="sgd" \
+    --loss_print_every_n_iter=1 \
+    --data_dir="/dataset/imagenet_227/train/32"
+# resnet50 again, this time without --data_dir.
+python3 cnn_benchmark/of_cnn_benchmarks.py \
+    --gpu_num_per_node=1 \
+    --model="resnet50" \
+    --batch_size_per_device=8 \
+    --iter_num=5 \
+    --learning_rate=0.01 \
+    --optimizer="sgd" \
+    --loss_print_every_n_iter=1
+# BERT pretraining on the seq-len-128 dataset.
+python3 bert_benchmark/run_pretraining.py \
+    --gpu_num_per_node=1 \
+    --node_num=1 \
+    --learning_rate=1e-4 \
+    --weight_decay_rate=0.01 \
+    --batch_size_per_device=24 \
+    --iter_num=5 \
+    --loss_print_every_n_iter=1 \
+    --data_dir="/dataset/bert/bert_seq_len_128_repeat1024" \
+    --data_part_num=1 \
+    --seq_length=128 \
+    --max_predictions_per_seq=20 \
+    --num_hidden_layers=12 \
+    --num_attention_heads=12 \
+    --max_position_embeddings=512 \
+    --type_vocab_size=2 \
+    --vocab_size=30522 \
+    --attention_probs_dropout_prob=0.1 \
+    --hidden_dropout_prob=0.1 \
+    --hidden_size_per_head=64
--- a/ci/test/1node_benchmark_test_fp16.sh
+++ b/ci/test/1node_benchmark_test_fp16.sh
+# Run the single-client CNN and BERT benchmarks on one node with one GPU and
+# --enable_auto_mixed_precision=True, five iterations each.
+set -ex
+# Start from a fresh copy of the benchmark sources under /benchmarks.
+rm -rf /benchmarks
+cp -r python/oneflow/compatible/single_client/benchmarks /benchmarks
+cd /benchmarks
+# The three CNN models share every option except --model.
+for model in "vgg16" "alexnet" "resnet50"
+do
+    python3 cnn_benchmark/of_cnn_benchmarks.py \
+        --gpu_num_per_node=1 \
+        --model="$model" \
+        --batch_size_per_device=8 \
+        --iter_num=5 \
+        --learning_rate=0.01 \
+        --optimizer="sgd" \
+        --loss_print_every_n_iter=1 \
+        --data_dir="/dataset/imagenet_227/train/32" \
+        --enable_auto_mixed_precision=True
+done
+# BERT pretraining on the seq-len-128 dataset.
+python3 bert_benchmark/run_pretraining.py \
+    --gpu_num_per_node=1 \
+    --node_num=1 \
+    --learning_rate=1e-4 \
+    --weight_decay_rate=0.01 \
+    --batch_size_per_device=24 \
+    --iter_num=5 \
+    --loss_print_every_n_iter=1 \
+    --data_dir="/dataset/bert/bert_seq_len_128_repeat1024" \
+    --data_part_num=1 \
+    --seq_length=128 \
+    --max_predictions_per_seq=20 \
+    --num_hidden_layers=12 \
+    --num_attention_heads=12 \
+    --max_position_embeddings=512 \
+    --type_vocab_size=2 \
+    --vocab_size=30522 \
+    --attention_probs_dropout_prob=0.1 \
+    --hidden_dropout_prob=0.1 \
+    --hidden_size_per_head=64 \
+    --enable_auto_mixed_precision=True
--- a/ci/test/1node_custom_op_test.sh
+++ b/ci/test/1node_custom_op_test.sh
+#!/bin/bash
+set -xe
+src_dir=${ONEFLOW_SRC_DIR:-"$PWD"}
+test_tmp_dir=${ONEFLOW_TEST_TMP_DIR:-"./test_tmp_dir"}
+rm -rf $test_tmp_dir
+mkdir -p $test_tmp_dir
+cp -r $src_dir/python/oneflow/compatible/single_client/test/custom_ops $test_tmp_dir
+cd $test_tmp_dir
+export ONEFLOW_TEST_DEVICE_NUM=1
+python3 -m unittest discover ./custom_ops --failfast --verbose
--- a/ci/test/1node_model_eager_test.sh
+++ b/ci/test/1node_model_eager_test.sh
+#!/bin/bash
+set -xe
+cp -r python/oneflow/test /test_dir
+cd /test_dir
+python3 models/eager_1node_test.py
--- a/ci/test/1node_model_serve_test.sh
+++ b/ci/test/1node_model_serve_test.sh
+#!/bin/bash
+set -xe
+src_dir=${ONEFLOW_SRC_DIR:-"$PWD"}
+test_tmp_dir=${ONEFLOW_TEST_TMP_DIR:-"./test_tmp_dir"}
+rm -rf $test_tmp_dir
+mkdir -p $test_tmp_dir
+cp -r $src_dir/python/oneflow/compatible/single_client/test $test_tmp_dir
+cd $test_tmp_dir
+export ONEFLOW_TEST_DEVICE_NUM=1
+python3 -m unittest discover test/serving --failfast --verbose
--- a/ci/test/1node_model_test.sh
+++ b/ci/test/1node_model_test.sh
+#!/bin/bash
+set -xe
+cp -r python/oneflow/compatible/single_client/test /test_dir
+cd /test_dir
+python3 models/1node_test.py
--- a/ci/test/1node_op_test.sh
+++ b/ci/test/1node_op_test.sh
+#!/bin/bash
+set -xe
+export TF_CPP_MIN_LOG_LEVEL=3
+export PYTHONUNBUFFERED=1
+src_dir=${ONEFLOW_SRC_DIR:-"$PWD"}
+test_tmp_dir=${ONEFLOW_TEST_TMP_DIR:-"./test_tmp_dir"}
+rm -rf $test_tmp_dir
+mkdir -p $test_tmp_dir
+cp -r $src_dir/python/oneflow/compatible/single_client/test $test_tmp_dir
+cd $test_tmp_dir
+python3 -m oneflow --doctor
+gpu_num=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+for CHUNK in 1
+do
+	export ONEFLOW_TEST_DEVICE_NUM=${CHUNK}
+    python3 $src_dir/ci/test/parallel_run.py \
+        --gpu_num="${gpu_num}" \
+        --dir=test/ops \
+        --timeout=1 \
+        --verbose \
+        --chunk=${CHUNK}
+done
+if [ -z "$ONEFLOW_TEST_ENABLE_EAGER" ]
+then
+    export ONEFLOW_TEST_DEVICE_NUM=2
+    python3 -m unittest discover test/ops --failfast --verbose
+    export ONEFLOW_TEST_DEVICE_NUM=4
+    python3 -m unittest discover test/ops --failfast --verbose
+else
+    echo "deadlock unsolved, skipping multi-card eager"
+fi
--- a/ci/test/2node_op_test.sh
+++ b/ci/test/2node_op_test.sh
+#!/bin/bash
+set -xe
+export PYTHONUNBUFFERED=1
+src_dir=${ONEFLOW_SRC_DIR:-"$PWD"}
+test_tmp_dir=${ONEFLOW_TEST_TMP_DIR:-"./test_tmp_dir"}
+rm -rf $test_tmp_dir
+mkdir -p $test_tmp_dir
+chmod -R o+w $test_tmp_dir
+cp -r $src_dir/python/oneflow/compatible/single_client/test $test_tmp_dir
+cd $test_tmp_dir
+ONEFLOW_TEST_DEVICE_NUM=1 python3 test/ops/test_assign.py --failfast --verbose
+ONEFLOW_TEST_DEVICE_NUM=1 python3 test/ops/test_two_node_boxing.py --failfast --verbose
+for device_num in 1 2 4
+do
+    ONEFLOW_TEST_ENABLE_INIT_BY_HOST_LIST=1 ONEFLOW_TEST_DEVICE_NUM=$device_num python3 -m unittest discover test/ops --failfast --verbose
+    # use a invalid ibverbs lib to test if falling back to epoll works
+    ONEFLOW_TEST_ENABLE_INIT_BY_HOST_LIST=1 ONEFLOW_TEST_DEVICE_NUM=$device_num ONEFLOW_LIBIBVERBS_PATH=invalid_lib python3 -m unittest discover test/ops --failfast --verbose
+done
--- a/ci/test/2node_op_test_multi_client.sh
+++ b/ci/test/2node_op_test_multi_client.sh
+#!/bin/bash
+set -xeu
+export PYTHONUNBUFFERED=1
+src_dir=${ONEFLOW_SRC_DIR:-"$PWD"}
+ONEFLOW_CI_DEVICE_NUMS=${ONEFLOW_CI_DEVICE_NUMS:-"1 2 4"}
+for device_num in ${ONEFLOW_CI_DEVICE_NUMS}
+do
+    export ONEFLOW_TEST_NODE_NUM=2
+    export ONEFLOW_TEST_DEVICE_NUM=$device_num
+    time python3 ${src_dir}/ci/test/multi_launch.py \
+        --files "${ONEFLOW_TEST_DIR}/**/test_*.py" \
+        -n 4 \
+        --group_size $device_num \
+        --device_num 4 \
+        --verbose \
+        --auto_cuda_visible_devices \
+        -m oneflow.distributed.launch \
+        --nproc_per_node $device_num --nnodes=2 --node_rank=$NODE_RANK --master_addr $_MASTER_ADDR \
+        -m pytest --max-worker-restart=0 -x --durations=50 --capture=sys -p no:cacheprovider -p no:randomly --ignore=log
+done
--- a/ci/test/CMakeLists.txt
+++ b/ci/test/CMakeLists.txt
+# Registers every python/oneflow/**/test_*.py file as an individual CTest test.
+# Cache variables below can be overridden at configure time (-D...).
+set(PYTHON_EXECUTABLE python3 CACHE STRING "python3 exe to run test, usually is the python3 installation oneflow is linked to")
+set(ONEFLOW_SRC_DIR ${CMAKE_SOURCE_DIR} CACHE STRING "source dir of oneflow")
+# IS_DEV controls whether the source tree's python/ directory is put on PYTHONPATH (see ENVIRONMENT below).
+set(IS_DEV ON CACHE BOOL "")
+# Default resource specification file handed to ctest for resource allocation.
+set(CTEST_RESOURCE_SPEC_FILE "${CMAKE_CURRENT_SOURCE_DIR}/resource-spec/2x-rtx-2080.json" CACHE STRING "")
+# CTEST_OUTPUT_ON_FAILURE=1 CTEST_PARALLEL_LEVEL=20 ninja test
+# Collect test files recursively; paths are stored relative to ONEFLOW_SRC_DIR.
+file(GLOB_RECURSE PYTHON_TEST_FILES LIST_DIRECTORIES false RELATIVE ${ONEFLOW_SRC_DIR} "${ONEFLOW_SRC_DIR}/python/oneflow/test_*.py")
+foreach(PYTHON_TEST_FILE ${PYTHON_TEST_FILES})
+  # The relative file path doubles as the test name.
+  set(TEST_NAME ${PYTHON_TEST_FILE})
+  add_test(NAME ${TEST_NAME}
+    COMMAND ${PYTHON_EXECUTABLE} ${ONEFLOW_SRC_DIR}/${PYTHON_TEST_FILE} --failfast --verbose
+  )
+  # ENVIRONMENT is a two-element list built from generator expressions:
+  #   - ONEFLOW_TEST_CPU_ONLY=1 is set only when BUILD_CUDA is false;
+  #   - PYTHONPATH gets ${ONEFLOW_SRC_DIR}/python prepended only when IS_DEV is true.
+  # RESOURCE_GROUPS makes each test request 2000 units of the "vram" resource.
+  set_tests_properties(${TEST_NAME}
+    PROPERTIES
+      ENVIRONMENT "$<$<NOT:$<BOOL:${BUILD_CUDA}>>:ONEFLOW_TEST_CPU_ONLY=1>;$<$<BOOL:${IS_DEV}>:PYTHONPATH=${ONEFLOW_SRC_DIR}/python:$ENV{PYTHONPATH}>"
+      RESOURCE_GROUPS
+        "vram:2000"
+  )
+endforeach()
--- a/ci/test/build_docs.sh
+++ b/ci/test/build_docs.sh
+set -ex
+src_dir=${ONEFLOW_SRC_DIR:-"$PWD"}
+test_tmp_dir=${ONEFLOW_TEST_TMP_DIR:-"$PWD/build-docs"}
+rm -rf $test_tmp_dir
+cp -r docs ${test_tmp_dir}
+cd ${test_tmp_dir}
+make html SPHINXOPTS="-W --keep-going"
--- a/ci/test/distributed_run.py
+++ b/ci/test/distributed_run.py
--- a/ci/test/doctest.sh
+++ b/ci/test/doctest.sh
+#!/bin/bash
+set -xe
+export PYTHONUNBUFFERED=1
+src_dir=${ONEFLOW_SRC_DIR:-"$PWD"}
+test_tmp_dir=${ONEFLOW_TEST_TMP_DIR:-"./test_tmp_dir"}
+mkdir -p ${test_tmp_dir}
+cd ${test_tmp_dir}
+python3 -c 'import oneflow; f=open("oneflow_path.txt", "w"); f.write(oneflow.__path__[0])'
+gpu_num=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+python3 $src_dir/ci/test/parallel_run.py \
+    --gpu_num=${gpu_num} \
+    --dir=$(cat oneflow_path.txt) \
+    --timeout=1 \
+    --verbose \
+    --chunk=1 \
+    --doctest
--- a/ci/test/dry_run_test.sh
+++ b/ci/test/dry_run_test.sh
+#!/bin/bash
+set -xe
+export PYTHONUNBUFFERED=1
+src_dir=${ONEFLOW_SRC_DIR:-"$PWD"}
+test_tmp_dir=${ONEFLOW_TEST_TMP_DIR:-"./test_tmp_dir"}
+rm -rf $test_tmp_dir
+mkdir -p $test_tmp_dir
+cp -r $src_dir/python/oneflow/compatible/single_client/benchmarks $test_tmp_dir
+cd $test_tmp_dir/benchmarks
+export ONEFLOW_DRY_RUN=1
+# turn on ONEFLOW_DEBUG_MODE will cause protobuf err
+# export ONEFLOW_DEBUG_MODE=1
+node_num=2
+generated_node_list=$(seq -f "mockhost%02g" -s, $node_num)
+# heaptrack
+# valgrind --tool=massif --threshold=0.0001
+# /usr/bin/time -v
+time python3 bert_benchmark/run_pretraining.py \
+    --learning_rate=1e-4 \
+    --weight_decay_rate=0.01 \
+    --batch_size_per_device=24 \
+    --iter_num=5 \
+    --loss_print_every_n_iter=1 \
+    --data_dir="/dataset/bert/bert_seq_len_128_repeat1024" \
+    --data_part_num=1 \
+    --seq_length=128 \
+    --max_predictions_per_seq=20 \
+    --num_hidden_layers=12 \
+    --num_attention_heads=12 \
+    --max_position_embeddings=512 \
+    --type_vocab_size=2 \
+    --vocab_size=30522 \
+    --attention_probs_dropout_prob=0.1 \
+    --hidden_dropout_prob=0.1 \
+    --hidden_size_per_head=64 \
+    --node_list=${generated_node_list} \
+    --node_num=${node_num} \
+    --gpu_num_per_node=8