Commit 21d47d0e authored by yuguo
Browse files

Oneflow 0.8 for DCU

parents
# Build preamble: enable the devtoolset-7 toolchain, print tool versions and
# install the pinned Python build requirements.
# Required env: ONEFLOW_CI_SRC_DIR, ONEFLOW_CI_PYTHON_EXE.
# Optional env: ONEFLOW_CI_BUILD_PARALLEL (defaults to the output of nproc).
# scl_source is sourced before `set -ex`, so errexit/xtrace are not active
# while it runs.
source scl_source enable devtoolset-7
set -ex
# Fail fast with a clear message: an unset ONEFLOW_CI_SRC_DIR would otherwise
# make the unquoted `cd` below silently change to $HOME.
: "${ONEFLOW_CI_SRC_DIR:?ONEFLOW_CI_SRC_DIR must be set}"
: "${ONEFLOW_CI_PYTHON_EXE:?ONEFLOW_CI_PYTHON_EXE must be set}"
ONEFLOW_CI_BUILD_PARALLEL=${ONEFLOW_CI_BUILD_PARALLEL:-$(nproc)}
gcc --version
ld --version
# clean python dir
cd "${ONEFLOW_CI_SRC_DIR}"
"${ONEFLOW_CI_PYTHON_EXE}" -m pip install -i https://mirrors.aliyun.com/pypi/simple --user -r ci/fixed-dev-requirements.txt
cd python
# Remove git-ignored files (-X) and directories (-d) from the current
# directory, passing `!dist` exclude patterns that are meant to keep dist/.
# A dry run (-n) comes first so the log lists what the forced pass (-f)
# removes.
clean_artifacts() {
  # The patterns are single-quoted so git receives them verbatim; unquoted,
  # `!dist/**` would be subject to pathname expansion by the shell.
  git clean -nXd -e '!dist' -e '!dist/**'
  git clean -fXd -e '!dist' -e '!dist/**'
}
# Clean artifacts, configure with CMake, build, optionally run the LIT target
# and finally build the wheel.
# Required env: ONEFLOW_CI_SRC_DIR, ONEFLOW_CI_BUILD_DIR,
#               ONEFLOW_CI_CMAKE_INIT_CACHE, ONEFLOW_CI_PYTHON_EXE.
# Optional env: ONEFLOW_CI_BUILD_RUN_LIT (non-empty enables the `c1` target).
clean_artifacts
# cmake config
: "${ONEFLOW_CI_BUILD_DIR:?ONEFLOW_CI_BUILD_DIR must be set}"
mkdir -p "${ONEFLOW_CI_BUILD_DIR}"
cd "${ONEFLOW_CI_BUILD_DIR}"
# List the cached configurations first so the log shows what gets deleted.
find "${ONEFLOW_CI_BUILD_DIR}" -name CMakeCache.txt
find "${ONEFLOW_CI_BUILD_DIR}" -name CMakeCache.txt -delete
if [ ! -f "$ONEFLOW_CI_CMAKE_INIT_CACHE" ]; then
  echo "$ONEFLOW_CI_CMAKE_INIT_CACHE does not exist." >&2
  exit 1
fi
export PATH="${PATH}:$(dirname "${ONEFLOW_CI_PYTHON_EXE}")"
export PYTHON_BIN_PATH="${ONEFLOW_CI_PYTHON_EXE}"
cmake -S "${ONEFLOW_CI_SRC_DIR}" -C "${ONEFLOW_CI_CMAKE_INIT_CACHE}" "-DPython3_EXECUTABLE=${ONEFLOW_CI_PYTHON_EXE}"
# cmake build
cd "${ONEFLOW_CI_BUILD_DIR}"
cmake --build . --parallel "${ONEFLOW_CI_BUILD_PARALLEL}"
if [ -n "$ONEFLOW_CI_BUILD_RUN_LIT" ]; then
  "${ONEFLOW_CI_PYTHON_EXE}" -m pip install -i https://mirrors.aliyun.com/pypi/simple --user flowvision==0.1.0
  export PATH="$PATH:$(dirname "$ONEFLOW_CI_PYTHON_EXE")"
  cmake --build . -t c1
fi
# build pip
cd "${ONEFLOW_CI_SRC_DIR}"
cd python
"${ONEFLOW_CI_PYTHON_EXE}" setup.py bdist_wheel
# Build preamble: print tool versions and install the pinned Python build
# requirements.
# Required env: ONEFLOW_CI_SRC_DIR, ONEFLOW_CI_PYTHON_EXE.
# Optional env: ONEFLOW_CI_BUILD_PARALLEL (defaults to the output of nproc).
set -ex
# Fail fast with a clear message: an unset ONEFLOW_CI_SRC_DIR would otherwise
# make the unquoted `cd` below silently change to $HOME.
: "${ONEFLOW_CI_SRC_DIR:?ONEFLOW_CI_SRC_DIR must be set}"
: "${ONEFLOW_CI_PYTHON_EXE:?ONEFLOW_CI_PYTHON_EXE must be set}"
ONEFLOW_CI_BUILD_PARALLEL=${ONEFLOW_CI_BUILD_PARALLEL:-$(nproc)}
gcc --version
ld --version
# clean python dir
cd "${ONEFLOW_CI_SRC_DIR}"
"${ONEFLOW_CI_PYTHON_EXE}" -m pip install -i https://mirrors.aliyun.com/pypi/simple --user -r ci/fixed-dev-requirements.txt
cd python
# Remove git-ignored files (-X) and directories (-d) from the current
# directory, passing `!dist` exclude patterns that are meant to keep dist/.
# A dry run (-n) comes first so the log lists what the forced pass (-f)
# removes.
clean_artifacts() {
  # The patterns are single-quoted so git receives them verbatim; unquoted,
  # `!dist/**` would be subject to pathname expansion by the shell.
  git clean -nXd -e '!dist' -e '!dist/**'
  git clean -fXd -e '!dist' -e '!dist/**'
}
# Clean artifacts, configure with CMake, build, optionally run the LIT target
# and finally build the wheel.
# Required env: ONEFLOW_CI_SRC_DIR, ONEFLOW_CI_BUILD_DIR,
#               ONEFLOW_CI_CMAKE_INIT_CACHE, ONEFLOW_CI_PYTHON_EXE.
# Optional env: ONEFLOW_CI_BUILD_RUN_LIT (non-empty enables the `c1` target).
clean_artifacts
# cmake config
: "${ONEFLOW_CI_BUILD_DIR:?ONEFLOW_CI_BUILD_DIR must be set}"
mkdir -p "${ONEFLOW_CI_BUILD_DIR}"
cd "${ONEFLOW_CI_BUILD_DIR}"
# List the cached configurations first so the log shows what gets deleted.
find "${ONEFLOW_CI_BUILD_DIR}" -name CMakeCache.txt
find "${ONEFLOW_CI_BUILD_DIR}" -name CMakeCache.txt -delete
if [ ! -f "$ONEFLOW_CI_CMAKE_INIT_CACHE" ]; then
  echo "$ONEFLOW_CI_CMAKE_INIT_CACHE does not exist." >&2
  exit 1
fi
cmake -S "${ONEFLOW_CI_SRC_DIR}" -C "${ONEFLOW_CI_CMAKE_INIT_CACHE}" "-DPython3_EXECUTABLE=${ONEFLOW_CI_PYTHON_EXE}"
# cmake build
cd "${ONEFLOW_CI_BUILD_DIR}"
cmake --build . --parallel "${ONEFLOW_CI_BUILD_PARALLEL}"
if [ -n "$ONEFLOW_CI_BUILD_RUN_LIT" ]; then
  "${ONEFLOW_CI_PYTHON_EXE}" -m pip install -i https://mirrors.aliyun.com/pypi/simple --user flowvision==0.1.0
  # Make scripts that live next to the Python executable reachable for the LIT run.
  export PATH="$PATH:$(dirname "$ONEFLOW_CI_PYTHON_EXE")"
  cmake --build . -t c1
fi
# build pip
cd "${ONEFLOW_CI_SRC_DIR}"
cd python
"${ONEFLOW_CI_PYTHON_EXE}" setup.py bdist_wheel
pycocotools
opencv-python==4.3.0.38; sys_platform == 'darwin'
opencv-python==4.2.0.34; sys_platform != 'darwin'
scipy
pillow
tensorflow-addons==0.13.0
tensorflow==2.5.0
# Discard local changes, de-initialise every submodule and delete the cached
# submodule repositories under .git/modules.
set -o xtrace
set -o errexit
git reset --hard
git submodule deinit --force .
rm -rf .git/modules/*
import configparser
import argparse
import os
# Rewrite the "url" of every section in .gitmodules (current directory) so it
# points below a OneFlow source mirror: a local checkout given with -s, or a
# remote base URL given with -r. When both are given the local path wins.
parser = argparse.ArgumentParser()
parser.add_argument("-s", "--oneflow_src_local_path", type=str, required=False)
parser.add_argument("-r", "--oneflow_src_remote_url", type=str, required=False)
args = parser.parse_args()
assert (
    args.oneflow_src_local_path or args.oneflow_src_remote_url
), "require one of oneflow_src_local_path or oneflow_src_remote_url"

config = configparser.ConfigParser()
config.read(".gitmodules")
for s in config.sections():
    path = config[s]["path"]
    if args.oneflow_src_local_path:
        src_path = os.path.join(args.oneflow_src_local_path, path)
        # The local mirror must already contain the submodule's repository.
        assert os.path.exists("{}/.git".format(src_path)), src_path
        config[s]["url"] = "file://{}".format(src_path)
    else:
        # Join URL components with "/" explicitly: os.path.join is a
        # filesystem API and would use the platform path separator.
        config[s]["url"] = "{}/{}".format(
            args.oneflow_src_remote_url.rstrip("/"), path
        )

# Persist the rewritten configuration once all sections are updated.
with open(".gitmodules", "w") as configfile:
    config.write(configfile)
# Point the submodules at a local OneFlow checkout via ci/setup_submodule.py,
# then sync and update them.
# Optional env: ONEFLOW_CI_SRC_DIR (defaults to $HOME/oneflow).
set -x
set -e
src_dir=${ONEFLOW_CI_SRC_DIR:-"$HOME/oneflow"}
# Quoted so a checkout path containing spaces stays a single argument.
python3 ci/setup_submodule.py --oneflow_src_local_path="$src_dir"
git submodule sync
git submodule update --init --recursive
# Copy the single-client benchmarks to /benchmarks and run 5 iterations each
# of the CNN models and of BERT pretraining on one GPU.
set -xe
rm -rf /benchmarks
cp -r python/oneflow/compatible/single_client/benchmarks /benchmarks
cd /benchmarks

imagenet_dir=/dataset/imagenet_227/train/32

# Run the CNN benchmark on a single GPU; the remaining flags come from the caller.
cnn_bench() {
  python3 cnn_benchmark/of_cnn_benchmarks.py --gpu_num_per_node=1 "$@"
}

cnn_bench --model=vgg16 --batch_size_per_device=8 --iter_num=5 \
  --learning_rate=0.01 --optimizer=sgd --loss_print_every_n_iter=1 \
  --data_dir="$imagenet_dir"

cnn_bench --model=alexnet --batch_size_per_device=8 --iter_num=5 \
  --learning_rate=0.01 --optimizer=sgd --loss_print_every_n_iter=1 \
  --data_dir="$imagenet_dir"

# resnet50 with the GPU image decoder enabled.
cnn_bench --model=resnet50 --batch_size_per_device=8 --iter_num=5 \
  --gpu_image_decoder=True \
  --learning_rate=0.01 --optimizer=sgd --loss_print_every_n_iter=1 \
  --data_dir="$imagenet_dir"

# resnet50 without a --data_dir argument.
cnn_bench --model=resnet50 --batch_size_per_device=8 --iter_num=5 \
  --learning_rate=0.01 --optimizer=sgd --loss_print_every_n_iter=1

python3 bert_benchmark/run_pretraining.py \
  --gpu_num_per_node=1 --node_num=1 \
  --learning_rate=1e-4 --weight_decay_rate=0.01 \
  --batch_size_per_device=24 --iter_num=5 --loss_print_every_n_iter=1 \
  --data_dir=/dataset/bert/bert_seq_len_128_repeat1024 --data_part_num=1 \
  --seq_length=128 --max_predictions_per_seq=20 \
  --num_hidden_layers=12 --num_attention_heads=12 \
  --max_position_embeddings=512 --type_vocab_size=2 --vocab_size=30522 \
  --attention_probs_dropout_prob=0.1 --hidden_dropout_prob=0.1 \
  --hidden_size_per_head=64
# Copy the single-client benchmarks to /benchmarks and run 5 iterations each
# of the CNN models and of BERT pretraining on one GPU, all with
# --enable_auto_mixed_precision=True.
set -ex
rm -rf /benchmarks
cp -r python/oneflow/compatible/single_client/benchmarks /benchmarks
cd /benchmarks

# The three CNN runs differ only in the model name.
for model in vgg16 alexnet resnet50; do
  python3 cnn_benchmark/of_cnn_benchmarks.py \
    --gpu_num_per_node=1 \
    --model="$model" \
    --batch_size_per_device=8 \
    --iter_num=5 \
    --learning_rate=0.01 \
    --optimizer=sgd \
    --loss_print_every_n_iter=1 \
    --data_dir=/dataset/imagenet_227/train/32 \
    --enable_auto_mixed_precision=True
done

python3 bert_benchmark/run_pretraining.py \
  --gpu_num_per_node=1 --node_num=1 \
  --learning_rate=1e-4 --weight_decay_rate=0.01 \
  --batch_size_per_device=24 --iter_num=5 --loss_print_every_n_iter=1 \
  --data_dir=/dataset/bert/bert_seq_len_128_repeat1024 --data_part_num=1 \
  --seq_length=128 --max_predictions_per_seq=20 \
  --num_hidden_layers=12 --num_attention_heads=12 \
  --max_position_embeddings=512 --type_vocab_size=2 --vocab_size=30522 \
  --attention_probs_dropout_prob=0.1 --hidden_dropout_prob=0.1 \
  --hidden_size_per_head=64 \
  --enable_auto_mixed_precision=True
#!/bin/bash
# Run the single-client custom-op unit tests from a scratch copy of the test
# sources.
# Optional env: ONEFLOW_SRC_DIR (default: $PWD),
#               ONEFLOW_TEST_TMP_DIR (default: ./test_tmp_dir).
set -xe
src_dir=${ONEFLOW_SRC_DIR:-"$PWD"}
test_tmp_dir=${ONEFLOW_TEST_TMP_DIR:-"./test_tmp_dir"}
# Paths are quoted so values containing spaces or glob characters cannot make
# rm/cp act on unintended files.
rm -rf "$test_tmp_dir"
mkdir -p "$test_tmp_dir"
cp -r "$src_dir/python/oneflow/compatible/single_client/test/custom_ops" "$test_tmp_dir"
cd "$test_tmp_dir"
export ONEFLOW_TEST_DEVICE_NUM=1
python3 -m unittest discover ./custom_ops --failfast --verbose
#!/bin/bash
# Copy python/oneflow/test to /test_dir and run the eager single-node model
# test from there.
set -xe
# Start from a clean destination: if /test_dir already existed, `cp -r` would
# nest the copy inside it and a stale models/ tree would be executed instead.
rm -rf /test_dir
cp -r python/oneflow/test /test_dir
cd /test_dir
python3 models/eager_1node_test.py
#!/bin/bash
# Run the single-client serving unit tests from a scratch copy of the test
# sources.
# Optional env: ONEFLOW_SRC_DIR (default: $PWD),
#               ONEFLOW_TEST_TMP_DIR (default: ./test_tmp_dir).
set -xe
src_dir=${ONEFLOW_SRC_DIR:-"$PWD"}
test_tmp_dir=${ONEFLOW_TEST_TMP_DIR:-"./test_tmp_dir"}
# Paths are quoted so values containing spaces or glob characters cannot make
# rm/cp act on unintended files.
rm -rf "$test_tmp_dir"
mkdir -p "$test_tmp_dir"
cp -r "$src_dir/python/oneflow/compatible/single_client/test" "$test_tmp_dir"
cd "$test_tmp_dir"
export ONEFLOW_TEST_DEVICE_NUM=1
python3 -m unittest discover test/serving --failfast --verbose
#!/bin/bash
# Copy the single-client tests to /test_dir and run the single-node model test
# from there.
set -xe
# Start from a clean destination: if /test_dir already existed, `cp -r` would
# nest the copy inside it and a stale models/ tree would be executed instead.
rm -rf /test_dir
cp -r python/oneflow/compatible/single_client/test /test_dir
cd /test_dir
python3 models/1node_test.py
#!/bin/bash
# Run the single-client op tests: first through ci/test/parallel_run.py with
# one device per test, then (unless ONEFLOW_TEST_ENABLE_EAGER is set) through
# unittest discovery with 2 and 4 devices.
# Optional env: ONEFLOW_SRC_DIR (default: $PWD),
#               ONEFLOW_TEST_TMP_DIR (default: ./test_tmp_dir),
#               ONEFLOW_TEST_ENABLE_EAGER.
set -xe
export TF_CPP_MIN_LOG_LEVEL=3
export PYTHONUNBUFFERED=1
src_dir=${ONEFLOW_SRC_DIR:-"$PWD"}
test_tmp_dir=${ONEFLOW_TEST_TMP_DIR:-"./test_tmp_dir"}
# Paths are quoted so values containing spaces or glob characters cannot make
# rm/cp act on unintended files.
rm -rf "$test_tmp_dir"
mkdir -p "$test_tmp_dir"
cp -r "$src_dir/python/oneflow/compatible/single_client/test" "$test_tmp_dir"
cd "$test_tmp_dir"
python3 -m oneflow --doctor
# One output line per GPU reported by nvidia-smi.
gpu_num=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
for CHUNK in 1; do
  export ONEFLOW_TEST_DEVICE_NUM="${CHUNK}"
  python3 "$src_dir/ci/test/parallel_run.py" \
    --gpu_num="${gpu_num}" \
    --dir=test/ops \
    --timeout=1 \
    --verbose \
    --chunk="${CHUNK}"
done
if [ -z "$ONEFLOW_TEST_ENABLE_EAGER" ]; then
  export ONEFLOW_TEST_DEVICE_NUM=2
  python3 -m unittest discover test/ops --failfast --verbose
  export ONEFLOW_TEST_DEVICE_NUM=4
  python3 -m unittest discover test/ops --failfast --verbose
else
  echo "deadlock unsolved, skipping multi-card eager"
fi
#!/bin/bash
# Run the single-client op tests with host-list initialisation for 1, 2 and 4
# devices, each once normally and once with an invalid ibverbs library path.
# Optional env: ONEFLOW_SRC_DIR (default: $PWD),
#               ONEFLOW_TEST_TMP_DIR (default: ./test_tmp_dir).
set -xe
export PYTHONUNBUFFERED=1
src_dir=${ONEFLOW_SRC_DIR:-"$PWD"}
test_tmp_dir=${ONEFLOW_TEST_TMP_DIR:-"./test_tmp_dir"}
# Paths are quoted so values containing spaces or glob characters cannot make
# rm/chmod/cp act on unintended files.
rm -rf "$test_tmp_dir"
mkdir -p "$test_tmp_dir"
chmod -R o+w "$test_tmp_dir"
cp -r "$src_dir/python/oneflow/compatible/single_client/test" "$test_tmp_dir"
cd "$test_tmp_dir"
ONEFLOW_TEST_DEVICE_NUM=1 python3 test/ops/test_assign.py --failfast --verbose
ONEFLOW_TEST_DEVICE_NUM=1 python3 test/ops/test_two_node_boxing.py --failfast --verbose
for device_num in 1 2 4; do
  ONEFLOW_TEST_ENABLE_INIT_BY_HOST_LIST=1 ONEFLOW_TEST_DEVICE_NUM=$device_num python3 -m unittest discover test/ops --failfast --verbose
  # use an invalid ibverbs lib to test if falling back to epoll works
  ONEFLOW_TEST_ENABLE_INIT_BY_HOST_LIST=1 ONEFLOW_TEST_DEVICE_NUM=$device_num ONEFLOW_LIBIBVERBS_PATH=invalid_lib python3 -m unittest discover test/ops --failfast --verbose
done
#!/bin/bash
# Run the test files under ONEFLOW_TEST_DIR on two nodes through
# ci/test/multi_launch.py and oneflow.distributed.launch, once per device count.
# Required env (the script aborts under `set -u` if one is unset):
#   ONEFLOW_TEST_DIR, NODE_RANK, _MASTER_ADDR.
# Optional env: ONEFLOW_SRC_DIR (default: $PWD),
#               ONEFLOW_CI_DEVICE_NUMS (default: "1 2 4").
set -xeu
export PYTHONUNBUFFERED=1
src_dir=${ONEFLOW_SRC_DIR:-"$PWD"}
ONEFLOW_CI_DEVICE_NUMS=${ONEFLOW_CI_DEVICE_NUMS:-"1 2 4"}
# Intentionally unquoted: the value is a whitespace-separated list.
for device_num in ${ONEFLOW_CI_DEVICE_NUMS}; do
  export ONEFLOW_TEST_NODE_NUM=2
  export ONEFLOW_TEST_DEVICE_NUM=$device_num
  # The --files pattern stays quoted so the shell passes it on unexpanded.
  time python3 "${src_dir}/ci/test/multi_launch.py" \
    --files "${ONEFLOW_TEST_DIR}/**/test_*.py" \
    -n 4 \
    --group_size "$device_num" \
    --device_num 4 \
    --verbose \
    --auto_cuda_visible_devices \
    -m oneflow.distributed.launch \
    --nproc_per_node "$device_num" --nnodes=2 --node_rank="$NODE_RANK" --master_addr "$_MASTER_ADDR" \
    -m pytest --max-worker-restart=0 -x --durations=50 --capture=sys -p no:cacheprovider -p no:randomly --ignore=log
done
# Registers every Python test file under python/oneflow as a CTest test.
#
# Cache variables (overridable with -D):
#   PYTHON_EXECUTABLE        - interpreter used to run each test (default: python3)
#   ONEFLOW_SRC_DIR          - OneFlow source root (default: CMAKE_SOURCE_DIR)
#   IS_DEV                   - when true, the source tree's python/ directory is
#                              prepended to PYTHONPATH for each test
#   CTEST_RESOURCE_SPEC_FILE - resource spec file, defaults to
#                              resource-spec/2x-rtx-2080.json next to this file
set(PYTHON_EXECUTABLE python3 CACHE STRING "python3 exe to run test, usually is the python3 installation oneflow is linked to")
set(ONEFLOW_SRC_DIR ${CMAKE_SOURCE_DIR} CACHE STRING "source dir of oneflow")
set(IS_DEV ON CACHE BOOL "")
set(CTEST_RESOURCE_SPEC_FILE "${CMAKE_CURRENT_SOURCE_DIR}/resource-spec/2x-rtx-2080.json" CACHE STRING "")
# CTEST_OUTPUT_ON_FAILURE=1 CTEST_PARALLEL_LEVEL=20 ninja test
# Collect test_*.py files recursively below python/oneflow; the resulting paths
# are relative to ONEFLOW_SRC_DIR and directories are excluded.
file(GLOB_RECURSE PYTHON_TEST_FILES LIST_DIRECTORIES false RELATIVE ${ONEFLOW_SRC_DIR} "${ONEFLOW_SRC_DIR}/python/oneflow/test_*.py")
foreach(PYTHON_TEST_FILE ${PYTHON_TEST_FILES})
# The relative path of the file doubles as the test name.
set(TEST_NAME ${PYTHON_TEST_FILE})
add_test(NAME ${TEST_NAME}
COMMAND ${PYTHON_EXECUTABLE} ${ONEFLOW_SRC_DIR}/${PYTHON_TEST_FILE} --failfast --verbose
)
# ENVIRONMENT: ONEFLOW_TEST_CPU_ONLY=1 is set when BUILD_CUDA is false, and
# PYTHONPATH is extended with ${ONEFLOW_SRC_DIR}/python when IS_DEV is true.
# RESOURCE_GROUPS: each test requests 2000 units of a resource named "vram"
# (presumably defined in the resource spec file above - confirm).
set_tests_properties(${TEST_NAME}
PROPERTIES
ENVIRONMENT "$<$<NOT:$<BOOL:${BUILD_CUDA}>>:ONEFLOW_TEST_CPU_ONLY=1>;$<$<BOOL:${IS_DEV}>:PYTHONPATH=${ONEFLOW_SRC_DIR}/python:$ENV{PYTHONPATH}>"
RESOURCE_GROUPS
"vram:2000"
)
endforeach()
# Build the HTML documentation from a scratch copy of ./docs, treating Sphinx
# warnings as errors (-W) while still reporting all of them (--keep-going).
# Optional env: ONEFLOW_TEST_TMP_DIR (default: $PWD/build-docs).
set -ex
# NOTE(review): src_dir is assigned but not used by the commands below.
src_dir=${ONEFLOW_SRC_DIR:-"$PWD"}
test_tmp_dir=${ONEFLOW_TEST_TMP_DIR:-"$PWD/build-docs"}
# Paths are quoted so values containing spaces or glob characters cannot make
# rm/cp act on unintended files.
rm -rf "$test_tmp_dir"
cp -r docs "${test_tmp_dir}"
cd "${test_tmp_dir}"
make html SPHINXOPTS="-W --keep-going"
This diff is collapsed.
#!/bin/bash
# Run doctests over the installed oneflow package through
# ci/test/parallel_run.py.
# Optional env: ONEFLOW_SRC_DIR (default: $PWD),
#               ONEFLOW_TEST_TMP_DIR (default: ./test_tmp_dir).
set -xe
export PYTHONUNBUFFERED=1
src_dir=${ONEFLOW_SRC_DIR:-"$PWD"}
test_tmp_dir=${ONEFLOW_TEST_TMP_DIR:-"./test_tmp_dir"}
mkdir -p "${test_tmp_dir}"
cd "${test_tmp_dir}"
# The package location is written to a file and read back below.
python3 -c 'import oneflow; f=open("oneflow_path.txt", "w"); f.write(oneflow.__path__[0])'
# One output line per GPU reported by nvidia-smi.
gpu_num=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
# The command substitution is quoted so an install path containing spaces
# stays a single --dir argument.
python3 "$src_dir/ci/test/parallel_run.py" \
  --gpu_num="${gpu_num}" \
  --dir="$(cat oneflow_path.txt)" \
  --timeout=1 \
  --verbose \
  --chunk=1 \
  --doctest
#!/bin/bash
# Run the BERT pretraining benchmark with ONEFLOW_DRY_RUN=1 against a
# generated list of mock hosts (2 nodes, 8 GPUs per node).
# Optional env: ONEFLOW_SRC_DIR (default: $PWD),
#               ONEFLOW_TEST_TMP_DIR (default: ./test_tmp_dir).
set -xe
export PYTHONUNBUFFERED=1
src_dir=${ONEFLOW_SRC_DIR:-"$PWD"}
test_tmp_dir=${ONEFLOW_TEST_TMP_DIR:-"./test_tmp_dir"}
# Paths are quoted so values containing spaces or glob characters cannot make
# rm/cp act on unintended files.
rm -rf "$test_tmp_dir"
mkdir -p "$test_tmp_dir"
cp -r "$src_dir/python/oneflow/compatible/single_client/benchmarks" "$test_tmp_dir"
cd "$test_tmp_dir/benchmarks"
export ONEFLOW_DRY_RUN=1
# turn on ONEFLOW_DEBUG_MODE will cause protobuf err
# export ONEFLOW_DEBUG_MODE=1
node_num=2
# Comma-separated host names: mockhost01,mockhost02,...
generated_node_list=$(seq -f "mockhost%02g" -s, "$node_num")
# heaptrack
# valgrind --tool=massif --threshold=0.0001
# /usr/bin/time -v
time python3 bert_benchmark/run_pretraining.py \
  --learning_rate=1e-4 \
  --weight_decay_rate=0.01 \
  --batch_size_per_device=24 \
  --iter_num=5 \
  --loss_print_every_n_iter=1 \
  --data_dir="/dataset/bert/bert_seq_len_128_repeat1024" \
  --data_part_num=1 \
  --seq_length=128 \
  --max_predictions_per_seq=20 \
  --num_hidden_layers=12 \
  --num_attention_heads=12 \
  --max_position_embeddings=512 \
  --type_vocab_size=2 \
  --vocab_size=30522 \
  --attention_probs_dropout_prob=0.1 \
  --hidden_dropout_prob=0.1 \
  --hidden_size_per_head=64 \
  --node_list="${generated_node_list}" \
  --node_num="${node_num}" \
  --gpu_num_per_node=8
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment