Unverified Commit acb9ef00 authored by msbaines's avatar msbaines Committed by GitHub
Browse files

[chore] add testing of torch 1.9.0 nightly build (#559)

parent daa1bad5
...@@ -121,6 +121,25 @@ install_dep_181: &install_dep_181 ...@@ -121,6 +121,25 @@ install_dep_181: &install_dep_181
python -m torch.utils.collect_env python -m torch.utils.collect_env
wget -O /home/circleci/venv/check_version.py https://raw.githubusercontent.com/min-xu-ai/check_verion/main/check_version.py wget -O /home/circleci/venv/check_version.py https://raw.githubusercontent.com/min-xu-ai/check_verion/main/check_version.py
# Install step for the torch 1.9.0 nightly CI jobs. Mirrors install_dep_181,
# but pins the 1.9.0.dev nightly wheels from the cu101 nightly index.
install_dep_190: &install_dep_190
  - run:
      name: Install Dependencies with torch 1.9.0
      command: |
        # make sure that apt-get retries if needed
        sudo sh -c "echo 'APT::Acquire::Retries "3";' > /etc/apt/apt.conf.d/80-retries"
        sudo apt-get update -y
        sudo apt-get install -y libopenmpi-dev
        # check if we have restored venv cache (/home/circleci/venv) correctly, if so, just skip
        # NOTE: must check for torch 1.9 here (not 1.8) — this is the 1.9.0 install block;
        # checking "eq 1.8" would accept a stale 1.8 venv and skip installing 1.9 entirely.
        if [ -f /home/circleci/venv/check_version.py ]; then python /home/circleci/venv/check_version.py torch eq 1.9 && exit 0; fi
        # start installing
        pip install --pre --progress-bar off torch==1.9.0.dev20210330+cu101 torchvision==0.10.0.dev20210330+cu101 -f https://download.pytorch.org/whl/nightly/cu101/torch_nightly.html
        pip install --progress-bar off -r requirements-test.txt
        pip install --progress-bar off -r requirements-benchmarks.txt
        python -c 'import torch; print("Torch version:", torch.__version__)'
        # hard-fail the job if the wrong torch ended up installed
        python -c 'import torch; assert torch.__version__.split(".")[:2] == ["1", "9"], "wrong torch version"'
        python -m torch.utils.collect_env
        # "check_verion" is the actual upstream repo name (typo is in the repo, not here)
        wget -O /home/circleci/venv/check_version.py https://raw.githubusercontent.com/min-xu-ai/check_verion/main/check_version.py
install_repo: &install_repo install_repo: &install_repo
- run: - run:
name: Install Repository name: Install Repository
...@@ -393,7 +412,7 @@ jobs: ...@@ -393,7 +412,7 @@ jobs:
- store_test_results: - store_test_results:
path: test-results path: test-results
- <<: *upload_coverage - <<: *upload_coverage
gpu_tests_171: gpu_tests_171:
...@@ -435,7 +454,7 @@ jobs: ...@@ -435,7 +454,7 @@ jobs:
- store_test_results: - store_test_results:
path: test-results path: test-results
- <<: *upload_coverage - <<: *upload_coverage
gpu_tests_181: gpu_tests_181:
...@@ -477,9 +496,49 @@ jobs: ...@@ -477,9 +496,49 @@ jobs:
- store_test_results: - store_test_results:
path: test-results path: test-results
- <<: *upload_coverage - <<: *upload_coverage
# CircleCI job: run the unit-test suite on GPU against the torch 1.9.0 nightly
# build. Clones the gpu_tests_181 job shape: checkout, pin python, set up venv,
# restore/save a dependency cache keyed on setup.py + requirements-test.txt,
# install the repo, then run the tests listed in test_list_file.
gpu_tests_190:
  parameters:
    # Path to a file listing which test files this shard should run;
    # defaults to a non-existent path so an unparameterized invocation runs nothing.
    test_list_file:
      type: string
      default: "/dev/non_exist"
    <<: *gpu
    working_directory: ~/fairscale
    steps:
      - checkout
      - run: nvidia-smi
      # Run this to make sure we use python3 from the system.
      - run: pyenv global 3.7.0
      - <<: *setup_venv
      # Cache the venv directory that contains dependencies
      # (key encodes torch version "190" and CUDA "101" so other torch jobs don't share it)
      - restore_cache:
          keys:
            - cache-key-gpu-190-101-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
      - <<: *install_dep_190
      - save_cache:
          paths:
            - ~/venv
          key: cache-key-gpu-190-101-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
      - <<: *install_repo
      - run_unittests_from_list:
          test_list_file: <<parameters.test_list_file>>
      - store_test_results:
          path: test-results
benchmarks_1: benchmarks_1:
<<: *gpu <<: *gpu
...@@ -591,17 +650,23 @@ workflows: ...@@ -591,17 +650,23 @@ workflows:
test_list_file: tests/ci_test_list_1.txt test_list_file: tests/ci_test_list_1.txt
- gpu_tests_181: - gpu_tests_181:
test_list_file: tests/ci_test_list_1.txt test_list_file: tests/ci_test_list_1.txt
- gpu_tests_190:
test_list_file: tests/ci_test_list_1.txt
- gpu_tests_160: - gpu_tests_160:
test_list_file: tests/ci_test_list_2.txt test_list_file: tests/ci_test_list_2.txt
- gpu_tests_171: - gpu_tests_171:
test_list_file: tests/ci_test_list_2.txt test_list_file: tests/ci_test_list_2.txt
- gpu_tests_181: - gpu_tests_181:
test_list_file: tests/ci_test_list_2.txt test_list_file: tests/ci_test_list_2.txt
- gpu_tests_190:
test_list_file: tests/ci_test_list_2.txt
- gpu_tests_160: - gpu_tests_160:
test_list_file: tests/ci_test_list_3.txt test_list_file: tests/ci_test_list_3.txt
- gpu_tests_171: - gpu_tests_171:
test_list_file: tests/ci_test_list_3.txt test_list_file: tests/ci_test_list_3.txt
- gpu_tests_181: - gpu_tests_181:
test_list_file: tests/ci_test_list_3.txt test_list_file: tests/ci_test_list_3.txt
- gpu_tests_190:
test_list_file: tests/ci_test_list_3.txt
- benchmarks_1 - benchmarks_1
- benchmarks_2 - benchmarks_2
...@@ -23,7 +23,7 @@ from torch.nn.parallel import DistributedDataParallel as DDP ...@@ -23,7 +23,7 @@ from torch.nn.parallel import DistributedDataParallel as DDP
from fairscale.nn.data_parallel import ShardedDataParallel from fairscale.nn.data_parallel import ShardedDataParallel
from fairscale.optim import OSS from fairscale.optim import OSS
from fairscale.optim.grad_scaler import ShardedGradScaler from fairscale.optim.grad_scaler import ShardedGradScaler
from fairscale.utils.testing import check_same_model_params, skip_if_no_cuda, skip_if_single_gpu from fairscale.utils.testing import check_same_model_params, skip_if_no_cuda, skip_if_single_gpu, torch_version
""" """
Check that ShardedDDP gets the same results as DDP in a variety of scenarios Check that ShardedDDP gets the same results as DDP in a variety of scenarios
...@@ -168,7 +168,10 @@ def run_ddp_parity( ...@@ -168,7 +168,10 @@ def run_ddp_parity(
# NOTE: DDP does not handle parameters trainability being changed after the fact, see # NOTE: DDP does not handle parameters trainability being changed after the fact, see
# https://github.com/pytorch/pytorch/blob/5781aec74ef00284e0262817a649278c2e8072bf/torch/nn/parallel/distributed.py#L471 # https://github.com/pytorch/pytorch/blob/5781aec74ef00284e0262817a649278c2e8072bf/torch/nn/parallel/distributed.py#L471
if clip_grad_norm and not change_train_graph: if clip_grad_norm and not change_train_graph:
total_norm = torch.nn.utils.clip_grad_norm_(ddp_model.parameters(), 0.3, norm_type=2.0) # type: ignore if torch_version() >= (1, 9, 0):
total_norm = torch.nn.utils.clip_grad_norm_(ddp_model.parameters(), 0.3, norm_type=2.0, error_if_nonfinite=False) # type: ignore
else:
total_norm = torch.nn.utils.clip_grad_norm_(ddp_model.parameters(), 0.3, norm_type=2.0) # type: ignore
if not torch.isnan(total_norm): if not torch.isnan(total_norm):
oss_total_norm = sharded_optimizer.clip_grad_norm(0.3, norm_type=2.0) oss_total_norm = sharded_optimizer.clip_grad_norm(0.3, norm_type=2.0)
allclose = torch.allclose(oss_total_norm, total_norm, atol=1e-2 if amp else 1e-8) allclose = torch.allclose(oss_total_norm, total_norm, atol=1e-2 if amp else 1e-8)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment