[chore] add testing of torch 1.9.0 nightly build (#559)

acb9ef00 · msbaines · GitHub · daa1bad5 · acb9ef00 · acb9ef00
Unverified Commit acb9ef00 authored Mar 31, 2021 by msbaines Committed by GitHub Mar 31, 2021
Hide whitespace changes
Inline Side-by-side

Showing with 73 additions and 5 deletions

.circleci/config.yml .circleci/config.yml +68 -3

tests/nn/data_parallel/test_sharded_ddp_pytorch_parity.py tests/nn/data_parallel/test_sharded_ddp_pytorch_parity.py +5 -2

No files found.
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -121,6 +121,25 @@ install_dep_181: &install_dep_181
        python -m torch.utils.collect_env
        wget -O /home/circleci/venv/check_version.py https://raw.githubusercontent.com/min-xu-ai/check_verion/main/check_version.py

+install_dep_190: &install_dep_190
+  - run:
+      name: Install Dependencies with torch 1.9.0
+      command: |
+        # make sure that apt-get retries if needed
+        sudo sh -c "echo 'APT::Acquire::Retries "3";' > /etc/apt/apt.conf.d/80-retries"
+        sudo apt-get update -y
+        sudo apt-get install -y libopenmpi-dev
+        # if the restored venv cache (/home/circleci/venv) already provides torch 1.9, skip the install below
+        if [ -f /home/circleci/venv/check_version.py ]; then python /home/circleci/venv/check_version.py torch eq 1.9 && exit 0; fi
+        # start installing
+        pip install --pre --progress-bar off torch==1.9.0.dev20210330+cu101 torchvision==0.10.0.dev20210330+cu101 -f https://download.pytorch.org/whl/nightly/cu101/torch_nightly.html
+        pip install --progress-bar off -r requirements-test.txt
+        pip install --progress-bar off -r requirements-benchmarks.txt
+        python -c 'import torch; print("Torch version:", torch.__version__)'
+        python -c 'import torch; assert torch.__version__.split(".")[:2] == ["1", "9"], "wrong torch version"'
+        python -m torch.utils.collect_env
+        wget -O /home/circleci/venv/check_version.py https://raw.githubusercontent.com/min-xu-ai/check_verion/main/check_version.py
+
 install_repo: &install_repo
  - run:
      name: Install Repository
@@ -393,7 +412,7 @@ jobs:

      - store_test_results:
          path: test-results
-      
+
      - <<: *upload_coverage

  gpu_tests_171:
@@ -435,7 +454,7 @@ jobs:

      - store_test_results:
          path: test-results
-      
+
      - <<: *upload_coverage

  gpu_tests_181:
@@ -477,9 +496,49 @@ jobs:

      - store_test_results:
          path: test-results
-      
+
      - <<: *upload_coverage

+  gpu_tests_190:
+    parameters:
+      test_list_file:
+        type: string
+        default: "/dev/non_exist"
+
+    <<: *gpu
+
+    working_directory: ~/fairscale
+
+    steps:
+      - checkout
+
+      - run: nvidia-smi
+
+      # Select Python 3.7.0 via pyenv before the venv setup step runs.
+      - run: pyenv global 3.7.0
+
+      - <<: *setup_venv
+
+      # Restore the cached ~/venv; the key is derived from the setup.py and requirements-test.txt checksums.
+      - restore_cache:
+          keys:
+            - cache-key-gpu-190-101-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
+
+      - <<: *install_dep_190
+
+      - save_cache:
+          paths:
+            - ~/venv
+          key: cache-key-gpu-190-101-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
+
+      - <<: *install_repo
+
+      - run_unittests_from_list:
+          test_list_file: <<parameters.test_list_file>>
+
+      - store_test_results:
+          path: test-results
+
  benchmarks_1:
    <<: *gpu

@@ -591,17 +650,23 @@ workflows:
          test_list_file: tests/ci_test_list_1.txt
      - gpu_tests_181:
          test_list_file: tests/ci_test_list_1.txt
+      - gpu_tests_190:
+          test_list_file: tests/ci_test_list_1.txt
      - gpu_tests_160:
          test_list_file: tests/ci_test_list_2.txt
      - gpu_tests_171:
          test_list_file: tests/ci_test_list_2.txt
      - gpu_tests_181:
          test_list_file: tests/ci_test_list_2.txt
+      - gpu_tests_190:
+          test_list_file: tests/ci_test_list_2.txt
      - gpu_tests_160:
          test_list_file: tests/ci_test_list_3.txt
      - gpu_tests_171:
          test_list_file: tests/ci_test_list_3.txt
      - gpu_tests_181:
          test_list_file: tests/ci_test_list_3.txt
+      - gpu_tests_190:
+          test_list_file: tests/ci_test_list_3.txt
      - benchmarks_1
      - benchmarks_2
--- a/tests/nn/data_parallel/test_sharded_ddp_pytorch_parity.py
+++ b/tests/nn/data_parallel/test_sharded_ddp_pytorch_parity.py
@@ -23,7 +23,7 @@ from torch.nn.parallel import DistributedDataParallel as DDP
 from fairscale.nn.data_parallel import ShardedDataParallel
 from fairscale.optim import OSS
 from fairscale.optim.grad_scaler import ShardedGradScaler
-from fairscale.utils.testing import check_same_model_params, skip_if_no_cuda, skip_if_single_gpu
+from fairscale.utils.testing import check_same_model_params, skip_if_no_cuda, skip_if_single_gpu, torch_version

 """
 Check that ShardedDDP gets the same results as DDP in a variety of scenarii
@@ -168,7 +168,10 @@ def run_ddp_parity(
        # NOTE: DDP does not handle parameters trainability being changed after the fact, see
        # https://github.com/pytorch/pytorch/blob/5781aec74ef00284e0262817a649278c2e8072bf/torch/nn/parallel/distributed.py#L471
        if clip_grad_norm and not change_train_graph:
-            total_norm = torch.nn.utils.clip_grad_norm_(ddp_model.parameters(), 0.3, norm_type=2.0)  # type: ignore
+            if torch_version() >= (1, 9, 0):
+                total_norm = torch.nn.utils.clip_grad_norm_(ddp_model.parameters(), 0.3, norm_type=2.0, error_if_nonfinite=False)  # type: ignore
+            else:
+                total_norm = torch.nn.utils.clip_grad_norm_(ddp_model.parameters(), 0.3, norm_type=2.0)  # type: ignore
            if not torch.isnan(total_norm):
                oss_total_norm = sharded_optimizer.clip_grad_norm(0.3, norm_type=2.0)
                allclose = torch.allclose(oss_total_norm, total_norm, atol=1e-2 if amp else 1e-8)