Unverified commit e348806b, authored by Min Xu, committed by GitHub

[test]: test with py39 + torch 1.8 nightly (#339)

* [test]: test with py39 + torch 1.8 nightly

* version fix

* more fix

* fix version function for nightly version

* fix torch_pg build

* invalidate cache

* separate benchmark requirements

* comment

* fixed mypy

* fixed a test
parent eaee5976
CircleCI config:

@@ -50,22 +50,24 @@ setup_venv: &setup_venv
 install_dep_151: &install_dep_151
   - run:
-      name: Install Dependencies
+      name: Install Dependencies with torch 1.5.1
       command: |
         sudo apt-get install -y libopenmpi-dev
         pip install --progress-bar off torch==1.5.1+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
         pip install --progress-bar off -r requirements-test.txt
+        pip install --progress-bar off -r requirements-benchmarks.txt
         python -c 'import torch; print("Torch version:", torch.__version__)'
         python -c 'import torch; assert torch.__version__.split(".")[:2] == ["1", "5"], "wrong torch version"'
         python -m torch.utils.collect_env

 install_dep_160: &install_dep_160
   - run:
-      name: Install Dependencies
+      name: Install Dependencies with torch 1.6.0
       command: |
         sudo apt-get install -y libopenmpi-dev
         pip install --progress-bar off torch==1.6.0+cu101 torchvision==0.7.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
         pip install --progress-bar off -r requirements-test.txt
+        pip install --progress-bar off -r requirements-benchmarks.txt
         pip install --progress-bar off git+https://github.com/msbaines/torch_pg.git@c85c96f#egg=torch-pg
         python -c 'import torch; print("Torch version:", torch.__version__)'
         python -c 'import torch; assert torch.__version__.split(".")[:2] == ["1", "6"], "wrong torch version"'
@@ -73,16 +75,31 @@ install_dep_160: &install_dep_160
 install_dep_171: &install_dep_171
   - run:
-      name: Install Dependencies
+      name: Install Dependencies with torch 1.7.1
       command: |
         sudo apt-get install -y libopenmpi-dev
         pip install --progress-bar off torch==1.7.1+cu101 torchvision==0.8.2+cu101 -f https://download.pytorch.org/whl/torch_stable.html
         pip install --progress-bar off -r requirements-test.txt
+        pip install --progress-bar off -r requirements-benchmarks.txt
         pip install --progress-bar off git+https://github.com/msbaines/torch_pg.git@c85c96f#egg=torch-pg
         python -c 'import torch; print("Torch version:", torch.__version__)'
         python -c 'import torch; assert torch.__version__.split(".")[:2] == ["1", "7"], "wrong torch version"'
         python -m torch.utils.collect_env

+install_dep_180: &install_dep_180
+  - run:
+      name: Install Dependencies with torch 1.8.0 nightly
+      command: |
+        sudo apt-get install -y libopenmpi-dev
+        pip install --pre --progress-bar off torch==1.8.0.dev20210128+cu110 -f https://download.pytorch.org/whl/nightly/cu110/torch_nightly.html
+        pip install --progress-bar off git+https://github.com/min-xu-ai/torch_pg.git@c723ab4#egg=torch-pg
+        pip install --progress-bar off -r requirements-test.txt
+        # TODO: We don't use 180 to run benchmark yet, because torchvision is not yet available on py39.
+        #pip install --progress-bar off -r requirements-benchmarks.txt
+        python -c 'import torch; print("Torch version:", torch.__version__)'
+        python -c 'import torch; assert torch.__version__.split(".")[:2] == ["1", "8"], "wrong torch version"'
+        python -m torch.utils.collect_env
+
 install_repo_cpu: &install_repo_cpu
   - run:
       name: Install Repository
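Each install step ends with the same guard: print the torch version, assert its major/minor pair, and dump collect_env. The assert is plain string slicing, so it also holds for the nightly build string pinned above. A minimal plain-Python sketch of that check (same expression as the `python -c` one-liners; the version strings are hard-coded here instead of reading torch.__version__):

    nightly = "1.8.0.dev20210128+cu110"
    release = "1.7.1+cu101"

    def major_minor(version: str) -> list:
        # Same expression as the asserts in the config.
        return version.split(".")[:2]

    assert major_minor(nightly) == ["1", "8"], "wrong torch version"
    assert major_minor(release) == ["1", "7"], "wrong torch version"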
@@ -267,14 +284,18 @@ jobs:
       # Cache the venv directory that contains dependencies
       - restore_cache:
           keys:
-            - cache-key-cpu-py39-171-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
+            - cache-key-cpu-py39-180-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}

-      - <<: *install_dep_171
+      # py3.9 doesn't work well with torch < 1.8. See this PR:
+      # https://github.com/pytorch/pytorch/pull/50998
+      #
+      # Therefore, we test py39 with torch 1.8.0.
+      - <<: *install_dep_180

       - save_cache:
           paths:
             - ~/venv
-          key: cache-key-cpu-py39-171-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
+          key: cache-key-cpu-py39-180-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}

       - <<: *install_repo_cpu
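The "invalidate cache" item from the commit message is handled here by the key rename: restore_cache only restores a previously saved cache whose key matches the requested one, so bumping the hard-coded py39-171 prefix to py39-180 (or changing setup.py or requirements-test.txt, via the `{{ checksum ... }}` parts) means nothing matches and the venv is rebuilt from scratch. A rough, hypothetical Python illustration of that content-addressed idea, not CircleCI's actual templating:

    import hashlib

    def cache_key(prefix: str, *files: str) -> str:
        # Key = fixed prefix + digest of the dependency files; change either
        # piece and the resulting key (hence the cache entry) changes.
        digest = hashlib.sha256()
        for path in files:
            with open(path, "rb") as f:
                digest.update(f.read())
        return prefix + "-" + digest.hexdigest()[:12]

    # e.g. cache_key("cache-key-cpu-py39-180", "setup.py", "requirements-test.txt")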
@@ -282,12 +303,8 @@ jobs:
       - <<: *run_black
       - <<: *run_mypy
       - <<: *run_flake8
-      # FIXME: py39 still not stable for us, example:
-      # https://app.circleci.com/pipelines/github/facebookresearch/fairscale/1349/workflows/534aae41-e01d-404e-bfc1-fdc58566c39c/jobs/5952
-      # - <<: *run_unittests
-      # example:
-      # https://app.circleci.com/pipelines/github/facebookresearch/fairscale/1350/workflows/26ebd69e-777e-491a-ae12-da3154ef80f9/jobs/5953
-      # - <<: *run_mpi_unittests
+      - <<: *run_unittests
+      - <<: *run_mpi_unittests
       - <<: *run_doc_build

       - store_test_results:
.gitignore:

@@ -10,6 +10,9 @@
 build/
 dist/

+# Pytest verbose output
+test-results/
+
 # Coverage reports
 .coverage
 .coverage.*
AdaScale optimizer:

@@ -289,7 +289,7 @@ class AdaScale(Optimizer):
         if pg_idx is not None:
             return self._state["grad_sqr_avg"][pg_idx]
         else:
-            return np.sum(self._state["grad_sqr_avg"])
+            return float(np.sum(self._state["grad_sqr_avg"]))

     def _grad_var_avg(self, pg_idx: Optional[int] = None) -> float:
         """
@@ -307,7 +307,7 @@ class AdaScale(Optimizer):
         if pg_idx is not None:
             return self._state["grad_var_avg"][pg_idx]
         else:
-            return np.sum(self._state["grad_var_avg"])
+            return float(np.sum(self._state["grad_var_avg"]))

     def gain(self, pg_idx: Optional[int] = None) -> float:
         """
@@ -349,8 +349,8 @@ class AdaScale(Optimizer):
             # after some iterations are done. But, then the if condition
             # below will need to be a np.where. I leave this corner
             # case to a future exercise.
-            count = self._state.get(name + "_count", 0)
-            count += 1
+            count = self._state.get(name + "_count", np.zeros(1))
+            count[0] += 1
             self._state[name + "_count"] = count
             if count < 1 / (1 - self._smoothing):
                 total = self._state.get(name + "_total", None)
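With this change the per-state step counter is stored as a one-element NumPy array rather than a Python int, so every entry in `_state` is an ndarray (which is also what the shape assert added in the next hunk checks). A small standalone sketch of just that bookkeeping, with names borrowed from the diff; it is not the full AdaScale smoothing update:

    import numpy as np

    _state = {}
    smoothing = 0.9

    def bump_count(name: str) -> np.ndarray:
        # Default to a 1-element array instead of 0, as in the diff above.
        count = _state.get(name + "_count", np.zeros(1))
        count[0] += 1
        _state[name + "_count"] = count
        return count

    for _ in range(3):
        count = bump_count("grad_var_avg")

    # The elementwise comparison on a 1-element array still works in the
    # original `if count < 1 / (1 - self._smoothing):` condition.
    print(bool(count < 1 / (1 - smoothing)))  # True while count[0] < 10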
@@ -514,8 +514,9 @@ class AdaScale(Optimizer):
         # Extend the states.
         for name in self._state.keys():
             assert name.startswith("grad_sqr_avg") or name.startswith("grad_var_avg"), name
-            if isinstance(self._state[name], int):
-                # This is the "_count" variable.
+            if name.endswith("_count"):
+                # This is the "_count" variable, should be a 1D int.
+                assert self._state[name].shape == (1,), self._state[name].shape
                 continue
             # must be a np array, extend it with the right value and check the shape.
             val = 1 if name == "grad_sqr_avg" else 0
torch_version() helper:

@@ -74,9 +74,7 @@ def set_random_seed(seed: int) -> None:

 def torch_version() -> Tuple[int, ...]:
-    numbering = torch.__version__.split("+")[0].split(".")
-    assert len(numbering) == 3
+    numbering = torch.__version__.split("+")[0].split(".")[:3]

     # Catch torch version if run against internal pre-releases, like `1.8.0a0fb`,
     if not numbering[2].isnumeric():
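Only a slice of torch_version() is visible in this hunk. The sketch below is a standalone approximation of the parsing it implies, showing why the `[:3]` slice replaces the old `assert len(numbering) == 3` once nightly strings are involved; the branch after the `isnumeric()` check is an assumption here (treat the patch level as 0), since the diff does not show it:

    def parse_torch_version(version: str):
        # Drop the local build tag (+cu101, +cu110, ...) and keep at most three
        # components; a nightly such as "1.8.0.dev20210128" splits into four
        # pieces, which is what broke the old length assert.
        numbering = version.split("+")[0].split(".")[:3]
        if not numbering[2].isnumeric():
            # Internal pre-releases like "1.8.0a0fb" (assumed handling).
            numbering[2] = "0"
        return tuple(int(n) for n in numbering)

    print(parse_torch_version("1.7.1+cu101"))              # (1, 7, 1)
    print(parse_torch_version("1.8.0.dev20210128+cu110"))  # (1, 8, 0)
    print(parse_torch_version("1.8.0a0fb"))                # (1, 8, 0)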
requirements-benchmarks.txt (new file):

+# Bring in everything that tests depends on.
+-r requirements-test.txt
+
+# Benchmark dependencies.
+torchtext == 0.6.0
+torchvision >= 0.6.0
+timm == 0.3.4
requirements-test.txt:

+# Core deps.
 -r requirements.txt
+
+# For pre-commit hooks.
 pre-commit

-# Get core deps.
--r requirements.txt
-
 # Tools for static checking.
 black == 19.10b0
 flake8 == 3.7.9
 isort == 5.6.4
 mypy == 0.790

-# Tools for testing & coverage.
+# Tools for unit tests & coverage.
 pytest == 5.4.1
 pytest-cov == 2.10.0
 pytest-mpi == 0.4
 pytest-timeout == 1.4.2
 mpi4py == 3.0.3

-# Library dependencies.
-torchtext == 0.6.0
-torch >= 1.5.1
-torchvision >= 0.6.0
-timm == 0.3.4
+# FairScale should only depends on torch, not things higher level than torch.
 torch >= 1.5.1
AdaScale unit test (test_add_param_group):

@@ -203,7 +203,7 @@ def test_add_param_group(debias_ewma):
         model1.weight.copy_(Tensor([1.0, 2.0, 3.0, 4.0]).reshape(2, 2))
         model1.bias.fill_(0.1)
     optim = AdaScale(SGD(model1.parameters(), lr=0.1), num_gradients_to_accumulate=2, debias_ewma=debias_ewma)
-    assert len(optim._hook_handles) == 2
+    assert len(optim._hook_handles) == 2, len(optim._hook_handles)

     model2 = Linear(2, 3, bias=True)
     with torch.no_grad():
@@ -211,7 +211,7 @@ def test_add_param_group(debias_ewma):
         model2.weight.copy_(Tensor([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).reshape(3, 2))
         model2.bias.fill_(0.2)
     optim.add_param_group({"params": model2.parameters()})
-    assert len(optim._hook_handles) == 4
+    assert len(optim._hook_handles) == 4, len(optim._hook_handles)

     # make sure we can run the model.
     model = Sequential(model1, model2).cuda()
@@ -238,7 +238,7 @@ def test_add_param_group(debias_ewma):
         model3.weight.copy_(Tensor([1.0, 2.0, 3.0, 4.0, 5.0, 6.0] * 2).reshape(4, 3))
         model3.bias.fill_(0.2)
     optim.add_param_group({"params": model3.parameters()})
-    assert len(optim._hook_handles) == 6
+    assert len(optim._hook_handles) == 6, len(optim._hook_handles)

     # make sure we can run the model.
     model = Sequential(model1, model2, model3).cuda()
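The added `len(optim._hook_handles)` tail just makes a failing assert report the actual count. The expected values (2, 4, 6) line up with one hook per parameter, i.e. the weight and bias of each Linear handed to the optimizer, assuming that is how AdaScale registers its gradient hooks. A quick way to see where the numbers come from:

    from torch.nn import Linear, Sequential

    model1 = Linear(2, 2, bias=True)
    model2 = Linear(2, 3, bias=True)
    model3 = Linear(3, 4, bias=True)

    # Each Linear contributes two parameters (weight and bias), hence 2, 4, 6.
    print(len(list(model1.parameters())))                              # 2
    print(len(list(Sequential(model1, model2).parameters())))          # 4
    print(len(list(Sequential(model1, model2, model3).parameters())))  # 6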