Unverified Commit 7d7edf6d authored by Anupam Bhatnagar, committed by GitHub

Setup pre-commit github action and apply pre-commit to all files (#849)

* adding pre-commit files

* applying pre-commit to all files

* adding no-strict-optional argument to mypy in circle ci config

* fix typo

* updating python versions

* [skip ci] remove extra args

* adding python 3.9

* [skip ci] set pre-commit version in requirements-dev.txt

* set CACHE_VERSION

* move linters from circleci to github actions

* update python version

* update python version in benchmarks_2

* moving to python 3.9.7
parent 6f3931a4
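The checks this commit moves into the new pre-commit GitHub Action can also be run locally before pushing. A minimal sketch in the CircleCI-anchor style this config already uses; the anchor name and the unpinned install are illustrative only and not part of this diff (the repository pins the pre-commit version in requirements-dev.txt):

```yaml
# Illustrative only -- not part of this commit. Mirrors what the new
# pre-commit GitHub Action runs (isort, black, flake8 and mypy hooks).
run_pre_commit: &run_pre_commit
  - run:
      name: Run pre-commit on all files
      command: |
        pip install pre-commit       # assumed unpinned install for illustration
        pre-commit run --all-files
```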
......@@ -6,6 +6,14 @@
# https://github.com/facebookresearch/detectron2/blob/main/.circleci/config.yml
#
# Pro tip: download circle ci cli to validate the config locally during development.
#
# To reset/clean the cache update the CACHE_VERSION variable in project settings
# in the fairscale project in CircleCI. The CACHE_VERSION follows the convention
# v$(FAIRSCALE_VERSION)-${CACHE_NUMBER}. E.g. v0.4.2-1. CACHE_NUMBER must start
# at 1 and increase in whole numbers. When changing the CACHE_VERSION manually
# always set the FAIRSCALE_VERSION value to the fairscale version being tested.
# To reset the cache when not updating the fairscale version, only update the
# CACHE_NUMBER value.
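# Illustration (not part of this diff): CACHE_VERSION reaches the cache keys
# below via {{ .Environment.CACHE_VERSION }}, e.g.
#   key: cache-key-cpu-py39-torch-1-10-0-{{ .Environment.CACHE_VERSION }}-{{ checksum "setup.py" }}-{{ checksum "requirements-test.txt" }}
# so bumping only CACHE_NUMBER (v0.4.2-1 -> v0.4.2-2) forces all of these
# caches to be rebuilt without a fairscale release.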
version: 2.1
orbs:
......@@ -15,23 +23,26 @@ orbs:
# -------------------------------------------------------------------------------------
cpu_py37: &cpu_py37
docker:
# python version 3.7.12
- image: circleci/python:3.7
resource_class: large
cpu_py38: &cpu_py38
docker:
# python version 3.8.12
- image: circleci/python:3.8
resource_class: large
cpu_py39: &cpu_py39
docker:
# python version 3.9.7
- image: circleci/python:3.9
resource_class: large
# Here are list of GPU images:
# Here is the list of GPU images:
# https://circleci.com/docs/2.0/configuration-reference/#available-linux-gpu-images
# We need to use multiple gpus for several jobs. the resource_class values are
# available here T101565170
# We need to use multiple gpus for several jobs. The resource_class
# values are available here T101565170
# gpu.nvidia.small.multi = 2 gpus with 16 GB ram each
# gpu.nvidia.medium.multi = 4 gpus with 16 GB ram each
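# Illustration (not part of this diff; the machine image tag is an assumption):
#   gpu_cu_11_2_medium_multi: &gpu_cu_11_2_medium_multi
#     machine:
#       image: ubuntu-2004-cuda-11.2:202103-01
#     resource_class: gpu.nvidia.medium.multi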
......@@ -122,30 +133,6 @@ install_repo: &install_repo
# Test import.
python -c 'import sys; sys.path = sys.path[1:]; import fairscale'
run_isort: &run_isort
- run:
name: Run Linter (isort)
command: |
isort . --check
run_black: &run_black
- run:
name: Run Linter (black)
command: |
black --check .
run_mypy: &run_mypy
- run:
name: Run type-checking (mypy)
command: |
mypy --ignore-missing-imports --scripts-are-modules --pretty .
run_flake8: &run_flake8
- run:
name: Run Linter (flake8)
command: |
flake8 --show-source --statistics
check_test_list: &check_test_list
- run:
name: Verify that unit test list files are correct
......@@ -260,21 +247,16 @@ jobs:
# Cache the venv directory that contains dependencies
- restore_cache:
keys:
- cache-key-cpu-py37-torch-1-10-0-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
- cache-key-cpu-py37-torch-1-10-0-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
- <<: *install_dep_1_10_0
- save_cache:
paths:
- ~/venv
key: cache-key-cpu-py37-torch-1-10-0-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
key: cache-key-cpu-py37-torch-1-10-0-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
- <<: *install_repo
- <<: *run_isort
- <<: *run_black
- <<: *run_mypy
- <<: *run_flake8
- <<: *run_unittests
- <<: *run_doc_build
......@@ -294,20 +276,15 @@ jobs:
# Cache the venv directory that contains dependencies
- restore_cache:
keys:
- cache-key-cpu-py38-torch-1-10-0-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
- cache-key-cpu-py38-torch-1-10-0-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
- <<: *install_dep_1_10_0
- save_cache:
paths:
- ~/venv
key: cache-key-cpu-py38-torch-1-10-0-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
key: cache-key-cpu-py38-torch-1-10-0-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
- <<: *install_repo
- <<: *run_isort
- <<: *run_black
- <<: *run_mypy
- <<: *run_flake8
- <<: *run_unittests
- <<: *run_doc_build
......@@ -327,21 +304,16 @@ jobs:
# Cache the venv directory that contains dependencies
- restore_cache:
keys:
- cache-key-cpu-py39-torch-1-10-0-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
- cache-key-cpu-py39-torch-1-10-0-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
- <<: *install_dep_1_10_0
- save_cache:
paths:
- ~/venv
key: cache-key-cpu-py39-torch-1-10-0-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
key: cache-key-cpu-py39-torch-1-10-0-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
- <<: *install_repo
- <<: *run_isort
- <<: *run_black
- <<: *run_mypy
- <<: *run_flake8
- <<: *run_unittests
- <<: *run_doc_build
......@@ -365,21 +337,21 @@ jobs:
# Run this to make sure we use python3 from the system.
- setup_pyenv:
version: 3.7.0
version: 3.9.7
- <<: *setup_venv
# Cache the venv directory that contains dependencies
- restore_cache:
keys:
- cache-key-py37-gpu-torch-1-8-1-cuda-11-2-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
- cache-key-py-3-9-7-gpu-torch-1-8-1-cuda-11-2-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
- <<: *install_dep_1_8_1
- save_cache:
paths:
- ~/venv
key: cache-key-py37-gpu-torch-1-8-1-cuda-11-2-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
key: cache-key-py-3-9-7-gpu-torch-1-8-1-cuda-11-2-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
- <<: *install_repo
......@@ -408,21 +380,21 @@ jobs:
# Run this to make sure we use python3 from the system.
- setup_pyenv:
version: 3.8.6
version: 3.9.7
- <<: *setup_venv
# Cache the venv directory that contains dependencies
- restore_cache:
keys:
- cache-key-py38-gpu-torch-1-10-0-cuda-11-2-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
- cache-key-py-3-9-7-gpu-torch-1-10-0-cuda-11-2-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
- <<: *install_dep_1_10_0
- save_cache:
paths:
- ~/venv
key: cache-key-py38-gpu-torch-1-10-0-cuda-11-2-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
key: cache-key-py-3-9-7-gpu-torch-1-10-0-cuda-11-2-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
- <<: *install_repo
......@@ -449,21 +421,21 @@ jobs:
# Run this to make sure we use python3 from the system.
- setup_pyenv:
version: 3.8.6
version: 3.9.7
- <<: *setup_venv
# Cache the venv directory that contains dependencies
- restore_cache:
keys:
- cache-key-py38-gpu-pytorch-nightly-cuda-11-2-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
- cache-key-py-3-9-7-gpu-pytorch-nightly-cuda-11-2-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
- <<: *install_dep_pytorch_nightly
- save_cache:
paths:
- ~/venv
key: cache-key-py38-gpu-pytorch-nightly-cuda-11-2-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
key: cache-key-py-3-9-7-gpu-pytorch-nightly-cuda-11-2-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
- <<: *install_repo
......@@ -484,26 +456,26 @@ jobs:
- run: nvidia-smi
- setup_pyenv:
version: 3.7.0
version: 3.9.7
- <<: *setup_venv
# Cache the venv directory that contains dependencies
- restore_cache:
keys:
- cache-key-py37-benchmarks-torch-1-10-0-cuda-11-2-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
- cache-key-py-3-9-7-benchmarks-torch-1-10-0-cuda-11-2-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
# Cache the MNIST directory that contains benchmark data
- restore_cache:
keys:
- cache-key-benchmark-MNIST-{{ checksum "benchmarks/datasets/mnist.py"}}
- cache-key-benchmark-MNIST-{{.Environment.CACHE_VERSION }}-{{checksum "benchmarks/datasets/mnist.py"}}
- <<: *install_dep_1_10_0
- save_cache:
paths:
- ~/venv
key: cache-key-py37-benchmarks-torch-1-10-0-cuda-11-2-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
key: cache-key-py-3-9-7-benchmarks-torch-1-10-0-cuda-11-2-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
- <<: *install_repo
......@@ -520,7 +492,7 @@ jobs:
- save_cache:
paths:
- /tmp/MNIST
key: cache-key-benchmark-MNIST-{{ checksum "benchmarks/datasets/mnist.py"}}
key: cache-key-benchmark-MNIST-{{.Environment.CACHE_VERSION }}-{{checksum "benchmarks/datasets/mnist.py"}}
benchmarks_2:
<<: *gpu_cu_11_2_medium_multi
......@@ -533,27 +505,27 @@ jobs:
- run: nvidia-smi
- setup_pyenv:
version: 3.7.0
version: 3.9.7
- <<: *setup_venv
# Cache the venv directory that contains dependencies
- restore_cache:
keys:
- cache-key-py37-benchmarks-torch-1-10-0-cuda-11-2-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
- cache-key-py-3-9-7-benchmarks-torch-1-10-0-cuda-11-2-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
# Cache the MNIST directory that contains benchmark data
- restore_cache:
keys:
- cache-key-benchmark-MNIST-{{ checksum "benchmarks/datasets/mnist.py"}}
- cache-key-benchmark-MNIST-{{.Environment.CACHE_VERSION }}-{{checksum "benchmarks/datasets/mnist.py"}}
- <<: *install_dep_1_10_0
- save_cache:
paths:
- ~/venv
key: cache-key-py37-benchmarks-torch-1-10-0-cuda-11-2-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
key: cache-key-py-3-9-7-benchmarks-torch-1-10-0-cuda-11-2-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
- <<: *install_repo
......@@ -562,7 +534,7 @@ jobs:
- save_cache:
paths:
- /tmp/MNIST
key: cache-key-benchmark-MNIST-{{ checksum "benchmarks/datasets/mnist.py"}}
key: cache-key-benchmark-MNIST-{{.Environment.CACHE_VERSION}}-{{checksum "benchmarks/datasets/mnist.py"}}
workflows:
......
name: pre-commit
on:
pull_request:
push:
branches: [main]
jobs:
pre-commit:
runs-on: ubuntu-latest
strategy:
matrix:
# make sure python versions are consistent with those used in .circleci/config.yml
python-version: ['3.7.12', '3.8.12', '3.9.7']
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- uses: pre-commit/action@v2.0.3
......@@ -8,7 +8,7 @@ default_language_version:
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.5.0
rev: v4.0.1
hooks:
- id: trailing-whitespace
- id: check-ast
......@@ -20,29 +20,31 @@ repos:
- id: end-of-file-fixer
- repo: https://github.com/ambv/black
rev: 19.10b0
rev: 21.10b0
hooks:
- id: black
- repo: https://gitlab.com/pycqa/flake8
rev: 3.7.9
rev: 4.0.1
hooks:
- id: flake8
args: [--show-source, --statistics]
- repo: https://github.com/asottile/seed-isort-config
rev: v2.1.0
rev: v2.2.0
hooks:
- id: seed-isort-config
- repo: https://github.com/pycqa/isort
rev: 5.6.4
rev: 5.10.1
hooks:
- id: isort
exclude: README.md
additional_dependencies: [toml]
- repo: https://github.com/pre-commit/mirrors-mypy
rev: 'v0.790'
rev: 'v0.910'
hooks:
- id: mypy
args: [--no-strict-optional, --ignore-missing-imports, --scripts-are-modules, --pretty]
additional_dependencies: [numpy]
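With the mirrors-mypy hook bumped and the new args in place, the type check can be exercised on its own via the hook id defined above. A small sketch, shown as a CircleCI-style step purely for illustration (not part of this diff); `pre-commit run <hook-id> --all-files` is the standard way to run a single hook:

```yaml
- run:
    name: Run only the mypy pre-commit hook
    command: |
      pre-commit run mypy --all-files
```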
......@@ -42,7 +42,10 @@ class BenchmarkLMDataset(Dataset):
"""
def __init__(
self, vocab_size=10000, max_source_positions=1024, total_samples=10000,
self,
vocab_size=10000,
max_source_positions=1024,
total_samples=10000,
):
self.vocab_size = vocab_size
self.max_source_positions = max_source_positions
......
......@@ -35,7 +35,7 @@ KERNELS = [
def run_on_gpu(kernel, data, repeats, no_grad, fwd_bwd):
""" Measure both GPU runtime and peak memory usage of a kernel. """
"""Measure both GPU runtime and peak memory usage of a kernel."""
tokens = data[0].shape[0]
def get_cuda_data():
......
......@@ -142,7 +142,7 @@ class MySGD(Optimizer):
super(MySGD, self).__setstate__(state)
def step(self, closure=None):
""" Performs a single optimization step.
"""Performs a single optimization step.
Args:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
......@@ -234,7 +234,7 @@ class SpectrainSGDMomentum(Optimizer):
p.data.sub_(param_state["momentum_buffer"].data, alpha=multiplier)
def step(self, weight_prediction=True, closure=None):
""" Performs a single optimization step.
"""Performs a single optimization step.
Args:
weight_prediction (bool, optional): Enable weight prediction based updates
closure (callable, optional): A closure that reevaluates the model
......
......@@ -413,7 +413,10 @@ parser.add_argument(
help="Print debugging statements which is more verbose than the default.",
)
parser.add_argument(
"--model_name", default="lm", type=str, help="Language Model(LM) used to benchmark nn.pipe.",
"--model_name",
default="lm",
type=str,
help="Language Model(LM) used to benchmark nn.pipe.",
)
parser.add_argument(
"--use_synthetic_data", default=True, action="store_true", help="Uses synthetic data for running benchmarks."
......
......@@ -320,14 +320,22 @@ if __name__ == "__main__":
if args.optim_type == OptimType.oss_ddp or args.optim_type == OptimType.everyone:
logging.info("\n*** Benchmark OSS with DDP")
mp.spawn(
train, args=(args, BACKEND, OptimType.oss_ddp, args.check_regression), nprocs=args.world_size, join=True, # type: ignore
train,
args=(args, BACKEND, OptimType.oss_ddp, args.check_regression),
nprocs=args.world_size,
join=True, # type: ignore
)
if args.optim_type == OptimType.oss_sharded_ddp or args.optim_type == OptimType.everyone:
logging.info("\n*** Benchmark OSS with ShardedDDP")
mp.spawn(
train, # type: ignore
args=(args, BACKEND, OptimType.oss_sharded_ddp, args.check_regression,),
args=(
args,
BACKEND,
OptimType.oss_sharded_ddp,
args.check_regression,
),
nprocs=args.world_size,
join=True,
)
......@@ -34,13 +34,20 @@ from fairscale.nn.pipe.worker import Task
def create_task_without_skip_trackers(
checkpoint_stop: int, i: int, j: int, batch: Batch, partition: nn.Sequential,
checkpoint_stop: int,
i: int,
j: int,
batch: Batch,
partition: nn.Sequential,
) -> Task:
# Determine whether checkpointing or not.
if i < checkpoint_stop:
def function(
input: TensorOrTensors, partition: nn.Sequential = partition, chunk_id: int = i, part_id: int = j,
input: TensorOrTensors,
partition: nn.Sequential = partition,
chunk_id: int = i,
part_id: int = j,
) -> TensorOrTensors:
with record_function("chunk%d-part%d" % (chunk_id, part_id)):
return partition(input)
......@@ -52,7 +59,10 @@ def create_task_without_skip_trackers(
else:
def compute(
batch: Batch = batch, partition: nn.Sequential = partition, chunk_id: int = i, part_id: int = j,
batch: Batch = batch,
partition: nn.Sequential = partition,
chunk_id: int = i,
part_id: int = j,
) -> Batch:
with record_function("chunk%d-part%d" % (chunk_id, part_id)):
return batch.call(partition)
......@@ -93,7 +103,11 @@ class AsyncAMPnetEventLoop:
def async_send_inner(self, batch: Batch, index: int) -> Tuple[Batch, PipeMessage]:
task = create_task_without_skip_trackers(
self.checkpoint_stop, index, self.group.rank(), batch, self.partitions[0].module,
self.checkpoint_stop,
index,
self.group.rank(),
batch,
self.partitions[0].module,
)
result = task.compute()
task.finalize(result)
......@@ -258,7 +272,11 @@ class AsyncAMPnetEventLoop:
if self.weight_prediction:
optimizer.update_weight_using_future_predictions(cur_rank, N, count, self.chunks, forward=True)
task = create_task_without_skip_trackers(
self.checkpoint_stop, args.microbatch_index, self.group.rank(), batch, self.partitions[0].module,
self.checkpoint_stop,
args.microbatch_index,
self.group.rank(),
batch,
self.partitions[0].module,
)
output = task.compute()
activations[args.microbatch_index] = output
......
......@@ -46,8 +46,8 @@ class AMPnetPipe(AsyncPipe):
assert self.group
rank = self.group.rank()
transport = self.pipeline.transport # type: ignore
checkpoint_stop = self.pipeline.checkpoint_stop # type: ignore
transport = self.pipeline.transport
checkpoint_stop = self.pipeline.checkpoint_stop
ampnet_event_loop = AsyncAMPnetEventLoop(
partitions,
self.group,
......
......@@ -312,7 +312,7 @@ class SlowMoDistributedDataParallel(Module):
self.logger.debug("Initialization of SlowMoDistributedDataParallel complete")
def _initialize_logger(self, verbose: bool, process_rank: int) -> None:
""" Initializes the logger """
"""Initializes the logger"""
self.logger = logging.getLogger(__name__)
if verbose:
self.logger.setLevel(logging.DEBUG)
......@@ -331,7 +331,7 @@ class SlowMoDistributedDataParallel(Module):
master_group: Optional[torch.distributed.ProcessGroup],
local_node_group: Optional[torch.distributed.ProcessGroup],
) -> Tuple[int, int]:
""" Creates the process groups required for the SlowMo implementation """
"""Creates the process groups required for the SlowMo implementation"""
self.local_rank = process_rank % self.nprocs_per_node
assert (
......@@ -392,7 +392,12 @@ class SlowMoDistributedDataParallel(Module):
self.logger.debug("Initializing local process groups")
for node in range(logical_world_size):
node_processes_ranks = list(range(node * self.nprocs_per_node, (node + 1) * self.nprocs_per_node,))
node_processes_ranks = list(
range(
node * self.nprocs_per_node,
(node + 1) * self.nprocs_per_node,
)
)
# Process group to communicate between processes on this machine
new_local_group = create_process_group(node_processes_ranks)
if process_rank in node_processes_ranks:
......@@ -401,24 +406,26 @@ class SlowMoDistributedDataParallel(Module):
self.logger.debug("Initialization of local groups complete")
def forward(self, *inputs: Any, **kwargs: Any) -> Union[torch.Tensor, List[torch.Tensor]]:
""" Forward pass performed in parallel across all devices on node """
"""Forward pass performed in parallel across all devices on node"""
return self.module(*inputs, **kwargs)
def _sync_params(self) -> None:
""" Synchronize parameters across devices (intra-node) """
"""Synchronize parameters across devices (intra-node)"""
if self.local_node_group is None:
return
# intra-node parameter sync
params = cast(List[torch.Tensor], list(self.module.parameters()))
communication_op = functools.partial(
dist.broadcast, src=self.logical_rank * self.nprocs_per_node, group=self.local_node_group,
dist.broadcast,
src=self.logical_rank * self.nprocs_per_node,
group=self.local_node_group,
)
communicate(params, communication_op)
self.logger.debug("Intra-node param sync complete")
def _sync_buffers(self) -> None:
""" Synchronize buffers across nodes """
"""Synchronize buffers across nodes"""
# module buffer sync
if self.broadcast_buffers and len(self.module_buffers) > 0:
# Synchronize buffers across processes.
......@@ -432,17 +439,18 @@ class SlowMoDistributedDataParallel(Module):
dist._broadcast_coalesced(process_group, tensors, buffer_size)
def _create_event_recorder(self, event_name: str) -> EventRecorder:
""" Creates an cuda event recorder which helps in profiling """
"""Creates an cuda event recorder which helps in profiling"""
return create_event_recorder(event_name, dummy=not self.profile_mode)
def _fp16_fp32_iterator(
self, optimizer: torch.optim.Optimizer, fp32_params: Optional[torch.Tensor]
) -> Iterable[Tuple[torch.Tensor, torch.Tensor]]:
""" Iterator for those fp16 parameters which have a fp32 copy """
"""Iterator for those fp16 parameters which have a fp32 copy"""
# Handle apex fp16 optimizer
if hasattr(optimizer, "_amp_stash") and hasattr(optimizer._amp_stash, "fp16_groups"):
for p_fp16_group, p_fp32_group in zip(
optimizer._amp_stash.fp16_groups, optimizer._amp_stash.fp32_from_fp16_groups,
optimizer._amp_stash.fp16_groups,
optimizer._amp_stash.fp32_from_fp16_groups,
):
for p_fp16, p_fp32 in zip(p_fp16_group, p_fp32_group):
yield p_fp16, p_fp32
......@@ -594,7 +602,7 @@ class SlowMoDistributedDataParallel(Module):
ef1.copy_(p_fp32 - p_fp16.float())
def perform_slowmo(self, optimizer: torch.optim.Optimizer, fp32_params: Optional[torch.Tensor] = None) -> None:
""" This is to be called after optimizer.step(). It performs the approximate averaging using
"""This is to be called after optimizer.step(). It performs the approximate averaging using
the base algorithm (SGP/ LocalSGD) and the slow momentum step. Since LocalSGD and the slow
momentum step are not performed every iteration, it only performs those when needed.
......@@ -645,7 +653,7 @@ class SlowMoDistributedDataParallel(Module):
self.num_updates += 1
def _init_global_momentum_buffers(self, optimizer: torch.optim.Optimizer) -> None:
""" Initializes the slow momentum buffers """
"""Initializes the slow momentum buffers"""
self.global_momentum_buffers_initialized = True
if not self.slowmo:
......@@ -707,7 +715,7 @@ class SlowMoDistributedDataParallel(Module):
self.global_momentum_buffer = torch.zeros_like(self.old_params).detach()
def _distributed_comm(self, optimizer: torch.optim.Optimizer, mode: str) -> None:
""" Performs the communication needed for the efficient SlowMo implementation """
"""Performs the communication needed for the efficient SlowMo implementation"""
offset = 0
slowmo_comm_lists: List[List[torch.Tensor]] = [[] for _ in range(self.slowmo_num_shards)]
with torch.no_grad():
......@@ -743,7 +751,7 @@ class SlowMoDistributedDataParallel(Module):
communicate(slowmo_comm_list, communication_op)
def _global_momentum_step(self, optimizer: torch.optim.Optimizer) -> None:
""" Performs the slow momentum step """
"""Performs the slow momentum step"""
if not self.slowmo:
return
......@@ -760,7 +768,7 @@ class SlowMoDistributedDataParallel(Module):
self._distributed_comm(optimizer, mode="scatter")
def _perform_local_optimization(self, optimizer: torch.optim.Optimizer) -> None:
""" Performs the slow momentum on the local shard """
"""Performs the slow momentum on the local shard"""
assert self.portion_start is not None
with torch.no_grad():
......@@ -838,7 +846,7 @@ class SlowMoDistributedDataParallel(Module):
self.logger.debug("making forward pre-hook")
def hook(*unused: Any) -> None:
""" Query gossip queue and de-bias during forward pass """
"""Query gossip queue and de-bias during forward pass"""
# sync buffers before the forward pass
self._sync_buffers()
......@@ -869,7 +877,7 @@ class SlowMoDistributedDataParallel(Module):
use_streams: bool = True,
slowmo_sgp_average_params: bool = False,
) -> None:
""" Perform initialization for Stochastic Gradient Push base algorithm """
"""Perform initialization for Stochastic Gradient Push base algorithm"""
if graph is None:
graph = NPDDEGraph(logical_rank, logical_world_size, self.nprocs_per_node, self.local_rank)
......@@ -959,7 +967,7 @@ class SlowMoDistributedDataParallel(Module):
super(SlowMoDistributedDataParallel, self).load_state_dict(cast(Dict[str, torch.Tensor], state_dict))
def _sgp_ps_numerator(self) -> None:
""" Convert model params to ps-numerator """
"""Convert model params to ps-numerator"""
if not self.is_sgp_ps_numerator:
if not self.lazy_mixing:
ps_weight = self.ps_weight
......@@ -969,7 +977,7 @@ class SlowMoDistributedDataParallel(Module):
self.is_sgp_ps_numerator = True
def _sgp_unbias(self) -> None:
""" Convert model params to de-biased estimate """
"""Convert model params to de-biased estimate"""
if self.is_sgp_ps_numerator:
if not self.lazy_mixing:
ps_weight = self.ps_weight
......@@ -992,7 +1000,7 @@ class SlowMoDistributedDataParallel(Module):
return self
def _sgp_query_gossip_queue(self, non_blocking: bool = False) -> bool:
""" Check gossip-queue for push-sum residuals and update model """
"""Check gossip-queue for push-sum residuals and update model"""
if not self.gossip_enable:
return False
......@@ -1046,7 +1054,7 @@ class SlowMoDistributedDataParallel(Module):
return False
def _sgp_transfer_params(self, mix: bool = True) -> bool:
""" Transfers COPY of model parameters to gossip queue """
"""Transfers COPY of model parameters to gossip queue"""
if not self.gossip_enable or self.process_rank % self.nprocs_per_node != 0:
return False
......@@ -1130,7 +1138,7 @@ class SlowMoDistributedDataParallel(Module):
gossip_ps_factor: torch.Tensor,
gossip_stream: torch.cuda.Stream,
) -> None:
""" Gossip thread, which performs push-sum on model params """
"""Gossip thread, which performs push-sum on model params"""
logger = make_logger(dist_config["logical_rank"], dist_config["verbose"])
gossip_params_by_dtype = group_by_dtype(gossip_params)
......
......@@ -30,7 +30,7 @@ class dist_backend(str, Enum):
class Gossiper(object):
""" Generic gossip averaging object for multi-peer communication
"""Generic gossip averaging object for multi-peer communication
Args:
msg (torch.Tensor): message used to initialize recv buffer
......@@ -121,7 +121,7 @@ class Gossiper(object):
self._graph_manager.peers_per_itr = v
def refresh_peers_(self, rotate: Optional[bool] = None) -> None:
""" Update in- and out-peers """
"""Update in- and out-peers"""
if rotate is None:
rotate = self._graph_manager.is_dynamic_graph()
# cannot cycle peers in a static graph
......@@ -129,11 +129,11 @@ class Gossiper(object):
self.out_edges, self.in_edges = self._graph_manager.get_edges(rotate)
def refresh_mixing_weights_(self, residual_adjusted: bool = False) -> None:
""" Update mixing-matrix weights """
"""Update mixing-matrix weights"""
self.mixing_weights = self._mixing_manager.get_mixing_weights(residual_adjusted)
def mix_out_msg_(self, out_msg: torch.Tensor, ps_weight: torch.Tensor) -> Iterator[torch.Tensor]:
""" Returns a generator mixing messages on the fly """
"""Returns a generator mixing messages on the fly"""
self.refresh_mixing_weights_(residual_adjusted=True)
self.ps_weight = ps_weight
......@@ -153,14 +153,14 @@ class Gossiper(object):
yield out_msg.mul(weight.type(out_msg.dtype)) # type: ignore
def clean_msg_buffers_(self) -> None:
""" Clean outgoing message buffer """
"""Clean outgoing message buffer"""
while len(self.out_msg_buffer) > 0:
req, msg = self.out_msg_buffer.pop()
req.wait()
msg.set_()
def parse_in_msg_buffer(self) -> Tuple[torch.Tensor, torch.Tensor]:
""" Parse in-msg buffer and return msg and ps-weight separately """
"""Parse in-msg buffer and return msg and ps-weight separately"""
msg = self.in_msg_buffer
if not self.regular:
return msg.narrow(0, 0, len(msg) - 1), msg[-1]
......@@ -168,15 +168,15 @@ class Gossiper(object):
return msg, self.ps_weight * self.peers_per_itr_device
def mix(self, out_msg: torch.Tensor, ps_weight: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
""" Single gossip step """
"""Single gossip step"""
raise NotImplementedError
class PushSum(Gossiper):
""" 1-peer Push-Sum consensus averaging module """
"""1-peer Push-Sum consensus averaging module"""
def mix(self, out_msg: torch.Tensor, ps_weight: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
""" Consensus averaging step """
"""Consensus averaging step"""
# out_msg must be on the correct device
assert out_msg.device.type == self.device.type
if self.logger is not None:
......@@ -189,7 +189,12 @@ class PushSum(Gossiper):
for out_edge in self.out_edges:
msg = next(mixed_out_msgs)
assert self.rank == out_edge.src
req = dist.broadcast(tensor=msg, src=out_edge.src, group=out_edge.process_group, async_op=True,)
req = dist.broadcast(
tensor=msg,
src=out_edge.src,
group=out_edge.process_group,
async_op=True,
)
self.out_msg_buffer.append((req, msg))
# blocking recv w/ some code optimization to avoid buffer prep overhead
......@@ -204,7 +209,9 @@ class PushSum(Gossiper):
for in_edge in self.in_edges:
dist.broadcast(
tensor=self.placeholder, src=in_edge.src, group=in_edge.process_group,
tensor=self.placeholder,
src=in_edge.src,
group=in_edge.process_group,
)
self.in_msg_buffer.add_(self.placeholder) # type: ignore
......@@ -214,7 +221,7 @@ class PushSum(Gossiper):
class PushPull(Gossiper):
""" Doubly-stochastic consensus averaging module """
"""Doubly-stochastic consensus averaging module"""
def mix(self, out_msg: torch.Tensor, ps_weight: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
# out_msg must be on the correct device
......@@ -232,11 +239,15 @@ class PushPull(Gossiper):
if not self.passive:
dist.broadcast(tensor=msg, src=out_edge.src, group=out_edge.process_group)
dist.broadcast(
tensor=self.in_msg_buffer, src=in_edge.src, group=in_edge.process_group,
tensor=self.in_msg_buffer,
src=in_edge.src,
group=in_edge.process_group,
)
else:
dist.broadcast(
tensor=self.in_msg_buffer, src=in_edge.src, group=in_edge.process_group,
tensor=self.in_msg_buffer,
src=in_edge.src,
group=in_edge.process_group,
)
dist.broadcast(tensor=msg, src=out_edge.src, group=out_edge.process_group)
......@@ -251,11 +262,15 @@ class PushPull(Gossiper):
if not self.passive:
dist.broadcast(tensor=msg, src=out_edge.src, group=out_edge.process_group)
dist.broadcast(
tensor=self.placeholder, src=in_edge.src, group=in_edge.process_group,
tensor=self.placeholder,
src=in_edge.src,
group=in_edge.process_group,
)
else:
dist.broadcast(
tensor=self.placeholder, src=in_edge.src, group=in_edge.process_group,
tensor=self.placeholder,
src=in_edge.src,
group=in_edge.process_group,
)
dist.broadcast(tensor=msg, src=out_edge.src, group=out_edge.process_group)
self.in_msg_buffer.add_(self.placeholder) # type: ignore
......
......@@ -77,26 +77,26 @@ class GraphManager(ABC):
@abstractmethod
def is_regular_graph(self) -> bool:
""" Whether each node has the same number of in-peers as out-peers """
"""Whether each node has the same number of in-peers as out-peers"""
raise NotImplementedError
@abstractmethod
def is_bipartite_graph(self) -> bool:
""" Whether graph is bipartite or not """
"""Whether graph is bipartite or not"""
raise NotImplementedError
@abstractmethod
def is_passive(self, rank: Optional[int] = None) -> bool:
""" Whether 'rank' is a passive node or not """
"""Whether 'rank' is a passive node or not"""
raise NotImplementedError
@abstractmethod
def is_dynamic_graph(self) -> bool:
""" Whether the graph-type is dynamic (as opposed to static) """
"""Whether the graph-type is dynamic (as opposed to static)"""
raise NotImplementedError
def get_peers(self, rotate: bool = False) -> Tuple[List[int], List[int]]:
""" Returns the out and in-peers corresponding to 'self.rank' """
"""Returns the out and in-peers corresponding to 'self.rank'"""
# cycle through in- and out-peers by updating group-index
if rotate:
self._rotate_group_indices()
......@@ -113,8 +113,8 @@ class GraphManager(ABC):
return out_peers, in_peers
def get_edges(self, rotate: bool = False) -> Tuple[List[Edge], List[Edge]]:
""" Returns the pairwise process groups between rank and the out and
in-peers corresponding to 'self.rank' """
"""Returns the pairwise process groups between rank and the out and
in-peers corresponding to 'self.rank'"""
# cycle through in- and out-peers by updating group-index
if rotate:
self._rotate_group_indices()
......@@ -131,17 +131,17 @@ class GraphManager(ABC):
return out_edges, in_edges
def _rotate_group_indices(self) -> None:
""" Incerement group indices to point to the next out-peer """
"""Incerement group indices to point to the next out-peer"""
increment = self.peers_per_itr
for i, group_index in enumerate(self._group_indices):
self._group_indices[i] = int((group_index + increment) % len(self.phone_book[self.rank]))
def _rotate_forward(self, r: int, p: int) -> int:
""" Helper function returns peer that is p hops ahead of r """
"""Helper function returns peer that is p hops ahead of r"""
return (r + p) % self.world_size
def _rotate_backward(self, r: int, p: int) -> int:
""" Helper function returns peer that is p hops behind r """
"""Helper function returns peer that is p hops behind r"""
return (r - p) % self.world_size
......
......@@ -32,18 +32,18 @@ class MixingManager(ABC):
@abstractmethod
def is_uniform(self) -> bool:
""" Whether mixing weights are distributed uniformly over peers """
"""Whether mixing weights are distributed uniformly over peers"""
raise NotImplementedError
@abstractmethod
def get_mixing_weights(self, residual_adjusted: bool = True) -> Dict[Union[str, int], torch.Tensor]:
""" Create mixing weight dictionary using uniform allocation """
"""Create mixing weight dictionary using uniform allocation"""
raise NotImplementedError
class UniformMixing(MixingManager):
def get_mixing_weights(self, residual_adjusted: bool = True) -> Dict[Union[str, int], torch.Tensor]:
""" Create mixing weight dictionary using uniform allocation """
"""Create mixing weight dictionary using uniform allocation"""
mixing_weights: Dict[Union[str, int], torch.Tensor] = {}
out_peers, _ = self.graph_manager.get_peers()
......
......@@ -36,7 +36,7 @@ def create_event_recorder(event_name: str, dummy: bool = False) -> EventRecorder
class CudaEventRecorder(EventRecorder):
""" Allows profiling in an easy-to-use manner. CudaEventRecorder can be used
"""Allows profiling in an easy-to-use manner. CudaEventRecorder can be used
in a loop. When it is used in a loop (or when an event recorder is created
multiple times with the same name), get_timings returns the statistics of the
timings since the last reset. Note: in case the number of timings is greater than
......@@ -92,19 +92,22 @@ class CudaEventRecorder(EventRecorder):
time_taken_list = [event_recorder.find_time_elapsed() for event_recorder in event_recorder_list]
all_timings_str += ("{}: Time taken: avg: {}, std: {}, count: " "{}\n").format(
event_name, statistics.mean(time_taken_list), statistics.pstdev(time_taken_list), len(time_taken_list),
event_name,
statistics.mean(time_taken_list),
statistics.pstdev(time_taken_list),
len(time_taken_list),
)
return all_timings_str
@classmethod
def get_timings(cls) -> str:
""" Returns the timings since last reset was called """
"""Returns the timings since last reset was called"""
return cls.get_common_timings(cls.event_recorders, "Timings since last reset")
@classmethod
def get_all_timings(cls) -> str:
""" Returns the statistics of all the timings """
"""Returns the statistics of all the timings"""
return cls.get_common_timings(cls.all_event_recorders, "All timings")
......
......@@ -86,7 +86,10 @@ def communicate(tensors: List[torch.Tensor], communication_op: Any, logger: logg
if logger is not None:
logger.debug("Commmunication completed")
with torch.no_grad():
for f, t in zip(unflatten_tensors(flat_tensor, tensors_with_same_dtype), tensors_with_same_dtype,):
for f, t in zip(
unflatten_tensors(flat_tensor, tensors_with_same_dtype),
tensors_with_same_dtype,
):
t.copy_(f)
if logger is not None:
logger.debug("Unflatten completed")
......
......@@ -198,7 +198,9 @@ class PipelineModulesGraph(nn.Module):
remote_module = partition[0].module.get_module_rref()
else:
remote_module = rpc.remote(
partition[0].module.on, RemoteSequential, args=([p.module.get_module_rref() for p in partition],),
partition[0].module.on,
RemoteSequential,
args=([p.module.get_module_rref() for p in partition],),
)
partitions.append((partition, remote_module))
......
......@@ -25,7 +25,7 @@ ExcInfo = Tuple[Type[BaseException], BaseException, TracebackType]
class DistributedPipelineRecord:
""" A class for storing a single mini-batch (consisting of multiple micro-batches) as input to
"""A class for storing a single mini-batch (consisting of multiple micro-batches) as input to
a single partition.
Args:
device: the local device that runs the partition.
......@@ -73,7 +73,7 @@ class DistributedPipelineRecord:
return {}
def feed(self, chunk: int, input_idx: int, input: Tensor) -> Tensor:
""" This function is called remotely to provide individual tensors of a given chunk."""
"""This function is called remotely to provide individual tensors of a given chunk."""
if input.device.type == "cpu":
input = input.to(self.device)
cuda_stream = torch.cuda.current_stream(input.device) if input.device.type == "cuda" else None
......
......@@ -70,7 +70,12 @@ class DistributedPipeline(nn.Module):
DataConsumer = DataConsumer[Partition]
def __init__(self, graph: PipelineModulesGraph, chunks: int = 1, checkpoint: str = "except_last",) -> None:
def __init__(
self,
graph: PipelineModulesGraph,
chunks: int = 1,
checkpoint: str = "except_last",
) -> None:
super().__init__()
check_pytorch_version()
......