Unverified commit 72f373c1, authored by Paul Johnson, committed by GitHub

Remove seed-isort-config and related dependencies. (#969)

seed-isort-config is no longer needed now that isort is at version 5.10.

Also pin black to 22.3.0 to fix an issue with the click dependency.

Update files that now fail with the new version of black: a = 2 ** 4 -> a = 2**4 (illustrated below).
parent 1bc96fa8
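For reference: black 22.x removes the spaces around the power operator when both operands are simple (a name, a numeric literal, or an attribute access) and keeps them otherwise, which is why expressions such as total_norm ** (1.0 / norm_type) are left untouched in the hunks below. A minimal sketch of the rule (standalone example, not taken from this repository):

    a = 2 ** 4          # as formatted by black 21.x
    a = 2**4            # as reformatted by black 22.x (same value: 16)

    gb_denom = 1024 ** 3    # black 21.x
    gb_denom = 1024**3      # black 22.x

    # Either operand not "simple" (here a parenthesized expression): spaces are kept,
    # so black 22.x leaves this line alone.
    norm_type = 2.0
    total_norm = 9.0
    total_norm = total_norm ** (1.0 / norm_type)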
@@ -23,7 +23,7 @@ repos:
       - id: end-of-file-fixer
   - repo: https://github.com/ambv/black
-    rev: 21.10b0
+    rev: 22.3.0
     hooks:
       - id: black
@@ -33,11 +33,6 @@ repos:
       - id: flake8
         args: [--show-source, --statistics]
-  - repo: https://github.com/asottile/seed-isort-config
-    rev: v2.2.0
-    hooks:
-      - id: seed-isort-config
   - repo: https://github.com/pycqa/isort
     rev: 5.10.1
     hooks:
...
@@ -4,11 +4,11 @@
 # LICENSE file in the root directory of this source tree.
 from collections import namedtuple
+from distutils.version import LooseVersion
 import io
 import operator
 import tempfile
-from distutils.version import LooseVersion
 import torch
 from torch.utils.data import DataLoader
 from torch.utils.data.distributed import DistributedSampler
...
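The import reshuffle above is a side effect of the isort config change at the end of this diff: with known_third_party removed, isort 5.10 classifies distutils as standard library on its own, so under force_sort_within_sections the from-import sorts alphabetically into the stdlib block instead of sitting next to torch. A sketch of the resulting ordering under that assumption (module list taken from the hunk above):

    # stdlib block: "import x" and "from x import y" sorted together by module name
    from collections import namedtuple
    from distutils.version import LooseVersion  # distutils counts as stdlib for isort 5
    import io
    import operator
    import tempfile

    # third-party block
    import torch
    from torch.utils.data import DataLoader
    from torch.utils.data.distributed import DistributedSampler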
@@ -135,11 +135,11 @@ def train_seq(model_config, benchmark_config, model_specs, args):
         loss.backward()
         optimizer.step()
         logging.info(
-            "Memory stats are {:.2f}GB".format(torch.cuda.memory_stats(0)["allocated_bytes.all.peak"] / 2 ** 30)
+            "Memory stats are {:.2f}GB".format(torch.cuda.memory_stats(0)["allocated_bytes.all.peak"] / 2**30)
         )
         logging.info(
             "Loss {:.2f} - throughput {:.2f}fps".format(
-                loss.item(), benchmark_config["batch_size"] / (time.time_ns() - start) * 10 ** 9
+                loss.item(), benchmark_config["batch_size"] / (time.time_ns() - start) * 10**9
             )
         )
         num_iters -= 1
...
@@ -267,7 +267,7 @@ def benchmark_language_model(model_config, model, benchmark_config, model_specs,
     print("Throughput(wps) is {:.2f}.".format(wps))
     print(
         "Peak allocated bytes on cuda:{}: {:4f}GB".format(
-            dist.get_rank(), torch.cuda.memory_stats(dist.get_rank())["allocated_bytes.all.peak"] / 2 ** 30
+            dist.get_rank(), torch.cuda.memory_stats(dist.get_rank())["allocated_bytes.all.peak"] / 2**30
         )
     )
...
@@ -97,7 +97,7 @@ class FSDP:
         return {
             "avg_wps": 486.303,
             "std_dev_wps": 71.307,
-            "peak_mem_usage": [5.5055 * 2 ** 30, 5.5055 * 2 ** 30, 5.5055 * 2 ** 30, 5.5055 * 2 ** 30],
+            "peak_mem_usage": [5.5055 * 2**30, 5.5055 * 2**30, 5.5055 * 2**30, 5.5055 * 2**30],
         }
...
@@ -89,7 +89,7 @@ def validate_benchmark(measurements, final_loss, args, check_regression):
     if not args.cpu:
         # TODO(anj-s): Check if we need to synchronize before we caculate total training time.
         torch.cuda.synchronize(rank)
-    max_memory = torch.cuda.max_memory_allocated(rank) / 2 ** 20
+    max_memory = torch.cuda.max_memory_allocated(rank) / 2**20
     logging.info(f"[{rank}] : Peak memory {max_memory:.1f}MiB")
     measurements.sort()
...
@@ -149,8 +149,8 @@ class DynamicDirectedExponentialGraph(GraphManager):
     def _make_graph(self) -> None:
         for rank in range(self.world_size):
             for i in range(0, int(mlog(self.world_size - 1, 2)) + 1):
-                f_peer = self._rotate_forward(rank, 2 ** i)
-                b_peer = self._rotate_backward(rank, 2 ** i)
+                f_peer = self._rotate_forward(rank, 2**i)
+                b_peer = self._rotate_backward(rank, 2**i)
                 self._add_peers(rank, [f_peer, b_peer])

     def is_regular_graph(self) -> bool:
@@ -196,8 +196,8 @@ class DynamicBipartiteExponentialGraph(GraphManager):
                     f_peer = self._rotate_forward(rank, 1)
                     b_peer = self._rotate_backward(rank, 1)
                 else:
-                    f_peer = self._rotate_forward(rank, 1 + 2 ** i)
-                    b_peer = self._rotate_backward(rank, 1 + 2 ** i)
+                    f_peer = self._rotate_forward(rank, 1 + 2**i)
+                    b_peer = self._rotate_backward(rank, 1 + 2**i)
                 # create directory for non-passive peers
                 if not self.is_passive(rank) and (self.is_passive(f_peer) and self.is_passive(b_peer)):
                     self._add_peers(rank, [f_peer, b_peer])
...
@@ -14,7 +14,7 @@ from typing import ClassVar, Deque, Dict, Optional
 import torch

-MAX_LEN_DEQUEUE = 10 ** 4
+MAX_LEN_DEQUEUE = 10**4

 deque_with_max_len_fixed = partial(deque, maxlen=MAX_LEN_DEQUEUE)
...
@@ -36,7 +36,7 @@ class DynamicLossScaler(object):
     def __init__(
         self,
-        init_scale: float = 2.0 ** 15,
+        init_scale: float = 2.0**15,
         scale_factor: float = 2.0,
         scale_window: int = 2000,
         tolerance: float = 0.0,
...
@@ -700,7 +700,7 @@ class FullyShardedDataParallel(nn.Module):
             total_norm = local_norm
             dist.all_reduce(total_norm, op=torch.distributed.ReduceOp.MAX, group=self.process_group)
         else:
-            total_norm = local_norm ** norm_type
+            total_norm = local_norm**norm_type
             dist.all_reduce(total_norm, group=self.process_group)
             total_norm = total_norm ** (1.0 / norm_type)
@@ -2408,7 +2408,7 @@ class FullyShardedDataParallel(nn.Module):
         if restart:
             self._tstart = time.time()
         if self.rank == 0:
-            gb_denom = 1024 ** 3
+            gb_denom = 1024**3
             logging.info(
                 f"{msg} cur={torch.cuda.memory_allocated()/gb_denom: .4f} GB, max={torch.cuda.max_memory_allocated()/gb_denom: .4f} GB, t={time.time()-self._tstart: .1f}"
             )
...
@@ -100,7 +100,7 @@ class ShardedDataParallel(nn.Module):
         process_group: Any = None,
         broadcast_buffers: bool = True,
         sync_models_at_startup: bool = True,
-        reduce_buffer_size: int = 2 ** 23,
+        reduce_buffer_size: int = 2**23,
         auto_refresh_trainable: bool = True,
         reduce_fp16: bool = False,
         warn_on_trainable_params_changed: bool = True,
@@ -178,7 +178,7 @@ class ShardedDataParallel(nn.Module):
         logging.info(
             "ShardedDDP bucket size: {:.2f}M parameters, model size {:.2f}M parameters".format(
-                self._buffer_max_size / 2 ** 20, model_size / 2 ** 20
+                self._buffer_max_size / 2**20, model_size / 2**20
             )
         )
         self._use_buckets = self._buffer_max_size > 0
...
@@ -71,7 +71,7 @@ class DeferredBatchNorm(_BatchNorm):
         with torch.no_grad():
             self.sum += input.sum(dim)
-            self.sum_squares += (input ** 2).sum(dim)
+            self.sum_squares += (input**2).sum(dim)
         size = input.size().numel() // input.size(1)
         self.counter += size
@@ -89,7 +89,7 @@ class DeferredBatchNorm(_BatchNorm):
             exponential_average_factor = self.momentum
         mean = self.sum / self.counter
-        var = self.sum_squares / self.counter - mean ** 2
+        var = self.sum_squares / self.counter - mean**2
         # Calculate the exponential moving average here.
         m = exponential_average_factor
...
@@ -98,7 +98,7 @@ try:
             assert parameters[0].dtype == torch.float16
             self.optim_type = torch.float16 if precision is Precision.PURE_FP16 else torch.float32
-            self._optim_scale = float(2 ** 16) if precision is Precision.PURE_FP16 else 1.0
+            self._optim_scale = float(2**16) if precision is Precision.PURE_FP16 else 1.0
             self._steps_since_optim_scale_change = 0
             self._optim_scale_update_freq = 2000  # This is the value that GradScaler uses by default
             self._overflow_buf = torch.cuda.IntTensor([0])  # type: ignore
@@ -291,11 +291,10 @@ try:
             if self._steps_since_optim_scale_change == self._optim_scale_update_freq:
                 self._steps_since_optim_scale_change = 0
-                if self._optim_scale < 2 ** 16:
+                if self._optim_scale < 2**16:
                     self._optim_scale *= 2
             return loss

 except ImportError:
     pass
@@ -453,7 +453,7 @@ class AdaScale(Optimizer):
         # accumulation.
         if self._num_grads_to_accum > 1:
             # np array doesn't support /=.
-            total_grad_sqr = total_grad_sqr / (self._num_grads_to_accum ** 2)
+            total_grad_sqr = total_grad_sqr / (self._num_grads_to_accum**2)

         # Wait for all_reduce to be done and move it to cpu & np.
         if work:
...
@@ -76,7 +76,7 @@ class ShardedGradScaler(TorchGradScaler):
     def __init__(
         self,
-        init_scale: float = 2.0 ** 16,
+        init_scale: float = 2.0**16,
         growth_factor: float = 2.0,
         backoff_factor: float = 0.5,
         growth_interval: int = 2000,
...
@@ -289,7 +289,7 @@ class OSS(Optimizer):
             # n_i = sum_rank(a^p)^1/p
             # -> n_total = all_reduce(n_i^p)^(1/p) = sum_i(n_i^p)^1/p = sum_i(sum_rank(a^p))^1/p
             # all reduce over data parallel and model parallel workers
-            total_norm = local_norm ** norm_type
+            total_norm = local_norm**norm_type
             dist.all_reduce(total_norm)
             total_norm = total_norm ** (1.0 / norm_type)
...
@@ -27,4 +27,3 @@ use_parentheses = true
 skip_glob = ["build/*", "stubs/*"]
 # Don't split "import" and "from".
 force_sort_within_sections = true
-known_third_party = ["benchmark_dataset", "datasets", "distutils", "golden_configs", "models", "numpy", "parameterized", "pytest", "recommonmark", "setuptools", "sklearn", "torch", "torchtext", "torchvision", "utils"]
@@ -6,11 +6,10 @@
 # function typing with mypy.
 # - if you change versions below, please make sure it is in-sync with
 # .pre-commit-config.yaml for pre-commit.
-black == 21.10b0
+black == 22.3.0
 flake8 == 4.0.1
 flake8-annotations == 2.7.0
 isort == 5.10.1
-seed-isort-config == 2.2.0
 mypy == 0.910
 pre-commit >= 2.15.0
...
@@ -162,13 +162,13 @@ class TestOptimizerUtils(DistributedTest):
         assert len(no_broadcast_children) == 1, f"Length of non shared params {len(no_broadcast_children)}"
         assert fsdp._fsdp_instances[-1].no_broadcast_optim_state
         torch.cuda.empty_cache()
-        cuda_gb_before = torch.cuda.memory_stats(fsdp.rank)["allocated_bytes.all.current"] / 1024 ** 3
+        cuda_gb_before = torch.cuda.memory_stats(fsdp.rank)["allocated_bytes.all.current"] / 1024**3
         tstart = time()
         sd = fsdp.gather_full_optim_state_dict(fsdp_optim, recipient_rank=0)
         duration = time() - tstart
         assert duration < fsdp.world_size, f"gather optim state took {duration} seconds, suspect change in _consolidate"
-        cuda_gb_after = torch.cuda.memory_stats(fsdp.rank)["allocated_bytes.all.current"] / 1024 ** 3
+        cuda_gb_after = torch.cuda.memory_stats(fsdp.rank)["allocated_bytes.all.current"] / 1024**3
         mem_usg_gb = cuda_gb_after - cuda_gb_before
         assert mem_usg_gb == 0, f"gather_full_optim_state_dict used {mem_usg_gb:.2f} CUDA GB, max allowed is 0"
         assert cuda_gb_after > 0, "got 0 memory usage, logging is broken"
...
@@ -146,7 +146,7 @@ def run_test(backend, device, world_size, broadcast_buffers, grad_accumulation,
 @skip_if_single_gpu
 @pytest.mark.parametrize("broadcast_buffers", [True, False])
 @pytest.mark.parametrize("grad_accumulation", [True, False])
-@pytest.mark.parametrize("reduce_buffer_size", [0, 2 ** 20])
+@pytest.mark.parametrize("reduce_buffer_size", [0, 2**20])
 @pytest.mark.parametrize("optimizer_type", [torch.optim.SGD, SGDWithPausingCompute])
 @pytest.mark.parametrize("reduce_fp16", [False, True])
 @pytest.mark.parametrize(
@@ -204,7 +204,7 @@ def run_test_two_inputs(rank, world_size, backend, device, temp_file_name, reduc
     dist.destroy_process_group()

-@pytest.mark.parametrize("reduce_buffer_size", [0, 2 ** 20])
+@pytest.mark.parametrize("reduce_buffer_size", [0, 2**20])
 @pytest.mark.parametrize("backend", ["gloo", "nccl"])
 @pytest.mark.parametrize("device", available_devices)
 @skip_if_single_gpu
@@ -354,7 +354,7 @@ def run_test_device_change(rank, world_size, backend, device, temp_file_name, re
 @skip_if_no_cuda
 @skip_if_single_gpu
-@pytest.mark.parametrize("reduce_buffer_size", [0, 2 ** 20])
+@pytest.mark.parametrize("reduce_buffer_size", [0, 2**20])
 def test_device_change(reduce_buffer_size):
     # Check that ShardedDDP handles a device change properly
     world_size = 2
@@ -392,7 +392,7 @@ def run_test_training_change(rank, world_size, backend, device, temp_file_name,
 @skip_if_no_cuda
 @skip_if_single_gpu
-@pytest.mark.parametrize("reduce_buffer_size", [0, 2 ** 20])
+@pytest.mark.parametrize("reduce_buffer_size", [0, 2**20])
 def test_training_change(reduce_buffer_size):
     world_size = 2
     backend = "nccl"
@@ -528,7 +528,7 @@ def run_test_gpt2(rank, world_size, backend, device, temp_file_name, reduce_buff
 @skip_if_no_cuda
 @skip_if_single_gpu
 @pytest.mark.parametrize("world_size", [1, 2])
-@pytest.mark.parametrize("reduce_buffer", [2 ** 23, 2 ** 40])
+@pytest.mark.parametrize("reduce_buffer", [2**23, 2**40])
 def test_gpt2(world_size, reduce_buffer):
     # Check that having trainable unused params is fine
     backend = "gloo"
@@ -598,7 +598,7 @@ def run_test_multiple_groups(rank, world_size, tempfile_name, backend, reduce_bu
 @skip_if_less_than_four_gpu
-@pytest.mark.parametrize("reduce_buffer_size", [0, 2 ** 20])
+@pytest.mark.parametrize("reduce_buffer_size", [0, 2**20])
 @pytest.mark.parametrize("backend", ["gloo", "nccl"])
 def test_multiple_groups(reduce_buffer_size, backend):
     world_size = 4
...