Unverified commit 72f373c1, authored by Paul Johnson, committed by GitHub

Remove seed-isort-config and related dependencies. (#969)

seed-isort-config is no longer needed now that isort is at version 5.10.

Also pin black to 22.3.0 to fix an issue with the click dependency.

Update files that now fail with the new version of black: a = 2 ** 4 -> a = 2**4 (illustrated below).
parent 1bc96fa8
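For reference: black 22.x removes the spaces around the power operator when both operands are simple (a name, a numeric literal, or an attribute access) and keeps them otherwise, which is why expressions such as total_norm ** (1.0 / norm_type) are left untouched in the hunks below. A minimal sketch of the rule (standalone example, not taken from this repository):

    a = 2 ** 4          # as formatted by black 21.x
    a = 2**4            # as reformatted by black 22.x (same value: 16)

    gb_denom = 1024 ** 3    # black 21.x
    gb_denom = 1024**3      # black 22.x

    # Either operand not "simple" (here a parenthesized expression): spaces are kept,
    # so black 22.x leaves this line alone.
    norm_type = 2.0
    total_norm = 9.0
    total_norm = total_norm ** (1.0 / norm_type)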
@@ -23,7 +23,7 @@ repos:
       - id: end-of-file-fixer
   - repo: https://github.com/ambv/black
-    rev: 21.10b0
+    rev: 22.3.0
     hooks:
       - id: black
@@ -33,11 +33,6 @@ repos:
       - id: flake8
         args: [--show-source, --statistics]
-  - repo: https://github.com/asottile/seed-isort-config
-    rev: v2.2.0
-    hooks:
-      - id: seed-isort-config
   - repo: https://github.com/pycqa/isort
     rev: 5.10.1
     hooks:
...
@@ -4,11 +4,11 @@
 # LICENSE file in the root directory of this source tree.
 from collections import namedtuple
+from distutils.version import LooseVersion
 import io
 import operator
 import tempfile
-from distutils.version import LooseVersion
 import torch
 from torch.utils.data import DataLoader
 from torch.utils.data.distributed import DistributedSampler
...
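The import reshuffle above is a side effect of the isort config change at the end of this diff: with known_third_party removed, isort 5.10 classifies distutils as standard library on its own, so under force_sort_within_sections the from-import sorts alphabetically into the stdlib block instead of sitting next to torch. A sketch of the resulting ordering under that assumption (module list taken from the hunk above):

    # stdlib block: "import x" and "from x import y" sorted together by module name
    from collections import namedtuple
    from distutils.version import LooseVersion  # distutils counts as stdlib for isort 5
    import io
    import operator
    import tempfile

    # third-party block
    import torch
    from torch.utils.data import DataLoader
    from torch.utils.data.distributed import DistributedSampler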
@@ -135,11 +135,11 @@ def train_seq(model_config, benchmark_config, model_specs, args):
         loss.backward()
         optimizer.step()
         logging.info(
-            "Memory stats are {:.2f}GB".format(torch.cuda.memory_stats(0)["allocated_bytes.all.peak"] / 2 ** 30)
+            "Memory stats are {:.2f}GB".format(torch.cuda.memory_stats(0)["allocated_bytes.all.peak"] / 2**30)
         )
         logging.info(
             "Loss {:.2f} - throughput {:.2f}fps".format(
-                loss.item(), benchmark_config["batch_size"] / (time.time_ns() - start) * 10 ** 9
+                loss.item(), benchmark_config["batch_size"] / (time.time_ns() - start) * 10**9
             )
         )
         num_iters -= 1
...
@@ -267,7 +267,7 @@ def benchmark_language_model(model_config, model, benchmark_config, model_specs,
     print("Throughput(wps) is {:.2f}.".format(wps))
     print(
         "Peak allocated bytes on cuda:{}: {:4f}GB".format(
-            dist.get_rank(), torch.cuda.memory_stats(dist.get_rank())["allocated_bytes.all.peak"] / 2 ** 30
+            dist.get_rank(), torch.cuda.memory_stats(dist.get_rank())["allocated_bytes.all.peak"] / 2**30
         )
     )
...
@@ -97,7 +97,7 @@ class FSDP:
         return {
             "avg_wps": 486.303,
             "std_dev_wps": 71.307,
-            "peak_mem_usage": [5.5055 * 2 ** 30, 5.5055 * 2 ** 30, 5.5055 * 2 ** 30, 5.5055 * 2 ** 30],
+            "peak_mem_usage": [5.5055 * 2**30, 5.5055 * 2**30, 5.5055 * 2**30, 5.5055 * 2**30],
         }
...
@@ -89,7 +89,7 @@ def validate_benchmark(measurements, final_loss, args, check_regression):
     if not args.cpu:
         # TODO(anj-s): Check if we need to synchronize before we caculate total training time.
         torch.cuda.synchronize(rank)
-    max_memory = torch.cuda.max_memory_allocated(rank) / 2 ** 20
+    max_memory = torch.cuda.max_memory_allocated(rank) / 2**20
     logging.info(f"[{rank}] : Peak memory {max_memory:.1f}MiB")
     measurements.sort()
...
@@ -149,8 +149,8 @@ class DynamicDirectedExponentialGraph(GraphManager):
     def _make_graph(self) -> None:
         for rank in range(self.world_size):
             for i in range(0, int(mlog(self.world_size - 1, 2)) + 1):
-                f_peer = self._rotate_forward(rank, 2 ** i)
-                b_peer = self._rotate_backward(rank, 2 ** i)
+                f_peer = self._rotate_forward(rank, 2**i)
+                b_peer = self._rotate_backward(rank, 2**i)
                 self._add_peers(rank, [f_peer, b_peer])

     def is_regular_graph(self) -> bool:
@@ -196,8 +196,8 @@ class DynamicBipartiteExponentialGraph(GraphManager):
                     f_peer = self._rotate_forward(rank, 1)
                     b_peer = self._rotate_backward(rank, 1)
                 else:
-                    f_peer = self._rotate_forward(rank, 1 + 2 ** i)
-                    b_peer = self._rotate_backward(rank, 1 + 2 ** i)
+                    f_peer = self._rotate_forward(rank, 1 + 2**i)
+                    b_peer = self._rotate_backward(rank, 1 + 2**i)
                 # create directory for non-passive peers
                 if not self.is_passive(rank) and (self.is_passive(f_peer) and self.is_passive(b_peer)):
                     self._add_peers(rank, [f_peer, b_peer])
...
@@ -14,7 +14,7 @@ from typing import ClassVar, Deque, Dict, Optional
 import torch

-MAX_LEN_DEQUEUE = 10 ** 4
+MAX_LEN_DEQUEUE = 10**4

 deque_with_max_len_fixed = partial(deque, maxlen=MAX_LEN_DEQUEUE)
...
@@ -36,7 +36,7 @@ class DynamicLossScaler(object):
     def __init__(
         self,
-        init_scale: float = 2.0 ** 15,
+        init_scale: float = 2.0**15,
         scale_factor: float = 2.0,
         scale_window: int = 2000,
         tolerance: float = 0.0,
...
@@ -700,7 +700,7 @@ class FullyShardedDataParallel(nn.Module):
             total_norm = local_norm
             dist.all_reduce(total_norm, op=torch.distributed.ReduceOp.MAX, group=self.process_group)
         else:
-            total_norm = local_norm ** norm_type
+            total_norm = local_norm**norm_type
             dist.all_reduce(total_norm, group=self.process_group)
             total_norm = total_norm ** (1.0 / norm_type)
@@ -2408,7 +2408,7 @@ class FullyShardedDataParallel(nn.Module):
         if restart:
             self._tstart = time.time()
         if self.rank == 0:
-            gb_denom = 1024 ** 3
+            gb_denom = 1024**3
             logging.info(
                 f"{msg} cur={torch.cuda.memory_allocated()/gb_denom: .4f} GB, max={torch.cuda.max_memory_allocated()/gb_denom: .4f} GB, t={time.time()-self._tstart: .1f}"
             )
...
@@ -100,7 +100,7 @@ class ShardedDataParallel(nn.Module):
         process_group: Any = None,
         broadcast_buffers: bool = True,
         sync_models_at_startup: bool = True,
-        reduce_buffer_size: int = 2 ** 23,
+        reduce_buffer_size: int = 2**23,
         auto_refresh_trainable: bool = True,
         reduce_fp16: bool = False,
         warn_on_trainable_params_changed: bool = True,
@@ -178,7 +178,7 @@ class ShardedDataParallel(nn.Module):
         logging.info(
             "ShardedDDP bucket size: {:.2f}M parameters, model size {:.2f}M parameters".format(
-                self._buffer_max_size / 2 ** 20, model_size / 2 ** 20
+                self._buffer_max_size / 2**20, model_size / 2**20
             )
         )
         self._use_buckets = self._buffer_max_size > 0
...
@@ -71,7 +71,7 @@ class DeferredBatchNorm(_BatchNorm):
         with torch.no_grad():
             self.sum += input.sum(dim)
-            self.sum_squares += (input ** 2).sum(dim)
+            self.sum_squares += (input**2).sum(dim)
         size = input.size().numel() // input.size(1)
         self.counter += size
@@ -89,7 +89,7 @@ class DeferredBatchNorm(_BatchNorm):
             exponential_average_factor = self.momentum
         mean = self.sum / self.counter
-        var = self.sum_squares / self.counter - mean ** 2
+        var = self.sum_squares / self.counter - mean**2
         # Calculate the exponential moving average here.
         m = exponential_average_factor
...
@@ -98,7 +98,7 @@ try:
             assert parameters[0].dtype == torch.float16
             self.optim_type = torch.float16 if precision is Precision.PURE_FP16 else torch.float32
-            self._optim_scale = float(2 ** 16) if precision is Precision.PURE_FP16 else 1.0
+            self._optim_scale = float(2**16) if precision is Precision.PURE_FP16 else 1.0
             self._steps_since_optim_scale_change = 0
             self._optim_scale_update_freq = 2000  # This is the value that GradScaler uses by default
             self._overflow_buf = torch.cuda.IntTensor([0])  # type: ignore
@@ -291,11 +291,10 @@ try:
             if self._steps_since_optim_scale_change == self._optim_scale_update_freq:
                 self._steps_since_optim_scale_change = 0
-                if self._optim_scale < 2 ** 16:
+                if self._optim_scale < 2**16:
                     self._optim_scale *= 2
             return loss

 except ImportError:
     pass
@@ -453,7 +453,7 @@ class AdaScale(Optimizer):
         # accumulation.
         if self._num_grads_to_accum > 1:
             # np array doesn't support /=.
-            total_grad_sqr = total_grad_sqr / (self._num_grads_to_accum ** 2)
+            total_grad_sqr = total_grad_sqr / (self._num_grads_to_accum**2)

         # Wait for all_reduce to be done and move it to cpu & np.
         if work:
...
@@ -76,7 +76,7 @@ class ShardedGradScaler(TorchGradScaler):
     def __init__(
         self,
-        init_scale: float = 2.0 ** 16,
+        init_scale: float = 2.0**16,
         growth_factor: float = 2.0,
         backoff_factor: float = 0.5,
         growth_interval: int = 2000,
...
@@ -289,7 +289,7 @@ class OSS(Optimizer):
             # n_i = sum_rank(a^p)^1/p
             # -> n_total = all_reduce(n_i^p)^(1/p) = sum_i(n_i^p)^1/p = sum_i(sum_rank(a^p))^1/p
             # all reduce over data parallel and model parallel workers
-            total_norm = local_norm ** norm_type
+            total_norm = local_norm**norm_type
             dist.all_reduce(total_norm)
             total_norm = total_norm ** (1.0 / norm_type)
...
@@ -27,4 +27,3 @@ use_parentheses = true
 skip_glob = ["build/*", "stubs/*"]
 # Don't split "import" and "from".
 force_sort_within_sections = true
-known_third_party = ["benchmark_dataset", "datasets", "distutils", "golden_configs", "models", "numpy", "parameterized", "pytest", "recommonmark", "setuptools", "sklearn", "torch", "torchtext", "torchvision", "utils"]
@@ -6,11 +6,10 @@
 # function typing with mypy.
 # - if you change versions below, please make sure it is in-sync with
 # .pre-commit-config.yaml for pre-commit.
-black == 21.10b0
+black == 22.3.0
 flake8 == 4.0.1
 flake8-annotations == 2.7.0
 isort == 5.10.1
-seed-isort-config == 2.2.0
 mypy == 0.910
 pre-commit >= 2.15.0
...
@@ -162,13 +162,13 @@ class TestOptimizerUtils(DistributedTest):
         assert len(no_broadcast_children) == 1, f"Length of non shared params {len(no_broadcast_children)}"
         assert fsdp._fsdp_instances[-1].no_broadcast_optim_state
         torch.cuda.empty_cache()
-        cuda_gb_before = torch.cuda.memory_stats(fsdp.rank)["allocated_bytes.all.current"] / 1024 ** 3
+        cuda_gb_before = torch.cuda.memory_stats(fsdp.rank)["allocated_bytes.all.current"] / 1024**3
         tstart = time()
         sd = fsdp.gather_full_optim_state_dict(fsdp_optim, recipient_rank=0)
         duration = time() - tstart
         assert duration < fsdp.world_size, f"gather optim state took {duration} seconds, suspect change in _consolidate"
-        cuda_gb_after = torch.cuda.memory_stats(fsdp.rank)["allocated_bytes.all.current"] / 1024 ** 3
+        cuda_gb_after = torch.cuda.memory_stats(fsdp.rank)["allocated_bytes.all.current"] / 1024**3
         mem_usg_gb = cuda_gb_after - cuda_gb_before
         assert mem_usg_gb == 0, f"gather_full_optim_state_dict used {mem_usg_gb:.2f} CUDA GB, max allowed is 0"
         assert cuda_gb_after > 0, "got 0 memory usage, logging is broken"
...
@@ -146,7 +146,7 @@ def run_test(backend, device, world_size, broadcast_buffers, grad_accumulation,
 @skip_if_single_gpu
 @pytest.mark.parametrize("broadcast_buffers", [True, False])
 @pytest.mark.parametrize("grad_accumulation", [True, False])
-@pytest.mark.parametrize("reduce_buffer_size", [0, 2 ** 20])
+@pytest.mark.parametrize("reduce_buffer_size", [0, 2**20])
 @pytest.mark.parametrize("optimizer_type", [torch.optim.SGD, SGDWithPausingCompute])
 @pytest.mark.parametrize("reduce_fp16", [False, True])
 @pytest.mark.parametrize(
@@ -204,7 +204,7 @@ def run_test_two_inputs(rank, world_size, backend, device, temp_file_name, reduc
     dist.destroy_process_group()

-@pytest.mark.parametrize("reduce_buffer_size", [0, 2 ** 20])
+@pytest.mark.parametrize("reduce_buffer_size", [0, 2**20])
 @pytest.mark.parametrize("backend", ["gloo", "nccl"])
 @pytest.mark.parametrize("device", available_devices)
 @skip_if_single_gpu
@@ -354,7 +354,7 @@ def run_test_device_change(rank, world_size, backend, device, temp_file_name, re
 @skip_if_no_cuda
 @skip_if_single_gpu
-@pytest.mark.parametrize("reduce_buffer_size", [0, 2 ** 20])
+@pytest.mark.parametrize("reduce_buffer_size", [0, 2**20])
 def test_device_change(reduce_buffer_size):
     # Check that ShardedDDP handles a device change properly
     world_size = 2
@@ -392,7 +392,7 @@ def run_test_training_change(rank, world_size, backend, device, temp_file_name,
 @skip_if_no_cuda
 @skip_if_single_gpu
-@pytest.mark.parametrize("reduce_buffer_size", [0, 2 ** 20])
+@pytest.mark.parametrize("reduce_buffer_size", [0, 2**20])
 def test_training_change(reduce_buffer_size):
     world_size = 2
     backend = "nccl"
@@ -528,7 +528,7 @@ def run_test_gpt2(rank, world_size, backend, device, temp_file_name, reduce_buff
 @skip_if_no_cuda
 @skip_if_single_gpu
 @pytest.mark.parametrize("world_size", [1, 2])
-@pytest.mark.parametrize("reduce_buffer", [2 ** 23, 2 ** 40])
+@pytest.mark.parametrize("reduce_buffer", [2**23, 2**40])
 def test_gpt2(world_size, reduce_buffer):
     # Check that having trainable unused params is fine
     backend = "gloo"
@@ -598,7 +598,7 @@ def run_test_multiple_groups(rank, world_size, tempfile_name, backend, reduce_bu
 @skip_if_less_than_four_gpu
-@pytest.mark.parametrize("reduce_buffer_size", [0, 2 ** 20])
+@pytest.mark.parametrize("reduce_buffer_size", [0, 2**20])
 @pytest.mark.parametrize("backend", ["gloo", "nccl"])
 def test_multiple_groups(reduce_buffer_size, backend):
     world_size = 4
...