Unverified Commit 72f373c1 authored by Paul Johnson, committed by GitHub

Remove seed-isort-config and related dependencies. (#969)

This is no longer needed since isort 5.x (pinned here at 5.10.1) classifies third-party
packages on its own, so the known_third_party list no longer has to be seeded.

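As a rough illustration (not taken from this commit, and assuming isort >= 5.10 is
installed), the classification that seed-isort-config used to pre-compute can now be
queried from isort directly:

```python
# Sketch only: isort >= 5 ships its own module classification, so the
# known_third_party list no longer needs to be seeded by a separate tool.
import isort

# Expected to print "STDLIB" and "THIRDPARTY" respectively with isort 5.10.x.
print(isort.place_module("distutils"))
print(isort.place_module("torch"))
```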
Also pin black to 22.3.0 to fix an incompatibility with its click
dependency.

Update files that now fail with the new version of black, which removes the spaces
around the power operator for simple operands: `a = 2 ** 4` -> `a = 2**4`.
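For reference, a minimal sketch of that formatting rule (assuming black == 22.3.0 is
installed; the snippet is illustrative only, not part of the change):

```python
# Sketch only: black >= 22.1 drops the spaces around ** when both operands are
# "simple" (names, numeric literals, attribute access) and keeps them otherwise.
import black

mode = black.Mode()
print(black.format_str("a = 2 ** 4\n", mode=mode), end="")        # prints: a = 2**4
print(black.format_str("b = (x + 1) ** 2\n", mode=mode), end="")  # spaces kept
```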
parent 1bc96fa8
@@ -23,7 +23,7 @@ repos:
 - id: end-of-file-fixer
 - repo: https://github.com/ambv/black
-rev: 21.10b0
+rev: 22.3.0
 hooks:
 - id: black
@@ -33,11 +33,6 @@ repos:
 - id: flake8
 args: [--show-source, --statistics]
-- repo: https://github.com/asottile/seed-isort-config
-rev: v2.2.0
-hooks:
-- id: seed-isort-config
 - repo: https://github.com/pycqa/isort
 rev: 5.10.1
 hooks:
@@ -4,11 +4,11 @@
 # LICENSE file in the root directory of this source tree.
 from collections import namedtuple
+from distutils.version import LooseVersion
 import io
 import operator
 import tempfile
-from distutils.version import LooseVersion
 import torch
 from torch.utils.data import DataLoader
 from torch.utils.data.distributed import DistributedSampler
@@ -135,11 +135,11 @@ def train_seq(model_config, benchmark_config, model_specs, args):
 loss.backward()
 optimizer.step()
 logging.info(
-"Memory stats are {:.2f}GB".format(torch.cuda.memory_stats(0)["allocated_bytes.all.peak"] / 2 ** 30)
+"Memory stats are {:.2f}GB".format(torch.cuda.memory_stats(0)["allocated_bytes.all.peak"] / 2**30)
 )
 logging.info(
 "Loss {:.2f} - throughput {:.2f}fps".format(
-loss.item(), benchmark_config["batch_size"] / (time.time_ns() - start) * 10 ** 9
+loss.item(), benchmark_config["batch_size"] / (time.time_ns() - start) * 10**9
 )
 )
 num_iters -= 1
@@ -267,7 +267,7 @@ def benchmark_language_model(model_config, model, benchmark_config, model_specs,
 print("Throughput(wps) is {:.2f}.".format(wps))
 print(
 "Peak allocated bytes on cuda:{}: {:4f}GB".format(
-dist.get_rank(), torch.cuda.memory_stats(dist.get_rank())["allocated_bytes.all.peak"] / 2 ** 30
+dist.get_rank(), torch.cuda.memory_stats(dist.get_rank())["allocated_bytes.all.peak"] / 2**30
 )
 )
@@ -97,7 +97,7 @@ class FSDP:
 return {
 "avg_wps": 486.303,
 "std_dev_wps": 71.307,
-"peak_mem_usage": [5.5055 * 2 ** 30, 5.5055 * 2 ** 30, 5.5055 * 2 ** 30, 5.5055 * 2 ** 30],
+"peak_mem_usage": [5.5055 * 2**30, 5.5055 * 2**30, 5.5055 * 2**30, 5.5055 * 2**30],
 }
@@ -89,7 +89,7 @@ def validate_benchmark(measurements, final_loss, args, check_regression):
 if not args.cpu:
 # TODO(anj-s): Check if we need to synchronize before we caculate total training time.
 torch.cuda.synchronize(rank)
-max_memory = torch.cuda.max_memory_allocated(rank) / 2 ** 20
+max_memory = torch.cuda.max_memory_allocated(rank) / 2**20
 logging.info(f"[{rank}] : Peak memory {max_memory:.1f}MiB")
 measurements.sort()
@@ -149,8 +149,8 @@ class DynamicDirectedExponentialGraph(GraphManager):
 def _make_graph(self) -> None:
 for rank in range(self.world_size):
 for i in range(0, int(mlog(self.world_size - 1, 2)) + 1):
-f_peer = self._rotate_forward(rank, 2 ** i)
-b_peer = self._rotate_backward(rank, 2 ** i)
+f_peer = self._rotate_forward(rank, 2**i)
+b_peer = self._rotate_backward(rank, 2**i)
 self._add_peers(rank, [f_peer, b_peer])
 def is_regular_graph(self) -> bool:
@@ -196,8 +196,8 @@ class DynamicBipartiteExponentialGraph(GraphManager):
 f_peer = self._rotate_forward(rank, 1)
 b_peer = self._rotate_backward(rank, 1)
 else:
-f_peer = self._rotate_forward(rank, 1 + 2 ** i)
-b_peer = self._rotate_backward(rank, 1 + 2 ** i)
+f_peer = self._rotate_forward(rank, 1 + 2**i)
+b_peer = self._rotate_backward(rank, 1 + 2**i)
 # create directory for non-passive peers
 if not self.is_passive(rank) and (self.is_passive(f_peer) and self.is_passive(b_peer)):
 self._add_peers(rank, [f_peer, b_peer])
@@ -14,7 +14,7 @@ from typing import ClassVar, Deque, Dict, Optional
 import torch
-MAX_LEN_DEQUEUE = 10 ** 4
+MAX_LEN_DEQUEUE = 10**4
 deque_with_max_len_fixed = partial(deque, maxlen=MAX_LEN_DEQUEUE)
@@ -36,7 +36,7 @@ class DynamicLossScaler(object):
 def __init__(
 self,
-init_scale: float = 2.0 ** 15,
+init_scale: float = 2.0**15,
 scale_factor: float = 2.0,
 scale_window: int = 2000,
 tolerance: float = 0.0,
@@ -700,7 +700,7 @@ class FullyShardedDataParallel(nn.Module):
 total_norm = local_norm
 dist.all_reduce(total_norm, op=torch.distributed.ReduceOp.MAX, group=self.process_group)
 else:
-total_norm = local_norm ** norm_type
+total_norm = local_norm**norm_type
 dist.all_reduce(total_norm, group=self.process_group)
 total_norm = total_norm ** (1.0 / norm_type)
@@ -2408,7 +2408,7 @@ class FullyShardedDataParallel(nn.Module):
 if restart:
 self._tstart = time.time()
 if self.rank == 0:
-gb_denom = 1024 ** 3
+gb_denom = 1024**3
 logging.info(
 f"{msg} cur={torch.cuda.memory_allocated()/gb_denom: .4f} GB, max={torch.cuda.max_memory_allocated()/gb_denom: .4f} GB, t={time.time()-self._tstart: .1f}"
 )
@@ -100,7 +100,7 @@ class ShardedDataParallel(nn.Module):
 process_group: Any = None,
 broadcast_buffers: bool = True,
 sync_models_at_startup: bool = True,
-reduce_buffer_size: int = 2 ** 23,
+reduce_buffer_size: int = 2**23,
 auto_refresh_trainable: bool = True,
 reduce_fp16: bool = False,
 warn_on_trainable_params_changed: bool = True,
@@ -178,7 +178,7 @@ class ShardedDataParallel(nn.Module):
 logging.info(
 "ShardedDDP bucket size: {:.2f}M parameters, model size {:.2f}M parameters".format(
-self._buffer_max_size / 2 ** 20, model_size / 2 ** 20
+self._buffer_max_size / 2**20, model_size / 2**20
 )
 )
 self._use_buckets = self._buffer_max_size > 0
@@ -71,7 +71,7 @@ class DeferredBatchNorm(_BatchNorm):
 with torch.no_grad():
 self.sum += input.sum(dim)
-self.sum_squares += (input ** 2).sum(dim)
+self.sum_squares += (input**2).sum(dim)
 size = input.size().numel() // input.size(1)
 self.counter += size
@@ -89,7 +89,7 @@ class DeferredBatchNorm(_BatchNorm):
 exponential_average_factor = self.momentum
 mean = self.sum / self.counter
-var = self.sum_squares / self.counter - mean ** 2
+var = self.sum_squares / self.counter - mean**2
 # Calculate the exponential moving average here.
 m = exponential_average_factor
@@ -98,7 +98,7 @@ try:
 assert parameters[0].dtype == torch.float16
 self.optim_type = torch.float16 if precision is Precision.PURE_FP16 else torch.float32
-self._optim_scale = float(2 ** 16) if precision is Precision.PURE_FP16 else 1.0
+self._optim_scale = float(2**16) if precision is Precision.PURE_FP16 else 1.0
 self._steps_since_optim_scale_change = 0
 self._optim_scale_update_freq = 2000 # This is the value that GradScaler uses by default
 self._overflow_buf = torch.cuda.IntTensor([0]) # type: ignore
@@ -291,11 +291,10 @@ try:
 if self._steps_since_optim_scale_change == self._optim_scale_update_freq:
 self._steps_since_optim_scale_change = 0
-if self._optim_scale < 2 ** 16:
+if self._optim_scale < 2**16:
 self._optim_scale *= 2
 return loss
 except ImportError:
 pass
@@ -453,7 +453,7 @@ class AdaScale(Optimizer):
 # accumulation.
 if self._num_grads_to_accum > 1:
 # np array doesn't support /=.
-total_grad_sqr = total_grad_sqr / (self._num_grads_to_accum ** 2)
+total_grad_sqr = total_grad_sqr / (self._num_grads_to_accum**2)
 # Wait for all_reduce to be done and move it to cpu & np.
 if work:
@@ -76,7 +76,7 @@ class ShardedGradScaler(TorchGradScaler):
 def __init__(
 self,
-init_scale: float = 2.0 ** 16,
+init_scale: float = 2.0**16,
 growth_factor: float = 2.0,
 backoff_factor: float = 0.5,
 growth_interval: int = 2000,
@@ -289,7 +289,7 @@ class OSS(Optimizer):
 # n_i = sum_rank(a^p)^1/p
 # -> n_total = all_reduce(n_i^p)^(1/p) = sum_i(n_i^p)^1/p = sum_i(sum_rank(a^p))^1/p
 # all reduce over data parallel and model parallel workers
-total_norm = local_norm ** norm_type
+total_norm = local_norm**norm_type
 dist.all_reduce(total_norm)
 total_norm = total_norm ** (1.0 / norm_type)
@@ -27,4 +27,3 @@ use_parentheses = true
 skip_glob = ["build/*", "stubs/*"]
 # Don't split "import" and "from".
 force_sort_within_sections = true
-known_third_party = ["benchmark_dataset", "datasets", "distutils", "golden_configs", "models", "numpy", "parameterized", "pytest", "recommonmark", "setuptools", "sklearn", "torch", "torchtext", "torchvision", "utils"]
@@ -6,11 +6,10 @@
 # function typing with mypy.
 # - if you change versions below, please make sure it is in-sync with
 # .pre-commit-config.yaml for pre-commit.
-black == 21.10b0
+black == 22.3.0
 flake8 == 4.0.1
 flake8-annotations == 2.7.0
 isort == 5.10.1
-seed-isort-config == 2.2.0
 mypy == 0.910
 pre-commit >= 2.15.0
@@ -162,13 +162,13 @@ class TestOptimizerUtils(DistributedTest):
 assert len(no_broadcast_children) == 1, f"Length of non shared params {len(no_broadcast_children)}"
 assert fsdp._fsdp_instances[-1].no_broadcast_optim_state
 torch.cuda.empty_cache()
-cuda_gb_before = torch.cuda.memory_stats(fsdp.rank)["allocated_bytes.all.current"] / 1024 ** 3
+cuda_gb_before = torch.cuda.memory_stats(fsdp.rank)["allocated_bytes.all.current"] / 1024**3
 tstart = time()
 sd = fsdp.gather_full_optim_state_dict(fsdp_optim, recipient_rank=0)
 duration = time() - tstart
 assert duration < fsdp.world_size, f"gather optim state took {duration} seconds, suspect change in _consolidate"
-cuda_gb_after = torch.cuda.memory_stats(fsdp.rank)["allocated_bytes.all.current"] / 1024 ** 3
+cuda_gb_after = torch.cuda.memory_stats(fsdp.rank)["allocated_bytes.all.current"] / 1024**3
 mem_usg_gb = cuda_gb_after - cuda_gb_before
 assert mem_usg_gb == 0, f"gather_full_optim_state_dict used {mem_usg_gb:.2f} CUDA GB, max allowed is 0"
 assert cuda_gb_after > 0, "got 0 memory usage, logging is broken"
@@ -146,7 +146,7 @@ def run_test(backend, device, world_size, broadcast_buffers, grad_accumulation,
 @skip_if_single_gpu
 @pytest.mark.parametrize("broadcast_buffers", [True, False])
 @pytest.mark.parametrize("grad_accumulation", [True, False])
-@pytest.mark.parametrize("reduce_buffer_size", [0, 2 ** 20])
+@pytest.mark.parametrize("reduce_buffer_size", [0, 2**20])
 @pytest.mark.parametrize("optimizer_type", [torch.optim.SGD, SGDWithPausingCompute])
 @pytest.mark.parametrize("reduce_fp16", [False, True])
 @pytest.mark.parametrize(
@@ -204,7 +204,7 @@ def run_test_two_inputs(rank, world_size, backend, device, temp_file_name, reduc
 dist.destroy_process_group()
-@pytest.mark.parametrize("reduce_buffer_size", [0, 2 ** 20])
+@pytest.mark.parametrize("reduce_buffer_size", [0, 2**20])
 @pytest.mark.parametrize("backend", ["gloo", "nccl"])
 @pytest.mark.parametrize("device", available_devices)
 @skip_if_single_gpu
@@ -354,7 +354,7 @@ def run_test_device_change(rank, world_size, backend, device, temp_file_name, re
 @skip_if_no_cuda
 @skip_if_single_gpu
-@pytest.mark.parametrize("reduce_buffer_size", [0, 2 ** 20])
+@pytest.mark.parametrize("reduce_buffer_size", [0, 2**20])
 def test_device_change(reduce_buffer_size):
 # Check that ShardedDDP handles a device change properly
 world_size = 2
@@ -392,7 +392,7 @@ def run_test_training_change(rank, world_size, backend, device, temp_file_name,
 @skip_if_no_cuda
 @skip_if_single_gpu
-@pytest.mark.parametrize("reduce_buffer_size", [0, 2 ** 20])
+@pytest.mark.parametrize("reduce_buffer_size", [0, 2**20])
 def test_training_change(reduce_buffer_size):
 world_size = 2
 backend = "nccl"
@@ -528,7 +528,7 @@ def run_test_gpt2(rank, world_size, backend, device, temp_file_name, reduce_buff
 @skip_if_no_cuda
 @skip_if_single_gpu
 @pytest.mark.parametrize("world_size", [1, 2])
-@pytest.mark.parametrize("reduce_buffer", [2 ** 23, 2 ** 40])
+@pytest.mark.parametrize("reduce_buffer", [2**23, 2**40])
 def test_gpt2(world_size, reduce_buffer):
 # Check that having trainable unused params is fine
 backend = "gloo"
@@ -598,7 +598,7 @@ def run_test_multiple_groups(rank, world_size, tempfile_name, backend, reduce_bu
 @skip_if_less_than_four_gpu
-@pytest.mark.parametrize("reduce_buffer_size", [0, 2 ** 20])
+@pytest.mark.parametrize("reduce_buffer_size", [0, 2**20])
 @pytest.mark.parametrize("backend", ["gloo", "nccl"])
 def test_multiple_groups(reduce_buffer_size, backend):
 world_size = 4