Unverified Commit 7d7edf6d authored by Anupam Bhatnagar, committed by GitHub

Setup pre-commit github action and apply pre-commit to all files (#849)

* adding pre-commit files

* applying pre-commit to all files

* adding no-strict-optional argument to mypy in circle ci config

* fix typo

* updating python versions

* [skip ci] remove extra args

* adding python 3.9

* [skip ci] set pre-commit version in requirements-dev.txt

* set CACHE_VERSION

* move linters from circleci to github actions

* update python version

* update python version in benchmarks_2

* moving to python 3.9.7
parent 6f3931a4
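The checks this commit moves into the new pre-commit GitHub Action can also be run locally before pushing. A minimal sketch in the CircleCI-anchor style this config already uses; the anchor name and the unpinned install are illustrative only and not part of this diff (the repository pins the pre-commit version in requirements-dev.txt):

```yaml
# Illustrative only -- not part of this commit. Mirrors what the new
# pre-commit GitHub Action runs (isort, black, flake8 and mypy hooks).
run_pre_commit: &run_pre_commit
  - run:
      name: Run pre-commit on all files
      command: |
        pip install pre-commit       # assumed unpinned install for illustration
        pre-commit run --all-files
```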
......@@ -6,6 +6,14 @@
# https://github.com/facebookresearch/detectron2/blob/main/.circleci/config.yml
#
# Pro tip: download circle ci cli to validate the config locally during development.
#
# To reset/clean the cache update the CACHE_VERSION variable in project settings
# in the fairscale project in CircleCI. The CACHE_VERSION follows the convention
# v$(FAIRSCALE_VERSION)-${CACHE_NUMBER}. E.g. v0.4.2-1. CACHE_NUMBER must start
# at 1 and increase in whole numbers. When changing the CACHE_VERSION manually
# always set the FAIRSCALE_VERSION value to the fairscale version being tested.
# To reset the cache when not updating the fairscale version, only update the
# CACHE_NUMBER value.
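# Illustration (not part of this diff): CACHE_VERSION reaches the cache keys
# below via {{ .Environment.CACHE_VERSION }}, e.g.
#   key: cache-key-cpu-py39-torch-1-10-0-{{ .Environment.CACHE_VERSION }}-{{ checksum "setup.py" }}-{{ checksum "requirements-test.txt" }}
# so bumping only CACHE_NUMBER (v0.4.2-1 -> v0.4.2-2) forces all of these
# caches to be rebuilt without a fairscale release.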
version: 2.1
orbs:
......@@ -15,23 +23,26 @@ orbs:
# -------------------------------------------------------------------------------------
cpu_py37: &cpu_py37
docker:
# python version 3.7.12
- image: circleci/python:3.7
resource_class: large
cpu_py38: &cpu_py38
docker:
# python version 3.8.12
- image: circleci/python:3.8
resource_class: large
cpu_py39: &cpu_py39
docker:
# python version 3.9.7
- image: circleci/python:3.9
resource_class: large
# Here are list of GPU images:
# Here is the list of GPU images:
# https://circleci.com/docs/2.0/configuration-reference/#available-linux-gpu-images
# We need to use multiple gpus for several jobs. the resource_class values are
# available here T101565170
# We need to use multiple gpus for several jobs. The resource_class
# values are available here T101565170
# gpu.nvidia.small.multi = 2 gpus with 16 GB ram each
# gpu.nvidia.medium.multi = 4 gpus with 16 GB ram each
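# Illustration (not part of this diff; the machine image tag is an assumption):
#   gpu_cu_11_2_medium_multi: &gpu_cu_11_2_medium_multi
#     machine:
#       image: ubuntu-2004-cuda-11.2:202103-01
#     resource_class: gpu.nvidia.medium.multi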
......@@ -122,30 +133,6 @@ install_repo: &install_repo
# Test import.
python -c 'import sys; sys.path = sys.path[1:]; import fairscale'
run_isort: &run_isort
- run:
name: Run Linter (isort)
command: |
isort . --check
run_black: &run_black
- run:
name: Run Linter (black)
command: |
black --check .
run_mypy: &run_mypy
- run:
name: Run type-checking (mypy)
command: |
mypy --ignore-missing-imports --scripts-are-modules --pretty .
run_flake8: &run_flake8
- run:
name: Run Linter (flake8)
command: |
flake8 --show-source --statistics
check_test_list: &check_test_list
- run:
name: Verify that unit test list files are correct
......@@ -260,21 +247,16 @@ jobs:
# Cache the venv directory that contains dependencies
- restore_cache:
keys:
- cache-key-cpu-py37-torch-1-10-0-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
- cache-key-cpu-py37-torch-1-10-0-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
- <<: *install_dep_1_10_0
- save_cache:
paths:
- ~/venv
key: cache-key-cpu-py37-torch-1-10-0-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
key: cache-key-cpu-py37-torch-1-10-0-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
- <<: *install_repo
- <<: *run_isort
- <<: *run_black
- <<: *run_mypy
- <<: *run_flake8
- <<: *run_unittests
- <<: *run_doc_build
......@@ -294,20 +276,15 @@ jobs:
# Cache the venv directory that contains dependencies
- restore_cache:
keys:
- cache-key-cpu-py38-torch-1-10-0-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
- cache-key-cpu-py38-torch-1-10-0-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
- <<: *install_dep_1_10_0
- save_cache:
paths:
- ~/venv
key: cache-key-cpu-py38-torch-1-10-0-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
key: cache-key-cpu-py38-torch-1-10-0-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
- <<: *install_repo
- <<: *run_isort
- <<: *run_black
- <<: *run_mypy
- <<: *run_flake8
- <<: *run_unittests
- <<: *run_doc_build
......@@ -327,21 +304,16 @@ jobs:
# Cache the venv directory that contains dependencies
- restore_cache:
keys:
- cache-key-cpu-py39-torch-1-10-0-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
- cache-key-cpu-py39-torch-1-10-0-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
- <<: *install_dep_1_10_0
- save_cache:
paths:
- ~/venv
key: cache-key-cpu-py39-torch-1-10-0-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
key: cache-key-cpu-py39-torch-1-10-0-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
- <<: *install_repo
- <<: *run_isort
- <<: *run_black
- <<: *run_mypy
- <<: *run_flake8
- <<: *run_unittests
- <<: *run_doc_build
......@@ -365,21 +337,21 @@ jobs:
# Run this to make sure we use python3 from the system.
- setup_pyenv:
version: 3.7.0
version: 3.9.7
- <<: *setup_venv
# Cache the venv directory that contains dependencies
- restore_cache:
keys:
- cache-key-py37-gpu-torch-1-8-1-cuda-11-2-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
- cache-key-py-3-9-7-gpu-torch-1-8-1-cuda-11-2-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
- <<: *install_dep_1_8_1
- save_cache:
paths:
- ~/venv
key: cache-key-py37-gpu-torch-1-8-1-cuda-11-2-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
key: cache-key-py-3-9-7-gpu-torch-1-8-1-cuda-11-2-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
- <<: *install_repo
......@@ -408,21 +380,21 @@ jobs:
# Run this to make sure we use python3 from the system.
- setup_pyenv:
version: 3.8.6
version: 3.9.7
- <<: *setup_venv
# Cache the venv directory that contains dependencies
- restore_cache:
keys:
- cache-key-py38-gpu-torch-1-10-0-cuda-11-2-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
- cache-key-py-3-9-7-gpu-torch-1-10-0-cuda-11-2-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
- <<: *install_dep_1_10_0
- save_cache:
paths:
- ~/venv
key: cache-key-py38-gpu-torch-1-10-0-cuda-11-2-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
key: cache-key-py-3-9-7-gpu-torch-1-10-0-cuda-11-2-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
- <<: *install_repo
......@@ -449,21 +421,21 @@ jobs:
# Run this to make sure we use python3 from the system.
- setup_pyenv:
version: 3.8.6
version: 3.9.7
- <<: *setup_venv
# Cache the venv directory that contains dependencies
- restore_cache:
keys:
- cache-key-py38-gpu-pytorch-nightly-cuda-11-2-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
- cache-key-py-3-9-7-gpu-pytorch-nightly-cuda-11-2-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
- <<: *install_dep_pytorch_nightly
- save_cache:
paths:
- ~/venv
key: cache-key-py38-gpu-pytorch-nightly-cuda-11-2-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
key: cache-key-py-3-9-7-gpu-pytorch-nightly-cuda-11-2-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
- <<: *install_repo
......@@ -484,26 +456,26 @@ jobs:
- run: nvidia-smi
- setup_pyenv:
version: 3.7.0
version: 3.9.7
- <<: *setup_venv
# Cache the venv directory that contains dependencies
- restore_cache:
keys:
- cache-key-py37-benchmarks-torch-1-10-0-cuda-11-2-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
- cache-key-py-3-9-7-benchmarks-torch-1-10-0-cuda-11-2-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
# Cache the MNIST directory that contains benchmark data
- restore_cache:
keys:
- cache-key-benchmark-MNIST-{{ checksum "benchmarks/datasets/mnist.py"}}
- cache-key-benchmark-MNIST-{{.Environment.CACHE_VERSION }}-{{checksum "benchmarks/datasets/mnist.py"}}
- <<: *install_dep_1_10_0
- save_cache:
paths:
- ~/venv
key: cache-key-py37-benchmarks-torch-1-10-0-cuda-11-2-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
key: cache-key-py-3-9-7-benchmarks-torch-1-10-0-cuda-11-2-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
- <<: *install_repo
......@@ -520,7 +492,7 @@ jobs:
- save_cache:
paths:
- /tmp/MNIST
key: cache-key-benchmark-MNIST-{{ checksum "benchmarks/datasets/mnist.py"}}
key: cache-key-benchmark-MNIST-{{.Environment.CACHE_VERSION }}-{{checksum "benchmarks/datasets/mnist.py"}}
benchmarks_2:
<<: *gpu_cu_11_2_medium_multi
......@@ -533,27 +505,27 @@ jobs:
- run: nvidia-smi
- setup_pyenv:
version: 3.7.0
version: 3.9.7
- <<: *setup_venv
# Cache the venv directory that contains dependencies
- restore_cache:
keys:
- cache-key-py37-benchmarks-torch-1-10-0-cuda-11-2-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
- cache-key-py-3-9-7-benchmarks-torch-1-10-0-cuda-11-2-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
# Cache the MNIST directory that contains benchmark data
- restore_cache:
keys:
- cache-key-benchmark-MNIST-{{ checksum "benchmarks/datasets/mnist.py"}}
- cache-key-benchmark-MNIST-{{.Environment.CACHE_VERSION }}-{{checksum "benchmarks/datasets/mnist.py"}}
- <<: *install_dep_1_10_0
- save_cache:
paths:
- ~/venv
key: cache-key-py37-benchmarks-torch-1-10-0-cuda-11-2-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
key: cache-key-py-3-9-7-benchmarks-torch-1-10-0-cuda-11-2-{{.Environment.CACHE_VERSION }}-{{checksum "setup.py"}}-{{checksum "requirements-test.txt"}}
- <<: *install_repo
......@@ -562,7 +534,7 @@ jobs:
- save_cache:
paths:
- /tmp/MNIST
key: cache-key-benchmark-MNIST-{{ checksum "benchmarks/datasets/mnist.py"}}
key: cache-key-benchmark-MNIST-{{.Environment.CACHE_VERSION}}-{{checksum "benchmarks/datasets/mnist.py"}}
workflows:
......
name: pre-commit
on:
pull_request:
push:
branches: [main]
jobs:
pre-commit:
runs-on: ubuntu-latest
strategy:
matrix:
# make sure python versions are consistent with those used in .circleci/config.yml
python-version: ['3.7.12', '3.8.12', '3.9.7']
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- uses: pre-commit/action@v2.0.3
......@@ -8,7 +8,7 @@ default_language_version:
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.5.0
rev: v4.0.1
hooks:
- id: trailing-whitespace
- id: check-ast
......@@ -20,29 +20,31 @@ repos:
- id: end-of-file-fixer
- repo: https://github.com/ambv/black
rev: 19.10b0
rev: 21.10b0
hooks:
- id: black
- repo: https://gitlab.com/pycqa/flake8
rev: 3.7.9
rev: 4.0.1
hooks:
- id: flake8
args: [--show-source, --statistics]
- repo: https://github.com/asottile/seed-isort-config
rev: v2.1.0
rev: v2.2.0
hooks:
- id: seed-isort-config
- repo: https://github.com/pycqa/isort
rev: 5.6.4
rev: 5.10.1
hooks:
- id: isort
exclude: README.md
additional_dependencies: [toml]
- repo: https://github.com/pre-commit/mirrors-mypy
rev: 'v0.790'
rev: 'v0.910'
hooks:
- id: mypy
args: [--no-strict-optional, --ignore-missing-imports, --scripts-are-modules, --pretty]
additional_dependencies: [numpy]
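With the mirrors-mypy hook bumped and the new args in place, the type check can be exercised on its own via the hook id defined above. A small sketch, shown as a CircleCI-style step purely for illustration (not part of this diff); `pre-commit run <hook-id> --all-files` is the standard way to run a single hook:

```yaml
- run:
    name: Run only the mypy pre-commit hook
    command: |
      pre-commit run mypy --all-files
```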
......@@ -42,7 +42,10 @@ class BenchmarkLMDataset(Dataset):
"""
def __init__(
self, vocab_size=10000, max_source_positions=1024, total_samples=10000,
self,
vocab_size=10000,
max_source_positions=1024,
total_samples=10000,
):
self.vocab_size = vocab_size
self.max_source_positions = max_source_positions
......
......@@ -35,7 +35,7 @@ KERNELS = [
def run_on_gpu(kernel, data, repeats, no_grad, fwd_bwd):
""" Measure both GPU runtime and peak memory usage of a kernel. """
"""Measure both GPU runtime and peak memory usage of a kernel."""
tokens = data[0].shape[0]
def get_cuda_data():
......
......@@ -142,7 +142,7 @@ class MySGD(Optimizer):
super(MySGD, self).__setstate__(state)
def step(self, closure=None):
""" Performs a single optimization step.
"""Performs a single optimization step.
Args:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
......@@ -234,7 +234,7 @@ class SpectrainSGDMomentum(Optimizer):
p.data.sub_(param_state["momentum_buffer"].data, alpha=multiplier)
def step(self, weight_prediction=True, closure=None):
""" Performs a single optimization step.
"""Performs a single optimization step.
Args:
weight_prediction (bool, optional): Enable weight prediction based updates
closure (callable, optional): A closure that reevaluates the model
......
......@@ -413,7 +413,10 @@ parser.add_argument(
help="Print debugging statements which is more verbose than the default.",
)
parser.add_argument(
"--model_name", default="lm", type=str, help="Language Model(LM) used to benchmark nn.pipe.",
"--model_name",
default="lm",
type=str,
help="Language Model(LM) used to benchmark nn.pipe.",
)
parser.add_argument(
"--use_synthetic_data", default=True, action="store_true", help="Uses synthetic data for running benchmarks."
......
......@@ -320,14 +320,22 @@ if __name__ == "__main__":
if args.optim_type == OptimType.oss_ddp or args.optim_type == OptimType.everyone:
logging.info("\n*** Benchmark OSS with DDP")
mp.spawn(
train, args=(args, BACKEND, OptimType.oss_ddp, args.check_regression), nprocs=args.world_size, join=True, # type: ignore
train,
args=(args, BACKEND, OptimType.oss_ddp, args.check_regression),
nprocs=args.world_size,
join=True, # type: ignore
)
if args.optim_type == OptimType.oss_sharded_ddp or args.optim_type == OptimType.everyone:
logging.info("\n*** Benchmark OSS with ShardedDDP")
mp.spawn(
train, # type: ignore
args=(args, BACKEND, OptimType.oss_sharded_ddp, args.check_regression,),
args=(
args,
BACKEND,
OptimType.oss_sharded_ddp,
args.check_regression,
),
nprocs=args.world_size,
join=True,
)
......@@ -34,13 +34,20 @@ from fairscale.nn.pipe.worker import Task
def create_task_without_skip_trackers(
checkpoint_stop: int, i: int, j: int, batch: Batch, partition: nn.Sequential,
checkpoint_stop: int,
i: int,
j: int,
batch: Batch,
partition: nn.Sequential,
) -> Task:
# Determine whether checkpointing or not.
if i < checkpoint_stop:
def function(
input: TensorOrTensors, partition: nn.Sequential = partition, chunk_id: int = i, part_id: int = j,
input: TensorOrTensors,
partition: nn.Sequential = partition,
chunk_id: int = i,
part_id: int = j,
) -> TensorOrTensors:
with record_function("chunk%d-part%d" % (chunk_id, part_id)):
return partition(input)
......@@ -52,7 +59,10 @@ def create_task_without_skip_trackers(
else:
def compute(
batch: Batch = batch, partition: nn.Sequential = partition, chunk_id: int = i, part_id: int = j,
batch: Batch = batch,
partition: nn.Sequential = partition,
chunk_id: int = i,
part_id: int = j,
) -> Batch:
with record_function("chunk%d-part%d" % (chunk_id, part_id)):
return batch.call(partition)
......@@ -93,7 +103,11 @@ class AsyncAMPnetEventLoop:
def async_send_inner(self, batch: Batch, index: int) -> Tuple[Batch, PipeMessage]:
task = create_task_without_skip_trackers(
self.checkpoint_stop, index, self.group.rank(), batch, self.partitions[0].module,
self.checkpoint_stop,
index,
self.group.rank(),
batch,
self.partitions[0].module,
)
result = task.compute()
task.finalize(result)
......@@ -258,7 +272,11 @@ class AsyncAMPnetEventLoop:
if self.weight_prediction:
optimizer.update_weight_using_future_predictions(cur_rank, N, count, self.chunks, forward=True)
task = create_task_without_skip_trackers(
self.checkpoint_stop, args.microbatch_index, self.group.rank(), batch, self.partitions[0].module,
self.checkpoint_stop,
args.microbatch_index,
self.group.rank(),
batch,
self.partitions[0].module,
)
output = task.compute()
activations[args.microbatch_index] = output
......
......@@ -46,8 +46,8 @@ class AMPnetPipe(AsyncPipe):
assert self.group
rank = self.group.rank()
transport = self.pipeline.transport # type: ignore
checkpoint_stop = self.pipeline.checkpoint_stop # type: ignore
transport = self.pipeline.transport
checkpoint_stop = self.pipeline.checkpoint_stop
ampnet_event_loop = AsyncAMPnetEventLoop(
partitions,
self.group,
......
......@@ -312,7 +312,7 @@ class SlowMoDistributedDataParallel(Module):
self.logger.debug("Initialization of SlowMoDistributedDataParallel complete")
def _initialize_logger(self, verbose: bool, process_rank: int) -> None:
""" Initializes the logger """
"""Initializes the logger"""
self.logger = logging.getLogger(__name__)
if verbose:
self.logger.setLevel(logging.DEBUG)
......@@ -331,7 +331,7 @@ class SlowMoDistributedDataParallel(Module):
master_group: Optional[torch.distributed.ProcessGroup],
local_node_group: Optional[torch.distributed.ProcessGroup],
) -> Tuple[int, int]:
""" Creates the process groups required for the SlowMo implementation """
"""Creates the process groups required for the SlowMo implementation"""
self.local_rank = process_rank % self.nprocs_per_node
assert (
......@@ -392,7 +392,12 @@ class SlowMoDistributedDataParallel(Module):
self.logger.debug("Initializing local process groups")
for node in range(logical_world_size):
node_processes_ranks = list(range(node * self.nprocs_per_node, (node + 1) * self.nprocs_per_node,))
node_processes_ranks = list(
range(
node * self.nprocs_per_node,
(node + 1) * self.nprocs_per_node,
)
)
# Process group to communicate between processes on this machine
new_local_group = create_process_group(node_processes_ranks)
if process_rank in node_processes_ranks:
......@@ -401,24 +406,26 @@ class SlowMoDistributedDataParallel(Module):
self.logger.debug("Initialization of local groups complete")
def forward(self, *inputs: Any, **kwargs: Any) -> Union[torch.Tensor, List[torch.Tensor]]:
""" Forward pass performed in parallel across all devices on node """
"""Forward pass performed in parallel across all devices on node"""
return self.module(*inputs, **kwargs)
def _sync_params(self) -> None:
""" Synchronize parameters across devices (intra-node) """
"""Synchronize parameters across devices (intra-node)"""
if self.local_node_group is None:
return
# intra-node parameter sync
params = cast(List[torch.Tensor], list(self.module.parameters()))
communication_op = functools.partial(
dist.broadcast, src=self.logical_rank * self.nprocs_per_node, group=self.local_node_group,
dist.broadcast,
src=self.logical_rank * self.nprocs_per_node,
group=self.local_node_group,
)
communicate(params, communication_op)
self.logger.debug("Intra-node param sync complete")
def _sync_buffers(self) -> None:
""" Synchronize buffers across nodes """
"""Synchronize buffers across nodes"""
# module buffer sync
if self.broadcast_buffers and len(self.module_buffers) > 0:
# Synchronize buffers across processes.
......@@ -432,17 +439,18 @@ class SlowMoDistributedDataParallel(Module):
dist._broadcast_coalesced(process_group, tensors, buffer_size)
def _create_event_recorder(self, event_name: str) -> EventRecorder:
""" Creates an cuda event recorder which helps in profiling """
"""Creates an cuda event recorder which helps in profiling"""
return create_event_recorder(event_name, dummy=not self.profile_mode)
def _fp16_fp32_iterator(
self, optimizer: torch.optim.Optimizer, fp32_params: Optional[torch.Tensor]
) -> Iterable[Tuple[torch.Tensor, torch.Tensor]]:
""" Iterator for those fp16 parameters which have a fp32 copy """
"""Iterator for those fp16 parameters which have a fp32 copy"""
# Handle apex fp16 optimizer
if hasattr(optimizer, "_amp_stash") and hasattr(optimizer._amp_stash, "fp16_groups"):
for p_fp16_group, p_fp32_group in zip(
optimizer._amp_stash.fp16_groups, optimizer._amp_stash.fp32_from_fp16_groups,
optimizer._amp_stash.fp16_groups,
optimizer._amp_stash.fp32_from_fp16_groups,
):
for p_fp16, p_fp32 in zip(p_fp16_group, p_fp32_group):
yield p_fp16, p_fp32
......@@ -594,7 +602,7 @@ class SlowMoDistributedDataParallel(Module):
ef1.copy_(p_fp32 - p_fp16.float())
def perform_slowmo(self, optimizer: torch.optim.Optimizer, fp32_params: Optional[torch.Tensor] = None) -> None:
""" This is to be called after optimizer.step(). It performs the approximate averaging using
"""This is to be called after optimizer.step(). It performs the approximate averaging using
the base algorithm (SGP/ LocalSGD) and the slow momentum step. Since LocalSGD and the slow
momentum step are not performed every iteration, it only performs those when needed.
......@@ -645,7 +653,7 @@ class SlowMoDistributedDataParallel(Module):
self.num_updates += 1
def _init_global_momentum_buffers(self, optimizer: torch.optim.Optimizer) -> None:
""" Initializes the slow momentum buffers """
"""Initializes the slow momentum buffers"""
self.global_momentum_buffers_initialized = True
if not self.slowmo:
......@@ -707,7 +715,7 @@ class SlowMoDistributedDataParallel(Module):
self.global_momentum_buffer = torch.zeros_like(self.old_params).detach()
def _distributed_comm(self, optimizer: torch.optim.Optimizer, mode: str) -> None:
""" Performs the communication needed for the efficient SlowMo implementation """
"""Performs the communication needed for the efficient SlowMo implementation"""
offset = 0
slowmo_comm_lists: List[List[torch.Tensor]] = [[] for _ in range(self.slowmo_num_shards)]
with torch.no_grad():
......@@ -743,7 +751,7 @@ class SlowMoDistributedDataParallel(Module):
communicate(slowmo_comm_list, communication_op)
def _global_momentum_step(self, optimizer: torch.optim.Optimizer) -> None:
""" Performs the slow momentum step """
"""Performs the slow momentum step"""
if not self.slowmo:
return
......@@ -760,7 +768,7 @@ class SlowMoDistributedDataParallel(Module):
self._distributed_comm(optimizer, mode="scatter")
def _perform_local_optimization(self, optimizer: torch.optim.Optimizer) -> None:
""" Performs the slow momentum on the local shard """
"""Performs the slow momentum on the local shard"""
assert self.portion_start is not None
with torch.no_grad():
......@@ -838,7 +846,7 @@ class SlowMoDistributedDataParallel(Module):
self.logger.debug("making forward pre-hook")
def hook(*unused: Any) -> None:
""" Query gossip queue and de-bias during forward pass """
"""Query gossip queue and de-bias during forward pass"""
# sync buffers before the forward pass
self._sync_buffers()
......@@ -869,7 +877,7 @@ class SlowMoDistributedDataParallel(Module):
use_streams: bool = True,
slowmo_sgp_average_params: bool = False,
) -> None:
""" Perform initialization for Stochastic Gradient Push base algorithm """
"""Perform initialization for Stochastic Gradient Push base algorithm"""
if graph is None:
graph = NPDDEGraph(logical_rank, logical_world_size, self.nprocs_per_node, self.local_rank)
......@@ -959,7 +967,7 @@ class SlowMoDistributedDataParallel(Module):
super(SlowMoDistributedDataParallel, self).load_state_dict(cast(Dict[str, torch.Tensor], state_dict))
def _sgp_ps_numerator(self) -> None:
""" Convert model params to ps-numerator """
"""Convert model params to ps-numerator"""
if not self.is_sgp_ps_numerator:
if not self.lazy_mixing:
ps_weight = self.ps_weight
......@@ -969,7 +977,7 @@ class SlowMoDistributedDataParallel(Module):
self.is_sgp_ps_numerator = True
def _sgp_unbias(self) -> None:
""" Convert model params to de-biased estimate """
"""Convert model params to de-biased estimate"""
if self.is_sgp_ps_numerator:
if not self.lazy_mixing:
ps_weight = self.ps_weight
......@@ -992,7 +1000,7 @@ class SlowMoDistributedDataParallel(Module):
return self
def _sgp_query_gossip_queue(self, non_blocking: bool = False) -> bool:
""" Check gossip-queue for push-sum residuals and update model """
"""Check gossip-queue for push-sum residuals and update model"""
if not self.gossip_enable:
return False
......@@ -1046,7 +1054,7 @@ class SlowMoDistributedDataParallel(Module):
return False
def _sgp_transfer_params(self, mix: bool = True) -> bool:
""" Transfers COPY of model parameters to gossip queue """
"""Transfers COPY of model parameters to gossip queue"""
if not self.gossip_enable or self.process_rank % self.nprocs_per_node != 0:
return False
......@@ -1130,7 +1138,7 @@ class SlowMoDistributedDataParallel(Module):
gossip_ps_factor: torch.Tensor,
gossip_stream: torch.cuda.Stream,
) -> None:
""" Gossip thread, which performs push-sum on model params """
"""Gossip thread, which performs push-sum on model params"""
logger = make_logger(dist_config["logical_rank"], dist_config["verbose"])
gossip_params_by_dtype = group_by_dtype(gossip_params)
......
......@@ -30,7 +30,7 @@ class dist_backend(str, Enum):
class Gossiper(object):
""" Generic gossip averaging object for multi-peer communication
"""Generic gossip averaging object for multi-peer communication
Args:
msg (torch.Tensor): message used to initialize recv buffer
......@@ -121,7 +121,7 @@ class Gossiper(object):
self._graph_manager.peers_per_itr = v
def refresh_peers_(self, rotate: Optional[bool] = None) -> None:
""" Update in- and out-peers """
"""Update in- and out-peers"""
if rotate is None:
rotate = self._graph_manager.is_dynamic_graph()
# cannot cycle peers in a static graph
......@@ -129,11 +129,11 @@ class Gossiper(object):
self.out_edges, self.in_edges = self._graph_manager.get_edges(rotate)
def refresh_mixing_weights_(self, residual_adjusted: bool = False) -> None:
""" Update mixing-matrix weights """
"""Update mixing-matrix weights"""
self.mixing_weights = self._mixing_manager.get_mixing_weights(residual_adjusted)
def mix_out_msg_(self, out_msg: torch.Tensor, ps_weight: torch.Tensor) -> Iterator[torch.Tensor]:
""" Returns a generator mixing messages on the fly """
"""Returns a generator mixing messages on the fly"""
self.refresh_mixing_weights_(residual_adjusted=True)
self.ps_weight = ps_weight
......@@ -153,14 +153,14 @@ class Gossiper(object):
yield out_msg.mul(weight.type(out_msg.dtype)) # type: ignore
def clean_msg_buffers_(self) -> None:
""" Clean outgoing message buffer """
"""Clean outgoing message buffer"""
while len(self.out_msg_buffer) > 0:
req, msg = self.out_msg_buffer.pop()
req.wait()
msg.set_()
def parse_in_msg_buffer(self) -> Tuple[torch.Tensor, torch.Tensor]:
""" Parse in-msg buffer and return msg and ps-weight separately """
"""Parse in-msg buffer and return msg and ps-weight separately"""
msg = self.in_msg_buffer
if not self.regular:
return msg.narrow(0, 0, len(msg) - 1), msg[-1]
......@@ -168,15 +168,15 @@ class Gossiper(object):
return msg, self.ps_weight * self.peers_per_itr_device
def mix(self, out_msg: torch.Tensor, ps_weight: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
""" Single gossip step """
"""Single gossip step"""
raise NotImplementedError
class PushSum(Gossiper):
""" 1-peer Push-Sum consensus averaging module """
"""1-peer Push-Sum consensus averaging module"""
def mix(self, out_msg: torch.Tensor, ps_weight: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
""" Consensus averaging step """
"""Consensus averaging step"""
# out_msg must be on the correct device
assert out_msg.device.type == self.device.type
if self.logger is not None:
......@@ -189,7 +189,12 @@ class PushSum(Gossiper):
for out_edge in self.out_edges:
msg = next(mixed_out_msgs)
assert self.rank == out_edge.src
req = dist.broadcast(tensor=msg, src=out_edge.src, group=out_edge.process_group, async_op=True,)
req = dist.broadcast(
tensor=msg,
src=out_edge.src,
group=out_edge.process_group,
async_op=True,
)
self.out_msg_buffer.append((req, msg))
# blocking recv w/ some code optimization to avoid buffer prep overhead
......@@ -204,7 +209,9 @@ class PushSum(Gossiper):
for in_edge in self.in_edges:
dist.broadcast(
tensor=self.placeholder, src=in_edge.src, group=in_edge.process_group,
tensor=self.placeholder,
src=in_edge.src,
group=in_edge.process_group,
)
self.in_msg_buffer.add_(self.placeholder) # type: ignore
......@@ -214,7 +221,7 @@ class PushSum(Gossiper):
class PushPull(Gossiper):
""" Doubly-stochastic consensus averaging module """
"""Doubly-stochastic consensus averaging module"""
def mix(self, out_msg: torch.Tensor, ps_weight: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
# out_msg must be on the correct device
......@@ -232,11 +239,15 @@ class PushPull(Gossiper):
if not self.passive:
dist.broadcast(tensor=msg, src=out_edge.src, group=out_edge.process_group)
dist.broadcast(
tensor=self.in_msg_buffer, src=in_edge.src, group=in_edge.process_group,
tensor=self.in_msg_buffer,
src=in_edge.src,
group=in_edge.process_group,
)
else:
dist.broadcast(
tensor=self.in_msg_buffer, src=in_edge.src, group=in_edge.process_group,
tensor=self.in_msg_buffer,
src=in_edge.src,
group=in_edge.process_group,
)
dist.broadcast(tensor=msg, src=out_edge.src, group=out_edge.process_group)
......@@ -251,11 +262,15 @@ class PushPull(Gossiper):
if not self.passive:
dist.broadcast(tensor=msg, src=out_edge.src, group=out_edge.process_group)
dist.broadcast(
tensor=self.placeholder, src=in_edge.src, group=in_edge.process_group,
tensor=self.placeholder,
src=in_edge.src,
group=in_edge.process_group,
)
else:
dist.broadcast(
tensor=self.placeholder, src=in_edge.src, group=in_edge.process_group,
tensor=self.placeholder,
src=in_edge.src,
group=in_edge.process_group,
)
dist.broadcast(tensor=msg, src=out_edge.src, group=out_edge.process_group)
self.in_msg_buffer.add_(self.placeholder) # type: ignore
......
......@@ -77,26 +77,26 @@ class GraphManager(ABC):
@abstractmethod
def is_regular_graph(self) -> bool:
""" Whether each node has the same number of in-peers as out-peers """
"""Whether each node has the same number of in-peers as out-peers"""
raise NotImplementedError
@abstractmethod
def is_bipartite_graph(self) -> bool:
""" Whether graph is bipartite or not """
"""Whether graph is bipartite or not"""
raise NotImplementedError
@abstractmethod
def is_passive(self, rank: Optional[int] = None) -> bool:
""" Whether 'rank' is a passive node or not """
"""Whether 'rank' is a passive node or not"""
raise NotImplementedError
@abstractmethod
def is_dynamic_graph(self) -> bool:
""" Whether the graph-type is dynamic (as opposed to static) """
"""Whether the graph-type is dynamic (as opposed to static)"""
raise NotImplementedError
def get_peers(self, rotate: bool = False) -> Tuple[List[int], List[int]]:
""" Returns the out and in-peers corresponding to 'self.rank' """
"""Returns the out and in-peers corresponding to 'self.rank'"""
# cycle through in- and out-peers by updating group-index
if rotate:
self._rotate_group_indices()
......@@ -113,8 +113,8 @@ class GraphManager(ABC):
return out_peers, in_peers
def get_edges(self, rotate: bool = False) -> Tuple[List[Edge], List[Edge]]:
""" Returns the pairwise process groups between rank and the out and
in-peers corresponding to 'self.rank' """
"""Returns the pairwise process groups between rank and the out and
in-peers corresponding to 'self.rank'"""
# cycle through in- and out-peers by updating group-index
if rotate:
self._rotate_group_indices()
......@@ -131,17 +131,17 @@ class GraphManager(ABC):
return out_edges, in_edges
def _rotate_group_indices(self) -> None:
""" Incerement group indices to point to the next out-peer """
"""Incerement group indices to point to the next out-peer"""
increment = self.peers_per_itr
for i, group_index in enumerate(self._group_indices):
self._group_indices[i] = int((group_index + increment) % len(self.phone_book[self.rank]))
def _rotate_forward(self, r: int, p: int) -> int:
""" Helper function returns peer that is p hops ahead of r """
"""Helper function returns peer that is p hops ahead of r"""
return (r + p) % self.world_size
def _rotate_backward(self, r: int, p: int) -> int:
""" Helper function returns peer that is p hops behind r """
"""Helper function returns peer that is p hops behind r"""
return (r - p) % self.world_size
......
......@@ -32,18 +32,18 @@ class MixingManager(ABC):
@abstractmethod
def is_uniform(self) -> bool:
""" Whether mixing weights are distributed uniformly over peers """
"""Whether mixing weights are distributed uniformly over peers"""
raise NotImplementedError
@abstractmethod
def get_mixing_weights(self, residual_adjusted: bool = True) -> Dict[Union[str, int], torch.Tensor]:
""" Create mixing weight dictionary using uniform allocation """
"""Create mixing weight dictionary using uniform allocation"""
raise NotImplementedError
class UniformMixing(MixingManager):
def get_mixing_weights(self, residual_adjusted: bool = True) -> Dict[Union[str, int], torch.Tensor]:
""" Create mixing weight dictionary using uniform allocation """
"""Create mixing weight dictionary using uniform allocation"""
mixing_weights: Dict[Union[str, int], torch.Tensor] = {}
out_peers, _ = self.graph_manager.get_peers()
......
......@@ -36,7 +36,7 @@ def create_event_recorder(event_name: str, dummy: bool = False) -> EventRecorder
class CudaEventRecorder(EventRecorder):
""" Allows profiling in an easy-to-use manner. CudaEventRecorder can be used
"""Allows profiling in an easy-to-use manner. CudaEventRecorder can be used
in a loop. When it is used in a loop (or when an event recorder is created
multiple times with the same name), get_timings returns the statistics of the
timings since the last reset. Note: in case the number of timings is greater than
......@@ -92,19 +92,22 @@ class CudaEventRecorder(EventRecorder):
time_taken_list = [event_recorder.find_time_elapsed() for event_recorder in event_recorder_list]
all_timings_str += ("{}: Time taken: avg: {}, std: {}, count: " "{}\n").format(
event_name, statistics.mean(time_taken_list), statistics.pstdev(time_taken_list), len(time_taken_list),
event_name,
statistics.mean(time_taken_list),
statistics.pstdev(time_taken_list),
len(time_taken_list),
)
return all_timings_str
@classmethod
def get_timings(cls) -> str:
""" Returns the timings since last reset was called """
"""Returns the timings since last reset was called"""
return cls.get_common_timings(cls.event_recorders, "Timings since last reset")
@classmethod
def get_all_timings(cls) -> str:
""" Returns the statistics of all the timings """
"""Returns the statistics of all the timings"""
return cls.get_common_timings(cls.all_event_recorders, "All timings")
......
......@@ -86,7 +86,10 @@ def communicate(tensors: List[torch.Tensor], communication_op: Any, logger: logg
if logger is not None:
logger.debug("Commmunication completed")
with torch.no_grad():
for f, t in zip(unflatten_tensors(flat_tensor, tensors_with_same_dtype), tensors_with_same_dtype,):
for f, t in zip(
unflatten_tensors(flat_tensor, tensors_with_same_dtype),
tensors_with_same_dtype,
):
t.copy_(f)
if logger is not None:
logger.debug("Unflatten completed")
......
......@@ -198,7 +198,9 @@ class PipelineModulesGraph(nn.Module):
remote_module = partition[0].module.get_module_rref()
else:
remote_module = rpc.remote(
partition[0].module.on, RemoteSequential, args=([p.module.get_module_rref() for p in partition],),
partition[0].module.on,
RemoteSequential,
args=([p.module.get_module_rref() for p in partition],),
)
partitions.append((partition, remote_module))
......
......@@ -25,7 +25,7 @@ ExcInfo = Tuple[Type[BaseException], BaseException, TracebackType]
class DistributedPipelineRecord:
""" A class for storing a single mini-batch (consisting of multiple micro-batches) as input to
"""A class for storing a single mini-batch (consisting of multiple micro-batches) as input to
a single partition.
Args:
device: the local device that runs the partition.
......@@ -73,7 +73,7 @@ class DistributedPipelineRecord:
return {}
def feed(self, chunk: int, input_idx: int, input: Tensor) -> Tensor:
""" This function is called remotely to provide individual tensors of a given chunk."""
"""This function is called remotely to provide individual tensors of a given chunk."""
if input.device.type == "cpu":
input = input.to(self.device)
cuda_stream = torch.cuda.current_stream(input.device) if input.device.type == "cuda" else None
......
......@@ -70,7 +70,12 @@ class DistributedPipeline(nn.Module):
DataConsumer = DataConsumer[Partition]
def __init__(self, graph: PipelineModulesGraph, chunks: int = 1, checkpoint: str = "except_last",) -> None:
def __init__(
self,
graph: PipelineModulesGraph,
chunks: int = 1,
checkpoint: str = "except_last",
) -> None:
super().__init__()
check_pytorch_version()
......