[refactor] AsyncPipe: do not sub-class MultiProcessPipe (#370)

08c10993 · msbaines · GitHub · 77d94861 · 08c10993 · 08c10993
Unverified Commit 08c10993 authored Feb 08, 2021 by msbaines Committed by GitHub Feb 08, 2021
4 changed files
--- a/fairscale/nn/pipe/async_pipe.py
+++ b/fairscale/nn/pipe/async_pipe.py
@@ -6,14 +6,20 @@
 from collections import OrderedDict
 from dataclasses import dataclass, field
 import itertools
-from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Union
+import threading
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union
+import warnings

 import torch
 from torch import Tensor, nn

+from fairscale.nn.model_parallel import get_pipeline_parallel_group
+
+from . import microbatch
 from .async_pipeline import AsyncPipeline
 from .async_schedule import Invocation, Location, ModuleWrapper
-from .multiprocess_pipe import MultiProcessPipe
+from .batchnorm import DeferredBatchNorm
+from .skip.layout import SkipLayout
 from .skip.skippable import Skippable
 from .types import LazyModule

@@ -38,10 +44,164 @@ class PartitionInfo:
        return len(self.modules)


-class AsyncPipe(MultiProcessPipe):
-    def __init__(self, *args: Any, **kwargs: Any):
-        super().__init__(*args, **kwargs)
-        self.pipelined_backward = False
+def verify_module(module: Union[nn.Sequential, List[LazyModule]]) -> None:
+    if len(set(map(id, module))) != len(module):
+        raise ValueError("module with duplicate children is not supported")
+
+
+def check_balance(module: Union[nn.Sequential, List[LazyModule]], balance: List[int]) -> None:
+    if len(module) != sum(balance):
+        raise ValueError(
+            f"module and sum of balance have different length (module: {len(module)}, sum of balance: {sum(balance)})"
+        )
+
+    if any(x <= 0 for x in balance):
+        raise ValueError(f"all balance numbers must be positive integer (balance: {balance})")
+
+
+MOVING_DENIED = TypeError("denied to move parameters and buffers, because Pipe should manage device placement")
+
+
+class AsyncPipe(Module):
+    """Wraps an arbitrary :class:`nn.Sequential <torch.nn.Sequential>` module
+    to train on Pipe_. If the module requires lots of memory, Pipe will be
+    very efficient.
+
+    Pipe combines pipeline parallelism with checkpointing to reduce peak
+    memory required to train while minimizing device under-utilization.
+
+    You should determine the balance when defining a :class:`AsyncPipe` module, as
+    balancing will not be done automatically. The module will be partitioned
+    into multiple devices according to the given balance. You may rely on
+    heuristics to find your own optimal configuration.
+
+    Args:
+        module (torch.nn.Sequential or List[LazyModule]):
+            sequential module (or list of lazy modules) to be parallelized
+        balance (ints):
+            list of number of layers in each partition
+
+    Keyword Args:
+        group (ProcessGroup):
+            the process group that all
+            pipeline stages are a member of. Defaults to
+            `get_pipeline_parallel_group()`
+        worker_map (Dict[int, str]):
+            a map from global rank (i.e. `torch.distributed.get_rank()`)
+            to worker name (the first argument to
+            `torch.distributed.rpc.init_rpc`), needed in order for pipeline
+            stages to communicate with each other
+        input_device (device):
+            the device on which tensors should be located before being passed to
+            the first module in a given pipeline stage
+        chunks (int):
+            number of micro-batches (default: ``1``)
+        checkpoint (str):
+            when to enable checkpointing, one of ``'always'``,
+            ``'except_last'``, or ``'never'`` (default: ``'except_last'``)
+        deferred_batch_norm (bool):
+            whether to use deferred BatchNorm moving statistics (default:
+            :data:`False`, see :class:`DeferredBatchNorm` for more
+            details)
+
+    Raises:
+        TypeError:
+            the module is not a :class:`nn.Sequential <torch.nn.Sequential>`.
+        ValueError:
+            invalid arguments, or wrong balance
+        IndexError:
+            the number of ranks in the group is fewer than the number of partitions.
+
+    """
+
+    #: The number of layers in each partition.
+    balance: List[int] = []
+    #                    ^^
+    # The default value [] required for Sphinx's autoattribute.
+
+    #: The devices mapped to each partition.
+    #:
+    #: ``devices[-1]`` refers to the device of the last partition, which means
+    #: it is the output device. Probably, you need to use it to transfer the
+    #: target to calculate the loss without a device mismatch
+    #: :exc:`RuntimeError`. For example::
+    #:
+    #:     out_device = pipe.devices[-1]
+    #:
+    #:     for input, target in loader:
+    #:         target = target.to(out_device, non_blocking=True)
+    #:         output = pipe(input)
+    #:         loss = F.cross_entropy(output, target)
+    #:
+
+    #: The number of micro-batches.
+    chunks: int = 1
+
+    #: The checkpoint mode to determine when to enable checkpointing. It is one
+    #: of ``'always'``, ``'except_last'``, or ``'never'``.
+    checkpoint: str = "except_last"
+
+    def __init__(
+        self,
+        module: Union[nn.Sequential, List[LazyModule]],
+        balance: Iterable[int],
+        *,
+        group: Optional[torch.distributed.ProcessGroup] = None,
+        worker_map: Optional[Dict[int, str]] = None,
+        input_device: Union[None, int, str, torch.device] = None,
+        chunks: int = chunks,
+        checkpoint: str = checkpoint,
+        deferred_batch_norm: bool = False,
+    ) -> None:
+        super().__init__()
+
+        if chunks <= 0:
+            raise ValueError("number of chunks must be positive integer")
+        if checkpoint not in ["always", "except_last", "never"]:
+            raise ValueError("checkpoint is not one of 'always', 'except_last', or 'never'")
+
+        self.balance = list(balance)
+        verify_module(module)
+        check_balance(module, self.balance)
+
+        self.chunks = chunks
+        self.checkpoint = checkpoint
+        self.pipeline: Optional[AsyncPipeline]
+        self.lock = threading.Lock()
+
+        self.worker_map = worker_map
+        self.input_device = input_device
+
+        self.group: torch.distributed.ProcessGroup
+        if group is None:
+            self.group = get_pipeline_parallel_group()
+        else:
+            self.group = group
+
+        if self.group.size() < len(self.balance):
+            raise IndexError(
+                f"too few ranks to hold given partitions (ranks: {self.group.size()}, partitions:"
+                f" {len(self.balance)})"
+            )
+
+        self._skip_layout = SkipLayout(len(module), {})  # FIXME(tom)
+
+        rank = self.group.rank()
+        self.final_stage = rank == len(self.balance) - 1
+        if rank >= len(self.balance):
+            warnings.warn("More ranks than partitions, some ranks unused")
+            self.partitions: List[ModuleWrapper] = []
+            self.pipeline = None
+        else:
+            self.partitions = self.instantiate_partition(module, self.balance, self.group)
+            if deferred_batch_norm:
+                for part in self.partitions:
+                    part.module = DeferredBatchNorm.convert_deferred_batch_norm(part.module, chunks)
+            for name, part in enumerate(self.partitions):
+                self.add_module(str(name), part.module)
+            self.create_pipeline()
+
+        del module

    def create_pipeline(self) -> None:
        # The micro-batch index where the checkpointing stops.
@@ -150,3 +310,85 @@ class AsyncPipe(MultiProcessPipe):
            result.append(wrapper)

        return result
+
+    def __len__(self) -> int:
+        """Counts the length of the underlying sequential module."""
+        return sum(len(p) for p in self.partitions)
+
+    def __getitem__(self, index: int) -> nn.Module:
+        """Gets a layer in the underlying sequential module."""
+        partitions: List[Any]
+        partitions = self.partitions
+
+        if index < 0:
+            partitions = partitions[::-1]
+
+        for partition in partitions:
+            try:
+                if isinstance(partition, ModuleWrapper):
+                    return partition.module[index]
+                else:
+                    return partition[index]
+            except IndexError:
+                pass
+
+            shift = len(partition)
+
+            if index < 0:
+                index += shift
+            else:
+                index -= shift
+
+        raise IndexError
+
+    def __iter__(self) -> Iterable[nn.Module]:
+        """Iterates over children of the underlying sequential module."""
+        for partition in self.partitions:
+            yield from partition.module
+
+    def forward(self, input: TensorOrTensors, *, event=None) -> TensorOrTensors:  # type: ignore
+        """:class:`AsyncPipe` is a fairly transparent module wrapper. It doesn't
+        modify the input and output signature of the underlying module. But
+        there's type restriction. Input and output have to be a
+        :class:`~torch.Tensor` or a tuple of tensors. This restriction is
+        applied at partition boundaries too.
+
+        Args:
+            input (torch.Tensor or tensors): input mini-batch
+
+        Returns:
+            tensor or tensors: output mini-batch
+
+        Raises:
+            TypeError: input is not a tensor or tensors.
+
+        """
+        microbatch.check(input)
+
+        if not self.pipeline:
+            # Having no pipeline is legal: this rank holds no partition (more ranks than partitions)
+            return input
+
+        # Divide a mini-batch into micro-batches.
+        batches = microbatch.scatter(input, self.chunks)
+
+        # Run pipeline parallelism.
+        with self.lock:
+            self.pipeline.run(self.training, batches, event)
+
+            if self.final_stage:
+                output = microbatch.gather(batches)
+            else:
+                # Don't merge micro-batches to avoid unnecessary edges in autograd
+                # graph
+                # FIXME(tom) should figure out a proper type here
+                output = batches  # type: ignore
+
+            return output
+
+    def back_helper(self, output: List[microbatch.Batch]) -> None:
+        if self.final_stage:
+            raise ValueError("back_helper should only be called on non-final stages")
+
+        if self.pipeline:
+            self.pipeline.back_helper(output)
--- a/fairscale/nn/pipe/async_pipeline.py
+++ b/fairscale/nn/pipe/async_pipeline.py
@@ -4,18 +4,54 @@
 # LICENSE file in the root directory of this source tree.

 import logging
+import os
 from threading import Event
-from typing import List, Optional
+from typing import Dict, List, Optional, Union

 import torch

-from .async_schedule import AsyncEventLoop
+from .async_schedule import AsyncEventLoop, ModuleWrapper
+from .messages import MakeTransport
 from .microbatch import Batch
-from .multiprocess_pipeline import MultiProcessPipeline
+from .skip.layout import SkipLayout
 from .skip.tracker import SkipTrackerThroughPotals


-class AsyncPipeline(MultiProcessPipeline):
+class AsyncPipeline:
+    """The async pipeline parallelism for Pipe."""
+
+    def __init__(
+        self,
+        partitions: List[ModuleWrapper],
+        skip_layout: SkipLayout,
+        checkpoint_stop: int,
+        group: torch.distributed.ProcessGroup,
+        *,
+        worker_map: Optional[Dict[int, str]] = None,
+        input_device: Union[None, int, str, torch.device] = None,
+        final_stage: bool = False,
+    ) -> None:
+        self.partitions = partitions
+        self.skip_layout = skip_layout
+        self.__checkpoint_stop = checkpoint_stop
+        self.group = group
+        self.training: bool
+        self.transport = MakeTransport(
+            use_rpc=("OMPI_COMM_WORLD_RANK" not in os.environ) or ("FORCE_RPC" in os.environ),
+            worker_map=worker_map,
+            input_device=input_device,
+        )
+        self.input_device = input_device
+        self.final_stage = final_stage
+
+    @property
+    def checkpoint_stop(self) -> int:
+        # Disable checkpointing if in eval mode.
+        training = self.partitions[0].module.training
+        if not training:
+            return 0
+        return self.__checkpoint_stop
+
    def run(self, training: bool, batches: List[Batch], event: Optional[Event]) -> None:

        """Runs pipeline parallelism.

--- a/fairscale/nn/pipe/multiprocess_pipe.py
+++ b/fairscale/nn/pipe/multiprocess_pipe.py
@@ -20,7 +20,7 @@
 """The MultiProcessPipe interface."""
 from collections import OrderedDict
 import threading
-from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union
 import warnings

 import torch
@@ -210,17 +210,18 @@ class MultiProcessPipe(Module):
        self.final_stage = rank == len(self.balance) - 1
        if rank >= len(self.balance):
            warnings.warn("More ranks than partitions, some ranks unused")
-            self.partitions: List[ModuleWrapper] = []
+            self.partition = nn.Sequential()
            self.pipeline = None
        else:
-            self.partitions = self.instantiate_partition(module, self.balance, self.group)
+            self.partition = self.instantiate_partition(module, self.balance, self.group)
            if deferred_batch_norm:
-                for part in self.partitions:
-                    part.module = DeferredBatchNorm.convert_deferred_batch_norm(part.module, chunks)
-            for name, part in enumerate(self.partitions):
-                self.add_module(str(name), part.module)
+                self.partition = DeferredBatchNorm.convert_deferred_batch_norm(self.partition, chunks)
+            self.add_module(str(0), self.partition)
            self.create_pipeline()

+        # TODO(msb) Remove this hack at some point.
+        self.partitions = [ModuleWrapper(self.partition, Location(self.group.rank(), 0))]
+
        del module

    def create_pipeline(self) -> None:
@@ -228,7 +229,7 @@ class MultiProcessPipe(Module):
        checkpoint_stop = {"always": self.chunks, "except_last": self.chunks - 1, "never": 0}[self.checkpoint]

        self.pipeline = MultiProcessPipeline(
-            self.partitions,
+            [ModuleWrapper(self.partition, Location(self.group.rank(), 0))],
            self._skip_layout,
            checkpoint_stop,
            group=self.group,
@@ -239,48 +240,25 @@ class MultiProcessPipe(Module):

    def instantiate_partition(
        self, module: Union[nn.Sequential, List[LazyModule]], balance: List[int], group: torch.distributed.ProcessGroup,
-    ) -> List[ModuleWrapper]:
+    ) -> nn.Sequential:
        rank = group.rank()
        first_layer = sum(balance[:rank])
        num_layers = balance[rank]
        layers = module[first_layer : first_layer + num_layers]
        instantiated_layers = [l if isinstance(l, nn.Module) else l() for l in layers]
-        return [ModuleWrapper(nn.Sequential(*instantiated_layers), Location(rank, 0))]
+        return nn.Sequential(*instantiated_layers)

    def __len__(self) -> int:
        """Counts the length of the underlying sequential module."""
-        return sum(len(p) for p in self.partitions)
+        return self.partition.__len__()

    def __getitem__(self, index: int) -> nn.Module:
        """Gets a layer in the underlying sequential module."""
-        partitions: List[Any]
-        partitions = self.partitions
-
-        if index < 0:
-            partitions = partitions[::-1]
-
-        for partition in partitions:
-            try:
-                if isinstance(partition, ModuleWrapper):
-                    return partition.module[index]
-                else:
-                    return partition[index]
-            except IndexError:
-                pass
-
-            shift = len(partition)
-
-            if index < 0:
-                index += shift
-            else:
-                index -= shift
-
-        raise IndexError
+        return self.partition.__getitem__(index)

    def __iter__(self) -> Iterable[nn.Module]:
        """Iterates over children of the underlying sequential module."""
-        for partition in self.partitions:
-            yield from partition.module
+        return self.partition.__iter__()

    def forward(self, input: TensorOrTensors, *, event=None) -> TensorOrTensors:  # type: ignore
        """:class:`MultiProcessPipe` is a fairly transparent module wrapper. It doesn't

--- a/fairscale/nn/pipe/rpc.py
+++ b/fairscale/nn/pipe/rpc.py
@@ -14,13 +14,12 @@ from torch.distributed.distributed_c10d import _get_global_rank
 from fairscale.nn.model_parallel.initialize import get_pipeline_parallel_group

 from .async_pipe import AsyncPipe
-from .multiprocess_pipe import MultiProcessPipe
 from .types import EVENT_LOOP_QUEUE, PipeMessage, TensorOrTensors

 DEFAULT_MAX_SOURCE_POSITIONS = 1024
 DEFAULT_MAX_TARGET_POSITIONS = 1024

-PipeModel: MultiProcessPipe
+PipeModel: AsyncPipe
 PipeResult: TensorOrTensors


@@ -72,7 +71,7 @@ class PipeBackRedirect(torch.autograd.Function):
        return (None, None, None, None, None, None)


-def callback_with_model(callback: Callable[[Any, MultiProcessPipe], None], ctx: Any) -> None:
+def callback_with_model(callback: Callable[[Any, AsyncPipe], None], ctx: Any) -> None:
    try:
        group = get_pipeline_parallel_group()  # FIXME(tom) handle dynamic group
        set_device_based_on_group(group)
@@ -121,7 +120,7 @@ class PipeRPCWrapper(nn.Module):
        futures = [f.wait() for f in futures]

    def foreach_worker(
-        self, callback: Callable[[Any, MultiProcessPipe], None], ctx: Any = None, *, include_self: bool = False
+        self, callback: Callable[[Any, AsyncPipe], None], ctx: Any = None, *, include_self: bool = False
    ) -> None:
        """Call `callback` on each worker with the `ctx` and model local to that
        worker. e.g.
@@ -197,7 +196,7 @@ class PipeRPCWrapper(nn.Module):

    @staticmethod
    def _recv_result(
-        model: MultiProcessPipe, shapes: SizeOrSizes, dtypes: DtypeOrDtypes, message: PipeMessage
+        model: AsyncPipe, shapes: SizeOrSizes, dtypes: DtypeOrDtypes, message: PipeMessage
    ) -> TensorOrTensors:
        group = get_pipeline_parallel_group()
        set_device_based_on_group(group)
@@ -245,7 +244,7 @@ class PipeRPCWrapper(nn.Module):
        set_device_based_on_group(group)
        kwargs["group"] = group
        kwargs["input_device"] = torch.device("cuda", torch.cuda.current_device())
-        model = MultiProcessPipe(*args, **kwargs)
+        model = AsyncPipe(*args, **kwargs)
        model.cuda()
        global PipeModel
        PipeModel = model