Commit 0cd65242 authored by Mandeep Singh Baines

Initial commit

# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
# Copyright 2019 Kakao Brain
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Multithreading in pipeline parallelism."""
from contextlib import contextmanager
from queue import Queue
import sys
from threading import Thread
from types import TracebackType
from typing import TYPE_CHECKING, Callable, Dict, Generator, List, Optional, Tuple, Type, Union, cast
import torch
from .microbatch import Batch
from .stream import AbstractStream, use_device, use_stream
__all__: List[str] = []
ExcInfo = Tuple[Type[BaseException], BaseException, TracebackType]
# Queue is generic only in stubs.
# https://mypy.readthedocs.io/en/latest/common_issues.html#using-classes-that-are-generic-in-stubs-but-not-at-runtime
if TYPE_CHECKING:
InQueue = Queue[Optional["Task"]]
OutQueue = Queue[Tuple[bool, Union[Tuple["Task", Batch], ExcInfo, None]]]
else:
InQueue = Queue
OutQueue = Queue
class Task:
"""A task represents how to compute a micro-batch on a partition.
It consists of two parts: :meth:`compute` and :meth:`finalize`.
:meth:`compute` should be executed in worker threads concurrently.
:meth:`finalize` should be executed after the worker threads have finished
executing :meth:`compute`.
:meth:`compute` benefits from running in worker threads because the user code
it wraps issues several CUDA API calls, and in PyTorch parallel CUDA API
calls are not serialized by the GIL, so more than one CUDA API call can be
in flight at the same time.
"""
def __init__(
self, stream: AbstractStream, *, compute: Callable[[], Batch], finalize: Optional[Callable[[Batch], None]],
) -> None:
self.stream = stream
self._compute = compute
self._finalize = finalize
self._grad_enabled = torch.is_grad_enabled()
def compute(self) -> Batch:
with use_stream(self.stream), torch.set_grad_enabled(self._grad_enabled):
return self._compute()
def finalize(self, batch: Batch) -> None:
if self._finalize is None:
return
with use_stream(self.stream), torch.set_grad_enabled(self._grad_enabled):
self._finalize(batch)
def worker(in_queue: InQueue, out_queue: OutQueue, device: torch.device) -> None:
"""The main loop of a worker thread."""
with use_device(device):
while True:
task = in_queue.get()
if task is None:
break
try:
batch = task.compute()
except Exception:
exc_info = cast(ExcInfo, sys.exc_info())
out_queue.put((False, exc_info))
continue
out_queue.put((True, (task, batch)))
done = (False, None)
out_queue.put(done)
def create_workers(devices: List[torch.device],) -> Tuple[List[InQueue], List[OutQueue]]:
"""Spawns worker threads. A worker thread is bound to a device."""
in_queues: List[InQueue] = []
out_queues: List[OutQueue] = []
# Spawn workers.
workers: Dict[torch.device, Tuple[InQueue, OutQueue]] = {}
def normalize_device(device: torch.device) -> torch.device:
if device.type == "cuda" and device.index is None:
return torch.device("cuda", index=torch.cuda.current_device())
if device.type == "cpu" and device.index is not None:
return torch.device("cpu")
return device
for device in devices:
device = normalize_device(device)
try:
in_queue, out_queue = workers[device]
except KeyError:
in_queue = Queue()
out_queue = Queue()
workers[device] = (in_queue, out_queue)
t = Thread(target=worker, args=(in_queue, out_queue, device), daemon=True,)
t.start()
in_queues.append(in_queue)
out_queues.append(out_queue)
return (in_queues, out_queues)
def join_workers(in_queues: List[InQueue], out_queues: List[OutQueue]) -> None:
# Close workers.
for in_queue in set(in_queues):
in_queue.put(None)
# Join running workers.
running = set(out_queues)
while running:
out_queue = running.pop()
ok, payload = out_queue.get()
done = (False, None)
if (ok, payload) == done:
continue
running.add(out_queue)
@contextmanager
def spawn_workers(devices: List[torch.device],) -> Generator[Tuple[List[InQueue], List[OutQueue]], None, None]:
try:
(in_queues, out_queues) = create_workers(devices)
yield (in_queues, out_queues)
finally:
join_workers(in_queues, out_queues)
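# ---------------------------------------------------------------------------
# Editor's sketch (not part of this commit): one way the worker machinery above
# could be driven for a single micro-batch. The import paths and the CPUStream
# name are assumptions carried over from the upstream torchgpipe code this file
# is adapted from; only Task, spawn_workers, and Batch appear in this commit.
import torch

from fairscale.nn.pipe.microbatch import Batch   # assumed module path
from fairscale.nn.pipe.stream import CPUStream   # assumed name of the CPU stream singleton
from fairscale.nn.pipe.worker import Task, spawn_workers


def run_one_microbatch() -> Batch:
    device = torch.device("cpu")
    # Spawn one worker thread bound to the CPU device and hand it a single Task.
    with spawn_workers([device]) as (in_queues, out_queues):
        task = Task(CPUStream, compute=lambda: Batch(torch.zeros(1)), finalize=None)
        in_queues[0].put(task)
        ok, payload = out_queues[0].get()
        if not ok:
            # The worker caught an exception inside compute(); re-raise it here.
            _, exc_value, tb = payload
            raise exc_value.with_traceback(tb)
        finished_task, batch = payload
        finished_task.finalize(batch)
        return batch
# ---------------------------------------------------------------------------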
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
"""
:mod:`fairscale.optim` is a package implementing various torch optimization algorithms.
"""
from .oss import OSS
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import copy
from typing import TYPE_CHECKING, Any, Callable, List, Optional, Type
import torch.distributed as dist
from torch.optim import SGD, Optimizer
if TYPE_CHECKING:
from torch.optim.optimizer import _params_t
else:
_params_t = Any
class OSS(Optimizer):
"""Wraps an arbitrary :class:`optim.Optimizer <torch.optim.Optimizer>`
optimizer and shards its state as described by ZeRO_.
::
opt = OSS(params, optim=torch.optim.Adam, lr=0.01)
.. _ZeRO: https://arxiv.org/abs/1910.02054
Each rank constructs the wrapped optimizer only for its own shard of the
parameters, so the optimizer state is partitioned across the process
group. After the local :meth:`step`, every rank broadcasts the parameters
it owns so that all ranks end up with the same updated model.
Args:
params (list of tensors):
parameters to be optimized
Keyword Args:
optim (torch.optim.Optimizer):
optimizer to shard (default: SGD)
group (group):
torch.distributed group (default: group.WORLD)
"""
optim: Optimizer
in_super_constructor: bool
def __init__(self, params: _params_t, optim: Type[Optimizer] = SGD, group: Any = dist.group.WORLD, **defaults: Any):
self.in_super_constructor = True
super().__init__(params, defaults)
self.in_super_constructor = False
self.group = group
self.rank = dist.get_rank(group)
param_groups = self.partition_parameters()
self.optim = optim(param_groups[self.rank], **defaults)
def partition_parameters(self) -> List[List[dict]]:
"""Partitions parameters across distributed ranks.
Returns a list of param_groups (each itself a list of dicts), where
element 0 holds the param_groups assigned to rank 0, element 1 to rank 1,
and so on. Parameters are assigned greedily to the rank with the smallest
total size so far. All ranks are needed for the broadcast inside step().
"""
world_size = dist.get_world_size(self.group)
param_groups: List[List] = [list() for _ in range(world_size)]
sizes = [0] * world_size
for param_group in self.param_groups:
param_lists: List[List] = [list() for _ in range(world_size)]
for param in param_group["params"]:
# Add this param to rank with smallest size.
rank = sizes.index(min(sizes))
param_lists[rank].append(param)
sizes[rank] += param.numel()
for rank, params in enumerate(param_lists):
if len(params):
pg = copy.copy(param_group)
pg["params"] = params
param_groups[rank].append(pg)
return param_groups
def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]:
loss = self.optim.step(closure=closure)
for rank, param_groups in enumerate(self.partition_parameters()):
for param_group in param_groups:
for param in param_group["params"]:
dist.broadcast(param, rank, group=self.group)
return loss
def state_dict(self) -> dict:
""" Gets this rank's state_dict. """
return self.optim.state_dict()
def load_state_dict(self, state_dict: dict) -> None:
""" Loads this rank's state_dict. """
self.optim.load_state_dict(state_dict)
def add_param_group(self, param_group: dict) -> None:
super().add_param_group(param_group)
if not self.in_super_constructor:
param_groups = self.partition_parameters()[self.rank]
if len(param_groups) == len(self.optim.param_groups) + 1:
self.optim.add_param_group(param_groups[-1])
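# ---------------------------------------------------------------------------
# Editor's sketch (not part of this commit): minimal end-to-end use of OSS.
# The package path fairscale.optim follows the __init__ above; process-group
# setup details (backend, init method) are assumptions for illustration only.
# partition_parameters() assigns each parameter greedily to the rank with the
# smallest running total, e.g. four parameters of sizes [10, 8, 6, 4] on two
# ranks end up as rank 0 -> {10, 4} and rank 1 -> {8, 6}.
import torch
import torch.distributed as dist

from fairscale.optim import OSS


def train(model: torch.nn.Module, data: torch.Tensor) -> None:
    # Assumes dist.init_process_group(...) has already been called on every rank.
    optimizer = OSS(model.parameters(), optim=torch.optim.SGD, lr=0.1)
    optimizer.zero_grad()
    loss = model(data).sum()
    loss.backward()
    # Local SGD step on this rank's shard, then a broadcast of every shard so
    # all ranks hold the same updated parameters.
    optimizer.step()
# ---------------------------------------------------------------------------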
[build-system]
requires = [
"setuptools >= 40.6.2",
"wheel >= 0.30.0"
]
build-backend = "setuptools.build_meta"
[tool.black]
line-length = 120
exclude = '''
/(
\.git
| \.mypy_cache
| \.pytest_cache
| build
| stubs
)/
'''
-r requirements.txt
pre-commit
black == 19.10b0
flake8 == 3.7.9
isort == 4.3.21
mypy == 0.770
pytest == 5.4.1
torchtext == 0.6.0
torch == 1.4.0
# NOTE(msb) not a dependency but needed by torch == 1.4.0
numpy == 1.17.4
# -----------------------------------------------------------------------------
# pytest
# -----------------------------------------------------------------------------
[tool:pytest]
testpaths = tests
addopts = --verbose
junit_family = xunit2
[aliases]
test = pytest
# -----------------------------------------------------------------------------
# coverage
# -----------------------------------------------------------------------------
[coverage:report]
# Coverage couldn't detect backward functions because they are called by C++.
# Append "# pragma: no cover" to the definition lines to ignore them.
# https://www.janfreyberg.com/blog/2019-04-01-testing-pytorch-functions/
exclude_lines = pragma: no cover
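# Editor's note: a hypothetical illustration of the pragma above, where a
# backward() that is only invoked from C++ is excluded from the report:
#     @staticmethod
#     def backward(ctx, grad_output):  # pragma: no cover
#         return grad_output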
# -----------------------------------------------------------------------------
# flake8
# -----------------------------------------------------------------------------
[flake8]
select = B,C,E,F,P,T4,W,B9
max-line-length = 120
# C408 ignored because we like the dict keyword argument syntax
# E501 is not flexible enough, we're using B950 instead
ignore =
E203,E305,E402,E501,E721,E741,F403,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,
per-file-ignores = __init__.py: F401
exclude = build,*.pyi,.git
# -----------------------------------------------------------------------------
# isort
# -----------------------------------------------------------------------------
[isort]
line_length = 120
multi_line_output=3
include_trailing_comma=True
force_grid_wrap=0
use_parentheses=True
skip_glob = build/*,stubs/*
# Don't split "import" and "from".
force_sort_within_sections = true
known_third_party = models,pytest,setuptools,torch,torchtext
# -----------------------------------------------------------------------------
# mypy
# -----------------------------------------------------------------------------
# Docs for mypy config: https://mypy.readthedocs.io/en/latest/config_file.html
[mypy]
mypy_path = ./stubs/
follow_imports = normal
# This project must be strictly typed.
[mypy-fairscale.*]
check_untyped_defs = true
disallow_untyped_defs = true
disallow_untyped_calls = true
disallow_untyped_decorators = true
disallow_incomplete_defs = true
warn_unused_ignores = true
# Ignore missing imports from untyped third-party libraries.
[mypy-torch.*,torchvision.*,setuptools.*,pytest.*]
ignore_missing_imports = true
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
import setuptools
def fetch_requirements():
with open("requirements.txt") as f:
reqs = f.read().strip().split("\n")
return reqs
if __name__ == "__main__":
setuptools.setup(
name="fairscale",
description="fairscale: Utility library for large-scale and high-performance training.",
install_requires=fetch_requirements(),
include_package_data=True,
packages=setuptools.find_packages(exclude=("tests", "tests.*")),
python_requires=">=3.6",
author="Facebook AI Research",
author_email="todo@fb.com",
classifiers=[
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"License :: OSI Approved :: BSD License",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Operating System :: OS Independent",
],
)
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
from typing import Any, Callable, Union, Tuple, Sequence, Optional
from .. import Tensor
from .grad_mode import no_grad as no_grad, enable_grad as enable_grad, \
set_grad_enabled as set_grad_enabled
from .profiler import record_function
# TODO make Variable and Function more precise
class Variable:
...
class Function:
@staticmethod
def forward(ctx: Any, *args: Any, **kwargs: Any) -> Any: ...
@staticmethod
def backward(ctx: Any, *grad_outputs: Any) -> Any: ...
#MODIFIED BY TORCHGPIPE
@staticmethod
def apply(*args: Any, **kwargs: Any) -> Any: ...
#END
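# Editor's note: `apply` is added to this stub (block above) presumably because
# user code invokes custom autograd functions through it, e.g.:
#     class Scale(Function):
#         @staticmethod
#         def forward(ctx, x, alpha):
#             ctx.alpha = alpha
#             return x * alpha
#         @staticmethod
#         def backward(ctx, grad):
#             return grad * ctx.alpha, None
#     y = Scale.apply(torch.ones(3), 2.0)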
class NestedIOFunction(Function):
# The 'type: ignore' statements are needed here because these functions are declared as '@staticmethod' in the
# superclass (Function) but are instance methods here, which mypy reports as incompatible.
def backward(self, *gradients: Any) -> Any: ... # type: ignore
def forward(self, *args: Any) -> tuple: ... # type: ignore
def save_for_backward(self, *args: Any) -> None: ...
def mark_dirty(self, *args: Any, **kwargs: Any) -> None: ...
def mark_non_differentiable(self, *args: Any, **kwargs: Any) -> None: ...
def forward_extended(self, *input: Any) -> None: ...
def backward_extended(self, *grad_output: Any) -> None: ...
# 'func' accepts a vararg of tensors, which isn't expressible in the type system at the moment.
# If https://mypy.readthedocs.io/en/latest/additional_features.html?highlight=callable#extended-callable-types is accepted,
# the '...' first argument of Callable can be replaced with VarArg(Tensor).
# For now, we permit any input.
def gradcheck(func: Callable[..., Union[Tensor, Tuple[Tensor, ...]]], inputs: Union[Tensor, Tuple[Tensor, ...]], eps: float=..., atol: float=..., rtol: float=..., raise_exception: bool=..., check_sparse_nnz: bool=...) -> bool: ...
def gradgradcheck(func: Callable[..., Union[Tensor, Tuple[Tensor, ...]]], inputs: Union[Tensor, Tuple[Tensor, ...]], eps: float=..., atol: float=..., rtol: float=..., gen_non_contig_grad_outputs: bool=..., raise_exception: bool=...) -> bool: ...
class detect_anomaly:
def __enter__(self) -> None: ...
def __exit__(self, *args: Any) -> bool: ...
class set_detect_anomaly:
def __init__(self, mode: bool) -> None: ...
def __enter__(self) -> None: ...
def __exit__(self, *args: Any) -> bool: ...
_TensorOrTensors = Union[Tensor, Sequence[Tensor]]
def backward(tensors: _TensorOrTensors, grad_tensors: Optional[_TensorOrTensors]=..., retain_graph: Optional[bool]=..., create_graph: bool=...) -> None: ...
def grad(outputs: _TensorOrTensors, inputs: _TensorOrTensors, grad_outputs: Optional[_TensorOrTensors]=..., retain_graph: Optional[bool]=..., create_graph: bool=..., only_inputs: bool=..., allow_unused: bool=...) -> Tuple[Tensor, ...]: ...
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
from typing import Any, Callable, Optional, TypeVar
# Used for annotating the decorator usage of 'no_grad' and 'enable_grad'.
# See https://mypy.readthedocs.io/en/latest/generics.html#declaring-decorators
FuncType = Callable[..., Any]
T = TypeVar('T', bound=FuncType)
class no_grad:
def __enter__(self) -> None: ...
def __exit__(self, *args: Any) -> Optional[bool]: ...
def __call__(self, func: T) -> T: ...
class enable_grad:
def __enter__(self) -> None: ...
def __exit__(self, *args: Any) -> Optional[bool]: ...
def __call__(self, func: T) -> T: ...
class set_grad_enabled:
def __init__(self, mode: bool) -> None: ...
def __enter__(self) -> None: ...
def __exit__(self, *args: Any) -> Optional[bool]: ...
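# Editor's note: the __call__ overloads above are what let these context managers
# also type-check in decorator form, e.g.:
#     @no_grad()
#     def evaluate(model, batch): ...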
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
from typing import Any, ContextManager, Optional
class record_function(ContextManager[None]):
def __init__(self, name: str) -> None: ...
def __enter__(self) -> None: ...
def __exit__(self, *args: Any) -> Optional[bool]: ...
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#MODIFIED BY TORCHGPIPE
from . import cudnn
#END
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#MODIFIED BY TORCHGPIPE
def version() -> int: ...
#END
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
from typing import Optional, Tuple, Union
import ctypes
from .. import device as _device
def is_available() -> bool: ...
def init() -> None: ...
class cudaStatus:
SUCCESS: int
ERROR_NOT_READY: int
class CudaError:
def __init__(self, code: int) -> None: ...
class _CudaDeviceProperties:
name: str
major: int
minor: int
multi_processor_count: int
total_memory: int
is_integrated: int
is_multi_gpu_board: int
_device_t = Union[_device, int]
def check_error(res: int) -> None: ...
def device_count() -> int: ...
def empty_cache() -> None: ...
def synchronize(device: _device_t) -> None: ...
def set_device(device: _device_t) -> None: ...
def get_device_capability(device: Optional[_device_t]=...) -> Tuple[int, int]: ...
def get_device_name(device: Optional[_device_t]=...) -> str: ...
def get_device_properties(device: _device_t) -> _CudaDeviceProperties: ...
def current_device() -> int: ...
def memory_allocated(device: Optional[_device_t]=...) -> int: ...
def max_memory_allocated(device: Optional[_device_t]=...) -> int: ...
def reset_max_memory_allocated(device: Optional[_device_t]=...) -> None: ...
def memory_cached(device: Optional[_device_t]=...) -> int: ...
def max_memory_cached(device: Optional[_device_t]=...) -> int: ...
def reset_max_memory_cached(device: Optional[_device_t]=...) -> None: ...
def cudart() -> ctypes.CDLL: ...
def find_cuda_windows_lib() -> Optional[ctypes.CDLL]: ...
#MODIFIED BY TORCHGPIPE
from .. import ByteTensor
def set_rng_state(new_state: ByteTensor, device: _device_t = ...) -> None: ...
def get_rng_state(device: _device_t = ...) -> ByteTensor: ...
#END
#MODIFIED BY TORCHGPIPE
from typing import Any
class device:
def __init__(self, device: _device_t = ...) -> None: ...
def __enter__(self) -> None: ...
def __exit__(self, *args: Any) -> None: ...
class Stream:
device: _device
def __init__(self, device: _device_t = ..., priority: int = ...) -> None: ...
def synchronize(self) -> None: ...
def wait_stream(self, stream: Stream) -> None: ...
class stream:
def __init__(self, stream: Optional[Stream] = ...) -> None: ...
def __enter__(self) -> None: ...
def __exit__(self, *args: Any) -> None: ...
def current_stream(device: Optional[_device_t]) -> Stream: ...
def default_stream(device: Optional[_device_t]) -> Stream: ...
#END
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#MODIFIED BY TORCHGPIPE
from typing import Iterable, Optional, Tuple
from torch import Tensor
def scatter(tensor: Tensor,
devices: Iterable[int],
chunk_sizes: Optional[Iterable[int]] = None,
dim: int = 0,
) -> Tuple[Tensor, ...]: ...
def gather(tensors: Iterable[Tensor],
dim: int = 0,
destination: Optional[int] = None,
) -> Tensor: ...
#END
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
from typing import Any
from torch import Tensor
def get_rank(group: Any) -> int: ...
def get_world_size(group: Any) -> int: ...
def broadcast(tensor: Tensor, src: Any, group: Any, async_op: Any = False): ...
class group(object):
WORLD: Any
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
from .modules import *
from .parameter import Parameter as Parameter
from .parallel import DataParallel as DataParallel
from . import functional as functional
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
from typing import TypeVar, Union, Tuple
from .. import Tensor
# Create some useful type aliases
# Template for arguments which can be supplied as a tuple, or which can be a scalar which PyTorch will internally
# broadcast to a tuple.
# Comes in several variants: a tuple of unknown size, and fixed-size tuples for 1d through 6d operations.
T = TypeVar('T')
_scalar_or_tuple_any_t = Union[T, Tuple[T, ...]]
_scalar_or_tuple_1_t = Union[T, Tuple[T]]
_scalar_or_tuple_2_t = Union[T, Tuple[T, T]]
_scalar_or_tuple_3_t = Union[T, Tuple[T, T, T]]
_scalar_or_tuple_4_t = Union[T, Tuple[T, T, T, T]]
_scalar_or_tuple_5_t = Union[T, Tuple[T, T, T, T, T]]
_scalar_or_tuple_6_t = Union[T, Tuple[T, T, T, T, T, T]]
# For arguments which represent size parameters (e.g., kernel size, padding)
_size_any_t = _scalar_or_tuple_any_t[int]
_size_1_t = _scalar_or_tuple_1_t[int]
_size_2_t = _scalar_or_tuple_2_t[int]
_size_3_t = _scalar_or_tuple_3_t[int]
_size_4_t = _scalar_or_tuple_4_t[int]
_size_5_t = _scalar_or_tuple_5_t[int]
_size_6_t = _scalar_or_tuple_6_t[int]
# For arguments that represent a ratio to adjust each dimension of an input with (e.g., upsampling parameters)
_ratio_2_t = _scalar_or_tuple_2_t[float]
_ratio_3_t = _scalar_or_tuple_3_t[float]
_ratio_any_t = _scalar_or_tuple_any_t[float]
_tensor_list_t = _scalar_or_tuple_any_t[Tensor]
# For the return value of max pooling operations that may or may not return indices.
# With the proposed 'Literal' feature to Python typing, it might be possible to
# eventually eliminate this.
_maybe_indices_t = _scalar_or_tuple_2_t[Tensor]
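# Editor's note (hypothetical signature, not part of this stub file): these
# aliases are intended for signatures whose arguments accept either a scalar or
# a tuple, e.g.:
#     def avg_pool2d(input: Tensor, kernel_size: _size_2_t, stride: _size_2_t = ...,
#                    padding: _size_2_t = 0) -> Tensor: ...
# so both avg_pool2d(x, 3) and avg_pool2d(x, (3, 2)) type-check.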