Commit 0cd65242 authored by Mandeep Singh Baines

Initial commit

# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
# Copyright 2019 Kakao Brain
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Multithreading in pipeline parallelism."""
from contextlib import contextmanager
from queue import Queue
import sys
from threading import Thread
from types import TracebackType
from typing import TYPE_CHECKING, Callable, Dict, Generator, List, Optional, Tuple, Type, Union, cast
import torch
from .microbatch import Batch
from .stream import AbstractStream, use_device, use_stream
__all__: List[str] = []
ExcInfo = Tuple[Type[BaseException], BaseException, TracebackType]
# Queue is generic only in stubs.
# https://mypy.readthedocs.io/en/latest/common_issues.html#using-classes-that-are-generic-in-stubs-but-not-at-runtime
if TYPE_CHECKING:
InQueue = Queue[Optional["Task"]]
OutQueue = Queue[Tuple[bool, Union[Tuple["Task", Batch], ExcInfo, None]]]
else:
InQueue = Queue
OutQueue = Queue
class Task:
"""A task represents how to compute a micro-batch on a partition.
It consists of two parts: :meth:`compute` and :meth:`finalize`.
:meth:`compute` should be executed in worker threads concurrently.
:meth:`finalize` should be executed after the worker threads have finished
executing :meth:`compute`.
:meth:`compute` benefits from running in worker threads because the user code
it wraps issues several CUDA API calls, and in PyTorch parallel CUDA API
calls are not serialized by the GIL, so more than one CUDA API call can be
in flight at the same time.
"""
def __init__(
self, stream: AbstractStream, *, compute: Callable[[], Batch], finalize: Optional[Callable[[Batch], None]],
) -> None:
self.stream = stream
self._compute = compute
self._finalize = finalize
self._grad_enabled = torch.is_grad_enabled()
def compute(self) -> Batch:
with use_stream(self.stream), torch.set_grad_enabled(self._grad_enabled):
return self._compute()
def finalize(self, batch: Batch) -> None:
if self._finalize is None:
return
with use_stream(self.stream), torch.set_grad_enabled(self._grad_enabled):
self._finalize(batch)
def worker(in_queue: InQueue, out_queue: OutQueue, device: torch.device) -> None:
"""The main loop of a worker thread."""
with use_device(device):
while True:
task = in_queue.get()
if task is None:
break
try:
batch = task.compute()
except Exception:
exc_info = cast(ExcInfo, sys.exc_info())
out_queue.put((False, exc_info))
continue
out_queue.put((True, (task, batch)))
done = (False, None)
out_queue.put(done)
def create_workers(devices: List[torch.device],) -> Tuple[List[InQueue], List[OutQueue]]:
"""Spawns worker threads. A worker thread is bound to a device."""
in_queues: List[InQueue] = []
out_queues: List[OutQueue] = []
# Spawn workers.
workers: Dict[torch.device, Tuple[InQueue, OutQueue]] = {}
def normalize_device(device: torch.device) -> torch.device:
if device.type == "cuda" and device.index is None:
return torch.device("cuda", index=torch.cuda.current_device())
if device.type == "cpu" and device.index is not None:
return torch.device("cpu")
return device
for device in devices:
device = normalize_device(device)
try:
in_queue, out_queue = workers[device]
except KeyError:
in_queue = Queue()
out_queue = Queue()
workers[device] = (in_queue, out_queue)
t = Thread(target=worker, args=(in_queue, out_queue, device), daemon=True,)
t.start()
in_queues.append(in_queue)
out_queues.append(out_queue)
return (in_queues, out_queues)
def join_workers(in_queues: List[InQueue], out_queues: List[OutQueue]) -> None:
# Close workers.
for in_queue in set(in_queues):
in_queue.put(None)
# Join running workers.
running = set(out_queues)
while running:
out_queue = running.pop()
ok, payload = out_queue.get()
done = (False, None)
if (ok, payload) == done:
continue
running.add(out_queue)
@contextmanager
def spawn_workers(devices: List[torch.device],) -> Generator[Tuple[List[InQueue], List[OutQueue]], None, None]:
try:
(in_queues, out_queues) = create_workers(devices)
yield (in_queues, out_queues)
finally:
join_workers(in_queues, out_queues)
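# ---------------------------------------------------------------------------
# Editor's sketch (not part of this commit): one way the worker machinery above
# could be driven for a single micro-batch. The import paths and the CPUStream
# name are assumptions carried over from the upstream torchgpipe code this file
# is adapted from; only Task, spawn_workers, and Batch appear in this commit.
import torch

from fairscale.nn.pipe.microbatch import Batch   # assumed module path
from fairscale.nn.pipe.stream import CPUStream   # assumed name of the CPU stream singleton
from fairscale.nn.pipe.worker import Task, spawn_workers


def run_one_microbatch() -> Batch:
    device = torch.device("cpu")
    # Spawn one worker thread bound to the CPU device and hand it a single Task.
    with spawn_workers([device]) as (in_queues, out_queues):
        task = Task(CPUStream, compute=lambda: Batch(torch.zeros(1)), finalize=None)
        in_queues[0].put(task)
        ok, payload = out_queues[0].get()
        if not ok:
            # The worker caught an exception inside compute(); re-raise it here.
            _, exc_value, tb = payload
            raise exc_value.with_traceback(tb)
        finished_task, batch = payload
        finished_task.finalize(batch)
        return batch
# ---------------------------------------------------------------------------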
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
"""
:mod:`fairscale.optim` is a package implementing various torch optimization algorithms.
"""
from .oss import OSS
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import copy
from typing import TYPE_CHECKING, Any, Callable, List, Optional, Type
import torch.distributed as dist
from torch.optim import SGD, Optimizer
if TYPE_CHECKING:
from torch.optim.optimizer import _params_t
else:
_params_t = Any
class OSS(Optimizer):
"""Wraps an arbitrary :class:`optim.Optimizer <torch.optim.Optimizer>`
optimizer and shards its state as described by ZeRO_.
::
opt = OSS(params, optim=torch.optim.Adam, lr=0.01)
.. _ZeRO: https://arxiv.org/abs/1910.02054
Each rank constructs the wrapped optimizer only for its own shard of the
parameters, so the optimizer state is partitioned across the process
group. After the local :meth:`step`, every rank broadcasts the parameters
it owns so that all ranks end up with the same updated model.
Args:
params (list of tensors):
parameters to be optimized
Keyword Args:
optim (torch.optim.Optimizer):
optimizer to shard (default: SGD)
group (group):
torch.distributed group (default: group.WORLD)
"""
optim: Optimizer
in_super_constructor: bool
def __init__(self, params: _params_t, optim: Type[Optimizer] = SGD, group: Any = dist.group.WORLD, **defaults: Any):
self.in_super_constructor = True
super().__init__(params, defaults)
self.in_super_constructor = False
self.group = group
self.rank = dist.get_rank(group)
param_groups = self.partition_parameters()
self.optim = optim(param_groups[self.rank], **defaults)
def partition_parameters(self) -> List[List[dict]]:
"""Partitions parameters across distributed ranks.
Returns a list of param_groups (each itself a list of dicts), where
element 0 holds the param_groups assigned to rank 0, element 1 to rank 1,
and so on. Parameters are assigned greedily to the rank with the smallest
total size so far. All ranks are needed for the broadcast inside step().
"""
world_size = dist.get_world_size(self.group)
param_groups: List[List] = [list() for _ in range(world_size)]
sizes = [0] * world_size
for param_group in self.param_groups:
param_lists: List[List] = [list() for _ in range(world_size)]
for param in param_group["params"]:
# Add this param to rank with smallest size.
rank = sizes.index(min(sizes))
param_lists[rank].append(param)
sizes[rank] += param.numel()
for rank, params in enumerate(param_lists):
if len(params):
pg = copy.copy(param_group)
pg["params"] = params
param_groups[rank].append(pg)
return param_groups
def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]:
loss = self.optim.step(closure=closure)
for rank, param_groups in enumerate(self.partition_parameters()):
for param_group in param_groups:
for param in param_group["params"]:
dist.broadcast(param, rank, group=self.group)
return loss
def state_dict(self) -> dict:
""" Gets this rank's state_dict. """
return self.optim.state_dict()
def load_state_dict(self, state_dict: dict) -> None:
""" Loads this rank's state_dict. """
self.optim.load_state_dict(state_dict)
def add_param_group(self, param_group: dict) -> None:
super().add_param_group(param_group)
if not self.in_super_constructor:
param_groups = self.partition_parameters()[self.rank]
if len(param_groups) == len(self.optim.param_groups) + 1:
self.optim.add_param_group(param_groups[-1])
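# ---------------------------------------------------------------------------
# Editor's sketch (not part of this commit): minimal end-to-end use of OSS.
# The package path fairscale.optim follows the __init__ above; process-group
# setup details (backend, init method) are assumptions for illustration only.
# partition_parameters() assigns each parameter greedily to the rank with the
# smallest running total, e.g. four parameters of sizes [10, 8, 6, 4] on two
# ranks end up as rank 0 -> {10, 4} and rank 1 -> {8, 6}.
import torch
import torch.distributed as dist

from fairscale.optim import OSS


def train(model: torch.nn.Module, data: torch.Tensor) -> None:
    # Assumes dist.init_process_group(...) has already been called on every rank.
    optimizer = OSS(model.parameters(), optim=torch.optim.SGD, lr=0.1)
    optimizer.zero_grad()
    loss = model(data).sum()
    loss.backward()
    # Local SGD step on this rank's shard, then a broadcast of every shard so
    # all ranks hold the same updated parameters.
    optimizer.step()
# ---------------------------------------------------------------------------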
[build-system]
requires = [
"setuptools >= 40.6.2",
"wheel >= 0.30.0"
]
build-backend = "setuptools.build_meta"
[tool.black]
line-length = 120
exclude = '''
/(
\.git
| \.mypy_cache
| \.pytest_cache
| build
| stubs
)/
'''
-r requirements.txt
pre-commit
black == 19.10b0
flake8 == 3.7.9
isort == 4.3.21
mypy == 0.770
pytest == 5.4.1
torchtext == 0.6.0
torch == 1.4.0
# NOTE(msb) not a dependency but needed by torch == 1.4.0
numpy == 1.17.4
# -----------------------------------------------------------------------------
# pytest
# -----------------------------------------------------------------------------
[tool:pytest]
testpaths = tests
addopts = --verbose
junit_family = xunit2
[aliases]
test = pytest
# -----------------------------------------------------------------------------
# coverage
# -----------------------------------------------------------------------------
[coverage:report]
# Coverage couldn't detect backward functions because they are called by C++.
# Append "# pragma: no cover" to the definition lines to ignore them.
# https://www.janfreyberg.com/blog/2019-04-01-testing-pytorch-functions/
exclude_lines = pragma: no cover
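# Editor's note: a hypothetical illustration of the pragma above, where a
# backward() that is only invoked from C++ is excluded from the report:
#     @staticmethod
#     def backward(ctx, grad_output):  # pragma: no cover
#         return grad_output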
# -----------------------------------------------------------------------------
# flake8
# -----------------------------------------------------------------------------
[flake8]
select = B,C,E,F,P,T4,W,B9
max-line-length = 120
# C408 ignored because we like the dict keyword argument syntax
# E501 is not flexible enough, we're using B950 instead
ignore =
E203,E305,E402,E501,E721,E741,F403,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,
per-file-ignores = __init__.py: F401
exclude = build,*.pyi,.git
# -----------------------------------------------------------------------------
# isort
# -----------------------------------------------------------------------------
[isort]
line_length = 120
multi_line_output=3
include_trailing_comma=True
force_grid_wrap=0
use_parentheses=True
skip_glob = build/*,stubs/*
# Don't split "import" and "from".
force_sort_within_sections = true
known_third_party = models,pytest,setuptools,torch,torchtext
# -----------------------------------------------------------------------------
# mypy
# -----------------------------------------------------------------------------
# Docs for mypy config: https://mypy.readthedocs.io/en/latest/config_file.html
[mypy]
mypy_path = ./stubs/
follow_imports = normal
# This project must be strictly typed.
[mypy-fairscale.*]
check_untyped_defs = true
disallow_untyped_defs = true
disallow_untyped_calls = true
disallow_untyped_decorators = true
disallow_incomplete_defs = true
warn_unused_ignores = true
# Ignore missing imports from untyped third-party libraries.
[mypy-torch.*,torchvision.*,setuptools.*,pytest.*]
ignore_missing_imports = true
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
import setuptools
def fetch_requirements():
with open("requirements.txt") as f:
reqs = f.read().strip().split("\n")
return reqs
if __name__ == "__main__":
setuptools.setup(
name="fairscale",
description="fairscale: Utility library for large-scale and high-performance training.",
install_requires=fetch_requirements(),
include_package_data=True,
packages=setuptools.find_packages(exclude=("tests", "tests.*")),
python_requires=">=3.6",
author="Facebook AI Research",
author_email="todo@fb.com",
classifiers=[
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"License :: OSI Approved :: BSD License",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Operating System :: OS Independent",
],
)
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
from typing import Any, Callable, Union, Tuple, Sequence, Optional
from .. import Tensor
from .grad_mode import no_grad as no_grad, enable_grad as enable_grad, \
set_grad_enabled as set_grad_enabled
from .profiler import record_function
# TODO make Variable and Function more precise
class Variable:
...
class Function:
@staticmethod
def forward(ctx: Any, *args: Any, **kwargs: Any) -> Any: ...
@staticmethod
def backward(ctx: Any, *grad_outputs: Any) -> Any: ...
#MODIFIED BY TORCHGPIPE
@staticmethod
def apply(*args: Any, **kwargs: Any) -> Any: ...
#END
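# Editor's note: `apply` is added to this stub (block above) presumably because
# user code invokes custom autograd functions through it, e.g.:
#     class Scale(Function):
#         @staticmethod
#         def forward(ctx, x, alpha):
#             ctx.alpha = alpha
#             return x * alpha
#         @staticmethod
#         def backward(ctx, grad):
#             return grad * ctx.alpha, None
#     y = Scale.apply(torch.ones(3), 2.0)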
class NestedIOFunction(Function):
# The 'type: ignore' statements are needed here because these functions are declared as '@staticmethod' in the
# superclass (Function) but are instance methods here, which mypy reports as incompatible.
def backward(self, *gradients: Any) -> Any: ... # type: ignore
def forward(self, *args: Any) -> tuple: ... # type: ignore
def save_for_backward(self, *args: Any) -> None: ...
def mark_dirty(self, *args: Any, **kwargs: Any) -> None: ...
def mark_non_differentiable(self, *args: Any, **kwargs: Any) -> None: ...
def forward_extended(self, *input: Any) -> None: ...
def backward_extended(self, *grad_output: Any) -> None: ...
# 'func' accepts a vararg of tensors, which isn't expressible in the type system at the moment.
# If https://mypy.readthedocs.io/en/latest/additional_features.html?highlight=callable#extended-callable-types is accepted,
# the '...' first argument of Callable can be replaced with VarArg(Tensor).
# For now, we permit any input.
def gradcheck(func: Callable[..., Union[Tensor, Tuple[Tensor, ...]]], inputs: Union[Tensor, Tuple[Tensor, ...]], eps: float=..., atol: float=..., rtol: float=..., raise_exception: bool=..., check_sparse_nnz: bool=...) -> bool: ...
def gradgradcheck(func: Callable[..., Union[Tensor, Tuple[Tensor, ...]]], inputs: Union[Tensor, Tuple[Tensor, ...]], eps: float=..., atol: float=..., rtol: float=..., gen_non_contig_grad_outputs: bool=..., raise_exception: bool=...) -> bool: ...
class detect_anomaly:
def __enter__(self) -> None: ...
def __exit__(self, *args: Any) -> bool: ...
class set_detect_anomaly:
def __init__(self, mode: bool) -> None: ...
def __enter__(self) -> None: ...
def __exit__(self, *args: Any) -> bool: ...
_TensorOrTensors = Union[Tensor, Sequence[Tensor]]
def backward(tensors: _TensorOrTensors, grad_tensors: Optional[_TensorOrTensors]=..., retain_graph: Optional[bool]=..., create_graph: bool=...) -> None: ...
def grad(outputs: _TensorOrTensors, inputs: _TensorOrTensors, grad_outputs: Optional[_TensorOrTensors]=..., retain_graph: Optional[bool]=..., create_graph: bool=..., only_inputs: bool=..., allow_unused: bool=...) -> Tuple[Tensor, ...]: ...
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
from typing import Any, Callable, Optional, TypeVar
# Used for annotating the decorator usage of 'no_grad' and 'enable_grad'.
# See https://mypy.readthedocs.io/en/latest/generics.html#declaring-decorators
FuncType = Callable[..., Any]
T = TypeVar('T', bound=FuncType)
class no_grad:
def __enter__(self) -> None: ...
def __exit__(self, *args: Any) -> Optional[bool]: ...
def __call__(self, func: T) -> T: ...
class enable_grad:
def __enter__(self) -> None: ...
def __exit__(self, *args: Any) -> Optional[bool]: ...
def __call__(self, func: T) -> T: ...
class set_grad_enabled:
def __init__(self, mode: bool) -> None: ...
def __enter__(self) -> None: ...
def __exit__(self, *args: Any) -> Optional[bool]: ...
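# Editor's note: the __call__ overloads above are what let these context managers
# also type-check in decorator form, e.g.:
#     @no_grad()
#     def evaluate(model, batch): ...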
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
from typing import Any, ContextManager, Optional
class record_function(ContextManager[None]):
def __init__(self, name: str) -> None: ...
def __enter__(self) -> None: ...
def __exit__(self, *args: Any) -> Optional[bool]: ...
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#MODIFIED BY TORCHGPIPE
from . import cudnn
#END
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#MODIFIED BY TORCHGPIPE
def version() -> int: ...
#END
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
from typing import Optional, Tuple, Union
import ctypes
from .. import device as _device
def is_available() -> bool: ...
def init() -> None: ...
class cudaStatus:
SUCCESS: int
ERROR_NOT_READY: int
class CudaError:
def __init__(self, code: int) -> None: ...
class _CudaDeviceProperties:
name: str
major: int
minor: int
multi_processor_count: int
total_memory: int
is_integrated: int
is_multi_gpu_board: int
_device_t = Union[_device, int]
def check_error(res: int) -> None: ...
def device_count() -> int: ...
def empty_cache() -> None: ...
def synchronize(device: _device_t) -> None: ...
def set_device(device: _device_t) -> None: ...
def get_device_capability(device: Optional[_device_t]=...) -> Tuple[int, int]: ...
def get_device_name(device: Optional[_device_t]=...) -> str: ...
def get_device_properties(device: _device_t) -> _CudaDeviceProperties: ...
def current_device() -> int: ...
def memory_allocated(device: Optional[_device_t]=...) -> int: ...
def max_memory_allocated(device: Optional[_device_t]=...) -> int: ...
def reset_max_memory_allocated(device: Optional[_device_t]=...) -> None: ...
def memory_cached(device: Optional[_device_t]=...) -> int: ...
def max_memory_cached(device: Optional[_device_t]=...) -> int: ...
def reset_max_memory_cached(device: Optional[_device_t]=...) -> None: ...
def cudart() -> ctypes.CDLL: ...
def find_cuda_windows_lib() -> Optional[ctypes.CDLL]: ...
#MODIFIED BY TORCHGPIPE
from .. import ByteTensor
def set_rng_state(new_state: ByteTensor, device: _device_t = ...) -> None: ...
def get_rng_state(device: _device_t = ...) -> ByteTensor: ...
#END
#MODIFIED BY TORCHGPIPE
from typing import Any
class device:
def __init__(self, device: _device_t = ...) -> None: ...
def __enter__(self) -> None: ...
def __exit__(self, *args: Any) -> None: ...
class Stream:
device: _device
def __init__(self, device: _device_t = ..., priority: int = ...) -> None: ...
def synchronize(self) -> None: ...
def wait_stream(self, stream: Stream) -> None: ...
class stream:
def __init__(self, stream: Optional[Stream] = ...) -> None: ...
def __enter__(self) -> None: ...
def __exit__(self, *args: Any) -> None: ...
def current_stream(device: Optional[_device_t]) -> Stream: ...
def default_stream(device: Optional[_device_t]) -> Stream: ...
#END
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#MODIFIED BY TORCHGPIPE
from typing import Iterable, Optional, Tuple
from torch import Tensor
def scatter(tensor: Tensor,
devices: Iterable[int],
chunk_sizes: Optional[Iterable[int]] = None,
dim: int = 0,
) -> Tuple[Tensor, ...]: ...
def gather(tensors: Iterable[Tensor],
dim: int = 0,
destination: Optional[int] = None,
) -> Tensor: ...
#END
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
from typing import Any
from torch import Tensor
def get_rank(group: Any) -> int: ...
def get_world_size(group: Any) -> int: ...
def broadcast(tensor: Tensor, src: Any, group: Any, async_op: Any = False): ...
class group(object):
WORLD: Any
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
from .modules import *
from .parameter import Parameter as Parameter
from .parallel import DataParallel as DataParallel
from . import functional as functional
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
from typing import TypeVar, Union, Tuple
from .. import Tensor
# Create some useful type aliases
# Template for arguments which can be supplied as a tuple, or which can be a scalar which PyTorch will internally
# broadcast to a tuple.
# Comes in several variants: a tuple of unknown size, and fixed-size tuples for 1d through 6d operations.
T = TypeVar('T')
_scalar_or_tuple_any_t = Union[T, Tuple[T, ...]]
_scalar_or_tuple_1_t = Union[T, Tuple[T]]
_scalar_or_tuple_2_t = Union[T, Tuple[T, T]]
_scalar_or_tuple_3_t = Union[T, Tuple[T, T, T]]
_scalar_or_tuple_4_t = Union[T, Tuple[T, T, T, T]]
_scalar_or_tuple_5_t = Union[T, Tuple[T, T, T, T, T]]
_scalar_or_tuple_6_t = Union[T, Tuple[T, T, T, T, T, T]]
# For arguments which represent size parameters (e.g., kernel size, padding)
_size_any_t = _scalar_or_tuple_any_t[int]
_size_1_t = _scalar_or_tuple_1_t[int]
_size_2_t = _scalar_or_tuple_2_t[int]
_size_3_t = _scalar_or_tuple_3_t[int]
_size_4_t = _scalar_or_tuple_4_t[int]
_size_5_t = _scalar_or_tuple_5_t[int]
_size_6_t = _scalar_or_tuple_6_t[int]
# For arguments that represent a ratio to adjust each dimension of an input with (e.g., upsampling parameters)
_ratio_2_t = _scalar_or_tuple_2_t[float]
_ratio_3_t = _scalar_or_tuple_3_t[float]
_ratio_any_t = _scalar_or_tuple_any_t[float]
_tensor_list_t = _scalar_or_tuple_any_t[Tensor]
# For the return value of max pooling operations that may or may not return indices.
# With the proposed 'Literal' feature to Python typing, it might be possible to
# eventually eliminate this.
_maybe_indices_t = _scalar_or_tuple_2_t[Tensor]
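# Editor's note (hypothetical signature, not part of this stub file): these
# aliases are intended for signatures whose arguments accept either a scalar or
# a tuple, e.g.:
#     def avg_pool2d(input: Tensor, kernel_size: _size_2_t, stride: _size_2_t = ...,
#                    padding: _size_2_t = 0) -> Tensor: ...
# so both avg_pool2d(x, 3) and avg_pool2d(x, (3, 2)) type-check.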