Commit 67ea635f authored by aiss's avatar aiss

push dsv0.8.2 version

parent 1b2721ad
* @jeffra @samyam @tjruwase @ShadenSmith @conglongli @awan-10 @arashashari @cli99 @eltonzheng @minjiaz @RezaYazdaniAminabadi @niumanar
# This file is used to subscribe for notifications for PRs
# related to specific file paths, does not necessarily mean
# approval is required from these people before merging.
#
# Learn more about CODEOWNERS syntax here:
# https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners
# top-level repo folders
/.github/ @jeffra @mrwyattii
/azure/ @jeffra @awan-10
/benchmarks/ @jeffra @awan-10 @mrwyattii @molly-smith
/bin/ @jeffra
/csrc/ @RezaYazdaniAminabadi @awan-10 @jeffra @cmikeh2 @arashb
/deepspeed/ @jeffra
/docker/ @jeffra @awan-10
/docs/ @jeffra @mrwyattii
/examples/ @jeffra @awan-10 @mrwyattii
/op_builder/ @jeffra @RezaYazdaniAminabadi @cmikeh2
/release/ @jeffra @mrwyattii
/requirements/ @jeffra @mrwyattii
/scripts/ @jeffra @awan-10
/tests/ @jeffra @mrwyattii @tjruwase
# deepspeed
/deepspeed/autotuning/ @cli99
/deepspeed/checkpoint/ @tjruwase
/deepspeed/comm/ @awan-10
/deepspeed/compression/ @yaozhewei @minjiaz @xiaoxiawu-microsoft @conglongli
/deepspeed/elasticity/ @jeffra @awan-10
/deepspeed/launcher/ @jeffra @awan-10
/deepspeed/module_inject/ @RezaYazdaniAminabadi @jeffra @mrwyattii @awan-10 @cmikeh2 @arashb
/deepspeed/moe/ @awan-10
/deepspeed/monitor/ @awan-10 @jeffra
/deepspeed/nebula/ @tjruwase @jeffra
/deepspeed/ops/ @RezaYazdaniAminabadi @jeffra @mrwyattii @awan-10 @cmikeh2 @arashb
/deepspeed/pipe/ @ShadenSmith @duli2012
/deepspeed/profiling/ @cli99
/deepspeed/utils/ @jeffra @tjruwase @awan-10
# inference
/deepspeed/inference/ @RezaYazdaniAminabadi @jeffra @mrwyattii @awan-10 @cmikeh2 @arashb
/deepspeed/model_implementations/ @RezaYazdaniAminabadi @jeffra @mrwyattii @awan-10 @cmikeh2 @arashb
# training
/deepspeed/runtime/ @jeffra @tjruwase
/deepspeed/runtime/activation_checkpointing/ @jeffra @tjruwase
/deepspeed/runtime/checkpoint_engine/ @tjruwase @jeffra
/deepspeed/runtime/comm/ @awan-10
/deepspeed/runtime/compression/ @awan-10 @conglongli
/deepspeed/runtime/data_pipeline/ @conglongli
/deepspeed/runtime/fp16/ @jeffra @tjruwase
/deepspeed/runtime/fp16/onebit/ @conglongli @awan-10
/deepspeed/runtime/pipe/ @ShadenSmith @duli2012
/deepspeed/runtime/swap_tensor/ @tjruwase @mrwyattii
/deepspeed/runtime/zero/ @jeffra @tjruwase @samyam @mrwyattii
@@ -2,3 +2,6 @@ include *.txt README.md
recursive-include requirements *.txt
recursive-include deepspeed *.cpp *.h *.cu *.hip *.tr *.cuh *.cc *.json
recursive-include csrc *.cpp *.h *.cu *.tr *.cuh *.cc
recursive-include op_builder *.py
recursive-include benchmarks *.py
recursive-include accelerator *.py
@@ -6,3 +6,4 @@ recursive-include deepspeed *.tr
recursive-exclude deepspeed/ops/csrc *.cpp *.h *.cu *.cuh *.cc
prune csrc
prune op_builder
prune accelerator
'''Copyright The Microsoft DeepSpeed Team'''
from .abstract_accelerator import DeepSpeedAccelerator
from .real_accelerator import get_accelerator, set_accelerator
'''Copyright The Microsoft DeepSpeed Team'''
import abc
from abc import ABC
class DeepSpeedAccelerator(ABC):
def __init__(self):
self._name = None
self._communication_backend_name = None
# Device APIs
@abc.abstractmethod
def device_name(self, device_index):
...
@abc.abstractmethod
def device(self, device_index):
...
@abc.abstractmethod
def set_device(self, device_index):
...
@abc.abstractmethod
def current_device(self):
...
@abc.abstractmethod
def current_device_name(self):
...
@abc.abstractmethod
def device_count(self):
...
@abc.abstractmethod
def synchronize(self, device_index=None):
...
# RNG APIs
@abc.abstractmethod
def random(self):
...
@abc.abstractmethod
def set_rng_state(self, new_state, device_index=None):
...
@abc.abstractmethod
def get_rng_state(self, device_index=None):
...
@abc.abstractmethod
def manual_seed(self, seed):
...
@abc.abstractmethod
def manual_seed_all(self, seed):
...
@abc.abstractmethod
def initial_seed(self, seed):
...
@abc.abstractmethod
def default_generator(self, device_index):
...
# Streams/Events
@property
@abc.abstractmethod
def Stream(self):
...
@abc.abstractmethod
def stream(self, stream):
...
@abc.abstractmethod
def current_stream(self, device_index=None):
...
@abc.abstractmethod
def default_stream(self, device_index=None):
...
@property
@abc.abstractmethod
def Event(self):
...
# Memory management
@abc.abstractmethod
def empty_cache(self):
...
@abc.abstractmethod
def memory_allocated(self, device_index=None):
...
@abc.abstractmethod
def max_memory_allocated(self, device_index=None):
...
@abc.abstractmethod
def reset_max_memory_allocated(self, device_index=None):
...
@abc.abstractmethod
def memory_cached(self, device_index=None):
...
@abc.abstractmethod
def max_memory_cached(self, device_index=None):
...
@abc.abstractmethod
def reset_max_memory_cached(self, device_index=None):
...
@abc.abstractmethod
def memory_stats(self, device_index=None):
...
@abc.abstractmethod
def reset_peak_memory_stats(self, device_index=None):
...
@abc.abstractmethod
def memory_reserved(self, device_index=None):
...
@abc.abstractmethod
def max_memory_reserved(self, device_index=None):
...
@abc.abstractmethod
def total_memory(self, device_index=None):
...
# Data types
@abc.abstractmethod
def is_bf16_supported(self):
...
@abc.abstractmethod
def is_fp16_supported(self):
...
# Misc
@abc.abstractmethod
def amp(self):
...
@abc.abstractmethod
def is_available(self):
...
@abc.abstractmethod
def range_push(self, msg):
...
@abc.abstractmethod
def range_pop(self):
...
@abc.abstractmethod
def lazy_call(self, callback):
...
@abc.abstractmethod
def communication_backend_name(self):
...
# Tensor operations
@property
@abc.abstractmethod
def BFloat16Tensor(self):
...
@property
@abc.abstractmethod
def ByteTensor(self):
...
@property
@abc.abstractmethod
def DoubleTensor(self):
...
@property
@abc.abstractmethod
def FloatTensor(self):
...
@property
@abc.abstractmethod
def HalfTensor(self):
...
@property
@abc.abstractmethod
def IntTensor(self):
...
@property
@abc.abstractmethod
def LongTensor(self):
...
@abc.abstractmethod
def pin_memory(self, tensor):
...
@abc.abstractmethod
def on_accelerator(self, tensor):
...
@abc.abstractmethod
def op_builder_dir(self):
...
# create an instance of op builder, specified by class_name
@abc.abstractmethod
def create_op_builder(self, class_name):
...
# return an op builder class, specified by class_name
@abc.abstractmethod
def get_op_builder(self, class_name):
...
@abc.abstractmethod
def build_extension(self):
...
'''Copyright The Microsoft DeepSpeed Team'''
import os
import pkgutil
import importlib
from .abstract_accelerator import DeepSpeedAccelerator
# torch may not be installed during the setup stage; passing on a missing
# torch import still allows the op builder related APIs to be used.
try:
import torch.cuda
except ImportError:
pass
class CUDA_Accelerator(DeepSpeedAccelerator):
def __init__(self):
self._name = 'cuda'
self._communication_backend_name = 'nccl'
# begin initialize for create_op_builder()
# put all valid class name <--> class type mapping into class_dict
op_builder_dir = self.op_builder_dir()
op_builder_module = importlib.import_module(op_builder_dir)
for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(op_builder_module.__file__)]):
# avoid self references
if module_name != 'all_ops' and module_name != 'builder':
module = importlib.import_module("{}.{}".format(
op_builder_dir,
module_name))
for member_name in module.__dir__():
if member_name.endswith(
'Builder'
) and member_name != "OpBuilder" and member_name != "CUDAOpBuilder" and member_name != "TorchCPUOpBuilder": # avoid abstract classes
if not member_name in self.class_dict:
self.class_dict[member_name] = getattr(module, member_name)
# end initialize for create_op_builder()
# Device APIs
def device_name(self, device_index=None):
if device_index is None:
return 'cuda'
return 'cuda:{}'.format(device_index)
def device(self, device_index=None):
return torch.cuda.device(device_index)
def set_device(self, device_index):
torch.cuda.set_device(device_index)
def current_device(self):
return torch.cuda.current_device()
def current_device_name(self):
return 'cuda:{}'.format(torch.cuda.current_device())
def device_count(self):
return torch.cuda.device_count()
def synchronize(self, device_index=None):
return torch.cuda.synchronize(device_index)
# RNG APIs
def random(self):
return torch.random
def set_rng_state(self, new_state, device_index=None):
if device_index is None:
return torch.cuda.set_rng_state(new_state)
return torch.cuda.set_rng_state(new_state, device_index)
def get_rng_state(self, device_index=None):
if device_index is None:
return torch.cuda.get_rng_state()
return torch.cuda.get_rng_state(device_index)
def manual_seed(self, seed):
return torch.cuda.manual_seed(seed)
def manual_seed_all(self, seed):
return torch.cuda.manual_seed_all(seed)
def initial_seed(self, seed):
return torch.cuda.initial_seed(seed)
def default_generator(self, device_index):
return torch.cuda.default_generators[device_index]
# Streams/Events
@property
def Stream(self):
return torch.cuda.Stream
def stream(self, stream):
return torch.cuda.stream(stream)
def current_stream(self, device_index=None):
return torch.cuda.current_stream(device_index)
def default_stream(self, device_index=None):
return torch.cuda.default_stream(device_index)
@property
def Event(self):
return torch.cuda.Event
# Memory management
def empty_cache(self):
return torch.cuda.empty_cache()
def memory_allocated(self, device_index=None):
return torch.cuda.memory_allocated(device_index)
def max_memory_allocated(self, device_index=None):
return torch.cuda.max_memory_allocated(device_index)
def reset_max_memory_allocated(self, device_index=None):
return torch.cuda.reset_max_memory_allocated(device_index)
def memory_cached(self, device_index=None):
return torch.cuda.memory_cached(device_index)
def max_memory_cached(self, device_index=None):
return torch.cuda.max_memory_cached(device_index)
def reset_max_memory_cached(self, device_index=None):
return torch.cuda.reset_max_memory_cached(device_index)
def memory_stats(self, device_index=None):
if hasattr(torch.cuda, 'memory_stats'):
return torch.cuda.memory_stats(device_index)
def reset_peak_memory_stats(self, device_index=None):
if hasattr(torch.cuda, 'reset_peak_memory_stats'):
return torch.cuda.reset_peak_memory_stats(device_index)
def memory_reserved(self, device_index=None):
if hasattr(torch.cuda, 'memory_reserved'):
return torch.cuda.memory_reserved(device_index)
def max_memory_reserved(self, device_index=None):
if hasattr(torch.cuda, 'max_memory_reserved'):
return torch.cuda.max_memory_reserved(device_index)
def total_memory(self, device_index=None):
return torch.cuda.get_device_properties(device_index).total_memory
# Data types
def is_bf16_supported(self):
return torch.cuda.is_bf16_supported()
def is_fp16_supported(self):
major, _ = torch.cuda.get_device_capability()
if major >= 7:
return True
else:
return False
# Misc
def amp(self):
if hasattr(torch.cuda, 'amp'):
return torch.cuda.amp
return None
def is_available(self):
return torch.cuda.is_available()
def range_push(self, msg):
if hasattr(torch.cuda.nvtx, 'range_push'):
return torch.cuda.nvtx.range_push(msg)
def range_pop(self):
if hasattr(torch.cuda.nvtx, 'range_pop'):
return torch.cuda.nvtx.range_pop()
def lazy_call(self, callback):
return torch.cuda._lazy_call(callback)
def communication_backend_name(self):
return self._communication_backend_name
# Tensor operations
@property
def BFloat16Tensor(self):
return torch.cuda.BFloat16Tensor
@property
def ByteTensor(self):
return torch.cuda.ByteTensor
@property
def DoubleTensor(self):
return torch.cuda.DoubleTensor
@property
def FloatTensor(self):
return torch.cuda.FloatTensor
@property
def HalfTensor(self):
return torch.cuda.HalfTensor
@property
def IntTensor(self):
return torch.cuda.IntTensor
@property
def LongTensor(self):
return torch.cuda.LongTensor
def pin_memory(self, tensor):
return tensor.pin_memory()
def on_accelerator(self, tensor):
device_str = str(tensor.device)
if device_str.startswith('cuda:'):
return True
else:
return False
def op_builder_dir(self):
try:
# during installation time op_builder is visible, otherwise return deepspeed.ops.op_builder
import op_builder # noqa: F401
return "op_builder"
except ImportError:
return "deepspeed.ops.op_builder"
# dict that holds class name <--> class type mapping i.e.
# 'AsyncIOBuilder': <class 'op_builder.async_io.AsyncIOBuilder'>
# this dict will be filled at init stage
class_dict = {}
# create an instance of op builder and return, name specified by class_name
def create_op_builder(self, class_name):
if class_name in self.class_dict:
return self.class_dict[class_name]()
else:
return None
# return an op builder class, name specified by class_name
def get_op_builder(self, class_name):
if class_name in self.class_dict:
return self.class_dict[class_name]
else:
return None
def build_extension(self):
from torch.utils.cpp_extension import BuildExtension
return BuildExtension
'''Copyright The Microsoft DeepSpeed Team'''
try:
from accelerator.abstract_accelerator import DeepSpeedAccelerator as dsa1
except ImportError as e:
dsa1 = None
try:
from deepspeed.accelerator.abstract_accelerator import DeepSpeedAccelerator as dsa2
except ImportError as e:
dsa2 = None
ds_accelerator = None
def _validate_accelerator(accel_obj):
# abstract_accelerator is importable under two paths: accelerator.abstract_accelerator
# at build time and deepspeed.accelerator.abstract_accelerator at run time. An
# extension may subclass DeepSpeedAccelerator from either path, so accel_obj is
# checked against both base classes; an instance of either is considered a
# conforming accelerator object.
if not ((dsa1 is not None and isinstance(accel_obj, dsa1)) or
(dsa2 is not None and isinstance(accel_obj, dsa2))):
raise AssertionError(
f'{accel_obj.__class__.__name__} accelerator is not subclass of DeepSpeedAccelerator'
)
# TODO: turn off is_available test since this breaks tests
#assert accel_obj.is_available(), \
# f'{accel_obj.__class__.__name__} accelerator fails is_available() test'
def get_accelerator():
global ds_accelerator
if ds_accelerator is None:
try:
from intel_extension_for_deepspeed import XPU_Accelerator
except ImportError as e:
pass
else:
ds_accelerator = XPU_Accelerator()
_validate_accelerator(ds_accelerator)
return ds_accelerator
from .cuda_accelerator import CUDA_Accelerator
ds_accelerator = CUDA_Accelerator()
_validate_accelerator(ds_accelerator)
return ds_accelerator
def set_accelerator(accel_obj):
global ds_accelerator
_validate_accelerator(accel_obj)
ds_accelerator = accel_obj
'''
-----------[code] test_get.py -----------
from deepspeed.accelerator import get_accelerator
my_accelerator = get_accelerator()
print(f'{my_accelerator._name=}')
print(f'{my_accelerator._communication_backend_name=}')
print(f'{my_accelerator.HalfTensor().device=}')
print(f'{my_accelerator.total_memory()=}')
-----------[code] test_get.py -----------
---[output] python test_get.py---------
my_accelerator._name='cuda'
my_accelerator._communication_backend_name='nccl'
my_accelerator.HalfTensor().device=device(type='cuda', index=0)
my_accelerator.total_memory()=34089730048
---[output] python test_get.py---------
**************************************************************************
-----------[code] test_set.py -----------
from deepspeed.accelerator.cuda_accelerator import CUDA_Accelerator
cu_accel = CUDA_Accelerator()
print(f'{id(cu_accel)=}')
from deepspeed.accelerator import set_accelerator, get_accelerator
set_accelerator(cu_accel)
my_accelerator = get_accelerator()
print(f'{id(my_accelerator)=}')
print(f'{my_accelerator._name=}')
print(f'{my_accelerator._communication_backend_name=}')
print(f'{my_accelerator.HalfTensor().device=}')
print(f'{my_accelerator.total_memory()=}')
-----------[code] test_set.py -----------
---[output] python test_set.py---------
id(cu_accel)=139648165478304
id(my_accelerator)=139648165478304
my_accelerator._name='cuda'
my_accelerator._communication_backend_name='nccl'
my_accelerator.HalfTensor().device=device(type='cuda', index=0)
my_accelerator.total_memory()=34089730048
---[output] python test_set.py---------
'''
# Getting Started with DeepSpeed on Azure
Please see our [Azure tutorial](https://www.deepspeed.ai/tutorials/azure/) to get started with DeepSpeed on Azure.
The recommended and simplest way to try DeepSpeed on Azure is through [AzureML](https://azure.microsoft.com/en-us/services/machine-learning/); the tutorial above walks through the details.
'''Copyright The Microsoft DeepSpeed Team'''
# Running Communication Benchmarks
To run benchmarks, there are two options:
1. Run a single communication operation:
For example, run with a single large message size:
<pre>
deepspeed all_reduce.py
</pre>
Scan across message sizes:
<pre>
deepspeed all_reduce.py --scan
</pre>
2. Run all available communication benchmarks:
<pre>
deepspeed run_all.py
</pre>
Like the individual benchmarks, `run_all.py` supports scanning arguments for the max message size, bw-unit, etc. Simply pass the desired arguments to `run_all.py` and they'll be propagated to each comm op.
<pre>
usage: ds_bench [-h] [--local_rank LOCAL_RANK] [--trials TRIALS] [--warmups WARMUPS] [--maxsize MAXSIZE] [--async-op] [--bw-unit {Gbps,GBps}] [--backend {nccl}] [--dist {deepspeed,torch}] [--scan] [--raw] [--all-reduce] [--all-gather] [--all-to-all]
[--pt2pt] [--broadcast] [--dtype DTYPE] [--mem-factor MEM_FACTOR] [--debug]
optional arguments:
-h, --help show this help message and exit
--local_rank LOCAL_RANK
--trials TRIALS Number of timed iterations
--warmups WARMUPS Number of warmup (non-timed) iterations
--maxsize MAXSIZE Max message size as a power of 2
--async-op Enables non-blocking communication
--bw-unit {Gbps,GBps}
--backend {nccl} Communication library to use
--dist {deepspeed,torch}
Distributed DL framework to use
--scan Enables scanning all message sizes
--raw Print the message size and latency without units
--all-reduce Run all_reduce
--all-gather Run all_gather
--all-to-all Run all_to_all
--pt2pt Run pt2pt
--broadcast Run broadcast
--dtype DTYPE PyTorch tensor dtype
--mem-factor MEM_FACTOR
Proportion of max available GPU memory to use for single-size evals
--debug Enables all_to_all debug prints
</pre>
Note that `ds_bench` is a pre-packaged wrapper around `run_all.py`. Users can pass the same arguments as well:
<pre>
<path to deepspeed>/bin/ds_bench --scan --trials=10
</pre>
Finally, users can choose specific communication operations to run in `run_all.py` or `ds_bench` by passing them as arguments (all operations are run by default). For example:
<pre>
deepspeed run_all.py --scan --all-reduce --all-to-all --broadcast
</pre>
# Adding Communication Benchmarks
To add a new communication benchmark, follow this general procedure (a minimal sketch is shown after the list):
1. Copy a similar benchmark file (e.g., to add `reduce_scatter`, copy `all_reduce.py` as a template)
2. Add a new bandwidth formula to `utils.get_bw`, a new maximum-tensor-element formula to `utils.max_numel`, and a new argument to `utils.benchmark_parser`
3. Replace the comm op calls in the new file (e.g., via find-and-replace)
4. Find a good default `mem_factor` for use in the `run_<collective>_single()` function
5. Add the new comm op to `run_all.py`
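For illustration only, below is a minimal sketch of what a hypothetical `reduce_scatter.py` (step 1, copied from `all_reduce.py`) could look like. It is not part of this commit; it assumes step 2 has already added a `'reduce_scatter'` branch to `utils.get_bw` and `utils.max_numel`, and it reuses the existing helpers (`sync_all`, `get_metric_strings`, `print_rank_0`, `benchmark_parser`, `init_processes`) from `benchmarks/communication/utils.py`. Only the single-size path is shown; a `--scan` loop would mirror `all_reduce.py`.
<pre>
'''Hypothetical reduce_scatter benchmark sketch (not part of this commit)'''
import time
import torch

from benchmarks.communication.utils import *
from benchmarks.communication.constants import *
from deepspeed.accelerator import get_accelerator


def timed_reduce_scatter(input, output, args):
    if args.dist == 'torch':
        import torch.distributed as dist
    elif args.dist == 'deepspeed':
        import deepspeed.comm as dist

    # reduce_scatter takes a list of input chunks, one per rank, each the size of output
    input_list = list(torch.chunk(input, dist.get_world_size()))

    sync_all()
    # Warmups, establish connections, etc.
    for i in range(args.warmups):
        dist.reduce_scatter(output, input_list, async_op=args.async_op)
    sync_all()

    # time the actual comm op `trials` times and average it
    pre = time.perf_counter()
    for i in range(args.trials):
        dist.reduce_scatter(output, input_list, async_op=args.async_op)
    sync_all()
    duration = time.perf_counter() - pre

    avg_duration = duration / args.trials
    size = input.element_size() * input.nelement()
    # step 2: assumes a 'reduce_scatter' branch was added to utils.get_bw
    tput, busbw = get_bw('reduce_scatter', size, avg_duration, args)
    tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration)
    desc = f'{input.nelement()}x{input.element_size()}'
    if not args.raw:
        size = convert_size(size)
    print_rank_0(f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}")


def run_reduce_scatter(local_rank, args):
    if args.dist == 'torch':
        import torch.distributed as dist
    elif args.dist == 'deepspeed':
        import deepspeed.comm as dist

    print_header(args, 'reduce_scatter')
    world_size = dist.get_world_size()
    global_rank = dist.get_rank()

    # step 2: assumes a 'reduce_scatter' case was added to utils.max_numel (step 4: tune mem_factor)
    elements_per_gpu = max_numel(comm_op='reduce_scatter',
                                 dtype=getattr(torch, args.dtype),
                                 mem_factor=args.mem_factor,
                                 local_rank=local_rank,
                                 args=args)
    # element count must be divisible by world_size so torch.chunk yields equal chunks
    elements_per_gpu = (elements_per_gpu // world_size) * world_size

    mat = torch.ones(elements_per_gpu, dtype=getattr(torch, args.dtype)).to(
        get_accelerator().device_name(local_rank))
    input = (mat.mul_(float(global_rank))).view(-1)
    output = torch.zeros(elements_per_gpu // world_size,
                         dtype=getattr(torch, args.dtype)).to(
                             get_accelerator().device_name(local_rank))

    sync_all()
    timed_reduce_scatter(input, output, args)


if __name__ == "__main__":
    args = benchmark_parser().parse_args()
    rank = args.local_rank
    init_processes(local_rank=rank, args=args)
    run_reduce_scatter(local_rank=rank, args=args)
</pre>
Registering `run_reduce_scatter` in `run_all.py` behind a `--reduce-scatter` flag (step 5) would then let it participate in the combined benchmark run.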
'''Copyright The Microsoft DeepSpeed Team'''
'''Copyright The Microsoft DeepSpeed Team'''
from benchmarks.communication.utils import *
from benchmarks.communication.constants import *
from deepspeed.accelerator import get_accelerator
import time
# Run all_gather and print metrics
def timed_all_gather(input, output, args):
if args.dist == 'torch':
import torch.distributed as dist
elif args.dist == 'deepspeed':
import deepspeed.comm as dist
sync_all()
# Warmups, establish connections, etc.
for i in range(args.warmups):
# use all_gather_base if available
if args.dist == 'torch':
if hasattr(torch.distributed, "_all_gather_base"):
dist._all_gather_base(output, input, group=None, async_op=args.async_op)
else:
output_tensors = list(torch.chunk(output, dist.get_world_size()))
dist.all_gather(output_tensors, input, group=None, async_op=args.async_op)
elif args.dist == 'deepspeed':
dist.allgather_fn(output, input, group=None, async_op=args.async_op)
sync_all()
# time the actual comm op trials times and average it
pre = time.perf_counter()
for i in range(args.trials):
# use all_gather_base if available
if args.dist == 'torch':
if hasattr(torch.distributed, "_all_gather_base"):
dist._all_gather_base(output, input, group=None, async_op=args.async_op)
else:
output_tensors = list(torch.chunk(output, dist.get_world_size()))
dist.all_gather(output_tensors, input, group=None, async_op=args.async_op)
elif args.dist == 'deepspeed':
dist.allgather_fn(output, input, group=None, async_op=args.async_op)
sync_all()
duration = time.perf_counter() - pre
# maintain and clean performance data
avg_duration = duration / args.trials
size = input.element_size() * input.nelement()
n = dist.get_world_size()
tput, busbw = get_bw('all_gather', size, avg_duration, args)
tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration)
desc = f'{input.nelement()}x{input.element_size()}'
if not args.raw:
size = convert_size(size)
print_rank_0(
f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}")
def run_all_gather(local_rank, args):
if args.dist == 'torch':
import torch.distributed as dist
elif args.dist == 'deepspeed':
import deepspeed.comm as dist
# Prepare benchmark header
print_header(args, 'all_gather')
global_rank = dist.get_rank()
world_size = dist.get_world_size()
if args.scan:
# Create list of message sizes
M_LIST = []
for x in (2**p for p in range(1, args.maxsize)):
M_LIST.append(x)
sync_all()
# loop over various tensor sizes
for M in M_LIST:
global_rank = dist.get_rank()
try:
mat = torch.ones(world_size,
M,
dtype=getattr(
torch,
args.dtype)).to(
get_accelerator().device_name(local_rank))
sync_all()
input = ((mat.mul_(float(global_rank))).view(-1))
# Delete original mat to avoid OOM
del mat
get_accelerator().empty_cache()
output = torch.zeros(input.nelement() * world_size,
dtype=getattr(
torch,
args.dtype)).to(
get_accelerator().device_name(local_rank))
except RuntimeError as e:
if 'out of memory' in str(e):
if dist.get_rank() == 0:
print('WARNING: Ran out of GPU memory. Exiting comm op.')
sync_all()
break
sync_all()
timed_all_gather(input, output, args)
else:
# all_gather_base saves memory
if (args.dist == 'torch'
and hasattr(torch.distributed,
"_all_gather_base")) or (args.dist == 'deepspeed'
and dist.has_allgather_base):
mem_factor = args.mem_factor + 0.2
else:
mem_factor = args.mem_factor
# Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor
sync_all()
elements_per_gpu = max_numel(comm_op='all_gather',
dtype=getattr(torch,
args.dtype),
mem_factor=mem_factor,
local_rank=local_rank,
args=args)
try:
mat = torch.ones(elements_per_gpu,
dtype=getattr(torch,
args.dtype)).to(
get_accelerator().device_name(local_rank))
# multiply each GPU's tensor by the rank to ease debugging
input = ((mat.mul_(float(global_rank))).view(-1))
# Delete original mat to avoid OOM
del mat
get_accelerator().empty_cache()
output = torch.zeros(
elements_per_gpu * world_size,
dtype=getattr(torch,
args.dtype)).to(get_accelerator().device_name(local_rank))
except RuntimeError as e:
if 'out of memory' in str(e):
if dist.get_rank() == 0:
print(
'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!'
)
sync_all()
return
sync_all()
timed_all_gather(input, output, args)
if __name__ == "__main__":
args = benchmark_parser().parse_args()
rank = args.local_rank
init_processes(local_rank=rank, args=args)
run_all_gather(local_rank=rank, args=args)
'''Copyright The Microsoft DeepSpeed Team'''
from benchmarks.communication.utils import *
from benchmarks.communication.constants import *
from deepspeed.accelerator import get_accelerator
import time
def timed_all_reduce(input, args):
if args.dist == 'torch':
import torch.distributed as dist
elif args.dist == 'deepspeed':
import deepspeed.comm as dist
sync_all()
# Warmups, establish connections, etc.
for i in range(args.warmups):
dist.all_reduce(input, async_op=args.async_op)
sync_all()
# time the actual comm op trials times and average it
pre = time.perf_counter()
for i in range(args.trials):
dist.all_reduce(input, async_op=args.async_op)
sync_all()
duration = time.perf_counter() - pre
# maintain and clean performance data
avg_duration = duration / args.trials
size = input.element_size() * input.nelement()
n = dist.get_world_size()
tput, busbw = get_bw('all_reduce', size, avg_duration, args)
tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration)
desc = f'{input.nelement()}x{input.element_size()}'
if not args.raw:
size = convert_size(size)
print_rank_0(
f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}")
def run_all_reduce(local_rank, args):
if args.dist == 'torch':
import torch.distributed as dist
elif args.dist == 'deepspeed':
import deepspeed.comm as dist
# Prepare benchmark header
print_header(args, 'all_reduce')
world_size = dist.get_world_size()
global_rank = dist.get_rank()
if args.scan:
M_LIST = []
for x in (2**p for p in range(1, args.maxsize)):
M_LIST.append(x)
sync_all()
# loop over various tensor sizes
for M in M_LIST:
global_rank = dist.get_rank()
try:
mat = torch.ones(world_size,
M,
dtype=getattr(
torch,
args.dtype)).to(
get_accelerator().device_name(local_rank))
sync_all()
input = ((mat.mul_(float(global_rank))).view(-1))
except RuntimeError as e:
if 'out of memory' in str(e):
if dist.get_rank() == 0:
print('WARNING: Ran out of GPU memory. Exiting comm op.')
sync_all()
break
sync_all()
timed_all_reduce(input, args)
else:
# Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor
# Don't need output tensor, so we double mem_factor
elements_per_gpu = max_numel(comm_op='all_reduce',
dtype=getattr(torch,
args.dtype),
mem_factor=args.mem_factor * 2,
local_rank=local_rank,
args=args)
try:
mat = torch.ones(elements_per_gpu,
dtype=getattr(torch,
args.dtype)).to(
get_accelerator().device_name(local_rank))
input = ((mat.mul_(float(global_rank))).view(-1))
except RuntimeError as e:
if 'out of memory' in str(e):
if dist.get_rank() == 0:
print(
'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!'
)
sync_all()
return
sync_all()
timed_all_reduce(input, args)
if __name__ == "__main__":
args = benchmark_parser().parse_args()
rank = args.local_rank
init_processes(local_rank=rank, args=args)
run_all_reduce(local_rank=rank, args=args)
'''Copyright The Microsoft DeepSpeed Team'''
from benchmarks.communication.utils import *
from benchmarks.communication.constants import *
from deepspeed.accelerator import get_accelerator
import time
def timed_all_to_all(input, output, args):
if args.dist == 'torch':
import torch.distributed as dist
elif args.dist == 'deepspeed':
import deepspeed.comm as dist
sync_all()
# Warmups, establish connections, etc.
for i in range(args.warmups):
dist.all_to_all_single(output, input, async_op=args.async_op)
sync_all()
# time the actual comm op trials times and average it
pre = time.perf_counter()
for i in range(args.trials):
dist.all_to_all_single(output, input, async_op=args.async_op)
sync_all()
duration = time.perf_counter() - pre
# maintain and clean performance data
avg_duration = duration / args.trials
size = input.element_size() * input.nelement()
n = dist.get_world_size()
tput, busbw = get_bw('all_to_all', size, avg_duration, args)
tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration)
desc = f'{input.nelement()}x{input.element_size()}'
if not args.raw:
size = convert_size(size)
print_rank_0(
f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}")
def run_all_to_all(local_rank, args):
if args.dist == 'torch':
import torch.distributed as dist
elif args.dist == 'deepspeed':
import deepspeed.comm as dist
world_size = dist.get_world_size()
global_rank = dist.get_rank()
# Prepare benchmark header
print_header(args, 'all_to_all')
if args.scan:
M_LIST = []
for x in (2**p for p in range(1, args.maxsize)):
M_LIST.append(x)
sync_all()
# loop over various tensor sizes
for M in M_LIST:
global_rank = dist.get_rank()
try:
mat = torch.ones(world_size,
M,
dtype=getattr(
torch,
args.dtype)).to(
get_accelerator().device_name(local_rank))
assert mat.numel() % world_size == 0, f"tensor cannot be divided in {world_size} chunks"
sync_all()
input = ((mat.mul_(float(global_rank))).view(-1))
output = (mat.clone().view(-1))
except RuntimeError as e:
if 'out of memory' in str(e):
if dist.get_rank() == 0:
print('WARNING: Ran out of GPU memory. Exiting comm op.')
sync_all()
break
sync_all()
timed_all_to_all(input, output, args)
else:
# Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor
elements_per_gpu = max_numel(comm_op='all_to_all',
dtype=getattr(torch,
args.dtype),
mem_factor=args.mem_factor,
local_rank=local_rank,
args=args)
try:
mat = torch.ones(elements_per_gpu,
dtype=getattr(torch,
args.dtype)).to(
get_accelerator().device_name(local_rank))
assert mat.numel() % world_size == 0, f"tensor with {mat.numel()} elements cannot be divided in {world_size} chunks"
input = ((mat.mul_(float(global_rank))).view(-1))
# Delete original mat to avoid OOM
del mat
get_accelerator().empty_cache()
output = torch.zeros(
elements_per_gpu,
dtype=getattr(torch,
args.dtype)).to(get_accelerator().device_name(local_rank))
except RuntimeError as e:
if 'out of memory' in str(e):
if dist.get_rank() == 0:
print(
'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!'
)
sync_all()
return
sync_all()
if args.debug:
for i in range(world_size):
if i == global_rank:
print(f"Before AllToAll Input List at rank {global_rank}: {input}")
dist.barrier()
timed_all_to_all(input, output, args)
if args.debug:
for i in range(world_size):
if i == global_rank:
print(f"AllToAll Results at rank {global_rank}: {output}")
dist.barrier()
if __name__ == "__main__":
args = benchmark_parser().parse_args()
rank = args.local_rank
init_processes(local_rank=rank, args=args)
run_all_to_all(local_rank=rank, args=args)
'''Copyright The Microsoft DeepSpeed Team'''
import torch
from benchmarks.communication.utils import *
from benchmarks.communication.constants import *
from deepspeed.accelerator import get_accelerator
import time
def timed_broadcast(input, args):
if args.dist == 'torch':
import torch.distributed as dist
elif args.dist == 'deepspeed':
import deepspeed.comm as dist
sync_all()
# Warmups, establish connections, etc.
for i in range(args.warmups):
dist.broadcast(input, 0, async_op=args.async_op)
sync_all()
# time the actual comm op trials times and average it
pre = time.perf_counter()
for i in range(args.trials):
dist.broadcast(input, 0, async_op=args.async_op)
sync_all()
duration = time.perf_counter() - pre
# maintain and clean performance data
avg_duration = duration / args.trials
size = input.element_size() * input.nelement()
n = dist.get_world_size()
tput, busbw = get_bw('broadcast', size, avg_duration, args)
tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration)
desc = f'{input.nelement()}x{input.element_size()}'
if not args.raw:
size = convert_size(size)
print_rank_0(
f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}")
def run_broadcast(local_rank, args):
if args.dist == 'torch':
import torch.distributed as dist
elif args.dist == 'deepspeed':
import deepspeed.comm as dist
# Prepare benchmark header
print_header(args, 'broadcast')
world_size = dist.get_world_size()
global_rank = dist.get_rank()
if args.scan:
M_LIST = []
for x in (2**p for p in range(1, args.maxsize)):
M_LIST.append(x)
sync_all()
# loop over various tensor sizes
for M in M_LIST:
global_rank = dist.get_rank()
try:
mat = torch.ones(world_size,
M,
dtype=getattr(
torch,
args.dtype)).to(
get_accelerator().device_name(local_rank))
sync_all()
input = ((mat.mul_(float(global_rank))).view(-1))
except RuntimeError as e:
if 'out of memory' in str(e):
if dist.get_rank() == 0:
print('WARNING: Ran out of GPU memory. Exiting comm op.')
sync_all()
break
sync_all()
timed_broadcast(input, args)
else:
# Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor
# Don't need output tensor, so we double mem_factor
elements_per_gpu = max_numel(comm_op='broadcast',
dtype=getattr(torch,
args.dtype),
mem_factor=args.mem_factor * 2,
local_rank=local_rank,
args=args)
try:
mat = torch.ones(elements_per_gpu,
dtype=getattr(torch,
args.dtype)).to(
get_accelerator().device_name(local_rank))
input = ((mat.mul_(float(global_rank))).view(-1))
except RuntimeError as e:
if 'out of memory' in str(e):
if dist.get_rank() == 0:
print(
'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!'
)
sync_all()
return
sync_all()
timed_broadcast(input, args)
if __name__ == "__main__":
args = benchmark_parser().parse_args()
rank = args.local_rank
init_processes(local_rank=rank, args=args)
run_broadcast(local_rank=rank, args=args)
'''Copyright The Microsoft DeepSpeed Team'''
from deepspeed.accelerator import get_accelerator
DEFAULT_WARMUPS = 5
DEFAULT_TRIALS = 50
DEFAULT_TYPE = 'float'
DEFAULT_BACKEND = get_accelerator().communication_backend_name()
DEFAULT_UNIT = 'Gbps'
DEFAULT_DIST = 'deepspeed'
DEFAULT_MAXSIZE = 24
'''Copyright The Microsoft DeepSpeed Team'''
from benchmarks.communication.utils import *
from benchmarks.communication.constants import *
from deepspeed.accelerator import get_accelerator
import time
def timed_pt2pt(input, args):
if args.dist == 'torch':
import torch.distributed as dist
elif args.dist == 'deepspeed':
import deepspeed.comm as dist
sync_all()
# Warmups, establish connections, etc.
for i in range(args.warmups):
if dist.get_rank() == 0:
if args.async_op:
dist.isend(input, 1)
else:
dist.send(input, 1)
if dist.get_rank() == 1:
if args.async_op:
dist.irecv(input, src=0)
else:
dist.recv(input, src=0)
sync_all()
# time the actual comm op trials times and average it
pre = time.perf_counter()
for i in range(args.trials):
if dist.get_rank() == 0:
if args.async_op:
dist.isend(input, 1)
else:
dist.send(input, 1)
if dist.get_rank() == 1:
if args.async_op:
dist.irecv(input, src=0)
else:
dist.recv(input, src=0)
sync_all()
duration = time.perf_counter() - pre
# maintain and clean performance data
avg_duration = duration / args.trials
size = input.element_size() * input.nelement()
n = dist.get_world_size()
tput, busbw = get_bw('pt2pt', size, avg_duration, args)
tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration)
desc = f'{input.nelement()}x{input.element_size()}'
if not args.raw:
size = convert_size(size)
print_rank_0(
f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}")
def run_pt2pt(local_rank, args):
if args.dist == 'torch':
import torch.distributed as dist
elif args.dist == 'deepspeed':
import deepspeed.comm as dist
# Prepare benchmark header
print_header(args, 'pt2pt')
global_rank = dist.get_rank()
world_size = dist.get_world_size()
if args.scan:
# Create list of message sizes
M_LIST = []
for x in (2**p for p in range(1, args.maxsize)):
M_LIST.append(x)
sync_all()
# loop over various tensor sizes
for M in M_LIST:
global_rank = dist.get_rank()
try:
mat = torch.ones(world_size,
M,
dtype=getattr(
torch,
args.dtype)).to(
get_accelerator().device_name(local_rank))
sync_all()
input = ((mat.mul_(float(global_rank))).view(-1))
except RuntimeError as e:
if 'out of memory' in str(e):
if dist.get_rank() == 0:
print('WARNING: Ran out of GPU memory. Exiting comm op.')
sync_all()
break
sync_all()
timed_pt2pt(input, args)
else:
# Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor
# Don't need output tensor, so double mem_factor
elements_per_gpu = max_numel(comm_op='pt2pt',
dtype=getattr(torch,
args.dtype),
mem_factor=args.mem_factor * 2,
local_rank=local_rank,
args=args)
try:
mat = torch.ones(elements_per_gpu,
dtype=getattr(torch,
args.dtype)).to(
get_accelerator().device_name(local_rank))
input = ((mat.mul_(float(global_rank))).view(-1))
except RuntimeError as e:
if 'out of memory' in str(e):
if dist.get_rank() == 0:
print(
'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!'
)
sync_all()
return
sync_all()
timed_pt2pt(input, args)
if __name__ == "__main__":
args = benchmark_parser().parse_args()
rank = args.local_rank
init_processes(local_rank=rank, args=args)
run_pt2pt(local_rank=rank, args=args)
'''Copyright The Microsoft DeepSpeed Team'''
from benchmarks.communication.utils import *
from benchmarks.communication.all_reduce import run_all_reduce
from benchmarks.communication.all_gather import run_all_gather
from benchmarks.communication.all_to_all import run_all_to_all
from benchmarks.communication.pt2pt import run_pt2pt
from benchmarks.communication.broadcast import run_broadcast
from benchmarks.communication.constants import *
# For importing
def main(args, rank):
init_processes(local_rank=rank, args=args)
ops_to_run = []
if args.all_reduce:
ops_to_run.append('all_reduce')
if args.all_gather:
ops_to_run.append('all_gather')
if args.broadcast:
ops_to_run.append('broadcast')
if args.pt2pt:
ops_to_run.append('pt2pt')
if args.all_to_all:
ops_to_run.append('all_to_all')
if len(ops_to_run) == 0:
ops_to_run = ['all_reduce', 'all_gather', 'all_to_all', 'broadcast', 'pt2pt']
for comm_op in ops_to_run:
if comm_op == 'all_reduce':
run_all_reduce(local_rank=rank, args=args)
if comm_op == 'all_gather':
run_all_gather(local_rank=rank, args=args)
if comm_op == 'all_to_all':
run_all_to_all(local_rank=rank, args=args)
if comm_op == 'pt2pt':
run_pt2pt(local_rank=rank, args=args)
if comm_op == 'broadcast':
run_broadcast(local_rank=rank, args=args)
# For directly calling benchmark
if __name__ == "__main__":
args = benchmark_parser().parse_args()
rank = args.local_rank
main(args, rank)
'''Copyright The Microsoft DeepSpeed Team'''
import torch
import os
import math
import argparse
from benchmarks.communication.constants import *
from deepspeed.accelerator import get_accelerator
global dist
def init_torch_distributed(backend):
global dist
import torch.distributed as dist
torch.distributed.init_process_group(backend)
local_rank = int(os.environ['LOCAL_RANK'])
get_accelerator().set_device(local_rank)
def init_deepspeed_comm(backend):
global dist
import deepspeed
import deepspeed.comm as dist
deepspeed.init_distributed(dist_backend=backend)
local_rank = int(os.environ['LOCAL_RANK'])
get_accelerator().set_device(local_rank)
def init_processes(local_rank, args):
if args.dist == 'deepspeed':
init_deepspeed_comm(args.backend)
elif args.dist == 'torch':
init_torch_distributed(args.backend)
else:
print_rank_0(f"distributed framework {args.dist} not supported")
exit(0)
def print_rank_0(message):
if dist.get_rank() == 0:
print(message)
def print_header(args, comm_op):
if comm_op == 'pt2pt':
world_size = 2
else:
world_size = dist.get_world_size()
tput = f'Throughput ({args.bw_unit})'
busbw = f'BusBW ({args.bw_unit})'
header = f"\n---- Performance of {comm_op} on {world_size} devices ---------------------------------------------------------\n"
duration_str = 'Duration'
if args.raw:
duration_str += ' (us)'
header += f"{'Size (Bytes)':20s} {'Description':25s} {duration_str:20s} {tput:20s} {busbw:20s}\n"
header += "----------------------------------------------------------------------------------------------------"
print_rank_0(header)
def get_bw(comm_op, size, duration, args):
n = dist.get_world_size()
tput = 0
busbw = 0
if comm_op == "all_to_all":
tput = (size / duration)
busbw = (size / duration) * ((n - 1) / n)
elif comm_op == "all_gather":
size *= n
tput = (size / duration)
busbw = (size / duration) * ((n - 1) / n)
elif comm_op == "all_reduce":
tput = (size * 2 / duration)
busbw = (size / duration) * (2 * (n - 1) / n)
elif comm_op == "pt2pt" or comm_op == "broadcast":
tput = (size / duration)
busbw = tput
else:
print_rank_0("wrong comm_op specified")
exit(0)
if args.bw_unit == 'Gbps':
tput *= 8
busbw *= 8
return tput, busbw
def get_metric_strings(args, tput, busbw, duration):
duration_ms = duration * 1e3
duration_us = duration * 1e6
tput = f'{tput / 1e9:.3f}'
busbw = f'{busbw /1e9:.3f}'
if duration_us < 1e3 or args.raw:
duration = f'{duration_us:.3f}'
if not args.raw:
duration += ' us'
else:
duration = f'{duration_ms:.3f} ms'
return tput, busbw, duration
def sync_all():
get_accelerator().synchronize()
dist.barrier()
def max_numel(comm_op, dtype, mem_factor, local_rank, args):
dtype_size = _element_size(dtype)
max_memory_per_gpu = get_accelerator().total_memory(local_rank) * mem_factor
if comm_op == 'all_reduce' or comm_op == 'pt2pt' or comm_op == 'broadcast':
elements_per_gpu = int(max_memory_per_gpu // dtype_size)
elif comm_op == 'all_gather':
# all_gather performance is lower for non-powers of two, and the output buffer size scales with world size
# Therefore, divide by world size and round down to nearest power of 2
elements_per_gpu = int(max_memory_per_gpu // dtype_size // dist.get_world_size())
elements_per_gpu = int(pow(2, int(math.log(elements_per_gpu, 2))))
elif comm_op == 'all_to_all':
# Number of elements must be divisible by world_size
# all_to_all performance is lower for non-powers of two. Round down like all_gather.
elements_per_gpu = int(max_memory_per_gpu // dtype_size)
elements_per_gpu = int(dist.get_world_size() *
round(elements_per_gpu / dist.get_world_size()))
elements_per_gpu = int(pow(2, int(math.log(elements_per_gpu, 2))))
else:
print(f"This communication operation: {comm_op} is not supported yet")
exit(0)
return elements_per_gpu
# Helper function to pretty-print message sizes
def convert_size(size_bytes):
if size_bytes == 0:
return "0B"
size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
i = int(math.floor(math.log(size_bytes, 1024)))
p = math.pow(1024, i)
s = round(size_bytes / p, 2)
return "%s %s" % (s, size_name[i])
# Copied from torch; duplicated here for compatibility with older torch versions.
def _element_size(dtype):
"""
Returns the element size for a dtype, in bytes
"""
if not isinstance(dtype, torch.dtype):
raise RuntimeError(f'expected torch.dtype, but got {type(dtype)}')
if dtype.is_complex:
return torch.finfo(dtype).bits >> 2
elif dtype.is_floating_point:
return torch.finfo(dtype).bits >> 3
elif dtype == torch.bool:
# NOTE: torch.bool is not supported in torch.iinfo()
return 1
else:
return torch.iinfo(dtype).bits >> 3
def benchmark_parser():
parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int)
parser.add_argument("--trials",
type=int,
default=DEFAULT_TRIALS,
help='Number of timed iterations')
parser.add_argument("--warmups",
type=int,
default=DEFAULT_WARMUPS,
help='Number of warmup (non-timed) iterations')
parser.add_argument("--maxsize",
type=int,
default=DEFAULT_MAXSIZE,
help='Max message size as a power of 2')
parser.add_argument("--async-op",
action="store_true",
help='Enables non-blocking communication')
parser.add_argument("--bw-unit",
type=str,
default=DEFAULT_UNIT,
choices=['Gbps',
'GBps'])
parser.add_argument("--backend",
type=str,
default=DEFAULT_BACKEND,
choices=['nccl',
'ccl'],
help='Communication library to use')
parser.add_argument("--dist",
type=str,
default=DEFAULT_DIST,
choices=['deepspeed',
'torch'],
help='Distributed DL framework to use')
parser.add_argument("--scan",
action="store_true",
help='Enables scanning all message sizes')
parser.add_argument("--raw",
action="store_true",
help='Print the message size and latency without units')
parser.add_argument("--all-reduce", action="store_true", help='Run all_reduce')
parser.add_argument("--all-gather", action="store_true", help='Run all_gather')
parser.add_argument("--all-to-all", action="store_true", help='Run all_to_all')
parser.add_argument("--pt2pt", action="store_true", help='Run pt2pt')
parser.add_argument("--broadcast", action="store_true", help='Run broadcast')
parser.add_argument("--dtype",
type=str,
default=DEFAULT_TYPE,
help='PyTorch tensor dtype')
parser.add_argument(
"--mem-factor",
type=float,
default=.4,
help='Proportion of max available GPU memory to use for single-size evals')
parser.add_argument("--debug",
action="store_true",
help='Enables all_to_all debug prints')
return parser