Unverified commit 31f46fee authored by Jeff Rasley, committed by GitHub

DeepSpeed JIT op + PyPI support (#496)


Co-authored-by: Shaden Smith <Shaden.Smith@microsoft.com>
Co-authored-by: Reza Yazdani <reyazda@microsoft.com>
parent 0ad4fd88
'''
Copyright 2020 The Microsoft DeepSpeed Team
Copyright NVIDIA/apex
This file is adapted from NVIDIA/apex, commit a109f85
'''
import torch
class MultiTensorApply(object):
    def __init__(self, chunk_size):
        self.chunk_size = chunk_size

    def __call__(self, op, noop_flag_buffer, tensor_lists, *args):
        return op(self.chunk_size, noop_flag_buffer, tensor_lists, *args)
../../csrc
\ No newline at end of file
-from deepspeed.ops.lamb.fused_lamb import FusedLamb
+from .fused_lamb import FusedLamb

@@ -5,8 +5,8 @@ Copyright NVIDIA/apex
 This file is adapted from NVIDIA/apex/optimizer/fused_adam and implements the LAMB optimizer
 '''
 import types
-import importlib
 import torch
+from ..op_builder import FusedLambBuilder

 class FusedLamb(torch.optim.Optimizer):
@@ -48,15 +48,7 @@ class FusedLamb(torch.optim.Optimizer):
                  max_coeff=10.0,
                  min_coeff=0.01,
                  amsgrad=False):
-        global fused_lamb_cuda
-        try:
-            fused_lamb_cuda = importlib.import_module(
-                "deepspeed.ops.lamb.fused_lamb_cuda")
-        except ImportError as err:
-            print(
-                "Unable to import Lamb cuda extension, please build DeepSpeed with cuda/cpp extensions."
-            )
-            raise err
+        self.fused_lamb_cuda = FusedLambBuilder().load()

         if amsgrad:
             raise RuntimeError('FusedLamb does not support the AMSGrad variant.')
@@ -173,22 +165,22 @@ class FusedLamb(torch.optim.Optimizer):
                 out_p = torch.tensor(
                     [],
                     dtype=torch.float) if output_param is None else output_param
-                lamb_coeff = fused_lamb_cuda.lamb(p.data,
+                lamb_coeff = self.fused_lamb_cuda.lamb(p.data,
                                                   out_p,
                                                   exp_avg,
                                                   exp_avg_sq,
                                                   grad,
                                                   group['lr'],
                                                   beta1,
                                                   beta2,
                                                   max_coeff,
                                                   min_coeff,
                                                   group['eps'],
                                                   combined_scale,
                                                   state['step'],
                                                   self.eps_mode,
                                                   bias_correction,
                                                   group['weight_decay'])
                 self.lamb_coeffs.append(lamb_coeff)
         return loss
...
../../op_builder
\ No newline at end of file
@@ -2,13 +2,12 @@
 # https://github.com/ptillet/torch-blocksparse/blob/master/torch_blocksparse/matmul.py
 import importlib
 import warnings
-try:
-    import triton
-except ImportError:
-    warnings.warn("Unable to import triton, sparse attention will not be accessible")
 import torch
 import math
-from deepspeed.ops.sparse_attention.trsrc import matmul
+from .trsrc import matmul
+from ..op_builder import SparseAttnBuilder
+
+triton = None

 ##############
@@ -27,6 +26,9 @@ class _sparse_matmul(torch.autograd.Function):
     # between `seg_size` elements
     @staticmethod
     def load_balance(sizes, block):
+        global triton
+        if triton is None:
+            triton = importlib.import_module('triton')
         # segment size
         # heuristics taken from OpenAI blocksparse code
         # https://github.com/openai/blocksparse/blob/master/blocksparse/matmul.py#L95
@@ -83,11 +85,18 @@ class _sparse_matmul(torch.autograd.Function):
     ##########################
     # SPARSE = DENSE x DENSE #
     ##########################
-    cpp_utils = importlib.import_module('deepspeed.ops.sparse_attention.cpp_utils')
-    sdd_segment = cpp_utils.sdd_segment
+    cpp_utils = None
+    sdd_segment = None
+
+    @staticmethod
+    def _load_utils():
+        if _sparse_matmul.cpp_utils is None:
+            _sparse_matmul.cpp_utils = SparseAttnBuilder().load()
+            _sparse_matmul.sdd_segment = _sparse_matmul.cpp_utils.sdd_segment

     @staticmethod
     def make_sdd_lut(layout, block, dtype, device):
+        _sparse_matmul._load_utils()
         start_width = 64 // block
         segmented = _sparse_matmul.sdd_segment(layout.type(torch.int32), start_width)
         luts, widths, packs = [], [], []
@@ -118,6 +127,10 @@ class _sparse_matmul(torch.autograd.Function):
                  packs,
                  bench,
                  time):
+        global triton
+        if triton is None:
+            triton = importlib.import_module('triton')
+
         if trans_c:
             a, b = b, a
             trans_a, trans_b = not trans_b, not trans_a
@@ -332,6 +345,10 @@ class _sparse_matmul(torch.autograd.Function):
                  packs,
                  bench,
                  time):
+        global triton
+        if triton is None:
+            triton = importlib.import_module('triton')
+
         # shapes / dtypes
         AS0 = a.size(0)
         AS1 = a.size(1)
@@ -413,6 +430,10 @@ class _sparse_matmul(torch.autograd.Function):
                  packs,
                  bench,
                  time):
+        global triton
+        if triton is None:
+            triton = importlib.import_module('triton')
+
         # shapes / dtypes
         AS0 = spdims[0]
         AS1 = block * spdims[2 if trans_a else 1]
...
@@ -2,17 +2,17 @@
 # https://github.com/ptillet/torch-blocksparse/blob/master/torch_blocksparse/matmul.py
 import warnings
-try:
-    import triton
-except ImportError:
-    warnings.warn("Unable to import triton, sparse attention will not be accessible")
+import importlib
 import torch
 import math
-from deepspeed.ops.sparse_attention.trsrc import softmax_fwd, softmax_bwd
+from .trsrc import softmax_fwd, softmax_bwd

 fwd_kernels = dict()
 bwd_kernels = dict()

+# Delay importing triton unless we need it
+triton = None

 class _sparse_softmax(torch.autograd.Function):
@@ -52,6 +52,10 @@ class _sparse_softmax(torch.autograd.Function):
                    apply_attn_mask,
                    kp_mask_mode,
                    attn_mask_mode):
+        global triton
+        if triton is None:
+            triton = importlib.import_module('triton')
+
         if max_k >= 32768:
             raise NotImplementedError('Reductions larger than 32768 elements '\
                                       'are not yet implemented')
@@ -112,6 +116,10 @@ class _sparse_softmax(torch.autograd.Function):
                 maxlut,
                 bench,
                 time):
+        global triton
+        if triton is None:
+            triton = importlib.import_module('triton')
+
         apply_scale = False if scale == 1.0 else True

         # handle None rpe
@@ -180,6 +188,10 @@ class _sparse_softmax(torch.autograd.Function):
     @staticmethod
     def backward(ctx, dx):
+        global triton
+        if triton is None:
+            triton = importlib.import_module('triton')
+
         # retrieve from context
         x, lut = ctx.saved_tensors
         # run kernel
...
-from deepspeed.ops.transformer.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig
+from .transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig

@@ -8,6 +8,8 @@ import torch
 from torch import nn
 from torch.autograd import Function

+from ..op_builder import TransformerBuilder, StochasticTransformerBuilder
+
 # Cuda modules will be imported if needed
 transformer_cuda_module = None
 stochastic_transformer_cuda_module = None
@@ -483,19 +485,12 @@ class DeepSpeedTransformerLayer(nn.Module):
             self.norm_w = initial_weights[7]
             self.norm_b = initial_biases[7]

-        # Import cuda modules if needed
+        # Load cuda modules if needed
         global transformer_cuda_module, stochastic_transformer_cuda_module
-        if transformer_cuda_module is None or stochastic_transformer_cuda_module is None:
-            try:
-                transformer_cuda_module = importlib.import_module(
-                    "deepspeed.ops.transformer.transformer_cuda")
-                stochastic_transformer_cuda_module = importlib.import_module(
-                    "deepspeed.ops.transformer.stochastic_transformer_cuda")
-            except ImportError as err:
-                print(
-                    "Unable to import transformer cuda extension, please build DeepSpeed with cuda/cpp extensions."
-                )
-                raise err
+        if transformer_cuda_module is None and not self.config.stochastic_mode:
+            transformer_cuda_module = TransformerBuilder().load()
+        if stochastic_transformer_cuda_module is None and self.config.stochastic_mode:
+            stochastic_transformer_cuda_module = StochasticTransformerBuilder().load()

         # create the layer in cuda kernels.
         cuda_module = stochastic_transformer_cuda_module if self.config.stochastic_mode else transformer_cuda_module
...
@@ -7,8 +7,6 @@ import torch
 import warnings
 import torch.distributed as dist

-import apex
-from apex import amp
 from torch.nn.modules import Module
 from torch.distributed.distributed_c10d import _get_global_rank
 from tensorboardX import SummaryWriter
@@ -36,22 +34,17 @@ from deepspeed.utils.timer import ThroughputTimer, SynchronizedWallClockTimer
 from deepspeed.runtime.progressive_layer_drop import ProgressiveLayerDrop
 from .utils import ensure_directory_exists
+from ..ops.op_builder import UtilsBuilder
+from ..ops.adam import DeepSpeedCPUAdam
+from ..ops.adam import FusedAdam

 MEMORY_OPT_ALLREDUCE_SIZE = 500000000

 try:
-    from apex_C import flatten
-    from apex_C import unflatten
+    from apex import amp
 except ImportError:
-    try:
-        _ = warned_flatten
-    except NameError:
-        logger.warning(
-            "Warning: apex was installed without --cpp_ext. Falling back to Python flatten and unflatten."
-        )
-        warned_flatten = True
-    from torch._utils import _flatten_dense_tensors as flatten
-    from torch._utils import _unflatten_dense_tensors as unflatten
+    # Fail silently so we don't spam logs unnecessarily if user isn't using amp
+    pass

 def split_half_float_double_csr(tensors):
@@ -201,6 +194,11 @@ class DeepSpeedEngine(Module):
         if self.dump_state():
             print_configuration(self, 'DeepSpeedEngine')

+        # Load pre-installed or JIT compile (un)flatten ops
+        util_ops = UtilsBuilder().load()
+        self.flatten = util_ops.flatten
+        self.unflatten = util_ops.unflatten
+
     def _mpi_check(self, args, dist_init_required):
         if hasattr(args, 'deepspeed_mpi') and args.deepspeed_mpi:
             from mpi4py import MPI
@@ -558,6 +556,12 @@ class DeepSpeedEngine(Module):
             amp_params = self.amp_params()
             if self.global_rank == 0:
                 logger.info(f"Initializing AMP with these params: {amp_params}")
+            try:
+                logger.info("Initializing Apex amp from: {}".format(amp.__path__))
+            except NameError:
+                # If apex/amp is available it will be imported above
+                raise RuntimeError(
+                    "Unable to import apex/amp, please make sure it is installed")
             self.module, self.optimizer = amp.initialize(self.module, basic_optimizer, **amp_params)
             self._broadcast_model()
         elif self.fp16_enabled():
@@ -584,17 +588,18 @@ class DeepSpeedEngine(Module):
             # T|F T F torch.optim.Adam
             # T F T|F DeepSpeedCPUAdam(adam_w_mode)
             # F F T|F FusedAdam(adam_w_mode)
-            if torch_adam and adam_w_mode:
-                optimizer = torch.optim.AdamW(model_parameters, **optimizer_parameters)
-            elif torch_adam and not adam_w_mode:
-                optimizer = torch.optim.Adam(model_parameters, **optimizer_parameters)
-            elif self.zero_cpu_offload() and not torch_adam:
-                from deepspeed.ops.adam import DeepSpeedCPUAdam
+            if torch_adam:
+                if adam_w_mode:
+                    optimizer = torch.optim.AdamW(model_parameters,
+                                                  **optimizer_parameters)
+                else:
+                    optimizer = torch.optim.Adam(model_parameters,
+                                                 **optimizer_parameters)
+            elif self.zero_cpu_offload():
                 optimizer = DeepSpeedCPUAdam(model_parameters,
                                              **optimizer_parameters,
                                              adamw_mode=adam_w_mode)
-            elif not self.zero_cpu_offload() and not torch_adam:
-                from apex.optimizers.fused_adam import FusedAdam
+            else:
                 optimizer_parameters[ADAM_W_MODE_PARAM] = adam_w_mode
                 optimizer = FusedAdam(model_parameters, **optimizer_parameters)
@@ -614,8 +619,7 @@ class DeepSpeedEngine(Module):
             dynamic_loss_args = self.dynamic_loss_scale_args()
             clip_grad = self.gradient_clipping()
             if isinstance(optimizer,
-                          apex.optimizers.FusedAdam) or self.optimizer_name(
-                          ) == ONEBIT_ADAM_OPTIMIZER:
+                          FusedAdam) or self.optimizer_name() == ONEBIT_ADAM_OPTIMIZER:
                 if self.dynamic_loss_scale():
                     logger.info('Creating fp16 optimizer with dynamic loss scale')
                     timers = self.timers if self.wall_clock_breakdown() else None
@@ -1072,7 +1076,7 @@ class DeepSpeedEngine(Module):
                 ranks=[0])

     def allreduce_bucket(self, bucket):
-        tensor = flatten(bucket)
+        tensor = self.flatten(bucket)

         tensor_to_allreduce = tensor
@@ -1100,7 +1104,7 @@ class DeepSpeedEngine(Module):
     def allreduce_and_copy(self, small_bucket):
         allreduced = self.allreduce_bucket(small_bucket)
-        for buf, synced in zip(small_bucket, unflatten(allreduced, small_bucket)):
+        for buf, synced in zip(small_bucket, self.unflatten(allreduced, small_bucket)):
             buf.copy_(synced)

     def allreduce_no_retain(self, bucket, numel_per_bucket=500000000):
...
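The apex_C flatten/unflatten kernels replaced above are now provided by DeepSpeed's own utils op. A minimal sketch of the new pattern, assuming an environment where the op is either pre-installed or can be JIT compiled with ninja:

```python
import torch
from deepspeed.ops.op_builder import UtilsBuilder

# Load the pre-built extension if available, otherwise JIT compile it on first use.
util_ops = UtilsBuilder().load()

bucket = [torch.ones(3), torch.zeros(5)]
flat = util_ops.flatten(bucket)              # one contiguous 1-D tensor
restored = util_ops.unflatten(flat, bucket)  # tensors shaped like the originals
```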
@@ -15,26 +15,15 @@ import collections
 from deepspeed.runtime.fp16.loss_scaler import LossScaler, DynamicLossScaler
 from deepspeed.runtime.utils import see_memory_usage, is_model_parallel_parameter
 from deepspeed.runtime.zero.config import ZERO_OPTIMIZATION_GRADIENTS
+from deepspeed.ops.adam import DeepSpeedCPUAdam
 from deepspeed.utils import logger
+from ...ops.op_builder import UtilsBuilder

 #Toggle this to true to enable correctness test
 #with gradient partitioning and without
 pg_correctness_test = False

-try:
-    from apex_C import flatten
-    from apex_C import unflatten
-except ImportError:
-    try:
-        _ = warned_flatten
-    except NameError:
-        logger.warning(
-            "apex was installed without --cpp_ext. Falling back to Python flatten and unflatten."
-        )
-        warned_flatten = True
-    from torch._utils import _flatten_dense_tensors as flatten
-    from torch._utils import _unflatten_dense_tensors as unflatten

 def input(msg):
     return
@@ -132,6 +121,11 @@ class FP16_DeepSpeedZeroOptimizer(object):
                  gradient_predivide_factor=1.0,
                  gradient_accumulation_steps=1):

+        # Load pre-installed or JIT compile (un)flatten ops
+        util_ops = UtilsBuilder().load()
+        self.flatten = util_ops.flatten
+        self.unflatten = util_ops.unflatten
+
         if dist.get_rank() == 0:
             logger.info(f"Reduce bucket size {reduce_bucket_size}")
             logger.info(f"Allgather bucket size {allgather_bucket_size}")
@@ -1053,7 +1047,7 @@ class FP16_DeepSpeedZeroOptimizer(object):
     def allreduce_bucket(self, bucket, allreduce_always_fp32=False, rank=None, log=None):
         rank = None
-        tensor = flatten(bucket)
+        tensor = self.flatten(bucket)

         tensor_to_allreduce = tensor
@@ -1095,7 +1089,7 @@ class FP16_DeepSpeedZeroOptimizer(object):
         with torch.cuda.stream(stream):
             allreduced = self.allreduce_bucket(small_bucket, rank=rank, log=log)
             if rank is None or rank == dist.get_rank(group=self.dp_process_group):
-                for buf, synced in zip(small_bucket, unflatten(allreduced, small_bucket)):
+                for buf, synced in zip(small_bucket, self.unflatten(allreduced, small_bucket)):
                     buf.copy_(synced)

     def allreduce_no_retain(self,
...
 import torch
 import torch.distributed as dist
-import apex
 from deepspeed.utils import logger
 from deepspeed.ops.adam import DeepSpeedCPUAdam
+from deepspeed.ops.adam import FusedAdam

 def _initialize_parameter_parallel_groups(parameter_parallel_size=None):
@@ -23,11 +23,14 @@ def _initialize_parameter_parallel_groups(parameter_parallel_size=None):
     return my_group

-ZERO_SUPPORTED_OPTIMIZERS = [
-    torch.optim.Adam,
-    apex.optimizers.FusedAdam,
-    DeepSpeedCPUAdam
-]
+ZERO_SUPPORTED_OPTIMIZERS = [torch.optim.Adam, FusedAdam, DeepSpeedCPUAdam]
+
+# Add apex FusedAdam to supported list if apex is installed
+try:
+    import apex
+    ZERO_SUPPORTED_OPTIMIZERS.append(apex.optimizers.FusedAdam)
+except ImportError:
+    pass

 def is_zero_supported_optimizer(optimizer):
...
@@ -30,6 +30,7 @@ collections:
     output: true
     permalink: /:collection/:path/
     order:
+      - advanced-install.md
       - getting-started.md
       - azure.md
       - cifar-10.md
...
---
title: "Installation Details"
date: 2020-10-28
---
The quickest way to get started with DeepSpeed is via pip; this installs the
latest release of DeepSpeed, which is not tied to specific PyTorch or CUDA
versions. DeepSpeed includes several C++/CUDA extensions that we commonly refer
to as our 'ops'. By default, all of these extensions/ops are built
just-in-time (JIT) using [torch's JIT C++ extension loader, which relies on
ninja](https://pytorch.org/docs/stable/cpp_extension.html), to build and
dynamically link them at runtime.
```bash
pip install deepspeed
```
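Nothing is compiled at `pip install` time on this path; each op is built the first time it is requested. As a rough sketch of what happens under the hood, using the op builder API introduced in this release (the exact call site varies by op):

```python
from deepspeed.ops.op_builder import FusedAdamBuilder

# Returns the pre-installed extension module if it was built at install time,
# otherwise JIT compiles the C++/CUDA sources with ninja and imports the result.
fused_adam_module = FusedAdamBuilder().load()
```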
After installation, you can validate your install and see which ops your machine
is compatible with via the DeepSpeed environment report, `ds_report` or
`python -m deepspeed.env_report`. We have found this report useful when debugging
DeepSpeed install or compatibility issues.
```bash
ds_report
```
## Install DeepSpeed from source
After cloning the DeepSpeed repo from github you can install DeepSpeed in
JIT mode via pip (see below). This install should complete
quickly since it is not compiling any C++/CUDA source files.
```bash
pip install .
```
For installs spanning multiple nodes, we find it useful to install DeepSpeed
using the
[install.sh](https://github.com/microsoft/DeepSpeed/blob/master/install.sh)
script in the repo. This builds a Python wheel locally and copies it to all
the nodes listed in your hostfile (either given via `--hostfile`, or defaulting to
`/job/hostfile`).
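The hostfile here is the same MPI-style hostfile used by the DeepSpeed launcher. As a hedged example, where the hostnames and slot counts are placeholders for your own cluster:

```
worker-1 slots=8
worker-2 slots=8
```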
## Pre-install DeepSpeed Ops
Sometimes we have found it useful to pre-install either some or all DeepSpeed
C++/CUDA ops instead of using the JIT compiled path. In order to support
pre-installation we introduce build environment flags to turn on/off building
specific ops.
You can indicate to our installer (either install.sh or pip install) that you
want to attempt to install all of our ops by setting the `DS_BUILD_OPS`
environment variable to 1, for example:
```bash
DS_BUILD_OPS=1 pip install .
```
We will only install ops that are compatible with your machine; for more
details on which ops are compatible with your system, please try our `ds_report`
tool described above.
If you want to install only a specific op (e.g., FusedLamb), you can view the
op-specific build environment variable (set as `BUILD_VAR`) in the corresponding
op builder class in the
[op\_builder](https://github.com/microsoft/DeepSpeed/tree/master/op_builder)
directory. For example, to install only the FusedLamb op you would install via:
```bash
DS_BUILD_FUSED_LAMB=1 pip install .
```
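The `BUILD_VAR` for each op is defined on its builder class. For example, the CPU Adam builder added in this release (shown in full further below) declares:

```python
class CPUAdamBuilder(CUDAOpBuilder):
    BUILD_VAR = "DS_BUILD_CPU_ADAM"
    NAME = "cpu_adam"
```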
## Feature specific dependencies
Some DeepSpeed features require specific dependencies outside of the general
dependencies of DeepSpeed.
* Python package dependencies per feature/op please
see our [requirements
directory](https://github.com/microsoft/DeepSpeed/tree/master/requirements).
* We attempt to keep the system level dependencies to a minimum, however some features do require special system-level packages. Please see our `ds_report` tool output to see if you are missing any system-level packages for a given feature.
## Pre-compiled DeepSpeed builds from PyPI
Coming soon
@@ -7,9 +7,9 @@ date: 2020-05-15
 ## Installation
+* Installing is as simple as `pip install deepspeed`, [see more details](/tutorials/advanced-install/).
 * Please see our [Azure tutorial](/tutorials/azure/) to get started with DeepSpeed on Azure!
 * If you're not on Azure, we recommend using our docker image via `docker pull deepspeed/deepspeed:latest` which contains a pre-installed version of DeepSpeed and all the necessary dependencies.
-* If you want to install DeepSpeed manually, we provide an install script `install.sh` to help install on a local machine or across an entire cluster.

 ## Writing DeepSpeed Models
 DeepSpeed model training is accomplished using the DeepSpeed engine. The engine
...
@@ -28,8 +28,9 @@ initiative to enable next-generation AI capabilities at scale, where you can fin
 information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale).

 # What's New?
+* [2020/11/12] [Simplified install, JIT compiled ops, PyPI releases, and reduced dependencies](#installation)
-* [2020/10/28] [Efficient and robust compressed training through progressive layer dropping](https://www.deepspeed.ai/news/2020/10/28/progressive-layer-dropping-news.html)
+* [2020/11/10] [Efficient and robust compressed training through progressive layer dropping](https://www.deepspeed.ai/news/2020/10/28/progressive-layer-dropping-news.html)
-* [DeepSpeed: Extreme-scale model training for everyone]({{ site.press_release_v3 }})
+* [2020/09/10] [DeepSpeed v0.3: Extreme-scale model training for everyone]({{ site.press_release_v3 }})
 * [Powering 10x longer sequences and 6x faster execution through DeepSpeed Sparse Attention](https://www.deepspeed.ai/news/2020/09/08/sparse-attention-news.html)
 * [Training a trillion parameters with pipeline parallelism](https://www.deepspeed.ai/news/2020/09/08/pipeline-parallelism.html)
 * [Up to 5x less communication and 3.4x faster training through 1-bit Adam](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-news.html)
...
@@ -15,16 +15,13 @@ By default will install deepspeed and all third party dependecies accross all ma
 hostfile (hostfile: /job/hostfile). If no hostfile exists, will only install locally
 [optional]
-    -d, --deepspeed_only    Install only deepspeed and no third party dependencies
-    -t, --third_party_only  Install only third party dependencies and not deepspeed
     -l, --local_only        Install only on local machine
     -s, --pip_sudo          Run pip install with sudo (default: no sudo)
     -r, --allow_sudo        Allow script to be run by root (probably don't want this, instead use --pip_sudo)
     -n, --no_clean          Do not clean prior build state, by default prior build files are removed before building wheels
     -m, --pip_mirror        Use the specified pip mirror (default: the default pip mirror)
     -H, --hostfile          Path to MPI-style hostfile (default: /job/hostfile)
-    -a, --apex_commit       Install a specific commit hash of apex, instead of the one deepspeed points to
-    -k, --skip_requirements Skip installing DeepSpeed requirements
+    -v, --verbose           Verbose logging
     -h, --help              This help text
 """
 }
@@ -42,27 +39,12 @@ apex_commit=""
 skip_requirements=0
 allow_sudo=0
 no_clean=0
+verbose=0

 while [[ $# -gt 0 ]]
 do
 key="$1"
 case $key in
-    -d|--deepspeed_only)
-    deepspeed_install=1;
-    third_party_install=0;
-    ds_only=1;
-    shift
-    ;;
-    -t|--third_party_only)
-    deepspeed_install=0;
-    third_party_install=1;
-    tp_only=1;
-    shift
-    ;;
     -l|--local_only)
     local_only=1;
     shift
     ;;
     -s|--pip_sudo)
     pip_sudo=1;
     shift
@@ -72,13 +54,8 @@ case $key in
     shift
     shift
     ;;
-    -a|--apex_commit)
-    apex_commit=$2;
-    shift
-    shift
-    ;;
-    -k|--skip_requirements)
-    skip_requirements=1;
+    -v|--verbose)
+    verbose=1;
     shift
     ;;
     -r|--allow_sudo)
@@ -126,12 +103,18 @@ if [ "$ds_only" == "1" ] && [ "$tp_only" == "1" ]; then
     exit 1
 fi

+if [ "$verbose" == "1" ]; then
+    VERBOSE="-v"
+else
+    VERBOSE=""
+fi

 rm_if_exist() {
     echo "Attempting to remove $1"
     if [ -f $1 ]; then
-        rm -v $1
+        rm $VERBOSE $1
     elif [ -d $1 ]; then
-        rm -vr $1
+        rm -r $VERBOSE $1
     fi
 }
@@ -141,10 +124,6 @@ if [ "$no_clean" == "0" ]; then
     rm_if_exist dist
     rm_if_exist build
     rm_if_exist deepspeed.egg-info
-    # remove apex build files
-    rm_if_exist third_party/apex/dist
-    rm_if_exist third_party/apex/build
-    rm_if_exist third_party/apex/apex.egg-info
 fi

 if [ "$pip_sudo" == "1" ]; then
@@ -154,60 +133,25 @@ else
 fi

 if [ "$pip_mirror" != "" ]; then
-    PIP_INSTALL="pip install -v -i $pip_mirror"
+    PIP_INSTALL="pip install $VERBOSE -i $pip_mirror"
 else
-    PIP_INSTALL="pip install -v"
+    PIP_INSTALL="pip install $VERBOSE"
 fi

 if [ ! -f $hostfile ]; then
     echo "No hostfile exists at $hostfile, installing locally"
     local_only=1
 fi

-if [ "$skip_requirements" == "0" ]; then
-    # Ensure dependencies are installed locally
-    $PIP_SUDO $PIP_INSTALL -r requirements/requirements.txt
-fi
-
-# Build wheels
-if [ "$third_party_install" == "1" ]; then
-    echo "Checking out sub-module(s)"
-    git submodule update --init --recursive
-
-    echo "Building apex wheel"
-    cd third_party/apex
-    if [ "$apex_commit" != "" ]; then
-        echo "Installing a non-standard version of apex at commit: $apex_commit"
-        git fetch
-        git checkout $apex_commit
-    fi
-    python setup.py -v --cpp_ext --cuda_ext bdist_wheel
-    cd -
-
-    echo "Installing apex locally so that deepspeed will build"
-    $PIP_SUDO pip uninstall -y apex
-    $PIP_SUDO $PIP_INSTALL third_party/apex/dist/apex*.whl
-fi
-
-if [ "$deepspeed_install" == "1" ]; then
-    echo "Building deepspeed wheel"
-    python setup.py -v bdist_wheel
-fi
+echo "Building deepspeed wheel"
+python setup.py $VERBOSE bdist_wheel

 if [ "$local_only" == "1" ]; then
-    if [ "$deepspeed_install" == "1" ]; then
-        echo "Installing deepspeed"
-        $PIP_SUDO pip uninstall -y deepspeed
-        $PIP_SUDO $PIP_INSTALL dist/deepspeed*.whl
-        # -I to exclude local directory files
-        python -I basic_install_test.py
-        if [ $? == 0 ]; then
-            echo "Installation is successful"
-        else
-            echo "Installation failed"
-        fi
-    fi
+    echo "Installing deepspeed"
+    $PIP_SUDO pip uninstall -y deepspeed
+    $PIP_SUDO $PIP_INSTALL dist/deepspeed*.whl
+    ds_report
 else
     local_path=`pwd`
     if [ -f $hostfile ]; then
@@ -216,28 +160,16 @@ else
         echo "hostfile not found, cannot proceed"
         exit 1
     fi
-    export PDSH_RCMD_TYPE=ssh;
+    export PDSH_RCMD_TYPE=ssh
     tmp_wheel_path="/tmp/deepspeed_wheels"

     pdsh -w $hosts "if [ -d $tmp_wheel_path ]; then rm $tmp_wheel_path/*.whl; else mkdir -pv $tmp_wheel_path; fi"
     pdcp -w $hosts requirements/requirements.txt ${tmp_wheel_path}/
-    if [ "$skip_requirements" == "0" ]; then
-        pdsh -w $hosts "$PIP_SUDO $PIP_INSTALL -r ${tmp_wheel_path}/requirements.txt"
-    fi
-    if [ "$third_party_install" == "1" ]; then
-        pdsh -w $hosts "$PIP_SUDO pip uninstall -y apex"
-        pdcp -w $hosts third_party/apex/dist/apex*.whl $tmp_wheel_path/
-        pdsh -w $hosts "$PIP_SUDO $PIP_INSTALL $tmp_wheel_path/apex*.whl"
-        pdsh -w $hosts 'python -c "import apex"'
-    fi
-    if [ "$deepspeed_install" == "1" ]; then
-        echo "Installing deepspeed"
-        pdsh -w $hosts "$PIP_SUDO pip uninstall -y deepspeed"
-        pdcp -w $hosts dist/deepspeed*.whl $tmp_wheel_path/
-        pdcp -w $hosts basic_install_test.py $tmp_wheel_path/
-        pdsh -w $hosts "$PIP_SUDO $PIP_INSTALL $tmp_wheel_path/deepspeed*.whl"
-        pdsh -w $hosts "python $tmp_wheel_path/basic_install_test.py"
-        echo "Installation is successful"
-    fi
-    pdsh -w $hosts "if [ -d $tmp_wheel_path ]; then rm $tmp_wheel_path/*.whl $tmp_wheel_path/basic_install_test.py; rmdir $tmp_wheel_path; fi"
+    echo "Installing deepspeed"
+    pdsh -w $hosts "$PIP_SUDO pip uninstall -y deepspeed"
+    pdcp -w $hosts dist/deepspeed*.whl $tmp_wheel_path/
+    pdsh -w $hosts "$PIP_SUDO $PIP_INSTALL $tmp_wheel_path/deepspeed*.whl"
+    pdsh -w $hosts "ds_report"
+    pdsh -w $hosts "if [ -d $tmp_wheel_path ]; then rm $tmp_wheel_path/*.whl; rmdir $tmp_wheel_path; fi"
 fi
from .cpu_adam import CPUAdamBuilder
from .fused_adam import FusedAdamBuilder
from .fused_lamb import FusedLambBuilder
from .sparse_attn import SparseAttnBuilder
from .transformer import TransformerBuilder
from .stochastic_transformer import StochasticTransformerBuilder
from .utils import UtilsBuilder
# TODO: infer this list instead of hard coded
# List of all available ops
__op_builders__ = [
    CPUAdamBuilder(),
    FusedAdamBuilder(),
    FusedLambBuilder(),
    SparseAttnBuilder(),
    TransformerBuilder(),
    StochasticTransformerBuilder(),
    UtilsBuilder()
]

ALL_OPS = {op.name: op for op in __op_builders__}
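A small sketch of how this registry can be consumed, e.g. by an environment report; `name` and `is_compatible()` are part of the `OpBuilder` interface defined below:

```python
from deepspeed.ops.op_builder import ALL_OPS

# Report which ops could be built on this machine.
for op_name, builder in ALL_OPS.items():
    print(f"{op_name}: compatible={builder.is_compatible()}")
```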
import os
import time
import torch
import importlib
from pathlib import Path
import subprocess
from abc import ABC, abstractmethod

YELLOW = '\033[93m'
END = '\033[0m'
WARNING = f"{YELLOW} [WARNING] {END}"

DEFAULT_TORCH_EXTENSION_PATH = "/tmp/torch_extensions"


def assert_no_cuda_mismatch():
    import torch.utils.cpp_extension
    cuda_home = torch.utils.cpp_extension.CUDA_HOME
    assert cuda_home is not None, "CUDA_HOME does not exist, unable to compile CUDA op(s)"
    # Ensure there is not a cuda version mismatch between torch and nvcc compiler
    output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"],
                                     universal_newlines=True)
    output_split = output.split()
    release_idx = output_split.index("release")
    release = output_split[release_idx + 1].replace(',', '').split(".")
    # Ignore patch versions, only look at major + minor
    installed_cuda_version = ".".join(release[:2])
    torch_cuda_version = ".".join(torch.version.cuda.split('.')[:2])
    # This is a show-stopping error, should probably not proceed past this
    if installed_cuda_version != torch_cuda_version:
        raise Exception(
            f"Installed CUDA version {installed_cuda_version} does not match the "
            f"version torch was compiled with {torch.version.cuda}, unable to compile "
            "cuda/cpp extensions without a matching cuda version.")


def assert_torch_info(torch_info):
    install_torch_version = torch_info['version']
    install_cuda_version = torch_info['cuda_version']

    current_cuda_version = ".".join(torch.version.cuda.split('.')[:2])
    current_torch_version = ".".join(torch.__version__.split('.')[:2])

    if install_cuda_version != current_cuda_version or install_torch_version != current_torch_version:
        raise RuntimeError(
            "PyTorch and CUDA version mismatch! DeepSpeed ops were compiled and installed "
            "with a different version than what is being used at runtime. Please re-install "
            f"DeepSpeed or switch torch versions. DeepSpeed install versions: "
            f"torch={install_torch_version}, cuda={install_cuda_version}, runtime versions:"
            f"torch={current_torch_version}, cuda={current_cuda_version}")
class OpBuilder(ABC):
    def __init__(self, name):
        self.name = name
        self.jit_mode = False

    @abstractmethod
    def absolute_name(self):
        '''
        Returns absolute build path for cases where the op is pre-installed, e.g., deepspeed.ops.adam.cpu_adam
        will be installed as something like: deepspeed/ops/adam/cpu_adam.so
        '''
        pass

    @abstractmethod
    def sources(self):
        '''
        Returns list of source files for your op, relative to root of deepspeed package (i.e., DeepSpeed/deepspeed)
        '''
        pass

    def include_paths(self):
        '''
        Returns list of include paths, relative to root of deepspeed package (i.e., DeepSpeed/deepspeed)
        '''
        return []

    def nvcc_args(self):
        '''
        Returns optional list of compiler flags to forward to nvcc when building CUDA sources
        '''
        return []

    def cxx_args(self):
        '''
        Returns optional list of compiler flags to forward to the build
        '''
        return []

    def is_compatible(self):
        '''
        Check if all non-python dependencies are satisfied to build this op
        '''
        return True

    def python_requirements(self):
        '''
        Override if op wants to define special dependencies, otherwise will
        take self.name and load requirements-<op-name>.txt if it exists.
        '''
        path = f'requirements/requirements-{self.name}.txt'
        requirements = []
        if os.path.isfile(path):
            with open(path, 'r') as fd:
                requirements = [r.strip() for r in fd.readlines()]
        return requirements

    def command_exists(self, cmd):
        if '|' in cmd:
            cmds = cmd.split("|")
        else:
            cmds = [cmd]
        valid = False
        for cmd in cmds:
            result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True)
            valid = valid or result.wait() == 0

        if not valid and len(cmds) > 1:
            print(
                f"{WARNING} {self.name} requires one of the following commands '{cmds}', but it does not exist!"
            )
        elif not valid and len(cmds) == 1:
            print(
                f"{WARNING} {self.name} requires the '{cmd}' command, but it does not exist!"
            )
        return valid

    def warning(self, msg):
        print(f"{WARNING} {msg}")

    def deepspeed_src_path(self, code_path):
        if os.path.isabs(code_path):
            return code_path
        else:
            return os.path.join(Path(__file__).parent.parent.absolute(), code_path)

    def builder(self):
        from torch.utils.cpp_extension import CppExtension
        return CppExtension(name=self.absolute_name(),
                            sources=self.sources(),
                            include_dirs=self.include_paths(),
                            extra_compile_args={'cxx': self.cxx_args()})

    def load(self, verbose=True):
        from ...git_version_info import installed_ops, torch_info
        if installed_ops[self.name]:
            # Ensure the op we're about to load was compiled with the same
            # torch/cuda versions we are currently using at runtime.
            if isinstance(self, CUDAOpBuilder):
                assert_torch_info(torch_info)
            return importlib.import_module(self.absolute_name())
        else:
            return self.jit_load(verbose)

    def jit_load(self, verbose=True):
        if not self.is_compatible():
            raise RuntimeError(
                f"Unable to JIT load the {self.name} op due to it not being compatible due to hardware/software issue."
            )
        try:
            import ninja
        except ImportError:
            raise RuntimeError(
                f"Unable to JIT load the {self.name} op due to ninja not being installed."
            )

        if isinstance(self, CUDAOpBuilder):
            assert_no_cuda_mismatch()

        self.jit_mode = True
        from torch.utils.cpp_extension import load

        # Ensure directory exists to prevent race condition in some cases
        ext_path = os.path.join(
            os.environ.get('TORCH_EXTENSIONS_DIR',
                           DEFAULT_TORCH_EXTENSION_PATH),
            self.name)
        os.makedirs(ext_path, exist_ok=True)

        start_build = time.time()
        op_module = load(
            name=self.name,
            sources=[self.deepspeed_src_path(path) for path in self.sources()],
            extra_include_paths=[
                self.deepspeed_src_path(path) for path in self.include_paths()
            ],
            extra_cflags=self.cxx_args(),
            extra_cuda_cflags=self.nvcc_args(),
            verbose=verbose)
        build_duration = time.time() - start_build
        if verbose:
            print(f"Time to load {self.name} op: {build_duration} seconds")
        return op_module
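For reference, a concrete builder is used the same way regardless of op; a minimal sketch mirroring the FusedLamb change earlier in this commit:

```python
from deepspeed.ops.op_builder import FusedLambBuilder

# Imports the pre-installed lamb extension if present,
# otherwise JIT compiles it with ninja on first use.
fused_lamb_cuda = FusedLambBuilder().load()
```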
class CUDAOpBuilder(OpBuilder):
    def compute_capability_args(self, cross_compile_archs=['60', '61', '70']):
        args = []
        if self.jit_mode:
            # Compile for underlying architecture since we know it at runtime
            CC_MAJOR, CC_MINOR = torch.cuda.get_device_capability()
            compute_capability = f"{CC_MAJOR}{CC_MINOR}"
            args.append('-gencode')
            args.append(
                f'arch=compute_{compute_capability},code=compute_{compute_capability}')
        else:
            # Cross-compile mode, compile for various architectures
            for compute_capability in cross_compile_archs:
                args.append('-gencode')
                args.append(
                    f'arch=compute_{compute_capability},code=compute_{compute_capability}'
                )
        return args

    def version_dependent_macros(self):
        # Fix from apex that might be relevant for us as well, related to https://github.com/NVIDIA/apex/issues/456
        TORCH_MAJOR = int(torch.__version__.split('.')[0])
        TORCH_MINOR = int(torch.__version__.split('.')[1])
        version_ge_1_1 = []
        if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0):
            version_ge_1_1 = ['-DVERSION_GE_1_1']
        version_ge_1_3 = []
        if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2):
            version_ge_1_3 = ['-DVERSION_GE_1_3']
        version_ge_1_5 = []
        if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4):
            version_ge_1_5 = ['-DVERSION_GE_1_5']
        return version_ge_1_1 + version_ge_1_3 + version_ge_1_5

    def is_compatible(self):
        return super().is_compatible()

    def builder(self):
        from torch.utils.cpp_extension import CUDAExtension
        assert_no_cuda_mismatch()
        return CUDAExtension(name=self.absolute_name(),
                             sources=self.sources(),
                             include_dirs=self.include_paths(),
                             extra_compile_args={
                                 'cxx': self.cxx_args(),
                                 'nvcc': self.nvcc_args()
                             })
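As a hedged illustration of what `compute_capability_args` produces (actual values depend on your GPU and on the `cross_compile_archs` default above):

```python
from deepspeed.ops.op_builder import CPUAdamBuilder

builder = CPUAdamBuilder()   # any CUDAOpBuilder subclass works here
builder.jit_mode = False     # cross-compile mode (the default); no GPU query needed
print(builder.compute_capability_args())
# ['-gencode', 'arch=compute_60,code=compute_60',
#  '-gencode', 'arch=compute_61,code=compute_61',
#  '-gencode', 'arch=compute_70,code=compute_70']
```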
import os
import torch
import warnings
from .builder import CUDAOpBuilder


class CPUAdamBuilder(CUDAOpBuilder):
    BUILD_VAR = "DS_BUILD_CPU_ADAM"
    NAME = "cpu_adam"

    def __init__(self):
        super().__init__(name=self.NAME)

    def absolute_name(self):
        return f'deepspeed.ops.adam.{self.NAME}_op'

    def sources(self):
        return ['csrc/adam/cpu_adam.cpp', 'csrc/adam/custom_cuda_kernel.cu']

    def include_paths(self):
        CUDA_INCLUDE = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")
        return ['csrc/includes', CUDA_INCLUDE]

    def available_vector_instructions(self):
        try:
            import cpufeature
        except ImportError:
            warnings.warn(
                'import cpufeature failed - CPU vector optimizations are not available for CPUAdam'
            )
            return {}

        cpu_vector_instructions = {}
        try:
            cpu_vector_instructions = cpufeature.CPUFeature
        except Exception:
            warnings.warn(
                'cpufeature.CPUFeature failed - CPU vector optimizations are not available for CPUAdam'
            )
            return {}
        return cpu_vector_instructions

    def cxx_args(self):
        CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64")
        cpu_info = self.available_vector_instructions()
        SIMD_WIDTH = ''
        if 'Intel' in cpu_info.get('VendorId', ''):
            if cpu_info.get('AVX512f', False):
                SIMD_WIDTH = '-D__AVX512__'
            elif cpu_info.get('AVX2', False):
                SIMD_WIDTH = '-D__AVX256__'
        return [
            '-O3',
            '-std=c++14',
            f'-L{CUDA_LIB64}',
            '-lcudart',
            '-lcublas',
            '-g',
            '-Wno-reorder',
            '-march=native',
            '-fopenmp',
            SIMD_WIDTH
        ]

    def nvcc_args(self):
        args = [
            '-O3',
            '--use_fast_math',
            '-std=c++14',
            '-U__CUDA_NO_HALF_OPERATORS__',
            '-U__CUDA_NO_HALF_CONVERSIONS__',
            '-U__CUDA_NO_HALF2_OPERATORS__'
        ]
        args += self.compute_capability_args()
        return args