Unverified commit 31f46fee authored by Jeff Rasley, committed by GitHub

DeepSpeed JIT op + PyPI support (#496)


Co-authored-by: Shaden Smith <Shaden.Smith@microsoft.com>
Co-authored-by: Reza Yazdani <reyazda@microsoft.com>
parent 0ad4fd88
import torch
from .builder import CUDAOpBuilder


class FusedAdamBuilder(CUDAOpBuilder):
    BUILD_VAR = "DS_BUILD_FUSED_ADAM"
    NAME = "fused_adam"

    def __init__(self):
        super().__init__(name=self.NAME)

    def absolute_name(self):
        return f'deepspeed.ops.adam.{self.NAME}_op'

    def sources(self):
        return ['csrc/adam/fused_adam_frontend.cpp', 'csrc/adam/multi_tensor_adam.cu']

    def include_paths(self):
        return ['csrc/includes']

    def cxx_args(self):
        return ['-O3'] + self.version_dependent_macros()

    def nvcc_args(self):
        return ['-lineinfo', '-O3', '--use_fast_math'] + self.version_dependent_macros()


import torch
from .builder import CUDAOpBuilder


class FusedLambBuilder(CUDAOpBuilder):
    BUILD_VAR = 'DS_BUILD_FUSED_LAMB'
    NAME = "fused_lamb"

    def __init__(self):
        super().__init__(name=self.NAME)

    def absolute_name(self):
        return f'deepspeed.ops.lamb.{self.NAME}_op'

    def sources(self):
        return ['csrc/lamb/fused_lamb_cuda.cpp', 'csrc/lamb/fused_lamb_cuda_kernel.cu']

    def include_paths(self):
        return ['csrc/includes']

    def cxx_args(self):
        return ['-O3'] + self.version_dependent_macros()

    def nvcc_args(self):
        return ['-lineinfo', '-O3', '--use_fast_math'] + self.version_dependent_macros()


import torch
import warnings
from .builder import OpBuilder


class SparseAttnBuilder(OpBuilder):
    BUILD_VAR = "DS_BUILD_SPARSE_ATTN"
    NAME = "sparse_attn"

    def __init__(self):
        super().__init__(name=self.NAME)

    def absolute_name(self):
        return f'deepspeed.ops.sparse_attention.{self.NAME}_op'

    def sources(self):
        return ['csrc/sparse_attention/utils.cpp']

    def cxx_args(self):
        return ['-O2', '-fopenmp']

    def is_compatible(self):
        # Check to see if llvm and cmake are installed since they are dependencies
        required_commands = ['llvm-config|llvm-config-9', 'cmake']
        command_status = list(map(self.command_exists, required_commands))
        deps_compatible = all(command_status)

        TORCH_MAJOR = int(torch.__version__.split('.')[0])
        TORCH_MINOR = int(torch.__version__.split('.')[1])
        torch_compatible = TORCH_MAJOR == 1 and TORCH_MINOR >= 5
        if not torch_compatible:
            self.warning(
                f'{self.NAME} requires a torch version >= 1.5 but detected {TORCH_MAJOR}.{TORCH_MINOR}'
            )

        return super().is_compatible() and deps_compatible and torch_compatible


import torch
from .transformer import TransformerBuilder


class StochasticTransformerBuilder(TransformerBuilder):
    BUILD_VAR = "DS_BUILD_STOCHASTIC_TRANSFORMER"
    NAME = "stochastic_transformer"

    def __init__(self):
        super().__init__(name=self.NAME)

    def absolute_name(self):
        return f'deepspeed.ops.transformer.{self.NAME}_op'

    def nvcc_args(self):
        args = super().nvcc_args()
        args.append('-D__STOCHASTIC_MODE__')
        return args


import torch
from .builder import CUDAOpBuilder


class TransformerBuilder(CUDAOpBuilder):
    BUILD_VAR = "DS_BUILD_TRANSFORMER"
    NAME = "transformer"

    def __init__(self, name=None):
        name = self.NAME if name is None else name
        super().__init__(name=name)

    def absolute_name(self):
        return f'deepspeed.ops.transformer.{self.NAME}_op'

    def sources(self):
        return [
            'csrc/transformer/ds_transformer_cuda.cpp',
            'csrc/transformer/cublas_wrappers.cu',
            'csrc/transformer/transform_kernels.cu',
            'csrc/transformer/gelu_kernels.cu',
            'csrc/transformer/dropout_kernels.cu',
            'csrc/transformer/normalize_kernels.cu',
            'csrc/transformer/softmax_kernels.cu',
            'csrc/transformer/general_kernels.cu'
        ]

    def include_paths(self):
        return ['csrc/includes']

    def nvcc_args(self):
        args = [
            '-O3',
            '--use_fast_math',
            '-std=c++14',
            '-U__CUDA_NO_HALF_OPERATORS__',
            '-U__CUDA_NO_HALF_CONVERSIONS__',
            '-U__CUDA_NO_HALF2_OPERATORS__'
        ]
        return args + self.compute_capability_args()

    def cxx_args(self):
        return ['-O3', '-std=c++14', '-g', '-Wno-reorder']


from .builder import OpBuilder


class UtilsBuilder(OpBuilder):
    BUILD_VAR = "DS_BUILD_UTILS"
    NAME = "utils"

    def __init__(self):
        super().__init__(name=self.NAME)

    def absolute_name(self):
        return f'deepspeed.ops.{self.NAME}_op'

    def sources(self):
        return ['csrc/utils/flatten_unflatten.cpp']
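The builders above share one pattern: NAME identifies the op, BUILD_VAR is the environment variable that forces a pre-build, sources()/include_paths()/cxx_args()/nvcc_args() describe how to compile it, and is_compatible() gates whether the build should be attempted at all. A minimal sketch of how a builder is consumed, assuming the OpBuilder base class exposes builder() for ahead-of-time extensions (as setup.py below uses) and a load() helper for JIT compilation at first use:

from deepspeed.ops.op_builder import FusedAdamBuilder

builder = FusedAdamBuilder()

if builder.is_compatible():
    # Ahead-of-time path: setup.py turns the builder into a CUDAExtension
    # when DS_BUILD_OPS=1 (or DS_BUILD_FUSED_ADAM=1) is set at install time.
    ext = builder.builder()

    # JIT path: compile and import the op the first time it is needed.
    # load() is assumed here to wrap torch.utils.cpp_extension.load() over the
    # sources()/include_paths() declared above; the built module is cached.
    fused_adam_module = builder.load()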
@@ -2,5 +2,6 @@ torch>=1.2
 torchvision>=0.4.0
 tqdm
 psutil
-cpufeature
 tensorboardX==1.8
+ninja
+cpufeature
@@ -16,7 +16,7 @@ import warnings
 from setuptools import setup, find_packages
 from torch.utils.cpp_extension import CUDAExtension, BuildExtension, CppExtension

-VERSION = "0.3.0"
+import op_builder

 def fetch_requirements(path):
@@ -24,88 +24,33 @@ def fetch_requirements(path):
         return [r.strip() for r in fd.readlines()]

-def available_vector_instructions():
-    try:
-        import cpufeature
-    except ImportError:
-        warnings.warn(
-            f'import cpufeature failed - CPU vector optimizations are not available for CPUAdam'
-        )
-        return {}
-    cpu_vector_instructions = {}
-    try:
-        cpu_vector_instructions = cpufeature.CPUFeature
-    except _:
-        warnings.warn(
-            f'cpufeature.CPUFeature failed - CPU vector optimizations are not available for CPUAdam'
-        )
-        return {}
-    return cpu_vector_instructions
-
 install_requires = fetch_requirements('requirements/requirements.txt')
-dev_requires = fetch_requirements('requirements/requirements-dev.txt')
-sparse_attn_requires = fetch_requirements('requirements/requirements-sparse-attn.txt')
+extras_require = {
+    '1bit_adam': fetch_requirements('requirements/requirements-1bit-adam.txt'),
+    'readthedocs': fetch_requirements('requirements/requirements-readthedocs.txt'),
+    'dev': fetch_requirements('requirements/requirements-dev.txt'),
+}

 # If MPI is available add 1bit-adam requirements
 if torch.cuda.is_available():
     if shutil.which('ompi_info') or shutil.which('mpiname'):
-        onebit_adam_requires = fetch_requirements(
-            'requirements/requirements-1bit-adam.txt')
-        onebit_adam_requires.append(f"cupy-cuda{torch.version.cuda.replace('.','')[:3]}")
-        install_requires += onebit_adam_requires
+        cupy = f"cupy-cuda{torch.version.cuda.replace('.','')[:3]}"
+        extras_require['1bit_adam'].append(cupy)

-# Constants for each op
-LAMB = "lamb"
-TRANSFORMER = "transformer"
-SPARSE_ATTN = "sparse-attn"
-CPU_ADAM = "cpu-adam"
-cpu_vector_instructions = available_vector_instructions()
+# Make an [all] extra that installs all needed dependencies
+all_extras = set()
+for extra in extras_require.items():
+    for req in extra[1]:
+        all_extras.add(req)
+extras_require['all'] = list(all_extras)

-# Build environment variables for custom builds
-DS_BUILD_LAMB_MASK = 1
-DS_BUILD_TRANSFORMER_MASK = 10
-DS_BUILD_SPARSE_ATTN_MASK = 100
-DS_BUILD_CPU_ADAM_MASK = 1000
-
-# Allow for build_cuda to turn on or off all ops
-DS_BUILD_ALL_OPS = DS_BUILD_LAMB_MASK | DS_BUILD_TRANSFORMER_MASK | DS_BUILD_SPARSE_ATTN_MASK | DS_BUILD_CPU_ADAM_MASK
-DS_BUILD_CUDA = int(os.environ.get('DS_BUILD_CUDA', 1)) * DS_BUILD_ALL_OPS
-
-# Set default of each op based on if build_cuda is set
-OP_DEFAULT = DS_BUILD_CUDA == DS_BUILD_ALL_OPS
-DS_BUILD_CPU_ADAM = int(os.environ.get('DS_BUILD_CPU_ADAM', 0)) * DS_BUILD_CPU_ADAM_MASK
-DS_BUILD_LAMB = int(os.environ.get('DS_BUILD_LAMB', OP_DEFAULT)) * DS_BUILD_LAMB_MASK
-DS_BUILD_TRANSFORMER = int(os.environ.get('DS_BUILD_TRANSFORMER',
-                                          OP_DEFAULT)) * DS_BUILD_TRANSFORMER_MASK
-DS_BUILD_SPARSE_ATTN = int(os.environ.get('DS_BUILD_SPARSE_ATTN',
-                                          OP_DEFAULT)) * DS_BUILD_SPARSE_ATTN_MASK
-
-# Final effective mask is the bitwise OR of each op
-BUILD_MASK = (DS_BUILD_LAMB | DS_BUILD_TRANSFORMER | DS_BUILD_SPARSE_ATTN
-              | DS_BUILD_CPU_ADAM)
-
-install_ops = dict.fromkeys([LAMB, TRANSFORMER, SPARSE_ATTN, CPU_ADAM], False)
-if BUILD_MASK & DS_BUILD_LAMB:
-    install_ops[LAMB] = True
-if BUILD_MASK & DS_BUILD_CPU_ADAM:
-    install_ops[CPU_ADAM] = True
-if BUILD_MASK & DS_BUILD_TRANSFORMER:
-    install_ops[TRANSFORMER] = True
-if BUILD_MASK & DS_BUILD_SPARSE_ATTN:
-    install_ops[SPARSE_ATTN] = True
-if len(install_ops) == 0:
-    print("Building without any cuda/cpp extensions")
-print(f'BUILD_MASK={BUILD_MASK}, install_ops={install_ops}')

 cmdclass = {}
+# For any pre-installed ops force disable ninja
 cmdclass['build_ext'] = BuildExtension.with_options(use_ninja=False)

-TORCH_MAJOR = int(torch.__version__.split('.')[0])
-TORCH_MINOR = int(torch.__version__.split('.')[1])
+TORCH_MAJOR = torch.__version__.split('.')[0]
+TORCH_MINOR = torch.__version__.split('.')[1]

 if not torch.cuda.is_available():
     # Fix to allow docker buils, similar to https://github.com/NVIDIA/apex/issues/486
@@ -116,230 +61,118 @@ if not torch.cuda.is_available():
     if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None:
         os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5"

-# Fix from apex that might be relevant for us as well, related to https://github.com/NVIDIA/apex/issues/456
-version_ge_1_1 = []
-if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0):
-    version_ge_1_1 = ['-DVERSION_GE_1_1']
-version_ge_1_3 = []
-if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2):
-    version_ge_1_3 = ['-DVERSION_GE_1_3']
-version_ge_1_5 = []
-if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4):
-    version_ge_1_5 = ['-DVERSION_GE_1_5']
-version_dependent_macros = version_ge_1_1 + version_ge_1_3 + version_ge_1_5
-
-SIMD_WIDTH = ''
-if cpu_vector_instructions.get('AVX512f', False):
-    SIMD_WIDTH = '-D__AVX512__'
-elif cpu_vector_instructions.get('AVX2', False):
-    SIMD_WIDTH = '-D__AVX256__'
-print("SIMD_WIDTH = ", SIMD_WIDTH)

 ext_modules = []

-## Lamb ##
-if BUILD_MASK & DS_BUILD_LAMB:
-    ext_modules.append(
-        CUDAExtension(name='deepspeed.ops.lamb.fused_lamb_cuda',
-                      sources=[
-                          'csrc/lamb/fused_lamb_cuda.cpp',
-                          'csrc/lamb/fused_lamb_cuda_kernel.cu'
-                      ],
-                      include_dirs=['csrc/includes'],
-                      extra_compile_args={
-                          'cxx': [
-                              '-O3',
-                          ] + version_dependent_macros,
-                          'nvcc': ['-O3',
-                                   '--use_fast_math'] + version_dependent_macros
-                      }))
-
-## Adam ##
-if BUILD_MASK & DS_BUILD_CPU_ADAM:
-    ext_modules.append(
-        CUDAExtension(name='deepspeed.ops.adam.cpu_adam_op',
-                      sources=[
-                          'csrc/adam/cpu_adam.cpp',
-                          'csrc/adam/custom_cuda_kernel.cu',
-                      ],
-                      include_dirs=['csrc/includes',
-                                    '/usr/local/cuda/include'],
-                      extra_compile_args={
-                          'cxx': [
-                              '-O3',
-                              '-std=c++14',
-                              '-L/usr/local/cuda/lib64',
-                              '-lcudart',
-                              '-lcublas',
-                              '-g',
-                              '-Wno-reorder',
-                              '-march=native',
-                              '-fopenmp',
-                              SIMD_WIDTH
-                          ],
-                          'nvcc': [
-                              '-O3',
-                              '--use_fast_math',
-                              '-gencode',
-                              'arch=compute_61,code=compute_61',
-                              '-gencode',
-                              'arch=compute_70,code=compute_70',
-                              '-std=c++14',
-                              '-U__CUDA_NO_HALF_OPERATORS__',
-                              '-U__CUDA_NO_HALF_CONVERSIONS__',
-                              '-U__CUDA_NO_HALF2_OPERATORS__'
-                          ]
-                      }))
-
-## Transformer ##
-if BUILD_MASK & DS_BUILD_TRANSFORMER:
-    ext_modules.append(
-        CUDAExtension(name='deepspeed.ops.transformer.transformer_cuda',
-                      sources=[
-                          'csrc/transformer/ds_transformer_cuda.cpp',
-                          'csrc/transformer/cublas_wrappers.cu',
-                          'csrc/transformer/transform_kernels.cu',
-                          'csrc/transformer/gelu_kernels.cu',
-                          'csrc/transformer/dropout_kernels.cu',
-                          'csrc/transformer/normalize_kernels.cu',
-                          'csrc/transformer/softmax_kernels.cu',
-                          'csrc/transformer/general_kernels.cu'
-                      ],
-                      include_dirs=['csrc/includes'],
-                      extra_compile_args={
-                          'cxx': ['-O3',
-                                  '-std=c++14',
-                                  '-g',
-                                  '-Wno-reorder'],
-                          'nvcc': [
-                              '-O3',
-                              '--use_fast_math',
-                              '-gencode',
-                              'arch=compute_61,code=compute_61',
-                              '-gencode',
-                              'arch=compute_60,code=compute_60',
-                              '-gencode',
-                              'arch=compute_70,code=compute_70',
-                              '-std=c++14',
-                              '-U__CUDA_NO_HALF_OPERATORS__',
-                              '-U__CUDA_NO_HALF_CONVERSIONS__',
-                              '-U__CUDA_NO_HALF2_OPERATORS__'
-                          ]
-                      }))
-    ext_modules.append(
-        CUDAExtension(name='deepspeed.ops.transformer.stochastic_transformer_cuda',
-                      sources=[
-                          'csrc/transformer/ds_transformer_cuda.cpp',
-                          'csrc/transformer/cublas_wrappers.cu',
-                          'csrc/transformer/transform_kernels.cu',
-                          'csrc/transformer/gelu_kernels.cu',
-                          'csrc/transformer/dropout_kernels.cu',
-                          'csrc/transformer/normalize_kernels.cu',
-                          'csrc/transformer/softmax_kernels.cu',
-                          'csrc/transformer/general_kernels.cu'
-                      ],
-                      include_dirs=['csrc/includes'],
-                      extra_compile_args={
-                          'cxx': ['-O3',
-                                  '-std=c++14',
-                                  '-g',
-                                  '-Wno-reorder'],
-                          'nvcc': [
-                              '-O3',
-                              '--use_fast_math',
-                              '-gencode',
-                              'arch=compute_61,code=compute_61',
-                              '-gencode',
-                              'arch=compute_60,code=compute_60',
-                              '-gencode',
-                              'arch=compute_70,code=compute_70',
-                              '-std=c++14',
-                              '-U__CUDA_NO_HALF_OPERATORS__',
-                              '-U__CUDA_NO_HALF_CONVERSIONS__',
-                              '-U__CUDA_NO_HALF2_OPERATORS__',
-                              '-D__STOCHASTIC_MODE__'
-                          ]
-                      }))
+from op_builder import ALL_OPS
+
+# Default to pre-install kernels to false so we rely on JIT
+BUILD_OP_DEFAULT = int(os.environ.get('DS_BUILD_OPS', 0))
+print(f"DS_BUILD_OPS={BUILD_OP_DEFAULT}")

 def command_exists(cmd):
-    if '|' in cmd:
-        cmds = cmd.split("|")
-    else:
-        cmds = [cmd]
-    valid = False
-    for cmd in cmds:
-        result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True)
-        valid = valid or result.wait() == 0
-    return valid
+    result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True)
+    return result.wait() == 0

-## Sparse transformer ##
-if BUILD_MASK & DS_BUILD_SPARSE_ATTN:
-    # Check to see if llvm and cmake are installed since they are dependencies
-    required_commands = ['llvm-config|llvm-config-9', 'cmake']
-    command_status = list(map(command_exists, required_commands))
-    if not all(command_status):
-        zipped_status = list(zip(required_commands, command_status))
-        warnings.warn(
-            f'Missing non-python requirements, please install the missing packages: {zipped_status}'
-        )
-        warnings.warn(
-            'Skipping sparse attention installation due to missing required packages')
-        # remove from installed ops list
-        install_ops[SPARSE_ATTN] = False
-    elif TORCH_MAJOR == 1 and TORCH_MINOR >= 5:
-        ext_modules.append(
-            CppExtension(name='deepspeed.ops.sparse_attention.cpp_utils',
-                         sources=['csrc/sparse_attention/utils.cpp'],
-                         extra_compile_args={'cxx': ['-O2',
-                                                     '-fopenmp']}))
-        # Add sparse attention requirements
-        install_requires += sparse_attn_requires
-    else:
-        warnings.warn('Unable to meet requirements to install sparse attention')
-        # remove from installed ops list
-        install_ops[SPARSE_ATTN] = False
-
-# Add development requirements
-install_requires += dev_requires
+def op_enabled(op_name):
+    assert hasattr(ALL_OPS[op_name], 'BUILD_VAR'), \
+        f"{op_name} is missing BUILD_VAR field"
+    env_var = ALL_OPS[op_name].BUILD_VAR
+    return int(os.environ.get(env_var, BUILD_OP_DEFAULT))
+
+install_ops = dict.fromkeys(ALL_OPS.keys(), False)
+for op_name, builder in ALL_OPS.items():
+    op_compatible = builder.is_compatible()
+
+    # If op is compatible update install reqs so it can potentially build/run later
+    if op_compatible:
+        reqs = builder.python_requirements()
+        install_requires += builder.python_requirements()
+
+    # If op install enabled, add builder to extensions
+    if op_enabled(op_name) and op_compatible:
+        install_ops[op_name] = op_enabled(op_name)
+        ext_modules.append(builder.builder())
+
+compatible_ops = {op_name: op.is_compatible() for (op_name, op) in ALL_OPS.items()}
+
+print(f'Install Ops={install_ops}')

 # Write out version/git info
 git_hash_cmd = "git rev-parse --short HEAD"
 git_branch_cmd = "git rev-parse --abbrev-ref HEAD"
-if command_exists('git'):
+if command_exists('git') and 'DS_BUILD_STRING' not in os.environ:
+    try:
         result = subprocess.check_output(git_hash_cmd, shell=True)
         git_hash = result.decode('utf-8').strip()
         result = subprocess.check_output(git_branch_cmd, shell=True)
         git_branch = result.decode('utf-8').strip()
+    except subprocess.CalledProcessError:
+        git_hash = "unknown"
+        git_branch = "unknown"
 else:
     git_hash = "unknown"
     git_branch = "unknown"
-print(f"version={VERSION}+{git_hash}, git_hash={git_hash}, git_branch={git_branch}")
+
+# Parse the DeepSpeed version string from version.txt
+version_str = open('version.txt', 'r').read().strip()
+
+# Build specifiers like .devX can be added at install time. Otherwise, add the git hash.
+# example: DS_BUILD_STR=".dev20201022" python setup.py sdist bdist_wheel
+#version_str += os.environ.get('DS_BUILD_STRING', f'+{git_hash}')
+
+# Building wheel for distribution, update version file
+if 'DS_BUILD_STRING' in os.environ:
+    # Build string env specified, probably building for distribution
+    with open('build.txt', 'w') as fd:
+        fd.write(os.environ.get('DS_BUILD_STRING'))
+    version_str += os.environ.get('DS_BUILD_STRING')
+elif os.path.isfile('build.txt'):
+    # build.txt exists, probably installing from distribution
+    with open('build.txt', 'r') as fd:
+        version_str += fd.read().strip()
+else:
+    # None of the above, probably installing from source
+    version_str += f'+{git_hash}'
+
+torch_version = ".".join([TORCH_MAJOR, TORCH_MINOR])
+cuda_version = ".".join(torch.version.cuda.split('.')[:2])
+torch_info = {"version": torch_version, "cuda_version": cuda_version}
+
+print(f"version={version_str}, git_hash={git_hash}, git_branch={git_branch}")

 with open('deepspeed/git_version_info_installed.py', 'w') as fd:
-    fd.write(f"version='{VERSION}+{git_hash}'\n")
+    fd.write(f"version='{version_str}'\n")
     fd.write(f"git_hash='{git_hash}'\n")
     fd.write(f"git_branch='{git_branch}'\n")
     fd.write(f"installed_ops={install_ops}\n")
+    fd.write(f"compatible_ops={compatible_ops}\n")
+    fd.write(f"torch_info={torch_info}\n")

 print(f'install_requires={install_requires}')
+print(f'compatible_ops={compatible_ops}')
+print(f'ext_modules={ext_modules}')

 setup(name='deepspeed',
-      version=f"{VERSION}+{git_hash}",
+      version=version_str,
       description='DeepSpeed library',
       author='DeepSpeed Team',
       author_email='deepspeed@microsoft.com',
       url='http://deepspeed.ai',
       install_requires=install_requires,
+      extras_require=extras_require,
       packages=find_packages(exclude=["docker",
-                                      "third_party",
-                                      "csrc"]),
-      package_data={'deepspeed.ops.sparse_attention.trsrc': ['*.tr']},
-      scripts=['bin/deepspeed',
+                                      "third_party"]),
+      include_package_data=True,
+      scripts=[
+          'bin/deepspeed',
           'bin/deepspeed.pt',
           'bin/ds',
-          'bin/ds_ssh'],
+          'bin/ds_ssh',
+          'bin/ds_report'
+      ],
       classifiers=[
           'Programming Language :: Python :: 3.6',
           'Programming Language :: Python :: 3.7',
...
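With this change, setup.py records which ops were actually pre-built (installed_ops) and which are merely buildable on the current machine (compatible_ops) into deepspeed/git_version_info_installed.py. At runtime this surfaces as deepspeed.ops.__compatible_ops__, which the updated tests index by builder NAME; a minimal sketch of that check:

import deepspeed
from deepspeed.ops.op_builder import FusedLambBuilder, CPUAdamBuilder

# True means the op can be built on this machine (pre-installed or via JIT);
# it does not mean the kernel has already been compiled.
print(deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME])
print(deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME])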
@@ -363,10 +363,18 @@ except ImportError:
             self.variance_epsilon = eps

         def forward(self, x):
+            pdtype = x.dtype
+            x = x.float()
             u = x.mean(-1, keepdim=True)
             s = (x - u).pow(2).mean(-1, keepdim=True)
             x = (x - u) / torch.sqrt(s + self.variance_epsilon)
-            return self.weight * x + self.bias
+            return self.weight * x.to(pdtype) + self.bias
+
+        #def forward(self, x):
+        #    u = x.mean(-1, keepdim=True)
+        #    s = (x - u).pow(2).mean(-1, keepdim=True)
+        #    x = (x - u) / torch.sqrt(s + self.variance_epsilon)
+        #    return self.weight * x + self.bias

     class BertEmbeddings(nn.Module):
...
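The change above keeps the LayerNorm statistics in fp32 even when the activations arrive in fp16, then casts the normalized result back to the original dtype. A minimal standalone sketch of the same pattern (the class name here is illustrative, not from the patch):

import torch
import torch.nn as nn

class FP32StatsLayerNorm(nn.Module):
    """LayerNorm that computes mean/variance in fp32 and casts back to the input dtype."""
    def __init__(self, hidden_size, eps=1e-12):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size))
        self.variance_epsilon = eps

    def forward(self, x):
        pdtype = x.dtype                      # remember the original (possibly fp16) dtype
        x = x.float()                         # do the reductions in fp32 for stability
        u = x.mean(-1, keepdim=True)
        s = (x - u).pow(2).mean(-1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.variance_epsilon)
        return self.weight * x.to(pdtype) + self.bias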
@@ -12,6 +12,8 @@ from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer
 from deepspeed.runtime.pipe.topology import *
 PipeTopo = PipeDataParallelTopology
+from deepspeed.ops.op_builder import FusedLambBuilder, CPUAdamBuilder
+
 import argparse
 import pytest
 import json
@@ -152,8 +154,8 @@ def checkpoint_correctness_verification(args,
     compare_lr_scheduler_states(trained_model, loaded_model)

-@pytest.mark.skipif(not deepspeed.ops.__installed_ops__['lamb'],
-                    reason="lamb is not installed")
+@pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME],
+                    reason="lamb is not compatible")
 def test_checkpoint_unfused_optimizer(tmpdir):
     config_dict = {
         "train_batch_size": 2,
@@ -264,11 +266,11 @@ def test_checkpoint_fused_optimizer(tmpdir):
                              'Adam'),
                          (2,
                           True,
-                          'deepspeed_adam'),
+                          'Adam'),
                          ])
 def test_checkpoint_zero_optimizer(tmpdir, zero_stage, use_cpu_offload, adam_optimizer):
-    if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
-        pytest.skip("cpu-adam is not installed")
+    if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
+        pytest.skip("cpu-adam is not compatible")
     config_dict = {
         "train_batch_size": 2,
@@ -320,14 +322,14 @@ def test_checkpoint_zero_optimizer(tmpdir, zero_stage, use_cpu_offload, adam_opt
                              "Adam"),
                          (2,
                           True,
-                          'deepspeed_adam'),
+                          'Adam'),
                          ])
 def test_checkpoint_zero_no_optimizer(tmpdir,
                                       zero_stage,
                                       use_cpu_offload,
                                       adam_optimizer):
-    if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
-        pytest.skip("cpu-adam is not installed")
+    if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
+        pytest.skip("cpu-adam is not compatible")
     config_dict = {
         "train_batch_size": 2,
@@ -385,11 +387,11 @@ def test_checkpoint_zero_no_optimizer(tmpdir,
                              'Adam'),
                          (2,
                           True,
-                          'deepspeed_adam'),
+                          'Adam'),
                          ])
 def test_checkpoint_lr_scheduler(tmpdir, zero_stage, use_cpu_offload, adam_optimizer):
-    if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
-        pytest.skip("cpu-adam is not installed")
+    if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
+        pytest.skip("cpu-adam is not compatible")
     config_dict = {
         "train_batch_size": 2,
@@ -459,11 +461,11 @@ def test_checkpoint_lr_scheduler(tmpdir, zero_stage, use_cpu_offload, adam_optim
                              'Adam'),
                          (2,
                           True,
-                          'deepspeed_adam'),
+                          'Adam'),
                          ])
 def test_checkpoint_no_lr_scheduler(tmpdir, zero_stage, use_cpu_offload, adam_optimizer):
-    if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
-        pytest.skip("cpu-adam is not installed")
+    if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
+        pytest.skip("cpu-adam is not compatible")
     config_dict = {
         "train_batch_size": 2,
...
 import argparse
 import torch
-import apex
 import time
 import numpy as np
 import pytest
 import copy
 import deepspeed
-if not deepspeed.ops.__installed_ops__['cpu-adam']:
-    pytest.skip("cpu-adam is not installed", allow_module_level=True)
-else:
-    from deepspeed.ops.adam import DeepSpeedCPUAdam
+from deepspeed.ops.adam import FusedAdam
+from deepspeed.ops.op_builder import CPUAdamBuilder
+
+if not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
+    pytest.skip("cpu-adam is not compatible")

 def check_equal(first, second, atol=1e-2, verbose=False):
@@ -32,6 +32,7 @@ def check_equal(first, second, atol=1e-2, verbose=False):
     (1048576),
 ]) # yapf: disable
 def test_cpu_adam_opt(model_size):
+    from deepspeed.ops.adam import DeepSpeedCPUAdam
     device = 'cpu'
     rng_state = torch.get_rng_state()
     param = torch.nn.Parameter(torch.randn(model_size, device=device))
@@ -42,7 +43,7 @@ def test_cpu_adam_opt(model_size):
     param2 = torch.nn.Parameter(param2_data)

     optimizer1 = torch.optim.AdamW([param1])
-    optimizer2 = apex.optimizers.FusedAdam([param2])
+    optimizer2 = FusedAdam([param2])
     optimizer = DeepSpeedCPUAdam([param])

     for i in range(10):
...
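For reference, a condensed version of the pattern this test exercises: DeepSpeedCPUAdam driving a CPU parameter alongside torch.optim.AdamW on an identical copy, then comparing trajectories. This is a minimal sketch, assuming the cpu_adam kernel is compatible on the machine (it JIT-compiles on first use); the sizes and tolerance are illustrative.

import torch
from deepspeed.ops.adam import DeepSpeedCPUAdam

# Two copies of the same randomly initialized parameter, one per optimizer.
param_ref = torch.nn.Parameter(torch.randn(1024, device='cpu'))
param_ds = torch.nn.Parameter(param_ref.clone().detach())

opt_ref = torch.optim.AdamW([param_ref])
opt_ds = DeepSpeedCPUAdam([param_ds])

for _ in range(10):
    grad = torch.randn(1024, device='cpu')
    param_ref.grad = grad.clone()
    param_ds.grad = grad.clone()
    opt_ref.step()
    opt_ds.step()

max_diff = (param_ref - param_ds).abs().max().item()
print(f"max |AdamW - DeepSpeedCPUAdam| after 10 steps: {max_diff:.2e}")  # the test expects ~1e-2 agreement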
@@ -16,8 +16,8 @@ import deepspeed
 import sys

-if not deepspeed.ops.__installed_ops__['transformer']:
-    pytest.skip("transformer kernels are not installed", allow_module_level=True)
+#if not deepspeed.ops.__installed_ops__['transformer']:
+#    pytest.skip("transformer kernels are not installed", allow_module_level=True)

 def check_equal(first, second, atol=1e-2, verbose=False):
@@ -254,6 +254,7 @@ def run_backward(ds_config, atol=1e-2, verbose=False):
     check_equal(base_grads, ds_grads, atol=atol, verbose=verbose)

+#test_backward[3-1024-120-16-24-True-True-0.05]
 @pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16, atol',
                          [
                              (3,1024,120,16,24,True,False, 0.05),
...
@@ -16,8 +16,8 @@ import deepspeed
 import sys

-if not deepspeed.ops.__installed_ops__['transformer']:
-    pytest.skip("transformer kernels are not installed", allow_module_level=True)
+#if not deepspeed.ops.__installed_ops__['transformer']:
+#    pytest.skip("transformer kernels are not installed", allow_module_level=True)

 def check_equal(first, second, atol=1e-2, verbose=False):
...
@@ -8,9 +8,6 @@ import numpy as np
 from common import distributed_test
 from simple_model import SimpleModel, args_from_dict

-lamb_available = pytest.mark.skipif(not deepspeed.ops.__installed_ops__['lamb'],
-                                    reason="lamb is not installed")

 def run_model_step(model, gradient_list):
     for value in gradient_list:
@@ -168,7 +165,6 @@ def test_fused_some_overflow(tmpdir):
     _test_fused_some_overflow(args)

-@lamb_available
 def test_unfused_no_overflow(tmpdir):
     config_dict = {
         "train_batch_size": 1,
@@ -212,7 +208,6 @@ def test_unfused_no_overflow(tmpdir):
     _test_unfused_no_overflow(args)

-@lamb_available
 def test_unfused_all_overflow(tmpdir):
     config_dict = {
         "train_batch_size": 1,
@@ -258,7 +253,6 @@ def test_unfused_all_overflow(tmpdir):
     _test_unfused_all_overflow(args)

-@lamb_available
 def test_unfused_some_overflow(tmpdir):
     config_dict = {
         "train_batch_size": 1,
...
 import torch
-import apex
 import deepspeed
 import argparse
 import pytest
 import json
 import os
+from deepspeed.ops.adam import FusedAdam
 from common import distributed_test
 from simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict

-lamb_available = pytest.mark.skipif(not deepspeed.ops.__installed_ops__['lamb'],
-                                    reason="lamb is not installed")
+try:
+    from apex import amp
+    _amp_available = True
+except ImportError:
+    _amp_available = False
+amp_available = pytest.mark.skip(_amp_available, reason="apex/amp is not installed")

-@lamb_available
 def test_lamb_fp32_grad_clip(tmpdir):
     config_dict = {
         "train_batch_size": 2,
@@ -48,7 +51,6 @@ def test_lamb_fp32_grad_clip(tmpdir):
     _test_lamb_fp32_grad_clip(args=args, model=model, hidden_dim=hidden_dim)

-@lamb_available
 def test_lamb_fp16_basic(tmpdir):
     config_dict = {
         "train_batch_size": 2,
@@ -86,7 +88,6 @@ def test_lamb_fp16_basic(tmpdir):
     _test_lamb_fp16_basic(args=args, model=model, hidden_dim=hidden_dim)

-@lamb_available
 def test_lamb_fp16_empty_grad(tmpdir):
     config_dict = {
         "train_batch_size": 2,
@@ -234,8 +235,8 @@ def test_adamw_fp16_empty_grad(tmpdir):
                              True),
                          ])
 def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage, use_cpu_offload):
-    if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
-        pytest.skip("cpu-adam is not installed")
+    #if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
+    #    pytest.skip("cpu-adam is not installed")
     config_dict = {
         "train_batch_size": 1,
         "steps_per_print": 1,
@@ -302,8 +303,8 @@ def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage, use_cpu_offlo
                              True),
                          ])
 def test_zero_static_scale(tmpdir, zero_stage, use_cpu_offload):
-    if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
-        pytest.skip("cpu-adam is not installed")
+    #if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
+    #    pytest.skip("cpu-adam is not installed")
     config_dict = {
         "train_batch_size": 4,
         "steps_per_print": 1,
@@ -402,8 +403,8 @@ def test_zero_static_scale_deprecated_format(tmpdir):
                              True),
                          ])
 def test_zero_allow_untested_optimizer(tmpdir, zero_stage, use_cpu_offload):
-    if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
-        pytest.skip("cpu-adam is not installed")
+    #if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
+    #    pytest.skip("cpu-adam is not installed")
     config_dict = {
         "train_batch_size": 4,
         "steps_per_print": 1,
@@ -442,8 +443,8 @@ def test_zero_allow_untested_optimizer(tmpdir, zero_stage, use_cpu_offload):
                              True),
                          ])
 def test_zero_empty_partition(tmpdir, zero_stage, use_cpu_offload):
-    if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
-        pytest.skip("cpu-adam is not installed")
+    #if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
+    #    pytest.skip("cpu-adam is not installed")
     config_dict = {
         "train_micro_batch_size_per_gpu": 1,
         "gradient_accumulation_steps": 1,
@@ -489,6 +490,7 @@ def test_zero_empty_partition(tmpdir, zero_stage, use_cpu_offload):
     _test_zero_empty_partition(args)

+@amp_available
 def test_adam_amp_basic(tmpdir):
     config_dict = {"train_batch_size": 1, "steps_per_print": 1, "amp": {"enabled": True}}
     args = args_from_dict(tmpdir, config_dict)
@@ -514,7 +516,7 @@ def test_adam_amp_basic(tmpdir):
     _test_adam_amp_basic(args=args, model=model, hidden_dim=hidden_dim)

-@lamb_available
+@amp_available
 def test_lamb_amp_basic(tmpdir):
     config_dict = {
         "train_batch_size": 2,
@@ -552,6 +554,7 @@ def test_lamb_amp_basic(tmpdir):
     _test_lamb_amp_basic(args=args, model=model, hidden_dim=hidden_dim)

+@amp_available
 def test_adam_amp_o2(tmpdir):
     config_dict = {
         "train_batch_size": 2,
@@ -590,6 +593,7 @@ def test_adam_amp_o2(tmpdir):
     _test_adam_amp_o2(args=args, model=model, hidden_dim=hidden_dim)

+@amp_available
 def test_adam_amp_o2_empty_grad(tmpdir):
     config_dict = {
         "train_batch_size": 2,
@@ -630,11 +634,11 @@ def test_adam_amp_o2_empty_grad(tmpdir):
 @pytest.mark.parametrize('zero_stage, optimizer_constructor',
                          [(1,
-                           apex.optimizers.FusedAdam),
+                           FusedAdam),
                           (2,
                            torch.optim.Adam),
                           (2,
-                           apex.optimizers.FusedAdam)])
+                           FusedAdam)])
 def test_zero_supported_client_optimizer(tmpdir, zero_stage, optimizer_constructor):
     config_dict = {
         "train_batch_size": 2,
...
@@ -6,9 +6,11 @@
 import pytest
 import torch
 import deepspeed
+from deepspeed.ops.op_builder import SparseAttnBuilder

-if not deepspeed.ops.__installed_ops__['sparse-attn']:
-    pytest.skip("cpu-adam is not installed", allow_module_level=True)
+if not deepspeed.ops.__compatible_ops__[SparseAttnBuilder.NAME]:
+    pytest.skip("sparse attention op is not compatible on this system",
+                allow_module_level=True)

 def test_sparse_attention_module_availability():
@@ -236,7 +238,7 @@ def init_softmax_inputs(Z, H, M, N, scale, rho, block, dtype, dense_x=True, layo
 def _skip_on_cuda_compatability():
-    pytest.skip("Skip these tests for now until we get our docker image fixed.")
+    #pytest.skip("Skip these tests for now until we get our docker image fixed.")
     if torch.cuda.get_device_capability()[0] != 7:
         pytest.skip("needs compute capability 7; v100")
     cuda_major = int(torch.version.cuda.split('.')[0]) * 10
...
Subproject commit 494f8ab3fc1b0b26949a3bcbb2bcac78008d48c1