Unverified Commit 31f46fee authored by Jeff Rasley, committed by GitHub

DeepSpeed JIT op + PyPI support (#496)


Co-authored-by: Shaden Smith <Shaden.Smith@microsoft.com>
Co-authored-by: Reza Yazdani <reyazda@microsoft.com>
parent 0ad4fd88
import torch
from .builder import CUDAOpBuilder
class FusedAdamBuilder(CUDAOpBuilder):
BUILD_VAR = "DS_BUILD_FUSED_ADAM"
NAME = "fused_adam"
def __init__(self):
super().__init__(name=self.NAME)
def absolute_name(self):
return f'deepspeed.ops.adam.{self.NAME}_op'
def sources(self):
return ['csrc/adam/fused_adam_frontend.cpp', 'csrc/adam/multi_tensor_adam.cu']
def include_paths(self):
return ['csrc/includes']
def cxx_args(self):
return ['-O3'] + self.version_dependent_macros()
def nvcc_args(self):
return ['-lineinfo', '-O3', '--use_fast_math'] + self.version_dependent_macros()
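These builder classes power the JIT path this PR adds: instead of compiling every kernel at pip-install time, an op can be compiled on first use. A minimal usage sketch, assuming the OpBuilder base class exposes a load() method that JIT-compiles the listed sources via torch.utils.cpp_extension when no pre-built extension is found:

from deepspeed.ops.op_builder import FusedAdamBuilder

# First call triggers a ninja/nvcc build; subsequent calls reuse the cached extension.
fused_adam_module = FusedAdamBuilder().load()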
import torch
from .builder import CUDAOpBuilder
class FusedLambBuilder(CUDAOpBuilder):
BUILD_VAR = 'DS_BUILD_FUSED_LAMB'
NAME = "fused_lamb"
def __init__(self):
super().__init__(name=self.NAME)
def absolute_name(self):
return f'deepspeed.ops.lamb.{self.NAME}_op'
def sources(self):
return ['csrc/lamb/fused_lamb_cuda.cpp', 'csrc/lamb/fused_lamb_cuda_kernel.cu']
def include_paths(self):
return ['csrc/includes']
def cxx_args(self):
return ['-O3'] + self.version_dependent_macros()
def nvcc_args(self):
return ['-lineinfo', '-O3', '--use_fast_math'] + self.version_dependent_macros()
import torch
import warnings
from .builder import OpBuilder
class SparseAttnBuilder(OpBuilder):
BUILD_VAR = "DS_BUILD_SPARSE_ATTN"
NAME = "sparse_attn"
def __init__(self):
super().__init__(name=self.NAME)
def absolute_name(self):
return f'deepspeed.ops.sparse_attention.{self.NAME}_op'
def sources(self):
return ['csrc/sparse_attention/utils.cpp']
def cxx_args(self):
return ['-O2', '-fopenmp']
def is_compatible(self):
# Check to see if llvm and cmake are installed since they are dependencies
required_commands = ['llvm-config|llvm-config-9', 'cmake']
command_status = list(map(self.command_exists, required_commands))
deps_compatible = all(command_status)
TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1])
torch_compatible = TORCH_MAJOR == 1 and TORCH_MINOR >= 5
if not torch_compatible:
self.warning(
f'{self.NAME} requires a torch version >= 1.5 but detected {TORCH_MAJOR}.{TORCH_MINOR}'
)
return super().is_compatible() and deps_compatible and torch_compatible
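is_compatible() lets callers probe for the op's system dependencies (llvm-config or llvm-config-9, cmake, torch >= 1.5) before attempting a build. A hedged sketch of the intended pattern, again assuming a load() method on the base class:

from deepspeed.ops.op_builder import SparseAttnBuilder

builder = SparseAttnBuilder()
if builder.is_compatible():
    sparse_attn_utils = builder.load()  # JIT-compile only when deps are satisfied
else:
    print(f'{builder.NAME} unavailable; install llvm/cmake to enable it')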
import torch
from .transformer import TransformerBuilder
class StochasticTransformerBuilder(TransformerBuilder):
BUILD_VAR = "DS_BUILD_STOCHASTIC_TRANSFORMER"
NAME = "stochastic_transformer"
def __init__(self):
super().__init__(name=self.NAME)
def absolute_name(self):
return f'deepspeed.ops.transformer.{self.NAME}_op'
def nvcc_args(self):
args = super().nvcc_args()
args.append('-D__STOCHASTIC_MODE__')
return args
import torch
from .builder import CUDAOpBuilder
class TransformerBuilder(CUDAOpBuilder):
BUILD_VAR = "DS_BUILD_TRANSFORMER"
NAME = "transformer"
def __init__(self, name=None):
name = self.NAME if name is None else name
super().__init__(name=name)
def absolute_name(self):
return f'deepspeed.ops.transformer.{self.NAME}_op'
def sources(self):
return [
'csrc/transformer/ds_transformer_cuda.cpp',
'csrc/transformer/cublas_wrappers.cu',
'csrc/transformer/transform_kernels.cu',
'csrc/transformer/gelu_kernels.cu',
'csrc/transformer/dropout_kernels.cu',
'csrc/transformer/normalize_kernels.cu',
'csrc/transformer/softmax_kernels.cu',
'csrc/transformer/general_kernels.cu'
]
def include_paths(self):
return ['csrc/includes']
def nvcc_args(self):
args = [
'-O3',
'--use_fast_math',
'-std=c++14',
'-U__CUDA_NO_HALF_OPERATORS__',
'-U__CUDA_NO_HALF_CONVERSIONS__',
'-U__CUDA_NO_HALF2_OPERATORS__'
]
return args + self.compute_capability_args()
def cxx_args(self):
return ['-O3', '-std=c++14', '-g', '-Wno-reorder']
from .builder import OpBuilder
class UtilsBuilder(OpBuilder):
BUILD_VAR = "DS_BUILD_UTILS"
NAME = "utils"
def __init__(self):
super().__init__(name=self.NAME)
def absolute_name(self):
return f'deepspeed.ops.{self.NAME}_op'
def sources(self):
return ['csrc/utils/flatten_unflatten.cpp']
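Each builder also advertises, via BUILD_VAR, the environment variable that opts it into an install-time (pre-compiled) build; setup.py consults the same name. Illustrative only, assuming UtilsBuilder is exported alongside the other builders:

import os
from deepspeed.ops.op_builder import UtilsBuilder

# DS_BUILD_UTILS=1 requests a pre-compiled extension at pip-install time;
# left unset, the op is deferred to JIT compilation on first use.
prebuild_utils = int(os.environ.get(UtilsBuilder.BUILD_VAR, 0))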
@@ -2,5 +2,6 @@ torch>=1.2
torchvision>=0.4.0
tqdm
psutil
cpufeature
tensorboardX==1.8
ninja
cpufeature
@@ -16,7 +16,7 @@ import warnings
from setuptools import setup, find_packages
from torch.utils.cpp_extension import CUDAExtension, BuildExtension, CppExtension
VERSION = "0.3.0"
import op_builder
def fetch_requirements(path):
@@ -24,88 +24,33 @@ def fetch_requirements(path):
return [r.strip() for r in fd.readlines()]
def available_vector_instructions():
try:
import cpufeature
except ImportError:
warnings.warn(
'import cpufeature failed - CPU vector optimizations are not available for CPUAdam'
)
return {}
cpu_vector_instructions = {}
try:
cpu_vector_instructions = cpufeature.CPUFeature
except Exception:
warnings.warn(
'cpufeature.CPUFeature failed - CPU vector optimizations are not available for CPUAdam'
)
return {}
return cpu_vector_instructions
install_requires = fetch_requirements('requirements/requirements.txt')
dev_requires = fetch_requirements('requirements/requirements-dev.txt')
sparse_attn_requires = fetch_requirements('requirements/requirements-sparse-attn.txt')
extras_require = {
'1bit_adam': fetch_requirements('requirements/requirements-1bit-adam.txt'),
'readthedocs': fetch_requirements('requirements/requirements-readthedocs.txt'),
'dev': fetch_requirements('requirements/requirements-dev.txt'),
}
# If CUDA and MPI are available, add 1bit-adam requirements
if torch.cuda.is_available():
if shutil.which('ompi_info') or shutil.which('mpiname'):
onebit_adam_requires = fetch_requirements(
'requirements/requirements-1bit-adam.txt')
onebit_adam_requires.append(f"cupy-cuda{torch.version.cuda.replace('.','')[:3]}")
install_requires += onebit_adam_requires
# Constants for each op
LAMB = "lamb"
TRANSFORMER = "transformer"
SPARSE_ATTN = "sparse-attn"
CPU_ADAM = "cpu-adam"
cpu_vector_instructions = available_vector_instructions()
# Build environment variables for custom builds
DS_BUILD_LAMB_MASK = 1
DS_BUILD_TRANSFORMER_MASK = 10
DS_BUILD_SPARSE_ATTN_MASK = 100
DS_BUILD_CPU_ADAM_MASK = 1000
# Allow for build_cuda to turn on or off all ops
DS_BUILD_ALL_OPS = DS_BUILD_LAMB_MASK | DS_BUILD_TRANSFORMER_MASK | DS_BUILD_SPARSE_ATTN_MASK | DS_BUILD_CPU_ADAM_MASK
DS_BUILD_CUDA = int(os.environ.get('DS_BUILD_CUDA', 1)) * DS_BUILD_ALL_OPS
# Set the default for each op based on whether DS_BUILD_CUDA is set
OP_DEFAULT = DS_BUILD_CUDA == DS_BUILD_ALL_OPS
DS_BUILD_CPU_ADAM = int(os.environ.get('DS_BUILD_CPU_ADAM', 0)) * DS_BUILD_CPU_ADAM_MASK
DS_BUILD_LAMB = int(os.environ.get('DS_BUILD_LAMB', OP_DEFAULT)) * DS_BUILD_LAMB_MASK
DS_BUILD_TRANSFORMER = int(os.environ.get('DS_BUILD_TRANSFORMER',
OP_DEFAULT)) * DS_BUILD_TRANSFORMER_MASK
DS_BUILD_SPARSE_ATTN = int(os.environ.get('DS_BUILD_SPARSE_ATTN',
OP_DEFAULT)) * DS_BUILD_SPARSE_ATTN_MASK
# Final effective mask is the bitwise OR of each op
BUILD_MASK = (DS_BUILD_LAMB | DS_BUILD_TRANSFORMER | DS_BUILD_SPARSE_ATTN
| DS_BUILD_CPU_ADAM)
install_ops = dict.fromkeys([LAMB, TRANSFORMER, SPARSE_ATTN, CPU_ADAM], False)
if BUILD_MASK & DS_BUILD_LAMB:
install_ops[LAMB] = True
if BUILD_MASK & DS_BUILD_CPU_ADAM:
install_ops[CPU_ADAM] = True
if BUILD_MASK & DS_BUILD_TRANSFORMER:
install_ops[TRANSFORMER] = True
if BUILD_MASK & DS_BUILD_SPARSE_ATTN:
install_ops[SPARSE_ATTN] = True
if not any(install_ops.values()):
print("Building without any cuda/cpp extensions")
print(f'BUILD_MASK={BUILD_MASK}, install_ops={install_ops}')
cupy = f"cupy-cuda{torch.version.cuda.replace('.','')[:3]}"
extras_require['1bit_adam'].append(cupy)
# Make an [all] extra that installs all needed dependencies
all_extras = set()
for extra_name, extra_reqs in extras_require.items():
for req in extra_reqs:
all_extras.add(req)
extras_require['all'] = list(all_extras)
cmdclass = {}
# For any pre-installed ops, force-disable ninja
cmdclass['build_ext'] = BuildExtension.with_options(use_ninja=False)
TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1])
TORCH_MAJOR = torch.__version__.split('.')[0]
TORCH_MINOR = torch.__version__.split('.')[1]
if not torch.cuda.is_available():
# Fix to allow docker builds, similar to https://github.com/NVIDIA/apex/issues/486
@@ -116,230 +61,118 @@ if not torch.cuda.is_available():
if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None:
os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5"
# Fix from apex that might be relevant for us as well, related to https://github.com/NVIDIA/apex/issues/456
version_ge_1_1 = []
if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0):
version_ge_1_1 = ['-DVERSION_GE_1_1']
version_ge_1_3 = []
if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2):
version_ge_1_3 = ['-DVERSION_GE_1_3']
version_ge_1_5 = []
if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4):
version_ge_1_5 = ['-DVERSION_GE_1_5']
version_dependent_macros = version_ge_1_1 + version_ge_1_3 + version_ge_1_5
SIMD_WIDTH = ''
if cpu_vector_instructions.get('AVX512f', False):
SIMD_WIDTH = '-D__AVX512__'
elif cpu_vector_instructions.get('AVX2', False):
SIMD_WIDTH = '-D__AVX256__'
print("SIMD_WIDTH = ", SIMD_WIDTH)
ext_modules = []
## Lamb ##
if BUILD_MASK & DS_BUILD_LAMB:
ext_modules.append(
CUDAExtension(name='deepspeed.ops.lamb.fused_lamb_cuda',
sources=[
'csrc/lamb/fused_lamb_cuda.cpp',
'csrc/lamb/fused_lamb_cuda_kernel.cu'
],
include_dirs=['csrc/includes'],
extra_compile_args={
'cxx': [
'-O3',
] + version_dependent_macros,
'nvcc': ['-O3',
'--use_fast_math'] + version_dependent_macros
}))
## Adam ##
if BUILD_MASK & DS_BUILD_CPU_ADAM:
ext_modules.append(
CUDAExtension(name='deepspeed.ops.adam.cpu_adam_op',
sources=[
'csrc/adam/cpu_adam.cpp',
'csrc/adam/custom_cuda_kernel.cu',
],
include_dirs=['csrc/includes',
'/usr/local/cuda/include'],
extra_compile_args={
'cxx': [
'-O3',
'-std=c++14',
'-L/usr/local/cuda/lib64',
'-lcudart',
'-lcublas',
'-g',
'-Wno-reorder',
'-march=native',
'-fopenmp',
SIMD_WIDTH
],
'nvcc': [
'-O3',
'--use_fast_math',
'-gencode',
'arch=compute_61,code=compute_61',
'-gencode',
'arch=compute_70,code=compute_70',
'-std=c++14',
'-U__CUDA_NO_HALF_OPERATORS__',
'-U__CUDA_NO_HALF_CONVERSIONS__',
'-U__CUDA_NO_HALF2_OPERATORS__'
]
}))
## Transformer ##
if BUILD_MASK & DS_BUILD_TRANSFORMER:
ext_modules.append(
CUDAExtension(name='deepspeed.ops.transformer.transformer_cuda',
sources=[
'csrc/transformer/ds_transformer_cuda.cpp',
'csrc/transformer/cublas_wrappers.cu',
'csrc/transformer/transform_kernels.cu',
'csrc/transformer/gelu_kernels.cu',
'csrc/transformer/dropout_kernels.cu',
'csrc/transformer/normalize_kernels.cu',
'csrc/transformer/softmax_kernels.cu',
'csrc/transformer/general_kernels.cu'
],
include_dirs=['csrc/includes'],
extra_compile_args={
'cxx': ['-O3',
'-std=c++14',
'-g',
'-Wno-reorder'],
'nvcc': [
'-O3',
'--use_fast_math',
'-gencode',
'arch=compute_61,code=compute_61',
'-gencode',
'arch=compute_60,code=compute_60',
'-gencode',
'arch=compute_70,code=compute_70',
'-std=c++14',
'-U__CUDA_NO_HALF_OPERATORS__',
'-U__CUDA_NO_HALF_CONVERSIONS__',
'-U__CUDA_NO_HALF2_OPERATORS__'
]
}))
ext_modules.append(
CUDAExtension(name='deepspeed.ops.transformer.stochastic_transformer_cuda',
sources=[
'csrc/transformer/ds_transformer_cuda.cpp',
'csrc/transformer/cublas_wrappers.cu',
'csrc/transformer/transform_kernels.cu',
'csrc/transformer/gelu_kernels.cu',
'csrc/transformer/dropout_kernels.cu',
'csrc/transformer/normalize_kernels.cu',
'csrc/transformer/softmax_kernels.cu',
'csrc/transformer/general_kernels.cu'
],
include_dirs=['csrc/includes'],
extra_compile_args={
'cxx': ['-O3',
'-std=c++14',
'-g',
'-Wno-reorder'],
'nvcc': [
'-O3',
'--use_fast_math',
'-gencode',
'arch=compute_61,code=compute_61',
'-gencode',
'arch=compute_60,code=compute_60',
'-gencode',
'arch=compute_70,code=compute_70',
'-std=c++14',
'-U__CUDA_NO_HALF_OPERATORS__',
'-U__CUDA_NO_HALF_CONVERSIONS__',
'-U__CUDA_NO_HALF2_OPERATORS__',
'-D__STOCHASTIC_MODE__'
]
}))
from op_builder import ALL_OPS
# Default pre-compiled kernels to off so that we rely on JIT compilation
BUILD_OP_DEFAULT = int(os.environ.get('DS_BUILD_OPS', 0))
print(f"DS_BUILD_OPS={BUILD_OP_DEFAULT}")
def command_exists(cmd):
if '|' in cmd:
cmds = cmd.split("|")
else:
cmds = [cmd]
valid = False
for cmd in cmds:
result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True)
valid = valid or result.wait() == 0
return valid
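command_exists accepts a pipe-separated list of alternatives and succeeds if any one of them resolves on the PATH, e.g.:

# Either llvm-config or llvm-config-9 satisfies the sparse-attn dependency check.
has_llvm = command_exists('llvm-config|llvm-config-9')
has_git = command_exists('git')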
## Sparse transformer ##
if BUILD_MASK & DS_BUILD_SPARSE_ATTN:
# Check to see if llvm and cmake are installed since they are dependencies
required_commands = ['llvm-config|llvm-config-9', 'cmake']
command_status = list(map(command_exists, required_commands))
if not all(command_status):
zipped_status = list(zip(required_commands, command_status))
warnings.warn(
f'Missing non-python requirements, please install the missing packages: {zipped_status}'
)
warnings.warn(
'Skipping sparse attention installation due to missing required packages')
# remove from installed ops list
install_ops[SPARSE_ATTN] = False
elif TORCH_MAJOR == 1 and TORCH_MINOR >= 5:
ext_modules.append(
CppExtension(name='deepspeed.ops.sparse_attention.cpp_utils',
sources=['csrc/sparse_attention/utils.cpp'],
extra_compile_args={'cxx': ['-O2',
'-fopenmp']}))
# Add sparse attention requirements
install_requires += sparse_attn_requires
else:
warnings.warn('Unable to meet requirements to install sparse attention')
# remove from installed ops list
install_ops[SPARSE_ATTN] = False
# Add development requirements
install_requires += dev_requires
result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True)
return result.wait() == 0
def op_enabled(op_name):
assert hasattr(ALL_OPS[op_name], 'BUILD_VAR'), \
f"{op_name} is missing BUILD_VAR field"
env_var = ALL_OPS[op_name].BUILD_VAR
return int(os.environ.get(env_var, BUILD_OP_DEFAULT))
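op_enabled layers per-op overrides on top of the DS_BUILD_OPS default. A worked example with hypothetical environment values, assuming ALL_OPS is keyed by each builder's NAME:

import os

os.environ['DS_BUILD_FUSED_ADAM'] = '1'
# Explicit opt-in wins even when DS_BUILD_OPS=0 (the default):
assert op_enabled('fused_adam') == 1
# Ops without their own variable set fall back to BUILD_OP_DEFAULT.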
install_ops = dict.fromkeys(ALL_OPS.keys(), False)
for op_name, builder in ALL_OPS.items():
op_compatible = builder.is_compatible()
# If the op is compatible, update install requirements so it can potentially build/run later
if op_compatible:
reqs = builder.python_requirements()
install_requires += reqs
# If the op's install is enabled and it is compatible, add its builder to the extensions
if op_enabled(op_name) and op_compatible:
install_ops[op_name] = True
ext_modules.append(builder.builder())
compatible_ops = {op_name: op.is_compatible() for (op_name, op) in ALL_OPS.items()}
print(f'Install Ops={install_ops}')
# Write out version/git info
git_hash_cmd = "git rev-parse --short HEAD"
git_branch_cmd = "git rev-parse --abbrev-ref HEAD"
if command_exists('git'):
result = subprocess.check_output(git_hash_cmd, shell=True)
git_hash = result.decode('utf-8').strip()
result = subprocess.check_output(git_branch_cmd, shell=True)
git_branch = result.decode('utf-8').strip()
if command_exists('git') and 'DS_BUILD_STRING' not in os.environ:
try:
result = subprocess.check_output(git_hash_cmd, shell=True)
git_hash = result.decode('utf-8').strip()
result = subprocess.check_output(git_branch_cmd, shell=True)
git_branch = result.decode('utf-8').strip()
except subprocess.CalledProcessError:
git_hash = "unknown"
git_branch = "unknown"
else:
git_hash = "unknown"
git_branch = "unknown"
print(f"version={VERSION}+{git_hash}, git_hash={git_hash}, git_branch={git_branch}")
# Parse the DeepSpeed version string from version.txt
version_str = open('version.txt', 'r').read().strip()
# Build specifiers like .devX can be added at install time. Otherwise, add the git hash.
# example: DS_BUILD_STRING=".dev20201022" python setup.py sdist bdist_wheel
#version_str += os.environ.get('DS_BUILD_STRING', f'+{git_hash}')
# Building wheel for distribution, update version file
if 'DS_BUILD_STRING' in os.environ:
# Build string env specified, probably building for distribution
with open('build.txt', 'w') as fd:
fd.write(os.environ.get('DS_BUILD_STRING'))
version_str += os.environ.get('DS_BUILD_STRING')
elif os.path.isfile('build.txt'):
# build.txt exists, probably installing from distribution
with open('build.txt', 'r') as fd:
version_str += fd.read().strip()
else:
# None of the above, probably installing from source
version_str += f'+{git_hash}'
torch_version = ".".join([TORCH_MAJOR, TORCH_MINOR])
cuda_version = ".".join(torch.version.cuda.split('.')[:2])
torch_info = {"version": torch_version, "cuda_version": cuda_version}
print(f"version={version_str}, git_hash={git_hash}, git_branch={git_branch}")
with open('deepspeed/git_version_info_installed.py', 'w') as fd:
fd.write(f"version='{VERSION}+{git_hash}'\n")
fd.write(f"version='{version_str}'\n")
fd.write(f"git_hash='{git_hash}'\n")
fd.write(f"git_branch='{git_branch}'\n")
fd.write(f"installed_ops={install_ops}\n")
fd.write(f"compatible_ops={compatible_ops}\n")
fd.write(f"torch_info={torch_info}\n")
print(f'install_requires={install_requires}')
print(f'compatible_ops={compatible_ops}')
print(f'ext_modules={ext_modules}')
setup(name='deepspeed',
version=f"{VERSION}+{git_hash}",
version=version_str,
description='DeepSpeed library',
author='DeepSpeed Team',
author_email='deepspeed@microsoft.com',
url='http://deepspeed.ai',
install_requires=install_requires,
extras_require=extras_require,
packages=find_packages(exclude=["docker",
"third_party",
"csrc"]),
package_data={'deepspeed.ops.sparse_attention.trsrc': ['*.tr']},
scripts=['bin/deepspeed',
'bin/deepspeed.pt',
'bin/ds',
'bin/ds_ssh'],
"third_party"]),
include_package_data=True,
scripts=[
'bin/deepspeed',
'bin/deepspeed.pt',
'bin/ds',
'bin/ds_ssh',
'bin/ds_report'
],
classifiers=[
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
......
@@ -363,10 +363,18 @@ except ImportError:
self.variance_epsilon = eps
def forward(self, x):
pdtype = x.dtype
x = x.float()
u = x.mean(-1, keepdim=True)
s = (x - u).pow(2).mean(-1, keepdim=True)
x = (x - u) / torch.sqrt(s + self.variance_epsilon)
return self.weight * x + self.bias
return self.weight * x.to(pdtype) + self.bias
#def forward(self, x):
# u = x.mean(-1, keepdim=True)
# s = (x - u).pow(2).mean(-1, keepdim=True)
# x = (x - u) / torch.sqrt(s + self.variance_epsilon)
# return self.weight * x + self.bias
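The change above keeps the layer-norm statistics in fp32 and casts back only at the end, which avoids fp16 overflow/underflow in the variance. A standalone sketch of the same pattern:

import torch

def layernorm_fp32(x, weight, bias, eps=1e-12):
    # Upcast, compute mean/variance in fp32, then return to the input dtype.
    pdtype = x.dtype
    xf = x.float()
    u = xf.mean(-1, keepdim=True)
    s = (xf - u).pow(2).mean(-1, keepdim=True)
    xf = (xf - u) / torch.sqrt(s + eps)
    return weight * xf.to(pdtype) + bias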
class BertEmbeddings(nn.Module):
......
@@ -12,6 +12,8 @@ from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer
from deepspeed.runtime.pipe.topology import *
PipeTopo = PipeDataParallelTopology
from deepspeed.ops.op_builder import FusedLambBuilder, CPUAdamBuilder
import argparse
import pytest
import json
@@ -152,8 +154,8 @@ def checkpoint_correctness_verification(args,
compare_lr_scheduler_states(trained_model, loaded_model)
@pytest.mark.skipif(not deepspeed.ops.__installed_ops__['lamb'],
reason="lamb is not installed")
@pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME],
reason="lamb is not compatible")
def test_checkpoint_unfused_optimizer(tmpdir):
config_dict = {
"train_batch_size": 2,
@@ -264,11 +266,11 @@ def test_checkpoint_fused_optimizer(tmpdir):
'Adam'),
(2,
True,
'deepspeed_adam'),
'Adam'),
])
def test_checkpoint_zero_optimizer(tmpdir, zero_stage, use_cpu_offload, adam_optimizer):
if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
pytest.skip("cpu-adam is not installed")
if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
pytest.skip("cpu-adam is not compatible")
config_dict = {
"train_batch_size": 2,
@@ -320,14 +322,14 @@ def test_checkpoint_zero_optimizer(tmpdir, zero_stage, use_cpu_offload, adam_opt
"Adam"),
(2,
True,
'deepspeed_adam'),
'Adam'),
])
def test_checkpoint_zero_no_optimizer(tmpdir,
zero_stage,
use_cpu_offload,
adam_optimizer):
if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
pytest.skip("cpu-adam is not installed")
if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
pytest.skip("cpu-adam is not compatible")
config_dict = {
"train_batch_size": 2,
@@ -385,11 +387,11 @@ def test_checkpoint_zero_no_optimizer(tmpdir,
'Adam'),
(2,
True,
'deepspeed_adam'),
'Adam'),
])
def test_checkpoint_lr_scheduler(tmpdir, zero_stage, use_cpu_offload, adam_optimizer):
if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
pytest.skip("cpu-adam is not installed")
if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
pytest.skip("cpu-adam is not compatible")
config_dict = {
"train_batch_size": 2,
@@ -459,11 +461,11 @@ def test_checkpoint_lr_scheduler(tmpdir, zero_stage, use_cpu_offload, adam_optim
'Adam'),
(2,
True,
'deepspeed_adam'),
'Adam'),
])
def test_checkpoint_no_lr_scheduler(tmpdir, zero_stage, use_cpu_offload, adam_optimizer):
if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
pytest.skip("cpu-adam is not installed")
if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
pytest.skip("cpu-adam is not compatible")
config_dict = {
"train_batch_size": 2,
......
import argparse
import torch
import apex
import time
import numpy as np
import pytest
import copy
import deepspeed
if not deepspeed.ops.__installed_ops__['cpu-adam']:
pytest.skip("cpu-adam is not installed", allow_module_level=True)
else:
from deepspeed.ops.adam import DeepSpeedCPUAdam
from deepspeed.ops.adam import FusedAdam
from deepspeed.ops.op_builder import CPUAdamBuilder
if not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
pytest.skip("cpu-adam is not compatible")
def check_equal(first, second, atol=1e-2, verbose=False):
@@ -32,6 +32,7 @@ def check_equal(first, second, atol=1e-2, verbose=False):
(1048576),
]) # yapf: disable
def test_cpu_adam_opt(model_size):
from deepspeed.ops.adam import DeepSpeedCPUAdam
device = 'cpu'
rng_state = torch.get_rng_state()
param = torch.nn.Parameter(torch.randn(model_size, device=device))
@@ -42,7 +43,7 @@ def test_cpu_adam_opt(model_size):
param2 = torch.nn.Parameter(param2_data)
optimizer1 = torch.optim.AdamW([param1])
optimizer2 = apex.optimizers.FusedAdam([param2])
optimizer2 = FusedAdam([param2])
optimizer = DeepSpeedCPUAdam([param])
for i in range(10):
......
@@ -16,8 +16,8 @@ import deepspeed
import sys
if not deepspeed.ops.__installed_ops__['transformer']:
pytest.skip("transformer kernels are not installed", allow_module_level=True)
#if not deepspeed.ops.__installed_ops__['transformer']:
# pytest.skip("transformer kernels are not installed", allow_module_level=True)
def check_equal(first, second, atol=1e-2, verbose=False):
@@ -254,6 +254,7 @@ def run_backward(ds_config, atol=1e-2, verbose=False):
check_equal(base_grads, ds_grads, atol=atol, verbose=verbose)
#test_backward[3-1024-120-16-24-True-True-0.05]
@pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16, atol',
[
(3,1024,120,16,24,True,False, 0.05),
......
@@ -16,8 +16,8 @@ import deepspeed
import sys
if not deepspeed.ops.__installed_ops__['transformer']:
pytest.skip("transformer kernels are not installed", allow_module_level=True)
#if not deepspeed.ops.__installed_ops__['transformer']:
# pytest.skip("transformer kernels are not installed", allow_module_level=True)
def check_equal(first, second, atol=1e-2, verbose=False):
......
@@ -8,9 +8,6 @@ import numpy as np
from common import distributed_test
from simple_model import SimpleModel, args_from_dict
lamb_available = pytest.mark.skipif(not deepspeed.ops.__installed_ops__['lamb'],
reason="lamb is not installed")
def run_model_step(model, gradient_list):
for value in gradient_list:
@@ -168,7 +165,6 @@ def test_fused_some_overflow(tmpdir):
_test_fused_some_overflow(args)
@lamb_available
def test_unfused_no_overflow(tmpdir):
config_dict = {
"train_batch_size": 1,
@@ -212,7 +208,6 @@ def test_unfused_no_overflow(tmpdir):
_test_unfused_no_overflow(args)
@lamb_available
def test_unfused_all_overflow(tmpdir):
config_dict = {
"train_batch_size": 1,
@@ -258,7 +253,6 @@ def test_unfused_all_overflow(tmpdir):
_test_unfused_all_overflow(args)
@lamb_available
def test_unfused_some_overflow(tmpdir):
config_dict = {
"train_batch_size": 1,
......
import torch
import apex
import deepspeed
import argparse
import pytest
import json
import os
from deepspeed.ops.adam import FusedAdam
from common import distributed_test
from simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict
lamb_available = pytest.mark.skipif(not deepspeed.ops.__installed_ops__['lamb'],
reason="lamb is not installed")
try:
from apex import amp
_amp_available = True
except ImportError:
_amp_available = False
amp_available = pytest.mark.skipif(not _amp_available, reason="apex/amp is not installed")
@lamb_available
def test_lamb_fp32_grad_clip(tmpdir):
config_dict = {
"train_batch_size": 2,
@@ -48,7 +51,6 @@ def test_lamb_fp32_grad_clip(tmpdir):
_test_lamb_fp32_grad_clip(args=args, model=model, hidden_dim=hidden_dim)
@lamb_available
def test_lamb_fp16_basic(tmpdir):
config_dict = {
"train_batch_size": 2,
@@ -86,7 +88,6 @@ def test_lamb_fp16_basic(tmpdir):
_test_lamb_fp16_basic(args=args, model=model, hidden_dim=hidden_dim)
@lamb_available
def test_lamb_fp16_empty_grad(tmpdir):
config_dict = {
"train_batch_size": 2,
@@ -234,8 +235,8 @@ def test_adamw_fp16_empty_grad(tmpdir):
True),
])
def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage, use_cpu_offload):
if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
pytest.skip("cpu-adam is not installed")
#if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# pytest.skip("cpu-adam is not installed")
config_dict = {
"train_batch_size": 1,
"steps_per_print": 1,
@@ -302,8 +303,8 @@ def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage, use_cpu_offlo
True),
])
def test_zero_static_scale(tmpdir, zero_stage, use_cpu_offload):
if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
pytest.skip("cpu-adam is not installed")
#if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# pytest.skip("cpu-adam is not installed")
config_dict = {
"train_batch_size": 4,
"steps_per_print": 1,
@@ -402,8 +403,8 @@ def test_zero_static_scale_deprecated_format(tmpdir):
True),
])
def test_zero_allow_untested_optimizer(tmpdir, zero_stage, use_cpu_offload):
if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
pytest.skip("cpu-adam is not installed")
#if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# pytest.skip("cpu-adam is not installed")
config_dict = {
"train_batch_size": 4,
"steps_per_print": 1,
@@ -442,8 +443,8 @@ def test_zero_allow_untested_optimizer(tmpdir, zero_stage, use_cpu_offload):
True),
])
def test_zero_empty_partition(tmpdir, zero_stage, use_cpu_offload):
if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
pytest.skip("cpu-adam is not installed")
#if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# pytest.skip("cpu-adam is not installed")
config_dict = {
"train_micro_batch_size_per_gpu": 1,
"gradient_accumulation_steps": 1,
@@ -489,6 +490,7 @@ def test_zero_empty_partition(tmpdir, zero_stage, use_cpu_offload):
_test_zero_empty_partition(args)
@amp_available
def test_adam_amp_basic(tmpdir):
config_dict = {"train_batch_size": 1, "steps_per_print": 1, "amp": {"enabled": True}}
args = args_from_dict(tmpdir, config_dict)
@@ -514,7 +516,7 @@ def test_adam_amp_basic(tmpdir):
_test_adam_amp_basic(args=args, model=model, hidden_dim=hidden_dim)
@lamb_available
@amp_available
def test_lamb_amp_basic(tmpdir):
config_dict = {
"train_batch_size": 2,
@@ -552,6 +554,7 @@ def test_lamb_amp_basic(tmpdir):
_test_lamb_amp_basic(args=args, model=model, hidden_dim=hidden_dim)
@amp_available
def test_adam_amp_o2(tmpdir):
config_dict = {
"train_batch_size": 2,
@@ -590,6 +593,7 @@ def test_adam_amp_o2(tmpdir):
_test_adam_amp_o2(args=args, model=model, hidden_dim=hidden_dim)
@amp_available
def test_adam_amp_o2_empty_grad(tmpdir):
config_dict = {
"train_batch_size": 2,
......@@ -630,11 +634,11 @@ def test_adam_amp_o2_empty_grad(tmpdir):
@pytest.mark.parametrize('zero_stage, optimizer_constructor',
[(1,
apex.optimizers.FusedAdam),
FusedAdam),
(2,
torch.optim.Adam),
(2,
apex.optimizers.FusedAdam)])
FusedAdam)])
def test_zero_supported_client_optimizer(tmpdir, zero_stage, optimizer_constructor):
config_dict = {
"train_batch_size": 2,
......
@@ -6,9 +6,11 @@
import pytest
import torch
import deepspeed
from deepspeed.ops.op_builder import SparseAttnBuilder
if not deepspeed.ops.__installed_ops__['sparse-attn']:
pytest.skip("cpu-adam is not installed", allow_module_level=True)
if not deepspeed.ops.__compatible_ops__[SparseAttnBuilder.NAME]:
pytest.skip("sparse attention op is not compatible on this system",
allow_module_level=True)
def test_sparse_attention_module_availability():
@@ -236,7 +238,7 @@ def init_softmax_inputs(Z, H, M, N, scale, rho, block, dtype, dense_x=True, layo
def _skip_on_cuda_compatability():
pytest.skip("Skip these tests for now until we get our docker image fixed.")
#pytest.skip("Skip these tests for now until we get our docker image fixed.")
if torch.cuda.get_device_capability()[0] != 7:
pytest.skip("needs compute capability 7; v100")
cuda_major = int(torch.version.cuda.split('.')[0]) * 10
......
Subproject commit 494f8ab3fc1b0b26949a3bcbb2bcac78008d48c1