"src/vscode:/vscode.git/clone" did not exist on "9d0d0709967d8e029e0a475aae774e0c9c718ce7"
Unverified Commit e5bbc2e5 authored by Jeff Rasley, committed by GitHub

Sparse attn + ops/runtime refactor + v0.3.0 (#343)



* Sparse attn + ops/runtime refactor + v0.3.0
Co-authored-by: Arash Ashari <arashari@microsoft.com>
parent 838f53b7
......@@ -147,11 +147,6 @@ if [ "$no_clean" == "0" ]; then
rm_if_exist third_party/apex/apex.egg-info
fi
echo "Updating git hash/branch info"
echo "git_hash = '$(git rev-parse --short HEAD)'" > deepspeed/git_version_info.py
echo "git_branch = '$(git rev-parse --abbrev-ref HEAD)'" >> deepspeed/git_version_info.py
cat deepspeed/git_version_info.py
if [ "$pip_sudo" == "1" ]; then
PIP_SUDO="sudo -H"
else
......@@ -159,7 +154,7 @@ else
fi
if [ "$pip_mirror" != "" ]; then
PIP_INSTALL="pip install -v -i $pip_mirror"
PIP_INSTALL="pip install --use-feature=2020-resolver -v -i $pip_mirror"
else
PIP_INSTALL="pip install -v"
fi
......@@ -169,10 +164,10 @@ if [ ! -f $hostfile ]; then
local_only=1
fi
if [ "$skip_requirements" == "0" ]; then
# Ensure dependencies are installed locally
$PIP_SUDO $PIP_INSTALL -r requirements.txt
fi
#if [ "$skip_requirements" == "0" ]; then
# # Ensure dependencies are installed locally
# $PIP_SUDO $PIP_INSTALL -r requirements.txt
#fi
# Build wheels
if [ "$third_party_install" == "1" ]; then
......@@ -205,7 +200,8 @@ if [ "$local_only" == "1" ]; then
echo "Installing deepspeed"
$PIP_SUDO pip uninstall -y deepspeed
$PIP_SUDO $PIP_INSTALL dist/deepspeed*.whl
python basic_install_test.py
# run in isolated mode (-I) so the local source directory is not importable during the test
python -I basic_install_test.py
if [ $? == 0 ]; then
echo "Installation is successful"
else
......@@ -224,10 +220,10 @@ else
tmp_wheel_path="/tmp/deepspeed_wheels"
pdsh -w $hosts "if [ -d $tmp_wheel_path ]; then rm $tmp_wheel_path/*.whl; else mkdir -pv $tmp_wheel_path; fi"
pdcp -w $hosts requirements.txt ${tmp_wheel_path}/
if [ "$skip_requirements" == "0" ]; then
pdsh -w $hosts "$PIP_SUDO $PIP_INSTALL -r ${tmp_wheel_path}/requirements.txt"
fi
#pdcp -w $hosts requirements/*.txt ${tmp_wheel_path}/
#if [ "$skip_requirements" == "0" ]; then
# pdsh -w $hosts "$PIP_SUDO $PIP_INSTALL -r ${tmp_wheel_path}/requirements.txt"
#fi
if [ "$third_party_install" == "1" ]; then
pdsh -w $hosts "$PIP_SUDO pip uninstall -y apex"
pdcp -w $hosts third_party/apex/dist/apex*.whl $tmp_wheel_path/
......
pytest
pytest-forked
pre-commit
clang-format
......@@ -10,8 +10,53 @@ The wheel will be located at: dist/*.whl
import os
import torch
import subprocess
import warnings
from setuptools import setup, find_packages
from torch.utils.cpp_extension import CUDAExtension, BuildExtension
from torch.utils.cpp_extension import CUDAExtension, BuildExtension, CppExtension
VERSION = "0.3.0"
def fetch_requirements(path):
with open(path, 'r') as fd:
return [r.strip() for r in fd.readlines()]
install_requires = fetch_requirements('requirements/requirements.txt')
dev_requires = fetch_requirements('requirements/requirements-dev.txt')
sparse_attn_requires = fetch_requirements('requirements/requirements-sparse-attn.txt')
# Build environment variables for custom builds
DS_BUILD_LAMB_MASK = 1
DS_BUILD_TRANSFORMER_MASK = 10
DS_BUILD_SPARSE_ATTN_MASK = 100
# Allow DS_BUILD_CUDA to turn all ops on or off at once
DS_BUILD_ALL_OPS = DS_BUILD_LAMB_MASK | DS_BUILD_TRANSFORMER_MASK | DS_BUILD_SPARSE_ATTN_MASK
DS_BUILD_CUDA = int(os.environ.get('DS_BUILD_CUDA', 1)) * DS_BUILD_ALL_OPS
# Set the default for each op based on whether DS_BUILD_CUDA is set
OP_DEFAULT = DS_BUILD_CUDA == DS_BUILD_ALL_OPS
DS_BUILD_LAMB = int(os.environ.get('DS_BUILD_LAMB', OP_DEFAULT)) * DS_BUILD_LAMB_MASK
DS_BUILD_TRANSFORMER = int(os.environ.get('DS_BUILD_TRANSFORMER',
OP_DEFAULT)) * DS_BUILD_TRANSFORMER_MASK
DS_BUILD_SPARSE_ATTN = int(os.environ.get('DS_BUILD_SPARSE_ATTN',
0)) * DS_BUILD_SPARSE_ATTN_MASK
# Final effective mask is the bitwise OR of each op
BUILD_MASK = (DS_BUILD_LAMB | DS_BUILD_TRANSFORMER | DS_BUILD_SPARSE_ATTN)
install_ops = []
if BUILD_MASK & DS_BUILD_LAMB:
install_ops.append('lamb')
if BUILD_MASK & DS_BUILD_TRANSFORMER:
install_ops.append('transformer')
if BUILD_MASK & DS_BUILD_SPARSE_ATTN:
install_ops.append('sparse-attn')
if len(install_ops) == 0:
print("Building without any cuda/cpp extensions")
print(f'BUILD_MASK={BUILD_MASK}, install_ops={install_ops}')
cmdclass = {}
cmdclass['build_ext'] = BuildExtension.with_options(use_ninja=False)
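For illustration only (not part of this commit), the op-selection logic above can be traced in a standalone sketch; the DS_BUILD_* names and mask values mirror the code above, while the chosen environment values are hypothetical:

import os

# Mask values copied from above; the decimal constants 1, 10 and 100 happen to
# occupy disjoint bits, so OR-ing them behaves like a set of flags (1|10|100 == 111).
LAMB_MASK, TRANSFORMER_MASK, SPARSE_ATTN_MASK = 1, 10, 100
ALL_OPS = LAMB_MASK | TRANSFORMER_MASK | SPARSE_ATTN_MASK

# Hypothetical environment: CUDA builds left on, sparse attention explicitly enabled.
os.environ.setdefault('DS_BUILD_CUDA', '1')
os.environ.setdefault('DS_BUILD_SPARSE_ATTN', '1')

build_cuda = int(os.environ['DS_BUILD_CUDA']) * ALL_OPS
op_default = build_cuda == ALL_OPS  # ops default to enabled only when DS_BUILD_CUDA=1
lamb = int(os.environ.get('DS_BUILD_LAMB', op_default)) * LAMB_MASK
transformer = int(os.environ.get('DS_BUILD_TRANSFORMER', op_default)) * TRANSFORMER_MASK
sparse_attn = int(os.environ.get('DS_BUILD_SPARSE_ATTN', 0)) * SPARSE_ATTN_MASK

build_mask = lamb | transformer | sparse_attn
print(build_mask)  # 111 when all three ops are enabled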
......@@ -40,95 +85,163 @@ if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4):
version_ge_1_5 = ['-DVERSION_GE_1_5']
version_dependent_macros = version_ge_1_1 + version_ge_1_3 + version_ge_1_5
ext_modules = [
CUDAExtension(
name='deepspeed_lamb_cuda',
sources=['csrc/lamb/fused_lamb_cuda.cpp',
'csrc/lamb/fused_lamb_cuda_kernel.cu'],
include_dirs=['csrc/includes'],
extra_compile_args={
'cxx': [
'-O3',
] + version_dependent_macros,
'nvcc': ['-O3',
'--use_fast_math'] + version_dependent_macros
}),
CUDAExtension(name='deepspeed_transformer_cuda',
sources=[
'csrc/transformer/ds_transformer_cuda.cpp',
'csrc/transformer/cublas_wrappers.cu',
'csrc/transformer/transform_kernels.cu',
'csrc/transformer/gelu_kernels.cu',
'csrc/transformer/dropout_kernels.cu',
'csrc/transformer/normalize_kernels.cu',
'csrc/transformer/softmax_kernels.cu',
'csrc/transformer/general_kernels.cu'
],
include_dirs=['csrc/includes'],
extra_compile_args={
'cxx': ['-O3',
ext_modules = []
## Lamb ##
if BUILD_MASK & DS_BUILD_LAMB:
ext_modules.append(
CUDAExtension(name='deepspeed.ops.lamb.fused_lamb_cuda',
sources=[
'csrc/lamb/fused_lamb_cuda.cpp',
'csrc/lamb/fused_lamb_cuda_kernel.cu'
],
include_dirs=['csrc/includes'],
extra_compile_args={
'cxx': [
'-O3',
] + version_dependent_macros,
'nvcc': ['-O3',
'--use_fast_math'] + version_dependent_macros
}))
## Transformer ##
if BUILD_MASK & DS_BUILD_TRANSFORMER:
ext_modules.append(
CUDAExtension(name='deepspeed.ops.transformer.transformer_cuda',
sources=[
'csrc/transformer/ds_transformer_cuda.cpp',
'csrc/transformer/cublas_wrappers.cu',
'csrc/transformer/transform_kernels.cu',
'csrc/transformer/gelu_kernels.cu',
'csrc/transformer/dropout_kernels.cu',
'csrc/transformer/normalize_kernels.cu',
'csrc/transformer/softmax_kernels.cu',
'csrc/transformer/general_kernels.cu'
],
include_dirs=['csrc/includes'],
extra_compile_args={
'cxx': ['-O3',
'-std=c++14',
'-g',
'-Wno-reorder'],
'nvcc': [
'-O3',
'--use_fast_math',
'-gencode',
'arch=compute_61,code=compute_61',
'-gencode',
'arch=compute_70,code=compute_70',
'-std=c++14',
'-g',
'-Wno-reorder'],
'nvcc': [
'-O3',
'--use_fast_math',
'-gencode',
'arch=compute_61,code=compute_61',
'-gencode',
'arch=compute_70,code=compute_70',
'-std=c++14',
'-U__CUDA_NO_HALF_OPERATORS__',
'-U__CUDA_NO_HALF_CONVERSIONS__',
'-U__CUDA_NO_HALF2_OPERATORS__'
]
}),
CUDAExtension(name='deepspeed_stochastic_transformer_cuda',
sources=[
'csrc/transformer/ds_transformer_cuda.cpp',
'csrc/transformer/cublas_wrappers.cu',
'csrc/transformer/transform_kernels.cu',
'csrc/transformer/gelu_kernels.cu',
'csrc/transformer/dropout_kernels.cu',
'csrc/transformer/normalize_kernels.cu',
'csrc/transformer/softmax_kernels.cu',
'csrc/transformer/general_kernels.cu'
],
include_dirs=['csrc/includes'],
extra_compile_args={
'cxx': ['-O3',
'-U__CUDA_NO_HALF_OPERATORS__',
'-U__CUDA_NO_HALF_CONVERSIONS__',
'-U__CUDA_NO_HALF2_OPERATORS__'
]
}))
ext_modules.append(
CUDAExtension(name='deepspeed.ops.transformer.stochastic_transformer_cuda',
sources=[
'csrc/transformer/ds_transformer_cuda.cpp',
'csrc/transformer/cublas_wrappers.cu',
'csrc/transformer/transform_kernels.cu',
'csrc/transformer/gelu_kernels.cu',
'csrc/transformer/dropout_kernels.cu',
'csrc/transformer/normalize_kernels.cu',
'csrc/transformer/softmax_kernels.cu',
'csrc/transformer/general_kernels.cu'
],
include_dirs=['csrc/includes'],
extra_compile_args={
'cxx': ['-O3',
'-std=c++14',
'-g',
'-Wno-reorder'],
'nvcc': [
'-O3',
'--use_fast_math',
'-gencode',
'arch=compute_61,code=compute_61',
'-gencode',
'arch=compute_70,code=compute_70',
'-std=c++14',
'-g',
'-Wno-reorder'],
'nvcc': [
'-O3',
'--use_fast_math',
'-gencode',
'arch=compute_61,code=compute_61',
'-gencode',
'arch=compute_70,code=compute_70',
'-std=c++14',
'-U__CUDA_NO_HALF_OPERATORS__',
'-U__CUDA_NO_HALF_CONVERSIONS__',
'-U__CUDA_NO_HALF2_OPERATORS__',
'-D__STOCHASTIC_MODE__'
]
}),
]
'-U__CUDA_NO_HALF_OPERATORS__',
'-U__CUDA_NO_HALF_CONVERSIONS__',
'-U__CUDA_NO_HALF2_OPERATORS__',
'-D__STOCHASTIC_MODE__'
]
}))
def command_exists(cmd):
result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True)
return result.wait() == 0
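# Illustration only, not part of this commit: command_exists shells out to the
# shell builtin `type`, so anything resolvable in the current shell (binaries,
# aliases, builtins) counts. For example, command_exists('cmake') is True when
# cmake is on the PATH; it is probed below before enabling sparse attention.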
## Sparse transformer ##
if BUILD_MASK & DS_BUILD_SPARSE_ATTN:
# Check that llvm (llc-9) and cmake are installed, since sparse attention depends on them
required_commands = ['llc-9', 'cmake']
command_status = list(map(command_exists, required_commands))
if not all(command_status):
zipped_status = list(zip(required_commands, command_status))
warnings.warn(
f'Missing non-python requirements; please install the missing packages: {zipped_status}'
)
warnings.warn(
'Skipping sparse attention installation due to missing required packages')
elif TORCH_MAJOR == 1 and TORCH_MINOR >= 5:
ext_modules.append(
CppExtension(name='deepspeed.ops.sparse_attention.cpp_utils',
sources=['csrc/sparse_attention/utils.cpp'],
extra_compile_args={'cxx': ['-O2',
'-fopenmp']}))
# Add sparse attention requirements
install_requires += sparse_attn_requires
else:
warnings.warn('Unable to meet requirements to install sparse attention')
# Add development requirements
install_requires += dev_requires
# Write out version/git info
git_hash_cmd = "git rev-parse --short HEAD"
git_branch_cmd = "git rev-parse --abbrev-ref HEAD"
if command_exists('git'):
result = subprocess.check_output(git_hash_cmd, shell=True)
git_hash = result.decode('utf-8').strip()
result = subprocess.check_output(git_branch_cmd, shell=True)
git_branch = result.decode('utf-8').strip()
else:
git_hash = "unknown"
git_branch = "unknown"
print(f"version={VERSION}+{git_hash}, git_hash={git_hash}, git_branch={git_branch}")
with open('deepspeed/git_version_info.py', 'w') as fd:
fd.write(f"version='{VERSION}+{git_hash}'\n")
fd.write(f"git_hash='{git_hash}'\n")
fd.write(f"git_branch='{git_branch}'\n")
print(f'install_requires={install_requires}')
setup(name='deepspeed',
version='0.2.0',
version=f"{VERSION}+{git_hash}",
description='DeepSpeed library',
author='DeepSpeed Team',
author_email='deepspeed@microsoft.com',
url='http://aka.ms/deepspeed',
install_requires=install_requires,
packages=find_packages(exclude=["docker",
"third_party",
"csrc"]),
package_data={'deepspeed.ops.sparse_attention.trsrc': ['*.tr']},
scripts=['bin/deepspeed',
'bin/deepspeed.pt',
'bin/ds',
'bin/ds_ssh'],
classifiers=['Programming Language :: Python :: 3.6'],
classifiers=[
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8'
],
license='MIT',
ext_modules=ext_modules,
cmdclass=cmdclass)
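For illustration only (not part of this commit), a wheel with the sparse-attention op enabled could be built by exporting the corresponding environment variable before a standard setuptools build; the exact invocation below is an assumption, not taken from install.sh:

import os
import subprocess

# DS_BUILD_SPARSE_ATTN is read by the setup.py above; bdist_wheel writes dist/*.whl.
env = dict(os.environ, DS_BUILD_SPARSE_ATTN='1')
subprocess.check_call(['python', 'setup.py', 'bdist_wheel'], env=env)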
# DeepSpeed note: parts of this code were taken and adapted from commit c368a9fd1b2c9dee4cc94de9a6bb0be3d447be41
# https://github.com/ptillet/torch-blocksparse/blob/master/tests/test_softmax.py
# https://github.com/ptillet/torch-blocksparse/blob/master/tests/test_matmul.py
# https://github.com/ptillet/torch-blocksparse/blob/master/tests/utils
import pytest
import torch
def test_sparse_attention_module_availability():
try:
from deepspeed.ops import sparse_attention
except ImportError:
print("Sparse Attention Module is not installed!")
return False
return True
def test_matmul_module_availability():
try:
from deepspeed.ops.sparse_attention import MatMul
except ImportError:
print("Sparse MatMul Module is not installed!")
return False
return True
def test_softmax_module_availability():
try:
from deepspeed.ops.sparse_attention import Softmax
except ImportError:
print("Sparse Softmax Module is not installed!")
return False
return True
def test_sparsityconfig_module_availability():
try:
from deepspeed.ops.sparse_attention import SparsityConfig
except ImportError:
print("SparsityConfig Module is not installed!")
return False
return True
def test_densesparsityconfig_module_availability():
try:
from deepspeed.ops.sparse_attention import DenseSparsityConfig
except ImportError:
print("DenseSparsityConfig Module is not installed!")
return False
return True
def test_fixedsparsityconfig_module_availability():
try:
from deepspeed.ops.sparse_attention import FixedSparsityConfig
except ImportError:
print("FixedSparsityConfig Module is not installed!")
return False
return True
def test_variablesparsityconfig_module_availability():
try:
from deepspeed.ops.sparse_attention import VariableSparsityConfig
except ImportError:
print("VariableSparsityConfig Module is not installed!")
return False
return True
def test_bigbirdsparsityconfig_module_availability():
try:
from deepspeed.ops.sparse_attention import BigBirdSparsityConfig
except ImportError:
print("BigBirdSparsityConfig Module is not installed!")
return False
return True
def test_bslongformersparsityconfig_module_availability():
try:
from deepspeed.ops.sparse_attention import BSLongformerSparsityConfig
except ImportError:
print("BSLongformerSparsityConfig Module is not installed!")
return False
return True
def test_sparseselfattention_module_availability():
try:
from deepspeed.ops.sparse_attention import SparseSelfAttention
except ImportError:
print("SparseSelfAttention Module is not installed!")
return False
return True
def test_bertsparseselfattention_module_availability():
try:
from deepspeed.ops.sparse_attention import BertSparseSelfAttention
except ImportError:
print("BertSparseSelfAttention Module is not installed!")
return False
return True
def test_sparseattentionutils_availability():
try:
from deepspeed.ops.sparse_attention import SparseAttentionUtils
except ImportError:
print("SparseAttentionUtils Module is not installed!")
return False
return True
def test_cpp_utils_availability():
try:
from deepspeed.ops.sparse_attention import cpp_utils
except ImportError:
print("Sparse Attention cpp_utils Module is not installed!")
return False
return True
def dense_to_sparse(w, mask, block):
"""Converts dense matrix with explicit zeros to sparse matrix
"""
Z = w.size(0)
ret = torch.empty((Z, mask.sum(), block, block), dtype=w.dtype, device=w.device)
nnz = mask.nonzero()
h, i, j = nnz[:, 0], nnz[:, 1], nnz[:, 2]
for zz in range(Z):
for idx, (hh, ii, jj) in enumerate(zip(h, i, j)):
ret[zz, idx, :, :] = w[zz, hh, ii*block: (ii+1)*block, jj*block: (jj+1)*block]
return ret
def sparse_to_dense(w, mask, block, zero=0):
"""Converts sparse matrix to dense matrix with explicit zeros
"""
maskedw = w.clone()
for bz, wz in enumerate(range(0, w.size(0))):
for bh, wh in enumerate(range(0, w.size(1))):
for bi, wi in enumerate(range(0, w.size(2), block)):
for bj, wj in enumerate(range(0, w.size(3), block)):
if mask[bh, bi, bj] == 0:
maskedw[wz, wh, wi:wi + block, wj:wj + block] = zero
#maskedw[wz, wh, wi : wi+block, wj : wj+block] *= mask[bh, bi, bj]
return maskedw
def allclose(x, y):
assert x.dtype == y.dtype
rtol, atol = {torch.float32: (1e-4, 1e-5), torch.float16: (1e-2, 1e-3)}[x.dtype]
return torch.allclose(x, y, rtol=rtol, atol=atol)
def make_layout(rho, shape):
probs = torch.Tensor([rho, 1 - rho])
generator = torch.distributions.categorical.Categorical(probs)
layout = generator.sample(shape)
return layout
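# Illustration only, not part of this commit: a minimal round trip through the
# block-sparse helpers above. The sizes below are arbitrary example values.
def _example_block_sparse_round_trip():
    Z, H, M, N, block, rho = 2, 4, 64, 64, 16, 0.5
    layout = make_layout(rho, (H, M // block, N // block))  # 0/1 block-sparsity mask
    w = torch.rand((Z, H, M, N))
    w_blocks = dense_to_sparse(w, layout, block)   # shape (Z, nnz, block, block)
    w_masked = sparse_to_dense(w, layout, block)   # dense copy with non-layout blocks zeroed
    assert w_blocks.shape[1] == int(layout.sum())
    return w_blocks, w_masked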
def run_softmax_reference(x, scale, dx, kp_mask, attn_mask, layout, block):
x = sparse_to_dense(x, layout, block, zero=float('-inf'))
x.retain_grad()
if kp_mask is not None:
bcattn_mask = attn_mask[None, None, :, :] + torch.zeros_like(x)
x[bcattn_mask == 0] = float('-inf')
y = torch.softmax(x * scale + kp_mask[:, None, None, :], -1)
else:
y = torch.softmax(x * scale, -1)
y.backward(dx)
dx = x.grad.clone()
dx = dense_to_sparse(dx, layout, block)
y = dense_to_sparse(y, layout, block)
return y, dx
def run_softmax_sparse(x, scale, dx, kp_mask, attn_mask, layout, block):
from deepspeed.ops.sparse_attention import Softmax
sparse_softmax = Softmax(layout, block, bench=False)
dx = dense_to_sparse(dx, layout, block)
x = dense_to_sparse(x, layout, block)
x.retain_grad()
y = sparse_softmax(x,
scale=scale,
key_padding_mask=kp_mask,
key_padding_mask_mode='add',
attn_mask=attn_mask,
attn_mask_mode='mul')
y.backward(dx)
dx = x.grad.clone()
x.grad.zero_()
return x, dx
def init_softmax_inputs(Z, H, M, N, scale, rho, block, dtype, dense_x=True, layout=None):
if layout is None:
layout = make_layout(rho, (H, M // block, N // block))
if dense_x:
x = torch.rand((Z, H, M, N), dtype=dtype, requires_grad=True, device='cuda')
else:
x = torch.rand((Z,
layout.sum(),
block,
block),
dtype=dtype,
requires_grad=True,
device='cuda')
dx = torch.rand_like(x)
bool_attn_mask = torch.randint(low=0,
high=2,
size=(N,
N),
dtype=torch.bool,
requires_grad=False,
device='cuda')
fp_attn_mask = bool_attn_mask.type(dtype)
kp_mask = torch.randint(low=0,
high=2,
size=(Z,
N),
dtype=dtype,
requires_grad=False,
device='cuda')
kp_mask[kp_mask == 1.] = float('-inf')
return layout, x, dx, bool_attn_mask, fp_attn_mask, kp_mask
def _skip_on_cuda_compatability():
if torch.cuda.get_device_capability()[0] != 7:
pytest.skip("needs compute capability 7; v100")
cuda_major = int(torch.version.cuda.split('.')[0]) * 10
cuda_minor = int(torch.version.cuda.split('.')[1])
cuda_version = cuda_major + cuda_minor
if cuda_version != 101 and cuda_version != 102:
pytest.skip("requires cuda 10.1 or 10.2")
@pytest.mark.parametrize("block", [16, 32])
@pytest.mark.parametrize("width", [256, 576])
@pytest.mark.parametrize("dtype", [torch.float16, torch.float32])
def test_softmax(block, width, dtype):
_skip_on_cuda_compatability()
Z = 2
H = 4
scale = 0.4
rho = 0.4
M = N = width
layout, x, dx, bool_attn_mask, fp_attn_mask, kp_mask = init_softmax_inputs(Z, H, M, N, scale, rho, block, dtype, layout=None)
ref_y, ref_dx = run_softmax_reference(x, scale, dx, kp_mask, bool_attn_mask, layout, block)
st_y, st_dx = run_softmax_sparse(x, scale, dx, kp_mask, fp_attn_mask, layout, block)
assert allclose(ref_y, st_y)
assert allclose(ref_dx, st_dx)
def run_matmul_reference(x, w, mode, trans_a, trans_b, layout, block, dy):
x = sparse_to_dense(x, layout, block) if mode == 'dsd' else x
w = sparse_to_dense(w, layout, block) if mode == 'dds' else w
x.retain_grad()
w.retain_grad()
xx = x.transpose(2, 3) if trans_a else x
ww = w.transpose(2, 3) if trans_b else w
y = torch.matmul(xx, ww)
y = sparse_to_dense(y, layout, block) if mode == 'sdd' else y
y.backward(dy)
dx = x.grad.clone()
dw = w.grad.clone()
x.grad.zero_()
w.grad.zero_()
y = dense_to_sparse(y, layout, block) if mode == 'sdd' else y
dx = dense_to_sparse(dx, layout, block) if mode == 'dsd' else dx
dw = dense_to_sparse(dw, layout, block) if mode == 'dds' else dw
return y, dx, dw
def run_matmul_sparse(x, w, mode, trans_a, trans_b, layout, block, dy):
from deepspeed.ops.sparse_attention import MatMul
x = dense_to_sparse(x, layout, block) if mode == 'dsd' else x
w = dense_to_sparse(w, layout, block) if mode == 'dds' else w
dy = dense_to_sparse(dy, layout, block) if mode == 'sdd' else dy
op = MatMul(layout, block, mode, trans_a=trans_a, trans_b=trans_b)
x.retain_grad()
w.retain_grad()
y = op(x, w)
y.backward(dy)
dx = x.grad.clone()
dw = w.grad.clone()
x.grad.zero_()
return y, dx, dw
def init_matmul_inputs(Z, H, M, N, K, rho, mode, trans_a, trans_b, block, dtype, layout):
torch.manual_seed(1)
AS0 = K if trans_a else M
AS1 = M if trans_a else K
BS0 = N if trans_b else K
BS1 = K if trans_b else N
shape = {'sdd': (M, N), 'dsd': (AS0, AS1), 'dds': (BS0, BS1)}[mode]
x = torch.rand((Z, H, AS0, AS1), dtype=dtype, requires_grad=True, device='cuda')
w = torch.rand((Z, H, BS0, BS1), dtype=dtype, requires_grad=True, device='cuda')
dy = torch.rand((Z, H, M, N), dtype=dtype, device='cuda')
if layout is None:
layout = make_layout(rho, (H, shape[0] // block, shape[1] // block))
else:
assert list(layout.shape) == [H, shape[0] // block, shape[1] // block]
x.retain_grad()
w.retain_grad()
return x, w, dy, shape, layout
testdata = [
(16, dtype, mode, trans_a, trans_b)\
for dtype in [torch.float16, torch.float32]\
for mode in ['sdd', 'dsd', 'dds']\
for trans_a in [False, True]\
for trans_b in [False, True]\
] + [
(block, torch.float16, mode, False, False)\
for block in [16, 32, 64]\
for mode in ['sdd', 'dsd', 'dds']\
]
@pytest.mark.parametrize("block, dtype, mode, trans_a, trans_b", testdata)
def test_matmul(block, dtype, mode, trans_a, trans_b):
_skip_on_cuda_compatability()
Z = 3
H = 2
M = 128
N = 256
K = 192
rho = 0.5
x, w, dy, shape, layout = init_matmul_inputs(Z, H, M, N, K, rho, mode, trans_a, trans_b, block, dtype, layout=None)
ref_y, ref_dx, ref_dw = run_matmul_reference(x.clone(), w.clone(), mode, trans_a, trans_b, layout, block, dy)
st_y, st_dx, st_dw = run_matmul_sparse(x.clone(), w.clone(), mode, trans_a, trans_b, layout, block, dy)
assert allclose(ref_y, st_y)
assert allclose(ref_dx, st_dx)
assert allclose(ref_dw, st_dw)
import torch
import deepspeed
from deepspeed.pt.deepspeed_zero_optimizer import FP16_DeepSpeedZeroOptimizer
from deepspeed.pt.zero_optimizer_stage1 import FP16_DeepSpeedZeroOptimizer_Stage1
from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer
from deepspeed.runtime.zero.stage1 import FP16_DeepSpeedZeroOptimizer_Stage1
from deepspeed.pt.fp16_optimizer import FP16_Optimizer
from deepspeed.pt.fp16_unfused_optimizer import FP16_UnfusedOptimizer
from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer
from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer
import argparse
import pytest
......
......@@ -9,7 +9,7 @@ import torch.distributed as dist
# A test on its own
import deepspeed
from deepspeed.pt.deepspeed_config import DeepSpeedConfig
from deepspeed.runtime.config import DeepSpeedConfig
def test_cuda():
......
import torch
import random
from deepspeed.pt.deepspeed_csr_tensor import CSRTensor
from deepspeed.runtime.csr_tensor import CSRTensor
def test_csr_addition_self():
......
import pytest
import os
import json
from deepspeed.pt import deepspeed_config as ds_config
from deepspeed.runtime import config as ds_config
def test_only_required_fields(tmpdir):
......
import pytest
from deepspeed.pt import deepspeed_run as dsrun
from deepspeed.launcher import runner as dsrun
def test_parser_mutual_exclusive():
......