OpenDAS / deepspeed / Commits / eadbbe09

Commit eadbbe09, authored Apr 25, 2021 by 401qingkong

    push rocm deepspeed v0.3.13

Parent: ab5534fc
Changes: 155 in total; this page shows 15 changed files with 402 additions and 73 deletions (+402 -73).
install.sh                                  +3   -1
op_builder/builder.py                       +5   -5
op_builder/cpu_adam.py                      +36  -14
op_builder/fused_adam.py                    +7   -4
op_builder/fused_lamb.py                    +7   -5
op_builder/sparse_attn.py                   +17  -13
op_builder/transformer.py                   +30  -17
op_builder/utils.py                         +5   -2
requirements/requirements-sparse_attn.txt   +1   -1
requirements/requirements.txt               +2   -2
setup.py                                    +30  -8
tests/onebitadam/test_com_reduce_cuda.py    +86  -0
tests/onebitadam/test_com_reduce_host.py    +86  -0
tests/onebitadam/test_server_error.py       +87  -0
tests/unit/test_pipe.py                     +0   -1
install.sh
@@ -152,7 +152,9 @@ if [ ! -f $hostfile ]; then
 fi
 
 echo "Building deepspeed wheel"
-python setup.py $VERBOSE bdist_wheel
+###aiss add
+CXX=hipcc CC=hipcc DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_FUSED_LAMB=1 DS_BUILD_SPARSE_ATTN=1 DS_BUILD_TRANSFORMER=1 DS_BUILD_STOCHASTIC_TRANSFORMER=1 DS_BUILD_UTILS=1 python3 setup.py $VERBOSE bdist_wheel
+#CXX=hipcc CC=hipcc DS_BUILD_UTILS=1 python3 setup.py $VERBOSE bdist_wheel
 
 if [ "$local_only" == "1" ]; then
     echo "Installing deepspeed"
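The rewritten build line switches the compiler to hipcc and pre-builds every kernel at wheel time instead of deferring to JIT compilation. A minimal sketch of how such DS_BUILD_* switches are typically resolved against the DS_BUILD_OPS default; the helper name op_enabled and the exact precedence are assumptions loosely modeled on the setup.py hunk later in this commit:

import os

# Hypothetical resolver: a per-op flag such as DS_BUILD_CPU_ADAM overrides
# the global DS_BUILD_OPS default (0 = JIT only, 1 = prebuild).
def op_enabled(op_name):
    build_all = int(os.environ.get('DS_BUILD_OPS', 0))
    return int(os.environ.get(f'DS_BUILD_{op_name.upper()}', build_all))

for op in ['cpu_adam', 'fused_adam', 'fused_lamb', 'sparse_attn',
           'transformer', 'stochastic_transformer', 'utils']:
    print(op, '->', bool(op_enabled(op)))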
op_builder/builder.py
@@ -194,10 +194,10 @@ class OpBuilder(ABC):
         else:
             return os.path.join(Path(__file__).parent.parent.absolute(), code_path)
 
-    def builder(self):
+    def builder(self, is_rocm_pytorch):
         from torch.utils.cpp_extension import CppExtension
         return CppExtension(name=self.absolute_name(),
-                            sources=self.sources(),
+                            sources=self.sources(is_rocm_pytorch),
                             include_dirs=self.include_paths(),
                             extra_compile_args={'cxx': self.cxx_args()},
                             extra_link_args=self.extra_ldflags())
@@ -328,11 +328,11 @@ class CUDAOpBuilder(OpBuilder):
     def is_compatible(self):
         return super().is_compatible()
 
-    def builder(self):
+    def builder(self, is_rocm_pytorch):
         from torch.utils.cpp_extension import CUDAExtension
-        assert_no_cuda_mismatch()
+        # assert_no_cuda_mismatch()
         return CUDAExtension(name=self.absolute_name(),
-                             sources=self.sources(),
+                             sources=self.sources(is_rocm_pytorch),
                              include_dirs=self.include_paths(),
                              extra_compile_args={
                                  'cxx': self.cxx_args(),
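These two hunks change the op-builder contract: builder() now takes an is_rocm_pytorch flag and forwards it into sources(), so every op can swap its CUDA files for pre-hipified ones, and the CUDA-version mismatch assert is disabled since it has no meaning under ROCm. A toy subclass showing the new contract; MyOpBuilder and its file paths are illustrative only, not part of the commit:

# Illustrative subclass: sources() picks hipified files when the flag is set.
class MyOpBuilder:
    def sources(self, is_rocm_pytorch):
        if is_rocm_pytorch:
            return ['csrc/myop/hip/kernel.hip']   # pre-converted HIP sources
        return ['csrc/myop/kernel.cu']            # original CUDA sources

print(MyOpBuilder().sources(is_rocm_pytorch=True))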
op_builder/cpu_adam.py
@@ -17,12 +17,16 @@ class CPUAdamBuilder(CUDAOpBuilder):
     def absolute_name(self):
         return f'deepspeed.ops.adam.{self.NAME}_op'
 
-    def sources(self):
-        return ['csrc/adam/cpu_adam.cpp', 'csrc/adam/custom_cuda_kernel.cu']
+    def sources(self, is_rocm_pytorch):
+        if is_rocm_pytorch:
+            return ['csrc/adam/hip/cpu_adam.cpp', 'csrc/adam/hip/custom_hip_kernel.hip']
+        else:
+            return ['csrc/adam/cpu_adam.cpp', 'csrc/adam/custom_cuda_kernel.cu']
 
     def include_paths(self):
-        CUDA_INCLUDE = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")
-        return ['csrc/includes', CUDA_INCLUDE]
+        #CUDA_INCLUDE = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")
+        #return ['csrc/includes', CUDA_INCLUDE]
+        return ['csrc/includes/', '/opt/rocm-3.9.1/include/']
 
     def simd_width(self):
         if not self.command_exists('lscpu'):
@@ -42,30 +46,48 @@ class CPUAdamBuilder(CUDAOpBuilder):
         return '-D__SCALAR__'
 
     def cxx_args(self):
-        CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64")
+        # CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64")
         SIMD_WIDTH = self.simd_width()
+        #return [
+        #    '-O3',
+        #    '-std=c++14',
+        #    f'-L{CUDA_LIB64}',
+        #    '-lcudart',
+        #    '-lcublas',
+        #    '-g',
+        #    '-Wno-reorder',
+        #    '-march=native',
+        #    '-fopenmp',
+        #    SIMD_WIDTH
+        #]
         return [
             '-O3',
             '-std=c++14',
-            f'-L{CUDA_LIB64}',
-            '-lcudart',
-            '-lcublas',
+            '-lrocblas',
             '-g',
             '-Wno-reorder',
             '-march=native',
             '-fopenmp',
+            '-lpthread',
             SIMD_WIDTH
         ]
 
     def nvcc_args(self):
+        #args = [
+        #    '-O3',
+        #    '--use_fast_math',
+        #    '-std=c++14',
+        #    '-U__CUDA_NO_HALF_OPERATORS__',
+        #    '-U__CUDA_NO_HALF_CONVERSIONS__',
+        #    '-U__CUDA_NO_HALF2_OPERATORS__'
+        #]
         args = [
             '-O3',
-            '--use_fast_math',
+            #'--use_fast_math',
+            '-fopenmp',
+            '-lpthread',
             '-std=c++14',
             '-U__CUDA_NO_HALF_OPERATORS__',
             '-U__CUDA_NO_HALF_CONVERSIONS__',
             '-U__CUDA_NO_HALF2_OPERATORS__'
         ]
-        args += self.compute_capability_args()
+        # args += self.compute_capability_args()
        return args
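The SIMD_WIDTH flag kept in cxx_args() comes from the simd_width() helper visible in the context lines, which probes the CPU through lscpu. A hedged approximation of that probe; only the '-D__SCALAR__' fallback and the command_exists('lscpu') guard are visible in the diff, the AVX define names are guesses at the upstream helper:

import subprocess

def simd_width():
    # Fall back to scalar code when lscpu is unavailable, matching the
    # "if not self.command_exists('lscpu')" guard in the context lines.
    try:
        cpu_info = subprocess.check_output('lscpu', shell=True).decode().lower()
    except (OSError, subprocess.CalledProcessError):
        return '-D__SCALAR__'
    if 'avx512' in cpu_info:
        return '-D__AVX512__'   # assumed define name
    if 'avx2' in cpu_info:
        return '-D__AVX256__'   # assumed define name
    return '-D__SCALAR__'

print(simd_width())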
op_builder/fused_adam.py
@@ -15,8 +15,11 @@ class FusedAdamBuilder(CUDAOpBuilder):
     def absolute_name(self):
         return f'deepspeed.ops.adam.{self.NAME}_op'
 
-    def sources(self):
-        return ['csrc/adam/fused_adam_frontend.cpp', 'csrc/adam/multi_tensor_adam.cu']
+    def sources(self, is_rocm_pytorch):
+        if is_rocm_pytorch:
+            return ['csrc/adam/hip/fused_adam_frontend.cpp', 'csrc/adam/hip/multi_tensor_adam.hip']
+        else:
+            return ['csrc/adam/fused_adam_frontend.cpp', 'csrc/adam/multi_tensor_adam.cu']
 
     def include_paths(self):
         return ['csrc/includes']
@@ -27,5 +30,5 @@ class FusedAdamBuilder(CUDAOpBuilder):
     def nvcc_args(self):
         return ['-lineinfo',
                 '-O3',
-                '--use_fast_math'
-                ] + self.version_dependent_macros() + self.compute_capability_args()
+                # '--use_fast_math'
+                ]  # + self.version_dependent_macros() + self.compute_capability_args()
op_builder/fused_lamb.py
@@ -15,9 +15,11 @@ class FusedLambBuilder(CUDAOpBuilder):
     def absolute_name(self):
         return f'deepspeed.ops.lamb.{self.NAME}_op'
 
-    def sources(self):
-        return ['csrc/lamb/fused_lamb_cuda.cpp', 'csrc/lamb/fused_lamb_cuda_kernel.cu']
+    def sources(self, is_rocm_pytorch):
+        if is_rocm_pytorch:
+            return ['csrc/lamb/hip/fused_lamb_hip.cpp', 'csrc/lamb/hip/fused_lamb_hip_kernel.hip']
+        else:
+            return ['csrc/lamb/fused_lamb_cuda.cpp', 'csrc/lamb/fused_lamb_cuda_kernel.cu']
 
     def include_paths(self):
         return ['csrc/includes']
@@ -27,5 +29,5 @@ class FusedLambBuilder(CUDAOpBuilder):
     def nvcc_args(self):
         return ['-lineinfo',
                 '-O3',
-                '--use_fast_math'
-                ] + self.version_dependent_macros() + self.compute_capability_args()
+                # '--use_fast_math'
+                ]  # + self.version_dependent_macros() + self.compute_capability_args()
op_builder/sparse_attn.py
@@ -16,29 +16,33 @@ class SparseAttnBuilder(OpBuilder):
     def absolute_name(self):
         return f'deepspeed.ops.sparse_attention.{self.NAME}_op'
 
-    def sources(self):
-        return ['csrc/sparse_attention/utils.cpp']
+    def sources(self, is_rocm_pytorch):
+        if is_rocm_pytorch:
+            return ['csrc/sparse_attention/hip/utils.cpp']
+        else:
+            return ['csrc/sparse_attention/utils.cpp']
 
     def cxx_args(self):
         return ['-O2', '-fopenmp']
 
     def is_compatible(self):
         # Check to see if llvm and cmake are installed since they are dependencies
-        required_commands = ['llvm-config|llvm-config-9', 'cmake']
+        #required_commands = ['llvm-config|llvm-config-9', 'cmake']
+        required_commands = ['cmake']
         command_status = list(map(self.command_exists, required_commands))
         deps_compatible = all(command_status)
 
         # torch-cpu will not have a cuda version
-        if torch.version.cuda is None:
+        if torch.version.hip is None:
             cuda_compatible = False
             self.warning(f"{self.NAME} cuda is not available from torch")
-        else:
-            major, minor = torch.version.cuda.split('.')[:2]
-            cuda_compatible = int(major) == 10 and int(minor) >= 1
-            if not cuda_compatible:
-                self.warning(
-                    f"{self.NAME} requires CUDA version 10.1+, does not currently support >=11 or <10.1"
-                )
+        # else:
+        #     major, minor = torch.version.cuda.split('.')[:2]
+        #     cuda_compatible = int(major) == 10 and int(minor) >= 1
+        #     if not cuda_compatible:
+        #         self.warning(
+        #             f"{self.NAME} requires CUDA version 10.1+, does not currently support >=11 or <10.1"
+        #         )
 
         TORCH_MAJOR = int(torch.__version__.split('.')[0])
         TORCH_MINOR = int(torch.__version__.split('.')[1])
@@ -49,4 +53,4 @@ class SparseAttnBuilder(OpBuilder):
             )
 
         return super().is_compatible(
-        ) and deps_compatible and torch_compatible and cuda_compatible
+        ) and deps_compatible and torch_compatible  # and cuda_compatible
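The compatibility probe now keys off torch.version.hip instead of torch.version.cuda. A small self-check showing how the two attributes distinguish the builds; on a ROCm wheel torch.version.hip is a version string and torch.version.cuda is None, and on a CUDA wheel the reverse holds:

import torch

# ROCm wheels of PyTorch expose a HIP version string here; CUDA wheels report
# None (and very old builds may lack the attribute entirely, hence getattr).
hip_version = getattr(torch.version, 'hip', None)
if hip_version is None:
    print('CUDA/CPU build; torch.version.cuda =', torch.version.cuda)
else:
    print('ROCm build; torch.version.hip =', hip_version)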
op_builder/transformer.py
@@ -16,17 +16,30 @@ class TransformerBuilder(CUDAOpBuilder):
     def absolute_name(self):
         return f'deepspeed.ops.transformer.{self.NAME}_op'
 
-    def sources(self):
-        return [
-            'csrc/transformer/ds_transformer_cuda.cpp',
-            'csrc/transformer/cublas_wrappers.cu',
-            'csrc/transformer/transform_kernels.cu',
-            'csrc/transformer/gelu_kernels.cu',
-            'csrc/transformer/dropout_kernels.cu',
-            'csrc/transformer/normalize_kernels.cu',
-            'csrc/transformer/softmax_kernels.cu',
-            'csrc/transformer/general_kernels.cu'
-        ]
+    def sources(self, is_rocm_pytorch):
+        if is_rocm_pytorch:
+            return [
+                'csrc/transformer/hip/ds_transformer_hip.cpp',
+                'csrc/transformer/hip/cublas_wrappers.hip',
+                'csrc/transformer/hip/transform_kernels.hip',
+                'csrc/transformer/hip/gelu_kernels.hip',
+                'csrc/transformer/hip/dropout_kernels.hip',
+                ###don't support
+                #'csrc/transformer/hip/normalize_kernels.hip',
+                #'csrc/transformer/hip/softmax_kernels.hip',
+                'csrc/transformer/hip/general_kernels.hip'
+            ]
+        else:
+            return [
+                'csrc/transformer/ds_transformer_cuda.cpp',
+                'csrc/transformer/cublas_wrappers.cu',
+                'csrc/transformer/transform_kernels.cu',
+                'csrc/transformer/gelu_kernels.cu',
+                'csrc/transformer/dropout_kernels.cu',
+                'csrc/transformer/normalize_kernels.cu',
+                'csrc/transformer/softmax_kernels.cu',
+                'csrc/transformer/general_kernels.cu'
+            ]
 
     def include_paths(self):
         return ['csrc/includes']
@@ -34,14 +47,14 @@ class TransformerBuilder(CUDAOpBuilder):
     def nvcc_args(self):
         args = [
             '-O3',
-            '--use_fast_math',
+            # '--use_fast_math',
             '-std=c++14',
-            '-U__CUDA_NO_HALF_OPERATORS__',
-            '-U__CUDA_NO_HALF_CONVERSIONS__',
-            '-U__CUDA_NO_HALF2_OPERATORS__'
+            # '-U__CUDA_NO_HALF_OPERATORS__',
+            # '-U__CUDA_NO_HALF_CONVERSIONS__',
+            # '-U__CUDA_NO_HALF2_OPERATORS__'
         ]
-        return args + self.compute_capability_args()
+        return args  # + self.compute_capability_args()
 
     def cxx_args(self):
-        return ['-O3', '-std=c++14', '-g', '-Wno-reorder']
+        return ['-O3', '-std=c++14', '-g', '-Wno-reorder', '-Wno-c++11-narrowing']
op_builder/utils.py
@@ -14,5 +14,8 @@ class UtilsBuilder(OpBuilder):
     def absolute_name(self):
         return f'deepspeed.ops.{self.NAME}_op'
 
-    def sources(self):
-        return ['csrc/utils/flatten_unflatten.cpp']
+    def sources(self, is_rocm_pytorch):
+        if is_rocm_pytorch:
+            return ['csrc/utils/hip/flatten_unflatten.cpp']
+        else:
+            return ['csrc/utils/flatten_unflatten.cpp']
requirements/requirements-sparse_attn.txt
-triton==0.2.3
+# triton==0.2.3
requirements/requirements.txt
-torch>=1.2
-torchvision>=0.4.0
+# torch>=1.2
+# torchvision>=0.4.0
 tqdm
 tensorboardX==1.8
 ninja
setup.py
@@ -17,10 +17,16 @@ import time
 try:
     import torch
-    from torch.utils.cpp_extension import BuildExtension
+    from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CppExtension
+    from torch.utils.hipify import hipify_python
 except ImportError:
     raise ImportError('Unable to import torch, please visit https://pytorch.org/ '
                       'to see how to properly install torch on your system.')
 
+###aiss add
+is_rocm_pytorch = False
+if torch.__version__ >= '1.5':
+    from torch.utils.cpp_extension import ROCM_HOME
+    is_rocm_pytorch = True if ((torch.version.hip is not None) and (ROCM_HOME is not None)) else False
+
 from op_builder import ALL_OPS, get_default_compute_capatabilities
@@ -36,12 +42,19 @@ extras_require = {
     'readthedocs': fetch_requirements('requirements/requirements-readthedocs.txt'),
     'dev': fetch_requirements('requirements/requirements-dev.txt'),
 }
 
+###aiss add ################
+if is_rocm_pytorch:
+    print("NOTE: Please manually install torch and torchvision packages for ROCm")
+    #install_requires = fetch_requirements('requirements/requirements-rocm.txt')
+
 # If MPI is available add 1bit-adam requirements
-if torch.cuda.is_available():
-    if shutil.which('ompi_info') or shutil.which('mpiname'):
-        cupy = f"cupy-cuda{torch.version.cuda.replace('.','')[:3]}"
-        extras_require['1bit_adam'].append(cupy)
+##aiss add: cupy 9.0 has been set up manually
+#if torch.cuda.is_available():
+#    if shutil.which('ompi_info') or shutil.which('mpiname'):
+#        cupy = f"cupy-cuda{torch.version.cuda.replace('.','')[:3]}"
+#        print("cupy version: ", cupy)
+#        extras_require['1bit_adam'].append(cupy)
 
 # Make an [all] extra that installs all needed dependencies
 all_extras = set()
@@ -66,10 +79,18 @@ if not torch.cuda.is_available():
         "(compute capabilities 6.0, 6.1, 6.2)")
 if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None:
     os.environ["TORCH_CUDA_ARCH_LIST"] = get_default_compute_capatabilities()
 
+###########aiss add ######################only need run once for convert
+#if is_rocm_pytorch:
+#    import shutil
+#    this_dir = os.path.dirname(os.path.abspath(__file__))
+#    hipify_python.hipify(project_directory=this_dir, output_directory=this_dir, includes="csrc/*",
+#                         show_detailed=True, is_pytorch_extension=True)
+#    print("cuda file has been transformed to hip format!!!")
+
 ext_modules = []
 
 # Default to pre-install kernels to false so we rely on JIT
 BUILD_OP_DEFAULT = int(os.environ.get('DS_BUILD_OPS', 0))
 print(f"DS_BUILD_OPS={BUILD_OP_DEFAULT}")
@@ -94,11 +115,12 @@ for op_name, builder in ALL_OPS.items():
     if op_compatible:
         reqs = builder.python_requirements()
         install_requires += builder.python_requirements()
 
+    ######aiss debug###############
+    print("op_enabled(op_name): , op_compatible: ", op_enabled(op_name), op_compatible)
     # If op install enabled, add builder to extensions
     if op_enabled(op_name) and op_compatible:
         install_ops[op_name] = op_enabled(op_name)
-        ext_modules.append(builder.builder())
+        ext_modules.append(builder.builder(is_rocm_pytorch))
 
 compatible_ops = {op_name: op.is_compatible() for (op_name, op) in ALL_OPS.items()}
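The commented "only need run once" block above is the one-time CUDA to HIP source conversion. Pulled out as a standalone sketch, reusing the same hipify_python call the commit leaves commented out; it assumes it is run as a script from the repository root:

import os
from torch.utils.hipify import hipify_python

# Translate the CUDA sources under csrc/ into HIP in place; afterwards the
# op builders can point at the generated hip/ files.
this_dir = os.path.dirname(os.path.abspath(__file__))
hipify_python.hipify(project_directory=this_dir,
                     output_directory=this_dir,
                     includes="csrc/*",
                     show_detailed=True,
                     is_pytorch_extension=True)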
tests/onebitadam/test_com_reduce_cuda.py (new file, 0 → 100644)

from mpi4py import MPI
import time
import torch
import torch.distributed as dist
import numpy as np
import deepspeed
from deepspeed.runtime.fp16.onebit_adam import OnebitAdam

comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()

#TODO: Detect the hostname we are running on automatically
torch.distributed.init_process_group(backend='nccl',
                                     init_method='tcp://worker-1:2245',
                                     world_size=size,
                                     rank=rank)

dummy_model = [torch.nn.Parameter(torch.ones(10))]
# Set cuda_aware to True to use CUDA buffers for communication
dummy_optim = OnebitAdam(dummy_model, cuda_aware=True)

device = torch.device('cuda', rank % torch.cuda.device_count())


def torch_sim(a):
    a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)
    scale = a.norm() / np.sqrt(a.numel())
    a_compressed = scale * a_sign
    a_sign = None
    worker_error = a - a_compressed
    dist.all_reduce(a_compressed)
    a_compressed.mul_(1 / dist.get_world_size())
    a_server_sign = a_compressed.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)
    a_list = torch.chunk(a_compressed, chunks=dist.get_world_size())
    server_scale = [chunk_a.norm() / np.sqrt(chunk_a.numel()) for chunk_a in a_list]
    a_sign_list = torch.chunk(a_server_sign, dist.get_world_size())
    a_server_compressed = torch.cat(
        [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())])
    rank = dist.get_rank()
    server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank]
    torch.cuda.synchronize()
    torch.distributed.barrier()
    return a_server_compressed, worker_error, server_error


tensor_size = 100 * 2**20
server_size = int(tensor_size / size)
if tensor_size % (8 * size) != 0:
    right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size)))
else:
    right_tensor_size = tensor_size
right_server_size = right_tensor_size // size

# Adding bias to the initialization of the gradient we are communicating
# In order to get rid of the case where some elements in the gradient are too small
a = (torch.rand(tensor_size, device=device) - 0.5) + 0.01 * rank

worker_error = torch.zeros(right_tensor_size, device=device)
server_error = torch.zeros(right_server_size, device=device)

a_torch, worker_error_torch, server_error_torch = torch_sim(a)
torch.cuda.empty_cache()

local_rank = rank % torch.cuda.device_count()
a_after = dummy_optim.Compressed_Allreduce(a,
                                           worker_error,
                                           server_error,
                                           rank,
                                           size,
                                           comm,
                                           local_rank)

threshold = 1e-6
magnitude_threshold = 1e-6
diff_mask = (a_after - a_torch) > threshold
diff_server_mask = torch.chunk(diff_mask, size)[rank]
mpi_server = torch.chunk(a_after, size)[rank] + server_error
torch_server = torch.chunk(a_torch, size)[rank] + server_error_torch

# If the number in the compensated_server_m is too small (e.g 1e-8), then calling sign() might be problematic
# The test would skip those numbers that are too small in compensated_server_m
if torch.sum(diff_server_mask) == 0:
    print('Successfully passed the test for 1bit Adam at Rank {}'.format(rank))
else:
    # Index with the per-rank mask so shapes match the per-rank chunk
    check_mag_mask = mpi_server[diff_server_mask] > magnitude_threshold
    if torch.sum(check_mag_mask) == 0:
        print('Successfully passed the test for 1bit Adam at Rank {}'.format(rank))
    else:
        print('Fails at {} of positions'.format(torch.sum(check_mag_mask)))
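torch_sim() above is the single-node reference for 1-bit Adam's compressed allreduce: a tensor is quantized to scale * sign(a) with scale = ||a||_2 / sqrt(n), and the quantization residual is carried forward as error feedback. A single-process illustration of just that compression step, no MPI required:

import torch

a = torch.randn(8)
# sign()...mul_(2.0) maps {-1, 0, +1} to {-1, +1}, counting zeros as
# positive, exactly as in torch_sim above.
a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)
scale = a.norm() / a.numel() ** 0.5
a_compressed = scale * a_sign
worker_error = a - a_compressed   # error feedback for the next iteration

print('original:  ', a)
print('compressed:', a_compressed)
print('residual norm:', worker_error.norm().item())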
tests/onebitadam/test_com_reduce_host.py (new file, 0 → 100644)

from mpi4py import MPI
import time
import torch
import torch.distributed as dist
import numpy as np
import deepspeed
from deepspeed.runtime.fp16.onebit_adam import OnebitAdam

comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()

#TODO: Detect the hostname we are running on automatically
torch.distributed.init_process_group(backend='nccl',
                                     init_method='tcp://worker-1:2245',
                                     world_size=size,
                                     rank=rank)

dummy_model = [torch.nn.Parameter(torch.ones(10))]
# Set cuda_aware to False to use host buffers for communication
dummy_optim = OnebitAdam(dummy_model, cuda_aware=False)

device = torch.device('cuda', rank % torch.cuda.device_count())


def torch_sim(a):
    a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)
    scale = a.norm() / np.sqrt(a.numel())
    a_compressed = scale * a_sign
    a_sign = None
    worker_error = a - a_compressed
    dist.all_reduce(a_compressed)
    a_compressed.mul_(1 / dist.get_world_size())
    a_server_sign = a_compressed.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)
    a_list = torch.chunk(a_compressed, chunks=dist.get_world_size())
    server_scale = [chunk_a.norm() / np.sqrt(chunk_a.numel()) for chunk_a in a_list]
    a_sign_list = torch.chunk(a_server_sign, dist.get_world_size())
    a_server_compressed = torch.cat(
        [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())])
    rank = dist.get_rank()
    server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank]
    torch.cuda.synchronize()
    torch.distributed.barrier()
    return a_server_compressed, worker_error, server_error


tensor_size = 100 * 2**20
server_size = int(tensor_size / size)
if tensor_size % (8 * size) != 0:
    right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size)))
else:
    right_tensor_size = tensor_size
right_server_size = right_tensor_size // size

# Adding bias to the initialization of the gradient we are communicating
# In order to get rid of the case where some elements in the gradient are too small
a = (torch.rand(tensor_size, device=device) - 0.5) + 0.01 * rank

worker_error = torch.zeros(right_tensor_size, device=device)
server_error = torch.zeros(right_server_size, device=device)

a_torch, worker_error_torch, server_error_torch = torch_sim(a)
torch.cuda.empty_cache()

local_rank = rank % torch.cuda.device_count()
a_after = dummy_optim.Compressed_Allreduce(a,
                                           worker_error,
                                           server_error,
                                           rank,
                                           size,
                                           comm,
                                           local_rank)

threshold = 1e-6
magnitude_threshold = 1e-6
diff_mask = (a_after - a_torch) > threshold
diff_server_mask = torch.chunk(diff_mask, size)[rank]
mpi_server = torch.chunk(a_after, size)[rank] + server_error
torch_server = torch.chunk(a_torch, size)[rank] + server_error_torch

# If the number in the compensated_server_m is too small (e.g 1e-8), then calling sign() might be problematic
# The test would skip those numbers that are too small in compensated_server_m
if torch.sum(diff_server_mask) == 0:
    print('Successfully passed the test for 1bit Adam at Rank {}'.format(rank))
else:
    # Index with the per-rank mask so shapes match the per-rank chunk
    check_mag_mask = mpi_server[diff_server_mask] > magnitude_threshold
    if torch.sum(check_mag_mask) == 0:
        print('Successfully passed the test for 1bit Adam at Rank {}'.format(rank))
    else:
        print('Fails at {} of positions'.format(torch.sum(check_mag_mask)))
tests/onebitadam/test_server_error.py (new file, 0 → 100644)

from mpi4py import MPI
import time
import torch
import torch.distributed as dist
import numpy as np
import deepspeed
from deepspeed.runtime.fp16.onebit_adam import OnebitAdam

comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()

torch.distributed.init_process_group(backend='nccl',
                                     init_method='tcp://worker-0:2245',
                                     world_size=size,
                                     rank=rank)

dummy_model = [torch.nn.Parameter(torch.ones(10))]
dummy_optim = OnebitAdam(dummy_model, cuda_aware=False)

device = torch.device('cuda', rank % torch.cuda.device_count())


def torch_sim(a):
    a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)
    scale = a.norm() / np.sqrt(a.numel())
    a_compressed = scale * a_sign
    a_sign = None
    worker_error = a - a_compressed
    dist.all_reduce(a_compressed)
    a_compressed.mul_(1 / dist.get_world_size())
    a_server_sign = a_compressed.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)
    a_list = torch.chunk(a_compressed, chunks=dist.get_world_size())
    server_scale = [chunk_a.norm() / np.sqrt(chunk_a.numel()) for chunk_a in a_list]
    a_sign_list = torch.chunk(a_server_sign, dist.get_world_size())
    a_server_compressed = torch.cat(
        [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())])
    rank = dist.get_rank()
    server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank]
    torch.cuda.synchronize()
    torch.distributed.barrier()
    return a_server_compressed, worker_error, server_error


# Input Tensor size
tensor_size = 100 * 2**20
server_size = int(tensor_size / size)
if tensor_size % (8 * size) != 0:
    right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size)))
else:
    right_tensor_size = tensor_size
right_server_size = right_tensor_size // size

# The -0.5 is required for avoiding sign flips/errors
a = torch.rand(tensor_size, device=device) - 0.5

worker_error = torch.zeros(right_tensor_size, device=device)
server_error = torch.zeros(right_server_size, device=device)

a_torch, worker_error_torch, server_error_torch = torch_sim(a)
torch.cuda.empty_cache()

local_rank = rank % torch.cuda.device_count()

# Test the 1-bit Adam optimizer
a_after = dummy_optim.Compressed_Allreduce(a,
                                           worker_error,
                                           server_error,
                                           rank,
                                           size,
                                           comm,
                                           local_rank)

# If the error is below the threshold, it is acceptable for training
threshold = 1e-6
diff_pos = ((a_after - a_torch) > threshold)

if rank == 0:
    before_diff = torch.chunk(a_after - a_torch,
                              size)[rank] + server_error - server_error_torch
    if torch.norm(before_diff) / torch.norm(torch.chunk(a_after, size)[rank]) < threshold:
        print('Successfully passed the test')
    else:
        print('The difference for the tensor before allgather is {}'.format(
            torch.norm(before_diff)))
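All three tests round the payload up to a multiple of 8 * size before allocating the error buffers, presumably so the 1-bit values pack into whole bytes and split evenly across ranks. The same arithmetic, worked for an example world size of 3:

# right_tensor_size rounds 100 * 2**20 elements up to a multiple of 8 * size.
tensor_size = 100 * 2**20            # 104857600
size = 3                             # example world size
if tensor_size % (8 * size) != 0:
    right_tensor_size = tensor_size + (8 * size - tensor_size % (8 * size))
else:
    right_tensor_size = tensor_size
right_server_size = right_tensor_size // size

print(right_tensor_size)             # 104857608, divisible by 24
print(right_server_size)             # 34952536 elements per rank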
tests/unit/test_pipe.py
@@ -169,7 +169,6 @@ def train_cifar(model, args, num_steps=400, average_dp_losses=True, fp16=True, s
     return losses
 
 
-@pytest.mark.skip(reason="been seeing nondeterministic failures, skipping for now")
 @pytest.mark.parametrize('topo',
                          [
                              PipeTopo(num_pp=1,