OpenDAS / deepspeed

Commit eadbbe09, authored Apr 25, 2021 by 401qingkong
push rocm deepspeed v0.3.13
parent ab5534fc

Changes: 155 files in the commit. This page shows 15 changed files with 402 additions and 73 deletions (+402 / -73).
install.sh                                   +3   -1
op_builder/builder.py                        +5   -5
op_builder/cpu_adam.py                       +36  -14
op_builder/fused_adam.py                     +7   -4
op_builder/fused_lamb.py                     +7   -5
op_builder/sparse_attn.py                    +17  -13
op_builder/transformer.py                    +30  -17
op_builder/utils.py                          +5   -2
requirements/requirements-sparse_attn.txt    +1   -1
requirements/requirements.txt                +2   -2
setup.py                                     +30  -8
tests/onebitadam/test_com_reduce_cuda.py     +86  -0
tests/onebitadam/test_com_reduce_host.py     +86  -0
tests/onebitadam/test_server_error.py        +87  -0
tests/unit/test_pipe.py                      +0   -1
install.sh
@@ -152,7 +152,9 @@ if [ ! -f $hostfile ]; then
 fi

 echo "Building deepspeed wheel"
-python setup.py $VERBOSE bdist_wheel
+###aiss add
+CXX=hipcc CC=hipcc DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_FUSED_LAMB=1 DS_BUILD_SPARSE_ATTN=1 DS_BUILD_TRANSFORMER=1 DS_BUILD_STOCHASTIC_TRANSFORMER=1 DS_BUILD_UTILS=1 python3 setup.py $VERBOSE bdist_wheel
+#CXX=hipcc CC=hipcc DS_BUILD_UTILS=1 python3 setup.py $VERBOSE bdist_wheel

 if [ "$local_only" == "1" ]; then
   echo "Installing deepspeed"
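Note: the rewritten build line switches the host compiler to hipcc and opts each kernel in explicitly through DS_BUILD_* environment variables instead of relying on the DS_BUILD_OPS default. A minimal illustrative sketch (not the setup.py implementation) of how such per-op flags can be read, assuming the same DS_BUILD_OPS fallback that setup.py prints later in this commit:

    import os

    # Illustrative only: mirrors the idea of DS_BUILD_* flags gating which ops
    # are pre-compiled, with DS_BUILD_OPS as the global default.
    BUILD_OP_DEFAULT = int(os.environ.get('DS_BUILD_OPS', '0'))

    def build_flag_enabled(env_var: str) -> bool:
        """Return True when e.g. DS_BUILD_CPU_ADAM=1 is exported for the build."""
        return bool(int(os.environ.get(env_var, BUILD_OP_DEFAULT)))

    if __name__ == '__main__':
        for var in ('DS_BUILD_CPU_ADAM', 'DS_BUILD_FUSED_ADAM', 'DS_BUILD_SPARSE_ATTN'):
            print(var, '->', build_flag_enabled(var))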
op_builder/builder.py
@@ -194,10 +194,10 @@ class OpBuilder(ABC):
         else:
             return os.path.join(Path(__file__).parent.parent.absolute(), code_path)

-    def builder(self):
+    def builder(self, is_rocm_pytorch):
         from torch.utils.cpp_extension import CppExtension
         return CppExtension(name=self.absolute_name(),
-                            sources=self.sources(),
+                            sources=self.sources(is_rocm_pytorch),
                             include_dirs=self.include_paths(),
                             extra_compile_args={'cxx': self.cxx_args()},
                             extra_link_args=self.extra_ldflags())
@@ -328,11 +328,11 @@ class CUDAOpBuilder(OpBuilder):
     def is_compatible(self):
         return super().is_compatible()

-    def builder(self):
+    def builder(self, is_rocm_pytorch):
         from torch.utils.cpp_extension import CUDAExtension
-        assert_no_cuda_mismatch()
+        # assert_no_cuda_mismatch()
         return CUDAExtension(name=self.absolute_name(),
-                             sources=self.sources(),
+                             sources=self.sources(is_rocm_pytorch),
                              include_dirs=self.include_paths(),
                              extra_compile_args={'cxx': self.cxx_args(),
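Both builder() variants now take an is_rocm_pytorch flag and forward it to sources(), which is the pattern every op builder below follows. A small self-contained sketch of that dispatch (the class name and paths are placeholders, not the real builders):

    # Placeholder sketch of the sources(is_rocm_pytorch) dispatch used by the
    # op builders in this commit: ROCm builds pick hipified files under hip/.
    class ExampleOpBuilder:
        NAME = 'example_op'

        def sources(self, is_rocm_pytorch):
            if is_rocm_pytorch:
                return ['csrc/example/hip/frontend.cpp', 'csrc/example/hip/kernel.hip']
            else:
                return ['csrc/example/frontend.cpp', 'csrc/example/kernel.cu']

    if __name__ == '__main__':
        builder = ExampleOpBuilder()
        print(builder.sources(is_rocm_pytorch=True))
        print(builder.sources(is_rocm_pytorch=False))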
op_builder/cpu_adam.py
@@ -17,12 +17,16 @@ class CPUAdamBuilder(CUDAOpBuilder):
     def absolute_name(self):
         return f'deepspeed.ops.adam.{self.NAME}_op'

-    def sources(self):
-        return ['csrc/adam/cpu_adam.cpp', 'csrc/adam/custom_cuda_kernel.cu']
+    def sources(self, is_rocm_pytorch):
+        if is_rocm_pytorch:
+            return ['csrc/adam/hip/cpu_adam.cpp', 'csrc/adam/hip/custom_hip_kernel.hip']
+        else:
+            return ['csrc/adam/cpu_adam.cpp', 'csrc/adam/custom_cuda_kernel.cu']

     def include_paths(self):
-        CUDA_INCLUDE = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")
-        return ['csrc/includes', CUDA_INCLUDE]
+        #CUDA_INCLUDE = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")
+        #return ['csrc/includes', CUDA_INCLUDE]
+        return ['csrc/includes/', '/opt/rocm-3.9.1/include/']

     def simd_width(self):
         if not self.command_exists('lscpu'):
@@ -42,30 +46,48 @@ class CPUAdamBuilder(CUDAOpBuilder):
             return '-D__SCALAR__'

     def cxx_args(self):
-        CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64")
+        # CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64")
         SIMD_WIDTH = self.simd_width()
+        #return [
+        #    '-O3',
+        #    '-std=c++14',
+        #    f'-L{CUDA_LIB64}',
+        #    '-lcudart',
+        #    '-lcublas',
+        #    '-g',
+        #    '-Wno-reorder',
+        #    '-march=native',
+        #    '-fopenmp',
+        #    SIMD_WIDTH
+        #]
         return [
             '-O3',
             '-std=c++14',
             f'-L{CUDA_LIB64}',
             '-lcudart',
             '-lcublas',
+            '-lrocblas',
             '-g',
             '-Wno-reorder',
             '-march=native',
             '-fopenmp',
+            '-lpthread',
             SIMD_WIDTH
         ]

     def nvcc_args(self):
+        #args = [
+        #    '-O3',
+        #    '--use_fast_math',
+        #    '-std=c++14',
+        #    '-U__CUDA_NO_HALF_OPERATORS__',
+        #    '-U__CUDA_NO_HALF_CONVERSIONS__',
+        #    '-U__CUDA_NO_HALF2_OPERATORS__'
+        #]
         args = [
             '-O3',
-            '--use_fast_math',
+            #'--use_fast_math',
+            '-fopenmp',
+            '-lpthread',
             '-std=c++14',
             '-U__CUDA_NO_HALF_OPERATORS__',
             '-U__CUDA_NO_HALF_CONVERSIONS__',
             '-U__CUDA_NO_HALF2_OPERATORS__'
         ]
-        args += self.compute_capability_args()
+        #args += self.compute_capability_args()
         return args
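include_paths() now hard-codes '/opt/rocm-3.9.1/include/', which ties the build to one specific ROCm install location. A hedged alternative sketch that derives the include directory from the environment instead (the ROCM_HOME variable and /opt/rocm fallback are assumptions, not part of the commit):

    import os

    # Sketch: resolve the ROCm include directory at build time rather than
    # hard-coding /opt/rocm-3.9.1/include/. ROCM_HOME is the conventional
    # environment variable; /opt/rocm is only an assumed fallback.
    rocm_home = os.environ.get('ROCM_HOME', '/opt/rocm')
    rocm_include = os.path.join(rocm_home, 'include')
    print(['csrc/includes/', rocm_include])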
op_builder/fused_adam.py
@@ -15,8 +15,11 @@ class FusedAdamBuilder(CUDAOpBuilder):
     def absolute_name(self):
         return f'deepspeed.ops.adam.{self.NAME}_op'

-    def sources(self):
-        return ['csrc/adam/fused_adam_frontend.cpp', 'csrc/adam/multi_tensor_adam.cu']
+    def sources(self, is_rocm_pytorch):
+        if is_rocm_pytorch:
+            return ['csrc/adam/hip/fused_adam_frontend.cpp', 'csrc/adam/hip/multi_tensor_adam.hip']
+        else:
+            return ['csrc/adam/fused_adam_frontend.cpp', 'csrc/adam/multi_tensor_adam.cu']

     def include_paths(self):
         return ['csrc/includes']
@@ -27,5 +30,5 @@ class FusedAdamBuilder(CUDAOpBuilder):
     def nvcc_args(self):
-        return ['-lineinfo', '-O3', '--use_fast_math'] + self.version_dependent_macros() + self.compute_capability_args()
+        return ['-lineinfo', '-O3', '--use_fast_math']
+        #       '--use_fast_math'] # + self.version_dependent_macros() + self.compute_capability_args()
op_builder/fused_lamb.py
@@ -15,9 +15,11 @@ class FusedLambBuilder(CUDAOpBuilder):
     def absolute_name(self):
         return f'deepspeed.ops.lamb.{self.NAME}_op'

-    def sources(self):
-        return ['csrc/lamb/fused_lamb_cuda.cpp', 'csrc/lamb/fused_lamb_cuda_kernel.cu']
+    def sources(self, is_rocm_pytorch):
+        if is_rocm_pytorch:
+            return ['csrc/lamb/hip/fused_lamb_hip.cpp', 'csrc/lamb/hip/fused_lamb_hip_kernel.hip']
+        else:
+            return ['csrc/lamb/fused_lamb_cuda.cpp', 'csrc/lamb/fused_lamb_cuda_kernel.cu']

     def include_paths(self):
         return ['csrc/includes']
@@ -27,5 +29,5 @@ class FusedLambBuilder(CUDAOpBuilder):
     def nvcc_args(self):
-        return ['-lineinfo', '-O3', '--use_fast_math'] + self.version_dependent_macros() + self.compute_capability_args()
+        return ['-lineinfo', '-O3', '--use_fast_math']
+        #       '--use_fast_math'] # + self.version_dependent_macros() + self.compute_capability_args()
op_builder/sparse_attn.py
@@ -16,29 +16,33 @@ class SparseAttnBuilder(OpBuilder):
     def absolute_name(self):
         return f'deepspeed.ops.sparse_attention.{self.NAME}_op'

-    def sources(self):
-        return ['csrc/sparse_attention/utils.cpp']
+    def sources(self, is_rocm_pytorch):
+        if is_rocm_pytorch:
+            return ['csrc/sparse_attention/hip/utils.cpp']
+        else:
+            return ['csrc/sparse_attention/utils.cpp']

     def cxx_args(self):
         return ['-O2', '-fopenmp']

     def is_compatible(self):
         # Check to see if llvm and cmake are installed since they are dependencies
-        required_commands = ['llvm-config|llvm-config-9', 'cmake']
+        #required_commands = ['llvm-config|llvm-config-9', 'cmake']
+        required_commands = ['cmake']
         command_status = list(map(self.command_exists, required_commands))
         deps_compatible = all(command_status)

         # torch-cpu will not have a cuda version
-        if torch.version.cuda is None:
+        if torch.version.hip is None:
             cuda_compatible = False
             self.warning(f"{self.NAME} cuda is not available from torch")
-        else:
-            major, minor = torch.version.cuda.split('.')[:2]
-            cuda_compatible = int(major) == 10 and int(minor) >= 1
-            if not cuda_compatible:
-                self.warning(
-                    f"{self.NAME} requires CUDA version 10.1+, does not currently support >=11 or <10.1"
-                )
+        #else:
+        #    major, minor = torch.version.cuda.split('.')[:2]
+        #    cuda_compatible = int(major) == 10 and int(minor) >= 1
+        #    if not cuda_compatible:
+        #        self.warning(
+        #            f"{self.NAME} requires CUDA version 10.1+, does not currently support >=11 or <10.1"
+        #        )

         TORCH_MAJOR = int(torch.__version__.split('.')[0])
         TORCH_MINOR = int(torch.__version__.split('.')[1])
@@ -49,4 +53,4 @@ class SparseAttnBuilder(OpBuilder):
             )
         return super().is_compatible(
-        ) and deps_compatible and torch_compatible and cuda_compatible
+        ) and deps_compatible and torch_compatible # and cuda_compatible
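The compatibility check now keys off torch.version.hip rather than torch.version.cuda, and the CUDA 10.1 version gate is commented out along with the cuda_compatible term in the final return. A small standalone check of the same build-detection idea (safe to run on any PyTorch install):

    import torch

    # torch.version.cuda is None on CPU-only and ROCm builds; ROCm builds of
    # PyTorch (>= 1.5) expose torch.version.hip instead. getattr guards older
    # versions that predate the hip attribute.
    if getattr(torch.version, 'hip', None) is not None:
        print('ROCm build of PyTorch, HIP version:', torch.version.hip)
    elif torch.version.cuda is not None:
        print('CUDA build of PyTorch, CUDA version:', torch.version.cuda)
    else:
        print('CPU-only build of PyTorch')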
op_builder/transformer.py
@@ -16,17 +16,30 @@ class TransformerBuilder(CUDAOpBuilder):
     def absolute_name(self):
         return f'deepspeed.ops.transformer.{self.NAME}_op'

-    def sources(self):
-        return [
-            'csrc/transformer/ds_transformer_cuda.cpp',
-            'csrc/transformer/cublas_wrappers.cu',
-            'csrc/transformer/transform_kernels.cu',
-            'csrc/transformer/gelu_kernels.cu',
-            'csrc/transformer/dropout_kernels.cu',
-            'csrc/transformer/normalize_kernels.cu',
-            'csrc/transformer/softmax_kernels.cu',
-            'csrc/transformer/general_kernels.cu'
-        ]
+    def sources(self, is_rocm_pytorch):
+        if is_rocm_pytorch:
+            return [
+                'csrc/transformer/hip/ds_transformer_hip.cpp',
+                'csrc/transformer/hip/cublas_wrappers.hip',
+                'csrc/transformer/hip/transform_kernels.hip',
+                'csrc/transformer/hip/gelu_kernels.hip',
+                'csrc/transformer/hip/dropout_kernels.hip',
+                ###don't support
+                #'csrc/transformer/hip/normalize_kernels.hip',
+                #'csrc/transformer/hip/softmax_kernels.hip',
+                'csrc/transformer/hip/general_kernels.hip'
+            ]
+        else:
+            return [
+                'csrc/transformer/ds_transformer_cuda.cpp',
+                'csrc/transformer/cublas_wrappers.cu',
+                'csrc/transformer/transform_kernels.cu',
+                'csrc/transformer/gelu_kernels.cu',
+                'csrc/transformer/dropout_kernels.cu',
+                'csrc/transformer/normalize_kernels.cu',
+                'csrc/transformer/softmax_kernels.cu',
+                'csrc/transformer/general_kernels.cu'
+            ]

     def include_paths(self):
         return ['csrc/includes']
@@ -34,14 +47,14 @@ class TransformerBuilder(CUDAOpBuilder):
     def nvcc_args(self):
         args = [
             '-O3',
-            '--use_fast_math',
+            #'--use_fast_math',
             '-std=c++14',
-            '-U__CUDA_NO_HALF_OPERATORS__',
-            '-U__CUDA_NO_HALF_CONVERSIONS__',
-            '-U__CUDA_NO_HALF2_OPERATORS__'
+            #'-U__CUDA_NO_HALF_OPERATORS__',
+            #'-U__CUDA_NO_HALF_CONVERSIONS__',
+            #'-U__CUDA_NO_HALF2_OPERATORS__'
         ]
-        return args + self.compute_capability_args()
+        return args # + self.compute_capability_args()

     def cxx_args(self):
-        return ['-O3', '-std=c++14', '-g', '-Wno-reorder']
+        return ['-O3', '-std=c++14', '-g', '-Wno-reorder', '-Wno-c++11-narrowing']
op_builder/utils.py
@@ -14,5 +14,8 @@ class UtilsBuilder(OpBuilder):
     def absolute_name(self):
         return f'deepspeed.ops.{self.NAME}_op'

-    def sources(self):
-        return ['csrc/utils/flatten_unflatten.cpp']
+    def sources(self, is_rocm_pytorch):
+        if is_rocm_pytorch:
+            return ['csrc/utils/hip/flatten_unflatten.cpp']
+        else:
+            return ['csrc/utils/flatten_unflatten.cpp']
requirements/requirements-sparse_attn.txt
-triton==0.2.3
+# triton==0.2.3
requirements/requirements.txt
-torch>=1.2
-torchvision>=0.4.0
+# torch>=1.2
+# torchvision>=0.4.0
 tqdm
 tensorboardX==1.8
 ninja
setup.py
@@ -17,10 +17,16 @@ import time
 try:
     import torch
-    from torch.utils.cpp_extension import BuildExtension
+    from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CppExtension
+    from torch.utils.hipify import hipify_python
 except ImportError:
     raise ImportError('Unable to import torch, please visit https://pytorch.org/ '
                       'to see how to properly install torch on your system.')

+###aiss add
+is_rocm_pytorch = False
+if torch.__version__ >= '1.5':
+    from torch.utils.cpp_extension import ROCM_HOME
+    is_rocm_pytorch = True if ((torch.version.hip is not None) and (ROCM_HOME is not None)) else False
+
 from op_builder import ALL_OPS, get_default_compute_capatabilities
@@ -36,12 +42,19 @@ extras_require = {
     'readthedocs': fetch_requirements('requirements/requirements-readthedocs.txt'),
     'dev': fetch_requirements('requirements/requirements-dev.txt'),
 }

+###aiss add ################
+if is_rocm_pytorch:
+    print("NOTE: Please manually install torch and torchvision packages for ROCm")
+    #install_requires = fetch_requirements('requirements/requirements-rocm.txt')
+
 # If MPI is available add 1bit-adam requirements
-if torch.cuda.is_available():
-    if shutil.which('ompi_info') or shutil.which('mpiname'):
-        cupy = f"cupy-cuda{torch.version.cuda.replace('.','')[:3]}"
-        extras_require['1bit_adam'].append(cupy)
+##aiss add: cupy 9.0 has been setup manufully
+#if torch.cuda.is_available():
+#    if shutil.which('ompi_info') or shutil.which('mpiname'):
+#        cupy = f"cupy-cuda{torch.version.cuda.replace('.','')[:3]}"
+#        print("cupy version: ", cupy)
+#        extras_require['1bit_adam'].append(cupy)

 # Make an [all] extra that installs all needed dependencies
 all_extras = set()
@@ -66,10 +79,18 @@ if not torch.cuda.is_available():
                   "(compute capabilities 6.0, 6.1, 6.2)")
     if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None:
         os.environ["TORCH_CUDA_ARCH_LIST"] = get_default_compute_capatabilities()

+###########aiss add ######################only need run once for convert
+#if is_rocm_pytorch:
+#    import shutil
+#    this_dir = os.path.dirname(os.path.abspath(__file__))
+#    hipify_python.hipify(project_directory=this_dir, output_directory=this_dir, includes="csrc/*",
+#                         show_detailed=True, is_pytorch_extension=True)
+#    print("cuda file has been transformed to hip format!!!")
+
 ext_modules = []

 # Default to pre-install kernels to false so we rely on JIT
 BUILD_OP_DEFAULT = int(os.environ.get('DS_BUILD_OPS', 0))
 print(f"DS_BUILD_OPS={BUILD_OP_DEFAULT}")
@@ -94,11 +115,12 @@ for op_name, builder in ALL_OPS.items():
     if op_compatible:
         reqs = builder.python_requirements()
         install_requires += builder.python_requirements()

+    ######aiss debug###############
+    print("op_enabled(op_name): , op_compatible: ", op_enabled(op_name), op_compatible)
     # If op install enabled, add builder to extensions
     if op_enabled(op_name) and op_compatible:
         install_ops[op_name] = op_enabled(op_name)
-        ext_modules.append(builder.builder())
+        ext_modules.append(builder.builder(is_rocm_pytorch))

 compatible_ops = {op_name: op.is_compatible() for (op_name, op) in ALL_OPS.items()}
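setup.py keeps the CUDA-to-HIP conversion commented out because, per the "only need run once for convert" note, it only has to run once before the hipified csrc/*/hip files are committed. A sketch of running that one-time conversion manually, mirroring the commented hipify_python.hipify(...) call from the hunk above (arguments are copied from the comment; run it from the repository root, and treat it as a sketch rather than a supported entry point):

    import os
    from torch.utils.hipify import hipify_python

    # One-time CUDA -> HIP conversion, mirroring the call commented out in setup.py.
    this_dir = os.path.dirname(os.path.abspath(__file__))
    hipify_python.hipify(project_directory=this_dir,
                         output_directory=this_dir,
                         includes="csrc/*",
                         show_detailed=True,
                         is_pytorch_extension=True)
    print("cuda file has been transformed to hip format!!!")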
tests/onebitadam/test_com_reduce_cuda.py
new file mode 100644

from mpi4py import MPI
import time
import torch
import torch.distributed as dist
import numpy as np
import deepspeed

from deepspeed.runtime.fp16.onebit_adam import OnebitAdam

comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()

#TODO: Detect the hostname we are running on automatically
torch.distributed.init_process_group(backend='nccl',
                                     init_method='tcp://worker-1:2245',
                                     world_size=size,
                                     rank=rank)

dummy_model = [torch.nn.Parameter(torch.ones(10))]
# Set cuda_aware to True to use CUDA buffers for communication
dummy_optim = OnebitAdam(dummy_model, cuda_aware=True)

device = torch.device('cuda', rank % torch.cuda.device_count())


def torch_sim(a):
    a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)
    scale = a.norm() / np.sqrt(a.numel())
    a_compressed = scale * a_sign
    a_sign = None
    worker_error = a - a_compressed
    dist.all_reduce(a_compressed)
    a_compressed.mul_(1 / dist.get_world_size())
    a_server_sign = a_compressed.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)
    a_list = torch.chunk(a_compressed, chunks=dist.get_world_size())
    server_scale = [chunk_a.norm() / np.sqrt(chunk_a.numel()) for chunk_a in a_list]
    a_sign_list = torch.chunk(a_server_sign, dist.get_world_size())
    a_server_compressed = torch.cat(
        [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())])
    rank = dist.get_rank()
    server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank]
    torch.cuda.synchronize()
    torch.distributed.barrier()
    return a_server_compressed, worker_error, server_error


tensor_size = 100 * 2**20
server_size = int(tensor_size / size)
if tensor_size % (8 * size) != 0:
    right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size)))
else:
    right_tensor_size = tensor_size
right_server_size = right_tensor_size // size

# Adding bias to the initialization of the gradient we are communicating
# In order to get rid of the case where some elements in the gradient are too small
a = (torch.rand(tensor_size, device=device) - 0.5) + 0.01 * rank

worker_error = torch.zeros(right_tensor_size, device=device)
server_error = torch.zeros(right_server_size, device=device)

a_torch, worker_error_torch, server_error_torch = torch_sim(a)
torch.cuda.empty_cache()

local_rank = rank % torch.cuda.device_count()
a_after = dummy_optim.Compressed_Allreduce(a,
                                           worker_error,
                                           server_error,
                                           rank,
                                           size,
                                           comm,
                                           local_rank)

threshold = 1e-6
magnitude_threshold = 1e-6
diff_mask = (a_after - a_torch) > threshold
diff_server_mask = torch.chunk(diff_mask, size)[rank]
mpi_server = torch.chunk(a_after, size)[rank] + server_error
torch_server = torch.chunk(a_torch, size)[rank] + server_error_torch

# If the number in the compensated_server_m is too small (e.g 1e-8), then calling sign() might be problematic
# The test would skip those numbers that are too small in compensated_server_m
if torch.sum(diff_server_mask) == 0:
    print('Successfully passed the test for 1bit Adam at Rank {}'.format(rank))
else:
    check_mag_mask = mpi_server[diff_mask] > magnitude_threshold
    if torch.sum(check_mag_mask) == 0:
        print('Successfully passed the test for 1bit Adam at Rank {}'.format(rank))
    else:
        print('Fails at {} of positions'.format(torch.sum(check_mag_mask)))
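The three onebitadam tests all compare OnebitAdam.Compressed_Allreduce against torch_sim, which models 1-bit compression with error feedback: a tensor is reduced to its sign scaled by norm(a) / sqrt(numel), and the residual is carried forward as the worker/server error. A single-process sketch of just that local compression step (no MPI, no NCCL), using the same sign trick as torch_sim:

    import torch

    # Local 1-bit compression + error feedback, as modeled in torch_sim():
    # compressed = (||a|| / sqrt(n)) * sign(a), residual kept for the next step.
    a = torch.rand(1024) - 0.5
    a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)  # negatives -> -1, zeros/positives -> +1
    scale = a.norm() / a.numel() ** 0.5
    a_compressed = scale * a_sign
    worker_error = a - a_compressed

    print('scale:', scale.item())
    print('compression error norm:', worker_error.norm().item())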
tests/onebitadam/test_com_reduce_host.py
new file mode 100644

from mpi4py import MPI
import time
import torch
import torch.distributed as dist
import numpy as np
import deepspeed

from deepspeed.runtime.fp16.onebit_adam import OnebitAdam

comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()

#TODO: Detect the hostname we are running on automatically
torch.distributed.init_process_group(backend='nccl',
                                     init_method='tcp://worker-1:2245',
                                     world_size=size,
                                     rank=rank)

dummy_model = [torch.nn.Parameter(torch.ones(10))]
# Set cuda_aware to False to use host buffers for communication
dummy_optim = OnebitAdam(dummy_model, cuda_aware=False)

device = torch.device('cuda', rank % torch.cuda.device_count())


def torch_sim(a):
    a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)
    scale = a.norm() / np.sqrt(a.numel())
    a_compressed = scale * a_sign
    a_sign = None
    worker_error = a - a_compressed
    dist.all_reduce(a_compressed)
    a_compressed.mul_(1 / dist.get_world_size())
    a_server_sign = a_compressed.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)
    a_list = torch.chunk(a_compressed, chunks=dist.get_world_size())
    server_scale = [chunk_a.norm() / np.sqrt(chunk_a.numel()) for chunk_a in a_list]
    a_sign_list = torch.chunk(a_server_sign, dist.get_world_size())
    a_server_compressed = torch.cat(
        [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())])
    rank = dist.get_rank()
    server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank]
    torch.cuda.synchronize()
    torch.distributed.barrier()
    return a_server_compressed, worker_error, server_error


tensor_size = 100 * 2**20
server_size = int(tensor_size / size)
if tensor_size % (8 * size) != 0:
    right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size)))
else:
    right_tensor_size = tensor_size
right_server_size = right_tensor_size // size

# Adding bias to the initialization of the gradient we are communicating
# In order to get rid of the case where some elements in the gradient are too small
a = (torch.rand(tensor_size, device=device) - 0.5) + 0.01 * rank

worker_error = torch.zeros(right_tensor_size, device=device)
server_error = torch.zeros(right_server_size, device=device)

a_torch, worker_error_torch, server_error_torch = torch_sim(a)
torch.cuda.empty_cache()

local_rank = rank % torch.cuda.device_count()
a_after = dummy_optim.Compressed_Allreduce(a,
                                           worker_error,
                                           server_error,
                                           rank,
                                           size,
                                           comm,
                                           local_rank)

threshold = 1e-6
magnitude_threshold = 1e-6
diff_mask = (a_after - a_torch) > threshold
diff_server_mask = torch.chunk(diff_mask, size)[rank]
mpi_server = torch.chunk(a_after, size)[rank] + server_error
torch_server = torch.chunk(a_torch, size)[rank] + server_error_torch

# If the number in the compensated_server_m is too small (e.g 1e-8), then calling sign() might be problematic
# The test would skip those numbers that are too small in compensated_server_m
if torch.sum(diff_server_mask) == 0:
    print('Successfully passed the test for 1bit Adam at Rank {}'.format(rank))
else:
    check_mag_mask = mpi_server[diff_mask] > magnitude_threshold
    if torch.sum(check_mag_mask) == 0:
        print('Successfully passed the test for 1bit Adam at Rank {}'.format(rank))
    else:
        print('Fails at {} of positions'.format(torch.sum(check_mag_mask)))
tests/onebitadam/test_server_error.py
new file mode 100644

from mpi4py import MPI
import time
import torch
import torch.distributed as dist
import numpy as np
import deepspeed

from deepspeed.runtime.fp16.onebit_adam import OnebitAdam

comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()

torch.distributed.init_process_group(backend='nccl',
                                     init_method='tcp://worker-0:2245',
                                     world_size=size,
                                     rank=rank)

dummy_model = [torch.nn.Parameter(torch.ones(10))]
dummy_optim = OnebitAdam(dummy_model, cuda_aware=False)

device = torch.device('cuda', rank % torch.cuda.device_count())


def torch_sim(a):
    a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)
    scale = a.norm() / np.sqrt(a.numel())
    a_compressed = scale * a_sign
    a_sign = None
    worker_error = a - a_compressed
    dist.all_reduce(a_compressed)
    a_compressed.mul_(1 / dist.get_world_size())
    a_server_sign = a_compressed.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)
    a_list = torch.chunk(a_compressed, chunks=dist.get_world_size())
    server_scale = [chunk_a.norm() / np.sqrt(chunk_a.numel()) for chunk_a in a_list]
    a_sign_list = torch.chunk(a_server_sign, dist.get_world_size())
    a_server_compressed = torch.cat(
        [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())])
    rank = dist.get_rank()
    server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank]
    torch.cuda.synchronize()
    torch.distributed.barrier()
    return a_server_compressed, worker_error, server_error


# Input Tensor size
tensor_size = 100 * 2**20
server_size = int(tensor_size / size)
if tensor_size % (8 * size) != 0:
    right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size)))
else:
    right_tensor_size = tensor_size
right_server_size = right_tensor_size // size

# The -0.5 is required for avoiding sign flips/errors
a = torch.rand(tensor_size, device=device) - 0.5

worker_error = torch.zeros(right_tensor_size, device=device)
server_error = torch.zeros(right_server_size, device=device)

a_torch, worker_error_torch, server_error_torch = torch_sim(a)
torch.cuda.empty_cache()

local_rank = rank % torch.cuda.device_count()

# Test the 1-bit Adam optimizer
a_after = dummy_optim.Compressed_Allreduce(a,
                                           worker_error,
                                           server_error,
                                           rank,
                                           size,
                                           comm,
                                           local_rank)

# If the error is below the threshold, it is acceptable for training
threshold = 1e-6

diff_pos = ((a_after - a_torch) > threshold)

if rank == 0:
    before_diff = torch.chunk(a_after - a_torch,
                              size)[rank] + server_error - server_error_torch
    if torch.norm(before_diff) / torch.norm(torch.chunk(a_after, size)[rank]) < threshold:
        print('Successfully passed the test')
    else:
        print('The difference for the tensor before allgather is {}'.format(
            torch.norm(before_diff)))
tests/unit/test_pipe.py
@@ -169,7 +169,6 @@ def train_cifar(model, args, num_steps=400, average_dp_losses=True, fp16=True, s
     return losses

-@pytest.mark.skip(reason="been seeing nondeterministic failures, skipping for now")
 @pytest.mark.parametrize('topo',
                          [PipeTopo(num_pp=1,