Commit eadbbe09 authored by 401qingkong

Push ROCm DeepSpeed v0.3.13

parent ab5534fc
@@ -152,7 +152,9 @@ if [ ! -f $hostfile ]; then
fi
echo "Building deepspeed wheel"
python setup.py $VERBOSE bdist_wheel
###aiss add
CXX=hipcc CC=hipcc DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_FUSED_LAMB=1 DS_BUILD_SPARSE_ATTN=1 DS_BUILD_TRANSFORMER=1 DS_BUILD_STOCHASTIC_TRANSFORMER=1 DS_BUILD_UTILS=1 python3 setup.py $VERBOSE bdist_wheel
#CXX=hipcc CC=hipcc DS_BUILD_UTILS=1 python3 setup.py $VERBOSE bdist_wheel
if [ "$local_only" == "1" ]; then
echo "Installing deepspeed"
......
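The ROCm build line above pre-compiles the op kernels at install time through DS_BUILD_* environment flags instead of relying on JIT builds. A rough sketch of how setup.py-style code consumes such flags (the naming is inferred from the command above and the op_enabled() call printed later in this diff; an illustration, not the exact upstream implementation):

import os

# Illustrative sketch: each DS_BUILD_<OP> flag force-enables one op at install
# time; DS_BUILD_OPS is the global default (0 means fall back to JIT builds).
def op_enabled(op_name):
    env_var = f"DS_BUILD_{op_name.upper()}"          # e.g. DS_BUILD_CPU_ADAM
    default = int(os.environ.get('DS_BUILD_OPS', 0))
    return int(os.environ.get(env_var, default))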
@@ -194,10 +194,10 @@ class OpBuilder(ABC):
else:
return os.path.join(Path(__file__).parent.parent.absolute(), code_path)
def builder(self):
def builder(self, is_rocm_pytorch):
from torch.utils.cpp_extension import CppExtension
return CppExtension(name=self.absolute_name(),
sources=self.sources(),
sources=self.sources(is_rocm_pytorch),
include_dirs=self.include_paths(),
extra_compile_args={'cxx': self.cxx_args()},
extra_link_args=self.extra_ldflags())
@@ -328,11 +328,11 @@ class CUDAOpBuilder(OpBuilder):
def is_compatible(self):
return super().is_compatible()
def builder(self):
def builder(self,is_rocm_pytorch):
from torch.utils.cpp_extension import CUDAExtension
assert_no_cuda_mismatch()
#assert_no_cuda_mismatch()
return CUDAExtension(name=self.absolute_name(),
sources=self.sources(),
sources=self.sources(is_rocm_pytorch),
include_dirs=self.include_paths(),
extra_compile_args={
'cxx': self.cxx_args(),
......
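The change above disables assert_no_cuda_mismatch(), which compares nvcc's CUDA version against torch's and has no meaning under hipcc. A hedged alternative to commenting it out entirely would be to gate it on the backend (the 'nvcc' key is assumed from the truncated hunk, following the usual CUDAExtension convention):

def builder(self, is_rocm_pytorch):
    from torch.utils.cpp_extension import CUDAExtension
    if not is_rocm_pytorch:
        # Only meaningful on CUDA: nvcc and torch must agree on the CUDA version.
        assert_no_cuda_mismatch()
    return CUDAExtension(name=self.absolute_name(),
                         sources=self.sources(is_rocm_pytorch),
                         include_dirs=self.include_paths(),
                         extra_compile_args={'cxx': self.cxx_args(),
                                             'nvcc': self.nvcc_args()})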
@@ -17,12 +17,16 @@ class CPUAdamBuilder(CUDAOpBuilder):
def absolute_name(self):
return f'deepspeed.ops.adam.{self.NAME}_op'
def sources(self):
return ['csrc/adam/cpu_adam.cpp', 'csrc/adam/custom_cuda_kernel.cu']
def sources(self, is_rocm_pytorch):
if is_rocm_pytorch:
return ['csrc/adam/hip/cpu_adam.cpp', 'csrc/adam/hip/custom_hip_kernel.hip']
else:
return ['csrc/adam/cpu_adam.cpp', 'csrc/adam/custom_cuda_kernel.cu']
def include_paths(self):
CUDA_INCLUDE = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")
return ['csrc/includes', CUDA_INCLUDE]
#CUDA_INCLUDE = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")
#return ['csrc/includes', CUDA_INCLUDE]
return ['csrc/includes/', '/opt/rocm-3.9.1/include/']
def simd_width(self):
if not self.command_exists('lscpu'):
@@ -42,30 +46,48 @@ class CPUAdamBuilder(CUDAOpBuilder):
return '-D__SCALAR__'
def cxx_args(self):
CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64")
#CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64")
SIMD_WIDTH = self.simd_width()
#return [
# '-O3',
# '-std=c++14',
# f'-L{CUDA_LIB64}',
# '-lcudart',
# '-lcublas',
# '-g',
# '-Wno-reorder',
# '-march=native',
# '-fopenmp',
# SIMD_WIDTH
#]
return [
'-O3',
'-std=c++14',
f'-L{CUDA_LIB64}',
'-lcudart',
'-lcublas',
'-lrocblas',
'-g',
'-Wno-reorder',
'-march=native',
'-fopenmp',
'-lpthread',
SIMD_WIDTH
]
def nvcc_args(self):
#args = [
# '-O3',
# '--use_fast_math',
# '-std=c++14',
# '-U__CUDA_NO_HALF_OPERATORS__',
# '-U__CUDA_NO_HALF_CONVERSIONS__',
# '-U__CUDA_NO_HALF2_OPERATORS__'
#]
args = [
'-O3',
'--use_fast_math',
#'--use_fast_math',
'-fopenmp',
'-lpthread',
'-std=c++14',
'-U__CUDA_NO_HALF_OPERATORS__',
'-U__CUDA_NO_HALF_CONVERSIONS__',
'-U__CUDA_NO_HALF2_OPERATORS__'
]
args += self.compute_capability_args()
#args += self.compute_capability_args()
return args
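Note that include_paths() above hardcodes '/opt/rocm-3.9.1/include/', which breaks on any other ROCm install. A minimal sketch of deriving the path from the ROCm toolchain PyTorch was actually built with, assuming torch >= 1.5 where torch.utils.cpp_extension exposes ROCM_HOME (the setup.py hunk below relies on the same symbol):

import os
from torch.utils.cpp_extension import ROCM_HOME  # typically None outside ROCm installs

def include_paths():
    # Fall back to the conventional /opt/rocm symlink if torch reports nothing.
    rocm_home = ROCM_HOME or os.environ.get('ROCM_PATH', '/opt/rocm')
    return ['csrc/includes/', os.path.join(rocm_home, 'include')]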
@@ -15,8 +15,11 @@ class FusedAdamBuilder(CUDAOpBuilder):
def absolute_name(self):
return f'deepspeed.ops.adam.{self.NAME}_op'
def sources(self):
return ['csrc/adam/fused_adam_frontend.cpp', 'csrc/adam/multi_tensor_adam.cu']
def sources(self, is_rocm_pytorch):
if is_rocm_pytorch:
return ['csrc/adam/hip/fused_adam_frontend.cpp', 'csrc/adam/hip/multi_tensor_adam.hip']
else:
return ['csrc/adam/fused_adam_frontend.cpp', 'csrc/adam/multi_tensor_adam.cu']
def include_paths(self):
return ['csrc/includes']
@@ -27,5 +30,5 @@ class FusedAdamBuilder(CUDAOpBuilder):
def nvcc_args(self):
return ['-lineinfo',
'-O3',
'--use_fast_math'
] + self.version_dependent_macros() + self.compute_capability_args()
#'--use_fast_math'
] #+ self.version_dependent_macros() + self.compute_capability_args()
@@ -15,9 +15,11 @@ class FusedLambBuilder(CUDAOpBuilder):
def absolute_name(self):
return f'deepspeed.ops.lamb.{self.NAME}_op'
def sources(self):
return ['csrc/lamb/fused_lamb_cuda.cpp', 'csrc/lamb/fused_lamb_cuda_kernel.cu']
def sources(self, is_rocm_pytorch):
if is_rocm_pytorch:
return ['csrc/lamb/hip/fused_lamb_hip.cpp', 'csrc/lamb/hip/fused_lamb_hip_kernel.hip']
else:
return ['csrc/lamb/fused_lamb_cuda.cpp', 'csrc/lamb/fused_lamb_cuda_kernel.cu']
def include_paths(self):
return ['csrc/includes']
@@ -27,5 +29,5 @@ class FusedLambBuilder(CUDAOpBuilder):
def nvcc_args(self):
return ['-lineinfo',
'-O3',
'--use_fast_math'
] + self.version_dependent_macros() + self.compute_capability_args()
#'--use_fast_math'
]# + self.version_dependent_macros() + self.compute_capability_args()
@@ -16,29 +16,33 @@ class SparseAttnBuilder(OpBuilder):
def absolute_name(self):
return f'deepspeed.ops.sparse_attention.{self.NAME}_op'
def sources(self):
return ['csrc/sparse_attention/utils.cpp']
def sources(self, is_rocm_pytorch):
if is_rocm_pytorch:
return ['csrc/sparse_attention/hip/utils.cpp']
else:
return ['csrc/sparse_attention/utils.cpp']
def cxx_args(self):
return ['-O2', '-fopenmp']
def is_compatible(self):
# Check to see if llvm and cmake are installed since they are dependencies
required_commands = ['llvm-config|llvm-config-9', 'cmake']
#required_commands = ['llvm-config|llvm-config-9', 'cmake']
required_commands = ['cmake']
command_status = list(map(self.command_exists, required_commands))
deps_compatible = all(command_status)
# torch-cpu will not have a cuda/hip version
if torch.version.cuda is None:
if torch.version.hip is None:
cuda_compatible = False
self.warning(f"{self.NAME} cuda is not available from torch")
else:
major, minor = torch.version.cuda.split('.')[:2]
cuda_compatible = int(major) == 10 and int(minor) >= 1
if not cuda_compatible:
self.warning(
f"{self.NAME} requires CUDA version 10.1+, does not currently support >=11 or <10.1"
)
#else:
# major, minor = torch.version.cuda.split('.')[:2]
# cuda_compatible = int(major) == 10 and int(minor) >= 1
# if not cuda_compatible:
# self.warning(
# f"{self.NAME} requires CUDA version 10.1+, does not currently support >=11 or <10.1"
# )
TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1])
@@ -49,4 +53,4 @@ class SparseAttnBuilder(OpBuilder):
)
return super().is_compatible(
) and deps_compatible and torch_compatible and cuda_compatible
) and deps_compatible and torch_compatible #and cuda_compatible
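The compatibility check now keys off torch.version.hip, and cuda_compatible is dropped from the return along with the CUDA 10.1 version pin. A backend-agnostic sketch of the same check might look like:

import torch

def gpu_backend_ok():
    if torch.version.hip is not None:            # ROCm build of PyTorch
        return True
    if torch.version.cuda is not None:           # CUDA build: keep upstream's pin
        major, minor = (int(v) for v in torch.version.cuda.split('.')[:2])
        return major == 10 and minor >= 1
    return False                                 # CPU-only torch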
@@ -16,17 +16,30 @@ class TransformerBuilder(CUDAOpBuilder):
def absolute_name(self):
return f'deepspeed.ops.transformer.{self.NAME}_op'
def sources(self):
return [
'csrc/transformer/ds_transformer_cuda.cpp',
'csrc/transformer/cublas_wrappers.cu',
'csrc/transformer/transform_kernels.cu',
'csrc/transformer/gelu_kernels.cu',
'csrc/transformer/dropout_kernels.cu',
'csrc/transformer/normalize_kernels.cu',
'csrc/transformer/softmax_kernels.cu',
'csrc/transformer/general_kernels.cu'
]
def sources(self, is_rocm_pytorch):
if is_rocm_pytorch:
return [
'csrc/transformer/hip/ds_transformer_hip.cpp',
'csrc/transformer/hip/cublas_wrappers.hip',
'csrc/transformer/hip/transform_kernels.hip',
'csrc/transformer/hip/gelu_kernels.hip',
'csrc/transformer/hip/dropout_kernels.hip',
### not supported on ROCm yet
#'csrc/transformer/hip/normalize_kernels.hip',
#'csrc/transformer/hip/softmax_kernels.hip',
'csrc/transformer/hip/general_kernels.hip'
]
else:
return [
'csrc/transformer/ds_transformer_cuda.cpp',
'csrc/transformer/cublas_wrappers.cu',
'csrc/transformer/transform_kernels.cu',
'csrc/transformer/gelu_kernels.cu',
'csrc/transformer/dropout_kernels.cu',
'csrc/transformer/normalize_kernels.cu',
'csrc/transformer/softmax_kernels.cu',
'csrc/transformer/general_kernels.cu'
]
def include_paths(self):
return ['csrc/includes']
@@ -34,14 +47,14 @@ class TransformerBuilder(CUDAOpBuilder):
def nvcc_args(self):
args = [
'-O3',
'--use_fast_math',
#'--use_fast_math',
'-std=c++14',
'-U__CUDA_NO_HALF_OPERATORS__',
'-U__CUDA_NO_HALF_CONVERSIONS__',
'-U__CUDA_NO_HALF2_OPERATORS__'
#'-U__CUDA_NO_HALF_OPERATORS__',
#'-U__CUDA_NO_HALF_CONVERSIONS__',
#'-U__CUDA_NO_HALF2_OPERATORS__'
]
return args + self.compute_capability_args()
return args #+ self.compute_capability_args()
def cxx_args(self):
return ['-O3', '-std=c++14', '-g', '-Wno-reorder']
return ['-O3', '-std=c++14', '-g', '-Wno-reorder', '-Wno-c++11-narrowing']
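-Wno-c++11-narrowing is a clang flag: hipcc is clang-based and rejects narrowing conversions that nvcc's host compiler tolerated. A hedged per-backend variant of cxx_args(), rather than adding the flag unconditionally:

def cxx_args(self, is_rocm_pytorch=False):
    args = ['-O3', '-std=c++14', '-g', '-Wno-reorder']
    if is_rocm_pytorch:
        # hipcc is clang; silence narrowing errors in kernels ported from CUDA.
        args.append('-Wno-c++11-narrowing')
    return args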
@@ -14,5 +14,8 @@ class UtilsBuilder(OpBuilder):
def absolute_name(self):
return f'deepspeed.ops.{self.NAME}_op'
def sources(self):
return ['csrc/utils/flatten_unflatten.cpp']
def sources(self, is_rocm_pytorch):
if is_rocm_pytorch:
return ['csrc/utils/hip/flatten_unflatten.cpp']
else:
return ['csrc/utils/flatten_unflatten.cpp']
torch>=1.2
torchvision>=0.4.0
#torch>=1.2
#torchvision>=0.4.0
tqdm
tensorboardX==1.8
ninja
......
@@ -17,10 +17,16 @@ import time
try:
import torch
from torch.utils.cpp_extension import BuildExtension
from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CppExtension
from torch.utils.hipify import hipify_python
except ImportError:
raise ImportError('Unable to import torch, please visit https://pytorch.org/ '
'to see how to properly install torch on your system.')
###aiss add
is_rocm_pytorch = False
if torch.__version__ >= '1.5':
from torch.utils.cpp_extension import ROCM_HOME
is_rocm_pytorch = True if ((torch.version.hip is not None) and (ROCM_HOME is not None)) else False
from op_builder import ALL_OPS, get_default_compute_capatabilities
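For reference, the detection above can be written defensively so it also runs on older torch builds that lack ROCM_HOME (a minimal standalone sketch, assuming torch >= 1.5 exposes ROCM_HOME as used here):

import torch
try:
    from torch.utils.cpp_extension import ROCM_HOME
except ImportError:          # pre-1.5 torch has no ROCM_HOME
    ROCM_HOME = None
is_rocm_pytorch = torch.version.hip is not None and ROCM_HOME is not None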
@@ -36,12 +42,19 @@ extras_require = {
'readthedocs': fetch_requirements('requirements/requirements-readthedocs.txt'),
'dev': fetch_requirements('requirements/requirements-dev.txt'),
}
###aiss add ################
if is_rocm_pytorch:
print("NOTE: Please manually install torch and torchvision packages for ROCm")
#install_requires = fetch_requirements('requirements/requirements-rocm.txt')
# If MPI is available add 1bit-adam requirements
if torch.cuda.is_available():
if shutil.which('ompi_info') or shutil.which('mpiname'):
cupy = f"cupy-cuda{torch.version.cuda.replace('.','')[:3]}"
extras_require['1bit_adam'].append(cupy)
##aiss add: cupy 9.0 has been set up manually
#if torch.cuda.is_available():
# if shutil.which('ompi_info') or shutil.which('mpiname'):
# cupy = f"cupy-cuda{torch.version.cuda.replace('.','')[:3]}"
# print("cupy version: ", cupy)
# extras_require['1bit_adam'].append(cupy)
# Make an [all] extra that installs all needed dependencies
all_extras = set()
@@ -66,10 +79,18 @@ if not torch.cuda.is_available():
"(compute capabilities 6.0, 6.1, 6.2)")
if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None:
os.environ["TORCH_CUDA_ARCH_LIST"] = get_default_compute_capatabilities()
###########aiss add ###################### only needs to run once to convert the CUDA sources to HIP
#if is_rocm_pytorch:
# import shutil
# this_dir = os.path.dirname(os.path.abspath(__file__))
# hipify_python.hipify(project_directory=this_dir, output_directory=this_dir, includes="csrc/*",
# show_detailed=True, is_pytorch_extension=True)
# print("cuda file has been transformed to hip format!!!")
ext_modules = []
# Default to pre-install kernels to false so we rely on JIT
BUILD_OP_DEFAULT = int(os.environ.get('DS_BUILD_OPS', 0))
print(f"DS_BUILD_OPS={BUILD_OP_DEFAULT}")
@@ -94,11 +115,12 @@ for op_name, builder in ALL_OPS.items():
if op_compatible:
reqs = builder.python_requirements()
install_requires += builder.python_requirements()
######aiss debug###############
print("op_enabled(op_name): , op_compatible: ", op_enabled(op_name), op_compatible)
# If op install enabled, add builder to extensions
if op_enabled(op_name) and op_compatible:
install_ops[op_name] = op_enabled(op_name)
ext_modules.append(builder.builder())
ext_modules.append(builder.builder(is_rocm_pytorch))
compatible_ops = {op_name: op.is_compatible() for (op_name, op) in ALL_OPS.items()}
......
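Downstream of this loop, the collected ext_modules are handed to setuptools in the standard torch C++-extension way (not shown in this hunk; a sketch following the torch.utils.cpp_extension convention):

from setuptools import setup
from torch.utils.cpp_extension import BuildExtension

setup(name='deepspeed',
      ext_modules=ext_modules,                   # filled by the loop above
      cmdclass={'build_ext': BuildExtension})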
from mpi4py import MPI
import time
import torch
import torch.distributed as dist
import numpy as np
import deepspeed
from deepspeed.runtime.fp16.onebit_adam import OnebitAdam
comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()
#TODO: Detect the hostname we are running on automatically
torch.distributed.init_process_group(backend='nccl',
init_method='tcp://worker-1:2245',
world_size=size,
rank=rank)
dummy_model = [torch.nn.Parameter(torch.ones(10))]
# Set cuda_aware to True to use CUDA buffers for communication
dummy_optim = OnebitAdam(dummy_model, cuda_aware=True)
device = torch.device('cuda', rank % torch.cuda.device_count())
def torch_sim(a):
a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)
scale = a.norm() / np.sqrt(a.numel())
a_compressed = scale * a_sign
a_sign = None
worker_error = a - a_compressed
dist.all_reduce(a_compressed)
a_compressed.mul_(1 / dist.get_world_size())
a_server_sign = a_compressed.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)
a_list = torch.chunk(a_compressed, chunks=dist.get_world_size())
server_scale = [chunk_a.norm() / np.sqrt(chunk_a.numel()) for chunk_a in a_list]
a_sign_list = torch.chunk(a_server_sign, dist.get_world_size())
a_server_compressed = torch.cat(
[server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())])
rank = dist.get_rank()
server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank]
torch.cuda.synchronize()
torch.distributed.barrier()
return a_server_compressed, worker_error, server_error
tensor_size = 100 * 2**20
server_size = int(tensor_size / size)
if tensor_size % (8 * size) != 0:
right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size)))
else:
right_tensor_size = tensor_size
right_server_size = right_tensor_size // size
# Adding bias to the initialization of the gradient we are communicating
# In order to get rid of the case where some elements in the gradient are too small
a = (torch.rand(tensor_size, device=device) - 0.5) + 0.01 * rank
worker_error = torch.zeros(right_tensor_size, device=device)
server_error = torch.zeros(right_server_size, device=device)
a_torch, worker_error_torch, server_error_torch = torch_sim(a)
torch.cuda.empty_cache()
local_rank = rank % torch.cuda.device_count()
a_after = dummy_optim.Compressed_Allreduce(a,
worker_error,
server_error,
rank,
size,
comm,
local_rank)
threshold = 1e-6
magnitude_threshold = 1e-6
diff_mask = (a_after - a_torch) > threshold
diff_server_mask = torch.chunk(diff_mask, size)[rank]
mpi_server = torch.chunk(a_after, size)[rank] + server_error
torch_server = torch.chunk(a_torch, size)[rank] + server_error_torch
# If the number in the compensated_server_m is too small (e.g. 1e-8), then calling sign() might be problematic
# The test would skip those numbers that are too small in compensated_server_m
if torch.sum(diff_server_mask) == 0:
print('Successfully passed the test for 1bit Adam at Rank {}'.format(rank))
else:
check_mag_mask = mpi_server[diff_mask] > magnitude_threshold
if torch.sum(check_mag_mask) == 0:
print('Successfully passed the test for 1bit Adam at Rank {}'.format(rank))
else:
print('Fails at {} of positions'.format(torch.sum(check_mag_mask)))
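The torch_sim() helper above simulates what Compressed_Allreduce does numerically: each tensor is collapsed to sign bits plus a single RMS scale, and the quantization error is carried forward as feedback. A single-tensor toy, no MPI (note the real code maps exact zeros to +1 via the add_/bool trick instead of plain sign()):

import torch

a = torch.tensor([0.4, -1.2, 0.1, -0.3])
scale = a.norm() / a.numel() ** 0.5     # one fp32 scale: the tensor's RMS
compressed = scale * a.sign()           # 1-bit payload: just the signs
worker_error = a - compressed           # error feedback, added back next step
print(compressed, worker_error)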
from mpi4py import MPI
import time
import torch
import torch.distributed as dist
import numpy as np
import deepspeed
from deepspeed.runtime.fp16.onebit_adam import OnebitAdam
comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()
#TODO: Detect the hostname we are running on automatically
torch.distributed.init_process_group(backend='nccl',
init_method='tcp://worker-1:2245',
world_size=size,
rank=rank)
dummy_model = [torch.nn.Parameter(torch.ones(10))]
# Set cuda_aware to False to use host buffers for communication
dummy_optim = OnebitAdam(dummy_model, cuda_aware=False)
device = torch.device('cuda', rank % torch.cuda.device_count())
def torch_sim(a):
a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)
scale = a.norm() / np.sqrt(a.numel())
a_compressed = scale * a_sign
a_sign = None
worker_error = a - a_compressed
dist.all_reduce(a_compressed)
a_compressed.mul_(1 / dist.get_world_size())
a_server_sign = a_compressed.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)
a_list = torch.chunk(a_compressed, chunks=dist.get_world_size())
server_scale = [chunk_a.norm() / np.sqrt(chunk_a.numel()) for chunk_a in a_list]
a_sign_list = torch.chunk(a_server_sign, dist.get_world_size())
a_server_compressed = torch.cat(
[server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())])
rank = dist.get_rank()
server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank]
torch.cuda.synchronize()
torch.distributed.barrier()
return a_server_compressed, worker_error, server_error
tensor_size = 100 * 2**20
server_size = int(tensor_size / size)
if tensor_size % (8 * size) != 0:
right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size)))
else:
right_tensor_size = tensor_size
right_server_size = right_tensor_size // size
# Adding bias to the initialization of the gradient we are communicating
# In order to get rid of the case where some elements in the gradient are too small
a = (torch.rand(tensor_size, device=device) - 0.5) + 0.01 * rank
worker_error = torch.zeros(right_tensor_size, device=device)
server_error = torch.zeros(right_server_size, device=device)
a_torch, worker_error_torch, server_error_torch = torch_sim(a)
torch.cuda.empty_cache()
local_rank = rank % torch.cuda.device_count()
a_after = dummy_optim.Compressed_Allreduce(a,
worker_error,
server_error,
rank,
size,
comm,
local_rank)
threshold = 1e-6
magnitude_threshold = 1e-6
diff_mask = (a_after - a_torch) > threshold
diff_server_mask = torch.chunk(diff_mask, size)[rank]
mpi_server = torch.chunk(a_after, size)[rank] + server_error
torch_server = torch.chunk(a_torch, size)[rank] + server_error_torch
# If the number in the compensated_server_m is too small (e.g. 1e-8), then calling sign() might be problematic
# The test would skip those numbers that are too small in compensated_server_m
if torch.sum(diff_server_mask) == 0:
print('Successfully passed the test for 1bit Adam at Rank {}'.format(rank))
else:
check_mag_mask = mpi_server[diff_mask] > magnitude_threshold
if torch.sum(check_mag_mask) == 0:
print('Successfully passed the test for 1bit Adam at Rank {}'.format(rank))
else:
print('Fails at {} of positions'.format(torch.sum(check_mag_mask)))
from mpi4py import MPI
import time
import torch
import torch.distributed as dist
import numpy as np
import deepspeed
from deepspeed.runtime.fp16.onebit_adam import OnebitAdam
comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()
torch.distributed.init_process_group(backend='nccl',
init_method='tcp://worker-0:2245',
world_size=size,
rank=rank)
dummy_model = [torch.nn.Parameter(torch.ones(10))]
dummy_optim = OnebitAdam(dummy_model, cuda_aware=False)
device = torch.device('cuda', rank % torch.cuda.device_count())
def torch_sim(a):
a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)
scale = a.norm() / np.sqrt(a.numel())
a_compressed = scale * a_sign
a_sign = None
worker_error = a - a_compressed
dist.all_reduce(a_compressed)
a_compressed.mul_(1 / dist.get_world_size())
a_server_sign = a_compressed.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)
a_list = torch.chunk(a_compressed, chunks=dist.get_world_size())
server_scale = [chunk_a.norm() / np.sqrt(chunk_a.numel()) for chunk_a in a_list]
a_sign_list = torch.chunk(a_server_sign, dist.get_world_size())
a_server_compressed = torch.cat(
[server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())])
rank = dist.get_rank()
server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank]
torch.cuda.synchronize()
torch.distributed.barrier()
return a_server_compressed, worker_error, server_error
# Input Tensor size
tensor_size = 100 * 2**20
server_size = int(tensor_size / size)
if tensor_size % (8 * size) != 0:
right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size)))
else:
right_tensor_size = tensor_size
right_server_size = right_tensor_size // size
# The -0.5 is required for avoiding sign flips/errors
a = torch.rand(tensor_size, device=device) - 0.5
worker_error = torch.zeros(right_tensor_size, device=device)
server_error = torch.zeros(right_server_size, device=device)
a_torch, worker_error_torch, server_error_torch = torch_sim(a)
torch.cuda.empty_cache()
local_rank = rank % torch.cuda.device_count()
# Test the 1-bit Adam optimizer
a_after = dummy_optim.Compressed_Allreduce(a,
worker_error,
server_error,
rank,
size,
comm,
local_rank)
# If the error is below the threshold, it is acceptable for training
threshold = 1e-6
diff_pos = ((a_after - a_torch) > threshold)
if rank == 0:
before_diff = torch.chunk(a_after - a_torch,
size)[rank] + server_error - server_error_torch
if torch.norm(before_diff) / torch.norm(torch.chunk(a_after,
size)[rank]) < threshold:
print('Successfully passed the test')
else:
print('The difference for the tensor before allgather is {}'.format(
torch.norm(before_diff)))
@@ -169,7 +169,6 @@ def train_cifar(model, args, num_steps=400, average_dp_losses=True, fp16=True, s
return losses
@pytest.mark.skip(reason="been seeing nondeterministic failures, skipping for now")
@pytest.mark.parametrize('topo',
[
PipeTopo(num_pp=1,
......