Commit c25a91b6 authored by aiss

Merge branch 'ds-v0.9.2-rocm' into 'main'

Ds v0.9.2 rocm

See merge request dcutoolkit/deeplearing/deepspeed!2
parents d1596c94 af82b300
@@ -4,6 +4,6 @@ If you are looking for examples using DeepSpeed please see the following resourc
1. [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples)
2. [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed)
-3. [DeepSpeed + AzureML](https://github.com/Azure/azureml-examples/tree/main/python-sdk/workflows/train/deepspeed)
+3. [DeepSpeed + AzureML](https://github.com/Azure/azureml-examples/tree/main/v1/python-sdk/workflows/train/deepspeed)
4. [DeepSpeed + Hugging Face Transformers Integration](https://huggingface.co/docs/transformers/main_classes/deepspeed)
-5. [DeepSpeed + PyTorch Lightning](https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.utilities.deepspeed.html)
+5. [DeepSpeed + PyTorch Lightning](https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.utilities.deepspeed.html)
-#!/bin/bash
+#!/usr/bin/env bash
set -e

err_report() {
@@ -121,7 +121,7 @@ rm_if_exist() {
    if [ -f $1 ]; then
        rm $VERBOSE $1
    elif [ -d $1 ]; then
-        rm -r $VERBOSE $1
+        rm -rf $VERBOSE $1
    fi
}
...
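A small aside on the `rm_if_exist` hunk above: switching to `rm -rf` keeps the `set -e` script from aborting on stubborn or write-protected entries. A minimal Python sketch of the same helper (name and semantics taken from the hunk, not part of the merge):

```python
import os
import shutil

def rm_if_exist(path: str) -> None:
    """Remove a file or a directory tree; silently skip paths that do not exist."""
    if os.path.isfile(path):
        os.remove(path)
    elif os.path.isdir(path):
        # ignore_errors mirrors the forcing behavior of `rm -rf`.
        shutil.rmtree(path, ignore_errors=True)
```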
""" # Copyright (c) Microsoft Corporation.
Copyright 2020 The Microsoft DeepSpeed Team # SPDX-License-Identifier: Apache-2.0
"""
# DeepSpeed Team
import sys import sys
import os import os
import pkgutil import pkgutil
...@@ -8,6 +10,9 @@ import importlib ...@@ -8,6 +10,9 @@ import importlib
from .builder import get_default_compute_capabilities, OpBuilder from .builder import get_default_compute_capabilities, OpBuilder
# Do not remove, required for abstract accelerator to detect if we have a deepspeed or 3p op_builder
__deepspeed__ = True
# List of all available op builders from deepspeed op_builder # List of all available op builders from deepspeed op_builder
try: try:
import deepspeed.ops.op_builder # noqa: F401 import deepspeed.ops.op_builder # noqa: F401
...@@ -42,9 +47,7 @@ for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(this_module.__fil ...@@ -42,9 +47,7 @@ for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(this_module.__fil
if module_name != 'all_ops' and module_name != 'builder': if module_name != 'all_ops' and module_name != 'builder':
module = importlib.import_module(f".{module_name}", package=op_builder_dir) module = importlib.import_module(f".{module_name}", package=op_builder_dir)
for member_name in module.__dir__(): for member_name in module.__dir__():
if member_name.endswith( if member_name.endswith('Builder') and member_name != "OpBuilder" and member_name != "CUDAOpBuilder":
'Builder'
) and member_name != "OpBuilder" and member_name != "CUDAOpBuilder":
# assign builder name to variable with same name # assign builder name to variable with same name
# the following is equivalent to i.e. TransformerBuilder = "TransformerBuilder" # the following is equivalent to i.e. TransformerBuilder = "TransformerBuilder"
this_module.__dict__[member_name] = builder_closure(member_name) this_module.__dict__[member_name] = builder_closure(member_name)
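`builder_closure` itself lives in an elided part of this file. Judging from the comments above, it binds each exported `*Builder` name to a deferred lookup so that importing the package stays cheap; a hypothetical sketch of that pattern (the real implementation may differ):

```python
def builder_closure(member_name):
    # Defer resolving the concrete builder class until it is actually
    # requested; the abstract accelerator picks the right implementation.
    def _builder():
        from deepspeed.accelerator import get_accelerator  # assumed entry point
        return get_accelerator().create_op_builder(member_name)
    return _builder
```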
""" # Copyright (c) Microsoft Corporation.
Copyright 2020 The Microsoft DeepSpeed Team # SPDX-License-Identifier: Apache-2.0
"""
# DeepSpeed Team
import os import os
import pkgutil import pkgutil
import importlib import importlib
......
""" # Copyright (c) Microsoft Corporation.
Copyright 2020 The Microsoft DeepSpeed Team # SPDX-License-Identifier: Apache-2.0
"""
# DeepSpeed Team
import distutils.spawn import distutils.spawn
import subprocess import subprocess
...@@ -19,14 +21,10 @@ class AsyncIOBuilder(OpBuilder): ...@@ -19,14 +21,10 @@ class AsyncIOBuilder(OpBuilder):
def sources(self): def sources(self):
return [ return [
'csrc/aio/py_lib/deepspeed_py_copy.cpp', 'csrc/aio/py_lib/deepspeed_py_copy.cpp', 'csrc/aio/py_lib/py_ds_aio.cpp',
'csrc/aio/py_lib/py_ds_aio.cpp', 'csrc/aio/py_lib/deepspeed_py_aio.cpp', 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp',
'csrc/aio/py_lib/deepspeed_py_aio.cpp', 'csrc/aio/py_lib/deepspeed_aio_thread.cpp', 'csrc/aio/common/deepspeed_aio_utils.cpp',
'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp', 'csrc/aio/common/deepspeed_aio_common.cpp', 'csrc/aio/common/deepspeed_aio_types.cpp',
'csrc/aio/py_lib/deepspeed_aio_thread.cpp',
'csrc/aio/common/deepspeed_aio_utils.cpp',
'csrc/aio/common/deepspeed_aio_common.cpp',
'csrc/aio/common/deepspeed_aio_types.cpp',
'csrc/aio/py_lib/deepspeed_pin_tensor.cpp' 'csrc/aio/py_lib/deepspeed_pin_tensor.cpp'
] ]
...@@ -52,21 +50,14 @@ class AsyncIOBuilder(OpBuilder): ...@@ -52,21 +50,14 @@ class AsyncIOBuilder(OpBuilder):
] ]
def extra_ldflags(self): def extra_ldflags(self):
#aiss
#return ['-laio'] #return ['-laio']
return ['-laio', '-liomp5'] return ['-laio', '-liomp5']
def check_for_libaio_pkg(self): def check_for_libaio_pkg(self):
libs = dict( libs = dict(
dpkg=["-l", dpkg=["-l", "libaio-dev", "apt"],
"libaio-dev", pacman=["-Q", "libaio", "pacman"],
"apt"], rpm=["-q", "libaio-devel", "yum"],
pacman=["-Q",
"libaio",
"pacman"],
rpm=["-q",
"libaio-devel",
"yum"],
) )
found = False found = False
...@@ -75,15 +66,11 @@ class AsyncIOBuilder(OpBuilder): ...@@ -75,15 +66,11 @@ class AsyncIOBuilder(OpBuilder):
path = distutils.spawn.find_executable(pkgmgr) path = distutils.spawn.find_executable(pkgmgr)
if path is not None: if path is not None:
cmd = f"{pkgmgr} {flag} {lib}" cmd = f"{pkgmgr} {flag} {lib}"
result = subprocess.Popen(cmd, result = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
shell=True)
if result.wait() == 0: if result.wait() == 0:
found = True found = True
else: else:
self.warning( self.warning(f"{self.NAME}: please install the {lib} package with {tool}")
f"{self.NAME}: please install the {lib} package with {tool}")
break break
return found return found
...@@ -95,9 +82,7 @@ class AsyncIOBuilder(OpBuilder): ...@@ -95,9 +82,7 @@ class AsyncIOBuilder(OpBuilder):
# respectively to specify the directories for libaio.h and libaio.so. # respectively to specify the directories for libaio.h and libaio.so.
aio_compatible = self.has_function('io_submit', ('aio', )) aio_compatible = self.has_function('io_submit', ('aio', ))
if verbose and not aio_compatible: if verbose and not aio_compatible:
self.warning( self.warning(f"{self.NAME} requires the dev libaio .so object and headers but these were not found.")
f"{self.NAME} requires the dev libaio .so object and headers but these were not found."
)
# Check for the libaio package via known package managers # Check for the libaio package via known package managers
# to print suggestions on which package to install. # to print suggestions on which package to install.
......
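The comment above refers to pointing the libaio probe at a non-standard install. Since `has_function` (shown later in this diff) reads `CFLAGS` and `LDFLAGS` from the environment, a custom location can be supplied roughly like this (the prefix is a placeholder):

```python
import os

# Hypothetical libaio prefix; adjust to wherever libaio.h and libaio.so live.
os.environ["CFLAGS"] = "-I/opt/libaio/include"
os.environ["LDFLAGS"] = "-L/opt/libaio/lib"

from deepspeed.ops.op_builder import AsyncIOBuilder  # assumed import path
print(AsyncIOBuilder().is_compatible(verbose=True))
```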
""" # Copyright (c) Microsoft Corporation.
Copyright 2020 The Microsoft DeepSpeed Team # SPDX-License-Identifier: Apache-2.0
"""
# DeepSpeed Team
import os import os
import sys import sys
import time import time
...@@ -27,25 +29,18 @@ DEFAULT_COMPUTE_CAPABILITIES = "6.0;6.1;7.0" ...@@ -27,25 +29,18 @@ DEFAULT_COMPUTE_CAPABILITIES = "6.0;6.1;7.0"
try: try:
import torch import torch
except ImportError: except ImportError:
print( print(f"{WARNING} unable to import torch, please install it if you want to pre-compile any deepspeed ops.")
f"{WARNING} unable to import torch, please install it if you want to pre-compile any deepspeed ops."
)
else: else:
TORCH_MAJOR = int(torch.__version__.split('.')[0]) TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1]) TORCH_MINOR = int(torch.__version__.split('.')[1])
def installed_cuda_version(name=""): def installed_cuda_version(name=""):
import torch.cuda
if not torch.cuda.is_available():
return 0, 0
import torch.utils.cpp_extension import torch.utils.cpp_extension
cuda_home = torch.utils.cpp_extension.CUDA_HOME cuda_home = torch.utils.cpp_extension.CUDA_HOME
assert cuda_home is not None, "CUDA_HOME does not exist, unable to compile CUDA op(s)" assert cuda_home is not None, "CUDA_HOME does not exist, unable to compile CUDA op(s)"
# Ensure there is not a cuda version mismatch between torch and nvcc compiler # Ensure there is not a cuda version mismatch between torch and nvcc compiler
output = subprocess.check_output([cuda_home + "/bin/nvcc", output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"], universal_newlines=True)
"-V"],
universal_newlines=True)
output_split = output.split() output_split = output.split()
release_idx = output_split.index("release") release_idx = output_split.index("release")
release = output_split[release_idx + 1].replace(',', '').split(".") release = output_split[release_idx + 1].replace(',', '').split(".")
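To make the string surgery above concrete, here is the same parsing applied to a sample `nvcc -V` banner (sample text only; real output varies by toolkit):

```python
# Abridged `nvcc -V` output; the parser only needs the "release X.Y," token.
output = ("nvcc: NVIDIA (R) Cuda compiler driver\n"
          "Cuda compilation tools, release 11.7, V11.7.99")
output_split = output.split()
release_idx = output_split.index("release")
release = output_split[release_idx + 1].replace(',', '').split(".")
print(release[:2])  # -> ['11', '7']
```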
@@ -57,8 +52,7 @@ def installed_cuda_version(name=""):
def get_default_compute_capabilities():
    compute_caps = DEFAULT_COMPUTE_CAPABILITIES
    import torch.utils.cpp_extension
-    if torch.utils.cpp_extension.CUDA_HOME is not None and installed_cuda_version(
-    )[0] >= 11:
+    if torch.utils.cpp_extension.CUDA_HOME is not None and installed_cuda_version()[0] >= 11:
        if installed_cuda_version()[0] == 11 and installed_cuda_version()[1] == 0:
            # Special treatment of CUDA 11.0 because compute_86 is not supported.
            compute_caps += ";8.0"
@@ -75,37 +69,25 @@ cuda_minor_mismatch_ok = {
        "10.1",
        "10.2",
    ],
-    11: ["11.0",
-         "11.1",
-         "11.2",
-         "11.3",
-         "11.4",
-         "11.5",
-         "11.6",
-         "11.7",
-         "11.8"],
+    11: ["11.0", "11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8"],
}


def assert_no_cuda_mismatch(name=""):
    cuda_major, cuda_minor = installed_cuda_version(name)
-    if cuda_minor == 0 and cuda_major == 0:
-        return False
    sys_cuda_version = f'{cuda_major}.{cuda_minor}'
    torch_cuda_version = ".".join(torch.version.cuda.split('.')[:2])
    # This is a show-stopping error, should probably not proceed past this
    if sys_cuda_version != torch_cuda_version:
-        if (cuda_major in cuda_minor_mismatch_ok
-                and sys_cuda_version in cuda_minor_mismatch_ok[cuda_major]
+        if (cuda_major in cuda_minor_mismatch_ok and sys_cuda_version in cuda_minor_mismatch_ok[cuda_major]
                and torch_cuda_version in cuda_minor_mismatch_ok[cuda_major]):
            print(f"Installed CUDA version {sys_cuda_version} does not match the "
                  f"version torch was compiled with {torch.version.cuda} "
                  "but since the APIs are compatible, accepting this combination")
            return True
-        raise Exception(
-            f">- DeepSpeed Op Builder: Installed CUDA version {sys_cuda_version} does not match the "
-            f"version torch was compiled with {torch.version.cuda}, unable to compile "
-            "cuda/cpp extensions without a matching cuda version.")
+        raise Exception(f">- DeepSpeed Op Builder: Installed CUDA version {sys_cuda_version} does not match the "
+                        f"version torch was compiled with {torch.version.cuda}, unable to compile "
+                        "cuda/cpp extensions without a matching cuda version.")
    return True
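A worked reading of the tolerance table and the check above, restated as a self-contained snippet (only the CUDA 11 row is copied from the diff; other rows are omitted):

```python
cuda_minor_mismatch_ok = {11: ["11.0", "11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8"]}

def mismatch_accepted(sys_cuda_version: str, torch_cuda_version: str) -> bool:
    # Same acceptance rule as assert_no_cuda_mismatch: both versions must
    # share a major release listed in the tolerance table.
    cuda_major = int(sys_cuda_version.split('.')[0])
    return (cuda_major in cuda_minor_mismatch_ok
            and sys_cuda_version in cuda_minor_mismatch_ok[cuda_major]
            and torch_cuda_version in cuda_minor_mismatch_ok[cuda_major])

print(mismatch_accepted("11.8", "11.7"))  # True  -> warn and continue
print(mismatch_accepted("11.8", "10.2"))  # False -> raise
```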
@@ -142,12 +124,11 @@ class OpBuilder(ABC):
        install_torch_version = torch_info['version']
        current_torch_version = ".".join(torch.__version__.split('.')[:2])
        if install_torch_version != current_torch_version:
-            raise RuntimeError(
-                "PyTorch version mismatch! DeepSpeed ops were compiled and installed "
-                "with a different version than what is being used at runtime. "
-                f"Please re-install DeepSpeed or switch torch versions. "
-                f"Install torch version={install_torch_version}, "
-                f"Runtime torch version={current_torch_version}")
+            raise RuntimeError("PyTorch version mismatch! DeepSpeed ops were compiled and installed "
+                               "with a different version than what is being used at runtime. "
+                               f"Please re-install DeepSpeed or switch torch versions. "
+                               f"Install torch version={install_torch_version}, "
+                               f"Runtime torch version={current_torch_version}")

    @staticmethod
    def validate_torch_op_version(torch_info):
@@ -155,22 +136,20 @@ class OpBuilder(ABC):
            current_cuda_version = ".".join(torch.version.cuda.split('.')[:2])
            install_cuda_version = torch_info['cuda_version']
            if install_cuda_version != current_cuda_version:
-                raise RuntimeError(
-                    "CUDA version mismatch! DeepSpeed ops were compiled and installed "
-                    "with a different version than what is being used at runtime. "
-                    f"Please re-install DeepSpeed or switch torch versions. "
-                    f"Install CUDA version={install_cuda_version}, "
-                    f"Runtime CUDA version={current_cuda_version}")
+                raise RuntimeError("CUDA version mismatch! DeepSpeed ops were compiled and installed "
+                                   "with a different version than what is being used at runtime. "
+                                   f"Please re-install DeepSpeed or switch torch versions. "
+                                   f"Install CUDA version={install_cuda_version}, "
+                                   f"Runtime CUDA version={current_cuda_version}")
        else:
            current_hip_version = ".".join(torch.version.hip.split('.')[:2])
            install_hip_version = torch_info['hip_version']
            if install_hip_version != current_hip_version:
-                raise RuntimeError(
-                    "HIP version mismatch! DeepSpeed ops were compiled and installed "
-                    "with a different version than what is being used at runtime. "
-                    f"Please re-install DeepSpeed or switch torch versions. "
-                    f"Install HIP version={install_hip_version}, "
-                    f"Runtime HIP version={current_hip_version}")
+                raise RuntimeError("HIP version mismatch! DeepSpeed ops were compiled and installed "
+                                   "with a different version than what is being used at runtime. "
+                                   f"Please re-install DeepSpeed or switch torch versions. "
+                                   f"Install HIP version={install_hip_version}, "
+                                   f"Runtime HIP version={current_hip_version}")

    @staticmethod
    def is_rocm_pytorch():
@@ -184,8 +163,7 @@ class OpBuilder(ABC):
            pass
        else:
            if TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 5):
-                _is_rocm_pytorch = hasattr(torch.version,
-                                           'hip') and torch.version.hip is not None
+                _is_rocm_pytorch = hasattr(torch.version, 'hip') and torch.version.hip is not None
                if _is_rocm_pytorch:
                    from torch.utils.cpp_extension import ROCM_HOME
                    _is_rocm_pytorch = ROCM_HOME is not None
@@ -240,7 +218,6 @@ class OpBuilder(ABC):
        return True

    def extra_ldflags(self):
-        #aiss
        #return []
        return ['-liomp5']
@@ -248,10 +225,7 @@ class OpBuilder(ABC):
        valid = False
        check_cmd = 'dpkg -l'
        for lib in libraries:
-            result = subprocess.Popen(f'dpkg -l {lib}',
-                                      stdout=subprocess.PIPE,
-                                      stderr=subprocess.PIPE,
-                                      shell=True)
+            result = subprocess.Popen(f'dpkg -l {lib}', stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
            valid = valid or result.wait() == 0
        return valid
@@ -282,9 +256,7 @@ class OpBuilder(ABC):
        tempdir = tempfile.mkdtemp()

        # Define a simple C program that calls the function in question
-        prog = "void %s(void); int main(int argc, char** argv) { %s(); return 0; }" % (
-            funcname,
-            funcname)
+        prog = "void %s(void); int main(int argc, char** argv) { %s(); return 0; }" % (funcname, funcname)

        # Write the test program to a file.
        filename = os.path.join(tempdir, 'test.c')
@@ -305,16 +277,13 @@ class OpBuilder(ABC):
        # Attempt to compile the C program into an object file.
        cflags = shlex.split(os.environ.get('CFLAGS', ""))
-        objs = compiler.compile([filename],
-                                output_dir=output_dir,
-                                extra_preargs=self.strip_empty_entries(cflags))
+        objs = compiler.compile([filename], output_dir=output_dir, extra_preargs=self.strip_empty_entries(cflags))

        # Attempt to link the object file into an executable.
        # Be sure to tack on any libraries that have been specified.
        ldflags = shlex.split(os.environ.get('LDFLAGS', ""))
        compiler.link_executable(objs,
-                                 os.path.join(tempdir,
-                                              'a.out'),
+                                 os.path.join(tempdir, 'a.out'),
                                 extra_preargs=self.strip_empty_entries(ldflags),
                                 libraries=libraries)
@@ -358,9 +327,8 @@ class OpBuilder(ABC):
        try:
            cpu_info = get_cpu_info()
        except Exception as e:
-            self.warning(
-                f"{self.name} attempted to use `py-cpuinfo` but failed (exception type: {type(e)}, {e}), "
-                "falling back to `lscpu` to get this information.")
+            self.warning(f"{self.name} attempted to use `py-cpuinfo` but failed (exception type: {type(e)}, {e}), "
+                         "falling back to `lscpu` to get this information.")
            cpu_info = self._backup_cpuinfo()
            if cpu_info is None:
                return "-march=native"
@@ -372,23 +340,23 @@ class OpBuilder(ABC):
    def is_cuda_enable(self):
        try:
-            #assert_no_cuda_mismatch(self.name)
-            #return '-D__ENABLE_CUDA__'
-            #aiss
            if torch.cuda.is_available():
                return '-D__ENABLE_CUDA__'
-        except:
-            print(
-                f"{WARNING} {self.name} torch.cuda is missing, only cpu ops can be compiled!"
-            )
+        except BaseException:
+            print(f"{WARNING} {self.name} cuda is missing or is incompatible with installed torch, "
+                  "only cpu ops can be compiled!")
            return '-D__DISABLE_CUDA__'
        return '-D__DISABLE_CUDA__'

    def _backup_cpuinfo(self):
        # Construct cpu_info dict from lscpu that is similar to what py-cpuinfo provides
        if not self.command_exists('lscpu'):
-            self.warning(
-                f"{self.name} attempted to query 'lscpu' after failing to use py-cpuinfo "
-                "to detect the CPU architecture. 'lscpu' does not appear to exist on "
-                "your system, will fall back to use -march=native and non-vectorized execution."
-            )
+            self.warning(f"{self.name} attempted to query 'lscpu' after failing to use py-cpuinfo "
+                         "to detect the CPU architecture. 'lscpu' does not appear to exist on "
+                         "your system, will fall back to use -march=native and non-vectorized execution.")
            return None
        result = subprocess.check_output('lscpu', shell=True)
        result = result.decode('utf-8').strip().lower()
@@ -420,9 +388,8 @@ class OpBuilder(ABC):
        try:
            cpu_info = get_cpu_info()
        except Exception as e:
-            self.warning(
-                f"{self.name} attempted to use `py-cpuinfo` but failed (exception type: {type(e)}, {e}), "
-                "falling back to `lscpu` to get this information.")
+            self.warning(f"{self.name} attempted to use `py-cpuinfo` but failed (exception type: {type(e)}, {e}), "
+                         "falling back to `lscpu` to get this information.")
            cpu_info = self._backup_cpuinfo()
            if cpu_info is None:
                return '-D__SCALAR__'
@@ -445,13 +412,9 @@ class OpBuilder(ABC):
            valid = valid or result.wait() == 0

        if not valid and len(cmds) > 1:
-            print(
-                f"{WARNING} {self.name} requires one of the following commands '{cmds}', but it does not exist!"
-            )
+            print(f"{WARNING} {self.name} requires one of the following commands '{cmds}', but it does not exist!")
        elif not valid and len(cmds) == 1:
-            print(
-                f"{WARNING} {self.name} requires the '{cmd}' command, but it does not exist!"
-            )
+            print(f"{WARNING} {self.name} requires the '{cmd}' command, but it does not exist!")
        return valid

    def warning(self, msg):
@@ -466,12 +429,11 @@ class OpBuilder(ABC):
    def builder(self):
        from torch.utils.cpp_extension import CppExtension
-        return CppExtension(
-            name=self.absolute_name(),
-            sources=self.strip_empty_entries(self.sources()),
-            include_dirs=self.strip_empty_entries(self.include_paths()),
-            extra_compile_args={'cxx': self.strip_empty_entries(self.cxx_args())},
-            extra_link_args=self.strip_empty_entries(self.extra_ldflags()))
+        return CppExtension(name=self.absolute_name(),
+                            sources=self.strip_empty_entries(self.sources()),
+                            include_dirs=self.strip_empty_entries(self.include_paths()),
+                            extra_compile_args={'cxx': self.strip_empty_entries(self.cxx_args())},
+                            extra_link_args=self.strip_empty_entries(self.extra_ldflags()))

    def load(self, verbose=True):
        from deepspeed.git_version_info import installed_ops, torch_info
@@ -480,9 +442,8 @@ class OpBuilder(ABC):
            # torch/cuda versions we are currently using at runtime.
            self.validate_torch_version(torch_info)
            if torch.cuda.is_available() and isinstance(self, CUDAOpBuilder):
-                #aiss HIP version mismatch error
-                #self.validate_torch_op_version(torch_info)
-                pass
+                self.validate_torch_op_version(torch_info)
            return importlib.import_module(self.absolute_name())
        else:
            return self.jit_load(verbose)
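For context on the `load()` path above: callers normally instantiate a builder and call `load()`, which returns the pre-compiled extension module when the recorded torch/CUDA versions validate, and otherwise falls back to a JIT build. A hypothetical usage (whether the op is available depends on the install):

```python
from deepspeed.ops.op_builder import FusedAdamBuilder  # assumed import path

builder = FusedAdamBuilder()
if builder.is_compatible():
    fused_adam_module = builder.load()  # pre-built .so if installed, else JIT compile
```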
@@ -495,21 +456,21 @@ class OpBuilder(ABC):
        try:
            import ninja  # noqa: F401
        except ImportError:
-            raise RuntimeError(
-                f"Unable to JIT load the {self.name} op due to ninja not being installed."
-            )
+            raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.")

        if isinstance(self, CUDAOpBuilder) and not self.is_rocm_pytorch():
-            self.build_for_cpu = not assert_no_cuda_mismatch(self.name)
+            try:
+                assert_no_cuda_mismatch(self.name)
+                self.build_for_cpu = False
+            except BaseException:
+                self.build_for_cpu = True

        self.jit_mode = True
        from torch.utils.cpp_extension import load

        start_build = time.time()
        sources = [self.deepspeed_src_path(path) for path in self.sources()]
-        extra_include_paths = [
-            self.deepspeed_src_path(path) for path in self.include_paths()
-        ]
+        extra_include_paths = [self.deepspeed_src_path(path) for path in self.include_paths()]

        # Torch will try and apply whatever CCs are in the arch list at compile time,
        # we have already set the intended targets ourselves we know that will be
@@ -520,14 +481,13 @@ class OpBuilder(ABC):
        torch_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST")
        os.environ["TORCH_CUDA_ARCH_LIST"] = ""

-        op_module = load(
-            name=self.name,
-            sources=self.strip_empty_entries(sources),
-            extra_include_paths=self.strip_empty_entries(extra_include_paths),
-            extra_cflags=self.strip_empty_entries(self.cxx_args()),
-            extra_cuda_cflags=self.strip_empty_entries(self.nvcc_args()),
-            extra_ldflags=self.strip_empty_entries(self.extra_ldflags()),
-            verbose=verbose)
+        op_module = load(name=self.name,
+                         sources=self.strip_empty_entries(sources),
+                         extra_include_paths=self.strip_empty_entries(extra_include_paths),
+                         extra_cflags=self.strip_empty_entries(self.cxx_args()),
+                         extra_cuda_cflags=self.strip_empty_entries(self.nvcc_args()),
+                         extra_ldflags=self.strip_empty_entries(self.extra_ldflags()),
+                         verbose=verbose)

        build_duration = time.time() - start_build
        if verbose:
@@ -541,6 +501,7 @@ class OpBuilder(ABC):

class CUDAOpBuilder(OpBuilder):
+
    def compute_capability_args(self, cross_compile_archs=None):
        """
        Returns nvcc compute capability compile flags.
@@ -587,8 +548,7 @@ class CUDAOpBuilder(OpBuilder):
        ccs = self.filter_ccs(ccs)
        if len(ccs) == 0:
            raise RuntimeError(
-                f"Unable to load {self.name} op due to no compute capabilities remaining after filtering"
-            )
+                f"Unable to load {self.name} op due to no compute capabilities remaining after filtering")

        args = []
        for cc in ccs:
@@ -623,7 +583,16 @@ class CUDAOpBuilder(OpBuilder):
        return super().is_compatible(verbose)

    def builder(self):
-        #self.build_for_cpu = not assert_no_cuda_mismatch(self.name)
+        #try:
+        #    assert_no_cuda_mismatch(self.name)
+        #    self.build_for_cpu = False
+        #except BaseException:
+        #    self.build_for_cpu = True
+
+        #if self.build_for_cpu:
+        #    from torch.utils.cpp_extension import CppExtension as ExtensionBuilder
+        #else:
+        #    from torch.utils.cpp_extension import CUDAExtension as ExtensionBuilder
        #aiss
        if not self.is_rocm_pytorch():
            self.build_for_cpu = not assert_no_cuda_mismatch(self.name)
@@ -632,23 +601,17 @@ class CUDAOpBuilder(OpBuilder):
            else:
                from torch.utils.cpp_extension import CUDAExtension as ExtensionBuilder
        else:
            from torch.utils.cpp_extension import CUDAExtension as ExtensionBuilder
-        #self.build_for_cpu = not assert_no_cuda_mismatch(self.name)
-        #if self.build_for_cpu:
-        #    from torch.utils.cpp_extension import CppExtension as ExtensionBuilder
-        #else:
-        #    from torch.utils.cpp_extension import CUDAExtension as ExtensionBuilder

        compile_args = {'cxx': self.strip_empty_entries(self.cxx_args())} if self.build_for_cpu else \
            {'cxx': self.strip_empty_entries(self.cxx_args()), \
             'nvcc': self.strip_empty_entries(self.nvcc_args())}

-        cuda_ext = ExtensionBuilder(
-            name=self.absolute_name(),
-            sources=self.strip_empty_entries(self.sources()),
-            include_dirs=self.strip_empty_entries(self.include_paths()),
-            libraries=self.strip_empty_entries(self.libraries_args()),
-            extra_compile_args=compile_args)
+        cuda_ext = ExtensionBuilder(name=self.absolute_name(),
+                                    sources=self.strip_empty_entries(self.sources()),
+                                    include_dirs=self.strip_empty_entries(self.include_paths()),
+                                    libraries=self.strip_empty_entries(self.libraries_args()),
+                                    extra_compile_args=compile_args)

        if self.is_rocm_pytorch():
            # hip converts paths to absolute, this converts back to relative
@@ -656,7 +619,10 @@ class CUDAOpBuilder(OpBuilder):
            curr_file = Path(__file__).parent.parent  # ds root
            for i in range(len(sources)):
                src = Path(sources[i])
-                sources[i] = str(src.relative_to(curr_file))
+                if src.is_absolute():
+                    sources[i] = str(src.relative_to(curr_file))
+                else:
+                    sources[i] = str(src)
            cuda_ext.sources = sources
        return cuda_ext
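The new `is_absolute()` guard matters because `Path.relative_to` raises `ValueError` when the path cannot be expressed relative to the given root; hipify usually rewrites sources to absolute paths, but any that remain relative must pass through untouched. A standalone illustration with assumed paths:

```python
from pathlib import Path

root = Path("/workspace/DeepSpeed")  # stand-in for the DeepSpeed source root
sources = ["/workspace/DeepSpeed/csrc/adam/cpu_adam.cpp",  # hipified, absolute
           "csrc/common/custom_cuda_kernel.cu"]            # left relative

for i, raw in enumerate(sources):
    src = Path(raw)
    if src.is_absolute():
        sources[i] = str(src.relative_to(root))
    else:
        sources[i] = str(src)

print(sources)  # ['csrc/adam/cpu_adam.cpp', 'csrc/common/custom_cuda_kernel.cu']
```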
@@ -711,9 +677,7 @@ class CUDAOpBuilder(OpBuilder):
        if self.is_rocm_pytorch():
            ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version()
            args += [
-                '-std=c++14',
-                '-U__HIP_NO_HALF_OPERATORS__',
-                '-U__HIP_NO_HALF_CONVERSIONS__',
+                '-std=c++14', '-U__HIP_NO_HALF_OPERATORS__', '-U__HIP_NO_HALF_CONVERSIONS__',
                '-U__HIP_NO_HALF2_OPERATORS__',
                '-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR,
                '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR,
@@ -722,13 +686,9 @@ class CUDAOpBuilder(OpBuilder):
        else:
            cuda_major, _ = installed_cuda_version()
            args += [
-                '-allow-unsupported-compiler' if sys.platform == "win32" else '',
-                '--use_fast_math',
-                '-std=c++17'
-                if sys.platform == "win32" and cuda_major > 10 else '-std=c++14',
-                '-U__CUDA_NO_HALF_OPERATORS__',
-                '-U__CUDA_NO_HALF_CONVERSIONS__',
-                '-U__CUDA_NO_HALF2_OPERATORS__'
+                '-allow-unsupported-compiler' if sys.platform == "win32" else '', '--use_fast_math',
+                '-std=c++17' if sys.platform == "win32" and cuda_major > 10 else '-std=c++14',
+                '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__'
            ]
        if os.environ.get('DS_DEBUG_CUDA_BUILD', '0') == '1':
            args.append('--ptxas-options=-v')
@@ -742,10 +702,12 @@ class CUDAOpBuilder(OpBuilder):
        if sys.platform == "win32":
            return ['cublas', 'curand']
        else:
+            #return []
            return ['iomp5']


class TorchCPUOpBuilder(CUDAOpBuilder):
+
    def extra_ldflags(self):
        if self.build_for_cpu:
            return ['-fopenmp']
...
""" # Copyright (c) Microsoft Corporation.
Copyright 2020 The Microsoft DeepSpeed Team # SPDX-License-Identifier: Apache-2.0
"""
# DeepSpeed Team
import os import os
from .builder import TorchCPUOpBuilder from .builder import TorchCPUOpBuilder
...@@ -38,13 +40,8 @@ class CPUAdagradBuilder(TorchCPUOpBuilder): ...@@ -38,13 +40,8 @@ class CPUAdagradBuilder(TorchCPUOpBuilder):
CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")] CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")]
else: else:
CUDA_INCLUDE = [ CUDA_INCLUDE = [
os.path.join(torch.utils.cpp_extension.ROCM_HOME, os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include"),
"include"), os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "rocrand"),
os.path.join(torch.utils.cpp_extension.ROCM_HOME, os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "hiprand"),
"include",
"rocrand"),
os.path.join(torch.utils.cpp_extension.ROCM_HOME,
"include",
"hiprand"),
] ]
return ['csrc/includes'] + CUDA_INCLUDE return ['csrc/includes'] + CUDA_INCLUDE
""" # Copyright (c) Microsoft Corporation.
Copyright 2020 The Microsoft DeepSpeed Team # SPDX-License-Identifier: Apache-2.0
"""
# DeepSpeed Team
import os import os
from .builder import TorchCPUOpBuilder from .builder import TorchCPUOpBuilder
...@@ -8,7 +10,7 @@ from .builder import TorchCPUOpBuilder ...@@ -8,7 +10,7 @@ from .builder import TorchCPUOpBuilder
class CPUAdamBuilder(TorchCPUOpBuilder): class CPUAdamBuilder(TorchCPUOpBuilder):
BUILD_VAR = "DS_BUILD_CPU_ADAM" BUILD_VAR = "DS_BUILD_CPU_ADAM"
NAME = "cpu_adam" NAME = "cpu_adam"
def __init__(self): def __init__(self):
super().__init__(name=self.NAME) super().__init__(name=self.NAME)
...@@ -18,7 +20,7 @@ class CPUAdamBuilder(TorchCPUOpBuilder): ...@@ -18,7 +20,7 @@ class CPUAdamBuilder(TorchCPUOpBuilder):
def sources(self): def sources(self):
if self.build_for_cpu: if self.build_for_cpu:
return ['csrc/adam/cpu_adam.cpp'] return ['csrc/adam/cpu_adam.cpp']
return ['csrc/adam/cpu_adam.cpp', 'csrc/common/custom_cuda_kernel.cu'] return ['csrc/adam/cpu_adam.cpp', 'csrc/common/custom_cuda_kernel.cu']
def libraries_args(self): def libraries_args(self):
...@@ -28,6 +30,7 @@ class CPUAdamBuilder(TorchCPUOpBuilder): ...@@ -28,6 +30,7 @@ class CPUAdamBuilder(TorchCPUOpBuilder):
if not self.is_rocm_pytorch(): if not self.is_rocm_pytorch():
args += ['curand'] args += ['curand']
return args return args
def include_paths(self): def include_paths(self):
...@@ -38,13 +41,8 @@ class CPUAdamBuilder(TorchCPUOpBuilder): ...@@ -38,13 +41,8 @@ class CPUAdamBuilder(TorchCPUOpBuilder):
CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")] CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")]
else: else:
CUDA_INCLUDE = [ CUDA_INCLUDE = [
os.path.join(torch.utils.cpp_extension.ROCM_HOME, os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include"),
"include"), os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "rocrand"),
os.path.join(torch.utils.cpp_extension.ROCM_HOME, os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "hiprand"),
"include",
"rocrand"),
os.path.join(torch.utils.cpp_extension.ROCM_HOME,
"include",
"hiprand"),
] ]
return ['csrc/includes'] + CUDA_INCLUDE return ['csrc/includes'] + CUDA_INCLUDE
""" # Copyright (c) Microsoft Corporation.
Copyright 2020 The Microsoft DeepSpeed Team # SPDX-License-Identifier: Apache-2.0
"""
# DeepSpeed Team
from .builder import CUDAOpBuilder from .builder import CUDAOpBuilder
import sys import sys
...@@ -29,9 +31,7 @@ class FusedAdamBuilder(CUDAOpBuilder): ...@@ -29,9 +31,7 @@ class FusedAdamBuilder(CUDAOpBuilder):
def nvcc_args(self): def nvcc_args(self):
nvcc_flags = ['-O3'] + self.version_dependent_macros() nvcc_flags = ['-O3'] + self.version_dependent_macros()
if not self.is_rocm_pytorch(): if not self.is_rocm_pytorch():
nvcc_flags.extend([ nvcc_flags.extend(
'-allow-unsupported-compiler' if sys.platform == "win32" else '', ['-allow-unsupported-compiler' if sys.platform == "win32" else '', '-lineinfo', '--use_fast_math'] +
'-lineinfo', self.compute_capability_args())
'--use_fast_math'
] + self.compute_capability_args())
return nvcc_flags return nvcc_flags
""" # Copyright (c) Microsoft Corporation.
Copyright 2020 The Microsoft DeepSpeed Team # SPDX-License-Identifier: Apache-2.0
"""
# DeepSpeed Team
from .builder import CUDAOpBuilder from .builder import CUDAOpBuilder
import sys import sys
...@@ -30,14 +32,9 @@ class FusedLambBuilder(CUDAOpBuilder): ...@@ -30,14 +32,9 @@ class FusedLambBuilder(CUDAOpBuilder):
nvcc_flags = ['-O3'] + self.version_dependent_macros() nvcc_flags = ['-O3'] + self.version_dependent_macros()
if self.is_rocm_pytorch(): if self.is_rocm_pytorch():
ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version() ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version()
nvcc_flags += [ nvcc_flags += ['-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR]
'-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR,
'-DROCM_VERSION_MINOR=%s' % ROCM_MINOR
]
else: else:
nvcc_flags.extend([ nvcc_flags.extend(
'-allow-unsupported-compiler' if sys.platform == "win32" else '', ['-allow-unsupported-compiler' if sys.platform == "win32" else '', '-lineinfo', '--use_fast_math'] +
'-lineinfo', self.compute_capability_args())
'--use_fast_math'
] + self.compute_capability_args())
return nvcc_flags return nvcc_flags
-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team

from .builder import CUDAOpBuilder
...
""" # Copyright (c) Microsoft Corporation.
Copyright 2022 The Microsoft DeepSpeed Team # SPDX-License-Identifier: Apache-2.0
"""
# DeepSpeed Team
from .builder import CUDAOpBuilder from .builder import CUDAOpBuilder
...@@ -23,18 +25,13 @@ class RandomLTDBuilder(CUDAOpBuilder): ...@@ -23,18 +25,13 @@ class RandomLTDBuilder(CUDAOpBuilder):
def sources(self): def sources(self):
return [ return [
'csrc/random_ltd/pt_binding.cpp', 'csrc/random_ltd/pt_binding.cpp', 'csrc/random_ltd/gather_scatter.cu',
'csrc/random_ltd/gather_scatter.cu', 'csrc/random_ltd/slice_attn_masks.cu', 'csrc/random_ltd/token_sort.cu'
'csrc/random_ltd/slice_attn_masks.cu',
'csrc/random_ltd/token_sort.cu'
] ]
def include_paths(self): def include_paths(self):
includes = ['csrc/includes'] includes = ['csrc/includes']
if self.is_rocm_pytorch(): if self.is_rocm_pytorch():
from torch.utils.cpp_extension import ROCM_HOME from torch.utils.cpp_extension import ROCM_HOME
includes += [ includes += ['{}/hiprand/include'.format(ROCM_HOME), '{}/rocrand/include'.format(ROCM_HOME)]
'{}/hiprand/include'.format(ROCM_HOME),
'{}/rocrand/include'.format(ROCM_HOME)
]
return includes return includes
""" # Copyright (c) Microsoft Corporation.
Copyright 2020 The Microsoft DeepSpeed Team # SPDX-License-Identifier: Apache-2.0
"""
# DeepSpeed Team
from .builder import OpBuilder from .builder import OpBuilder
try: try:
...@@ -32,9 +34,8 @@ class SparseAttnBuilder(OpBuilder): ...@@ -32,9 +34,8 @@ class SparseAttnBuilder(OpBuilder):
#deps_compatible = all(command_status) #deps_compatible = all(command_status)
if self.is_rocm_pytorch(): if self.is_rocm_pytorch():
#aiss debug
#self.warning(f'{self.NAME} is not compatible with ROCM') #self.warning(f'{self.NAME} is not compatible with ROCM')
#return False #aiss debug
return True return True
try: try:
...@@ -49,26 +50,23 @@ class SparseAttnBuilder(OpBuilder): ...@@ -49,26 +50,23 @@ class SparseAttnBuilder(OpBuilder):
self.warning(f"{self.NAME} cuda is not available from torch") self.warning(f"{self.NAME} cuda is not available from torch")
else: else:
major, minor = torch.version.cuda.split('.')[:2] major, minor = torch.version.cuda.split('.')[:2]
cuda_compatible = (int(major) == 10 cuda_compatible = (int(major) == 10 and int(minor) >= 1) or (int(major) >= 11)
and int(minor) >= 1) or (int(major) >= 11)
if not cuda_compatible: if not cuda_compatible:
self.warning(f"{self.NAME} requires CUDA version 10.1+") self.warning(f"{self.NAME} requires CUDA version 10.1+")
TORCH_MAJOR = int(torch.__version__.split('.')[0]) TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1]) TORCH_MINOR = int(torch.__version__.split('.')[1])
torch_compatible = TORCH_MAJOR == 1 and TORCH_MINOR >= 5 torch_compatible = (TORCH_MAJOR == 1 and TORCH_MINOR >= 5)
if not torch_compatible: if not torch_compatible:
self.warning( self.warning(
f'{self.NAME} requires a torch version >= 1.5 but detected {TORCH_MAJOR}.{TORCH_MINOR}' f'{self.NAME} requires a torch version >= 1.5 and < 2.0 but detected {TORCH_MAJOR}.{TORCH_MINOR}')
)
try: try:
import triton import triton
except ImportError: except ImportError:
# auto-install of triton is broken on some systems, reverting to manual install for now # auto-install of triton is broken on some systems, reverting to manual install for now
# see this issue: https://github.com/microsoft/DeepSpeed/issues/1710 # see this issue: https://github.com/microsoft/DeepSpeed/issues/1710
self.warning( self.warning(f"please install triton==1.0.0 if you want to use sparse attention")
f"please install triton==1.0.0 if you want to use sparse attention")
return False return False
if pkg_version: if pkg_version:
...@@ -79,9 +77,7 @@ class SparseAttnBuilder(OpBuilder): ...@@ -79,9 +77,7 @@ class SparseAttnBuilder(OpBuilder):
triton_mismatch = installed_triton != "1.0.0" triton_mismatch = installed_triton != "1.0.0"
if triton_mismatch: if triton_mismatch:
self.warning( self.warning(f"using untested triton version ({installed_triton}), only 1.0.0 is known to be compatible")
f"using untested triton version ({installed_triton}), only 1.0.0 is known to be compatible"
)
return False return False
return super().is_compatible(verbose) and torch_compatible and cuda_compatible return super().is_compatible(verbose) and torch_compatible and cuda_compatible
-'''
-Copyright 2022 The Microsoft DeepSpeed Team
-'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team

from .builder import CUDAOpBuilder, installed_cuda_version
@@ -19,8 +21,7 @@ class SpatialInferenceBuilder(CUDAOpBuilder):
        try:
            import torch
        except ImportError:
-            self.warning(
-                "Please install torch if trying to pre-compile inference kernels")
+            self.warning("Please install torch if trying to pre-compile inference kernels")
            return False

        cuda_okay = True
@@ -30,8 +31,7 @@ class SpatialInferenceBuilder(CUDAOpBuilder):
            cuda_capability = torch.cuda.get_device_properties(0).major
            if cuda_capability >= 8:
                if torch_cuda_major < 11 or sys_cuda_major < 11:
-                    self.warning(
-                        "On Ampere and higher architectures please use CUDA 11+")
+                    self.warning("On Ampere and higher architectures please use CUDA 11+")
                    cuda_okay = False
        return super().is_compatible(verbose) and cuda_okay
...
""" # Copyright (c) Microsoft Corporation.
Copyright 2020 The Microsoft DeepSpeed Team # SPDX-License-Identifier: Apache-2.0
"""
# DeepSpeed Team
from .transformer import TransformerBuilder from .transformer import TransformerBuilder
......
""" # Copyright (c) Microsoft Corporation.
Copyright 2020 The Microsoft DeepSpeed Team # SPDX-License-Identifier: Apache-2.0
"""
# DeepSpeed Team
from .builder import CUDAOpBuilder from .builder import CUDAOpBuilder
...@@ -23,22 +25,15 @@ class TransformerBuilder(CUDAOpBuilder): ...@@ -23,22 +25,15 @@ class TransformerBuilder(CUDAOpBuilder):
def sources(self): def sources(self):
return [ return [
'csrc/transformer/ds_transformer_cuda.cpp', 'csrc/transformer/ds_transformer_cuda.cpp', 'csrc/transformer/cublas_wrappers.cu',
'csrc/transformer/cublas_wrappers.cu', 'csrc/transformer/transform_kernels.cu', 'csrc/transformer/gelu_kernels.cu',
'csrc/transformer/transform_kernels.cu', 'csrc/transformer/dropout_kernels.cu', 'csrc/transformer/normalize_kernels.cu',
'csrc/transformer/gelu_kernels.cu', 'csrc/transformer/softmax_kernels.cu', 'csrc/transformer/general_kernels.cu'
'csrc/transformer/dropout_kernels.cu',
'csrc/transformer/normalize_kernels.cu',
'csrc/transformer/softmax_kernels.cu',
'csrc/transformer/general_kernels.cu'
] ]
def include_paths(self): def include_paths(self):
includes = ['csrc/includes'] includes = ['csrc/includes']
if self.is_rocm_pytorch(): if self.is_rocm_pytorch():
from torch.utils.cpp_extension import ROCM_HOME from torch.utils.cpp_extension import ROCM_HOME
includes += [ includes += ['{}/hiprand/include'.format(ROCM_HOME), '{}/rocrand/include'.format(ROCM_HOME)]
'{}/hiprand/include'.format(ROCM_HOME),
'{}/rocrand/include'.format(ROCM_HOME)
]
return includes return includes
-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team

from .builder import CUDAOpBuilder, installed_cuda_version
@@ -18,8 +21,7 @@ class InferenceBuilder(CUDAOpBuilder):
        try:
            import torch
        except ImportError:
-            self.warning(
-                "Please install torch if trying to pre-compile inference kernels")
+            self.warning("Please install torch if trying to pre-compile inference kernels")
            return False

        cuda_okay = True
@@ -28,14 +30,11 @@ class InferenceBuilder(CUDAOpBuilder):
            torch_cuda_major = int(torch.version.cuda.split('.')[0])
            cuda_capability = torch.cuda.get_device_properties(0).major
            if cuda_capability < 6:
-                self.warning(
-                    "NVIDIA Inference is only supported on Pascal and newer architectures"
-                )
+                self.warning("NVIDIA Inference is only supported on Pascal and newer architectures")
                cuda_okay = False
            if cuda_capability >= 8:
                if torch_cuda_major < 11 or sys_cuda_major < 11:
-                    self.warning(
-                        "On Ampere and higher architectures please use CUDA 11+")
+                    self.warning("On Ampere and higher architectures please use CUDA 11+")
                    cuda_okay = False
        return super().is_compatible(verbose) and cuda_okay
...
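Putting the two guards in the hunk above together, a condensed restatement of the decision (illustrative values, derived directly from the shown code):

```python
def cuda_okay_for(cuda_capability: int, torch_cuda_major: int, sys_cuda_major: int) -> bool:
    # Condensed restatement of the InferenceBuilder checks above, for illustration.
    if cuda_capability < 6:
        return False  # pre-Pascal: inference kernels unsupported
    if cuda_capability >= 8 and (torch_cuda_major < 11 or sys_cuda_major < 11):
        return False  # Ampere and newer need CUDA 11+
    return True

print(cuda_okay_for(8, 10, 11))  # False: Ampere but torch built with CUDA 10
print(cuda_okay_for(7, 10, 10))  # True: Volta/Turing on CUDA 10 is fine
```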
""" # Copyright (c) Microsoft Corporation.
Copyright 2020 The Microsoft DeepSpeed Team # SPDX-License-Identifier: Apache-2.0
"""
# DeepSpeed Team
from .builder import OpBuilder from .builder import OpBuilder
......
'''Copyright The Microsoft DeepSpeed Team''' # Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
from packaging import version as pkg_version from packaging import version as pkg_version
......