Commit c25a91b6 authored by aiss

Merge branch 'ds-v0.9.2-rocm' into 'main'

Ds v0.9.2 rocm

See merge request dcutoolkit/deeplearing/deepspeed!2
parents d1596c94 af82b300
@@ -4,6 +4,6 @@ If you are looking for examples using DeepSpeed please see the following resourc
1. [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples)
2. [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed)
-3. [DeepSpeed + AzureML](https://github.com/Azure/azureml-examples/tree/main/python-sdk/workflows/train/deepspeed)
+3. [DeepSpeed + AzureML](https://github.com/Azure/azureml-examples/tree/main/v1/python-sdk/workflows/train/deepspeed)
4. [DeepSpeed + Hugging Face Transformers Integration](https://huggingface.co/docs/transformers/main_classes/deepspeed)
-5. [DeepSpeed + PyTorch Lightning](https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.utilities.deepspeed.html)
+5. [DeepSpeed + PyTorch Lightning](https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.utilities.deepspeed.html)
-#!/bin/bash
+#!/usr/bin/env bash
set -e

err_report() {
@@ -121,7 +121,7 @@ rm_if_exist() {
    if [ -f $1 ]; then
        rm $VERBOSE $1
    elif [ -d $1 ]; then
-        rm -r $VERBOSE $1
+        rm -rf $VERBOSE $1
    fi
}
...
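A small aside on the `rm_if_exist` hunk above: switching to `rm -rf` keeps the `set -e` script from aborting on stubborn or write-protected entries. A minimal Python sketch of the same helper (name and semantics taken from the hunk, not part of the merge):

```python
import os
import shutil

def rm_if_exist(path: str) -> None:
    """Remove a file or a directory tree; silently skip paths that do not exist."""
    if os.path.isfile(path):
        os.remove(path)
    elif os.path.isdir(path):
        # ignore_errors mirrors the forcing behavior of `rm -rf`.
        shutil.rmtree(path, ignore_errors=True)
```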
""" # Copyright (c) Microsoft Corporation.
Copyright 2020 The Microsoft DeepSpeed Team # SPDX-License-Identifier: Apache-2.0
"""
# DeepSpeed Team
import sys import sys
import os import os
import pkgutil import pkgutil
...@@ -8,6 +10,9 @@ import importlib ...@@ -8,6 +10,9 @@ import importlib
from .builder import get_default_compute_capabilities, OpBuilder from .builder import get_default_compute_capabilities, OpBuilder
# Do not remove, required for abstract accelerator to detect if we have a deepspeed or 3p op_builder
__deepspeed__ = True
# List of all available op builders from deepspeed op_builder # List of all available op builders from deepspeed op_builder
try: try:
import deepspeed.ops.op_builder # noqa: F401 import deepspeed.ops.op_builder # noqa: F401
...@@ -42,9 +47,7 @@ for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(this_module.__fil ...@@ -42,9 +47,7 @@ for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(this_module.__fil
if module_name != 'all_ops' and module_name != 'builder': if module_name != 'all_ops' and module_name != 'builder':
module = importlib.import_module(f".{module_name}", package=op_builder_dir) module = importlib.import_module(f".{module_name}", package=op_builder_dir)
for member_name in module.__dir__(): for member_name in module.__dir__():
if member_name.endswith( if member_name.endswith('Builder') and member_name != "OpBuilder" and member_name != "CUDAOpBuilder":
'Builder'
) and member_name != "OpBuilder" and member_name != "CUDAOpBuilder":
# assign builder name to variable with same name # assign builder name to variable with same name
# the following is equivalent to i.e. TransformerBuilder = "TransformerBuilder" # the following is equivalent to i.e. TransformerBuilder = "TransformerBuilder"
this_module.__dict__[member_name] = builder_closure(member_name) this_module.__dict__[member_name] = builder_closure(member_name)
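`builder_closure` itself lives in an elided part of this file. Judging from the comments above, it binds each exported `*Builder` name to a deferred lookup so that importing the package stays cheap; a hypothetical sketch of that pattern (the real implementation may differ):

```python
def builder_closure(member_name):
    # Defer resolving the concrete builder class until it is actually
    # requested; the abstract accelerator picks the right implementation.
    def _builder():
        from deepspeed.accelerator import get_accelerator  # assumed entry point
        return get_accelerator().create_op_builder(member_name)
    return _builder
```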
""" # Copyright (c) Microsoft Corporation.
Copyright 2020 The Microsoft DeepSpeed Team # SPDX-License-Identifier: Apache-2.0
"""
# DeepSpeed Team
import os import os
import pkgutil import pkgutil
import importlib import importlib
......
""" # Copyright (c) Microsoft Corporation.
Copyright 2020 The Microsoft DeepSpeed Team # SPDX-License-Identifier: Apache-2.0
"""
# DeepSpeed Team
import distutils.spawn import distutils.spawn
import subprocess import subprocess
...@@ -19,14 +21,10 @@ class AsyncIOBuilder(OpBuilder): ...@@ -19,14 +21,10 @@ class AsyncIOBuilder(OpBuilder):
def sources(self): def sources(self):
return [ return [
'csrc/aio/py_lib/deepspeed_py_copy.cpp', 'csrc/aio/py_lib/deepspeed_py_copy.cpp', 'csrc/aio/py_lib/py_ds_aio.cpp',
'csrc/aio/py_lib/py_ds_aio.cpp', 'csrc/aio/py_lib/deepspeed_py_aio.cpp', 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp',
'csrc/aio/py_lib/deepspeed_py_aio.cpp', 'csrc/aio/py_lib/deepspeed_aio_thread.cpp', 'csrc/aio/common/deepspeed_aio_utils.cpp',
'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp', 'csrc/aio/common/deepspeed_aio_common.cpp', 'csrc/aio/common/deepspeed_aio_types.cpp',
'csrc/aio/py_lib/deepspeed_aio_thread.cpp',
'csrc/aio/common/deepspeed_aio_utils.cpp',
'csrc/aio/common/deepspeed_aio_common.cpp',
'csrc/aio/common/deepspeed_aio_types.cpp',
'csrc/aio/py_lib/deepspeed_pin_tensor.cpp' 'csrc/aio/py_lib/deepspeed_pin_tensor.cpp'
] ]
...@@ -52,21 +50,14 @@ class AsyncIOBuilder(OpBuilder): ...@@ -52,21 +50,14 @@ class AsyncIOBuilder(OpBuilder):
] ]
def extra_ldflags(self): def extra_ldflags(self):
#aiss
#return ['-laio'] #return ['-laio']
return ['-laio', '-liomp5'] return ['-laio', '-liomp5']
def check_for_libaio_pkg(self): def check_for_libaio_pkg(self):
libs = dict( libs = dict(
dpkg=["-l", dpkg=["-l", "libaio-dev", "apt"],
"libaio-dev", pacman=["-Q", "libaio", "pacman"],
"apt"], rpm=["-q", "libaio-devel", "yum"],
pacman=["-Q",
"libaio",
"pacman"],
rpm=["-q",
"libaio-devel",
"yum"],
) )
found = False found = False
...@@ -75,15 +66,11 @@ class AsyncIOBuilder(OpBuilder): ...@@ -75,15 +66,11 @@ class AsyncIOBuilder(OpBuilder):
path = distutils.spawn.find_executable(pkgmgr) path = distutils.spawn.find_executable(pkgmgr)
if path is not None: if path is not None:
cmd = f"{pkgmgr} {flag} {lib}" cmd = f"{pkgmgr} {flag} {lib}"
result = subprocess.Popen(cmd, result = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
shell=True)
if result.wait() == 0: if result.wait() == 0:
found = True found = True
else: else:
self.warning( self.warning(f"{self.NAME}: please install the {lib} package with {tool}")
f"{self.NAME}: please install the {lib} package with {tool}")
break break
return found return found
...@@ -95,9 +82,7 @@ class AsyncIOBuilder(OpBuilder): ...@@ -95,9 +82,7 @@ class AsyncIOBuilder(OpBuilder):
# respectively to specify the directories for libaio.h and libaio.so. # respectively to specify the directories for libaio.h and libaio.so.
aio_compatible = self.has_function('io_submit', ('aio', )) aio_compatible = self.has_function('io_submit', ('aio', ))
if verbose and not aio_compatible: if verbose and not aio_compatible:
self.warning( self.warning(f"{self.NAME} requires the dev libaio .so object and headers but these were not found.")
f"{self.NAME} requires the dev libaio .so object and headers but these were not found."
)
# Check for the libaio package via known package managers # Check for the libaio package via known package managers
# to print suggestions on which package to install. # to print suggestions on which package to install.
......
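The comment above refers to pointing the libaio probe at a non-standard install. Since `has_function` (shown later in this diff) reads `CFLAGS` and `LDFLAGS` from the environment, a custom location can be supplied roughly like this (the prefix is a placeholder):

```python
import os

# Hypothetical libaio prefix; adjust to wherever libaio.h and libaio.so live.
os.environ["CFLAGS"] = "-I/opt/libaio/include"
os.environ["LDFLAGS"] = "-L/opt/libaio/lib"

from deepspeed.ops.op_builder import AsyncIOBuilder  # assumed import path
print(AsyncIOBuilder().is_compatible(verbose=True))
```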
""" # Copyright (c) Microsoft Corporation.
Copyright 2020 The Microsoft DeepSpeed Team # SPDX-License-Identifier: Apache-2.0
"""
# DeepSpeed Team
import os import os
import sys import sys
import time import time
...@@ -27,25 +29,18 @@ DEFAULT_COMPUTE_CAPABILITIES = "6.0;6.1;7.0" ...@@ -27,25 +29,18 @@ DEFAULT_COMPUTE_CAPABILITIES = "6.0;6.1;7.0"
try: try:
import torch import torch
except ImportError: except ImportError:
print( print(f"{WARNING} unable to import torch, please install it if you want to pre-compile any deepspeed ops.")
f"{WARNING} unable to import torch, please install it if you want to pre-compile any deepspeed ops."
)
else: else:
TORCH_MAJOR = int(torch.__version__.split('.')[0]) TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1]) TORCH_MINOR = int(torch.__version__.split('.')[1])
def installed_cuda_version(name=""): def installed_cuda_version(name=""):
import torch.cuda
if not torch.cuda.is_available():
return 0, 0
import torch.utils.cpp_extension import torch.utils.cpp_extension
cuda_home = torch.utils.cpp_extension.CUDA_HOME cuda_home = torch.utils.cpp_extension.CUDA_HOME
assert cuda_home is not None, "CUDA_HOME does not exist, unable to compile CUDA op(s)" assert cuda_home is not None, "CUDA_HOME does not exist, unable to compile CUDA op(s)"
# Ensure there is not a cuda version mismatch between torch and nvcc compiler # Ensure there is not a cuda version mismatch between torch and nvcc compiler
output = subprocess.check_output([cuda_home + "/bin/nvcc", output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"], universal_newlines=True)
"-V"],
universal_newlines=True)
output_split = output.split() output_split = output.split()
release_idx = output_split.index("release") release_idx = output_split.index("release")
release = output_split[release_idx + 1].replace(',', '').split(".") release = output_split[release_idx + 1].replace(',', '').split(".")
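To make the string surgery above concrete, here is the same parsing applied to a sample `nvcc -V` banner (sample text only; real output varies by toolkit):

```python
# Abridged `nvcc -V` output; the parser only needs the "release X.Y," token.
output = ("nvcc: NVIDIA (R) Cuda compiler driver\n"
          "Cuda compilation tools, release 11.7, V11.7.99")
output_split = output.split()
release_idx = output_split.index("release")
release = output_split[release_idx + 1].replace(',', '').split(".")
print(release[:2])  # -> ['11', '7']
```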
@@ -57,8 +52,7 @@ def installed_cuda_version(name=""):
def get_default_compute_capabilities():
    compute_caps = DEFAULT_COMPUTE_CAPABILITIES
    import torch.utils.cpp_extension
-    if torch.utils.cpp_extension.CUDA_HOME is not None and installed_cuda_version(
-    )[0] >= 11:
+    if torch.utils.cpp_extension.CUDA_HOME is not None and installed_cuda_version()[0] >= 11:
        if installed_cuda_version()[0] == 11 and installed_cuda_version()[1] == 0:
            # Special treatment of CUDA 11.0 because compute_86 is not supported.
            compute_caps += ";8.0"
@@ -75,37 +69,25 @@ cuda_minor_mismatch_ok = {
        "10.1",
        "10.2",
    ],
-    11: ["11.0",
-         "11.1",
-         "11.2",
-         "11.3",
-         "11.4",
-         "11.5",
-         "11.6",
-         "11.7",
-         "11.8"],
+    11: ["11.0", "11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8"],
}


def assert_no_cuda_mismatch(name=""):
    cuda_major, cuda_minor = installed_cuda_version(name)
-    if cuda_minor == 0 and cuda_major == 0:
-        return False
    sys_cuda_version = f'{cuda_major}.{cuda_minor}'
    torch_cuda_version = ".".join(torch.version.cuda.split('.')[:2])
    # This is a show-stopping error, should probably not proceed past this
    if sys_cuda_version != torch_cuda_version:
-        if (cuda_major in cuda_minor_mismatch_ok
-                and sys_cuda_version in cuda_minor_mismatch_ok[cuda_major]
+        if (cuda_major in cuda_minor_mismatch_ok and sys_cuda_version in cuda_minor_mismatch_ok[cuda_major]
                and torch_cuda_version in cuda_minor_mismatch_ok[cuda_major]):
            print(f"Installed CUDA version {sys_cuda_version} does not match the "
                  f"version torch was compiled with {torch.version.cuda} "
                  "but since the APIs are compatible, accepting this combination")
            return True
-        raise Exception(
-            f">- DeepSpeed Op Builder: Installed CUDA version {sys_cuda_version} does not match the "
-            f"version torch was compiled with {torch.version.cuda}, unable to compile "
-            "cuda/cpp extensions without a matching cuda version.")
+        raise Exception(f">- DeepSpeed Op Builder: Installed CUDA version {sys_cuda_version} does not match the "
+                        f"version torch was compiled with {torch.version.cuda}, unable to compile "
+                        "cuda/cpp extensions without a matching cuda version.")
    return True
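A worked reading of the tolerance table and the check above, restated as a self-contained snippet (only the CUDA 11 row is copied from the diff; other rows are omitted):

```python
cuda_minor_mismatch_ok = {11: ["11.0", "11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8"]}

def mismatch_accepted(sys_cuda_version: str, torch_cuda_version: str) -> bool:
    # Same acceptance rule as assert_no_cuda_mismatch: both versions must
    # share a major release listed in the tolerance table.
    cuda_major = int(sys_cuda_version.split('.')[0])
    return (cuda_major in cuda_minor_mismatch_ok
            and sys_cuda_version in cuda_minor_mismatch_ok[cuda_major]
            and torch_cuda_version in cuda_minor_mismatch_ok[cuda_major])

print(mismatch_accepted("11.8", "11.7"))  # True  -> warn and continue
print(mismatch_accepted("11.8", "10.2"))  # False -> raise
```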
@@ -142,12 +124,11 @@ class OpBuilder(ABC):
        install_torch_version = torch_info['version']
        current_torch_version = ".".join(torch.__version__.split('.')[:2])
        if install_torch_version != current_torch_version:
-            raise RuntimeError(
-                "PyTorch version mismatch! DeepSpeed ops were compiled and installed "
-                "with a different version than what is being used at runtime. "
-                f"Please re-install DeepSpeed or switch torch versions. "
-                f"Install torch version={install_torch_version}, "
-                f"Runtime torch version={current_torch_version}")
+            raise RuntimeError("PyTorch version mismatch! DeepSpeed ops were compiled and installed "
+                               "with a different version than what is being used at runtime. "
+                               f"Please re-install DeepSpeed or switch torch versions. "
+                               f"Install torch version={install_torch_version}, "
+                               f"Runtime torch version={current_torch_version}")

    @staticmethod
    def validate_torch_op_version(torch_info):
@@ -155,22 +136,20 @@ class OpBuilder(ABC):
            current_cuda_version = ".".join(torch.version.cuda.split('.')[:2])
            install_cuda_version = torch_info['cuda_version']
            if install_cuda_version != current_cuda_version:
-                raise RuntimeError(
-                    "CUDA version mismatch! DeepSpeed ops were compiled and installed "
-                    "with a different version than what is being used at runtime. "
-                    f"Please re-install DeepSpeed or switch torch versions. "
-                    f"Install CUDA version={install_cuda_version}, "
-                    f"Runtime CUDA version={current_cuda_version}")
+                raise RuntimeError("CUDA version mismatch! DeepSpeed ops were compiled and installed "
+                                   "with a different version than what is being used at runtime. "
+                                   f"Please re-install DeepSpeed or switch torch versions. "
+                                   f"Install CUDA version={install_cuda_version}, "
+                                   f"Runtime CUDA version={current_cuda_version}")
        else:
            current_hip_version = ".".join(torch.version.hip.split('.')[:2])
            install_hip_version = torch_info['hip_version']
            if install_hip_version != current_hip_version:
-                raise RuntimeError(
-                    "HIP version mismatch! DeepSpeed ops were compiled and installed "
-                    "with a different version than what is being used at runtime. "
-                    f"Please re-install DeepSpeed or switch torch versions. "
-                    f"Install HIP version={install_hip_version}, "
-                    f"Runtime HIP version={current_hip_version}")
+                raise RuntimeError("HIP version mismatch! DeepSpeed ops were compiled and installed "
+                                   "with a different version than what is being used at runtime. "
+                                   f"Please re-install DeepSpeed or switch torch versions. "
+                                   f"Install HIP version={install_hip_version}, "
+                                   f"Runtime HIP version={current_hip_version}")

    @staticmethod
    def is_rocm_pytorch():
@@ -184,8 +163,7 @@ class OpBuilder(ABC):
            pass
        else:
            if TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 5):
-                _is_rocm_pytorch = hasattr(torch.version,
-                                           'hip') and torch.version.hip is not None
+                _is_rocm_pytorch = hasattr(torch.version, 'hip') and torch.version.hip is not None
                if _is_rocm_pytorch:
                    from torch.utils.cpp_extension import ROCM_HOME
                    _is_rocm_pytorch = ROCM_HOME is not None
@@ -240,7 +218,6 @@ class OpBuilder(ABC):
        return True

    def extra_ldflags(self):
-        #aiss
        #return []
        return ['-liomp5']
@@ -248,10 +225,7 @@ class OpBuilder(ABC):
        valid = False
        check_cmd = 'dpkg -l'
        for lib in libraries:
-            result = subprocess.Popen(f'dpkg -l {lib}',
-                                      stdout=subprocess.PIPE,
-                                      stderr=subprocess.PIPE,
-                                      shell=True)
+            result = subprocess.Popen(f'dpkg -l {lib}', stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
            valid = valid or result.wait() == 0
        return valid
@@ -282,9 +256,7 @@ class OpBuilder(ABC):
        tempdir = tempfile.mkdtemp()

        # Define a simple C program that calls the function in question
-        prog = "void %s(void); int main(int argc, char** argv) { %s(); return 0; }" % (
-            funcname,
-            funcname)
+        prog = "void %s(void); int main(int argc, char** argv) { %s(); return 0; }" % (funcname, funcname)

        # Write the test program to a file.
        filename = os.path.join(tempdir, 'test.c')
@@ -305,16 +277,13 @@ class OpBuilder(ABC):
        # Attempt to compile the C program into an object file.
        cflags = shlex.split(os.environ.get('CFLAGS', ""))
-        objs = compiler.compile([filename],
-                                output_dir=output_dir,
-                                extra_preargs=self.strip_empty_entries(cflags))
+        objs = compiler.compile([filename], output_dir=output_dir, extra_preargs=self.strip_empty_entries(cflags))

        # Attempt to link the object file into an executable.
        # Be sure to tack on any libraries that have been specified.
        ldflags = shlex.split(os.environ.get('LDFLAGS', ""))
        compiler.link_executable(objs,
-                                 os.path.join(tempdir,
-                                              'a.out'),
+                                 os.path.join(tempdir, 'a.out'),
                                 extra_preargs=self.strip_empty_entries(ldflags),
                                 libraries=libraries)
@@ -358,9 +327,8 @@ class OpBuilder(ABC):
        try:
            cpu_info = get_cpu_info()
        except Exception as e:
-            self.warning(
-                f"{self.name} attempted to use `py-cpuinfo` but failed (exception type: {type(e)}, {e}), "
-                "falling back to `lscpu` to get this information.")
+            self.warning(f"{self.name} attempted to use `py-cpuinfo` but failed (exception type: {type(e)}, {e}), "
+                         "falling back to `lscpu` to get this information.")
            cpu_info = self._backup_cpuinfo()
            if cpu_info is None:
                return "-march=native"
@@ -372,23 +340,23 @@ class OpBuilder(ABC):
    def is_cuda_enable(self):
        try:
-            #assert_no_cuda_mismatch(self.name)
-            #return '-D__ENABLE_CUDA__'
-            #aiss
            if torch.cuda.is_available():
                return '-D__ENABLE_CUDA__'
-        except:
-            print(
-                f"{WARNING} {self.name} torch.cuda is missing, only cpu ops can be compiled!"
-            )
+        except BaseException:
+            print(f"{WARNING} {self.name} cuda is missing or is incompatible with installed torch, "
+                  "only cpu ops can be compiled!")
            return '-D__DISABLE_CUDA__'
        return '-D__DISABLE_CUDA__'

    def _backup_cpuinfo(self):
        # Construct cpu_info dict from lscpu that is similar to what py-cpuinfo provides
        if not self.command_exists('lscpu'):
-            self.warning(
-                f"{self.name} attempted to query 'lscpu' after failing to use py-cpuinfo "
-                "to detect the CPU architecture. 'lscpu' does not appear to exist on "
-                "your system, will fall back to use -march=native and non-vectorized execution."
-            )
+            self.warning(f"{self.name} attempted to query 'lscpu' after failing to use py-cpuinfo "
+                         "to detect the CPU architecture. 'lscpu' does not appear to exist on "
+                         "your system, will fall back to use -march=native and non-vectorized execution.")
            return None
        result = subprocess.check_output('lscpu', shell=True)
        result = result.decode('utf-8').strip().lower()
@@ -420,9 +388,8 @@ class OpBuilder(ABC):
        try:
            cpu_info = get_cpu_info()
        except Exception as e:
-            self.warning(
-                f"{self.name} attempted to use `py-cpuinfo` but failed (exception type: {type(e)}, {e}), "
-                "falling back to `lscpu` to get this information.")
+            self.warning(f"{self.name} attempted to use `py-cpuinfo` but failed (exception type: {type(e)}, {e}), "
+                         "falling back to `lscpu` to get this information.")
            cpu_info = self._backup_cpuinfo()
            if cpu_info is None:
                return '-D__SCALAR__'
@@ -445,13 +412,9 @@ class OpBuilder(ABC):
            valid = valid or result.wait() == 0

        if not valid and len(cmds) > 1:
-            print(
-                f"{WARNING} {self.name} requires one of the following commands '{cmds}', but it does not exist!"
-            )
+            print(f"{WARNING} {self.name} requires one of the following commands '{cmds}', but it does not exist!")
        elif not valid and len(cmds) == 1:
-            print(
-                f"{WARNING} {self.name} requires the '{cmd}' command, but it does not exist!"
-            )
+            print(f"{WARNING} {self.name} requires the '{cmd}' command, but it does not exist!")
        return valid

    def warning(self, msg):
@@ -466,12 +429,11 @@ class OpBuilder(ABC):
    def builder(self):
        from torch.utils.cpp_extension import CppExtension
-        return CppExtension(
-            name=self.absolute_name(),
-            sources=self.strip_empty_entries(self.sources()),
-            include_dirs=self.strip_empty_entries(self.include_paths()),
-            extra_compile_args={'cxx': self.strip_empty_entries(self.cxx_args())},
-            extra_link_args=self.strip_empty_entries(self.extra_ldflags()))
+        return CppExtension(name=self.absolute_name(),
+                            sources=self.strip_empty_entries(self.sources()),
+                            include_dirs=self.strip_empty_entries(self.include_paths()),
+                            extra_compile_args={'cxx': self.strip_empty_entries(self.cxx_args())},
+                            extra_link_args=self.strip_empty_entries(self.extra_ldflags()))

    def load(self, verbose=True):
        from deepspeed.git_version_info import installed_ops, torch_info
@@ -480,9 +442,8 @@ class OpBuilder(ABC):
            # torch/cuda versions we are currently using at runtime.
            self.validate_torch_version(torch_info)
            if torch.cuda.is_available() and isinstance(self, CUDAOpBuilder):
-                #aiss HIP version mismatch error
-                #self.validate_torch_op_version(torch_info)
-                pass
+                self.validate_torch_op_version(torch_info)
            return importlib.import_module(self.absolute_name())
        else:
            return self.jit_load(verbose)
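For context on the `load()` path above: callers normally instantiate a builder and call `load()`, which returns the pre-compiled extension module when the recorded torch/CUDA versions validate, and otherwise falls back to a JIT build. A hypothetical usage (whether the op is available depends on the install):

```python
from deepspeed.ops.op_builder import FusedAdamBuilder  # assumed import path

builder = FusedAdamBuilder()
if builder.is_compatible():
    fused_adam_module = builder.load()  # pre-built .so if installed, else JIT compile
```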
@@ -495,21 +456,21 @@ class OpBuilder(ABC):
        try:
            import ninja  # noqa: F401
        except ImportError:
-            raise RuntimeError(
-                f"Unable to JIT load the {self.name} op due to ninja not being installed."
-            )
+            raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.")

        if isinstance(self, CUDAOpBuilder) and not self.is_rocm_pytorch():
-            self.build_for_cpu = not assert_no_cuda_mismatch(self.name)
+            try:
+                assert_no_cuda_mismatch(self.name)
+                self.build_for_cpu = False
+            except BaseException:
+                self.build_for_cpu = True

        self.jit_mode = True
        from torch.utils.cpp_extension import load

        start_build = time.time()
        sources = [self.deepspeed_src_path(path) for path in self.sources()]
-        extra_include_paths = [
-            self.deepspeed_src_path(path) for path in self.include_paths()
-        ]
+        extra_include_paths = [self.deepspeed_src_path(path) for path in self.include_paths()]

        # Torch will try and apply whatever CCs are in the arch list at compile time,
        # we have already set the intended targets ourselves we know that will be
@@ -520,14 +481,13 @@ class OpBuilder(ABC):
        torch_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST")
        os.environ["TORCH_CUDA_ARCH_LIST"] = ""

-        op_module = load(
-            name=self.name,
-            sources=self.strip_empty_entries(sources),
-            extra_include_paths=self.strip_empty_entries(extra_include_paths),
-            extra_cflags=self.strip_empty_entries(self.cxx_args()),
-            extra_cuda_cflags=self.strip_empty_entries(self.nvcc_args()),
-            extra_ldflags=self.strip_empty_entries(self.extra_ldflags()),
-            verbose=verbose)
+        op_module = load(name=self.name,
+                         sources=self.strip_empty_entries(sources),
+                         extra_include_paths=self.strip_empty_entries(extra_include_paths),
+                         extra_cflags=self.strip_empty_entries(self.cxx_args()),
+                         extra_cuda_cflags=self.strip_empty_entries(self.nvcc_args()),
+                         extra_ldflags=self.strip_empty_entries(self.extra_ldflags()),
+                         verbose=verbose)

        build_duration = time.time() - start_build
        if verbose:
@@ -541,6 +501,7 @@ class OpBuilder(ABC):

class CUDAOpBuilder(OpBuilder):
+
    def compute_capability_args(self, cross_compile_archs=None):
        """
        Returns nvcc compute capability compile flags.
@@ -587,8 +548,7 @@ class CUDAOpBuilder(OpBuilder):
        ccs = self.filter_ccs(ccs)
        if len(ccs) == 0:
            raise RuntimeError(
-                f"Unable to load {self.name} op due to no compute capabilities remaining after filtering"
-            )
+                f"Unable to load {self.name} op due to no compute capabilities remaining after filtering")

        args = []
        for cc in ccs:
@@ -623,7 +583,16 @@ class CUDAOpBuilder(OpBuilder):
        return super().is_compatible(verbose)

    def builder(self):
-        #self.build_for_cpu = not assert_no_cuda_mismatch(self.name)
+        #try:
+        #    assert_no_cuda_mismatch(self.name)
+        #    self.build_for_cpu = False
+        #except BaseException:
+        #    self.build_for_cpu = True
+
+        #if self.build_for_cpu:
+        #    from torch.utils.cpp_extension import CppExtension as ExtensionBuilder
+        #else:
+        #    from torch.utils.cpp_extension import CUDAExtension as ExtensionBuilder
        #aiss
        if not self.is_rocm_pytorch():
            self.build_for_cpu = not assert_no_cuda_mismatch(self.name)
@@ -632,23 +601,17 @@ class CUDAOpBuilder(OpBuilder):
            else:
                from torch.utils.cpp_extension import CUDAExtension as ExtensionBuilder
        else:
            from torch.utils.cpp_extension import CUDAExtension as ExtensionBuilder
-        #self.build_for_cpu = not assert_no_cuda_mismatch(self.name)
-        #if self.build_for_cpu:
-        #    from torch.utils.cpp_extension import CppExtension as ExtensionBuilder
-        #else:
-        #    from torch.utils.cpp_extension import CUDAExtension as ExtensionBuilder

        compile_args = {'cxx': self.strip_empty_entries(self.cxx_args())} if self.build_for_cpu else \
            {'cxx': self.strip_empty_entries(self.cxx_args()), \
             'nvcc': self.strip_empty_entries(self.nvcc_args())}

-        cuda_ext = ExtensionBuilder(
-            name=self.absolute_name(),
-            sources=self.strip_empty_entries(self.sources()),
-            include_dirs=self.strip_empty_entries(self.include_paths()),
-            libraries=self.strip_empty_entries(self.libraries_args()),
-            extra_compile_args=compile_args)
+        cuda_ext = ExtensionBuilder(name=self.absolute_name(),
+                                    sources=self.strip_empty_entries(self.sources()),
+                                    include_dirs=self.strip_empty_entries(self.include_paths()),
+                                    libraries=self.strip_empty_entries(self.libraries_args()),
+                                    extra_compile_args=compile_args)

        if self.is_rocm_pytorch():
            # hip converts paths to absolute, this converts back to relative
@@ -656,7 +619,10 @@ class CUDAOpBuilder(OpBuilder):
            curr_file = Path(__file__).parent.parent  # ds root
            for i in range(len(sources)):
                src = Path(sources[i])
-                sources[i] = str(src.relative_to(curr_file))
+                if src.is_absolute():
+                    sources[i] = str(src.relative_to(curr_file))
+                else:
+                    sources[i] = str(src)
            cuda_ext.sources = sources
        return cuda_ext
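The new `is_absolute()` guard matters because `Path.relative_to` raises `ValueError` when the path cannot be expressed relative to the given root; hipify usually rewrites sources to absolute paths, but any that remain relative must pass through untouched. A standalone illustration with assumed paths:

```python
from pathlib import Path

root = Path("/workspace/DeepSpeed")  # stand-in for the DeepSpeed source root
sources = ["/workspace/DeepSpeed/csrc/adam/cpu_adam.cpp",  # hipified, absolute
           "csrc/common/custom_cuda_kernel.cu"]            # left relative

for i, raw in enumerate(sources):
    src = Path(raw)
    if src.is_absolute():
        sources[i] = str(src.relative_to(root))
    else:
        sources[i] = str(src)

print(sources)  # ['csrc/adam/cpu_adam.cpp', 'csrc/common/custom_cuda_kernel.cu']
```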
@@ -711,9 +677,7 @@ class CUDAOpBuilder(OpBuilder):
        if self.is_rocm_pytorch():
            ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version()
            args += [
-                '-std=c++14',
-                '-U__HIP_NO_HALF_OPERATORS__',
-                '-U__HIP_NO_HALF_CONVERSIONS__',
+                '-std=c++14', '-U__HIP_NO_HALF_OPERATORS__', '-U__HIP_NO_HALF_CONVERSIONS__',
                '-U__HIP_NO_HALF2_OPERATORS__',
                '-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR,
                '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR,
@@ -722,13 +686,9 @@ class CUDAOpBuilder(OpBuilder):
        else:
            cuda_major, _ = installed_cuda_version()
            args += [
-                '-allow-unsupported-compiler' if sys.platform == "win32" else '',
-                '--use_fast_math',
-                '-std=c++17'
-                if sys.platform == "win32" and cuda_major > 10 else '-std=c++14',
-                '-U__CUDA_NO_HALF_OPERATORS__',
-                '-U__CUDA_NO_HALF_CONVERSIONS__',
-                '-U__CUDA_NO_HALF2_OPERATORS__'
+                '-allow-unsupported-compiler' if sys.platform == "win32" else '', '--use_fast_math',
+                '-std=c++17' if sys.platform == "win32" and cuda_major > 10 else '-std=c++14',
+                '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__'
            ]
        if os.environ.get('DS_DEBUG_CUDA_BUILD', '0') == '1':
            args.append('--ptxas-options=-v')
@@ -742,10 +702,12 @@ class CUDAOpBuilder(OpBuilder):
        if sys.platform == "win32":
            return ['cublas', 'curand']
        else:
+            #return []
            return ['iomp5']


class TorchCPUOpBuilder(CUDAOpBuilder):
+
    def extra_ldflags(self):
        if self.build_for_cpu:
            return ['-fopenmp']
...
""" # Copyright (c) Microsoft Corporation.
Copyright 2020 The Microsoft DeepSpeed Team # SPDX-License-Identifier: Apache-2.0
"""
# DeepSpeed Team
import os import os
from .builder import TorchCPUOpBuilder from .builder import TorchCPUOpBuilder
...@@ -38,13 +40,8 @@ class CPUAdagradBuilder(TorchCPUOpBuilder): ...@@ -38,13 +40,8 @@ class CPUAdagradBuilder(TorchCPUOpBuilder):
CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")] CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")]
else: else:
CUDA_INCLUDE = [ CUDA_INCLUDE = [
os.path.join(torch.utils.cpp_extension.ROCM_HOME, os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include"),
"include"), os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "rocrand"),
os.path.join(torch.utils.cpp_extension.ROCM_HOME, os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "hiprand"),
"include",
"rocrand"),
os.path.join(torch.utils.cpp_extension.ROCM_HOME,
"include",
"hiprand"),
] ]
return ['csrc/includes'] + CUDA_INCLUDE return ['csrc/includes'] + CUDA_INCLUDE
""" # Copyright (c) Microsoft Corporation.
Copyright 2020 The Microsoft DeepSpeed Team # SPDX-License-Identifier: Apache-2.0
"""
# DeepSpeed Team
import os import os
from .builder import TorchCPUOpBuilder from .builder import TorchCPUOpBuilder
...@@ -8,7 +10,7 @@ from .builder import TorchCPUOpBuilder ...@@ -8,7 +10,7 @@ from .builder import TorchCPUOpBuilder
class CPUAdamBuilder(TorchCPUOpBuilder): class CPUAdamBuilder(TorchCPUOpBuilder):
BUILD_VAR = "DS_BUILD_CPU_ADAM" BUILD_VAR = "DS_BUILD_CPU_ADAM"
NAME = "cpu_adam" NAME = "cpu_adam"
def __init__(self): def __init__(self):
super().__init__(name=self.NAME) super().__init__(name=self.NAME)
...@@ -18,7 +20,7 @@ class CPUAdamBuilder(TorchCPUOpBuilder): ...@@ -18,7 +20,7 @@ class CPUAdamBuilder(TorchCPUOpBuilder):
def sources(self): def sources(self):
if self.build_for_cpu: if self.build_for_cpu:
return ['csrc/adam/cpu_adam.cpp'] return ['csrc/adam/cpu_adam.cpp']
return ['csrc/adam/cpu_adam.cpp', 'csrc/common/custom_cuda_kernel.cu'] return ['csrc/adam/cpu_adam.cpp', 'csrc/common/custom_cuda_kernel.cu']
def libraries_args(self): def libraries_args(self):
...@@ -28,6 +30,7 @@ class CPUAdamBuilder(TorchCPUOpBuilder): ...@@ -28,6 +30,7 @@ class CPUAdamBuilder(TorchCPUOpBuilder):
if not self.is_rocm_pytorch(): if not self.is_rocm_pytorch():
args += ['curand'] args += ['curand']
return args return args
def include_paths(self): def include_paths(self):
...@@ -38,13 +41,8 @@ class CPUAdamBuilder(TorchCPUOpBuilder): ...@@ -38,13 +41,8 @@ class CPUAdamBuilder(TorchCPUOpBuilder):
CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")] CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")]
else: else:
CUDA_INCLUDE = [ CUDA_INCLUDE = [
os.path.join(torch.utils.cpp_extension.ROCM_HOME, os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include"),
"include"), os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "rocrand"),
os.path.join(torch.utils.cpp_extension.ROCM_HOME, os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "hiprand"),
"include",
"rocrand"),
os.path.join(torch.utils.cpp_extension.ROCM_HOME,
"include",
"hiprand"),
] ]
return ['csrc/includes'] + CUDA_INCLUDE return ['csrc/includes'] + CUDA_INCLUDE
""" # Copyright (c) Microsoft Corporation.
Copyright 2020 The Microsoft DeepSpeed Team # SPDX-License-Identifier: Apache-2.0
"""
# DeepSpeed Team
from .builder import CUDAOpBuilder from .builder import CUDAOpBuilder
import sys import sys
...@@ -29,9 +31,7 @@ class FusedAdamBuilder(CUDAOpBuilder): ...@@ -29,9 +31,7 @@ class FusedAdamBuilder(CUDAOpBuilder):
def nvcc_args(self): def nvcc_args(self):
nvcc_flags = ['-O3'] + self.version_dependent_macros() nvcc_flags = ['-O3'] + self.version_dependent_macros()
if not self.is_rocm_pytorch(): if not self.is_rocm_pytorch():
nvcc_flags.extend([ nvcc_flags.extend(
'-allow-unsupported-compiler' if sys.platform == "win32" else '', ['-allow-unsupported-compiler' if sys.platform == "win32" else '', '-lineinfo', '--use_fast_math'] +
'-lineinfo', self.compute_capability_args())
'--use_fast_math'
] + self.compute_capability_args())
return nvcc_flags return nvcc_flags
""" # Copyright (c) Microsoft Corporation.
Copyright 2020 The Microsoft DeepSpeed Team # SPDX-License-Identifier: Apache-2.0
"""
# DeepSpeed Team
from .builder import CUDAOpBuilder from .builder import CUDAOpBuilder
import sys import sys
...@@ -30,14 +32,9 @@ class FusedLambBuilder(CUDAOpBuilder): ...@@ -30,14 +32,9 @@ class FusedLambBuilder(CUDAOpBuilder):
nvcc_flags = ['-O3'] + self.version_dependent_macros() nvcc_flags = ['-O3'] + self.version_dependent_macros()
if self.is_rocm_pytorch(): if self.is_rocm_pytorch():
ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version() ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version()
nvcc_flags += [ nvcc_flags += ['-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR]
'-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR,
'-DROCM_VERSION_MINOR=%s' % ROCM_MINOR
]
else: else:
nvcc_flags.extend([ nvcc_flags.extend(
'-allow-unsupported-compiler' if sys.platform == "win32" else '', ['-allow-unsupported-compiler' if sys.platform == "win32" else '', '-lineinfo', '--use_fast_math'] +
'-lineinfo', self.compute_capability_args())
'--use_fast_math'
] + self.compute_capability_args())
return nvcc_flags return nvcc_flags
-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team

from .builder import CUDAOpBuilder
...
""" # Copyright (c) Microsoft Corporation.
Copyright 2022 The Microsoft DeepSpeed Team # SPDX-License-Identifier: Apache-2.0
"""
# DeepSpeed Team
from .builder import CUDAOpBuilder from .builder import CUDAOpBuilder
...@@ -23,18 +25,13 @@ class RandomLTDBuilder(CUDAOpBuilder): ...@@ -23,18 +25,13 @@ class RandomLTDBuilder(CUDAOpBuilder):
def sources(self): def sources(self):
return [ return [
'csrc/random_ltd/pt_binding.cpp', 'csrc/random_ltd/pt_binding.cpp', 'csrc/random_ltd/gather_scatter.cu',
'csrc/random_ltd/gather_scatter.cu', 'csrc/random_ltd/slice_attn_masks.cu', 'csrc/random_ltd/token_sort.cu'
'csrc/random_ltd/slice_attn_masks.cu',
'csrc/random_ltd/token_sort.cu'
] ]
def include_paths(self): def include_paths(self):
includes = ['csrc/includes'] includes = ['csrc/includes']
if self.is_rocm_pytorch(): if self.is_rocm_pytorch():
from torch.utils.cpp_extension import ROCM_HOME from torch.utils.cpp_extension import ROCM_HOME
includes += [ includes += ['{}/hiprand/include'.format(ROCM_HOME), '{}/rocrand/include'.format(ROCM_HOME)]
'{}/hiprand/include'.format(ROCM_HOME),
'{}/rocrand/include'.format(ROCM_HOME)
]
return includes return includes
""" # Copyright (c) Microsoft Corporation.
Copyright 2020 The Microsoft DeepSpeed Team # SPDX-License-Identifier: Apache-2.0
"""
# DeepSpeed Team
from .builder import OpBuilder from .builder import OpBuilder
try: try:
...@@ -32,9 +34,8 @@ class SparseAttnBuilder(OpBuilder): ...@@ -32,9 +34,8 @@ class SparseAttnBuilder(OpBuilder):
#deps_compatible = all(command_status) #deps_compatible = all(command_status)
if self.is_rocm_pytorch(): if self.is_rocm_pytorch():
#aiss debug
#self.warning(f'{self.NAME} is not compatible with ROCM') #self.warning(f'{self.NAME} is not compatible with ROCM')
#return False #aiss debug
return True return True
try: try:
...@@ -49,26 +50,23 @@ class SparseAttnBuilder(OpBuilder): ...@@ -49,26 +50,23 @@ class SparseAttnBuilder(OpBuilder):
self.warning(f"{self.NAME} cuda is not available from torch") self.warning(f"{self.NAME} cuda is not available from torch")
else: else:
major, minor = torch.version.cuda.split('.')[:2] major, minor = torch.version.cuda.split('.')[:2]
cuda_compatible = (int(major) == 10 cuda_compatible = (int(major) == 10 and int(minor) >= 1) or (int(major) >= 11)
and int(minor) >= 1) or (int(major) >= 11)
if not cuda_compatible: if not cuda_compatible:
self.warning(f"{self.NAME} requires CUDA version 10.1+") self.warning(f"{self.NAME} requires CUDA version 10.1+")
TORCH_MAJOR = int(torch.__version__.split('.')[0]) TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1]) TORCH_MINOR = int(torch.__version__.split('.')[1])
torch_compatible = TORCH_MAJOR == 1 and TORCH_MINOR >= 5 torch_compatible = (TORCH_MAJOR == 1 and TORCH_MINOR >= 5)
if not torch_compatible: if not torch_compatible:
self.warning( self.warning(
f'{self.NAME} requires a torch version >= 1.5 but detected {TORCH_MAJOR}.{TORCH_MINOR}' f'{self.NAME} requires a torch version >= 1.5 and < 2.0 but detected {TORCH_MAJOR}.{TORCH_MINOR}')
)
try: try:
import triton import triton
except ImportError: except ImportError:
# auto-install of triton is broken on some systems, reverting to manual install for now # auto-install of triton is broken on some systems, reverting to manual install for now
# see this issue: https://github.com/microsoft/DeepSpeed/issues/1710 # see this issue: https://github.com/microsoft/DeepSpeed/issues/1710
self.warning( self.warning(f"please install triton==1.0.0 if you want to use sparse attention")
f"please install triton==1.0.0 if you want to use sparse attention")
return False return False
if pkg_version: if pkg_version:
...@@ -79,9 +77,7 @@ class SparseAttnBuilder(OpBuilder): ...@@ -79,9 +77,7 @@ class SparseAttnBuilder(OpBuilder):
triton_mismatch = installed_triton != "1.0.0" triton_mismatch = installed_triton != "1.0.0"
if triton_mismatch: if triton_mismatch:
self.warning( self.warning(f"using untested triton version ({installed_triton}), only 1.0.0 is known to be compatible")
f"using untested triton version ({installed_triton}), only 1.0.0 is known to be compatible"
)
return False return False
return super().is_compatible(verbose) and torch_compatible and cuda_compatible return super().is_compatible(verbose) and torch_compatible and cuda_compatible
-'''
-Copyright 2022 The Microsoft DeepSpeed Team
-'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team

from .builder import CUDAOpBuilder, installed_cuda_version
@@ -19,8 +21,7 @@ class SpatialInferenceBuilder(CUDAOpBuilder):
        try:
            import torch
        except ImportError:
-            self.warning(
-                "Please install torch if trying to pre-compile inference kernels")
+            self.warning("Please install torch if trying to pre-compile inference kernels")
            return False

        cuda_okay = True
@@ -30,8 +31,7 @@ class SpatialInferenceBuilder(CUDAOpBuilder):
            cuda_capability = torch.cuda.get_device_properties(0).major
            if cuda_capability >= 8:
                if torch_cuda_major < 11 or sys_cuda_major < 11:
-                    self.warning(
-                        "On Ampere and higher architectures please use CUDA 11+")
+                    self.warning("On Ampere and higher architectures please use CUDA 11+")
                    cuda_okay = False
        return super().is_compatible(verbose) and cuda_okay
...
""" # Copyright (c) Microsoft Corporation.
Copyright 2020 The Microsoft DeepSpeed Team # SPDX-License-Identifier: Apache-2.0
"""
# DeepSpeed Team
from .transformer import TransformerBuilder from .transformer import TransformerBuilder
......
""" # Copyright (c) Microsoft Corporation.
Copyright 2020 The Microsoft DeepSpeed Team # SPDX-License-Identifier: Apache-2.0
"""
# DeepSpeed Team
from .builder import CUDAOpBuilder from .builder import CUDAOpBuilder
...@@ -23,22 +25,15 @@ class TransformerBuilder(CUDAOpBuilder): ...@@ -23,22 +25,15 @@ class TransformerBuilder(CUDAOpBuilder):
def sources(self): def sources(self):
return [ return [
'csrc/transformer/ds_transformer_cuda.cpp', 'csrc/transformer/ds_transformer_cuda.cpp', 'csrc/transformer/cublas_wrappers.cu',
'csrc/transformer/cublas_wrappers.cu', 'csrc/transformer/transform_kernels.cu', 'csrc/transformer/gelu_kernels.cu',
'csrc/transformer/transform_kernels.cu', 'csrc/transformer/dropout_kernels.cu', 'csrc/transformer/normalize_kernels.cu',
'csrc/transformer/gelu_kernels.cu', 'csrc/transformer/softmax_kernels.cu', 'csrc/transformer/general_kernels.cu'
'csrc/transformer/dropout_kernels.cu',
'csrc/transformer/normalize_kernels.cu',
'csrc/transformer/softmax_kernels.cu',
'csrc/transformer/general_kernels.cu'
] ]
def include_paths(self): def include_paths(self):
includes = ['csrc/includes'] includes = ['csrc/includes']
if self.is_rocm_pytorch(): if self.is_rocm_pytorch():
from torch.utils.cpp_extension import ROCM_HOME from torch.utils.cpp_extension import ROCM_HOME
includes += [ includes += ['{}/hiprand/include'.format(ROCM_HOME), '{}/rocrand/include'.format(ROCM_HOME)]
'{}/hiprand/include'.format(ROCM_HOME),
'{}/rocrand/include'.format(ROCM_HOME)
]
return includes return includes
-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team

from .builder import CUDAOpBuilder, installed_cuda_version
@@ -18,8 +21,7 @@ class InferenceBuilder(CUDAOpBuilder):
        try:
            import torch
        except ImportError:
-            self.warning(
-                "Please install torch if trying to pre-compile inference kernels")
+            self.warning("Please install torch if trying to pre-compile inference kernels")
            return False

        cuda_okay = True
@@ -28,14 +30,11 @@ class InferenceBuilder(CUDAOpBuilder):
            torch_cuda_major = int(torch.version.cuda.split('.')[0])
            cuda_capability = torch.cuda.get_device_properties(0).major
            if cuda_capability < 6:
-                self.warning(
-                    "NVIDIA Inference is only supported on Pascal and newer architectures"
-                )
+                self.warning("NVIDIA Inference is only supported on Pascal and newer architectures")
                cuda_okay = False
            if cuda_capability >= 8:
                if torch_cuda_major < 11 or sys_cuda_major < 11:
-                    self.warning(
-                        "On Ampere and higher architectures please use CUDA 11+")
+                    self.warning("On Ampere and higher architectures please use CUDA 11+")
                    cuda_okay = False
        return super().is_compatible(verbose) and cuda_okay
...
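Putting the two guards in the hunk above together, a condensed restatement of the decision (illustrative values, derived directly from the shown code):

```python
def cuda_okay_for(cuda_capability: int, torch_cuda_major: int, sys_cuda_major: int) -> bool:
    # Condensed restatement of the InferenceBuilder checks above, for illustration.
    if cuda_capability < 6:
        return False  # pre-Pascal: inference kernels unsupported
    if cuda_capability >= 8 and (torch_cuda_major < 11 or sys_cuda_major < 11):
        return False  # Ampere and newer need CUDA 11+
    return True

print(cuda_okay_for(8, 10, 11))  # False: Ampere but torch built with CUDA 10
print(cuda_okay_for(7, 10, 10))  # True: Volta/Turing on CUDA 10 is fine
```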
""" # Copyright (c) Microsoft Corporation.
Copyright 2020 The Microsoft DeepSpeed Team # SPDX-License-Identifier: Apache-2.0
"""
# DeepSpeed Team
from .builder import OpBuilder from .builder import OpBuilder
......
'''Copyright The Microsoft DeepSpeed Team''' # Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
from packaging import version as pkg_version from packaging import version as pkg_version
......