Unverified Commit 31f46fee authored by Jeff Rasley, committed by GitHub

DeepSpeed JIT op + PyPI support (#496)


Co-authored-by: Shaden Smith <Shaden.Smith@microsoft.com>
Co-authored-by: Reza Yazdani <reyazda@microsoft.com>
parent 0ad4fd88
import torch
from .builder import CUDAOpBuilder
class FusedAdamBuilder(CUDAOpBuilder):
BUILD_VAR = "DS_BUILD_FUSED_ADAM"
NAME = "fused_adam"
def __init__(self):
super().__init__(name=self.NAME)
def absolute_name(self):
return f'deepspeed.ops.adam.{self.NAME}_op'
def sources(self):
return ['csrc/adam/fused_adam_frontend.cpp', 'csrc/adam/multi_tensor_adam.cu']
def include_paths(self):
return ['csrc/includes']
def cxx_args(self):
return ['-O3'] + self.version_dependent_macros()
def nvcc_args(self):
return ['-lineinfo', '-O3', '--use_fast_math'] + self.version_dependent_macros()
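These builder classes power the JIT path this PR adds: instead of compiling every kernel at pip-install time, an op can be compiled on first use. A minimal usage sketch, assuming the OpBuilder base class exposes a load() method that JIT-compiles the listed sources via torch.utils.cpp_extension when no pre-built extension is found:

from deepspeed.ops.op_builder import FusedAdamBuilder

# First call triggers a ninja/nvcc build; subsequent calls reuse the cached extension.
fused_adam_module = FusedAdamBuilder().load()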
import torch
from .builder import CUDAOpBuilder
class FusedLambBuilder(CUDAOpBuilder):
BUILD_VAR = 'DS_BUILD_FUSED_LAMB'
NAME = "fused_lamb"
def __init__(self):
super().__init__(name=self.NAME)
def absolute_name(self):
return f'deepspeed.ops.lamb.{self.NAME}_op'
def sources(self):
return ['csrc/lamb/fused_lamb_cuda.cpp', 'csrc/lamb/fused_lamb_cuda_kernel.cu']
def include_paths(self):
return ['csrc/includes']
def cxx_args(self):
return ['-O3'] + self.version_dependent_macros()
def nvcc_args(self):
return ['-lineinfo', '-O3', '--use_fast_math'] + self.version_dependent_macros()
import torch
import warnings
from .builder import OpBuilder
class SparseAttnBuilder(OpBuilder):
BUILD_VAR = "DS_BUILD_SPARSE_ATTN"
NAME = "sparse_attn"
def __init__(self):
super().__init__(name=self.NAME)
def absolute_name(self):
return f'deepspeed.ops.sparse_attention.{self.NAME}_op'
def sources(self):
return ['csrc/sparse_attention/utils.cpp']
def cxx_args(self):
return ['-O2', '-fopenmp']
def is_compatible(self):
# Check to see if llvm and cmake are installed since they are dependencies
required_commands = ['llvm-config|llvm-config-9', 'cmake']
command_status = list(map(self.command_exists, required_commands))
deps_compatible = all(command_status)
TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1])
torch_compatible = TORCH_MAJOR == 1 and TORCH_MINOR >= 5
if not torch_compatible:
self.warning(
f'{self.NAME} requires a torch version >= 1.5 but detected {TORCH_MAJOR}.{TORCH_MINOR}'
)
return super().is_compatible() and deps_compatible and torch_compatible
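is_compatible() lets callers probe for the op's system dependencies (llvm-config or llvm-config-9, cmake, torch >= 1.5) before attempting a build. A hedged sketch of the intended pattern, again assuming a load() method on the base class:

from deepspeed.ops.op_builder import SparseAttnBuilder

builder = SparseAttnBuilder()
if builder.is_compatible():
    sparse_attn_utils = builder.load()  # JIT-compile only when deps are satisfied
else:
    print(f'{builder.NAME} unavailable; install llvm/cmake to enable it')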
import torch
from .transformer import TransformerBuilder
class StochasticTransformerBuilder(TransformerBuilder):
BUILD_VAR = "DS_BUILD_STOCHASTIC_TRANSFORMER"
NAME = "stochastic_transformer"
def __init__(self):
super().__init__(name=self.NAME)
def absolute_name(self):
return f'deepspeed.ops.transformer.{self.NAME}_op'
def nvcc_args(self):
args = super().nvcc_args()
args.append('-D__STOCHASTIC_MODE__')
return args
import torch
from .builder import CUDAOpBuilder
class TransformerBuilder(CUDAOpBuilder):
BUILD_VAR = "DS_BUILD_TRANSFORMER"
NAME = "transformer"
def __init__(self, name=None):
name = self.NAME if name is None else name
super().__init__(name=name)
def absolute_name(self):
return f'deepspeed.ops.transformer.{self.NAME}_op'
def sources(self):
return [
'csrc/transformer/ds_transformer_cuda.cpp',
'csrc/transformer/cublas_wrappers.cu',
'csrc/transformer/transform_kernels.cu',
'csrc/transformer/gelu_kernels.cu',
'csrc/transformer/dropout_kernels.cu',
'csrc/transformer/normalize_kernels.cu',
'csrc/transformer/softmax_kernels.cu',
'csrc/transformer/general_kernels.cu'
]
def include_paths(self):
return ['csrc/includes']
def nvcc_args(self):
args = [
'-O3',
'--use_fast_math',
'-std=c++14',
'-U__CUDA_NO_HALF_OPERATORS__',
'-U__CUDA_NO_HALF_CONVERSIONS__',
'-U__CUDA_NO_HALF2_OPERATORS__'
]
return args + self.compute_capability_args()
def cxx_args(self):
return ['-O3', '-std=c++14', '-g', '-Wno-reorder']
from .builder import OpBuilder
class UtilsBuilder(OpBuilder):
BUILD_VAR = "DS_BUILD_UTILS"
NAME = "utils"
def __init__(self):
super().__init__(name=self.NAME)
def absolute_name(self):
return f'deepspeed.ops.{self.NAME}_op'
def sources(self):
return ['csrc/utils/flatten_unflatten.cpp']
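Each builder also advertises, via BUILD_VAR, the environment variable that opts it into an install-time (pre-compiled) build; setup.py consults the same name. Illustrative only, assuming UtilsBuilder is exported alongside the other builders:

import os
from deepspeed.ops.op_builder import UtilsBuilder

# DS_BUILD_UTILS=1 requests a pre-compiled extension at pip-install time;
# left unset, the op is deferred to JIT compilation on first use.
prebuild_utils = int(os.environ.get(UtilsBuilder.BUILD_VAR, 0))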
@@ -2,5 +2,6 @@ torch>=1.2
torchvision>=0.4.0
tqdm
psutil
cpufeature
tensorboardX==1.8
ninja
cpufeature
@@ -16,7 +16,7 @@ import warnings
from setuptools import setup, find_packages
from torch.utils.cpp_extension import CUDAExtension, BuildExtension, CppExtension
VERSION = "0.3.0"
import op_builder
def fetch_requirements(path):
@@ -24,88 +24,33 @@ def fetch_requirements(path):
return [r.strip() for r in fd.readlines()]
def available_vector_instructions():
try:
import cpufeature
except ImportError:
warnings.warn(
'import cpufeature failed - CPU vector optimizations are not available for CPUAdam'
)
return {}
cpu_vector_instructions = {}
try:
cpu_vector_instructions = cpufeature.CPUFeature
except Exception:
warnings.warn(
'cpufeature.CPUFeature failed - CPU vector optimizations are not available for CPUAdam'
)
return {}
return cpu_vector_instructions
install_requires = fetch_requirements('requirements/requirements.txt')
dev_requires = fetch_requirements('requirements/requirements-dev.txt')
sparse_attn_requires = fetch_requirements('requirements/requirements-sparse-attn.txt')
extras_require = {
'1bit_adam': fetch_requirements('requirements/requirements-1bit-adam.txt'),
'readthedocs': fetch_requirements('requirements/requirements-readthedocs.txt'),
'dev': fetch_requirements('requirements/requirements-dev.txt'),
}
# If CUDA and MPI are available, add 1bit-adam requirements
if torch.cuda.is_available():
if shutil.which('ompi_info') or shutil.which('mpiname'):
onebit_adam_requires = fetch_requirements(
'requirements/requirements-1bit-adam.txt')
onebit_adam_requires.append(f"cupy-cuda{torch.version.cuda.replace('.','')[:3]}")
install_requires += onebit_adam_requires
# Constants for each op
LAMB = "lamb"
TRANSFORMER = "transformer"
SPARSE_ATTN = "sparse-attn"
CPU_ADAM = "cpu-adam"
cpu_vector_instructions = available_vector_instructions()
# Build environment variables for custom builds
DS_BUILD_LAMB_MASK = 1
DS_BUILD_TRANSFORMER_MASK = 10
DS_BUILD_SPARSE_ATTN_MASK = 100
DS_BUILD_CPU_ADAM_MASK = 1000
# Allow for build_cuda to turn on or off all ops
DS_BUILD_ALL_OPS = DS_BUILD_LAMB_MASK | DS_BUILD_TRANSFORMER_MASK | DS_BUILD_SPARSE_ATTN_MASK | DS_BUILD_CPU_ADAM_MASK
DS_BUILD_CUDA = int(os.environ.get('DS_BUILD_CUDA', 1)) * DS_BUILD_ALL_OPS
# Set the default for each op based on whether DS_BUILD_CUDA is set
OP_DEFAULT = DS_BUILD_CUDA == DS_BUILD_ALL_OPS
DS_BUILD_CPU_ADAM = int(os.environ.get('DS_BUILD_CPU_ADAM', 0)) * DS_BUILD_CPU_ADAM_MASK
DS_BUILD_LAMB = int(os.environ.get('DS_BUILD_LAMB', OP_DEFAULT)) * DS_BUILD_LAMB_MASK
DS_BUILD_TRANSFORMER = int(os.environ.get('DS_BUILD_TRANSFORMER',
OP_DEFAULT)) * DS_BUILD_TRANSFORMER_MASK
DS_BUILD_SPARSE_ATTN = int(os.environ.get('DS_BUILD_SPARSE_ATTN',
OP_DEFAULT)) * DS_BUILD_SPARSE_ATTN_MASK
# Final effective mask is the bitwise OR of each op
BUILD_MASK = (DS_BUILD_LAMB | DS_BUILD_TRANSFORMER | DS_BUILD_SPARSE_ATTN
| DS_BUILD_CPU_ADAM)
install_ops = dict.fromkeys([LAMB, TRANSFORMER, SPARSE_ATTN, CPU_ADAM], False)
if BUILD_MASK & DS_BUILD_LAMB:
install_ops[LAMB] = True
if BUILD_MASK & DS_BUILD_CPU_ADAM:
install_ops[CPU_ADAM] = True
if BUILD_MASK & DS_BUILD_TRANSFORMER:
install_ops[TRANSFORMER] = True
if BUILD_MASK & DS_BUILD_SPARSE_ATTN:
install_ops[SPARSE_ATTN] = True
if not any(install_ops.values()):
print("Building without any cuda/cpp extensions")
print(f'BUILD_MASK={BUILD_MASK}, install_ops={install_ops}')
cupy = f"cupy-cuda{torch.version.cuda.replace('.','')[:3]}"
extras_require['1bit_adam'].append(cupy)
# Make an [all] extra that installs all needed dependencies
all_extras = set()
for extra_name, extra_reqs in extras_require.items():
for req in extra_reqs:
all_extras.add(req)
extras_require['all'] = list(all_extras)
cmdclass = {}
# For any pre-installed ops, force-disable ninja
cmdclass['build_ext'] = BuildExtension.with_options(use_ninja=False)
TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1])
TORCH_MAJOR = torch.__version__.split('.')[0]
TORCH_MINOR = torch.__version__.split('.')[1]
if not torch.cuda.is_available():
# Fix to allow docker builds, similar to https://github.com/NVIDIA/apex/issues/486
@@ -116,230 +61,118 @@ if not torch.cuda.is_available():
if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None:
os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5"
# Fix from apex that might be relevant for us as well, related to https://github.com/NVIDIA/apex/issues/456
version_ge_1_1 = []
if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0):
version_ge_1_1 = ['-DVERSION_GE_1_1']
version_ge_1_3 = []
if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2):
version_ge_1_3 = ['-DVERSION_GE_1_3']
version_ge_1_5 = []
if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4):
version_ge_1_5 = ['-DVERSION_GE_1_5']
version_dependent_macros = version_ge_1_1 + version_ge_1_3 + version_ge_1_5
SIMD_WIDTH = ''
if cpu_vector_instructions.get('AVX512f', False):
SIMD_WIDTH = '-D__AVX512__'
elif cpu_vector_instructions.get('AVX2', False):
SIMD_WIDTH = '-D__AVX256__'
print("SIMD_WIDTH = ", SIMD_WIDTH)
ext_modules = []
## Lamb ##
if BUILD_MASK & DS_BUILD_LAMB:
ext_modules.append(
CUDAExtension(name='deepspeed.ops.lamb.fused_lamb_cuda',
sources=[
'csrc/lamb/fused_lamb_cuda.cpp',
'csrc/lamb/fused_lamb_cuda_kernel.cu'
],
include_dirs=['csrc/includes'],
extra_compile_args={
'cxx': [
'-O3',
] + version_dependent_macros,
'nvcc': ['-O3',
'--use_fast_math'] + version_dependent_macros
}))
## Adam ##
if BUILD_MASK & DS_BUILD_CPU_ADAM:
ext_modules.append(
CUDAExtension(name='deepspeed.ops.adam.cpu_adam_op',
sources=[
'csrc/adam/cpu_adam.cpp',
'csrc/adam/custom_cuda_kernel.cu',
],
include_dirs=['csrc/includes',
'/usr/local/cuda/include'],
extra_compile_args={
'cxx': [
'-O3',
'-std=c++14',
'-L/usr/local/cuda/lib64',
'-lcudart',
'-lcublas',
'-g',
'-Wno-reorder',
'-march=native',
'-fopenmp',
SIMD_WIDTH
],
'nvcc': [
'-O3',
'--use_fast_math',
'-gencode',
'arch=compute_61,code=compute_61',
'-gencode',
'arch=compute_70,code=compute_70',
'-std=c++14',
'-U__CUDA_NO_HALF_OPERATORS__',
'-U__CUDA_NO_HALF_CONVERSIONS__',
'-U__CUDA_NO_HALF2_OPERATORS__'
]
}))
## Transformer ##
if BUILD_MASK & DS_BUILD_TRANSFORMER:
ext_modules.append(
CUDAExtension(name='deepspeed.ops.transformer.transformer_cuda',
sources=[
'csrc/transformer/ds_transformer_cuda.cpp',
'csrc/transformer/cublas_wrappers.cu',
'csrc/transformer/transform_kernels.cu',
'csrc/transformer/gelu_kernels.cu',
'csrc/transformer/dropout_kernels.cu',
'csrc/transformer/normalize_kernels.cu',
'csrc/transformer/softmax_kernels.cu',
'csrc/transformer/general_kernels.cu'
],
include_dirs=['csrc/includes'],
extra_compile_args={
'cxx': ['-O3',
'-std=c++14',
'-g',
'-Wno-reorder'],
'nvcc': [
'-O3',
'--use_fast_math',
'-gencode',
'arch=compute_61,code=compute_61',
'-gencode',
'arch=compute_60,code=compute_60',
'-gencode',
'arch=compute_70,code=compute_70',
'-std=c++14',
'-U__CUDA_NO_HALF_OPERATORS__',
'-U__CUDA_NO_HALF_CONVERSIONS__',
'-U__CUDA_NO_HALF2_OPERATORS__'
]
}))
ext_modules.append(
CUDAExtension(name='deepspeed.ops.transformer.stochastic_transformer_cuda',
sources=[
'csrc/transformer/ds_transformer_cuda.cpp',
'csrc/transformer/cublas_wrappers.cu',
'csrc/transformer/transform_kernels.cu',
'csrc/transformer/gelu_kernels.cu',
'csrc/transformer/dropout_kernels.cu',
'csrc/transformer/normalize_kernels.cu',
'csrc/transformer/softmax_kernels.cu',
'csrc/transformer/general_kernels.cu'
],
include_dirs=['csrc/includes'],
extra_compile_args={
'cxx': ['-O3',
'-std=c++14',
'-g',
'-Wno-reorder'],
'nvcc': [
'-O3',
'--use_fast_math',
'-gencode',
'arch=compute_61,code=compute_61',
'-gencode',
'arch=compute_60,code=compute_60',
'-gencode',
'arch=compute_70,code=compute_70',
'-std=c++14',
'-U__CUDA_NO_HALF_OPERATORS__',
'-U__CUDA_NO_HALF_CONVERSIONS__',
'-U__CUDA_NO_HALF2_OPERATORS__',
'-D__STOCHASTIC_MODE__'
]
}))
from op_builder import ALL_OPS
# Default pre-compiled kernels to off so that we rely on JIT compilation
BUILD_OP_DEFAULT = int(os.environ.get('DS_BUILD_OPS', 0))
print(f"DS_BUILD_OPS={BUILD_OP_DEFAULT}")
def command_exists(cmd):
if '|' in cmd:
cmds = cmd.split("|")
else:
cmds = [cmd]
valid = False
for cmd in cmds:
result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True)
valid = valid or result.wait() == 0
return valid
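command_exists accepts a pipe-separated list of alternatives and succeeds if any one of them resolves on the PATH, e.g.:

# Either llvm-config or llvm-config-9 satisfies the sparse-attn dependency check.
has_llvm = command_exists('llvm-config|llvm-config-9')
has_git = command_exists('git')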
## Sparse transformer ##
if BUILD_MASK & DS_BUILD_SPARSE_ATTN:
# Check to see if llvm and cmake are installed since they are dependencies
required_commands = ['llvm-config|llvm-config-9', 'cmake']
command_status = list(map(command_exists, required_commands))
if not all(command_status):
zipped_status = list(zip(required_commands, command_status))
warnings.warn(
f'Missing non-python requirements, please install the missing packages: {zipped_status}'
)
warnings.warn(
'Skipping sparse attention installation due to missing required packages')
# remove from installed ops list
install_ops[SPARSE_ATTN] = False
elif TORCH_MAJOR == 1 and TORCH_MINOR >= 5:
ext_modules.append(
CppExtension(name='deepspeed.ops.sparse_attention.cpp_utils',
sources=['csrc/sparse_attention/utils.cpp'],
extra_compile_args={'cxx': ['-O2',
'-fopenmp']}))
# Add sparse attention requirements
install_requires += sparse_attn_requires
else:
warnings.warn('Unable to meet requirements to install sparse attention')
# remove from installed ops list
install_ops[SPARSE_ATTN] = False
# Add development requirements
install_requires += dev_requires
result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True)
return result.wait() == 0
def op_enabled(op_name):
assert hasattr(ALL_OPS[op_name], 'BUILD_VAR'), \
f"{op_name} is missing BUILD_VAR field"
env_var = ALL_OPS[op_name].BUILD_VAR
return int(os.environ.get(env_var, BUILD_OP_DEFAULT))
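op_enabled layers per-op overrides on top of the DS_BUILD_OPS default. A worked example with hypothetical environment values, assuming ALL_OPS is keyed by each builder's NAME:

import os

os.environ['DS_BUILD_FUSED_ADAM'] = '1'
# Explicit opt-in wins even when DS_BUILD_OPS=0 (the default):
assert op_enabled('fused_adam') == 1
# Ops without their own variable set fall back to BUILD_OP_DEFAULT.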
install_ops = dict.fromkeys(ALL_OPS.keys(), False)
for op_name, builder in ALL_OPS.items():
op_compatible = builder.is_compatible()
# If the op is compatible, update install requirements so it can potentially build/run later
if op_compatible:
reqs = builder.python_requirements()
install_requires += reqs
# If the op's install is enabled and it is compatible, add its builder to the extensions
if op_enabled(op_name) and op_compatible:
install_ops[op_name] = True
ext_modules.append(builder.builder())
compatible_ops = {op_name: op.is_compatible() for (op_name, op) in ALL_OPS.items()}
print(f'Install Ops={install_ops}')
# Write out version/git info
git_hash_cmd = "git rev-parse --short HEAD"
git_branch_cmd = "git rev-parse --abbrev-ref HEAD"
if command_exists('git'):
result = subprocess.check_output(git_hash_cmd, shell=True)
git_hash = result.decode('utf-8').strip()
result = subprocess.check_output(git_branch_cmd, shell=True)
git_branch = result.decode('utf-8').strip()
if command_exists('git') and 'DS_BUILD_STRING' not in os.environ:
try:
result = subprocess.check_output(git_hash_cmd, shell=True)
git_hash = result.decode('utf-8').strip()
result = subprocess.check_output(git_branch_cmd, shell=True)
git_branch = result.decode('utf-8').strip()
except subprocess.CalledProcessError:
git_hash = "unknown"
git_branch = "unknown"
else:
git_hash = "unknown"
git_branch = "unknown"
print(f"version={VERSION}+{git_hash}, git_hash={git_hash}, git_branch={git_branch}")
# Parse the DeepSpeed version string from version.txt
version_str = open('version.txt', 'r').read().strip()
# Build specifiers like .devX can be added at install time. Otherwise, add the git hash.
# example: DS_BUILD_STRING=".dev20201022" python setup.py sdist bdist_wheel
#version_str += os.environ.get('DS_BUILD_STRING', f'+{git_hash}')
# Building wheel for distribution, update version file
if 'DS_BUILD_STRING' in os.environ:
# Build string env specified, probably building for distribution
with open('build.txt', 'w') as fd:
fd.write(os.environ.get('DS_BUILD_STRING'))
version_str += os.environ.get('DS_BUILD_STRING')
elif os.path.isfile('build.txt'):
# build.txt exists, probably installing from distribution
with open('build.txt', 'r') as fd:
version_str += fd.read().strip()
else:
# None of the above, probably installing from source
version_str += f'+{git_hash}'
torch_version = ".".join([TORCH_MAJOR, TORCH_MINOR])
cuda_version = ".".join(torch.version.cuda.split('.')[:2])
torch_info = {"version": torch_version, "cuda_version": cuda_version}
print(f"version={version_str}, git_hash={git_hash}, git_branch={git_branch}")
with open('deepspeed/git_version_info_installed.py', 'w') as fd:
fd.write(f"version='{VERSION}+{git_hash}'\n")
fd.write(f"version='{version_str}'\n")
fd.write(f"git_hash='{git_hash}'\n")
fd.write(f"git_branch='{git_branch}'\n")
fd.write(f"installed_ops={install_ops}\n")
fd.write(f"compatible_ops={compatible_ops}\n")
fd.write(f"torch_info={torch_info}\n")
print(f'install_requires={install_requires}')
print(f'compatible_ops={compatible_ops}')
print(f'ext_modules={ext_modules}')
setup(name='deepspeed',
version=f"{VERSION}+{git_hash}",
version=version_str,
description='DeepSpeed library',
author='DeepSpeed Team',
author_email='deepspeed@microsoft.com',
url='http://deepspeed.ai',
install_requires=install_requires,
extras_require=extras_require,
packages=find_packages(exclude=["docker",
"third_party",
"csrc"]),
package_data={'deepspeed.ops.sparse_attention.trsrc': ['*.tr']},
scripts=['bin/deepspeed',
'bin/deepspeed.pt',
'bin/ds',
'bin/ds_ssh'],
"third_party"]),
include_package_data=True,
scripts=[
'bin/deepspeed',
'bin/deepspeed.pt',
'bin/ds',
'bin/ds_ssh',
'bin/ds_report'
],
classifiers=[
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
......
@@ -363,10 +363,18 @@ except ImportError:
self.variance_epsilon = eps
def forward(self, x):
pdtype = x.dtype
x = x.float()
u = x.mean(-1, keepdim=True)
s = (x - u).pow(2).mean(-1, keepdim=True)
x = (x - u) / torch.sqrt(s + self.variance_epsilon)
return self.weight * x + self.bias
return self.weight * x.to(pdtype) + self.bias
#def forward(self, x):
# u = x.mean(-1, keepdim=True)
# s = (x - u).pow(2).mean(-1, keepdim=True)
# x = (x - u) / torch.sqrt(s + self.variance_epsilon)
# return self.weight * x + self.bias
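The change above keeps the layer-norm statistics in fp32 and casts back only at the end, which avoids fp16 overflow/underflow in the variance. A standalone sketch of the same pattern:

import torch

def layernorm_fp32(x, weight, bias, eps=1e-12):
    # Upcast, compute mean/variance in fp32, then return to the input dtype.
    pdtype = x.dtype
    xf = x.float()
    u = xf.mean(-1, keepdim=True)
    s = (xf - u).pow(2).mean(-1, keepdim=True)
    xf = (xf - u) / torch.sqrt(s + eps)
    return weight * xf.to(pdtype) + bias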
class BertEmbeddings(nn.Module):
......
@@ -12,6 +12,8 @@ from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer
from deepspeed.runtime.pipe.topology import *
PipeTopo = PipeDataParallelTopology
from deepspeed.ops.op_builder import FusedLambBuilder, CPUAdamBuilder
import argparse
import pytest
import json
@@ -152,8 +154,8 @@ def checkpoint_correctness_verification(args,
compare_lr_scheduler_states(trained_model, loaded_model)
@pytest.mark.skipif(not deepspeed.ops.__installed_ops__['lamb'],
reason="lamb is not installed")
@pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME],
reason="lamb is not compatible")
def test_checkpoint_unfused_optimizer(tmpdir):
config_dict = {
"train_batch_size": 2,
@@ -264,11 +266,11 @@ def test_checkpoint_fused_optimizer(tmpdir):
'Adam'),
(2,
True,
'deepspeed_adam'),
'Adam'),
])
def test_checkpoint_zero_optimizer(tmpdir, zero_stage, use_cpu_offload, adam_optimizer):
if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
pytest.skip("cpu-adam is not installed")
if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
pytest.skip("cpu-adam is not compatible")
config_dict = {
"train_batch_size": 2,
@@ -320,14 +322,14 @@ def test_checkpoint_zero_optimizer(tmpdir, zero_stage, use_cpu_offload, adam_opt
"Adam"),
(2,
True,
'deepspeed_adam'),
'Adam'),
])
def test_checkpoint_zero_no_optimizer(tmpdir,
zero_stage,
use_cpu_offload,
adam_optimizer):
if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
pytest.skip("cpu-adam is not installed")
if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
pytest.skip("cpu-adam is not compatible")
config_dict = {
"train_batch_size": 2,
@@ -385,11 +387,11 @@ def test_checkpoint_zero_no_optimizer(tmpdir,
'Adam'),
(2,
True,
'deepspeed_adam'),
'Adam'),
])
def test_checkpoint_lr_scheduler(tmpdir, zero_stage, use_cpu_offload, adam_optimizer):
if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
pytest.skip("cpu-adam is not installed")
if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
pytest.skip("cpu-adam is not compatible")
config_dict = {
"train_batch_size": 2,
@@ -459,11 +461,11 @@ def test_checkpoint_lr_scheduler(tmpdir, zero_stage, use_cpu_offload, adam_optim
'Adam'),
(2,
True,
'deepspeed_adam'),
'Adam'),
])
def test_checkpoint_no_lr_scheduler(tmpdir, zero_stage, use_cpu_offload, adam_optimizer):
if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
pytest.skip("cpu-adam is not installed")
if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
pytest.skip("cpu-adam is not compatible")
config_dict = {
"train_batch_size": 2,
......
import argparse
import torch
import apex
import time
import numpy as np
import pytest
import copy
import deepspeed
if not deepspeed.ops.__installed_ops__['cpu-adam']:
pytest.skip("cpu-adam is not installed", allow_module_level=True)
else:
from deepspeed.ops.adam import DeepSpeedCPUAdam
from deepspeed.ops.adam import FusedAdam
from deepspeed.ops.op_builder import CPUAdamBuilder
if not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
pytest.skip("cpu-adam is not compatible")
def check_equal(first, second, atol=1e-2, verbose=False):
@@ -32,6 +32,7 @@ def check_equal(first, second, atol=1e-2, verbose=False):
(1048576),
]) # yapf: disable
def test_cpu_adam_opt(model_size):
from deepspeed.ops.adam import DeepSpeedCPUAdam
device = 'cpu'
rng_state = torch.get_rng_state()
param = torch.nn.Parameter(torch.randn(model_size, device=device))
@@ -42,7 +43,7 @@ def test_cpu_adam_opt(model_size):
param2 = torch.nn.Parameter(param2_data)
optimizer1 = torch.optim.AdamW([param1])
optimizer2 = apex.optimizers.FusedAdam([param2])
optimizer2 = FusedAdam([param2])
optimizer = DeepSpeedCPUAdam([param])
for i in range(10):
......
@@ -16,8 +16,8 @@ import deepspeed
import sys
if not deepspeed.ops.__installed_ops__['transformer']:
pytest.skip("transformer kernels are not installed", allow_module_level=True)
#if not deepspeed.ops.__installed_ops__['transformer']:
# pytest.skip("transformer kernels are not installed", allow_module_level=True)
def check_equal(first, second, atol=1e-2, verbose=False):
@@ -254,6 +254,7 @@ def run_backward(ds_config, atol=1e-2, verbose=False):
check_equal(base_grads, ds_grads, atol=atol, verbose=verbose)
#test_backward[3-1024-120-16-24-True-True-0.05]
@pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16, atol',
[
(3,1024,120,16,24,True,False, 0.05),
......
@@ -16,8 +16,8 @@ import deepspeed
import sys
if not deepspeed.ops.__installed_ops__['transformer']:
pytest.skip("transformer kernels are not installed", allow_module_level=True)
#if not deepspeed.ops.__installed_ops__['transformer']:
# pytest.skip("transformer kernels are not installed", allow_module_level=True)
def check_equal(first, second, atol=1e-2, verbose=False):
......
@@ -8,9 +8,6 @@ import numpy as np
from common import distributed_test
from simple_model import SimpleModel, args_from_dict
lamb_available = pytest.mark.skipif(not deepspeed.ops.__installed_ops__['lamb'],
reason="lamb is not installed")
def run_model_step(model, gradient_list):
for value in gradient_list:
@@ -168,7 +165,6 @@ def test_fused_some_overflow(tmpdir):
_test_fused_some_overflow(args)
@lamb_available
def test_unfused_no_overflow(tmpdir):
config_dict = {
"train_batch_size": 1,
@@ -212,7 +208,6 @@ def test_unfused_no_overflow(tmpdir):
_test_unfused_no_overflow(args)
@lamb_available
def test_unfused_all_overflow(tmpdir):
config_dict = {
"train_batch_size": 1,
@@ -258,7 +253,6 @@ def test_unfused_all_overflow(tmpdir):
_test_unfused_all_overflow(args)
@lamb_available
def test_unfused_some_overflow(tmpdir):
config_dict = {
"train_batch_size": 1,
......
import torch
import apex
import deepspeed
import argparse
import pytest
import json
import os
from deepspeed.ops.adam import FusedAdam
from common import distributed_test
from simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict
lamb_available = pytest.mark.skipif(not deepspeed.ops.__installed_ops__['lamb'],
reason="lamb is not installed")
try:
from apex import amp
_amp_available = True
except ImportError:
_amp_available = False
amp_available = pytest.mark.skipif(not _amp_available, reason="apex/amp is not installed")
@lamb_available
def test_lamb_fp32_grad_clip(tmpdir):
config_dict = {
"train_batch_size": 2,
@@ -48,7 +51,6 @@ def test_lamb_fp32_grad_clip(tmpdir):
_test_lamb_fp32_grad_clip(args=args, model=model, hidden_dim=hidden_dim)
@lamb_available
def test_lamb_fp16_basic(tmpdir):
config_dict = {
"train_batch_size": 2,
@@ -86,7 +88,6 @@ def test_lamb_fp16_basic(tmpdir):
_test_lamb_fp16_basic(args=args, model=model, hidden_dim=hidden_dim)
@lamb_available
def test_lamb_fp16_empty_grad(tmpdir):
config_dict = {
"train_batch_size": 2,
@@ -234,8 +235,8 @@ def test_adamw_fp16_empty_grad(tmpdir):
True),
])
def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage, use_cpu_offload):
if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
pytest.skip("cpu-adam is not installed")
#if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# pytest.skip("cpu-adam is not installed")
config_dict = {
"train_batch_size": 1,
"steps_per_print": 1,
@@ -302,8 +303,8 @@ def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage, use_cpu_offlo
True),
])
def test_zero_static_scale(tmpdir, zero_stage, use_cpu_offload):
if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
pytest.skip("cpu-adam is not installed")
#if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# pytest.skip("cpu-adam is not installed")
config_dict = {
"train_batch_size": 4,
"steps_per_print": 1,
@@ -402,8 +403,8 @@ def test_zero_static_scale_deprecated_format(tmpdir):
True),
])
def test_zero_allow_untested_optimizer(tmpdir, zero_stage, use_cpu_offload):
if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
pytest.skip("cpu-adam is not installed")
#if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# pytest.skip("cpu-adam is not installed")
config_dict = {
"train_batch_size": 4,
"steps_per_print": 1,
@@ -442,8 +443,8 @@ def test_zero_allow_untested_optimizer(tmpdir, zero_stage, use_cpu_offload):
True),
])
def test_zero_empty_partition(tmpdir, zero_stage, use_cpu_offload):
if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
pytest.skip("cpu-adam is not installed")
#if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# pytest.skip("cpu-adam is not installed")
config_dict = {
"train_micro_batch_size_per_gpu": 1,
"gradient_accumulation_steps": 1,
@@ -489,6 +490,7 @@ def test_zero_empty_partition(tmpdir, zero_stage, use_cpu_offload):
_test_zero_empty_partition(args)
@amp_available
def test_adam_amp_basic(tmpdir):
config_dict = {"train_batch_size": 1, "steps_per_print": 1, "amp": {"enabled": True}}
args = args_from_dict(tmpdir, config_dict)
@@ -514,7 +516,7 @@ def test_adam_amp_basic(tmpdir):
_test_adam_amp_basic(args=args, model=model, hidden_dim=hidden_dim)
@lamb_available
@amp_available
def test_lamb_amp_basic(tmpdir):
config_dict = {
"train_batch_size": 2,
@@ -552,6 +554,7 @@ def test_lamb_amp_basic(tmpdir):
_test_lamb_amp_basic(args=args, model=model, hidden_dim=hidden_dim)
@amp_available
def test_adam_amp_o2(tmpdir):
config_dict = {
"train_batch_size": 2,
@@ -590,6 +593,7 @@ def test_adam_amp_o2(tmpdir):
_test_adam_amp_o2(args=args, model=model, hidden_dim=hidden_dim)
@amp_available
def test_adam_amp_o2_empty_grad(tmpdir):
config_dict = {
"train_batch_size": 2,
......@@ -630,11 +634,11 @@ def test_adam_amp_o2_empty_grad(tmpdir):
@pytest.mark.parametrize('zero_stage, optimizer_constructor',
[(1,
apex.optimizers.FusedAdam),
FusedAdam),
(2,
torch.optim.Adam),
(2,
apex.optimizers.FusedAdam)])
FusedAdam)])
def test_zero_supported_client_optimizer(tmpdir, zero_stage, optimizer_constructor):
config_dict = {
"train_batch_size": 2,
......
@@ -6,9 +6,11 @@
import pytest
import torch
import deepspeed
from deepspeed.ops.op_builder import SparseAttnBuilder
if not deepspeed.ops.__installed_ops__['sparse-attn']:
pytest.skip("cpu-adam is not installed", allow_module_level=True)
if not deepspeed.ops.__compatible_ops__[SparseAttnBuilder.NAME]:
pytest.skip("sparse attention op is not compatible on this system",
allow_module_level=True)
def test_sparse_attention_module_availability():
@@ -236,7 +238,7 @@ def init_softmax_inputs(Z, H, M, N, scale, rho, block, dtype, dense_x=True, layo
def _skip_on_cuda_compatability():
pytest.skip("Skip these tests for now until we get our docker image fixed.")
#pytest.skip("Skip these tests for now until we get our docker image fixed.")
if torch.cuda.get_device_capability()[0] != 7:
pytest.skip("needs compute capability 7; v100")
cuda_major = int(torch.version.cuda.split('.')[0]) * 10
......
Subproject commit 494f8ab3fc1b0b26949a3bcbb2bcac78008d48c1