Commit c25a91b6 authored by aiss's avatar aiss
Browse files

Merge branch 'ds-v0.9.2-rocm' into 'main'

Ds v0.9.2 rocm

See merge request dcutoolkit/deeplearing/deepspeed!2
parents d1596c94 af82b300
clang-format>=14.0.6 clang-format==16.0.2
docutils<0.18 docutils<0.18
future future
importlib-metadata>=4 importlib-metadata>=4
megatron-lm==1.1.5
pre-commit>=2.20.0 pre-commit>=2.20.0
pytest pytest
pytest-forked pytest-forked
......
autodoc_pydantic autodoc_pydantic<2.0.0
docutils<0.18 docutils<0.18
hjson hjson
packaging packaging
psutil psutil
py-cpuinfo py-cpuinfo
pydantic pydantic<2.0.0
torch torch
tqdm tqdm
diffusers diffusers
#triton==2.0.0.dev20221005 #triton==2.0.0.dev20221202
...@@ -4,6 +4,6 @@ numpy ...@@ -4,6 +4,6 @@ numpy
packaging>=20.0 packaging>=20.0
psutil psutil
py-cpuinfo py-cpuinfo
pydantic pydantic<2.0.0
#torch #torch
tqdm tqdm
...@@ -8,4 +8,3 @@ pip3 install -r requirements-inf.txt ...@@ -8,4 +8,3 @@ pip3 install -r requirements-inf.txt
pip3 install -r requirements-sd.txt pip3 install -r requirements-sd.txt
pip3 install -r requirements.txt pip3 install -r requirements.txt
#!/bin/bash #!/bin/bash
source `pwd`/env.sh export LIBRARY_PATH=/usr/local/lib:$LIBRARY_PATH
export DS_BUILD_VERSION=dtk22.10.1 export DS_BUILD_VERSION=dtk22.10.1
export LD_LIBRARY_PATH=/usr/local/lib/python3.7/site-packages/torch/lib:$LD_LIBRARY_PATH
DS_BUILD_RANDOM_LTD=0 DS_BUILD_QUANTIZER=0 DS_BUILD_TRANSFORMER_INFERENCE=0 DS_BUILD_OPS=1 verbose=1 CXX=hipcc CC=hipcc python3 setup.py install bdist_wheel DS_BUILD_RANDOM_LTD=0 DS_BUILD_QUANTIZER=0 DS_BUILD_TRANSFORMER_INFERENCE=0 DS_BUILD_OPS=1 verbose=1 CXX=hipcc CC=hipcc python3 setup.py install bdist_wheel
#!/usr/bin/env python3 #!/usr/bin/env python3
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
from __future__ import annotations from __future__ import annotations
'''Copyright The Microsoft DeepSpeed Team''' '''Copyright The Microsoft DeepSpeed Team'''
""" """
...@@ -13,24 +18,25 @@ def err(s: str) -> None: ...@@ -13,24 +18,25 @@ def err(s: str) -> None:
print(s, file=sys.stderr) print(s, file=sys.stderr)
COPYRIGHT = [
r"^\(\/\/\|#\) Copyright (c) Microsoft Corporation.$", r"^\(\/\/\|#\) SPDX-License-Identifier: Apache-2.0$",
r"^\(\/\/\|#\) DeepSpeed Team$"
]
success = True success = True
failures = [] failures = []
for f in sys.argv[1:]: for f in sys.argv[1:]:
res = subprocess.run( for copyright_line in COPYRIGHT:
["git", if not success:
"grep", break
"--quiet", res = subprocess.run(["git", "grep", "--quiet", "-e", copyright_line, f], capture_output=True)
"-e", if res.returncode == 1:
r"Copyright .* DeepSpeed Team", success = False
f], failures.append(f)
capture_output=True) elif res.returncode == 2:
if res.returncode == 1: err(f"Error invoking grep on {', '.join(sys.argv[1:])}:")
success = False err(res.stderr.decode("utf-8"))
failures.append(f) sys.exit(2)
elif res.returncode == 2:
err(f"Error invoking grep on {', '.join(sys.argv[1:])}:")
err(res.stderr.decode("utf-8"))
sys.exit(2)
if not success: if not success:
err(f'{failures}: Missing license at top of file') err(f'{failures}: Missing license at top of file')
......
#!/usr/bin/env python3
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
from __future__ import annotations
'''Copyright The Microsoft DeepSpeed Team'''
"""
Checks each file in sys.argv for the string "torch.cuda".
Modified from https://github.com/jlebar/pre-commit-hooks/blob/master/check_do_not_submit.py
"""
import subprocess
import sys
def err(s: str) -> None:
    """Write *s* to stderr, terminated with a newline."""
    sys.stderr.write(f"{s}\n")
# There are many ways we could search for the string "torch.cuda", but `git
# grep --no-index` is nice because
# - it's very fast (as compared to iterating over the file in Python)
# - we can reasonably assume it's available on all machines
# - unlike plain grep, which is slower and has different flags on MacOS versus
#   Linux, git grep is always the same.

# First check: any literal "torch.cuda" not annotated with "#ignore-cuda".
res = subprocess.run(
    ["git", "grep", "-Hn", "--no-index", "-e", r"torch\.cuda", "--and", "--not", "-e", "#ignore-cuda", *sys.argv[1:]],
    capture_output=True,
)
if res.returncode == 0:
    # git grep exits 0 when at least one match was found.
    # (Fixed message typo: "is mean to be" -> "is meant to be".)
    err('Error: The string "torch.cuda" was found.\nPlease replace all calls to torch.cuda with "get_accelerator()" and add the following import line:\n\n from deepspeed.accelerator import get_accelerator\n\nIf your code is meant to be cuda specific, please add the following comment in the line with torch.cuda:\n\n #ignore-cuda\n')
    err(res.stdout.decode("utf-8"))
    sys.exit(1)
elif res.returncode == 2:
    # git grep exits 2 when it failed to run (bad args, missing file, ...).
    err(f"Error invoking grep on {', '.join(sys.argv[1:])}:")
    err(res.stderr.decode("utf-8"))
    sys.exit(2)

# Second check: any literal ".cuda()" tensor conversion.
res = subprocess.run(
    ["git", "grep", "-Hn", "--no-index", r"\.cuda()", *sys.argv[1:]],
    capture_output=True,
)
if res.returncode == 0:
    # (Fixed message grammar: "This implies convert a tensor to cuda tensor".)
    err('Error: The string ".cuda()" was found. This implies converting a tensor to a cuda tensor. Please replace all calls to tensor.cuda() with "tensor.to(get_accelerator().device_name())" and add the following import line:\nfrom deepspeed.accelerator import get_accelerator')
    err(res.stdout.decode("utf-8"))
    sys.exit(1)
elif res.returncode == 2:
    err(f"Error invoking grep on {', '.join(sys.argv[1:])}:")
    err(res.stderr.decode("utf-8"))
    sys.exit(2)
#!/usr/bin/env python3 #!/usr/bin/env python3
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
from __future__ import annotations from __future__ import annotations
'''Copyright The Microsoft DeepSpeed Team''' '''Copyright The Microsoft DeepSpeed Team'''
""" """
...@@ -21,12 +26,7 @@ def err(s: str) -> None: ...@@ -21,12 +26,7 @@ def err(s: str) -> None:
# - unlike plain grep, which is slower and has different flags on MacOS versus # - unlike plain grep, which is slower and has different flags on MacOS versus
# Linux, git grep is always the same. # Linux, git grep is always the same.
res = subprocess.run( res = subprocess.run(
["git", ["git", "grep", "-Hn", "--no-index", r"torch\.distributed", *sys.argv[1:]],
"grep",
"-Hn",
"--no-index",
r"torch\.distributed",
*sys.argv[1:]],
capture_output=True, capture_output=True,
) )
if res.returncode == 0: if res.returncode == 0:
......
#!/usr/bin/env python3
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
"""
USAGE:
$ python3 script/replace_copyright.py --repo_dir ./
"""
import os
import argparse
# Header text inserted at the top of every processed file (blank entry
# produces an empty comment line).
NEW_COPYRIGHT = ("Copyright (c) Microsoft Corporation.", "SPDX-License-Identifier: Apache-2.0", "", "DeepSpeed Team")

# Comment markers for Python-style files.
PY_SL_COMMENT = "#"
PY_ML_SINGLE = "'''"
PY_ML_DOUBLE = '"""'
PY_COMMENTS = (PY_SL_COMMENT, PY_ML_SINGLE, PY_ML_DOUBLE)

# Comment markers for C-style files.
C_SL_COMMENT = "//"
C_ML_OPEN = "/*"
C_ML_CLOSE = "*/"
C_COMMENTS = (C_SL_COMMENT, C_ML_OPEN, C_ML_CLOSE)

# Comment markers for bash-style files.
BASH_SL_COMMENT = "#"
BASH_COMMENTS = (BASH_SL_COMMENT, )

# Visual separator printed around each unrecognized header.  A raw string
# keeps the same characters while removing the invalid "\|" escape
# sequences the original suppressed with "# noqa: W605".
DELIM = r"|/-\|/-\|BARRIER|/-\|/-\|"
def parser_args():
    """Build the command-line interface and return the parsed arguments."""
    cli = argparse.ArgumentParser()
    cli.add_argument("--repo_dir", type=str, help="Repository directory")
    cli.add_argument(
        "--python_style_ext",
        type=str,
        nargs="+",
        default=[".py"],
        help="File types to process with python-style comments",
    )
    cli.add_argument(
        "--bash_style_ext",
        type=str,
        nargs="+",
        default=[".sh"],
        help="File types to process with bash-style comments",
    )
    cli.add_argument(
        "--c_style_ext",
        type=str,
        nargs="+",
        default=[".c", ".cpp", ".cu", ".h", ".hpp", ".cuh", ".cc", ".hip", ".tr"],
        help="File types to process with C-style comments",
    )
    return cli.parse_args()
# These get_header_* functions are ugly, but they work :)
def get_header_py(fp):
    """Split the Python file at ``fp`` into (header, rest).

    ``header`` collects the stripped leading comment lines: ``#`` lines,
    blank lines, and ``'''``/``\"\"\"`` multiline blocks.  ``rest`` holds the
    first non-comment line (newline-terminated) plus every line after it,
    unmodified.
    """
    with open(fp, "r") as f:
        lines = iter(l for l in f.readlines())
    header = []
    rest = []
    in_multiline = False  # currently inside a ''' / """ block?
    multiline_type = None  # which of the two delimiters opened that block
    while (l := next(lines, None)) is not None:
        l = l.strip()
        if l.startswith(PY_ML_SINGLE) or l.startswith(PY_ML_DOUBLE):
            # Detected multiline comment
            if in_multiline and multiline_type == l[:3]:
                # Ended a multiline comment
                in_multiline = False
            else:
                # Started a multiline comment
                in_multiline = True
                multiline_type = l[:3]
                # len >= 6 distinguishes a one-line '''x''' from a bare '''.
                if l.endswith(multiline_type) and len(l) >= 6:
                    # Opened and closed multiline comment on single line
                    in_multiline = False
        elif in_multiline and l.endswith(multiline_type):
            # Ended a multiline comment
            in_multiline = False
        elif not (in_multiline or l.startswith(PY_SL_COMMENT) or l == ""):
            # Not in a comment
            rest += [l + "\n"]
            break
        header.append(l)
    # Everything remaining in the iterator comes after the header.
    rest += list(lines)
    return header, rest
def get_header_c(fp):
    """Split the C-style file at ``fp`` into (header, rest).

    ``header`` collects the stripped leading comment lines: ``//`` lines,
    blank lines, and ``/* ... */`` blocks.  ``rest`` holds the first
    non-comment line (newline-terminated) plus every line after it.
    """
    with open(fp, "r") as f:
        lines = iter(l for l in f.readlines())
    header = []
    rest = []
    in_multiline = False  # currently inside a /* ... */ block?
    while (l := next(lines, None)) is not None:
        l = l.strip()
        if l.startswith(C_ML_OPEN):
            # Detected multiline comment
            if not l.endswith(C_ML_CLOSE):
                # multiline comment not closed on same line
                in_multiline = True
        elif l.endswith(C_ML_CLOSE):
            # Ended a multiline comment
            in_multiline = False
        elif not (in_multiline or l.startswith(C_SL_COMMENT) or l == ""):
            # Not in a comment.
            # Fix: the original condition lacked the parentheses, so operator
            # precedence made every `//` header line (and blank line) fall
            # into this branch and terminate the header early.  Also blank
            # lines are recognized with `l == ""` (the line is already
            # stripped, so ``isspace()`` was always False), mirroring the
            # parenthesized logic of get_header_py.
            rest += [l + "\n"]
            break
        header.append(l)
    # Everything remaining in the iterator comes after the header.
    rest += list(lines)
    return header, rest
def get_header_bash(fp):
    """Split the bash-style file at ``fp`` into (header, rest).

    ``header`` collects the stripped leading ``#`` comment lines; ``rest``
    holds the first non-comment line (newline-terminated) plus every line
    after it, unmodified.
    """
    with open(fp, "r") as f:
        raw = f.readlines()
    header = []
    rest = []
    consumed = 0
    for raw_line in raw:
        stripped = raw_line.strip()
        consumed += 1
        if not stripped.startswith(BASH_SL_COMMENT) or stripped.isspace():
            # First non-comment line: it opens the body of the file.
            rest.append(stripped + "\n")
            break
        header.append(stripped)
    # Everything after the line that ended the header belongs to the body.
    rest.extend(raw[consumed:])
    return header, rest
def remove_comments(line, comment_strs):
    """Return ``line`` with every marker in ``comment_strs`` deleted."""
    cleaned = line
    for marker in comment_strs:
        cleaned = cleaned.replace(marker, "")
    return cleaned
def format_multiline_comment(text, comment_type):
    """Wrap the lines in ``text`` as one comment of the given family.

    ``comment_type`` is compared against the module's PY/C/BASH marker
    tuples; Python and C text is wrapped in a multiline block, bash text
    gets a ``#`` prefix per line.  Unrecognized families return ``text``
    unchanged.
    """
    if comment_type == PY_COMMENTS:
        return f"\n{comment_type[2]}\n" + "\n".join(text) + f"{comment_type[2]}"
    if comment_type == C_COMMENTS:
        return f"\n{comment_type[1]}\n" + "\n".join(text) + f"{comment_type[2]}"
    if comment_type == BASH_COMMENTS:
        return "\n".join(f"{comment_type[0]}{line}" for line in text)
    return text
def modify_file_header(fp, file_header, rest_of_file, preserve_text_store, comment_type):
    """Rewrite the header of file ``fp`` with the NEW_COPYRIGHT banner.

    ``file_header`` / ``rest_of_file`` come from one of the get_header_*
    helpers.  ``preserve_text_store`` maps previously-seen header text to
    the lines of it the user chose to keep; an unseen non-empty header
    triggers an interactive prompt.  ``comment_type`` is one of PY_COMMENTS,
    C_COMMENTS or BASH_COMMENTS and selects the comment formatting.
    Returns ``preserve_text_store`` so callers can reuse it for later files.
    """
    header_text = "\n".join(file_header)
    if not (header_text.strip() == "" or header_text in preserve_text_store):
        # Unique header, need to get user input
        print("\n", DELIM, "\n")
        for idx, line in enumerate(file_header):
            print(f"{idx}: {line}")
        print("\n", DELIM, "\n")
        print("\nIndicate the FIRST line of the Header to KEEP")
        print("(shebang #! lines will be automatically processed and should not be included).")
        keep_idx = input("Enter number (or leave blank if no lines should be preserved): ")
        # NOTE(review): an empty answer stores "" (a str) while a number
        # stores a list of lines; the code below handles both shapes.
        preserve_text_store[header_text] = file_header[int(keep_idx):] if keep_idx != "" else ""
    # Identify any shebang lines in the file
    shebang = "\n".join([l for l in file_header if l.startswith("#!")])
    if shebang != "":
        shebang += "\n"
    # Get the text we should preserve in this file and process to remove comment characters
    text_to_preserve = preserve_text_store.get(header_text, [""])
    text_to_preserve = [remove_comments(l, comment_type) for l in text_to_preserve]
    # Format the text we want to keep into a new multiline comment
    if "".join(text_to_preserve) == "":
        text_to_preserve = ""
    else:
        text_to_preserve = format_multiline_comment(text_to_preserve, comment_type)
    # Generate the copyright text we will be adding
    copyright_text = "\n".join([f"{comment_type[0]} {l}" if l != "" else l for l in NEW_COPYRIGHT])
    # Assemble the new header
    new_header = shebang + copyright_text + text_to_preserve
    # Write out the new file
    new_file_contents = new_header + "\n" + "".join(rest_of_file)
    with open(fp, "w") as f:
        f.write(new_file_contents)
    return preserve_text_store  # Return so we can reuse for future files
def main(args):
    """Walk ``args.repo_dir`` and rewrite the license header of every file
    whose extension matches one of the configured python/C/bash lists."""
    preserve_text_store = {}  # Used to track header comments we should preserve
    for root, dirs, fnames in os.walk(args.repo_dir):
        # Walk across directory looking for all files with extensions we want to modify
        for ext in args.python_style_ext:
            fpaths = [os.path.join(root, fn) for fn in fnames if fn.endswith(ext)]
            for fp in fpaths:
                file_header, rest_of_file = get_header_py(fp)
                preserve_text_store = modify_file_header(fp, file_header, rest_of_file, preserve_text_store,
                                                         PY_COMMENTS)
        for ext in args.c_style_ext:
            fpaths = [os.path.join(root, fn) for fn in fnames if fn.endswith(ext)]
            for fp in fpaths:
                file_header, rest_of_file = get_header_c(fp)
                preserve_text_store = modify_file_header(fp, file_header, rest_of_file, preserve_text_store,
                                                         C_COMMENTS)
        for ext in args.bash_style_ext:
            fpaths = [os.path.join(root, fn) for fn in fnames if fn.endswith(ext)]
            for fp in fpaths:
                file_header, rest_of_file = get_header_bash(fp)
                preserve_text_store = modify_file_header(fp, file_header, rest_of_file, preserve_text_store,
                                                         BASH_COMMENTS)


if __name__ == "__main__":
    args = parser_args()
    main(args)
""" # Copyright (c) Microsoft Corporation.
Copyright 2020 The Microsoft DeepSpeed Team # SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
"""
DeepSpeed library DeepSpeed library
To build wheel on Windows: To build wheel on Windows:
1. Install pytorch, such as pytorch 1.12 + cuda 11.6 1. Install pytorch, such as pytorch 1.12 + cuda 11.6.
2. Install visual cpp build tool 2. Install visual cpp build tool.
3. Include cuda toolkit 3. Include cuda toolkit.
4. Launch cmd console with Administrator privilege for creating required symlink folders 4. Launch cmd console with Administrator privilege for creating required symlink folders.
Create a new wheel via the following command: Create a new wheel via the following command:
build_win.bat build_win.bat
The wheel will be located at: dist/*.whl The wheel will be located at: dist/*.whl
""" """
...@@ -34,7 +37,7 @@ from op_builder import get_default_compute_capabilities, OpBuilder ...@@ -34,7 +37,7 @@ from op_builder import get_default_compute_capabilities, OpBuilder
from op_builder.all_ops import ALL_OPS from op_builder.all_ops import ALL_OPS
from op_builder.builder import installed_cuda_version from op_builder.builder import installed_cuda_version
# fetch rocm state # Fetch rocm state.
is_rocm_pytorch = OpBuilder.is_rocm_pytorch() is_rocm_pytorch = OpBuilder.is_rocm_pytorch()
rocm_version = OpBuilder.installed_rocm_version() rocm_version = OpBuilder.installed_rocm_version()
...@@ -55,7 +58,7 @@ def fetch_requirements(path): ...@@ -55,7 +58,7 @@ def fetch_requirements(path):
install_requires = fetch_requirements('requirements/requirements.txt') install_requires = fetch_requirements('requirements/requirements.txt')
extras_require = { extras_require = {
'1bit': [], # add cupy based on cuda/rocm version '1bit': [], # add cupy based on cuda/rocm version
'1bit_mpi': fetch_requirements('requirements/requirements-1bit-mpi.txt'), '1bit_mpi': fetch_requirements('requirements/requirements-1bit-mpi.txt'),
'readthedocs': fetch_requirements('requirements/requirements-readthedocs.txt'), 'readthedocs': fetch_requirements('requirements/requirements-readthedocs.txt'),
'dev': fetch_requirements('requirements/requirements-dev.txt'), 'dev': fetch_requirements('requirements/requirements-dev.txt'),
...@@ -66,21 +69,26 @@ extras_require = { ...@@ -66,21 +69,26 @@ extras_require = {
'sd': fetch_requirements('requirements/requirements-sd.txt') 'sd': fetch_requirements('requirements/requirements-sd.txt')
} }
# Add specific cupy version to both onebit extension variants # Add specific cupy version to both onebit extension variants.
if torch_available and torch.cuda.is_available(): if torch_available and torch.cuda.is_available():
cupy = None cupy = None
if is_rocm_pytorch: if is_rocm_pytorch:
rocm_major, rocm_minor = rocm_version rocm_major, rocm_minor = rocm_version
# XXX cupy support for rocm 5 is not available yet # XXX cupy support for rocm 5 is not available yet.
if rocm_major <= 4: if rocm_major <= 4:
cupy = f"cupy-rocm-{rocm_major}-{rocm_minor}" cupy = f"cupy-rocm-{rocm_major}-{rocm_minor}"
else: else:
cupy = f"cupy-cuda{''.join(map(str,installed_cuda_version()))}" cuda_major_ver, cuda_minor_ver = installed_cuda_version()
if (cuda_major_ver < 11) or ((cuda_major_ver == 11) and (cuda_minor_ver < 3)):
cupy = f"cupy-cuda{cuda_major_ver}{cuda_minor_ver}"
else:
cupy = f"cupy-cuda{cuda_major_ver}x"
if cupy: if cupy:
extras_require['1bit'].append(cupy) extras_require['1bit'].append(cupy)
extras_require['1bit_mpi'].append(cupy) extras_require['1bit_mpi'].append(cupy)
# Make an [all] extra that installs all needed dependencies # Make an [all] extra that installs all needed dependencies.
all_extras = set() all_extras = set()
for extra in extras_require.items(): for extra in extras_require.items():
for req in extra[1]: for req in extra[1]:
...@@ -89,11 +97,10 @@ extras_require['all'] = list(all_extras) ...@@ -89,11 +97,10 @@ extras_require['all'] = list(all_extras)
cmdclass = {} cmdclass = {}
# For any pre-installed ops force disable ninja # For any pre-installed ops force disable ninja.
if torch_available: if torch_available:
from accelerator import get_accelerator from accelerator import get_accelerator
cmdclass['build_ext'] = get_accelerator().build_extension().with_options( cmdclass['build_ext'] = get_accelerator().build_extension().with_options(use_ninja=False)
use_ninja=False)
if torch_available: if torch_available:
TORCH_MAJOR = torch.__version__.split('.')[0] TORCH_MAJOR = torch.__version__.split('.')[0]
...@@ -103,11 +110,10 @@ else: ...@@ -103,11 +110,10 @@ else:
TORCH_MINOR = "0" TORCH_MINOR = "0"
if torch_available and not torch.cuda.is_available(): if torch_available and not torch.cuda.is_available():
# Fix to allow docker builds, similar to https://github.com/NVIDIA/apex/issues/486 # Fix to allow docker builds, similar to https://github.com/NVIDIA/apex/issues/486.
print( print("[WARNING] Torch did not find cuda available, if cross-compiling or running with cpu only "
"[WARNING] Torch did not find cuda available, if cross-compiling or running with cpu only " "you can ignore this message. Adding compute capability for Pascal, Volta, and Turing "
"you can ignore this message. Adding compute capability for Pascal, Volta, and Turing " "(compute capabilities 6.0, 6.1, 6.2)")
"(compute capabilities 6.0, 6.1, 6.2)")
if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None: if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None:
os.environ["TORCH_CUDA_ARCH_LIST"] = get_default_compute_capabilities() os.environ["TORCH_CUDA_ARCH_LIST"] = get_default_compute_capabilities()
...@@ -150,18 +156,19 @@ for op_name, builder in ALL_OPS.items(): ...@@ -150,18 +156,19 @@ for op_name, builder in ALL_OPS.items():
print("op_name: ", op_name) print("op_name: ", op_name)
print("op_enabled: ", op_enabled(op_name)) print("op_enabled: ", op_enabled(op_name))
print("op_compatible: ", op_compatible) print("op_compatible: ", op_compatible)
# If op is requested but not available, throw an error
# If op is requested but not available, throw an error.
if op_enabled(op_name) and not op_compatible: if op_enabled(op_name) and not op_compatible:
env_var = op_envvar(op_name) env_var = op_envvar(op_name)
if env_var not in os.environ: if env_var not in os.environ:
builder.warning(f"One can disable {op_name} with {env_var}=0") builder.warning(f"One can disable {op_name} with {env_var}=0")
abort(f"Unable to pre-compile {op_name}") abort(f"Unable to pre-compile {op_name}")
# if op is compatible but install is not enabled (JIT mode) # If op is compatible but install is not enabled (JIT mode).
if is_rocm_pytorch and op_compatible and not op_enabled(op_name): if is_rocm_pytorch and op_compatible and not op_enabled(op_name):
builder.hipify_extension() builder.hipify_extension()
# If op install enabled, add builder to extensions # If op install enabled, add builder to extensions.
if op_enabled(op_name) and op_compatible: if op_enabled(op_name) and op_compatible:
assert torch_available, f"Unable to pre-compile {op_name}, please first install torch" assert torch_available, f"Unable to pre-compile {op_name}, please first install torch"
install_ops[op_name] = op_enabled(op_name) install_ops[op_name] = op_enabled(op_name)
...@@ -169,7 +176,7 @@ for op_name, builder in ALL_OPS.items(): ...@@ -169,7 +176,7 @@ for op_name, builder in ALL_OPS.items():
print(f'Install Ops={install_ops}') print(f'Install Ops={install_ops}')
# Write out version/git info # Write out version/git info.
git_hash_cmd = "git rev-parse --short HEAD" git_hash_cmd = "git rev-parse --short HEAD"
git_branch_cmd = "git rev-parse --abbrev-ref HEAD" git_branch_cmd = "git rev-parse --abbrev-ref HEAD"
if command_exists('git') and 'DS_BUILD_STRING' not in os.environ: if command_exists('git') and 'DS_BUILD_STRING' not in os.environ:
...@@ -178,7 +185,7 @@ if command_exists('git') and 'DS_BUILD_STRING' not in os.environ: ...@@ -178,7 +185,7 @@ if command_exists('git') and 'DS_BUILD_STRING' not in os.environ:
git_hash = result.decode('utf-8').strip() git_hash = result.decode('utf-8').strip()
result = subprocess.check_output(git_branch_cmd, shell=True) result = subprocess.check_output(git_branch_cmd, shell=True)
git_branch = result.decode('utf-8').strip() git_branch = result.decode('utf-8').strip()
#add dtk version #add dtk version
if os.getenv('DS_BUILD_VERSION'): if os.getenv('DS_BUILD_VERSION'):
version_dtk = os.getenv('DS_BUILD_VERSION', "") version_dtk = os.getenv('DS_BUILD_VERSION', "")
git_hash += "." + version_dtk git_hash += "." + version_dtk
...@@ -207,38 +214,38 @@ if sys.platform == "win32": ...@@ -207,38 +214,38 @@ if sys.platform == "win32":
create_dir_symlink('..\\accelerator', '.\\deepspeed\\accelerator') create_dir_symlink('..\\accelerator', '.\\deepspeed\\accelerator')
egg_info.manifest_maker.template = 'MANIFEST_win.in' egg_info.manifest_maker.template = 'MANIFEST_win.in'
# Parse the DeepSpeed version string from version.txt # Parse the DeepSpeed version string from version.txt.
version_str = open('version.txt', 'r').read().strip() version_str = open('version.txt', 'r').read().strip()
# Build specifiers like .devX can be added at install time. Otherwise, add the git hash. # Build specifiers like .devX can be added at install time. Otherwise, add the git hash.
# example: DS_BUILD_STRING=".dev20201022" python setup.py sdist bdist_wheel # Example: DS_BUILD_STRING=".dev20201022" python setup.py sdist bdist_wheel.
# Building wheel for distribution, update version file # Building wheel for distribution, update version file.
if 'DS_BUILD_STRING' in os.environ: if 'DS_BUILD_STRING' in os.environ:
# Build string env specified, probably building for distribution # Build string env specified, probably building for distribution.
with open('build.txt', 'w') as fd: with open('build.txt', 'w') as fd:
fd.write(os.environ.get('DS_BUILD_STRING')) fd.write(os.environ.get('DS_BUILD_STRING'))
version_str += os.environ.get('DS_BUILD_STRING') version_str += os.environ.get('DS_BUILD_STRING')
elif os.path.isfile('build.txt'): elif os.path.isfile('build.txt'):
# build.txt exists, probably installing from distribution # build.txt exists, probably installing from distribution.
with open('build.txt', 'r') as fd: with open('build.txt', 'r') as fd:
version_str += fd.read().strip() version_str += fd.read().strip()
else: else:
# None of the above, probably installing from source # None of the above, probably installing from source.
version_str += f'+{git_hash}' version_str += f'+{git_hash}'
torch_version = ".".join([TORCH_MAJOR, TORCH_MINOR]) torch_version = ".".join([TORCH_MAJOR, TORCH_MINOR])
bf16_support = False bf16_support = False
# Set cuda_version to 0.0 if cpu-only # Set cuda_version to 0.0 if cpu-only.
cuda_version = "0.0" cuda_version = "0.0"
nccl_version = "0.0" nccl_version = "0.0"
# Set hip_version to 0.0 if cpu-only # Set hip_version to 0.0 if cpu-only.
hip_version = "0.0" hip_version = "0.0"
if torch_available and torch.version.cuda is not None: if torch_available and torch.version.cuda is not None:
cuda_version = ".".join(torch.version.cuda.split('.')[:2]) cuda_version = ".".join(torch.version.cuda.split('.')[:2])
if sys.platform != "win32": if sys.platform != "win32":
if isinstance(torch.cuda.nccl.version(), int): if isinstance(torch.cuda.nccl.version(), int):
# This will break if minor version > 9 # This will break if minor version > 9.
nccl_version = ".".join(str(torch.cuda.nccl.version())[:2]) nccl_version = ".".join(str(torch.cuda.nccl.version())[:2])
else: else:
nccl_version = ".".join(map(str, torch.cuda.nccl.version()[:2])) nccl_version = ".".join(map(str, torch.cuda.nccl.version()[:2]))
...@@ -280,7 +287,7 @@ setup(name='deepspeed', ...@@ -280,7 +287,7 @@ setup(name='deepspeed',
long_description=readme_text, long_description=readme_text,
long_description_content_type='text/markdown', long_description_content_type='text/markdown',
author='DeepSpeed Team', author='DeepSpeed Team',
author_email='deepspeed@microsoft.com', author_email='deepspeed-info@microsoft.com',
url='http://deepspeed.ai', url='http://deepspeed.ai',
project_urls={ project_urls={
'Documentation': 'https://deepspeed.readthedocs.io', 'Documentation': 'https://deepspeed.readthedocs.io',
...@@ -288,26 +295,17 @@ setup(name='deepspeed', ...@@ -288,26 +295,17 @@ setup(name='deepspeed',
}, },
install_requires=install_requires, install_requires=install_requires,
extras_require=extras_require, extras_require=extras_require,
#packages=find_packages(include=['deepspeed', #packages=find_packages(include=['deepspeed', 'deepspeed.*']),
# 'deepspeed.*']),
packages=find_namespace_packages(include=['deepspeed', packages=find_namespace_packages(include=['deepspeed',
'deepspeed.*']), 'deepspeed.*']),
include_package_data=True, include_package_data=True,
scripts=[ scripts=[
'bin/deepspeed', 'bin/deepspeed', 'bin/deepspeed.pt', 'bin/ds', 'bin/ds_ssh', 'bin/ds_report', 'bin/ds_bench', 'bin/dsr',
'bin/deepspeed.pt',
'bin/ds',
'bin/ds_ssh',
'bin/ds_report',
'bin/ds_bench',
'bin/dsr',
'bin/ds_elastic' 'bin/ds_elastic'
], ],
classifiers=[ classifiers=[
'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10' 'Programming Language :: Python :: 3.10'
], ],
license='MIT', license='MIT',
......
'''Copyright The Microsoft DeepSpeed Team''' # Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import os import os
import torch import torch
import deepspeed import deepspeed
...@@ -6,6 +10,7 @@ from deepspeed.accelerator import get_accelerator ...@@ -6,6 +10,7 @@ from deepspeed.accelerator import get_accelerator
class OneLayerNet(torch.nn.Module): class OneLayerNet(torch.nn.Module):
def __init__(self, D_in, D_out): def __init__(self, D_in, D_out):
""" """
In the constructor we instantiate two nn.Linear modules and assign them as In the constructor we instantiate two nn.Linear modules and assign them as
...@@ -38,10 +43,6 @@ def test_literal_device(): ...@@ -38,10 +43,6 @@ def test_literal_device():
string = get_accelerator().device_name() #'xpu' or 'cuda' string = get_accelerator().device_name() #'xpu' or 'cuda'
string0 = get_accelerator().device_name(0) #'xpu:0' or 'cuda:0' string0 = get_accelerator().device_name(0) #'xpu:0' or 'cuda:0'
string1 = get_accelerator().device_name(1) #'xpu:1' or 'cuda:1' string1 = get_accelerator().device_name(1) #'xpu:1' or 'cuda:1'
#aiss
print(string0)
print(string1)
assert string == 'xpu' or string == 'cuda' assert string == 'xpu' or string == 'cuda'
assert string0 == 'xpu:0' or string0 == 'cuda:0' assert string0 == 'xpu:0' or string0 == 'cuda:0'
assert string1 == 'xpu:1' or string1 == 'cuda:1' assert string1 == 'xpu:1' or string1 == 'cuda:1'
'''Copyright The Microsoft DeepSpeed Team''' # Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
#!/usr/bin/env python #!/usr/bin/env python
# run the benchmark under timeit (-t), cProfile (-c), line_profiler (-l) # run the benchmark under timeit (-t), cProfile (-c), line_profiler (-l)
...@@ -26,12 +29,9 @@ unflatten = util_ops.unflatten ...@@ -26,12 +29,9 @@ unflatten = util_ops.unflatten
torch.manual_seed(0) torch.manual_seed(0)
# emulate a small typical model weights # emulate a small typical model weights
x = [ x = [
torch.rand((512, torch.rand((512, 512)).to(get_accelerator().device_name()),
512)).to(get_accelerator().device_name()), torch.rand((512, 1024)).to(get_accelerator().device_name()),
torch.rand((512, torch.rand((512, 30000)).to(get_accelerator().device_name())
1024)).to(get_accelerator().device_name()),
torch.rand((512,
30000)).to(get_accelerator().device_name())
] ]
t = x * 30 t = x * 30
......
'''Copyright The Microsoft DeepSpeed Team''' # Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
#!/usr/bin/env python #!/usr/bin/env python
...@@ -26,12 +29,9 @@ unflatten = util_ops.unflatten ...@@ -26,12 +29,9 @@ unflatten = util_ops.unflatten
torch.manual_seed(0) torch.manual_seed(0)
# emulate a small typical model weights # emulate a small typical model weights
x = [ x = [
torch.rand((512, torch.rand((512, 512)).to(get_accelerator().device_name()),
512)).to(get_accelerator().device_name()), torch.rand((512, 1024)).to(get_accelerator().device_name()),
torch.rand((512, torch.rand((512, 30000)).to(get_accelerator().device_name())
1024)).to(get_accelerator().device_name()),
torch.rand((512,
30000)).to(get_accelerator().device_name())
] ]
unflat_t = x * 30 unflat_t = x * 30
......
'''Copyright The Microsoft DeepSpeed Team''' # Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
# tests directory-specific settings - this file is run automatically by pytest before any tests are run # tests directory-specific settings - this file is run automatically by pytest before any tests are run
...@@ -18,6 +21,13 @@ git_repo_path = abspath(join(dirname(dirname(__file__)), "src")) ...@@ -18,6 +21,13 @@ git_repo_path = abspath(join(dirname(dirname(__file__)), "src"))
sys.path.insert(1, git_repo_path) sys.path.insert(1, git_repo_path)
def pytest_configure(config):
config.option.color = "yes"
config.option.durations = 0
config.option.durations_min = 1
config.option.verbose = True
def pytest_addoption(parser): def pytest_addoption(parser):
parser.addoption("--torch_ver", default=None, type=str) parser.addoption("--torch_ver", default=None, type=str)
parser.addoption("--cuda_ver", default=None, type=str) parser.addoption("--cuda_ver", default=None, type=str)
...@@ -35,16 +45,14 @@ def check_environment(pytestconfig): ...@@ -35,16 +45,14 @@ def check_environment(pytestconfig):
expected_cuda_version = pytestconfig.getoption("cuda_ver") expected_cuda_version = pytestconfig.getoption("cuda_ver")
if expected_torch_version is None: if expected_torch_version is None:
warnings.warn( warnings.warn(
"Running test without verifying torch version, please provide an expected torch version with --torch_ver" "Running test without verifying torch version, please provide an expected torch version with --torch_ver")
)
elif not validate_version(expected_torch_version, torch.__version__): elif not validate_version(expected_torch_version, torch.__version__):
pytest.exit( pytest.exit(
f"expected torch version {expected_torch_version} did not match found torch version {torch.__version__}", f"expected torch version {expected_torch_version} did not match found torch version {torch.__version__}",
returncode=2) returncode=2)
if expected_cuda_version is None: if expected_cuda_version is None:
warnings.warn( warnings.warn(
"Running test without verifying cuda version, please provide an expected cuda version with --cuda_ver" "Running test without verifying cuda version, please provide an expected cuda version with --cuda_ver")
)
elif not validate_version(expected_cuda_version, torch.version.cuda): elif not validate_version(expected_cuda_version, torch.version.cuda):
pytest.exit( pytest.exit(
f"expected cuda version {expected_cuda_version} did not match found cuda version {torch.version.cuda}", f"expected cuda version {expected_cuda_version} did not match found cuda version {torch.version.cuda}",
......
{
"train_batch_size" : 32,
"train_micro_batch_size_per_gpu": 2,
"steps_per_print": 10,
"zero_optimization": {
"stage": 0,
"offload_param": {
"device": "cpu"
},
"stage3_param_persistence_threshold": 0
},
"fp16":{
"enabled": true,
"loss_scale_window": 100
},
"gradient_clipping": 1.0,
"prescale_gradients": false,
"wall_clock_breakdown" : false
}
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import torch
from transformers import AutoModelForCausalLM
import deepspeed
import argparse
from deepspeed.accelerator import get_accelerator
# Report accelerator memory before and after loading the model so the test
# log shows the model's footprint.
deepspeed.runtime.utils.see_memory_usage('pre test', force=True)
model = AutoModelForCausalLM.from_pretrained('facebook/opt-350M').half().to(get_accelerator().device_name())
parser = argparse.ArgumentParser()
parser = deepspeed.add_config_arguments(parser)
args = parser.parse_args()
deepspeed.runtime.utils.see_memory_usage('post test', force=True)
m, _, _, _ = deepspeed.initialize(model=model, args=args, enable_hybrid_engine=True)

# Run one forward pass in eval mode and one in train mode to exercise the
# hybrid engine's inference/training switch.
m.eval()
# Place the input on the accelerator's device instead of a hard-coded
# 'cuda', consistent with the model placement above (and with the repo's
# check_torch_cuda pre-commit rule).  Renamed from `input`, which shadowed
# the builtin.
tokens = torch.ones(1, 16, device=get_accelerator().device_name(), dtype=torch.long)
out = m(tokens)
m.train()
out = m(tokens)
print(out['logits'], out['logits'].norm())
'''Copyright The Microsoft DeepSpeed Team''' # Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import torch import torch
from pytorch_lightning import LightningModule, Trainer from pytorch_lightning import LightningModule, Trainer
...@@ -7,6 +10,7 @@ from torch.utils.data import DataLoader, Dataset ...@@ -7,6 +10,7 @@ from torch.utils.data import DataLoader, Dataset
class RandomDataset(Dataset): class RandomDataset(Dataset):
def __init__(self, size, length): def __init__(self, size, length):
self.len = length self.len = length
self.data = torch.randn(length, size) self.data = torch.randn(length, size)
...@@ -19,6 +23,7 @@ class RandomDataset(Dataset): ...@@ -19,6 +23,7 @@ class RandomDataset(Dataset):
class BoringModel(LightningModule): class BoringModel(LightningModule):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.layer = torch.nn.Linear(32, 2) self.layer = torch.nn.Linear(32, 2)
...@@ -53,5 +58,5 @@ def test_lightning_model(): ...@@ -53,5 +58,5 @@ def test_lightning_model():
"""Test that DeepSpeed works with a simple LightningModule and LightningDataModule.""" """Test that DeepSpeed works with a simple LightningModule and LightningDataModule."""
model = BoringModel() model = BoringModel()
trainer = Trainer(strategy=DeepSpeedStrategy(), max_epochs=1, precision=16, gpus=1) trainer = Trainer(strategy=DeepSpeedStrategy(), max_epochs=1, precision=16, accelerator="gpu", devices=1)
trainer.fit(model) trainer.fit(model)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment