Commit c25a91b6 authored by aiss

Merge branch 'ds-v0.9.2-rocm' into 'main'

Ds v0.9.2 rocm

See merge request dcutoolkit/deeplearing/deepspeed!2
parents d1596c94 af82b300
#!/bin/sh
# An example hook script to verify what is about to be pushed. Called by "git
# push" after it has checked the remote status, but before anything has been
# pushed. If this script exits with a non-zero status nothing will be pushed.
#
# This hook is called with the following parameters:
#
# $1 -- Name of the remote to which the push is being done
# $2 -- URL to which the push is being done
#
# If pushing without using a named remote those arguments will be equal.
#
# Information about the commits which are being pushed is supplied as lines to
# the standard input in the form:
#
# <local ref> <local sha1> <remote ref> <remote sha1>
#
# This sample shows how to prevent push of commits where the log message starts
# with "WIP" (work in progress).
remote="$1"
url="$2"
z40=0000000000000000000000000000000000000000
IFS=' '
while read local_ref local_sha remote_ref remote_sha
do
if [ "$local_sha" = $z40 ]
then
# Handle delete
else
if [ "$remote_sha" = $z40 ]
then
# New branch, examine all commits
range="$local_sha"
else
# Update to existing branch, examine new commits
range="$remote_sha..$local_sha"
fi
# Check for WIP commit
commit=`git rev-list -n 1 --grep '^WIP' "$range"`
if [ -n "$commit" ]
then
echo "Found WIP commit in $local_ref, not pushing"
exit 1
fi
fi
done
exit 0
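Since the hook's entire interface is the two arguments plus status lines on
standard input, it can be exercised without performing a real push. A minimal
sketch, assuming the sample has been installed as .git/hooks/pre-push and the
repository has at least two commits (the remote name and URL are placeholders):

old=$(git rev-parse HEAD~1)    # pretend the remote is one commit behind
new=$(git rev-parse HEAD)
echo "refs/heads/main $new refs/heads/main $old" |
    sh .git/hooks/pre-push origin https://example.com/repo.git
echo "exit status: $?"         # non-zero if old..new contains a WIP commit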
#!/bin/sh
#
# Copyright (c) 2006, 2008 Junio C Hamano
#
# The "pre-rebase" hook is run just before "git rebase" starts doing
# its job, and can prevent the command from running by exiting with
# non-zero status.
#
# The hook is called with the following parameters:
#
# $1 -- the upstream the series was forked from.
# $2 -- the branch being rebased (or empty when rebasing the current branch).
#
# This sample shows how to prevent topic branches that are already
# merged to 'next' branch from getting rebased, because allowing it
# would result in rebasing already published history.
publish=next
basebranch="$1"
if test "$#" = 2
then
    topic="refs/heads/$2"
else
    topic=`git symbolic-ref HEAD` ||
    exit 0 ;# we do not interrupt rebasing detached HEAD
fi

case "$topic" in
refs/heads/??/*)
    ;;
*)
    exit 0 ;# we do not interrupt others.
    ;;
esac

# Now we are dealing with a topic branch being rebased
# on top of master. Is it OK to rebase it?

# Does the topic really exist?
git show-ref -q "$topic" || {
    echo >&2 "No such branch $topic"
    exit 1
}

# Is topic fully merged to master?
not_in_master=`git rev-list --pretty=oneline ^master "$topic"`
if test -z "$not_in_master"
then
    echo >&2 "$topic is fully merged to master; better remove it."
    exit 1 ;# we could allow it, but there is no point.
fi

# Is topic ever merged to next? If so you should not be rebasing it.
only_next_1=`git rev-list ^master "^$topic" ${publish} | sort`
only_next_2=`git rev-list ^master ${publish} | sort`
if test "$only_next_1" = "$only_next_2"
then
    not_in_topic=`git rev-list "^$topic" master`
    if test -z "$not_in_topic"
    then
        echo >&2 "$topic is already up-to-date with master"
        exit 1 ;# we could allow it, but there is no point.
    else
        exit 0
    fi
else
    not_in_next=`git rev-list --pretty=oneline ^${publish} "$topic"`
    /usr/bin/perl -e '
    my $topic = $ARGV[0];
    my $msg = "* $topic has commits already merged to public branch:\n";
    my (%not_in_next) = map {
        /^([0-9a-f]+) /;
        ($1 => 1);
    } split(/\n/, $ARGV[1]);
    for my $elem (map {
        /^([0-9a-f]+) (.*)$/;
        [$1 => $2];
    } split(/\n/, $ARGV[2])) {
        if (!exists $not_in_next{$elem->[0]}) {
            if ($msg) {
                print STDERR $msg;
                undef $msg;
            }
            print STDERR "  $elem->[1]\n";
        }
    }
    ' "$topic" "$not_in_next" "$not_in_master"
    exit 1
fi

exit 0
################################################################

This sample hook safeguards topic branches that have been
published from being rewound.

The workflow assumed here is:

 * Once a topic branch forks from "master", "master" is never
   merged into it again (either directly or indirectly).

 * Once a topic branch is fully cooked and merged into "master",
   it is deleted. If you need to build on top of it to correct
   earlier mistakes, a new topic branch is created by forking at
   the tip of the "master". This is not strictly necessary, but
   it makes it easier to keep your history simple.

 * Whenever you need to test or publish your changes to topic
   branches, merge them into "next" branch.

The script, being an example, hardcodes the publish branch name
to be "next", but it is trivial to make it configurable via
$GIT_DIR/config mechanism.

With this workflow, you would want to know:

(1) ... if a topic branch has ever been merged to "next". Young
    topic branches can have stupid mistakes you would rather
    clean up before publishing, and things that have not been
    merged into other branches can be easily rebased without
    affecting other people. But once it is published, you would
    not want to rewind it.

(2) ... if a topic branch has been fully merged to "master".
    Then you can delete it. More importantly, you should not
    build on top of it -- other people may already want to
    change things related to the topic as patches against your
    "master", so if you need further changes, it is better to
    fork the topic (perhaps with the same name) afresh from the
    tip of "master".

Let's look at this example:

                    o---o---o---o---o---o---o---o---o---o "next"
                   /       /           /           /
                  /   a---a---b A     /           /
                 /   /             \ /           /
                /   /   c---c---c---c B         /
               /   /   /             \         /
              /   /   /   b---b C     \       /
             /   /   /   /             \     /
        ---o---o---o---o---o---o---o---o---o---o---o "master"

A, B and C are topic branches.

 * A has one fix since it was merged up to "next".

 * B has finished. It has been fully merged up to "master" and "next",
   and is ready to be deleted.

 * C has not merged to "next" at all.

We would want to allow C to be rebased, refuse A, and encourage
B to be deleted.

To compute (1):

        git rev-list ^master ^topic next
        git rev-list ^master        next

   if these match, topic has not merged in next at all.

To compute (2):

        git rev-list master..topic

   if this is empty, it is fully merged to "master".
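The two rev-list computations above drop straight into a small standalone
checker. The following is only an illustrative sketch (the script name,
argument handling, and messages are not part of the sample hook):

#!/bin/sh
# classify-topic.sh <topic> -- apply computations (1) and (2) from above
topic="$1"

# (1) the topic has never been merged to "next" iff these two sets match
only_next_1=`git rev-list ^master "^$topic" next | sort`
only_next_2=`git rev-list ^master next | sort`
if test "$only_next_1" = "$only_next_2"
then
    echo "$topic has never been merged to next; rebasing is safe"
else
    echo "$topic is already published in next; do not rewind it"
fi

# (2) the topic is fully merged to "master" iff this range is empty
not_merged=`git rev-list "master..$topic"`
if test -z "$not_merged"
then
    echo "$topic is fully merged to master; better remove it"
fi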
#!/bin/sh
#
# An example hook script to prepare the commit log message.
# Called by "git commit" with the name of the file that has the
# commit message, followed by the description of the commit
# message's source. The hook's purpose is to edit the commit
# message file. If the hook fails with a non-zero status,
# the commit is aborted.
#
# To enable this hook, rename this file to "prepare-commit-msg".
# This hook includes three examples. The first comments out the
# "Conflicts:" part of a merge commit.
#
# The second includes the output of "git diff --name-status -r"
# into the message, just before the "git status" output. It is
# commented because it doesn't cope with --amend or with squashed
# commits.
#
# The third example adds a Signed-off-by line to the message, that can
# still be edited. This is rarely a good idea.
case "$2,$3" in
merge,)
/usr/bin/perl -i.bak -ne 's/^/# /, s/^# #/#/ if /^Conflicts/ .. /#/; print' "$1" ;;
# ,|template,)
# /usr/bin/perl -i.bak -pe '
# print "\n" . `git diff --cached --name-status -r`
# if /^#/ && $first++ == 0' "$1" ;;
*) ;;
esac
# SOB=$(git var GIT_AUTHOR_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p')
# grep -qs "^$SOB" "$1" || echo "$SOB" >> "$1"
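The "merge" branch of the case statement above can be tried by hand on a
scratch message file; a sketch, assuming the sample has been installed as
.git/hooks/prepare-commit-msg (the file names here are illustrative):

# Build a fake merge message, then invoke the hook with $2 = "merge".
printf '%s\n' 'Merge branch "topic"' '' 'Conflicts:' '  path/a.c' '#' > /tmp/MSG
sh .git/hooks/prepare-commit-msg /tmp/MSG merge
cat /tmp/MSG    # the Conflicts: section is now commented out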
#!/bin/sh
#
# An example hook script to block unannotated tags from entering.
# Called by "git receive-pack" with arguments: refname sha1-old sha1-new
#
# To enable this hook, rename this file to "update".
#
# Config
# ------
# hooks.allowunannotated
# This boolean sets whether unannotated tags will be allowed into the
# repository. By default they won't be.
# hooks.allowdeletetag
# This boolean sets whether deleting tags will be allowed in the
# repository. By default they won't be.
# hooks.allowmodifytag
# This boolean sets whether a tag may be modified after creation. By default
# it won't be.
# hooks.allowdeletebranch
# This boolean sets whether deleting branches will be allowed in the
# repository. By default they won't be.
# hooks.denycreatebranch
# This boolean sets whether remotely creating branches will be denied
# in the repository. By default this is allowed.
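Each of these keys is read with "git config --bool", so enabling a behavior is
a one-liner per key, run inside the repository that the hook protects; for
example:

git config --bool hooks.allowunannotated true   # permit lightweight tags
git config --bool hooks.allowdeletetag true     # permit tag deletion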
#
# --- Command line
refname="$1"
oldrev="$2"
newrev="$3"
# --- Safety check
if [ -z "$GIT_DIR" ]; then
echo "Don't run this script from the command line." >&2
echo " (if you want, you could supply GIT_DIR then run" >&2
echo " $0 <ref> <oldrev> <newrev>)" >&2
exit 1
fi
if [ -z "$refname" -o -z "$oldrev" -o -z "$newrev" ]; then
echo "usage: $0 <ref> <oldrev> <newrev>" >&2
exit 1
fi
# --- Config
allowunannotated=$(git config --bool hooks.allowunannotated)
allowdeletebranch=$(git config --bool hooks.allowdeletebranch)
denycreatebranch=$(git config --bool hooks.denycreatebranch)
allowdeletetag=$(git config --bool hooks.allowdeletetag)
allowmodifytag=$(git config --bool hooks.allowmodifytag)
# check for no description
projectdesc=$(sed -e '1q' "$GIT_DIR/description")
case "$projectdesc" in
"Unnamed repository"* | "")
echo "*** Project description file hasn't been set" >&2
exit 1
;;
esac
# --- Check types
# if $newrev is 0000...0000, it's a commit to delete a ref.
zero="0000000000000000000000000000000000000000"
if [ "$newrev" = "$zero" ]; then
newrev_type=delete
else
newrev_type=$(git cat-file -t $newrev)
fi
case "$refname","$newrev_type" in
refs/tags/*,commit)
# un-annotated tag
short_refname=${refname##refs/tags/}
if [ "$allowunannotated" != "true" ]; then
echo "*** The un-annotated tag, $short_refname, is not allowed in this repository" >&2
echo "*** Use 'git tag [ -a | -s ]' for tags you want to propagate." >&2
exit 1
fi
;;
refs/tags/*,delete)
# delete tag
if [ "$allowdeletetag" != "true" ]; then
echo "*** Deleting a tag is not allowed in this repository" >&2
exit 1
fi
;;
refs/tags/*,tag)
# annotated tag
if [ "$allowmodifytag" != "true" ] && git rev-parse $refname > /dev/null 2>&1
then
echo "*** Tag '$refname' already exists." >&2
echo "*** Modifying a tag is not allowed in this repository." >&2
exit 1
fi
;;
refs/heads/*,commit)
# branch
if [ "$oldrev" = "$zero" -a "$denycreatebranch" = "true" ]; then
echo "*** Creating a branch is not allowed in this repository" >&2
exit 1
fi
;;
refs/heads/*,delete)
# delete branch
if [ "$allowdeletebranch" != "true" ]; then
echo "*** Deleting a branch is not allowed in this repository" >&2
exit 1
fi
;;
refs/remotes/*,commit)
# tracking branch
;;
refs/remotes/*,delete)
# delete tracking branch
if [ "$allowdeletebranch" != "true" ]; then
echo "*** Deleting a tracking branch is not allowed in this repository" >&2
exit 1
fi
;;
*)
# Anything else (is there anything else?)
echo "*** Update hook: unknown type of update to ref $refname of type $newrev_type" >&2
exit 1
;;
esac
# --- Finished
exit 0
# git ls-files --others --exclude-from=.git/info/exclude
# Lines that start with '#' are comments.
# For a project mostly in C, the following would be a good set of
# exclude patterns (uncomment them if you want to use them):
# *.[oa]
# *~
# pack-refs with: peeled fully-peeled
1b2721adcd96656bb1f27d1f2f60947567b2d505 refs/heads/deepspeed-0.6.3-rocm
cd3feaaa6aef8e868eea954841294ab3a2b16f84 refs/heads/ds-0.3.13-rocm
67ea635fe037707924417893674e94275e849d7e refs/heads/ds-v0.8.2-rocm
67ea635fe037707924417893674e94275e849d7e refs/heads/main
87833e1f85e006c5c5d618dbe4de700885a2f571 refs/tags/grad-norm-test
c61e23b4b108df2af0dda7939ee59d4ae9090415 refs/tags/v0.1.0
96c4daabc162c3c05fe602152ee2ab2d780c0e23 refs/tags/v0.2.0
4b1df25ae96e3732213877e7729c5e15548188fd refs/tags/v0.3.0
31f46feee2d491d58a13404e354440551de9d5bf refs/tags/v0.3.1
c14b839d9898f4c84e372e896e3ce8fa2e169a79 refs/tags/v0.3.10
72b23ea32282c52c53a81a097dfc26c653d3a731 refs/tags/v0.3.11
35fd7ccd862adcb93febd546cb5b9fa7cb883d8f refs/tags/v0.3.12
12a53b43833b7bea279a205e313f2bd3f0cdfd99 refs/tags/v0.3.13
9941ce75225868ef9222a0360683a563d05d87ad refs/tags/v0.3.2
9de21b72b5e8adb6c1fe4ae96cbddaa929178cc1 refs/tags/v0.3.3
6b28bc5db58fa95628b9cf69e350dcacc2f33478 refs/tags/v0.3.4
16313a962bce9df567597ffc2380250a1535e27a refs/tags/v0.3.5
73c3262df63e85c2b2f0d48bf9217c58508e44f3 refs/tags/v0.3.6
c51fa65de847ba44f0a1bcfc9957cb4e5fae3ab6 refs/tags/v0.3.7
cb7c7da6f7696e27591610db3c2c906f9c2c8070 refs/tags/v0.3.8
81aeea361da3936b875a678b9cb44596800510b5 refs/tags/v0.3.9
-'''
-Copyright 2020 The Microsoft DeepSpeed Team
-'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team

 import sys
 import types
@@ -14,8 +15,10 @@ from packaging import version as pkg_version
 from . import ops
 from . import module_inject

+from .accelerator import get_accelerator
 from .runtime.engine import DeepSpeedEngine, DeepSpeedOptimizerCallable, DeepSpeedSchedulerCallable
 from .runtime.engine import ADAM_OPTIMIZER, LAMB_OPTIMIZER
+from .runtime.hybrid_engine import DeepSpeedHybridEngine
 from .runtime.pipe.engine import PipelineEngine
 from .inference.engine import InferenceEngine
 from .inference.config import DeepSpeedInferenceConfig
@@ -25,7 +28,7 @@ from .runtime.activation_checkpointing import checkpointing
 from .ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig
 from .module_inject import replace_transformer_layer, revert_transformer_layer

-from .utils import log_dist, OnDevice
+from .utils import log_dist, OnDevice, logger
 from .comm.comm import init_distributed

 from .runtime import zero
@@ -48,15 +51,16 @@ __version_major__, __version_minor__, __version_patch__ = _parse_version(__versi
 __git_hash__ = git_hash
 __git_branch__ = git_branch

+# Set to torch's distributed package or deepspeed.comm based inside DeepSpeedEngine init
+dist = None

 def initialize(args=None,
                model: torch.nn.Module = None,
-               optimizer: Optional[Union[Optimizer,
-                                         DeepSpeedOptimizerCallable]] = None,
+               optimizer: Optional[Union[Optimizer, DeepSpeedOptimizerCallable]] = None,
                model_parameters: Optional[torch.nn.Module] = None,
                training_data: Optional[torch.utils.data.Dataset] = None,
-               lr_scheduler: Optional[Union[_LRScheduler,
-                                            DeepSpeedSchedulerCallable]] = None,
+               lr_scheduler: Optional[Union[_LRScheduler, DeepSpeedSchedulerCallable]] = None,
                mpu=None,
                dist_init_required: Optional[bool] = None,
                collate_fn=None,
@@ -110,10 +114,8 @@ def initialize(args=None,
     * ``lr_scheduler``: Wrapped lr scheduler if user ``lr_scheduler`` is passed, or
       if ``lr_scheduler`` specified in JSON configuration. Otherwise ``None``.
     """
-    log_dist("DeepSpeed info: version={}, git-hash={}, git-branch={}".format(
-        __version__,
-        __git_hash__,
-        __git_branch__),
+    log_dist("DeepSpeed info: version={}, git-hash={}, git-branch={}".format(__version__, __git_hash__,
+                                                                             __git_branch__),
              ranks=[0])

     # Disable zero.Init context if it's currently enabled
@@ -121,38 +123,73 @@ def initialize(args=None,
     assert model is not None, "deepspeed.initialize requires a model"

+    global dist
+    from deepspeed import comm as dist
+    dist_backend = get_accelerator().communication_backend_name()
+    dist.init_distributed(dist_backend=dist_backend, dist_init_required=dist_init_required)
+
+    # Set config using config_params for backwards compat
+    if config is None and config_params is not None:
+        config = config_params
+
+    # Check for deepscale_config for backwards compat
+    if hasattr(args, "deepscale_config") and args.deepscale_config is not None:
+        logger.warning("************ --deepscale_config is deprecated, please use --deepspeed_config ************")
+        if hasattr(args, "deepspeed_config"):
+            assert (args.deepspeed_config is
+                    None), "Not sure how to proceed, we were given both a deepscale_config and deepspeed_config"
+        args.deepspeed_config = args.deepscale_config
+        args.deepscale_config = None
+
+    # Check that we have only one config passed
+    if hasattr(args, "deepspeed_config") and args.deepspeed_config is not None:
+        assert config is None, "Not sure how to proceed, we were given deepspeed configs in the deepspeed arguments and deepspeed.initialize() function call"
+        config = args.deepspeed_config
+    assert config != None, "DeepSpeed requires --deepspeed_config to specify configuration file"
+
     if not isinstance(model, PipelineModule):
-        engine = DeepSpeedEngine(args=args,
-                                 model=model,
-                                 optimizer=optimizer,
-                                 model_parameters=model_parameters,
-                                 training_data=training_data,
-                                 lr_scheduler=lr_scheduler,
-                                 mpu=mpu,
-                                 dist_init_required=dist_init_required,
-                                 collate_fn=collate_fn,
-                                 config=config,
-                                 config_params=config_params)
+        config_class = DeepSpeedConfig(config, mpu)
+        if config_class.hybrid_engine.enabled:
+            engine = DeepSpeedHybridEngine(args=args,
+                                           model=model,
+                                           optimizer=optimizer,
+                                           model_parameters=model_parameters,
+                                           training_data=training_data,
+                                           lr_scheduler=lr_scheduler,
+                                           mpu=mpu,
+                                           dist_init_required=dist_init_required,
+                                           collate_fn=collate_fn,
+                                           config=config,
+                                           config_class=config_class)
+        else:
+            engine = DeepSpeedEngine(args=args,
+                                     model=model,
+                                     optimizer=optimizer,
+                                     model_parameters=model_parameters,
+                                     training_data=training_data,
+                                     lr_scheduler=lr_scheduler,
+                                     mpu=mpu,
+                                     dist_init_required=dist_init_required,
+                                     collate_fn=collate_fn,
+                                     config=config,
+                                     config_class=config_class)
     else:
         assert mpu is None, "mpu must be None with pipeline parallelism"
+        mpu = model.mpu()
+        config_class = DeepSpeedConfig(config, mpu)
         engine = PipelineEngine(args=args,
                                 model=model,
                                 optimizer=optimizer,
                                 model_parameters=model_parameters,
                                 training_data=training_data,
                                 lr_scheduler=lr_scheduler,
-                                mpu=model.mpu(),
+                                mpu=mpu,
                                 dist_init_required=dist_init_required,
                                 collate_fn=collate_fn,
                                 config=config,
-                                config_params=config_params)
+                                config_class=config_class)

-    return_items = [
-        engine,
-        engine.optimizer,
-        engine.training_dataloader,
-        engine.lr_scheduler
-    ]
+    return_items = [engine, engine.optimizer, engine.training_dataloader, engine.lr_scheduler]

     return tuple(return_items)
@@ -171,38 +208,28 @@ def _add_core_arguments(parser):
     """
     group = parser.add_argument_group('DeepSpeed', 'DeepSpeed configurations')

-    group.add_argument(
-        '--deepspeed',
-        default=False,
-        action='store_true',
-        help=
-        'Enable DeepSpeed (helper flag for user code, no impact on DeepSpeed backend)')
+    group.add_argument('--deepspeed',
+                       default=False,
+                       action='store_true',
+                       help='Enable DeepSpeed (helper flag for user code, no impact on DeepSpeed backend)')

-    group.add_argument('--deepspeed_config',
-                       default=None,
-                       type=str,
-                       help='DeepSpeed json configuration file.')
+    group.add_argument('--deepspeed_config', default=None, type=str, help='DeepSpeed json configuration file.')

-    group.add_argument(
-        '--deepscale',
-        default=False,
-        action='store_true',
-        help=
-        'Deprecated enable DeepSpeed (helper flag for user code, no impact on DeepSpeed backend)'
-    )
+    group.add_argument('--deepscale',
+                       default=False,
+                       action='store_true',
+                       help='Deprecated enable DeepSpeed (helper flag for user code, no impact on DeepSpeed backend)')

     group.add_argument('--deepscale_config',
                        default=None,
                        type=str,
                        help='Deprecated DeepSpeed json configuration file.')

-    group.add_argument(
-        '--deepspeed_mpi',
-        default=False,
-        action='store_true',
-        help=
-        "Run via MPI, this will attempt to discover the necessary variables to initialize torch "
-        "distributed from the MPI environment")
+    group.add_argument('--deepspeed_mpi',
+                       default=False,
+                       action='store_true',
+                       help="Run via MPI, this will attempt to discover the necessary variables to initialize torch "
+                       "distributed from the MPI environment")

     return parser
@@ -278,10 +305,8 @@ def init_inference(model, config=None, **kwargs):
     Returns:
         A deepspeed.InferenceEngine wrapped model.
     """
-    log_dist("DeepSpeed info: version={}, git-hash={}, git-branch={}".format(
-        __version__,
-        __git_hash__,
-        __git_branch__),
+    log_dist("DeepSpeed info: version={}, git-hash={}, git-branch={}".format(__version__, __git_hash__,
+                                                                             __git_branch__),
              ranks=[0])

     # Load config_dict from config first
@@ -293,17 +318,14 @@ def init_inference(model, config=None, **kwargs):
     elif isinstance(config, dict):
         config_dict = config
     else:
-        raise ValueError(
-            f"'config' argument expected string or dictionary, got {type(config)}")
+        raise ValueError(f"'config' argument expected string or dictionary, got {type(config)}")

     # Update with values from kwargs, ensuring no conflicting overlap between config and kwargs
     overlap_keys = set(config_dict.keys()).intersection(kwargs.keys())
     # If there is overlap, error out if values are different
     for key in overlap_keys:
         if config_dict[key] != kwargs[key]:
-            raise ValueError(
-                f"Conflicting argument '{key}' in 'config':{config_dict[key]} and kwargs:{kwargs[key]}"
-            )
+            raise ValueError(f"Conflicting argument '{key}' in 'config':{config_dict[key]} and kwargs:{kwargs[key]}")

     config_dict.update(kwargs)
     ds_inference_config = DeepSpeedInferenceConfig(**config_dict)
...
-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team

 from .autotuner import Autotuner

-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team

 import shutil
 import subprocess
@@ -40,6 +43,7 @@ class Autotuner:
     """The DeepSpeed Autotuner automatically discovers the optimal DeepSpeed configuration that delivers good training speed. The Autotuner uses model information, system information, and heuristics to efficiently tune system knobs that affect compute and memory efficiencies, such as ZeRO optimization stages, micro-batch sizes, and many other ZeRO optimization configurations. It not only reduces the time and resources user spend on tuning, but also can discover configurations better than hand-tuned methods.

     Autotuning with DeepSpeed requires no code change from DeepSpeed users. Please refer to the README for usage details.
     """

     def __init__(self, args, active_resources):
         self.args = args
         self.selected_exp_dir = None
@@ -77,7 +81,7 @@ class Autotuner:
         if not os.path.exists(self.results_dir):
             try:
                 os.makedirs(self.results_dir, exist_ok=True)
-                logger.info(f"Created autotuning resutls directory: {self.exps_dir}")
+                logger.info(f"Created autotuning results directory: {self.exps_dir}")
             except:
                 logger.error(
                     f"Failed to create {self.results_dir}, please check `results_dir` in the autotuning config file is accessible by all the nodes in the job."
@@ -92,7 +96,8 @@ class Autotuner:
         assert self.exp_num_gpus <= self.rm.num_gpus_per_node, "num_gpus in the autotuning configuration must not be less than the --num_gpus value in the train script if any"
         assert self.exp_num_nodes <= len(
-            self.rm.nodes), "num_nodes in the autotuning configuration must not be less than the --num_nodes value in the train script if any"
+            self.rm.nodes
+        ), "num_nodes in the autotuning configuration must not be less than the --num_nodes value in the train script if any"

         self.records = {}
         self.optimal_cmd = None
@@ -125,18 +130,10 @@ class Autotuner:
                 row.append(val[0]['name'])
             tab.append(row)
         summary = tabulate(tab,
-                           headers=[
-                               "tuning_space",
-                               "num_experiments",
-                               "best_metric_val",
-                               "best_exp_name"
-                           ],
+                           headers=["tuning_space", "num_experiments", "best_metric_val", "best_exp_name"],
                            tablefmt="pipe")
         print(summary)
-        with open(os.path.join(self.results_dir,
-                               'summary.txt'),
-                  'w',
-                  buffering=BUFSIZE) as fd:
+        with open(os.path.join(self.results_dir, 'summary.txt'), 'w', buffering=BUFSIZE) as fd:
             fd.write(summary)
             fd.flush()
             os.fsync(fd)
@@ -148,9 +145,7 @@ class Autotuner:
                 f"{best_exp['name']} is the optimal setup after tuning. The exp result is at {best_exp['result_dir']}."
             )
         else:
-            logger.info(
-                f"No optimal setup is found. Please check that experiments were run successfully."
-            )
+            logger.info(f"No optimal setup is found. Please check that experiments were run successfully.")
         tuning_duration = datetime.timedelta(seconds=(time.time() - self.start_time))

         logger.info(f"Tuning completed in {tuning_duration}")
@@ -172,8 +167,8 @@ class Autotuner:
         user_config_file = None
         if "--deepspeed_config" in user_args:
             idx = user_args.index("--deepspeed_config")
-            assert ".json" in user_args[idx +
-                                        1], "DeepSpeed --deepspeed_config requires a json file to specify the configuration"
+            assert ".json" in user_args[
+                idx + 1], "DeepSpeed --deepspeed_config requires a json file to specify the configuration"

             user_config_file = user_args[idx + 1]
         elif "--deepspeed" in user_args:
@@ -183,15 +178,10 @@ class Autotuner:
         logger.debug(f"user_config_file = {user_config_file}")
         if user_config_file is not None:
-            assert os.path.isfile(
-                user_config_file
-            ), "DeepSpeed configuration file: {} is not an existing file".format(
-                user_config_file
-            )
+            assert os.path.isfile(user_config_file), "DeepSpeed configuration file: {} is not an existing file".format(
+                user_config_file)
             if os.path.exists(user_config_file):
-                return json.load(open(user_config_file,
-                                      "r"),
-                                 object_pairs_hook=dict_raise_error_on_duplicate_keys)
+                return json.load(open(user_config_file, "r"), object_pairs_hook=dict_raise_error_on_duplicate_keys)

         return None
@@ -258,13 +248,11 @@ class Autotuner:
         return self.autotuning_config.mp_size

     def max_train_micro_batch_size_per_gpu(self):
-        if self.max_train_batch_size() and self.max_train_batch_size(
-        ) > 0:  # if the user specifies a max_train_batch_size
-            max_train_micro_batch_size = self.max_train_batch_size() * self.mp_size(
-            ) // (self.exp_num_gpus * self.exp_num_nodes
-                  )  # gradient accumulation steps >=1
-            return min(self.autotuning_config.max_train_micro_batch_size_per_gpu,
-                       max_train_micro_batch_size)
+        if self.max_train_batch_size(
+        ) and self.max_train_batch_size() > 0:  # if the user specifies a max_train_batch_size
+            max_train_micro_batch_size = self.max_train_batch_size() * self.mp_size() // (
+                self.exp_num_gpus * self.exp_num_nodes)  # gradient accumulation steps >=1
+            return min(self.autotuning_config.max_train_micro_batch_size_per_gpu, max_train_micro_batch_size)
         else:
             return self.autotuning_config.max_train_micro_batch_size_per_gpu
@@ -361,19 +349,14 @@ class Autotuner:
         if model_info and "hidden_size" in model_info:
             hs = model_info["hidden_size"]
             template_config[ZERO_OPTIMIZATION]['reduce_bucket_size'] = hs * hs
-            template_config[ZERO_OPTIMIZATION][
-                'stage3_prefetch_bucket_size'] = 0.9 * hs * hs
-            template_config[ZERO_OPTIMIZATION][
-                'stage3_param_persistence_threshold'] = 10 * hs
+            template_config[ZERO_OPTIMIZATION]['stage3_prefetch_bucket_size'] = 0.9 * hs * hs
+            template_config[ZERO_OPTIMIZATION]['stage3_param_persistence_threshold'] = 10 * hs
             prefix = "z3_"
         else:
             return exps

         # replace the corresponding parameter values if the user specifies them in the DeepSpeed configuration file
-        replace_dict(tuning_space,
-                     self.user_config,
-                     [ZERO_OPTIMIZATION,
-                      TRAIN_MICRO_BATCH_SIZE_PER_GPU])
+        replace_dict(tuning_space, self.user_config, [ZERO_OPTIMIZATION, TRAIN_MICRO_BATCH_SIZE_PER_GPU])

         logger.debug(f"tuning_space = {json.dumps(tuning_space)}")
@@ -397,11 +380,9 @@
         # if the config does not use offloading, remove the offloading section
         config_zero = config.get(ZERO_OPTIMIZATION, None)
         if config_zero:
-            if OFFLOAD_OPTIMIZER not in config_zero and OFFLOAD_OPTIMIZER in exp_config[
-                    ZERO_OPTIMIZATION]:
+            if OFFLOAD_OPTIMIZER not in config_zero and OFFLOAD_OPTIMIZER in exp_config[ZERO_OPTIMIZATION]:
                 del exp_config[ZERO_OPTIMIZATION][OFFLOAD_OPTIMIZER]
-            if OFFLOAD_PARAM not in config_zero and OFFLOAD_PARAM in exp_config[
-                    ZERO_OPTIMIZATION]:
+            if OFFLOAD_PARAM not in config_zero and OFFLOAD_PARAM in exp_config[ZERO_OPTIMIZATION]:
                 del exp_config[ZERO_OPTIMIZATION][OFFLOAD_PARAM]
         # set gradient accumulation steps according to max_train_batch_size_per_gpu
         mbs = exp_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU]
@@ -438,13 +419,10 @@
         else:
             return

-        logger.info(
-            f"The model has {number_to_string(self.get_model_num_params())} parameters.")
+        logger.info(f"The model has {number_to_string(self.get_model_num_params())} parameters.")

         self.gpu_mem = self.get_gpu_memory_info()
-        logger.info(
-            f"Memory per GPU in the system is {memory_to_string(self.gpu_mem, postfix='B')}."
-        )
+        logger.info(f"Memory per GPU in the system is {memory_to_string(self.gpu_mem, postfix='B')}.")

         self.activation_mem = self.get_activation_memory_per_gpu()
         logger.info(
@@ -452,9 +430,7 @@
         )

         #TODO: FIX THIS
-        stage = self.user_config.get(ZERO_OPTIMIZATION,
-                                     {}).get(ZERO_OPTIMIZATION_STAGE,
-                                             "all")
+        stage = self.user_config.get(ZERO_OPTIMIZATION, {}).get(ZERO_OPTIMIZATION_STAGE, "all")
         stage = "all"
         user_zero_stages = [stage] if not isinstance(stage, list) else stage
         logger.info(f"User-defined zero stages are {stage}.")
@@ -463,15 +439,13 @@
         max_mbs = 0
         metric_val = 0

-        required_gpu_mem = self.get_instantiation_memory_required_per_gpu(
-            ZeroStageEnum.disabled) + self.activation_mem
+        required_gpu_mem = self.get_instantiation_memory_required_per_gpu(ZeroStageEnum.disabled) + self.activation_mem
         if self.gpu_mem > required_gpu_mem:
             if "all" in user_zero_stages or ZeroStageEnum.disabled in user_zero_stages:
                 logger.info(
                     f"The model might be runable with ZERO 0 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1), adding DEFAULT_TUNING_SPACE_ZERO_0 to the global tuning space"
                 )
-                next_max_mbs, next_mbs, next_metric_val = self.tune_space(
-                    DEFAULT_TUNING_SPACE_ZERO_0)
+                next_max_mbs, next_mbs, next_metric_val = self.tune_space(DEFAULT_TUNING_SPACE_ZERO_0)
                 if next_mbs > mbs:
                     mbs = next_mbs
                     max_mbs = next_max_mbs
@@ -490,8 +464,10 @@
                 logger.info(
                     f"The model might be runable with ZERO 1 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory), adding DEFAULT_TUNING_SPACE_ZERO_1 to the global tuning space"
                 )
-                next_max_mbs, next_mbs, next_metric_val = self.tune_space(
-                    DEFAULT_TUNING_SPACE_ZERO_1, prev_max_mbs = max_mbs, prev_best_mbs=mbs, prev_best_metric_val=metric_val)
+                next_max_mbs, next_mbs, next_metric_val = self.tune_space(DEFAULT_TUNING_SPACE_ZERO_1,
+                                                                          prev_max_mbs=max_mbs,
+                                                                          prev_best_mbs=mbs,
+                                                                          prev_best_metric_val=metric_val)
                 if next_mbs > mbs:
                     mbs = next_mbs
                     max_mbs = next_max_mbs
@@ -510,8 +486,10 @@
                 logger.info(
                     f"The model might be runable with ZERO 2 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory), adding DEFAULT_TUNING_SPACE_ZERO_2 to the global tuning space"
                 )
-                next_max_mbs, next_mbs, next_metric_val = self.tune_space(
-                    DEFAULT_TUNING_SPACE_ZERO_2, prev_max_mbs = max_mbs, prev_best_mbs=mbs, prev_best_metric_val=metric_val)
+                next_max_mbs, next_mbs, next_metric_val = self.tune_space(DEFAULT_TUNING_SPACE_ZERO_2,
+                                                                          prev_max_mbs=max_mbs,
+                                                                          prev_best_mbs=mbs,
+                                                                          prev_best_metric_val=metric_val)
                 if next_mbs > mbs:
                     mbs = next_mbs
                     max_mbs = next_max_mbs
@@ -523,15 +501,16 @@
                 f"The model is not runable with ZERO stage {ZeroStageEnum.gradients} (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1)"
             )

-        required_gpu_mem = self.get_instantiation_memory_required_per_gpu(
-            ZeroStageEnum.weights) + self.activation_mem
+        required_gpu_mem = self.get_instantiation_memory_required_per_gpu(ZeroStageEnum.weights) + self.activation_mem
         if self.gpu_mem > required_gpu_mem:
             if "all" in user_zero_stages or ZeroStageEnum.weights in user_zero_stages:
                 logger.info(
                     f"The model might be runable with ZERO 3 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory), adding DEFAULT_TUNING_SPACE_ZERO_3 to the global tuning space"
                 )
-                _, _, next_metric_val = self.tune_space(
-                    DEFAULT_TUNING_SPACE_ZERO_3, prev_max_mbs = max_mbs, prev_best_mbs=mbs, prev_best_metric_val=metric_val)
+                _, _, next_metric_val = self.tune_space(DEFAULT_TUNING_SPACE_ZERO_3,
+                                                        prev_max_mbs=max_mbs,
+                                                        prev_best_mbs=mbs,
+                                                        prev_best_metric_val=metric_val)
                 if has_mlflow:
                     mlflow.log_metric(f"z3{self.metric()}", next_metric_val)
         else:
@@ -542,11 +521,7 @@
         if has_mlflow:
             mlflow.end_run()

-    def tune_space(self,
-                   tuning_space,
-                   prev_max_mbs=0,
-                   prev_best_mbs=0,
-                   prev_best_metric_val=0):
+    def tune_space(self, tuning_space, prev_max_mbs=0, prev_best_mbs=0, prev_best_metric_val=0):
         config_zero = tuning_space.get(ZERO_OPTIMIZATION, {})
         stage = config_zero.get(ZERO_OPTIMIZATION_STAGE, None)
         tuning_space_name = TUNING_MICRO_BATCH_SIZE_PREFIX + str(stage)
@@ -557,26 +532,20 @@
         # calculate max micro batch size using gpu memory, model instantiation memory and activation memory
         # calculated_max_micro_batch_size = (memory_per_gpu - instantiation_memory) // activation_memory_micro_batch_size_1
         calculated_max_micro_batch_size = int(
-            self.gpu_mem -
-            self.get_instantiation_memory_required_per_gpu(stage)) // self.activation_mem
+            self.gpu_mem - self.get_instantiation_memory_required_per_gpu(stage)) // self.activation_mem
         logger.info(
             f"Start tuning for space {tuning_space_name}, calculated_max_micro_batch_size = {calculated_max_micro_batch_size}"
         )

         if calculated_max_micro_batch_size < prev_max_mbs:
-            logger.info(
-                f"No need to tune Zero stage {stage}. End tuning for space {tuning_space_name}"
-            )
+            logger.info(f"No need to tune Zero stage {stage}. End tuning for space {tuning_space_name}")
             return 0, 0, 0

         if TRAIN_MICRO_BATCH_SIZE_PER_GPU in self.user_config and isinstance(
-                self.user_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU],
-                list):
+                self.user_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU], list):
             # user-specified micro batch size per gpu is a list which overwrites the default tuning behavior
             tuning_micro_batch_sizes = [
-                s for s in self.user_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU]
-                if isinstance(s,
-                              int)
+                s for s in self.user_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU] if isinstance(s, int)
             ]
             gas = self.get_gas_from_user_config()
             min_micro_batch_size = min(tuning_micro_batch_sizes)
@@ -589,9 +558,7 @@
                 stage, prev_max_mbs, calculated_max_micro_batch_size)

             if max_micro_batch_size < prev_max_mbs:
-                logger.info(
-                    f"No need to tune Zero stage {stage}. End tuning for space {tuning_space_name}"
-                )
+                logger.info(f"No need to tune Zero stage {stage}. End tuning for space {tuning_space_name}")
                 return 0, 0, 0

             tuning_micro_batch_sizes, max_train_batch_size_per_gpu = self.get_tuning_micro_batch_size_list(
@@ -609,19 +576,15 @@
             return 0, 0, 0

         # tune micro batch sizes and gradient accumulation steps given max_train_batch_size_per_gpu
-        tuning_micro_batch_sizes = self.run_tuning_micro_batch_sizes(
-            tuning_micro_batch_sizes,
-            max_train_batch_size_per_gpu,
-            min_micro_batch_size,
-            stage,
-            tuning_micro_batch_sizes_overwritten)
+        tuning_micro_batch_sizes = self.run_tuning_micro_batch_sizes(tuning_micro_batch_sizes,
+                                                                     max_train_batch_size_per_gpu,
+                                                                     min_micro_batch_size, stage,
+                                                                     tuning_micro_batch_sizes_overwritten)

         fast_best_record = self.get_best_space_record(tuning_space_name)
         fast_best_metric_val = fast_best_record[1] if fast_best_record else 0
-        fast_best_mbs = fast_best_record[0][DS_CONFIG][
-            TRAIN_MICRO_BATCH_SIZE_PER_GPU] if fast_best_record else 0
-        logger.info(
-            f"fast_best_mbs = {fast_best_mbs}, name = {fast_best_record[0]['name']}")
+        fast_best_mbs = fast_best_record[0][DS_CONFIG][TRAIN_MICRO_BATCH_SIZE_PER_GPU] if fast_best_record else 0
+        logger.info(f"fast_best_mbs = {fast_best_mbs}, name = {fast_best_record[0]['name']}")

         if self.fast_enabled() or stage == 0:
             logger.info(f"End tuning for space: {tuning_space_name}")
@@ -631,8 +594,7 @@
         if stage > 0:
             if fast_best_mbs <= prev_best_mbs or fast_best_metric_val < prev_best_metric_val:
                 logger.info(
-                    f"End tuning for space: {tuning_space_name}. No need to tune other Zero configuration parameters."
-                )
+                    f"End tuning for space: {tuning_space_name}. No need to tune other Zero configuration parameters.")
                 return max_micro_batch_size, fast_best_mbs, fast_best_metric_val

         tuning_space[TRAIN_MICRO_BATCH_SIZE_PER_GPU] = tuning_micro_batch_sizes
@@ -654,8 +616,7 @@
         else:
             t = GridSearchTuner(exps, self.rm, self.metric())

-        sample_size = len(self.rm.nodes) * self.rm.num_gpus_per_node // (
-            self.exp_num_gpus * self.exp_num_nodes)
+        sample_size = len(self.rm.nodes) * self.rm.num_gpus_per_node // (self.exp_num_gpus * self.exp_num_nodes)
         num_exps = t.tune(sample_size=sample_size,
                           n_trials=self.autotuning_config.tuner_num_trials,
                           early_stopping=self.autotuning_config.tuner_early_stopping)
@@ -669,8 +630,7 @@
         if full_best_metric_val > fast_best_metric_val:
             best_metric_val = full_best_metric_val
-            best_mbs = full_best_record[0][DS_CONFIG][
-                TRAIN_MICRO_BATCH_SIZE_PER_GPU] if full_best_record else -1
+            best_mbs = full_best_record[0][DS_CONFIG][TRAIN_MICRO_BATCH_SIZE_PER_GPU] if full_best_record else -1
         else:
             best_metric_val = fast_best_metric_val
             best_mbs = fast_best_mbs
@@ -682,9 +642,7 @@
         if tuning_space_name not in self.records:
             return 0
         space_records = self.records[tuning_space_name]
-        sorted_space_records = sorted(
-            space_records,
-            key=lambda x: x[0][DS_CONFIG][TRAIN_MICRO_BATCH_SIZE_PER_GPU])
+        sorted_space_records = sorted(space_records, key=lambda x: x[0][DS_CONFIG][TRAIN_MICRO_BATCH_SIZE_PER_GPU])
         prev_metric_val = None
         prev_micro_batch_size = 0
         for (exp, metric_val, _) in sorted_space_records:
@@ -692,8 +650,7 @@
             if metric_val < prev_metric_val:
                 break
             if (metric_val >= prev_metric_val
-                    and (metric_val - prev_metric_val) / prev_metric_val <
-                    METRIC_PERCENT_DIFF_CONST):
+                    and (metric_val - prev_metric_val) / prev_metric_val < METRIC_PERCENT_DIFF_CONST):
                 break
             prev_metric_val = metric_val
             prev_micro_batch_size = exp[DS_CONFIG][TRAIN_MICRO_BATCH_SIZE_PER_GPU]
@@ -718,16 +675,8 @@
         ds_config = copy.deepcopy(self.user_config)
         replace_dict(ds_config, DEFAULT_MIN_MEM_CONFIG)

-        model_info_path = os.path.join(self.results_dir,
-                                       "profile_model_info",
-                                       "model_info.json")
-        ds_config[AUTOTUNING] = {
-            "enabled": True,
-            "model_info_path": model_info_path,
-            "model_info": {
-                "profile": True
-            }
-        }
+        model_info_path = os.path.join(self.results_dir, "profile_model_info", "model_info.json")
+        ds_config[AUTOTUNING] = {"enabled": True, "model_info_path": model_info_path, "model_info": {"profile": True}}

         exp_config = {}
         exp_name = "profile_model_info"
@@ -748,8 +697,7 @@
         for exp_id, (exp_json, err) in self.rm.finished_experiments.items():
             self.rm.clear()
             if err:
-                logger.error(
-                    f"The model is not runnable with DeepSpeed with error = {err}")
+                logger.error(f"The model is not runnable with DeepSpeed with error = {err}")
                 return None

         if os.path.exists(model_info_path):
@@ -790,12 +738,8 @@
         best_space_records[GLOBAL_TUNING_SPACE] = global_best_record
         return best_space_records

-    def run_tuning_micro_batch_sizes(self,
-                                     tuning_micro_batch_sizes,
-                                     max_train_batch_size_per_gpu,
-                                     min_micro_batch_size,
-                                     stage,
-                                     tuning_micro_batch_sizes_overwritten):
+    def run_tuning_micro_batch_sizes(self, tuning_micro_batch_sizes, max_train_batch_size_per_gpu,
+                                     min_micro_batch_size, stage, tuning_micro_batch_sizes_overwritten):
         assert tuning_micro_batch_sizes, "the tuning micro batch size list is empty"
         tuning_micro_batch_sizes.sort()
         max_micro_batch_size = tuning_micro_batch_sizes[-1]
@@ -838,8 +782,7 @@
                     results = hjson.load(f)
                     metric_val = results[self.metric()]
                     self.update_records(tuning_space_name, exp, metric_val, 1)
-                    if max_micro_batch_size == exp[DS_CONFIG][
-                            TRAIN_MICRO_BATCH_SIZE_PER_GPU]:
+                    if max_micro_batch_size == exp[DS_CONFIG][TRAIN_MICRO_BATCH_SIZE_PER_GPU]:
                         max_micro_batch_size_metric_val = metric_val
                     if has_mlflow:
                         os.environ.pop('MLFLOW_RUN_ID')
@@ -862,9 +805,8 @@
         # in a auto-detected tuning_micro_batch_sizs list, max_micro_batch_size might not be performant as the memory consumption is close to max
         # try smaller values while gas stays the same
         # if finding a more performant mbs value, use it to replace max_micro_batch_size in the list
-        min_micro_batch_size_with_same_gas = (
-            tuning_micro_batch_sizes[-2] +
-            1) if len(tuning_micro_batch_sizes) > 1 else min_micro_batch_size
+        min_micro_batch_size_with_same_gas = (tuning_micro_batch_sizes[-2] +
+                                              1) if len(tuning_micro_batch_sizes) > 1 else min_micro_batch_size

         prev_best_metric_val = max_micro_batch_size_metric_val
         prev_best_mbs = max_micro_batch_size
@@ -872,10 +814,7 @@
         stride = (max_micro_batch_size - min_micro_batch_size_with_same_gas) // 3
         if stride == 0:
             stride = 1
-        for mbs in reversed(
-                range(min_micro_batch_size_with_same_gas,
-                      max_micro_batch_size,
-                      stride)):
+        for mbs in reversed(range(min_micro_batch_size_with_same_gas, max_micro_batch_size, stride)):
             ds_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU] = mbs
             gas = max_train_batch_size_per_gpu // mbs
             ds_config[GRADIENT_ACCUMULATION_STEPS] = gas
...@@ -908,10 +847,7 @@ class Autotuner: ...@@ -908,10 +847,7 @@ class Autotuner:
tuning_micro_batch_sizes[-1] = prev_best_mbs tuning_micro_batch_sizes[-1] = prev_best_mbs
return tuning_micro_batch_sizes return tuning_micro_batch_sizes
def get_min_max_micro_batch_size(self, def get_min_max_micro_batch_size(self, stage, min_micro_batch_size, calculated_max_micro_batch_size):
stage,
min_micro_batch_size,
calculated_max_micro_batch_size):
# get min and max micro batch size with gradient accumulation steps = 1 # get min and max micro batch size with gradient accumulation steps = 1
if min_micro_batch_size > calculated_max_micro_batch_size: if min_micro_batch_size > calculated_max_micro_batch_size:
return -1, -1 return -1, -1
...@@ -927,8 +863,7 @@ class Autotuner: ...@@ -927,8 +863,7 @@ class Autotuner:
# search for the min micro batch size # search for the min micro batch size
if min_micro_batch_size < 1: if min_micro_batch_size < 1:
if TRAIN_MICRO_BATCH_SIZE_PER_GPU in self.user_config and isinstance( if TRAIN_MICRO_BATCH_SIZE_PER_GPU in self.user_config and isinstance(
self.user_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU], self.user_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU], int):
int):
# user specifies train_micro_batch_size_per_gpu as an int # user specifies train_micro_batch_size_per_gpu as an int
mbs = int(self.user_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU]) mbs = int(self.user_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU])
else: else:
...@@ -951,8 +886,7 @@ class Autotuner: ...@@ -951,8 +886,7 @@ class Autotuner:
min_micro_batch_size = mbs min_micro_batch_size = mbs
else: else:
self.update_records(tuning_space_name, exp, 0, 1) self.update_records(tuning_space_name, exp, 0, 1)
logger.info( logger.info(f"User-specified micro batch size per GPU {mbs} does not run")
f"User-specified micro batch size per GPU {mbs} does not run")
if self.min_train_micro_batch_size_per_gpu() == mbs: if self.min_train_micro_batch_size_per_gpu() == mbs:
return -1, -1 return -1, -1
mbs = self.min_train_micro_batch_size_per_gpu() mbs = self.min_train_micro_batch_size_per_gpu()
...@@ -964,8 +898,7 @@ class Autotuner: ...@@ -964,8 +898,7 @@ class Autotuner:
exp, metric_val = self.run_ds_config(ds_config, exp_name) exp, metric_val = self.run_ds_config(ds_config, exp_name)
if not metric_val: if not metric_val:
self.update_records(tuning_space_name, exp, 0, 1) self.update_records(tuning_space_name, exp, 0, 1)
logger.info( logger.info(f"min_train_micro_batch_size_per_gpu {mbs} is not runnable.")
f"min_train_micro_batch_size_per_gpu {mbs} is not runnable.")
return -1, -1 return -1, -1
self.update_records(tuning_space_name, exp, metric_val, 1) self.update_records(tuning_space_name, exp, metric_val, 1)
min_micro_batch_size = mbs min_micro_batch_size = mbs
...@@ -975,8 +908,7 @@ class Autotuner: ...@@ -975,8 +908,7 @@ class Autotuner:
ds_config[GRADIENT_ACCUMULATION_STEPS] = gas ds_config[GRADIENT_ACCUMULATION_STEPS] = gas
ds_config[TRAIN_BATCH_SIZE] = min_micro_batch_size * gas * \ ds_config[TRAIN_BATCH_SIZE] = min_micro_batch_size * gas * \
self.exp_num_gpus * self.exp_num_nodes // self.mp_size() self.exp_num_gpus * self.exp_num_nodes // self.mp_size()
exp_name = tuning_space_name + "_gas" + str(gas) + "_tmbspg" + str( exp_name = tuning_space_name + "_gas" + str(gas) + "_tmbspg" + str(min_micro_batch_size)
min_micro_batch_size)
exp, metric_val = self.run_ds_config(ds_config, exp_name) exp, metric_val = self.run_ds_config(ds_config, exp_name)
if metric_val: if metric_val:
self.update_records(tuning_space_name, exp, metric_val, 1) self.update_records(tuning_space_name, exp, metric_val, 1)
@@ -986,13 +918,8 @@ class Autotuner:
                 return -1, -1

         # search for the max micro batch size
-        max_micro_batch_size = min(calculated_max_micro_batch_size,
-                                   self.max_train_micro_batch_size_per_gpu())
-        for mbs in [
-                math.ceil(1.05 * max_micro_batch_size),
-                max_micro_batch_size,
-                int(0.95 * max_micro_batch_size)
-        ]:
+        max_micro_batch_size = min(calculated_max_micro_batch_size, self.max_train_micro_batch_size_per_gpu())
+        for mbs in [math.ceil(1.05 * max_micro_batch_size), max_micro_batch_size, int(0.95 * max_micro_batch_size)]:
             if mbs > self.max_train_micro_batch_size_per_gpu():
                 continue
             if mbs in used_micro_batch_sizes:
@@ -1011,12 +938,11 @@ class Autotuner:
             else:
                 self.update_records(tuning_space_name, exp, 0, 1)

-        space_records = self.records[
-            tuning_space_name] if tuning_space_name in self.records else []
+        space_records = self.records[tuning_space_name] if tuning_space_name in self.records else []
         if space_records:
             prev_idx = min(range(len(space_records)),
-                           key=lambda i: abs(space_records[i][0][DS_CONFIG][
-                               TRAIN_MICRO_BATCH_SIZE_PER_GPU] - min_micro_batch_size))
+                           key=lambda i: abs(space_records[i][0][DS_CONFIG][TRAIN_MICRO_BATCH_SIZE_PER_GPU] -
+                                             min_micro_batch_size))
             prev_metric_val = space_records[prev_idx][1]
         else:
             prev_metric_val = None
@@ -1037,8 +963,8 @@ class Autotuner:
                 low = mid + 1
             self.update_records(tuning_space_name, exp, metric_val, 1)
             used_micro_batch_sizes.append(mid)
-            if prev_metric_val and ((metric_val - prev_metric_val) /
-                                    prev_metric_val) < METRIC_PERCENT_DIFF_CONST:
+            if prev_metric_val and (
+                (metric_val - prev_metric_val) / prev_metric_val) < METRIC_PERCENT_DIFF_CONST:
                 logger.info(f"performance plateaus at mbs = {low}")
                 break
             prev_metric_val = metric_val
@@ -1049,9 +975,7 @@ class Autotuner:
                 low = mid + 1

         max_micro_batch_size = low - 1
-        logger.info(
-            f"min_micro_batch_size = {min_micro_batch_size}, max_micro_batch_size = {max_micro_batch_size}."
-        )
+        logger.info(f"min_micro_batch_size = {min_micro_batch_size}, max_micro_batch_size = {max_micro_batch_size}.")
         return min_micro_batch_size, max_micro_batch_size
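
Note: the hunks above only reflow logging, but they sit inside the plateau-guided search that this method implements. Below is a minimal sketch of that rule, assuming a caller-supplied measure(mbs) that returns a throughput-like metric (higher is better) or None when the size fails to run; it is an illustration, not the Autotuner API.

# Sketch only: binary-search the largest runnable micro batch size, stopping
# early once the metric gain between steps falls under the 5% plateau constant.
METRIC_PERCENT_DIFF_CONST = 0.05

def find_max_mbs(low, high, measure):
    prev = None
    best = low
    while low <= high:
        mid = (low + high) // 2
        val = measure(mid)
        if val is None:            # OOM or other failure: shrink the upper bound
            high = mid - 1
            continue
        best, low = mid, mid + 1   # mid is runnable; try something larger
        if prev and (val - prev) / prev < METRIC_PERCENT_DIFF_CONST:
            break                  # gain under 5%: performance plateaus
        prev = val
    return best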
@@ -1067,8 +991,7 @@ class Autotuner:
                     gas = int(val)
         elif isinstance(gas_in_config, list):
             logger.info(
-                f"Specifying a list of {GRADIENT_ACCUMULATION_STEPS} to tune is not supported. 1 would be used."
-            )
+                f"Specifying a list of {GRADIENT_ACCUMULATION_STEPS} to tune is not supported. 1 would be used.")
         assert gas > 0, "Gradient accumulation steps must be positive."
         return gas
@@ -1083,9 +1006,7 @@ class Autotuner:
                 return (user_args[idx + 1])
         return None

-    def get_tuning_micro_batch_size_list(self,
-                                         min_micro_batch_size,
-                                         max_micro_batch_size,
+    def get_tuning_micro_batch_size_list(self, min_micro_batch_size, max_micro_batch_size,
                                          num_tuning_micro_batch_sizes):
         """Get a list of micro batch sizes to tune based on min and max values, as well as the size of the list.

         Args:
@@ -1098,17 +1019,16 @@ class Autotuner:
         """
         if min_micro_batch_size <= 0 or max_micro_batch_size <= 0:
             logger.info(
-                f"min_micro_batch_size = {min_micro_batch_size}, max_micro_batch_size = {max_micro_batch_size}"
-            )
+                f"min_micro_batch_size = {min_micro_batch_size}, max_micro_batch_size = {max_micro_batch_size}")
             return [], 0

         # NUM_GPUS=$(( ${NUM_WORKERS} * ${NUM_GPUS_PER_WORKER} ))
         # DP_SIZE=$(( ${NUM_GPUS} / (${PP_SIZE} * ${MP_SIZE}) ))
         # GRAD_ACC_STEPS=$(( ${TARGET_GLOBAL_BATCH_SIZE} / (${BATCH_SIZE} * ${DP_SIZE}) ))
-        if self.max_train_batch_size() and self.max_train_batch_size(
-        ) > 0:  # if the user specifies a max_train_batch_size
-            max_train_batch_size_per_gpu = self.max_train_batch_size() * self.mp_size(
-            ) // (self.exp_num_gpus * self.exp_num_nodes)
+        if self.max_train_batch_size(
+        ) and self.max_train_batch_size() > 0:  # if the user specifies a max_train_batch_size
+            max_train_batch_size_per_gpu = self.max_train_batch_size() * self.mp_size() // (self.exp_num_gpus *
+                                                                                            self.exp_num_nodes)
         else:
             gas = self.get_gas_from_user_config()
             max_train_batch_size_per_gpu = max_micro_batch_size * gas // self.mp_size()
@@ -1117,15 +1037,14 @@ class Autotuner:
             min_micro_batch_size = max_micro_batch_size // 2

         # constant stride
-        stride = (max_micro_batch_size -
-                  min_micro_batch_size) // num_tuning_micro_batch_sizes
+        stride = (max_micro_batch_size - min_micro_batch_size) // num_tuning_micro_batch_sizes
         if stride == 0:
             stride = 1
         ls = []
         min_gas = max_train_batch_size_per_gpu // max_micro_batch_size
         # if gas is the same as min_gas, do not add mbs to the tuning list
         for mbs in range(min_micro_batch_size, max_micro_batch_size, stride):
-            if max_micro_batch_size // mbs != min_gas:
+            if max_train_batch_size_per_gpu // mbs != min_gas:
                 ls.append(mbs)
         ls.append(max_micro_batch_size)
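
Note: the last hunk is a behavior fix, not a reflow: the guard now divides max_train_batch_size_per_gpu, not max_micro_batch_size, by the candidate mbs, so a size is skipped only when it implies the same gradient-accumulation steps as the maximum size. A worked example with hypothetical numbers:

# Hypothetical values, for intuition only.
min_mbs, max_mbs = 4, 16
num_tuning_micro_batch_sizes = 3
max_train_batch_size_per_gpu = 32

stride = (max_mbs - min_mbs) // num_tuning_micro_batch_sizes or 1  # = 4
min_gas = max_train_batch_size_per_gpu // max_mbs                  # = 2
ls = [mbs for mbs in range(min_mbs, max_mbs, stride)
      if max_train_batch_size_per_gpu // mbs != min_gas]
ls.append(max_mbs)
print(ls)  # [4, 8, 16] -- 12 is skipped because 32 // 12 == min_gas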
@@ -1187,8 +1106,6 @@ class Autotuner:
             result = subprocess.Popen(self.optimal_cmd)
             result.wait()

-            logger.info(
-                f"Done running with the optimal DeepSpeed configuration using {self.optimal_cmd}"
-            )
+            logger.info(f"Done running with the optimal DeepSpeed configuration using {self.optimal_cmd}")
         else:
             logger.info(f"No optimal DeepSpeed configuration found by autotuning.")
-'''Copyright The Microsoft DeepSpeed Team'''
-"""
-Copyright (c) Microsoft Corporation
-Licensed under the MIT license.
-"""
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
 from deepspeed.runtime.config_utils import get_scalar_param, get_dict_param, DeepSpeedConfigObject
 from deepspeed.autotuning.constants import *

 class DeepSpeedAutotuningConfig(DeepSpeedConfigObject):
     def __init__(self, param_dict):
         super(DeepSpeedAutotuningConfig, self).__init__()
@@ -31,102 +31,65 @@ class DeepSpeedAutotuningConfig(DeepSpeedConfigObject):
         self._initialize(autotuning_dict)

     def _initialize(self, autotuning_dict):
-        self.enabled = get_scalar_param(autotuning_dict,
-                                        AUTOTUNING_ENABLED,
-                                        AUTOTUNING_ENABLED_DEFAULT)
+        self.enabled = get_scalar_param(autotuning_dict, AUTOTUNING_ENABLED, AUTOTUNING_ENABLED_DEFAULT)

-        self.fast = get_scalar_param(autotuning_dict,
-                                     AUTOTUNING_FAST,
-                                     AUTOTUNING_FAST_DEFAULT)
+        self.fast = get_scalar_param(autotuning_dict, AUTOTUNING_FAST, AUTOTUNING_FAST_DEFAULT)

-        self.results_dir = get_scalar_param(autotuning_dict,
-                                            AUTOTUNING_RESULTS_DIR,
-                                            AUTOTUNING_RESULTS_DIR_DEFAULT)
+        self.results_dir = get_scalar_param(autotuning_dict, AUTOTUNING_RESULTS_DIR, AUTOTUNING_RESULTS_DIR_DEFAULT)
         assert self.results_dir, "results_dir cannot be empty"
-        self.exps_dir = get_scalar_param(autotuning_dict,
-                                         AUTOTUNING_EXPS_DIR,
-                                         AUTOTUNING_EXPS_DIR_DEFAULT)
+        self.exps_dir = get_scalar_param(autotuning_dict, AUTOTUNING_EXPS_DIR, AUTOTUNING_EXPS_DIR_DEFAULT)
         assert self.exps_dir, "exps_dir cannot be empty"
-        self.overwrite = get_scalar_param(autotuning_dict,
-                                          AUTOTUNING_OVERWRITE,
-                                          AUTOTUNING_OVERWRITE_DEFAULT)
+        self.overwrite = get_scalar_param(autotuning_dict, AUTOTUNING_OVERWRITE, AUTOTUNING_OVERWRITE_DEFAULT)

-        self.start_profile_step = get_scalar_param(
-            autotuning_dict,
-            AUTOTUNING_START_PROFILE_STEP,
-            AUTOTUNING_START_PROFILE_STEP_DEFAULT)
+        self.start_profile_step = get_scalar_param(autotuning_dict, AUTOTUNING_START_PROFILE_STEP,
+                                                   AUTOTUNING_START_PROFILE_STEP_DEFAULT)

-        self.end_profile_step = get_scalar_param(autotuning_dict,
-                                                 AUTOTUNING_END_PROFILE_STEP,
-                                                 AUTOTUNING_END_PROFILE_STEP_DEFAULT)
+        self.end_profile_step = get_scalar_param(autotuning_dict, AUTOTUNING_END_PROFILE_STEP,
+                                                 AUTOTUNING_END_PROFILE_STEP_DEFAULT)

-        self.metric = get_scalar_param(autotuning_dict,
-                                       AUTOTUNING_METRIC,
-                                       AUTOTUNING_METRIC_DEFAULT)
+        self.metric = get_scalar_param(autotuning_dict, AUTOTUNING_METRIC, AUTOTUNING_METRIC_DEFAULT)

-        self.metric_path = get_scalar_param(autotuning_dict,
-                                            AUTOTUNING_METRIC_PATH,
-                                            AUTOTUNING_METRIC_PATH_DEFAULT)
+        self.metric_path = get_scalar_param(autotuning_dict, AUTOTUNING_METRIC_PATH, AUTOTUNING_METRIC_PATH_DEFAULT)

-        self.tuner_type = get_scalar_param(autotuning_dict,
-                                           AUTOTUNING_TUNER_TYPE,
-                                           AUTOTUNING_TUNER_TYPE_DEFAULT)
+        self.tuner_type = get_scalar_param(autotuning_dict, AUTOTUNING_TUNER_TYPE, AUTOTUNING_TUNER_TYPE_DEFAULT)

-        self.tuner_early_stopping = get_scalar_param(
-            autotuning_dict,
-            AUTOTUNING_TUNER_EARLY_STOPPING,
-            AUTOTUNING_TUNER_EARLY_STOPPING_DEFAULT)
+        self.tuner_early_stopping = get_scalar_param(autotuning_dict, AUTOTUNING_TUNER_EARLY_STOPPING,
+                                                     AUTOTUNING_TUNER_EARLY_STOPPING_DEFAULT)

-        self.tuner_num_trials = get_scalar_param(autotuning_dict,
-                                                 AUTOTUNING_TUNER_NUM_TRIALS,
-                                                 AUTOTUNING_TUNER_NUM_TRIALS_DEFAULT)
+        self.tuner_num_trials = get_scalar_param(autotuning_dict, AUTOTUNING_TUNER_NUM_TRIALS,
+                                                 AUTOTUNING_TUNER_NUM_TRIALS_DEFAULT)

-        self.arg_mappings = get_dict_param(autotuning_dict,
-                                           AUTOTUNING_ARG_MAPPINGS,
-                                           AUTOTUNING_ARG_MAPPINGS_DEFAULT)
+        self.arg_mappings = get_dict_param(autotuning_dict, AUTOTUNING_ARG_MAPPINGS, AUTOTUNING_ARG_MAPPINGS_DEFAULT)

         self.model_info = get_model_info_config(autotuning_dict)

-        self.model_info_path = get_scalar_param(autotuning_dict,
-                                                AUTOTUNING_MODEL_INFO_PATH,
-                                                AUTOTUNING_MODEL_INFO_PATH_DEFAULT)
+        self.model_info_path = get_scalar_param(autotuning_dict, AUTOTUNING_MODEL_INFO_PATH,
+                                                AUTOTUNING_MODEL_INFO_PATH_DEFAULT)

-        self.mp_size = get_scalar_param(autotuning_dict,
-                                        AUTOTUNING_MP_SIZE,
-                                        AUTOTUNING_MP_SIZE_DEFAULT)
+        self.mp_size = get_scalar_param(autotuning_dict, AUTOTUNING_MP_SIZE, AUTOTUNING_MP_SIZE_DEFAULT)

-        self.max_train_batch_size = get_dict_param(
-            autotuning_dict,
-            AUTOTUNING_MAX_TRAIN_BATCH_SIZE,
-            AUTOTUNING_MAX_TRAIN_BATCH_SIZE_DEFAULT)
+        self.max_train_batch_size = get_dict_param(autotuning_dict, AUTOTUNING_MAX_TRAIN_BATCH_SIZE,
+                                                   AUTOTUNING_MAX_TRAIN_BATCH_SIZE_DEFAULT)

-        self.min_train_batch_size = get_dict_param(
-            autotuning_dict,
-            AUTOTUNING_MIN_TRAIN_BATCH_SIZE,
-            AUTOTUNING_MIN_TRAIN_BATCH_SIZE_DEFAULT)
+        self.min_train_batch_size = get_dict_param(autotuning_dict, AUTOTUNING_MIN_TRAIN_BATCH_SIZE,
+                                                   AUTOTUNING_MIN_TRAIN_BATCH_SIZE_DEFAULT)

         self.max_train_micro_batch_size_per_gpu = get_dict_param(
-            autotuning_dict,
-            AUTOTUNING_MAX_TRAIN_MICRO_BATCH_SIZE_PER_GPU,
+            autotuning_dict, AUTOTUNING_MAX_TRAIN_MICRO_BATCH_SIZE_PER_GPU,
             AUTOTUNING_MAX_TRAIN_MICRO_BATCH_SIZE_PER_GPU_DEFAULT)

         self.min_train_micro_batch_size_per_gpu = get_dict_param(
-            autotuning_dict,
-            AUTOTUNING_MIN_TRAIN_MICRO_BATCH_SIZE_PER_GPU,
+            autotuning_dict, AUTOTUNING_MIN_TRAIN_MICRO_BATCH_SIZE_PER_GPU,
             AUTOTUNING_MIN_TRAIN_MICRO_BATCH_SIZE_PER_GPU_DEFAULT)

-        self.num_tuning_micro_batch_sizes = get_dict_param(
-            autotuning_dict,
-            AUTOTUNING_NUM_TUNING_MICRO_BATCH_SIZES,
-            AUTOTUNING_NUM_TUNING_MICRO_BATCH_SIZES_DEFAULT)
+        self.num_tuning_micro_batch_sizes = get_dict_param(autotuning_dict, AUTOTUNING_NUM_TUNING_MICRO_BATCH_SIZES,
+                                                           AUTOTUNING_NUM_TUNING_MICRO_BATCH_SIZES_DEFAULT)
 def get_model_info_config(param_dict):
     if MODEL_INFO in param_dict and param_dict[MODEL_INFO] is not None:
         model_info_config = {}
         for key, default_value in MODEL_INFO_KEY_DEFAULT_DICT.items():
-            model_info_config[key] = get_scalar_param(param_dict[MODEL_INFO],
-                                                      key,
-                                                      default_value)
+            model_info_config[key] = get_scalar_param(param_dict[MODEL_INFO], key, default_value)
         return model_info_config
     return None
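
Note: for reference, a minimal user config that this class parses might look as follows. The values are illustrative only; the key names follow the AUTOTUNING_* constants imported above.

# Illustrative sketch -- values are made up, key names mirror the constants above.
ds_config = {
    "autotuning": {
        "enabled": True,
        "fast": True,
        "results_dir": "autotuning_results",
        "exps_dir": "autotuning_exps",
        "metric": "throughput",
        "tuner_type": "model_based",
        "tuner_early_stopping": 5,
        "tuner_num_trials": 50,
        "mp_size": 1,
        "arg_mappings": {
            "train_micro_batch_size_per_gpu": "--per_device_train_batch_size"
        }
    }
}
config = DeepSpeedAutotuningConfig(ds_config)
assert config.enabled and config.metric == "throughput"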
...
-'''Copyright The Microsoft DeepSpeed Team'''
-"""
-Copyright (c) Microsoft Corporation
-Licensed under the MIT license.
-"""
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team

 #########################################
 # autotunner implementation constants
@@ -10,17 +9,13 @@ Licensed under the MIT license.
 import os

-DEFAULT_TEMPLATE_PATH_ZERO_0 = os.path.join(os.path.dirname(os.path.realpath(__file__)),
-                                            "config_templates",
+DEFAULT_TEMPLATE_PATH_ZERO_0 = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config_templates",
                                             "template_zero0.json")
-DEFAULT_TEMPLATE_PATH_ZERO_1 = os.path.join(os.path.dirname(os.path.realpath(__file__)),
-                                            "config_templates",
+DEFAULT_TEMPLATE_PATH_ZERO_1 = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config_templates",
                                             "template_zero1.json")
-DEFAULT_TEMPLATE_PATH_ZERO_2 = os.path.join(os.path.dirname(os.path.realpath(__file__)),
-                                            "config_templates",
+DEFAULT_TEMPLATE_PATH_ZERO_2 = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config_templates",
                                             "template_zero2.json")
-DEFAULT_TEMPLATE_PATH_ZERO_3 = os.path.join(os.path.dirname(os.path.realpath(__file__)),
-                                            "config_templates",
+DEFAULT_TEMPLATE_PATH_ZERO_3 = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config_templates",
                                             "template_zero3.json")

 METRIC_PERCENT_DIFF_CONST = 0.05
@@ -157,50 +152,31 @@ DEFAULT_TUNING_SPACE_ZERO_0 = {"zero_optimization": {"stage": 0}}
 DEFAULT_TUNING_SPACE_ZERO_1 = {
     "zero_optimization": {
         "stage": 1,
-        "reduce_bucket_size": [5e7,
-                               5e8,
-                               1e9],
-        "allgather_bucket_size": [5e7,
-                                  5e8,
-                                  1e9],
+        "reduce_bucket_size": [5e7, 5e8, 1e9],
+        "allgather_bucket_size": [5e7, 5e8, 1e9],
     }
 }
 DEFAULT_TUNING_SPACE_ZERO_2 = {
     "zero_optimization": {
         "stage": 2,
-        "overlap_comm": [True,
-                         False],
-        "reduce_scatter": [False,
-                           True],
-        "reduce_bucket_size": [5e7,
-                               5e8,
-                               1e9],
-        "allgather_bucket_size": [5e7,
-                                  5e8,
-                                  1e9],
-        "contiguous_gradients": [False,
-                                 True]
+        "overlap_comm": [True, False],
+        "reduce_scatter": [False, True],
+        "reduce_bucket_size": [5e7, 5e8, 1e9],
+        "allgather_bucket_size": [5e7, 5e8, 1e9],
+        "contiguous_gradients": [False, True]
     },
 }
 DEFAULT_TUNING_SPACE_ZERO_3 = {
     "zero_optimization": {
         "stage": 3,
-        "overlap_comm": [True,
-                         False],
-        "reduce_scatter": [False,
-                           True],
-        "reduce_bucket_size": [5e7,
-                               5e8,
-                               1e9],
-        "allgather_partitions": [True,
-                                 False],
-        "allgather_bucket_size": [5e7,
-                                  5e8,
-                                  1e9],
-        "contiguous_gradients": [False,
-                                 True]
+        "overlap_comm": [True, False],
+        "reduce_scatter": [False, True],
+        "reduce_bucket_size": [5e7, 5e8, 1e9],
+        "allgather_partitions": [True, False],
+        "allgather_bucket_size": [5e7, 5e8, 1e9],
+        "contiguous_gradients": [False, True]
     },
 }
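
Note: collapsing the option lists makes the grid sizes easy to read off; the ZeRO-2 space above enumerates 2 x 2 x 3 x 3 x 2 = 72 combinations. A quick sanity check, using only the standard library:

import math

# Option counts per tunable key in DEFAULT_TUNING_SPACE_ZERO_2 above.
zero2_options = {"overlap_comm": 2, "reduce_scatter": 2, "reduce_bucket_size": 3,
                 "allgather_bucket_size": 3, "contiguous_gradients": 2}
print(math.prod(zero2_options.values()))  # 72 candidate ZeRO-2 configurations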
...
-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team

 import copy
@@ -28,13 +31,8 @@ TIMEOUT = 5
 class ResourceManager:

-    def __init__(self,
-                 args,
-                 hosts,
-                 num_gpus_per_node,
-                 results_dir,
-                 exps_dir,
-                 arg_mappings):
+    def __init__(self, args, hosts, num_gpus_per_node, results_dir, exps_dir, arg_mappings):
         self.results_dir = results_dir
         self.exps_dir = exps_dir
@@ -69,13 +67,10 @@ class ResourceManager:
             exp["exp_id"] = self.experiment_count
             self.experiment_count += 1

-            result_dir = exp["result_dir"] = os.path.join(
-                self.results_dir,
-                exp['name'])
+            result_dir = exp["result_dir"] = os.path.join(self.results_dir, exp['name'])
             if AUTOTUNING in exp["ds_config"]:
                 metric_file = os.path.join(result_dir, "metrics.json")
-                exp["ds_config"][AUTOTUNING][
-                    AUTOTUNING_METRIC_PATH] = metric_file
+                exp["ds_config"][AUTOTUNING][AUTOTUNING_METRIC_PATH] = metric_file
                 stderr_file = os.path.join(result_dir, "stderr.log")
                 model_info_file = os.path.join(result_dir, "model_info.json")
                 metric_file = os.path.join(result_dir, "metrics.json")
@@ -86,11 +81,8 @@ class ResourceManager:
                 err = search_error(stderr_file)
                 exp_id = exp["exp_id"]
                 self.finished_experiments[exp_id] = (exp, err)
-                if err or os.path.exists(metric_file) or os.path.exists(
-                        model_info_file):
-                    logger.info(
-                        f"Skipping exp {exp['name']} whose result already exists"
-                    )
+                if err or os.path.exists(metric_file) or os.path.exists(model_info_file):
+                    logger.info(f"Skipping exp {exp['name']} whose result already exists")
                     continue

             self.experiment_queue.append(exp)
@@ -113,11 +105,7 @@ class ResourceManager:
                     user_args.append(val)
                     user_args.append(str(nval))

-            t = threading.Thread(target=run_experiment,
-                                 args=(exp,
-                                       reservations,
-                                       user_script,
-                                       user_args))
+            t = threading.Thread(target=run_experiment, args=(exp, reservations, user_script, user_args))
             t.start()
             self.running_experiments[exp_id] = (t, exp, reservations, time.time())
@@ -270,6 +258,7 @@ class ResourceManager:
 class Node:
+
     def __init__(self, host, max_slots):
         self.host = host
         self.max_slots = max_slots
@@ -284,6 +273,7 @@ class Node:
 class Reservation:
+
     def __init__(self, node, slots):
         self.node = node
         self.slots = slots
@@ -389,9 +379,8 @@ def run_experiment(exp: dict, reservations, user_script, user_args):
         f"Launching exp_id = {exp['exp_id']}, exp_name = {exp['name']}, with resource = {include_str}, and ds_config = {os.path.abspath(ds_config_path)}"
     )

-    with open(os.path.join(exp_dir, "stdout.log"), "wb") as out, open(
-        os.path.join(exp_dir, "stderr.log"), "wb"
-    ) as err:
+    with open(os.path.join(exp_dir, "stdout.log"), "wb") as out, open(os.path.join(exp_dir, "stderr.log"),
+                                                                      "wb") as err:
         result = subprocess.Popen(cmd, stdout=out, stderr=err)
         result.wait()
         out.flush()
@@ -401,9 +390,7 @@ def run_experiment(exp: dict, reservations, user_script, user_args):
     clean_up(exp, reservations)

-    logger.info(
-        f"Done running exp_id = {exp['exp_id']}, exp_name = {exp['name']}, with resource = {include_str}"
-    )
+    logger.info(f"Done running exp_id = {exp['exp_id']}, exp_name = {exp['name']}, with resource = {include_str}")


 PDSH_MAX_FAN_OUT = 1024
@@ -417,8 +404,7 @@ def clean_up(exp: dict, reservations):
     for reservation in reservations:
         nodes_str += f"{reservation.node.host},"
     nodes_str = nodes_str[:-1]
-    logger.debug(
-        f"Cleaning up exp_id = {exp['exp_id']} on the following workers: {nodes_str}")
+    logger.debug(f"Cleaning up exp_id = {exp['exp_id']} on the following workers: {nodes_str}")

     # PDSH flags for max node fan out and specific hosts to launch on
     # See https://linux.die.net/man/1/pdsh for flag details
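
Note: the pdsh flags referenced above are -f (maximum fan-out, here PDSH_MAX_FAN_OUT) and -w (comma-separated target hosts). A hypothetical illustration of how such a cleanup command could be assembled; this is not the exact command this function builds:

import subprocess

nodes_str = "worker-0,worker-1"          # hypothetical host list
cmd = ["pdsh", "-f", str(1024), "-w", nodes_str, "pkill", "-f", "exp_name"]
# subprocess.run(cmd, check=False)       # commented out: needs pdsh and reachable hosts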
@@ -441,6 +427,4 @@ def clean_up(exp: dict, reservations):
     if result.returncode > 0:
         sys.exit(result.returncode)

-    logger.info(
-        f"Done cleaning up exp_id = {exp['exp_id']} on the following workers: {nodes_str}"
-    )
+    logger.info(f"Done cleaning up exp_id = {exp['exp_id']} on the following workers: {nodes_str}")
-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team

 from .index_based_tuner import RandomTuner, GridSearchTuner
 # from .ga_tuner import GATuner
...
-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team

 import sys
@@ -8,6 +11,7 @@ from deepspeed.utils import logger
 class BaseTuner:
+
     def __init__(self, exps, resource_manager, metric):
         self.all_exps = exps
         self.rm = resource_manager
@@ -42,8 +46,8 @@ class BaseTuner:
             self.rm.schedule_experiments(exp_paths)
             self.rm.run()
             exp, metric_val = self.rm.parse_results(self.metric)
-            if self.best_exp == None or self.best_metric_val == None or (
-                    metric_val and metric_val > self.best_metric_val):
+            if self.best_exp == None or self.best_metric_val == None or (metric_val
+                                                                         and metric_val > self.best_metric_val):
                 # logger.info(f"tuner finds better = {exp}")
                 self.best_exp = exp
                 self.best_metric_val = metric_val
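
Note: the hunk above only reflows the best-so-far check, but for orientation, the loop it lives in schedules a batch of experiments, runs them, and keeps the best result. A condensed sketch, with early stopping omitted and write_experiments assumed to be the helper from the autotuning utils:

# Simplified sketch of the tune loop, not the exact BaseTuner.tune body.
def tune_sketch(tuner, sample_size=1, n_trials=10):
    for _ in range(n_trials):
        sampled_exps = tuner.next_batch(sample_size)   # strategy-specific: random/grid/model-based
        if not sampled_exps:
            break
        exp_paths = write_experiments(sampled_exps, tuner.rm.exps_dir)
        tuner.rm.schedule_experiments(exp_paths)
        tuner.rm.run()
        exp, metric_val = tuner.rm.parse_results(tuner.metric)
        if tuner.best_metric_val is None or (metric_val and metric_val > tuner.best_metric_val):
            tuner.best_exp, tuner.best_metric_val = exp, metric_val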
...
-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team

 from .utils import *
@@ -9,6 +12,7 @@ except ImportError:
 class XGBoostCostModel():
+
     def __init__(self, loss_type, num_threads=None, log_interval=25, upper_model=None):
         assert xgb is not None, "missing requirements, please install deepspeed w. 'autotuning_ml' extra."
...
-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team

 import random
@@ -7,6 +10,7 @@ from .base_tuner import BaseTuner
 class RandomTuner(BaseTuner):
     """Explore the search space in random order"""
+
     def __init__(self, exps: list, resource_manager, metric):
         super().__init__(exps, resource_manager, metric)
@@ -22,6 +26,7 @@ class RandomTuner(BaseTuner):
 class GridSearchTuner(BaseTuner):
     """Explore the search space in sequential order"""
+
     def __init__(self, exps: list, resource_manager, metric):
         super().__init__(exps, resource_manager, metric)
...
-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team

 import hjson
@@ -15,6 +18,7 @@ INIT_NUM = 2
 class ModelBasedTuner(BaseTuner):
     """Exploring the search space with a cost model"""
+
     def __init__(self, exps: list, resource_manager, metric, tuning_sapce):
         super().__init__(exps, resource_manager, metric)
         self.tuning_space = tuning_sapce
@@ -25,8 +29,7 @@ class ModelBasedTuner(BaseTuner):
         self.dims = dict_to_dims(self.tuning_space)

-        logger.info(
-            f"Create config dim: {self.dims}, all configs: {self.num_all_configs}")
+        logger.info(f"Create config dim: {self.dims}, all configs: {self.num_all_configs}")

         self.visited = set([])
@@ -71,9 +74,7 @@ class ModelBasedTuner(BaseTuner):
         n = len(estimates)
         top_idx = np.argsort(estimates)
-        top_idx_ret = top_idx if self.metric == AUTOTUNING_METRIC_LATENCY else top_idx[::
-            -1][:
-                n]
+        top_idx_ret = top_idx if self.metric == AUTOTUNING_METRIC_LATENCY else top_idx[::-1][:n]

         # top_configs = [self.all_configs[i] for i in top_idx]
@@ -145,9 +146,7 @@ class ModelBasedTuner(BaseTuner):
             self.evaluated_configs.append(feature_val)
             self.evaluated_perf.append(curr_iter)

-            logger.debug(
-                f"**Evaluated configs: {len(self.evaluated_configs)}, evaluated perf: {self.evaluated_perf}"
-            )
+            logger.debug(f"**Evaluated configs: {len(self.evaluated_configs)}, evaluated perf: {self.evaluated_perf}")

             self.cost_model.fit(self.evaluated_configs, self.evaluated_perf)
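
Note: the reflowed top_idx_ret line above encodes the ranking direction: the ascending argsort order is already best-first for latency (lower is better), while throughput-style metrics need the reversed order. For example:

import numpy as np

estimates = np.array([3.2, 1.1, 2.7])
print(np.argsort(estimates))        # [1 2 0] -- best first when lower is better (latency)
print(np.argsort(estimates)[::-1])  # [0 2 1] -- best first when higher is better (throughput)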
...
-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team

 import numpy as np
 import itertools
@@ -44,9 +47,7 @@ def gen_combinations(d: dict):
         for v in values:
             if not isinstance(v, list):
                 v = [v]
-        values_choices = (gen_combinations(v) if isinstance(v,
-                                                            dict) else get_list(v)
-                          for v in values)
+        values_choices = (gen_combinations(v) if isinstance(v, dict) else get_list(v) for v in values)
         for comb in itertools.product(*values_choices):
             yield dict(zip(keys, comb))
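
Note: a quick usage example of gen_combinations on a small ZeRO-style space, assuming get_list from this module wraps scalars into single-element lists as used above:

space = {
    "zero_optimization": {
        "stage": 1,
        "reduce_bucket_size": [5e7, 5e8],
        "allgather_bucket_size": [5e7, 5e8],
    }
}
configs = list(gen_combinations(space))
print(len(configs))  # 4: the Cartesian product of the two bucket-size lists
print(configs[0])
# {'zero_optimization': {'stage': 1, 'reduce_bucket_size': 50000000.0, 'allgather_bucket_size': 50000000.0}}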
...