chenpangpang / transformers · Commits

Commit 5e8c8eb5 (Unverified)
Authored Feb 22, 2023 by Aaron Gokaslan; committed by GitHub on Feb 22, 2023
Apply ruff flake8-comprehensions (#21694)
Parent: df06fb1f
Changes: 230 files in this commit
Showing 20 changed files on this page, with 158 additions and 137 deletions (+158, -137)
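The rewrites in this commit follow ruff's flake8-comprehensions rules: prefer dict/list literals and comprehensions over calls to dict()/list(), and drop redundant wrappers such as list(...) around sorted(...). As orientation before the per-file diffs, here is a minimal, illustrative before/after sketch in plain Python (not code taken from the commit); each assert shows that the rewritten form is equivalent to the original:

# Left-hand sides: forms flagged by flake8-comprehensions.
# Right-hand sides: the literal/comprehension forms this commit switches to.
assert dict(lr=1e-3, eps=1e-8) == {"lr": 1e-3, "eps": 1e-8}                         # dict() call -> dict literal
assert dict((k, len(k)) for k in ("a", "bb")) == {k: len(k) for k in ("a", "bb")}   # generator into dict() -> dict comprehension
assert [x for x in range(3)] == list(range(3))                                      # copying comprehension -> list()
assert list(sorted({3, 1, 2})) == sorted({3, 1, 2}) == [1, 2, 3]                    # redundant list() around sorted()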
src/transformers/models/xmod/convert_xmod_original_pytorch_checkpoint_to_pytorch.py (+1, -1)
src/transformers/models/xmod/modeling_xmod.py (+1, -1)
src/transformers/models/yolos/image_processing_yolos.py (+1, -1)
src/transformers/onnx/convert.py (+1, -1)
src/transformers/optimization.py (+12, -12)
src/transformers/optimization_tf.py (+2, -2)
src/transformers/pipelines/base.py (+1, -1)
src/transformers/pipelines/question_answering.py (+1, -1)
src/transformers/tokenization_utils.py (+4, -4)
src/transformers/tokenization_utils_base.py (+21, -21)
src/transformers/tokenization_utils_fast.py (+1, -1)
src/transformers/trainer.py (+3, -3)
src/transformers/trainer_pt_utils.py (+1, -1)
src/transformers/trainer_utils.py (+12, -12)
src/transformers/training_args.py (+1, -1)
src/transformers/utils/doc.py (+13, -13)
src/transformers/utils/hp_naming.py (+6, -6)
src/transformers/utils/hub.py (+1, -1)
src/transformers/utils/model_parallel_utils.py (+1, -1)
tests/deepspeed/test_deepspeed.py (+74, -53)
src/transformers/models/xmod/convert_xmod_original_pytorch_checkpoint_to_pytorch.py (view file @ 5e8c8eb5)

@@ -142,7 +142,7 @@ def convert_xmod_checkpoint_to_pytorch(
         bert_output.adapter_layer_norm.weight = xmod_layer.adapter_layer_norm.weight
         bert_output.adapter_layer_norm.bias = xmod_layer.adapter_layer_norm.bias
-        if list(sorted(bert_output.adapter_modules.keys())) != list(sorted(xmod_layer.adapter_modules.keys())):
+        if sorted(bert_output.adapter_modules.keys()) != sorted(xmod_layer.adapter_modules.keys()):
             raise AssertionError("Lists of language adapters do not match.")
         for lang_code, adapter in xmod_layer.adapter_modules.items():
             to_adapter = bert_output.adapter_modules[lang_code]
src/transformers/models/xmod/modeling_xmod.py (view file @ 5e8c8eb5)

@@ -395,7 +395,7 @@ class XmodOutput(nn.Module):
         else:
             self.adapter_layer_norm = None
         self.adapter_reuse_layer_norm = config.adapter_reuse_layer_norm
-        self.adapter_modules = nn.ModuleDict(dict())
+        self.adapter_modules = nn.ModuleDict({})
         for language in config.languages:
             self.adapter_modules[str(language)] = XmodAdapter(config)
src/transformers/models/yolos/image_processing_yolos.py (view file @ 5e8c8eb5)

@@ -515,7 +515,7 @@ def binary_mask_to_rle(mask):
     pixels = np.concatenate([[0], pixels, [0]])
     runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
     runs[1::2] -= runs[::2]
-    return [x for x in runs]
+    return list(runs)


 # Copied from transformers.models.detr.image_processing_detr.convert_segmentation_to_rle
src/transformers/onnx/convert.py (view file @ 5e8c8eb5)

@@ -145,7 +145,7 @@ def export_pytorch(
         device = torch.device(device)
         if device.type == "cuda" and torch.cuda.is_available():
             model.to(device)
-            model_inputs_device = dict()
+            model_inputs_device = {}
             for k, v in model_inputs.items():
                 if isinstance(v, Tuple):
                     model_inputs_device[k] = tuple(
src/transformers/optimization.py (view file @ 5e8c8eb5)

@@ -358,7 +358,7 @@ class AdamW(Optimizer):
             raise ValueError(f"Invalid beta parameter: {betas[1]} - should be in [0.0, 1.0)")
         if not 0.0 <= eps:
             raise ValueError(f"Invalid epsilon value: {eps} - should be >= 0.0")
-        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, correct_bias=correct_bias)
+        defaults = {"lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay, "correct_bias": correct_bias}
         super().__init__(params, defaults)

     def step(self, closure: Callable = None):

@@ -527,17 +527,17 @@ class Adafactor(Optimizer):
         if warmup_init and not relative_step:
             raise ValueError("`warmup_init=True` requires `relative_step=True`")

-        defaults = dict(
-            lr=lr,
-            eps=eps,
-            clip_threshold=clip_threshold,
-            decay_rate=decay_rate,
-            beta1=beta1,
-            weight_decay=weight_decay,
-            scale_parameter=scale_parameter,
-            relative_step=relative_step,
-            warmup_init=warmup_init,
-        )
+        defaults = {
+            "lr": lr,
+            "eps": eps,
+            "clip_threshold": clip_threshold,
+            "decay_rate": decay_rate,
+            "beta1": beta1,
+            "weight_decay": weight_decay,
+            "scale_parameter": scale_parameter,
+            "relative_step": relative_step,
+            "warmup_init": warmup_init,
+        }
         super().__init__(params, defaults)

     @staticmethod
src/transformers/optimization_tf.py (view file @ 5e8c8eb5)

@@ -262,7 +262,7 @@ class AdamWeightDecay(Adam):
             coefficients = self._fallback_apply_state(var_device, var_dtype)
             apply_state[(var_device, var_dtype)] = coefficients

-        return coefficients["lr_t"], dict(apply_state=apply_state)
+        return coefficients["lr_t"], {"apply_state": apply_state}

     def _resource_apply_dense(self, grad, var, apply_state=None):
         lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)

@@ -333,7 +333,7 @@ class GradientAccumulator(object):
         """The accumulated gradients on the current replica."""
         if not self._gradients:
             raise ValueError("The accumulator should be called first to initialize the gradients")
-        return list(gradient.value() if gradient is not None else gradient for gradient in self._gradients)
+        return [gradient.value() if gradient is not None else gradient for gradient in self._gradients]

     def __call__(self, gradients):
         """Accumulates `gradients` on the current replica."""
src/transformers/pipelines/base.py (view file @ 5e8c8eb5)

@@ -1083,7 +1083,7 @@ class Pipeline(_ScikitCompat):
             final_iterator = self.get_iterator(
                 inputs, num_workers, batch_size, preprocess_params, forward_params, postprocess_params
             )
-            outputs = [output for output in final_iterator]
+            outputs = list(final_iterator)
             return outputs
         else:
             return self.run_multi(inputs, preprocess_params, forward_params, postprocess_params)
src/transformers/pipelines/question_answering.py (view file @ 5e8c8eb5)

@@ -210,7 +210,7 @@ class QuestionAnsweringArgumentHandler(ArgumentHandler):
             inputs = [inputs]
         elif isinstance(inputs, Iterable):
             # Copy to avoid overriding arguments
-            inputs = [i for i in inputs]
+            inputs = list(inputs)
         else:
             raise ValueError(f"Invalid arguments {kwargs}")
src/transformers/tokenization_utils.py (view file @ 5e8c8eb5)

@@ -425,7 +425,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
                 if self.verbose:
                     logger.info(f"Adding {token} to the vocabulary")

-        added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(tokens_to_add))
+        added_tok_encoder = {tok: len(self) + i for i, tok in enumerate(tokens_to_add)}
         added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
         self.added_tokens_encoder.update(added_tok_encoder)
         self.added_tokens_decoder.update(added_tok_decoder)

@@ -495,9 +495,9 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
             `List[str]`: The list of tokens.
         """
         # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors
-        all_special_tokens_extended = dict(
-            (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
-        )
+        all_special_tokens_extended = {
+            str(t): t for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
+        }

         text, kwargs = self.prepare_for_tokenization(text, **kwargs)
src/transformers/tokenization_utils_base.py (view file @ 5e8c8eb5)

@@ -1918,7 +1918,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
                 obj.pop("__type")
                 return AddedToken(**obj)
             elif isinstance(obj, (list, tuple)):
-                return list(convert_added_tokens(o) for o in obj)
+                return [convert_added_tokens(o) for o in obj]
             elif isinstance(obj, dict):
                 return {k: convert_added_tokens(v) for k, v in obj.items()}
             return obj

@@ -1992,7 +1992,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
                 added_tok_encoder = json.load(added_tokens_handle)

             # Sort added tokens by index
-            added_tok_encoder_sorted = list(sorted(added_tok_encoder.items(), key=lambda x: x[1]))
+            added_tok_encoder_sorted = sorted(added_tok_encoder.items(), key=lambda x: x[1])

             # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for
             # individual tokens would repeatedly rebuild a trie, which can be slow.

@@ -2129,7 +2129,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
                 out["__type"] = "AddedToken"
                 return out
             elif isinstance(obj, (list, tuple)):
-                return list(convert_added_tokens(o, add_type_field=add_type_field) for o in obj)
+                return [convert_added_tokens(o, add_type_field=add_type_field) for o in obj]
             elif isinstance(obj, dict):
                 return {k: convert_added_tokens(v, add_type_field=add_type_field) for k, v in obj.items()}
             return obj

@@ -2502,23 +2502,23 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
             you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
         """
         # To avoid duplicating
-        all_kwargs = dict(
-            add_special_tokens=add_special_tokens,
-            padding=padding,
-            truncation=truncation,
-            max_length=max_length,
-            stride=stride,
-            is_split_into_words=is_split_into_words,
-            pad_to_multiple_of=pad_to_multiple_of,
-            return_tensors=return_tensors,
-            return_token_type_ids=return_token_type_ids,
-            return_attention_mask=return_attention_mask,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_offsets_mapping=return_offsets_mapping,
-            return_length=return_length,
-            verbose=verbose,
-        )
+        all_kwargs = {
+            "add_special_tokens": add_special_tokens,
+            "padding": padding,
+            "truncation": truncation,
+            "max_length": max_length,
+            "stride": stride,
+            "is_split_into_words": is_split_into_words,
+            "pad_to_multiple_of": pad_to_multiple_of,
+            "return_tensors": return_tensors,
+            "return_token_type_ids": return_token_type_ids,
+            "return_attention_mask": return_attention_mask,
+            "return_overflowing_tokens": return_overflowing_tokens,
+            "return_special_tokens_mask": return_special_tokens_mask,
+            "return_offsets_mapping": return_offsets_mapping,
+            "return_length": return_length,
+            "verbose": verbose,
+        }
         all_kwargs.update(kwargs)
         if text is None and text_target is None:
             raise ValueError("You need to specify either `text` or `text_target`.")

@@ -3010,7 +3010,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         batch_outputs = {}
         for i in range(batch_size):
-            inputs = dict((k, v[i]) for k, v in encoded_inputs.items())
+            inputs = {k: v[i] for k, v in encoded_inputs.items()}

             outputs = self._pad(
                 inputs,
                 max_length=max_length,
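A recurring pattern in the tokenization diffs above (and again in trainer_pt_utils.py and hub.py below) is dropping a redundant list(...) wrapper around sorted(...). This is behavior-preserving because sorted() always returns a new list, whatever iterable it is given; a small standalone check (illustrative, not code from the commit):

added_tokens = {"<b>": 7, "<a>": 5}
by_index = sorted(added_tokens.items(), key=lambda x: x[1])
assert by_index == list(sorted(added_tokens.items(), key=lambda x: x[1]))  # the list() wrapper adds nothing
assert isinstance(by_index, list) and by_index == [("<a>", 5), ("<b>", 7)]  # sorted() already returns a new list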
src/transformers/tokenization_utils_fast.py (view file @ 5e8c8eb5)

@@ -162,7 +162,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         """
         base_vocab = self._tokenizer.get_vocab(with_added_tokens=False)
         full_vocab = self._tokenizer.get_vocab(with_added_tokens=True)
-        added_vocab = dict((tok, index) for tok, index in full_vocab.items() if tok not in base_vocab)
+        added_vocab = {tok: index for tok, index in full_vocab.items() if tok not in base_vocab}
         return added_vocab

     def __len__(self) -> int:
src/transformers/trainer.py (view file @ 5e8c8eb5)

@@ -1081,7 +1081,7 @@ class Trainer:
                 skipped = 0
                 for module in opt_model.modules():
                     if isinstance(module, nn.Embedding):
-                        skipped += sum(dict((p.data_ptr(), p.numel()) for p in module.parameters()).values())
+                        skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values())
                         print(f"skipped {module}: {skipped/2**20}M params")
                         manager.register_module_override(module, "weight", {"optim_bits": 32})
                         logger.debug(f"bitsandbytes: will optimize {module} in fp32")

@@ -2564,12 +2564,12 @@ class Trainer:
         elif isinstance(data, (tuple, list)):
             return type(data)(self._prepare_input(v) for v in data)
         elif isinstance(data, torch.Tensor):
-            kwargs = dict(device=self.args.device)
+            kwargs = {"device": self.args.device}
             if self.deepspeed and data.dtype != torch.int64:
                 # NLP models inputs are int64 and those get adjusted to the right dtype of the
                 # embedding. Other models such as wav2vec2's inputs are already float and thus
                 # may need special handling to match the dtypes of the model
-                kwargs.update(dict(dtype=self.args.hf_deepspeed_config.dtype()))
+                kwargs.update({"dtype": self.args.hf_deepspeed_config.dtype()})
             return data.to(**kwargs)
         return data
src/transformers/trainer_pt_utils.py (view file @ 5e8c8eb5)

@@ -534,7 +534,7 @@ def get_length_grouped_indices(lengths, batch_size, mega_batch_mult=None, genera
     indices = torch.randperm(len(lengths), generator=generator)
     megabatch_size = mega_batch_mult * batch_size
     megabatches = [indices[i : i + megabatch_size].tolist() for i in range(0, len(lengths), megabatch_size)]
-    megabatches = [list(sorted(megabatch, key=lambda i: lengths[i], reverse=True)) for megabatch in megabatches]
+    megabatches = [sorted(megabatch, key=lambda i: lengths[i], reverse=True) for megabatch in megabatches]

     # The rest is to get the biggest batch first.
     # Since each megabatch is sorted by descending length, the longest element is the first
src/transformers/trainer_utils.py (view file @ 5e8c8eb5)

@@ -505,21 +505,21 @@ class TrainerMemoryTracker:
         if self.torch is not None:
             self.gpu_mem_used_now = self.torch.cuda.memory_allocated()
             self.gpu_mem_used_peak = self.torch.cuda.max_memory_allocated()
-            self.gpu[self.cur_stage] = dict(
-                begin=self.gpu_mem_used_at_start,
-                end=self.gpu_mem_used_now,
-                alloc=(self.gpu_mem_used_now - self.gpu_mem_used_at_start),
-                peaked=max(0, self.gpu_mem_used_peak - self.gpu_mem_used_now),
-            )
+            self.gpu[self.cur_stage] = {
+                "begin": self.gpu_mem_used_at_start,
+                "end": self.gpu_mem_used_now,
+                "alloc": (self.gpu_mem_used_now - self.gpu_mem_used_at_start),
+                "peaked": max(0, self.gpu_mem_used_peak - self.gpu_mem_used_now),
+            }

         # cpu
         self.cpu_mem_used_now = self.cpu_mem_used()
-        self.cpu[self.cur_stage] = dict(
-            begin=self.cpu_mem_used_at_start,
-            end=self.cpu_mem_used_now,
-            alloc=(self.cpu_mem_used_now - self.cpu_mem_used_at_start),
-            peaked=max(0, self.cpu_mem_used_peak - self.cpu_mem_used_now),
-        )
+        self.cpu[self.cur_stage] = {
+            "begin": self.cpu_mem_used_at_start,
+            "end": self.cpu_mem_used_now,
+            "alloc": (self.cpu_mem_used_now - self.cpu_mem_used_at_start),
+            "peaked": max(0, self.cpu_mem_used_peak - self.cpu_mem_used_now),
+        }

         # reset - cycle finished
         self.cur_stage = None
src/transformers/training_args.py (view file @ 5e8c8eb5)

@@ -1874,7 +1874,7 @@ class TrainingArguments:
        the token values by removing their value.
        """
        # filter out fields that are defined as field(init=False)
-        d = dict((field.name, getattr(self, field.name)) for field in fields(self) if field.init)
+        d = {field.name: getattr(self, field.name) for field in fields(self) if field.init}

        for k, v in d.items():
            if isinstance(v, Enum):
src/transformers/utils/doc.py (view file @ 5e8c8eb5)

@@ -1085,19 +1085,19 @@ def add_code_sample_docstrings(
         # putting all kwargs for docstrings in a dict to be used
         # with the `.format(**doc_kwargs)`. Note that string might
         # be formatted with non-existing keys, which is fine.
-        doc_kwargs = dict(
-            model_class=model_class,
-            processor_class=processor_class,
-            checkpoint=checkpoint,
-            mask=mask,
-            qa_target_start_index=qa_target_start_index,
-            qa_target_end_index=qa_target_end_index,
-            expected_output=expected_output,
-            expected_loss=expected_loss,
-            real_checkpoint=real_checkpoint,
-            fake_checkpoint=checkpoint,
-            true="{true}",  # For <Tip warning={true}> syntax that conflicts with formatting.
-        )
+        doc_kwargs = {
+            "model_class": model_class,
+            "processor_class": processor_class,
+            "checkpoint": checkpoint,
+            "mask": mask,
+            "qa_target_start_index": qa_target_start_index,
+            "qa_target_end_index": qa_target_end_index,
+            "expected_output": expected_output,
+            "expected_loss": expected_loss,
+            "real_checkpoint": real_checkpoint,
+            "fake_checkpoint": checkpoint,
+            "true": "{true}",  # For <Tip warning={true}> syntax that conflicts with formatting.
+        }

         if ("SequenceClassification" in model_class or "AudioClassification" in model_class) and modality == "audio":
             code_sample = sample_docstrings["AudioClassification"]
src/transformers/utils/hp_naming.py (view file @ 5e8c8eb5)

@@ -96,12 +96,12 @@ class TrialShortNamer:
         if cls.NAMING_INFO is not None:
             return

-        info = dict(
-            short_word={},
-            reverse_short_word={},
-            short_param={},
-            reverse_short_param={},
-        )
+        info = {
+            "short_word": {},
+            "reverse_short_word": {},
+            "short_param": {},
+            "reverse_short_param": {},
+        }

         field_keys = list(cls.DEFAULTS.keys())
src/transformers/utils/hub.py (view file @ 5e8c8eb5)

@@ -902,7 +902,7 @@ def get_checkpoint_shard_files(
     with open(index_filename, "r") as f:
         index = json.loads(f.read())

-    shard_filenames = sorted(list(set(index["weight_map"].values())))
+    shard_filenames = sorted(set(index["weight_map"].values()))
     sharded_metadata = index["metadata"]
     sharded_metadata["all_checkpoint_keys"] = list(index["weight_map"].keys())
     sharded_metadata["weight_map"] = index["weight_map"].copy()
src/transformers/utils/model_parallel_utils.py (view file @ 5e8c8eb5)

@@ -51,6 +51,6 @@ def get_device_map(n_layers, devices):
     """Returns a dictionary of layers distributed evenly across all devices."""
     layers = list(range(n_layers))
     n_blocks = int(ceil(n_layers / len(devices)))
-    layers_list = list(layers[i : i + n_blocks] for i in range(0, n_layers, n_blocks))
+    layers_list = [layers[i : i + n_blocks] for i in range(0, n_layers, n_blocks)]

     return dict(zip(devices, layers_list))
tests/deepspeed/test_deepspeed.py (view file @ 5e8c8eb5)

@@ -157,9 +157,13 @@ class CoreIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
         super().setUp()

         master_port = get_master_port(real_launcher=False)
-        self.dist_env_1_gpu = dict(
-            MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
-        )
+        self.dist_env_1_gpu = {
+            "MASTER_ADDR": "localhost",
+            "MASTER_PORT": master_port,
+            "RANK": "0",
+            "LOCAL_RANK": "0",
+            "WORLD_SIZE": "1",
+        }

     def tearDown(self):
         super().tearDown()

@@ -212,14 +216,18 @@ class TrainerIntegrationDeepSpeedWithCustomConfig(TestCasePlus):
         self.batch_size = args.train_batch_size

         master_port = get_master_port(real_launcher=False)
-        self.dist_env_1_gpu = dict(
-            MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
-        )
+        self.dist_env_1_gpu = {
+            "MASTER_ADDR": "localhost",
+            "MASTER_PORT": master_port,
+            "RANK": "0",
+            "LOCAL_RANK": "0",
+            "WORLD_SIZE": "1",
+        }

-        self.ds_config_file = dict(
-            zero2=f"{self.test_file_dir_str}/ds_config_zero2.json",
-            zero3=f"{self.test_file_dir_str}/ds_config_zero3.json",
-        )
+        self.ds_config_file = {
+            "zero2": f"{self.test_file_dir_str}/ds_config_zero2.json",
+            "zero3": f"{self.test_file_dir_str}/ds_config_zero3.json",
+        }

         # use self.get_config_dict(stage) to use these to ensure the original is not modified
         with io.open(self.ds_config_file[ZERO2], "r", encoding="utf-8") as f:

@@ -230,10 +238,10 @@ class TrainerIntegrationDeepSpeedWithCustomConfig(TestCasePlus):
         # It's in the file as a demo for users since we want everything to work out of the box even if slower.
         config_zero3["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = False

-        self.ds_config_dict = dict(
-            zero2=config_zero2,
-            zero3=config_zero3,
-        )
+        self.ds_config_dict = {
+            "zero2": config_zero2,
+            "zero3": config_zero3,
+        }

     def tearDown(self):
         super().tearDown()

@@ -370,7 +378,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         # this actually doesn't have to be on NVMe, any storage will do since this test only
         # runs a simple check that we can use some directory as if it were NVMe
         nvme_path = self.get_auto_remove_tmp_dir()
-        nvme_config = dict(device="nvme", nvme_path=nvme_path)
+        nvme_config = {"device": "nvme", "nvme_path": nvme_path}
         ds_config_zero3_dict = self.get_config_dict(ZERO3)
         ds_config_zero3_dict["zero_optimization"]["offload_optimizer"] = nvme_config
         ds_config_zero3_dict["zero_optimization"]["offload_param"] = nvme_config

@@ -415,7 +423,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         # force cpu offload
         ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
         with mockenv_context(**self.dist_env_1_gpu):
-            kwargs = dict(local_rank=0, deepspeed=ds_config_dict)
+            kwargs = {"local_rank": 0, "deepspeed": ds_config_dict}
             kwargs[dtype] = True
             trainer = get_regression_trainer(**kwargs)
             with CaptureLogger(deepspeed_logger) as cl:

@@ -431,7 +439,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         # it's run not as a first test as `sys.stdout` will no longer be the same. So we either have
         # to reset `deepspeed_logger.handlers[0].setStream(sys.stdout)` or directly capture from the deepspeed_logger.
         with mockenv_context(**self.dist_env_1_gpu):
-            kwargs = dict(local_rank=0, deepspeed=self.get_config_dict(stage))
+            kwargs = {"local_rank": 0, "deepspeed": self.get_config_dict(stage)}
             kwargs[dtype] = True
             trainer = get_regression_trainer(**kwargs)

@@ -449,15 +457,15 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         # `self.lr_scheduler.get_last_lr()` and originally it'd fail on the very first step.
         with mockenv_context(**self.dist_env_1_gpu):
             a = b = 0.0
-            kwargs = dict(
-                a=a,
-                b=b,
-                local_rank=0,
-                train_len=8,
-                deepspeed=self.get_config_dict(stage),
-                per_device_train_batch_size=8,
-                logging_steps=1,
-            )
+            kwargs = {
+                "a": a,
+                "b": b,
+                "local_rank": 0,
+                "train_len": 8,
+                "deepspeed": self.get_config_dict(stage),
+                "per_device_train_batch_size": 8,
+                "logging_steps": 1,
+            }
             kwargs[dtype] = True
             trainer = get_regression_trainer(**kwargs)

@@ -494,13 +502,13 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         train_len = 64
         a = b = 0.0
-        kwargs = dict(
-            a=a,
-            b=b,
-            local_rank=0,
-            train_len=train_len,
-            deepspeed=self.get_config_dict(stage),
-        )
+        kwargs = {
+            "a": a,
+            "b": b,
+            "local_rank": 0,
+            "train_len": train_len,
+            "deepspeed": self.get_config_dict(stage),
+        }
         kwargs[dtype] = True

         with mockenv_context(**self.dist_env_1_gpu):

@@ -583,11 +591,11 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         # save checkpoints
         with mockenv_context(**self.dist_env_1_gpu):
-            kwargs = dict(
-                output_dir=output_dir,
-                save_steps=freq,
-                deepspeed=ds_config_dict,
-            )
+            kwargs = {
+                "output_dir": output_dir,
+                "save_steps": freq,
+                "deepspeed": ds_config_dict,
+            }
             kwargs[dtype] = True
             trainer = get_regression_trainer(**kwargs)
             trainer.train()

@@ -600,7 +608,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         with mockenv_context(**self.dist_env_1_gpu):
             ds_config_dict = self.get_config_dict(stage)
             output_dir = self.get_auto_remove_tmp_dir()
-            kwargs = dict(output_dir=output_dir, deepspeed=ds_config_dict)
+            kwargs = {"output_dir": output_dir, "deepspeed": ds_config_dict}
             kwargs[dtype] = True
             trainer = get_regression_trainer(**kwargs)

@@ -632,7 +640,13 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         if stage == ZERO3:
             ds_config_dict["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = True

-        kwargs = dict(output_dir=output_dir, train_len=128, save_steps=5, learning_rate=0.1, deepspeed=ds_config_dict)
+        kwargs = {
+            "output_dir": output_dir,
+            "train_len": 128,
+            "save_steps": 5,
+            "learning_rate": 0.1,
+            "deepspeed": ds_config_dict,
+        }
         kwargs[dtype] = True

         with mockenv_context(**self.dist_env_1_gpu):

@@ -679,16 +693,16 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         ds_config_dict = self.get_config_dict(stage)

-        kwargs = dict(
-            output_dir=output_dir,
-            train_len=4,
-            per_device_train_batch_size=4,
-            num_train_epochs=1,
-            save_strategy="steps",
-            save_steps=1,
-            learning_rate=0.1,
-            deepspeed=ds_config_dict,
-        )
+        kwargs = {
+            "output_dir": output_dir,
+            "train_len": 4,
+            "per_device_train_batch_size": 4,
+            "num_train_epochs": 1,
+            "save_strategy": "steps",
+            "save_steps": 1,
+            "learning_rate": 0.1,
+            "deepspeed": ds_config_dict,
+        }
         kwargs[dtype] = True

         with mockenv_context(**self.dist_env_1_gpu):

@@ -710,7 +724,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         # test that we can switch from zero2 to zero3 in the same process for example
         # test is_zero, etc.
         output_dir = self.get_auto_remove_tmp_dir()
-        kwargs = dict(output_dir=output_dir, train_len=8, fp16=True)
+        kwargs = {"output_dir": output_dir, "train_len": 8, "fp16": True}

         ds_config_zero3_dict = self.get_config_dict(ZERO3)
         ds_config_zero2_dict = self.get_config_dict(ZERO2)

@@ -808,7 +822,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         def get_dataset():
             data_file = str(self.tests_dir / "fixtures/tests_samples/SQUAD/sample.json")
-            data_files = dict(train=data_file, validation=data_file)
+            data_files = {"train": data_file, "validation": data_file}
             raw_datasets = datasets.load_dataset("json", data_files=data_files, field="data")
             train_dataset = raw_datasets["train"].map(_add_eos_to_examples).map(_convert_to_features, batched=True)
             valid_dataset = deepcopy(train_dataset)

@@ -903,7 +917,14 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
         do_train = True
         do_eval = False
-        kwargs = dict(stage=stage, dtype=dtype, eval_steps=1, distributed=True, do_train=do_train, do_eval=do_eval)
+        kwargs = {
+            "stage": stage,
+            "dtype": dtype,
+            "eval_steps": 1,
+            "distributed": True,
+            "do_train": do_train,
+            "do_eval": do_eval,
+        }

         # 1. normal training
         output_dir = self.run_and_check(**kwargs)
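The test changes above also show the one practical difference between the two spellings: dict() with keyword arguments only accepts keys that are valid Python identifiers, while a dict literal takes any string key, which is one reason the literal form is the more general spelling and the one flake8-comprehensions prefers. A small illustrative sketch (not code from the commit):

env = dict(MASTER_ADDR="localhost", WORLD_SIZE="1")
assert env == {"MASTER_ADDR": "localhost", "WORLD_SIZE": "1"}  # identical result either way

# Only the literal form can spell keys that are not valid identifiers:
headers = {"Content-Type": "application/json"}  # dict(Content-Type=...) would be a SyntaxError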