Unverified Commit 5e8c8eb5 authored by Aaron Gokaslan, committed by GitHub

Apply ruff flake8-comprehensions (#21694)

parent df06fb1f
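
This commit applies the automatic fixes from ruff's flake8-comprehensions rules (the C4 family) across the code base: generator expressions passed to `list()`/`dict()` become comprehensions or literals, `dict(k=v)` calls become dict literals, and redundant `list()` wrappers around `sorted()` or inside `sorted(set(...))` are dropped. Behavior is unchanged in every hunk below. As a minimal sketch of the patterns involved (the rule-code comments and the sample variables are illustrative assumptions, not taken from this diff):

```python
keys = ["lr", "eps", "betas"]
values = [3, 1, 2, 3]

# Unnecessary generator passed to list()  ->  prefer list(values) or a comprehension.
old_list = list(x for x in values)        # flagged (C400-style)
new_list = list(values)

# dict() called with keyword arguments  ->  prefer a dict literal.
old_opts = dict(lr=1e-3, eps=1e-8)        # flagged (C408-style)
new_opts = {"lr": 1e-3, "eps": 1e-8}

# dict() over a generator of key/value pairs  ->  prefer a dict comprehension.
old_index = dict((k, i) for i, k in enumerate(keys))   # flagged (C402-style)
new_index = {k: i for i, k in enumerate(keys)}

# Redundant list() wrapped around sorted(), or around set() inside sorted().
old_sorted = list(sorted(values))         # flagged (C413-style)
new_sorted = sorted(values)
old_unique = sorted(list(set(values)))    # flagged (C414-style)
new_unique = sorted(set(values))

assert old_list == new_list and old_opts == new_opts and old_index == new_index
assert old_sorted == new_sorted and old_unique == new_unique
```

To reproduce this kind of cleanup locally, something along the lines of `ruff check --select C4 --fix .` should apply the same rewrites, assuming a ruff version that supports selecting the `C4` rule group; the exact ruff configuration used for this PR is not shown on this page.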
@@ -142,7 +142,7 @@ def convert_xmod_checkpoint_to_pytorch(
         bert_output.adapter_layer_norm.weight = xmod_layer.adapter_layer_norm.weight
         bert_output.adapter_layer_norm.bias = xmod_layer.adapter_layer_norm.bias
-        if list(sorted(bert_output.adapter_modules.keys())) != list(sorted(xmod_layer.adapter_modules.keys())):
+        if sorted(bert_output.adapter_modules.keys()) != sorted(xmod_layer.adapter_modules.keys()):
             raise AssertionError("Lists of language adapters do not match.")
         for lang_code, adapter in xmod_layer.adapter_modules.items():
             to_adapter = bert_output.adapter_modules[lang_code]
...
@@ -395,7 +395,7 @@ class XmodOutput(nn.Module):
         else:
             self.adapter_layer_norm = None
         self.adapter_reuse_layer_norm = config.adapter_reuse_layer_norm
-        self.adapter_modules = nn.ModuleDict(dict())
+        self.adapter_modules = nn.ModuleDict({})
         for language in config.languages:
             self.adapter_modules[str(language)] = XmodAdapter(config)
...
@@ -515,7 +515,7 @@ def binary_mask_to_rle(mask):
     pixels = np.concatenate([[0], pixels, [0]])
     runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
     runs[1::2] -= runs[::2]
-    return [x for x in runs]
+    return list(runs)

 # Copied from transformers.models.detr.image_processing_detr.convert_segmentation_to_rle
...
@@ -145,7 +145,7 @@ def export_pytorch(
         device = torch.device(device)
         if device.type == "cuda" and torch.cuda.is_available():
             model.to(device)
-            model_inputs_device = dict()
+            model_inputs_device = {}
             for k, v in model_inputs.items():
                 if isinstance(v, Tuple):
                     model_inputs_device[k] = tuple(
...
@@ -358,7 +358,7 @@ class AdamW(Optimizer):
             raise ValueError(f"Invalid beta parameter: {betas[1]} - should be in [0.0, 1.0)")
         if not 0.0 <= eps:
             raise ValueError(f"Invalid epsilon value: {eps} - should be >= 0.0")
-        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, correct_bias=correct_bias)
+        defaults = {"lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay, "correct_bias": correct_bias}
         super().__init__(params, defaults)

     def step(self, closure: Callable = None):
@@ -527,17 +527,17 @@ class Adafactor(Optimizer):
         if warmup_init and not relative_step:
             raise ValueError("`warmup_init=True` requires `relative_step=True`")
-        defaults = dict(
-            lr=lr,
-            eps=eps,
-            clip_threshold=clip_threshold,
-            decay_rate=decay_rate,
-            beta1=beta1,
-            weight_decay=weight_decay,
-            scale_parameter=scale_parameter,
-            relative_step=relative_step,
-            warmup_init=warmup_init,
-        )
+        defaults = {
+            "lr": lr,
+            "eps": eps,
+            "clip_threshold": clip_threshold,
+            "decay_rate": decay_rate,
+            "beta1": beta1,
+            "weight_decay": weight_decay,
+            "scale_parameter": scale_parameter,
+            "relative_step": relative_step,
+            "warmup_init": warmup_init,
+        }
         super().__init__(params, defaults)

     @staticmethod
...
@@ -262,7 +262,7 @@ class AdamWeightDecay(Adam):
             coefficients = self._fallback_apply_state(var_device, var_dtype)
             apply_state[(var_device, var_dtype)] = coefficients

-        return coefficients["lr_t"], dict(apply_state=apply_state)
+        return coefficients["lr_t"], {"apply_state": apply_state}

     def _resource_apply_dense(self, grad, var, apply_state=None):
         lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
@@ -333,7 +333,7 @@ class GradientAccumulator(object):
         """The accumulated gradients on the current replica."""
         if not self._gradients:
             raise ValueError("The accumulator should be called first to initialize the gradients")
-        return list(gradient.value() if gradient is not None else gradient for gradient in self._gradients)
+        return [gradient.value() if gradient is not None else gradient for gradient in self._gradients]

     def __call__(self, gradients):
         """Accumulates `gradients` on the current replica."""
...
@@ -1083,7 +1083,7 @@ class Pipeline(_ScikitCompat):
             final_iterator = self.get_iterator(
                 inputs, num_workers, batch_size, preprocess_params, forward_params, postprocess_params
             )
-            outputs = [output for output in final_iterator]
+            outputs = list(final_iterator)
             return outputs
         else:
             return self.run_multi(inputs, preprocess_params, forward_params, postprocess_params)
...
@@ -210,7 +210,7 @@ class QuestionAnsweringArgumentHandler(ArgumentHandler):
                 inputs = [inputs]
             elif isinstance(inputs, Iterable):
                 # Copy to avoid overriding arguments
-                inputs = [i for i in inputs]
+                inputs = list(inputs)
             else:
                 raise ValueError(f"Invalid arguments {kwargs}")
...
@@ -425,7 +425,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
                 if self.verbose:
                     logger.info(f"Adding {token} to the vocabulary")

-        added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(tokens_to_add))
+        added_tok_encoder = {tok: len(self) + i for i, tok in enumerate(tokens_to_add)}
         added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
         self.added_tokens_encoder.update(added_tok_encoder)
         self.added_tokens_decoder.update(added_tok_decoder)
@@ -495,9 +495,9 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
             `List[str]`: The list of tokens.
         """
         # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors
-        all_special_tokens_extended = dict(
-            (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
-        )
+        all_special_tokens_extended = {
+            str(t): t for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
+        }

         text, kwargs = self.prepare_for_tokenization(text, **kwargs)
...
@@ -1918,7 +1918,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
                 obj.pop("__type")
                 return AddedToken(**obj)
             elif isinstance(obj, (list, tuple)):
-                return list(convert_added_tokens(o) for o in obj)
+                return [convert_added_tokens(o) for o in obj]
             elif isinstance(obj, dict):
                 return {k: convert_added_tokens(v) for k, v in obj.items()}
             return obj
@@ -1992,7 +1992,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
                 added_tok_encoder = json.load(added_tokens_handle)

             # Sort added tokens by index
-            added_tok_encoder_sorted = list(sorted(added_tok_encoder.items(), key=lambda x: x[1]))
+            added_tok_encoder_sorted = sorted(added_tok_encoder.items(), key=lambda x: x[1])

             # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for
             # individual tokens would repeatedly rebuild a trie, which can be slow.
@@ -2129,7 +2129,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
                 out["__type"] = "AddedToken"
                 return out
             elif isinstance(obj, (list, tuple)):
-                return list(convert_added_tokens(o, add_type_field=add_type_field) for o in obj)
+                return [convert_added_tokens(o, add_type_field=add_type_field) for o in obj]
             elif isinstance(obj, dict):
                 return {k: convert_added_tokens(v, add_type_field=add_type_field) for k, v in obj.items()}
             return obj
@@ -2502,23 +2502,23 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
             you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
         """
         # To avoid duplicating
-        all_kwargs = dict(
-            add_special_tokens=add_special_tokens,
-            padding=padding,
-            truncation=truncation,
-            max_length=max_length,
-            stride=stride,
-            is_split_into_words=is_split_into_words,
-            pad_to_multiple_of=pad_to_multiple_of,
-            return_tensors=return_tensors,
-            return_token_type_ids=return_token_type_ids,
-            return_attention_mask=return_attention_mask,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_offsets_mapping=return_offsets_mapping,
-            return_length=return_length,
-            verbose=verbose,
-        )
+        all_kwargs = {
+            "add_special_tokens": add_special_tokens,
+            "padding": padding,
+            "truncation": truncation,
+            "max_length": max_length,
+            "stride": stride,
+            "is_split_into_words": is_split_into_words,
+            "pad_to_multiple_of": pad_to_multiple_of,
+            "return_tensors": return_tensors,
+            "return_token_type_ids": return_token_type_ids,
+            "return_attention_mask": return_attention_mask,
+            "return_overflowing_tokens": return_overflowing_tokens,
+            "return_special_tokens_mask": return_special_tokens_mask,
+            "return_offsets_mapping": return_offsets_mapping,
+            "return_length": return_length,
+            "verbose": verbose,
+        }
         all_kwargs.update(kwargs)
         if text is None and text_target is None:
             raise ValueError("You need to specify either `text` or `text_target`.")
@@ -3010,7 +3010,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         batch_outputs = {}
         for i in range(batch_size):
-            inputs = dict((k, v[i]) for k, v in encoded_inputs.items())
+            inputs = {k: v[i] for k, v in encoded_inputs.items()}
             outputs = self._pad(
                 inputs,
                 max_length=max_length,
...
@@ -162,7 +162,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         """
         base_vocab = self._tokenizer.get_vocab(with_added_tokens=False)
         full_vocab = self._tokenizer.get_vocab(with_added_tokens=True)
-        added_vocab = dict((tok, index) for tok, index in full_vocab.items() if tok not in base_vocab)
+        added_vocab = {tok: index for tok, index in full_vocab.items() if tok not in base_vocab}
         return added_vocab

     def __len__(self) -> int:
...
@@ -1081,7 +1081,7 @@ class Trainer:
                 skipped = 0
                 for module in opt_model.modules():
                     if isinstance(module, nn.Embedding):
-                        skipped += sum(dict((p.data_ptr(), p.numel()) for p in module.parameters()).values())
+                        skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values())
                         print(f"skipped {module}: {skipped/2**20}M params")
                         manager.register_module_override(module, "weight", {"optim_bits": 32})
                         logger.debug(f"bitsandbytes: will optimize {module} in fp32")
@@ -2564,12 +2564,12 @@ class Trainer:
         elif isinstance(data, (tuple, list)):
             return type(data)(self._prepare_input(v) for v in data)
         elif isinstance(data, torch.Tensor):
-            kwargs = dict(device=self.args.device)
+            kwargs = {"device": self.args.device}
             if self.deepspeed and data.dtype != torch.int64:
                 # NLP models inputs are int64 and those get adjusted to the right dtype of the
                 # embedding. Other models such as wav2vec2's inputs are already float and thus
                 # may need special handling to match the dtypes of the model
-                kwargs.update(dict(dtype=self.args.hf_deepspeed_config.dtype()))
+                kwargs.update({"dtype": self.args.hf_deepspeed_config.dtype()})
             return data.to(**kwargs)
         return data
...
@@ -534,7 +534,7 @@ def get_length_grouped_indices(lengths, batch_size, mega_batch_mult=None, genera
     indices = torch.randperm(len(lengths), generator=generator)
     megabatch_size = mega_batch_mult * batch_size
     megabatches = [indices[i : i + megabatch_size].tolist() for i in range(0, len(lengths), megabatch_size)]
-    megabatches = [list(sorted(megabatch, key=lambda i: lengths[i], reverse=True)) for megabatch in megabatches]
+    megabatches = [sorted(megabatch, key=lambda i: lengths[i], reverse=True) for megabatch in megabatches]

     # The rest is to get the biggest batch first.
     # Since each megabatch is sorted by descending length, the longest element is the first
...
@@ -505,21 +505,21 @@ class TrainerMemoryTracker:
         if self.torch is not None:
             self.gpu_mem_used_now = self.torch.cuda.memory_allocated()
             self.gpu_mem_used_peak = self.torch.cuda.max_memory_allocated()
-            self.gpu[self.cur_stage] = dict(
-                begin=self.gpu_mem_used_at_start,
-                end=self.gpu_mem_used_now,
-                alloc=(self.gpu_mem_used_now - self.gpu_mem_used_at_start),
-                peaked=max(0, self.gpu_mem_used_peak - self.gpu_mem_used_now),
-            )
+            self.gpu[self.cur_stage] = {
+                "begin": self.gpu_mem_used_at_start,
+                "end": self.gpu_mem_used_now,
+                "alloc": (self.gpu_mem_used_now - self.gpu_mem_used_at_start),
+                "peaked": max(0, self.gpu_mem_used_peak - self.gpu_mem_used_now),
+            }

         # cpu
         self.cpu_mem_used_now = self.cpu_mem_used()
-        self.cpu[self.cur_stage] = dict(
-            begin=self.cpu_mem_used_at_start,
-            end=self.cpu_mem_used_now,
-            alloc=(self.cpu_mem_used_now - self.cpu_mem_used_at_start),
-            peaked=max(0, self.cpu_mem_used_peak - self.cpu_mem_used_now),
-        )
+        self.cpu[self.cur_stage] = {
+            "begin": self.cpu_mem_used_at_start,
+            "end": self.cpu_mem_used_now,
+            "alloc": (self.cpu_mem_used_now - self.cpu_mem_used_at_start),
+            "peaked": max(0, self.cpu_mem_used_peak - self.cpu_mem_used_now),
+        }

         # reset - cycle finished
         self.cur_stage = None
...
@@ -1874,7 +1874,7 @@ class TrainingArguments:
         the token values by removing their value.
         """
         # filter out fields that are defined as field(init=False)
-        d = dict((field.name, getattr(self, field.name)) for field in fields(self) if field.init)
+        d = {field.name: getattr(self, field.name) for field in fields(self) if field.init}
         for k, v in d.items():
             if isinstance(v, Enum):
...
@@ -1085,19 +1085,19 @@ def add_code_sample_docstrings(
         # putting all kwargs for docstrings in a dict to be used
         # with the `.format(**doc_kwargs)`. Note that string might
         # be formatted with non-existing keys, which is fine.
-        doc_kwargs = dict(
-            model_class=model_class,
-            processor_class=processor_class,
-            checkpoint=checkpoint,
-            mask=mask,
-            qa_target_start_index=qa_target_start_index,
-            qa_target_end_index=qa_target_end_index,
-            expected_output=expected_output,
-            expected_loss=expected_loss,
-            real_checkpoint=real_checkpoint,
-            fake_checkpoint=checkpoint,
-            true="{true}",  # For <Tip warning={true}> syntax that conflicts with formatting.
-        )
+        doc_kwargs = {
+            "model_class": model_class,
+            "processor_class": processor_class,
+            "checkpoint": checkpoint,
+            "mask": mask,
+            "qa_target_start_index": qa_target_start_index,
+            "qa_target_end_index": qa_target_end_index,
+            "expected_output": expected_output,
+            "expected_loss": expected_loss,
+            "real_checkpoint": real_checkpoint,
+            "fake_checkpoint": checkpoint,
+            "true": "{true}",  # For <Tip warning={true}> syntax that conflicts with formatting.
+        }

         if ("SequenceClassification" in model_class or "AudioClassification" in model_class) and modality == "audio":
             code_sample = sample_docstrings["AudioClassification"]
...
@@ -96,12 +96,12 @@ class TrialShortNamer:
         if cls.NAMING_INFO is not None:
             return

-        info = dict(
-            short_word={},
-            reverse_short_word={},
-            short_param={},
-            reverse_short_param={},
-        )
+        info = {
+            "short_word": {},
+            "reverse_short_word": {},
+            "short_param": {},
+            "reverse_short_param": {},
+        }

         field_keys = list(cls.DEFAULTS.keys())
...
@@ -902,7 +902,7 @@ def get_checkpoint_shard_files(
     with open(index_filename, "r") as f:
         index = json.loads(f.read())

-    shard_filenames = sorted(list(set(index["weight_map"].values())))
+    shard_filenames = sorted(set(index["weight_map"].values()))
     sharded_metadata = index["metadata"]
     sharded_metadata["all_checkpoint_keys"] = list(index["weight_map"].keys())
     sharded_metadata["weight_map"] = index["weight_map"].copy()
...
@@ -51,6 +51,6 @@ def get_device_map(n_layers, devices):
     """Returns a dictionary of layers distributed evenly across all devices."""
     layers = list(range(n_layers))
     n_blocks = int(ceil(n_layers / len(devices)))
-    layers_list = list(layers[i : i + n_blocks] for i in range(0, n_layers, n_blocks))
+    layers_list = [layers[i : i + n_blocks] for i in range(0, n_layers, n_blocks)]

     return dict(zip(devices, layers_list))
@@ -157,9 +157,13 @@ class CoreIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
         super().setUp()

         master_port = get_master_port(real_launcher=False)
-        self.dist_env_1_gpu = dict(
-            MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
-        )
+        self.dist_env_1_gpu = {
+            "MASTER_ADDR": "localhost",
+            "MASTER_PORT": master_port,
+            "RANK": "0",
+            "LOCAL_RANK": "0",
+            "WORLD_SIZE": "1",
+        }

     def tearDown(self):
         super().tearDown()
...
@@ -212,14 +216,18 @@ class TrainerIntegrationDeepSpeedWithCustomConfig(TestCasePlus):
         self.batch_size = args.train_batch_size

         master_port = get_master_port(real_launcher=False)
-        self.dist_env_1_gpu = dict(
-            MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
-        )
+        self.dist_env_1_gpu = {
+            "MASTER_ADDR": "localhost",
+            "MASTER_PORT": master_port,
+            "RANK": "0",
+            "LOCAL_RANK": "0",
+            "WORLD_SIZE": "1",
+        }

-        self.ds_config_file = dict(
-            zero2=f"{self.test_file_dir_str}/ds_config_zero2.json",
-            zero3=f"{self.test_file_dir_str}/ds_config_zero3.json",
-        )
+        self.ds_config_file = {
+            "zero2": f"{self.test_file_dir_str}/ds_config_zero2.json",
+            "zero3": f"{self.test_file_dir_str}/ds_config_zero3.json",
+        }

         # use self.get_config_dict(stage) to use these to ensure the original is not modified
         with io.open(self.ds_config_file[ZERO2], "r", encoding="utf-8") as f:
@@ -230,10 +238,10 @@ class TrainerIntegrationDeepSpeedWithCustomConfig(TestCasePlus):
         # It's in the file as a demo for users since we want everything to work out of the box even if slower.
         config_zero3["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = False

-        self.ds_config_dict = dict(
-            zero2=config_zero2,
-            zero3=config_zero3,
-        )
+        self.ds_config_dict = {
+            "zero2": config_zero2,
+            "zero3": config_zero3,
+        }

     def tearDown(self):
         super().tearDown()
@@ -370,7 +378,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         # this actually doesn't have to be on NVMe, any storage will do since this test only
         # runs a simple check that we can use some directory as if it were NVMe
         nvme_path = self.get_auto_remove_tmp_dir()
-        nvme_config = dict(device="nvme", nvme_path=nvme_path)
+        nvme_config = {"device": "nvme", "nvme_path": nvme_path}
         ds_config_zero3_dict = self.get_config_dict(ZERO3)
         ds_config_zero3_dict["zero_optimization"]["offload_optimizer"] = nvme_config
         ds_config_zero3_dict["zero_optimization"]["offload_param"] = nvme_config
@@ -415,7 +423,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         # force cpu offload
         ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
         with mockenv_context(**self.dist_env_1_gpu):
-            kwargs = dict(local_rank=0, deepspeed=ds_config_dict)
+            kwargs = {"local_rank": 0, "deepspeed": ds_config_dict}
             kwargs[dtype] = True
             trainer = get_regression_trainer(**kwargs)
             with CaptureLogger(deepspeed_logger) as cl:
@@ -431,7 +439,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         # it's run not as a first test as `sys.stdout` will no longer be the same. So we either have
         # to reset `deepspeed_logger.handlers[0].setStream(sys.stdout)` or directly capture from the deepspeed_logger.
         with mockenv_context(**self.dist_env_1_gpu):
-            kwargs = dict(local_rank=0, deepspeed=self.get_config_dict(stage))
+            kwargs = {"local_rank": 0, "deepspeed": self.get_config_dict(stage)}
             kwargs[dtype] = True
             trainer = get_regression_trainer(**kwargs)
@@ -449,15 +457,15 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         # `self.lr_scheduler.get_last_lr()` and originally it'd fail on the very first step.
         with mockenv_context(**self.dist_env_1_gpu):
             a = b = 0.0
-            kwargs = dict(
-                a=a,
-                b=b,
-                local_rank=0,
-                train_len=8,
-                deepspeed=self.get_config_dict(stage),
-                per_device_train_batch_size=8,
-                logging_steps=1,
-            )
+            kwargs = {
+                "a": a,
+                "b": b,
+                "local_rank": 0,
+                "train_len": 8,
+                "deepspeed": self.get_config_dict(stage),
+                "per_device_train_batch_size": 8,
+                "logging_steps": 1,
+            }
             kwargs[dtype] = True
             trainer = get_regression_trainer(**kwargs)
@@ -494,13 +502,13 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         train_len = 64
         a = b = 0.0
-        kwargs = dict(
-            a=a,
-            b=b,
-            local_rank=0,
-            train_len=train_len,
-            deepspeed=self.get_config_dict(stage),
-        )
+        kwargs = {
+            "a": a,
+            "b": b,
+            "local_rank": 0,
+            "train_len": train_len,
+            "deepspeed": self.get_config_dict(stage),
+        }
         kwargs[dtype] = True

         with mockenv_context(**self.dist_env_1_gpu):
@@ -583,11 +591,11 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         # save checkpoints
         with mockenv_context(**self.dist_env_1_gpu):
-            kwargs = dict(
-                output_dir=output_dir,
-                save_steps=freq,
-                deepspeed=ds_config_dict,
-            )
+            kwargs = {
+                "output_dir": output_dir,
+                "save_steps": freq,
+                "deepspeed": ds_config_dict,
+            }
             kwargs[dtype] = True
             trainer = get_regression_trainer(**kwargs)
             trainer.train()
@@ -600,7 +608,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         with mockenv_context(**self.dist_env_1_gpu):
             ds_config_dict = self.get_config_dict(stage)
             output_dir = self.get_auto_remove_tmp_dir()
-            kwargs = dict(output_dir=output_dir, deepspeed=ds_config_dict)
+            kwargs = {"output_dir": output_dir, "deepspeed": ds_config_dict}
             kwargs[dtype] = True
             trainer = get_regression_trainer(**kwargs)
@@ -632,7 +640,13 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         if stage == ZERO3:
             ds_config_dict["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = True

-        kwargs = dict(output_dir=output_dir, train_len=128, save_steps=5, learning_rate=0.1, deepspeed=ds_config_dict)
+        kwargs = {
+            "output_dir": output_dir,
+            "train_len": 128,
+            "save_steps": 5,
+            "learning_rate": 0.1,
+            "deepspeed": ds_config_dict,
+        }
         kwargs[dtype] = True

         with mockenv_context(**self.dist_env_1_gpu):
@@ -679,16 +693,16 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         ds_config_dict = self.get_config_dict(stage)

-        kwargs = dict(
-            output_dir=output_dir,
-            train_len=4,
-            per_device_train_batch_size=4,
-            num_train_epochs=1,
-            save_strategy="steps",
-            save_steps=1,
-            learning_rate=0.1,
-            deepspeed=ds_config_dict,
-        )
+        kwargs = {
+            "output_dir": output_dir,
+            "train_len": 4,
+            "per_device_train_batch_size": 4,
+            "num_train_epochs": 1,
+            "save_strategy": "steps",
+            "save_steps": 1,
+            "learning_rate": 0.1,
+            "deepspeed": ds_config_dict,
+        }
         kwargs[dtype] = True

         with mockenv_context(**self.dist_env_1_gpu):
@@ -710,7 +724,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         # test that we can switch from zero2 to zero3 in the same process for example
         # test is_zero, etc.
         output_dir = self.get_auto_remove_tmp_dir()
-        kwargs = dict(output_dir=output_dir, train_len=8, fp16=True)
+        kwargs = {"output_dir": output_dir, "train_len": 8, "fp16": True}

         ds_config_zero3_dict = self.get_config_dict(ZERO3)
         ds_config_zero2_dict = self.get_config_dict(ZERO2)
@@ -808,7 +822,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         def get_dataset():
             data_file = str(self.tests_dir / "fixtures/tests_samples/SQUAD/sample.json")
-            data_files = dict(train=data_file, validation=data_file)
+            data_files = {"train": data_file, "validation": data_file}
             raw_datasets = datasets.load_dataset("json", data_files=data_files, field="data")
             train_dataset = raw_datasets["train"].map(_add_eos_to_examples).map(_convert_to_features, batched=True)
             valid_dataset = deepcopy(train_dataset)
@@ -903,7 +917,14 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
         do_train = True
         do_eval = False

-        kwargs = dict(stage=stage, dtype=dtype, eval_steps=1, distributed=True, do_train=do_train, do_eval=do_eval)
+        kwargs = {
+            "stage": stage,
+            "dtype": dtype,
+            "eval_steps": 1,
+            "distributed": True,
+            "do_train": do_train,
+            "do_eval": do_eval,
+        }

         # 1. normal training
         output_dir = self.run_and_check(**kwargs)
...