Unverified Commit 5e8c8eb5 authored by Aaron Gokaslan, committed by GitHub

Apply ruff flake8-comprehensions (#21694)

parent df06fb1f
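This change applies ruff's flake8-comprehensions (C4xx) rules across the repository: `dict(...)` calls and generators passed to `dict()`/`list()` become literals and comprehensions, and redundant `list()` wrappers around `sorted()` are dropped. A minimal, self-contained sketch of the equivalences behind the rewrites below (values are illustrative and the rule codes are quoted from memory, so treat them as approximate):

```python
# Equivalences behind the flake8-comprehensions rewrites in this commit.
# Each assert holds because both sides build identical objects.

items = [("b", 2), ("a", 1)]

# C408: dict()/dict(k=v) call -> dict literal
assert dict() == {} and dict(lr=0.1, eps=1e-8) == {"lr": 0.1, "eps": 1e-8}

# C402: dict() around a generator -> dict comprehension
assert dict((k, v) for k, v in items) == {k: v for k, v in items}

# C400: list() around a generator expression -> list comprehension
assert list(x * 2 for x in range(3)) == [x * 2 for x in range(3)]

# C413: list() around sorted() is redundant; sorted() already returns a list
assert list(sorted([3, 1, 2])) == sorted([3, 1, 2])

# C414: list() inside sorted() is redundant; sorted() accepts any iterable
assert sorted(list({"c", "a", "b"})) == sorted({"c", "a", "b"})

# C416: identity comprehension -> plain list()/dict() constructor
assert [x for x in range(3)] == list(range(3))
```

Running something like `ruff check --select C4 --fix .` from the repository root should reproduce this family of fixes automatically, though the exact invocation depends on the installed ruff version.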
......@@ -142,7 +142,7 @@ def convert_xmod_checkpoint_to_pytorch(
bert_output.adapter_layer_norm.weight = xmod_layer.adapter_layer_norm.weight
bert_output.adapter_layer_norm.bias = xmod_layer.adapter_layer_norm.bias
-if list(sorted(bert_output.adapter_modules.keys())) != list(sorted(xmod_layer.adapter_modules.keys())):
+if sorted(bert_output.adapter_modules.keys()) != sorted(xmod_layer.adapter_modules.keys()):
raise AssertionError("Lists of language adapters do not match.")
for lang_code, adapter in xmod_layer.adapter_modules.items():
to_adapter = bert_output.adapter_modules[lang_code]
......
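In the hunk above, `sorted()` already returns a new list, so wrapping it in `list()` only added an extra call; the inequality check is unchanged. A small sketch with made-up adapter language codes:

```python
# sorted() on dict keys yields a plain list, so the outer list() was a no-op.
bert_adapters = {"en_XX": "a", "de_DE": "b"}
xmod_adapters = {"de_DE": "c", "en_XX": "d"}

old_style = list(sorted(bert_adapters.keys())) != list(sorted(xmod_adapters.keys()))
new_style = sorted(bert_adapters.keys()) != sorted(xmod_adapters.keys())
assert old_style == new_style  # both compare ["de_DE", "en_XX"] against ["de_DE", "en_XX"]
```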
......@@ -395,7 +395,7 @@ class XmodOutput(nn.Module):
else:
self.adapter_layer_norm = None
self.adapter_reuse_layer_norm = config.adapter_reuse_layer_norm
-self.adapter_modules = nn.ModuleDict(dict())
+self.adapter_modules = nn.ModuleDict({})
for language in config.languages:
self.adapter_modules[str(language)] = XmodAdapter(config)
......
......@@ -515,7 +515,7 @@ def binary_mask_to_rle(mask):
pixels = np.concatenate([[0], pixels, [0]])
runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
runs[1::2] -= runs[::2]
-return [x for x in runs]
+return list(runs)
# Copied from transformers.models.detr.image_processing_detr.convert_segmentation_to_rle
......
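In `binary_mask_to_rle`, `runs` is a NumPy array, and an identity comprehension over it produces the same Python list as `list(runs)`. A sketch on a toy one-dimensional mask (the real function receives a segmentation mask):

```python
import numpy as np

# Same run-length computation as in the hunk above, on a toy mask.
mask = np.array([0, 1, 1, 0, 1])
pixels = np.concatenate([[0], mask.flatten(), [0]])
runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
runs[1::2] -= runs[::2]

assert [x for x in runs] == list(runs)  # identical run-length encodings
```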
......@@ -145,7 +145,7 @@ def export_pytorch(
device = torch.device(device)
if device.type == "cuda" and torch.cuda.is_available():
model.to(device)
-model_inputs_device = dict()
+model_inputs_device = {}
for k, v in model_inputs.items():
if isinstance(v, Tuple):
model_inputs_device[k] = tuple(
......
......@@ -358,7 +358,7 @@ class AdamW(Optimizer):
raise ValueError(f"Invalid beta parameter: {betas[1]} - should be in [0.0, 1.0)")
if not 0.0 <= eps:
raise ValueError(f"Invalid epsilon value: {eps} - should be >= 0.0")
-defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, correct_bias=correct_bias)
+defaults = {"lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay, "correct_bias": correct_bias}
super().__init__(params, defaults)
def step(self, closure: Callable = None):
......@@ -527,17 +527,17 @@ class Adafactor(Optimizer):
if warmup_init and not relative_step:
raise ValueError("`warmup_init=True` requires `relative_step=True`")
-defaults = dict(
-lr=lr,
-eps=eps,
-clip_threshold=clip_threshold,
-decay_rate=decay_rate,
-beta1=beta1,
-weight_decay=weight_decay,
-scale_parameter=scale_parameter,
-relative_step=relative_step,
-warmup_init=warmup_init,
-)
+defaults = {
+"lr": lr,
+"eps": eps,
+"clip_threshold": clip_threshold,
+"decay_rate": decay_rate,
+"beta1": beta1,
+"weight_decay": weight_decay,
+"scale_parameter": scale_parameter,
+"relative_step": relative_step,
+"warmup_init": warmup_init,
+}
super().__init__(params, defaults)
@staticmethod
......
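For the optimizer `defaults` above, a keyword-argument `dict(...)` call and a dict literal build exactly the same mapping; the literal simply avoids the extra name lookup and function call. A sketch with made-up hyperparameter values:

```python
# Illustrative AdamW-style hyperparameters (values are arbitrary).
lr, betas, eps, weight_decay, correct_bias = 1e-3, (0.9, 0.999), 1e-6, 0.0, True

via_call = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, correct_bias=correct_bias)
via_literal = {"lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay, "correct_bias": correct_bias}

assert via_call == via_literal  # same keys, same values, same insertion order
```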
......@@ -262,7 +262,7 @@ class AdamWeightDecay(Adam):
coefficients = self._fallback_apply_state(var_device, var_dtype)
apply_state[(var_device, var_dtype)] = coefficients
return coefficients["lr_t"], dict(apply_state=apply_state)
return coefficients["lr_t"], {"apply_state": apply_state}
def _resource_apply_dense(self, grad, var, apply_state=None):
lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
......@@ -333,7 +333,7 @@ class GradientAccumulator(object):
"""The accumulated gradients on the current replica."""
if not self._gradients:
raise ValueError("The accumulator should be called first to initialize the gradients")
-return list(gradient.value() if gradient is not None else gradient for gradient in self._gradients)
+return [gradient.value() if gradient is not None else gradient for gradient in self._gradients]
def __call__(self, gradients):
"""Accumulates `gradients` on the current replica."""
......
......@@ -1083,7 +1083,7 @@ class Pipeline(_ScikitCompat):
final_iterator = self.get_iterator(
inputs, num_workers, batch_size, preprocess_params, forward_params, postprocess_params
)
-outputs = [output for output in final_iterator]
+outputs = list(final_iterator)
return outputs
else:
return self.run_multi(inputs, preprocess_params, forward_params, postprocess_params)
......
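`list(final_iterator)` and `[output for output in final_iterator]` both drain the iterator once and keep the items in order, so pipeline results are unchanged. A sketch with a stand-in generator in place of the real inference iterator:

```python
def fake_pipeline_outputs():
    # Stand-in for the pipeline's batched inference iterator.
    yield {"label": "POSITIVE", "score": 0.98}
    yield {"label": "NEGATIVE", "score": 0.87}

assert list(fake_pipeline_outputs()) == [output for output in fake_pipeline_outputs()]
```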
......@@ -210,7 +210,7 @@ class QuestionAnsweringArgumentHandler(ArgumentHandler):
inputs = [inputs]
elif isinstance(inputs, Iterable):
# Copy to avoid overriding arguments
-inputs = [i for i in inputs]
+inputs = list(inputs)
else:
raise ValueError(f"Invalid arguments {kwargs}")
......
......@@ -425,7 +425,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
if self.verbose:
logger.info(f"Adding {token} to the vocabulary")
-added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(tokens_to_add))
+added_tok_encoder = {tok: len(self) + i for i, tok in enumerate(tokens_to_add)}
added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
self.added_tokens_encoder.update(added_tok_encoder)
self.added_tokens_decoder.update(added_tok_decoder)
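The dict comprehension above yields the same token-to-index mapping as the old `dict(generator)` form, and the decoder remains its exact inverse. A sketch with invented tokens and a pretend current vocabulary size:

```python
vocab_size = 30000                       # stand-in for len(self)
tokens_to_add = ["<mask_2>", "<sep_2>"]  # illustrative new tokens

added_tok_encoder = {tok: vocab_size + i for i, tok in enumerate(tokens_to_add)}
added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}

assert added_tok_encoder == dict((tok, vocab_size + i) for i, tok in enumerate(tokens_to_add))
assert added_tok_decoder == {30000: "<mask_2>", 30001: "<sep_2>"}
```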
......@@ -495,9 +495,9 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
`List[str]`: The list of tokens.
"""
# Simple mapping string => AddedToken for special tokens with specific tokenization behaviors
-all_special_tokens_extended = dict(
-(str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
-)
+all_special_tokens_extended = {
+str(t): t for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
+}
text, kwargs = self.prepare_for_tokenization(text, **kwargs)
......
......@@ -1918,7 +1918,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
obj.pop("__type")
return AddedToken(**obj)
elif isinstance(obj, (list, tuple)):
-return list(convert_added_tokens(o) for o in obj)
+return [convert_added_tokens(o) for o in obj]
elif isinstance(obj, dict):
return {k: convert_added_tokens(v) for k, v in obj.items()}
return obj
......@@ -1992,7 +1992,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
added_tok_encoder = json.load(added_tokens_handle)
# Sort added tokens by index
-added_tok_encoder_sorted = list(sorted(added_tok_encoder.items(), key=lambda x: x[1]))
+added_tok_encoder_sorted = sorted(added_tok_encoder.items(), key=lambda x: x[1])
# Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for
# individual tokens would repeatedly rebuild a trie, which can be slow.
......@@ -2129,7 +2129,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
out["__type"] = "AddedToken"
return out
elif isinstance(obj, (list, tuple)):
-return list(convert_added_tokens(o, add_type_field=add_type_field) for o in obj)
+return [convert_added_tokens(o, add_type_field=add_type_field) for o in obj]
elif isinstance(obj, dict):
return {k: convert_added_tokens(v, add_type_field=add_type_field) for k, v in obj.items()}
return obj
......@@ -2502,23 +2502,23 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
"""
# To avoid duplicating
-all_kwargs = dict(
-add_special_tokens=add_special_tokens,
-padding=padding,
-truncation=truncation,
-max_length=max_length,
-stride=stride,
-is_split_into_words=is_split_into_words,
-pad_to_multiple_of=pad_to_multiple_of,
-return_tensors=return_tensors,
-return_token_type_ids=return_token_type_ids,
-return_attention_mask=return_attention_mask,
-return_overflowing_tokens=return_overflowing_tokens,
-return_special_tokens_mask=return_special_tokens_mask,
-return_offsets_mapping=return_offsets_mapping,
-return_length=return_length,
-verbose=verbose,
-)
+all_kwargs = {
+"add_special_tokens": add_special_tokens,
+"padding": padding,
+"truncation": truncation,
+"max_length": max_length,
+"stride": stride,
+"is_split_into_words": is_split_into_words,
+"pad_to_multiple_of": pad_to_multiple_of,
+"return_tensors": return_tensors,
+"return_token_type_ids": return_token_type_ids,
+"return_attention_mask": return_attention_mask,
+"return_overflowing_tokens": return_overflowing_tokens,
+"return_special_tokens_mask": return_special_tokens_mask,
+"return_offsets_mapping": return_offsets_mapping,
+"return_length": return_length,
+"verbose": verbose,
+}
all_kwargs.update(kwargs)
if text is None and text_target is None:
raise ValueError("You need to specify either `text` or `text_target`.")
......@@ -3010,7 +3010,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
batch_outputs = {}
for i in range(batch_size):
-inputs = dict((k, v[i]) for k, v in encoded_inputs.items())
+inputs = {k: v[i] for k, v in encoded_inputs.items()}
outputs = self._pad(
inputs,
max_length=max_length,
......
......@@ -162,7 +162,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
"""
base_vocab = self._tokenizer.get_vocab(with_added_tokens=False)
full_vocab = self._tokenizer.get_vocab(with_added_tokens=True)
-added_vocab = dict((tok, index) for tok, index in full_vocab.items() if tok not in base_vocab)
+added_vocab = {tok: index for tok, index in full_vocab.items() if tok not in base_vocab}
return added_vocab
def __len__(self) -> int:
......
......@@ -1081,7 +1081,7 @@ class Trainer:
skipped = 0
for module in opt_model.modules():
if isinstance(module, nn.Embedding):
-skipped += sum(dict((p.data_ptr(), p.numel()) for p in module.parameters()).values())
+skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values())
print(f"skipped {module}: {skipped/2**20}M params")
manager.register_module_override(module, "weight", {"optim_bits": 32})
logger.debug(f"bitsandbytes: will optimize {module} in fp32")
......@@ -2564,12 +2564,12 @@ class Trainer:
elif isinstance(data, (tuple, list)):
return type(data)(self._prepare_input(v) for v in data)
elif isinstance(data, torch.Tensor):
-kwargs = dict(device=self.args.device)
+kwargs = {"device": self.args.device}
if self.deepspeed and data.dtype != torch.int64:
# NLP models inputs are int64 and those get adjusted to the right dtype of the
# embedding. Other models such as wav2vec2's inputs are already float and thus
# may need special handling to match the dtypes of the model
-kwargs.update(dict(dtype=self.args.hf_deepspeed_config.dtype()))
+kwargs.update({"dtype": self.args.hf_deepspeed_config.dtype()})
return data.to(**kwargs)
return data
......
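In `_prepare_input`, the keyword arguments forwarded to `tensor.to(**kwargs)` are identical whether built with `dict(...)` or a literal, including the conditional DeepSpeed dtype override. A torch-free sketch of just the dict handling (device and dtype values are placeholders):

```python
device = "cuda:0"             # stand-in for self.args.device
deepspeed_dtype = "float16"   # stand-in for self.args.hf_deepspeed_config.dtype()
under_deepspeed, is_int64 = True, False

kwargs = {"device": device}
if under_deepspeed and not is_int64:
    kwargs.update({"dtype": deepspeed_dtype})

assert kwargs == {"device": "cuda:0", "dtype": "float16"}
# data.to(**kwargs) would receive the same device= and dtype= arguments either way.
```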
......@@ -534,7 +534,7 @@ def get_length_grouped_indices(lengths, batch_size, mega_batch_mult=None, genera
indices = torch.randperm(len(lengths), generator=generator)
megabatch_size = mega_batch_mult * batch_size
megabatches = [indices[i : i + megabatch_size].tolist() for i in range(0, len(lengths), megabatch_size)]
-megabatches = [list(sorted(megabatch, key=lambda i: lengths[i], reverse=True)) for megabatch in megabatches]
+megabatches = [sorted(megabatch, key=lambda i: lengths[i], reverse=True) for megabatch in megabatches]
# The rest is to get the biggest batch first.
# Since each megabatch is sorted by descending length, the longest element is the first
......
......@@ -505,21 +505,21 @@ class TrainerMemoryTracker:
if self.torch is not None:
self.gpu_mem_used_now = self.torch.cuda.memory_allocated()
self.gpu_mem_used_peak = self.torch.cuda.max_memory_allocated()
-self.gpu[self.cur_stage] = dict(
-begin=self.gpu_mem_used_at_start,
-end=self.gpu_mem_used_now,
-alloc=(self.gpu_mem_used_now - self.gpu_mem_used_at_start),
-peaked=max(0, self.gpu_mem_used_peak - self.gpu_mem_used_now),
-)
+self.gpu[self.cur_stage] = {
+"begin": self.gpu_mem_used_at_start,
+"end": self.gpu_mem_used_now,
+"alloc": (self.gpu_mem_used_now - self.gpu_mem_used_at_start),
+"peaked": max(0, self.gpu_mem_used_peak - self.gpu_mem_used_now),
+}
# cpu
self.cpu_mem_used_now = self.cpu_mem_used()
-self.cpu[self.cur_stage] = dict(
-begin=self.cpu_mem_used_at_start,
-end=self.cpu_mem_used_now,
-alloc=(self.cpu_mem_used_now - self.cpu_mem_used_at_start),
-peaked=max(0, self.cpu_mem_used_peak - self.cpu_mem_used_now),
-)
+self.cpu[self.cur_stage] = {
+"begin": self.cpu_mem_used_at_start,
+"end": self.cpu_mem_used_now,
+"alloc": (self.cpu_mem_used_now - self.cpu_mem_used_at_start),
+"peaked": max(0, self.cpu_mem_used_peak - self.cpu_mem_used_now),
+}
# reset - cycle finished
self.cur_stage = None
......
......@@ -1874,7 +1874,7 @@ class TrainingArguments:
the token values by removing their value.
"""
# filter out fields that are defined as field(init=False)
-d = dict((field.name, getattr(self, field.name)) for field in fields(self) if field.init)
+d = {field.name: getattr(self, field.name) for field in fields(self) if field.init}
for k, v in d.items():
if isinstance(v, Enum):
......
......@@ -1085,19 +1085,19 @@ def add_code_sample_docstrings(
# putting all kwargs for docstrings in a dict to be used
# with the `.format(**doc_kwargs)`. Note that string might
# be formatted with non-existing keys, which is fine.
-doc_kwargs = dict(
-model_class=model_class,
-processor_class=processor_class,
-checkpoint=checkpoint,
-mask=mask,
-qa_target_start_index=qa_target_start_index,
-qa_target_end_index=qa_target_end_index,
-expected_output=expected_output,
-expected_loss=expected_loss,
-real_checkpoint=real_checkpoint,
-fake_checkpoint=checkpoint,
-true="{true}", # For <Tip warning={true}> syntax that conflicts with formatting.
-)
+doc_kwargs = {
+"model_class": model_class,
+"processor_class": processor_class,
+"checkpoint": checkpoint,
+"mask": mask,
+"qa_target_start_index": qa_target_start_index,
+"qa_target_end_index": qa_target_end_index,
+"expected_output": expected_output,
+"expected_loss": expected_loss,
+"real_checkpoint": real_checkpoint,
+"fake_checkpoint": checkpoint,
+"true": "{true}", # For <Tip warning={true}> syntax that conflicts with formatting.
+}
if ("SequenceClassification" in model_class or "AudioClassification" in model_class) and modality == "audio":
code_sample = sample_docstrings["AudioClassification"]
......
......@@ -96,12 +96,12 @@ class TrialShortNamer:
if cls.NAMING_INFO is not None:
return
-info = dict(
-short_word={},
-reverse_short_word={},
-short_param={},
-reverse_short_param={},
-)
+info = {
+"short_word": {},
+"reverse_short_word": {},
+"short_param": {},
+"reverse_short_param": {},
+}
field_keys = list(cls.DEFAULTS.keys())
......
......@@ -902,7 +902,7 @@ def get_checkpoint_shard_files(
with open(index_filename, "r") as f:
index = json.loads(f.read())
-shard_filenames = sorted(list(set(index["weight_map"].values())))
+shard_filenames = sorted(set(index["weight_map"].values()))
sharded_metadata = index["metadata"]
sharded_metadata["all_checkpoint_keys"] = list(index["weight_map"].keys())
sharded_metadata["weight_map"] = index["weight_map"].copy()
......
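`sorted()` accepts any iterable, so the `list()` between `sorted` and `set` above was redundant; shard filenames still come out deduplicated and ordered. A sketch with a made-up weight map:

```python
weight_map = {
    "encoder.layer.0.weight": "pytorch_model-00001-of-00002.bin",
    "encoder.layer.1.weight": "pytorch_model-00001-of-00002.bin",
    "lm_head.weight": "pytorch_model-00002-of-00002.bin",
}

shard_filenames = sorted(set(weight_map.values()))
assert shard_filenames == sorted(list(set(weight_map.values())))
assert shard_filenames == ["pytorch_model-00001-of-00002.bin", "pytorch_model-00002-of-00002.bin"]
```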
......@@ -51,6 +51,6 @@ def get_device_map(n_layers, devices):
"""Returns a dictionary of layers distributed evenly across all devices."""
layers = list(range(n_layers))
n_blocks = int(ceil(n_layers / len(devices)))
-layers_list = list(layers[i : i + n_blocks] for i in range(0, n_layers, n_blocks))
+layers_list = [layers[i : i + n_blocks] for i in range(0, n_layers, n_blocks)]
return dict(zip(devices, layers_list))
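The list comprehension in `get_device_map` chunks the layer indices exactly as the old generator wrapped in `list()` did. A standalone copy of the function for illustration, with a worked example (12 layers over 3 devices):

```python
from math import ceil

def get_device_map(n_layers, devices):
    """Distribute layer indices evenly across devices (mirrors the hunk above)."""
    layers = list(range(n_layers))
    n_blocks = int(ceil(n_layers / len(devices)))
    layers_list = [layers[i : i + n_blocks] for i in range(0, n_layers, n_blocks)]
    return dict(zip(devices, layers_list))

assert get_device_map(12, [0, 1, 2]) == {0: [0, 1, 2, 3], 1: [4, 5, 6, 7], 2: [8, 9, 10, 11]}
```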
......@@ -157,9 +157,13 @@ class CoreIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
super().setUp()
master_port = get_master_port(real_launcher=False)
-self.dist_env_1_gpu = dict(
-MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
-)
+self.dist_env_1_gpu = {
+"MASTER_ADDR": "localhost",
+"MASTER_PORT": master_port,
+"RANK": "0",
+"LOCAL_RANK": "0",
+"WORLD_SIZE": "1",
+}
def tearDown(self):
super().tearDown()
......@@ -212,14 +216,18 @@ class TrainerIntegrationDeepSpeedWithCustomConfig(TestCasePlus):
self.batch_size = args.train_batch_size
master_port = get_master_port(real_launcher=False)
-self.dist_env_1_gpu = dict(
-MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
-)
+self.dist_env_1_gpu = {
+"MASTER_ADDR": "localhost",
+"MASTER_PORT": master_port,
+"RANK": "0",
+"LOCAL_RANK": "0",
+"WORLD_SIZE": "1",
+}
-self.ds_config_file = dict(
-zero2=f"{self.test_file_dir_str}/ds_config_zero2.json",
-zero3=f"{self.test_file_dir_str}/ds_config_zero3.json",
-)
+self.ds_config_file = {
+"zero2": f"{self.test_file_dir_str}/ds_config_zero2.json",
+"zero3": f"{self.test_file_dir_str}/ds_config_zero3.json",
+}
# use self.get_config_dict(stage) to use these to ensure the original is not modified
with io.open(self.ds_config_file[ZERO2], "r", encoding="utf-8") as f:
......@@ -230,10 +238,10 @@ class TrainerIntegrationDeepSpeedWithCustomConfig(TestCasePlus):
# It's in the file as a demo for users since we want everything to work out of the box even if slower.
config_zero3["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = False
-self.ds_config_dict = dict(
-zero2=config_zero2,
-zero3=config_zero3,
-)
+self.ds_config_dict = {
+"zero2": config_zero2,
+"zero3": config_zero3,
+}
def tearDown(self):
super().tearDown()
......@@ -370,7 +378,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
# this actually doesn't have to be on NVMe, any storage will do since this test only
# runs a simple check that we can use some directory as if it were NVMe
nvme_path = self.get_auto_remove_tmp_dir()
nvme_config = dict(device="nvme", nvme_path=nvme_path)
nvme_config = {"device": "nvme", "nvme_path": nvme_path}
ds_config_zero3_dict = self.get_config_dict(ZERO3)
ds_config_zero3_dict["zero_optimization"]["offload_optimizer"] = nvme_config
ds_config_zero3_dict["zero_optimization"]["offload_param"] = nvme_config
......@@ -415,7 +423,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
# force cpu offload
ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
with mockenv_context(**self.dist_env_1_gpu):
-kwargs = dict(local_rank=0, deepspeed=ds_config_dict)
+kwargs = {"local_rank": 0, "deepspeed": ds_config_dict}
kwargs[dtype] = True
trainer = get_regression_trainer(**kwargs)
with CaptureLogger(deepspeed_logger) as cl:
......@@ -431,7 +439,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
# it's run not as a first test as `sys.stdout` will no longer be the same. So we either have
# to reset `deepspeed_logger.handlers[0].setStream(sys.stdout)` or directly capture from the deepspeed_logger.
with mockenv_context(**self.dist_env_1_gpu):
-kwargs = dict(local_rank=0, deepspeed=self.get_config_dict(stage))
+kwargs = {"local_rank": 0, "deepspeed": self.get_config_dict(stage)}
kwargs[dtype] = True
trainer = get_regression_trainer(**kwargs)
......@@ -449,15 +457,15 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
# `self.lr_scheduler.get_last_lr()` and originally it'd fail on the very first step.
with mockenv_context(**self.dist_env_1_gpu):
a = b = 0.0
-kwargs = dict(
-a=a,
-b=b,
-local_rank=0,
-train_len=8,
-deepspeed=self.get_config_dict(stage),
-per_device_train_batch_size=8,
-logging_steps=1,
-)
+kwargs = {
+"a": a,
+"b": b,
+"local_rank": 0,
+"train_len": 8,
+"deepspeed": self.get_config_dict(stage),
+"per_device_train_batch_size": 8,
+"logging_steps": 1,
+}
kwargs[dtype] = True
trainer = get_regression_trainer(**kwargs)
......@@ -494,13 +502,13 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
train_len = 64
a = b = 0.0
-kwargs = dict(
-a=a,
-b=b,
-local_rank=0,
-train_len=train_len,
-deepspeed=self.get_config_dict(stage),
-)
+kwargs = {
+"a": a,
+"b": b,
+"local_rank": 0,
+"train_len": train_len,
+"deepspeed": self.get_config_dict(stage),
+}
kwargs[dtype] = True
with mockenv_context(**self.dist_env_1_gpu):
......@@ -583,11 +591,11 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
# save checkpoints
with mockenv_context(**self.dist_env_1_gpu):
-kwargs = dict(
-output_dir=output_dir,
-save_steps=freq,
-deepspeed=ds_config_dict,
-)
+kwargs = {
+"output_dir": output_dir,
+"save_steps": freq,
+"deepspeed": ds_config_dict,
+}
kwargs[dtype] = True
trainer = get_regression_trainer(**kwargs)
trainer.train()
......@@ -600,7 +608,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
with mockenv_context(**self.dist_env_1_gpu):
ds_config_dict = self.get_config_dict(stage)
output_dir = self.get_auto_remove_tmp_dir()
-kwargs = dict(output_dir=output_dir, deepspeed=ds_config_dict)
+kwargs = {"output_dir": output_dir, "deepspeed": ds_config_dict}
kwargs[dtype] = True
trainer = get_regression_trainer(**kwargs)
......@@ -632,7 +640,13 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
if stage == ZERO3:
ds_config_dict["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = True
-kwargs = dict(output_dir=output_dir, train_len=128, save_steps=5, learning_rate=0.1, deepspeed=ds_config_dict)
+kwargs = {
+"output_dir": output_dir,
+"train_len": 128,
+"save_steps": 5,
+"learning_rate": 0.1,
+"deepspeed": ds_config_dict,
+}
kwargs[dtype] = True
with mockenv_context(**self.dist_env_1_gpu):
......@@ -679,16 +693,16 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
ds_config_dict = self.get_config_dict(stage)
-kwargs = dict(
-output_dir=output_dir,
-train_len=4,
-per_device_train_batch_size=4,
-num_train_epochs=1,
-save_strategy="steps",
-save_steps=1,
-learning_rate=0.1,
-deepspeed=ds_config_dict,
-)
+kwargs = {
+"output_dir": output_dir,
+"train_len": 4,
+"per_device_train_batch_size": 4,
+"num_train_epochs": 1,
+"save_strategy": "steps",
+"save_steps": 1,
+"learning_rate": 0.1,
+"deepspeed": ds_config_dict,
+}
kwargs[dtype] = True
with mockenv_context(**self.dist_env_1_gpu):
......@@ -710,7 +724,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
# test that we can switch from zero2 to zero3 in the same process for example
# test is_zero, etc.
output_dir = self.get_auto_remove_tmp_dir()
-kwargs = dict(output_dir=output_dir, train_len=8, fp16=True)
+kwargs = {"output_dir": output_dir, "train_len": 8, "fp16": True}
ds_config_zero3_dict = self.get_config_dict(ZERO3)
ds_config_zero2_dict = self.get_config_dict(ZERO2)
......@@ -808,7 +822,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
def get_dataset():
data_file = str(self.tests_dir / "fixtures/tests_samples/SQUAD/sample.json")
-data_files = dict(train=data_file, validation=data_file)
+data_files = {"train": data_file, "validation": data_file}
raw_datasets = datasets.load_dataset("json", data_files=data_files, field="data")
train_dataset = raw_datasets["train"].map(_add_eos_to_examples).map(_convert_to_features, batched=True)
valid_dataset = deepcopy(train_dataset)
......@@ -903,7 +917,14 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
do_train = True
do_eval = False
-kwargs = dict(stage=stage, dtype=dtype, eval_steps=1, distributed=True, do_train=do_train, do_eval=do_eval)
+kwargs = {
+"stage": stage,
+"dtype": dtype,
+"eval_steps": 1,
+"distributed": True,
+"do_train": do_train,
+"do_eval": do_eval,
+}
# 1. normal training
output_dir = self.run_and_check(**kwargs)
......