chenpangpang / transformers · Commits

Commit 5e8c8eb5 (Unverified)
Authored Feb 22, 2023 by Aaron Gokaslan; committed by GitHub on Feb 22, 2023
Apply ruff flake8-comprehensions (#21694)
Parent: df06fb1f
Changes: 230 files in this commit
Showing 20 changed files on this page, with 158 additions and 137 deletions (+158, -137)
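The rewrites in this commit follow ruff's flake8-comprehensions rules: prefer dict/list literals and comprehensions over calls to dict()/list(), and drop redundant wrappers such as list(...) around sorted(...). As orientation before the per-file diffs, here is a minimal, illustrative before/after sketch in plain Python (not code taken from the commit); each assert shows that the rewritten form is equivalent to the original:

# Left-hand sides: forms flagged by flake8-comprehensions.
# Right-hand sides: the literal/comprehension forms this commit switches to.
assert dict(lr=1e-3, eps=1e-8) == {"lr": 1e-3, "eps": 1e-8}                         # dict() call -> dict literal
assert dict((k, len(k)) for k in ("a", "bb")) == {k: len(k) for k in ("a", "bb")}   # generator into dict() -> dict comprehension
assert [x for x in range(3)] == list(range(3))                                      # copying comprehension -> list()
assert list(sorted({3, 1, 2})) == sorted({3, 1, 2}) == [1, 2, 3]                    # redundant list() around sorted()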
src/transformers/models/xmod/convert_xmod_original_pytorch_checkpoint_to_pytorch.py (+1, -1)
src/transformers/models/xmod/modeling_xmod.py (+1, -1)
src/transformers/models/yolos/image_processing_yolos.py (+1, -1)
src/transformers/onnx/convert.py (+1, -1)
src/transformers/optimization.py (+12, -12)
src/transformers/optimization_tf.py (+2, -2)
src/transformers/pipelines/base.py (+1, -1)
src/transformers/pipelines/question_answering.py (+1, -1)
src/transformers/tokenization_utils.py (+4, -4)
src/transformers/tokenization_utils_base.py (+21, -21)
src/transformers/tokenization_utils_fast.py (+1, -1)
src/transformers/trainer.py (+3, -3)
src/transformers/trainer_pt_utils.py (+1, -1)
src/transformers/trainer_utils.py (+12, -12)
src/transformers/training_args.py (+1, -1)
src/transformers/utils/doc.py (+13, -13)
src/transformers/utils/hp_naming.py (+6, -6)
src/transformers/utils/hub.py (+1, -1)
src/transformers/utils/model_parallel_utils.py (+1, -1)
tests/deepspeed/test_deepspeed.py (+74, -53)
src/transformers/models/xmod/convert_xmod_original_pytorch_checkpoint_to_pytorch.py (view file @ 5e8c8eb5)

@@ -142,7 +142,7 @@ def convert_xmod_checkpoint_to_pytorch(
         bert_output.adapter_layer_norm.weight = xmod_layer.adapter_layer_norm.weight
         bert_output.adapter_layer_norm.bias = xmod_layer.adapter_layer_norm.bias
-        if list(sorted(bert_output.adapter_modules.keys())) != list(sorted(xmod_layer.adapter_modules.keys())):
+        if sorted(bert_output.adapter_modules.keys()) != sorted(xmod_layer.adapter_modules.keys()):
             raise AssertionError("Lists of language adapters do not match.")
         for lang_code, adapter in xmod_layer.adapter_modules.items():
             to_adapter = bert_output.adapter_modules[lang_code]
src/transformers/models/xmod/modeling_xmod.py (view file @ 5e8c8eb5)

@@ -395,7 +395,7 @@ class XmodOutput(nn.Module):
         else:
             self.adapter_layer_norm = None
         self.adapter_reuse_layer_norm = config.adapter_reuse_layer_norm
-        self.adapter_modules = nn.ModuleDict(dict())
+        self.adapter_modules = nn.ModuleDict({})
         for language in config.languages:
             self.adapter_modules[str(language)] = XmodAdapter(config)
src/transformers/models/yolos/image_processing_yolos.py (view file @ 5e8c8eb5)

@@ -515,7 +515,7 @@ def binary_mask_to_rle(mask):
     pixels = np.concatenate([[0], pixels, [0]])
     runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
     runs[1::2] -= runs[::2]
-    return [x for x in runs]
+    return list(runs)


 # Copied from transformers.models.detr.image_processing_detr.convert_segmentation_to_rle
src/transformers/onnx/convert.py (view file @ 5e8c8eb5)

@@ -145,7 +145,7 @@ def export_pytorch(
         device = torch.device(device)
         if device.type == "cuda" and torch.cuda.is_available():
             model.to(device)
-            model_inputs_device = dict()
+            model_inputs_device = {}
             for k, v in model_inputs.items():
                 if isinstance(v, Tuple):
                     model_inputs_device[k] = tuple(
src/transformers/optimization.py (view file @ 5e8c8eb5)

@@ -358,7 +358,7 @@ class AdamW(Optimizer):
             raise ValueError(f"Invalid beta parameter: {betas[1]} - should be in [0.0, 1.0)")
         if not 0.0 <= eps:
             raise ValueError(f"Invalid epsilon value: {eps} - should be >= 0.0")
-        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, correct_bias=correct_bias)
+        defaults = {"lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay, "correct_bias": correct_bias}
         super().__init__(params, defaults)

     def step(self, closure: Callable = None):

@@ -527,17 +527,17 @@ class Adafactor(Optimizer):
         if warmup_init and not relative_step:
             raise ValueError("`warmup_init=True` requires `relative_step=True`")

-        defaults = dict(
-            lr=lr,
-            eps=eps,
-            clip_threshold=clip_threshold,
-            decay_rate=decay_rate,
-            beta1=beta1,
-            weight_decay=weight_decay,
-            scale_parameter=scale_parameter,
-            relative_step=relative_step,
-            warmup_init=warmup_init,
-        )
+        defaults = {
+            "lr": lr,
+            "eps": eps,
+            "clip_threshold": clip_threshold,
+            "decay_rate": decay_rate,
+            "beta1": beta1,
+            "weight_decay": weight_decay,
+            "scale_parameter": scale_parameter,
+            "relative_step": relative_step,
+            "warmup_init": warmup_init,
+        }
         super().__init__(params, defaults)

     @staticmethod
src/transformers/optimization_tf.py (view file @ 5e8c8eb5)

@@ -262,7 +262,7 @@ class AdamWeightDecay(Adam):
             coefficients = self._fallback_apply_state(var_device, var_dtype)
             apply_state[(var_device, var_dtype)] = coefficients

-        return coefficients["lr_t"], dict(apply_state=apply_state)
+        return coefficients["lr_t"], {"apply_state": apply_state}

     def _resource_apply_dense(self, grad, var, apply_state=None):
         lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)

@@ -333,7 +333,7 @@ class GradientAccumulator(object):
         """The accumulated gradients on the current replica."""
         if not self._gradients:
             raise ValueError("The accumulator should be called first to initialize the gradients")
-        return list(gradient.value() if gradient is not None else gradient for gradient in self._gradients)
+        return [gradient.value() if gradient is not None else gradient for gradient in self._gradients]

     def __call__(self, gradients):
         """Accumulates `gradients` on the current replica."""
src/transformers/pipelines/base.py (view file @ 5e8c8eb5)

@@ -1083,7 +1083,7 @@ class Pipeline(_ScikitCompat):
             final_iterator = self.get_iterator(
                 inputs, num_workers, batch_size, preprocess_params, forward_params, postprocess_params
             )
-            outputs = [output for output in final_iterator]
+            outputs = list(final_iterator)
             return outputs
         else:
             return self.run_multi(inputs, preprocess_params, forward_params, postprocess_params)
src/transformers/pipelines/question_answering.py (view file @ 5e8c8eb5)

@@ -210,7 +210,7 @@ class QuestionAnsweringArgumentHandler(ArgumentHandler):
             inputs = [inputs]
         elif isinstance(inputs, Iterable):
             # Copy to avoid overriding arguments
-            inputs = [i for i in inputs]
+            inputs = list(inputs)
         else:
             raise ValueError(f"Invalid arguments {kwargs}")
src/transformers/tokenization_utils.py (view file @ 5e8c8eb5)

@@ -425,7 +425,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
                 if self.verbose:
                     logger.info(f"Adding {token} to the vocabulary")

-        added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(tokens_to_add))
+        added_tok_encoder = {tok: len(self) + i for i, tok in enumerate(tokens_to_add)}
         added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
         self.added_tokens_encoder.update(added_tok_encoder)
         self.added_tokens_decoder.update(added_tok_decoder)

@@ -495,9 +495,9 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
             `List[str]`: The list of tokens.
         """
         # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors
-        all_special_tokens_extended = dict(
-            (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
-        )
+        all_special_tokens_extended = {
+            str(t): t for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
+        }

         text, kwargs = self.prepare_for_tokenization(text, **kwargs)
src/transformers/tokenization_utils_base.py (view file @ 5e8c8eb5)

@@ -1918,7 +1918,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
                 obj.pop("__type")
                 return AddedToken(**obj)
             elif isinstance(obj, (list, tuple)):
-                return list(convert_added_tokens(o) for o in obj)
+                return [convert_added_tokens(o) for o in obj]
             elif isinstance(obj, dict):
                 return {k: convert_added_tokens(v) for k, v in obj.items()}
             return obj

@@ -1992,7 +1992,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
                 added_tok_encoder = json.load(added_tokens_handle)

             # Sort added tokens by index
-            added_tok_encoder_sorted = list(sorted(added_tok_encoder.items(), key=lambda x: x[1]))
+            added_tok_encoder_sorted = sorted(added_tok_encoder.items(), key=lambda x: x[1])

             # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for
             # individual tokens would repeatedly rebuild a trie, which can be slow.

@@ -2129,7 +2129,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
                 out["__type"] = "AddedToken"
                 return out
             elif isinstance(obj, (list, tuple)):
-                return list(convert_added_tokens(o, add_type_field=add_type_field) for o in obj)
+                return [convert_added_tokens(o, add_type_field=add_type_field) for o in obj]
             elif isinstance(obj, dict):
                 return {k: convert_added_tokens(v, add_type_field=add_type_field) for k, v in obj.items()}
             return obj

@@ -2502,23 +2502,23 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
             you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
         """
         # To avoid duplicating
-        all_kwargs = dict(
-            add_special_tokens=add_special_tokens,
-            padding=padding,
-            truncation=truncation,
-            max_length=max_length,
-            stride=stride,
-            is_split_into_words=is_split_into_words,
-            pad_to_multiple_of=pad_to_multiple_of,
-            return_tensors=return_tensors,
-            return_token_type_ids=return_token_type_ids,
-            return_attention_mask=return_attention_mask,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_offsets_mapping=return_offsets_mapping,
-            return_length=return_length,
-            verbose=verbose,
-        )
+        all_kwargs = {
+            "add_special_tokens": add_special_tokens,
+            "padding": padding,
+            "truncation": truncation,
+            "max_length": max_length,
+            "stride": stride,
+            "is_split_into_words": is_split_into_words,
+            "pad_to_multiple_of": pad_to_multiple_of,
+            "return_tensors": return_tensors,
+            "return_token_type_ids": return_token_type_ids,
+            "return_attention_mask": return_attention_mask,
+            "return_overflowing_tokens": return_overflowing_tokens,
+            "return_special_tokens_mask": return_special_tokens_mask,
+            "return_offsets_mapping": return_offsets_mapping,
+            "return_length": return_length,
+            "verbose": verbose,
+        }
         all_kwargs.update(kwargs)
         if text is None and text_target is None:
             raise ValueError("You need to specify either `text` or `text_target`.")

@@ -3010,7 +3010,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         batch_outputs = {}
         for i in range(batch_size):
-            inputs = dict((k, v[i]) for k, v in encoded_inputs.items())
+            inputs = {k: v[i] for k, v in encoded_inputs.items()}

             outputs = self._pad(
                 inputs,
                 max_length=max_length,
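A recurring pattern in the tokenization diffs above (and again in trainer_pt_utils.py and hub.py below) is dropping a redundant list(...) wrapper around sorted(...). This is behavior-preserving because sorted() always returns a new list, whatever iterable it is given; a small standalone check (illustrative, not code from the commit):

added_tokens = {"<b>": 7, "<a>": 5}
by_index = sorted(added_tokens.items(), key=lambda x: x[1])
assert by_index == list(sorted(added_tokens.items(), key=lambda x: x[1]))  # the list() wrapper adds nothing
assert isinstance(by_index, list) and by_index == [("<a>", 5), ("<b>", 7)]  # sorted() already returns a new list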
src/transformers/tokenization_utils_fast.py (view file @ 5e8c8eb5)

@@ -162,7 +162,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         """
         base_vocab = self._tokenizer.get_vocab(with_added_tokens=False)
         full_vocab = self._tokenizer.get_vocab(with_added_tokens=True)
-        added_vocab = dict((tok, index) for tok, index in full_vocab.items() if tok not in base_vocab)
+        added_vocab = {tok: index for tok, index in full_vocab.items() if tok not in base_vocab}
         return added_vocab

     def __len__(self) -> int:
src/transformers/trainer.py (view file @ 5e8c8eb5)

@@ -1081,7 +1081,7 @@ class Trainer:
                 skipped = 0
                 for module in opt_model.modules():
                     if isinstance(module, nn.Embedding):
-                        skipped += sum(dict((p.data_ptr(), p.numel()) for p in module.parameters()).values())
+                        skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values())
                         print(f"skipped {module}: {skipped/2**20}M params")
                         manager.register_module_override(module, "weight", {"optim_bits": 32})
                         logger.debug(f"bitsandbytes: will optimize {module} in fp32")

@@ -2564,12 +2564,12 @@ class Trainer:
         elif isinstance(data, (tuple, list)):
             return type(data)(self._prepare_input(v) for v in data)
         elif isinstance(data, torch.Tensor):
-            kwargs = dict(device=self.args.device)
+            kwargs = {"device": self.args.device}
             if self.deepspeed and data.dtype != torch.int64:
                 # NLP models inputs are int64 and those get adjusted to the right dtype of the
                 # embedding. Other models such as wav2vec2's inputs are already float and thus
                 # may need special handling to match the dtypes of the model
-                kwargs.update(dict(dtype=self.args.hf_deepspeed_config.dtype()))
+                kwargs.update({"dtype": self.args.hf_deepspeed_config.dtype()})
             return data.to(**kwargs)
         return data
src/transformers/trainer_pt_utils.py (view file @ 5e8c8eb5)

@@ -534,7 +534,7 @@ def get_length_grouped_indices(lengths, batch_size, mega_batch_mult=None, genera
     indices = torch.randperm(len(lengths), generator=generator)
     megabatch_size = mega_batch_mult * batch_size
     megabatches = [indices[i : i + megabatch_size].tolist() for i in range(0, len(lengths), megabatch_size)]
-    megabatches = [list(sorted(megabatch, key=lambda i: lengths[i], reverse=True)) for megabatch in megabatches]
+    megabatches = [sorted(megabatch, key=lambda i: lengths[i], reverse=True) for megabatch in megabatches]

     # The rest is to get the biggest batch first.
     # Since each megabatch is sorted by descending length, the longest element is the first
src/transformers/trainer_utils.py (view file @ 5e8c8eb5)

@@ -505,21 +505,21 @@ class TrainerMemoryTracker:
         if self.torch is not None:
             self.gpu_mem_used_now = self.torch.cuda.memory_allocated()
             self.gpu_mem_used_peak = self.torch.cuda.max_memory_allocated()
-            self.gpu[self.cur_stage] = dict(
-                begin=self.gpu_mem_used_at_start,
-                end=self.gpu_mem_used_now,
-                alloc=(self.gpu_mem_used_now - self.gpu_mem_used_at_start),
-                peaked=max(0, self.gpu_mem_used_peak - self.gpu_mem_used_now),
-            )
+            self.gpu[self.cur_stage] = {
+                "begin": self.gpu_mem_used_at_start,
+                "end": self.gpu_mem_used_now,
+                "alloc": (self.gpu_mem_used_now - self.gpu_mem_used_at_start),
+                "peaked": max(0, self.gpu_mem_used_peak - self.gpu_mem_used_now),
+            }

         # cpu
         self.cpu_mem_used_now = self.cpu_mem_used()
-        self.cpu[self.cur_stage] = dict(
-            begin=self.cpu_mem_used_at_start,
-            end=self.cpu_mem_used_now,
-            alloc=(self.cpu_mem_used_now - self.cpu_mem_used_at_start),
-            peaked=max(0, self.cpu_mem_used_peak - self.cpu_mem_used_now),
-        )
+        self.cpu[self.cur_stage] = {
+            "begin": self.cpu_mem_used_at_start,
+            "end": self.cpu_mem_used_now,
+            "alloc": (self.cpu_mem_used_now - self.cpu_mem_used_at_start),
+            "peaked": max(0, self.cpu_mem_used_peak - self.cpu_mem_used_now),
+        }

         # reset - cycle finished
         self.cur_stage = None
src/transformers/training_args.py (view file @ 5e8c8eb5)

@@ -1874,7 +1874,7 @@ class TrainingArguments:
        the token values by removing their value.
        """
        # filter out fields that are defined as field(init=False)
-        d = dict((field.name, getattr(self, field.name)) for field in fields(self) if field.init)
+        d = {field.name: getattr(self, field.name) for field in fields(self) if field.init}

        for k, v in d.items():
            if isinstance(v, Enum):
src/transformers/utils/doc.py (view file @ 5e8c8eb5)

@@ -1085,19 +1085,19 @@ def add_code_sample_docstrings(
         # putting all kwargs for docstrings in a dict to be used
         # with the `.format(**doc_kwargs)`. Note that string might
         # be formatted with non-existing keys, which is fine.
-        doc_kwargs = dict(
-            model_class=model_class,
-            processor_class=processor_class,
-            checkpoint=checkpoint,
-            mask=mask,
-            qa_target_start_index=qa_target_start_index,
-            qa_target_end_index=qa_target_end_index,
-            expected_output=expected_output,
-            expected_loss=expected_loss,
-            real_checkpoint=real_checkpoint,
-            fake_checkpoint=checkpoint,
-            true="{true}",  # For <Tip warning={true}> syntax that conflicts with formatting.
-        )
+        doc_kwargs = {
+            "model_class": model_class,
+            "processor_class": processor_class,
+            "checkpoint": checkpoint,
+            "mask": mask,
+            "qa_target_start_index": qa_target_start_index,
+            "qa_target_end_index": qa_target_end_index,
+            "expected_output": expected_output,
+            "expected_loss": expected_loss,
+            "real_checkpoint": real_checkpoint,
+            "fake_checkpoint": checkpoint,
+            "true": "{true}",  # For <Tip warning={true}> syntax that conflicts with formatting.
+        }

         if ("SequenceClassification" in model_class or "AudioClassification" in model_class) and modality == "audio":
             code_sample = sample_docstrings["AudioClassification"]
src/transformers/utils/hp_naming.py (view file @ 5e8c8eb5)

@@ -96,12 +96,12 @@ class TrialShortNamer:
         if cls.NAMING_INFO is not None:
             return

-        info = dict(
-            short_word={},
-            reverse_short_word={},
-            short_param={},
-            reverse_short_param={},
-        )
+        info = {
+            "short_word": {},
+            "reverse_short_word": {},
+            "short_param": {},
+            "reverse_short_param": {},
+        }

         field_keys = list(cls.DEFAULTS.keys())
src/transformers/utils/hub.py (view file @ 5e8c8eb5)

@@ -902,7 +902,7 @@ def get_checkpoint_shard_files(
     with open(index_filename, "r") as f:
         index = json.loads(f.read())

-    shard_filenames = sorted(list(set(index["weight_map"].values())))
+    shard_filenames = sorted(set(index["weight_map"].values()))
     sharded_metadata = index["metadata"]
     sharded_metadata["all_checkpoint_keys"] = list(index["weight_map"].keys())
     sharded_metadata["weight_map"] = index["weight_map"].copy()
src/transformers/utils/model_parallel_utils.py (view file @ 5e8c8eb5)

@@ -51,6 +51,6 @@ def get_device_map(n_layers, devices):
     """Returns a dictionary of layers distributed evenly across all devices."""
     layers = list(range(n_layers))
     n_blocks = int(ceil(n_layers / len(devices)))
-    layers_list = list(layers[i : i + n_blocks] for i in range(0, n_layers, n_blocks))
+    layers_list = [layers[i : i + n_blocks] for i in range(0, n_layers, n_blocks)]

     return dict(zip(devices, layers_list))
tests/deepspeed/test_deepspeed.py (view file @ 5e8c8eb5)

@@ -157,9 +157,13 @@ class CoreIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
         super().setUp()

         master_port = get_master_port(real_launcher=False)
-        self.dist_env_1_gpu = dict(
-            MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
-        )
+        self.dist_env_1_gpu = {
+            "MASTER_ADDR": "localhost",
+            "MASTER_PORT": master_port,
+            "RANK": "0",
+            "LOCAL_RANK": "0",
+            "WORLD_SIZE": "1",
+        }

     def tearDown(self):
         super().tearDown()

@@ -212,14 +216,18 @@ class TrainerIntegrationDeepSpeedWithCustomConfig(TestCasePlus):
         self.batch_size = args.train_batch_size

         master_port = get_master_port(real_launcher=False)
-        self.dist_env_1_gpu = dict(
-            MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
-        )
+        self.dist_env_1_gpu = {
+            "MASTER_ADDR": "localhost",
+            "MASTER_PORT": master_port,
+            "RANK": "0",
+            "LOCAL_RANK": "0",
+            "WORLD_SIZE": "1",
+        }

-        self.ds_config_file = dict(
-            zero2=f"{self.test_file_dir_str}/ds_config_zero2.json",
-            zero3=f"{self.test_file_dir_str}/ds_config_zero3.json",
-        )
+        self.ds_config_file = {
+            "zero2": f"{self.test_file_dir_str}/ds_config_zero2.json",
+            "zero3": f"{self.test_file_dir_str}/ds_config_zero3.json",
+        }

         # use self.get_config_dict(stage) to use these to ensure the original is not modified
         with io.open(self.ds_config_file[ZERO2], "r", encoding="utf-8") as f:

@@ -230,10 +238,10 @@ class TrainerIntegrationDeepSpeedWithCustomConfig(TestCasePlus):
         # It's in the file as a demo for users since we want everything to work out of the box even if slower.
         config_zero3["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = False

-        self.ds_config_dict = dict(
-            zero2=config_zero2,
-            zero3=config_zero3,
-        )
+        self.ds_config_dict = {
+            "zero2": config_zero2,
+            "zero3": config_zero3,
+        }

     def tearDown(self):
         super().tearDown()

@@ -370,7 +378,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         # this actually doesn't have to be on NVMe, any storage will do since this test only
         # runs a simple check that we can use some directory as if it were NVMe
         nvme_path = self.get_auto_remove_tmp_dir()
-        nvme_config = dict(device="nvme", nvme_path=nvme_path)
+        nvme_config = {"device": "nvme", "nvme_path": nvme_path}
         ds_config_zero3_dict = self.get_config_dict(ZERO3)
         ds_config_zero3_dict["zero_optimization"]["offload_optimizer"] = nvme_config
         ds_config_zero3_dict["zero_optimization"]["offload_param"] = nvme_config

@@ -415,7 +423,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         # force cpu offload
         ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
         with mockenv_context(**self.dist_env_1_gpu):
-            kwargs = dict(local_rank=0, deepspeed=ds_config_dict)
+            kwargs = {"local_rank": 0, "deepspeed": ds_config_dict}
             kwargs[dtype] = True
             trainer = get_regression_trainer(**kwargs)
             with CaptureLogger(deepspeed_logger) as cl:

@@ -431,7 +439,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         # it's run not as a first test as `sys.stdout` will no longer be the same. So we either have
         # to reset `deepspeed_logger.handlers[0].setStream(sys.stdout)` or directly capture from the deepspeed_logger.
         with mockenv_context(**self.dist_env_1_gpu):
-            kwargs = dict(local_rank=0, deepspeed=self.get_config_dict(stage))
+            kwargs = {"local_rank": 0, "deepspeed": self.get_config_dict(stage)}
             kwargs[dtype] = True
             trainer = get_regression_trainer(**kwargs)

@@ -449,15 +457,15 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         # `self.lr_scheduler.get_last_lr()` and originally it'd fail on the very first step.
         with mockenv_context(**self.dist_env_1_gpu):
             a = b = 0.0
-            kwargs = dict(
-                a=a,
-                b=b,
-                local_rank=0,
-                train_len=8,
-                deepspeed=self.get_config_dict(stage),
-                per_device_train_batch_size=8,
-                logging_steps=1,
-            )
+            kwargs = {
+                "a": a,
+                "b": b,
+                "local_rank": 0,
+                "train_len": 8,
+                "deepspeed": self.get_config_dict(stage),
+                "per_device_train_batch_size": 8,
+                "logging_steps": 1,
+            }
             kwargs[dtype] = True
             trainer = get_regression_trainer(**kwargs)

@@ -494,13 +502,13 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         train_len = 64
         a = b = 0.0
-        kwargs = dict(
-            a=a,
-            b=b,
-            local_rank=0,
-            train_len=train_len,
-            deepspeed=self.get_config_dict(stage),
-        )
+        kwargs = {
+            "a": a,
+            "b": b,
+            "local_rank": 0,
+            "train_len": train_len,
+            "deepspeed": self.get_config_dict(stage),
+        }
         kwargs[dtype] = True

         with mockenv_context(**self.dist_env_1_gpu):

@@ -583,11 +591,11 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         # save checkpoints
         with mockenv_context(**self.dist_env_1_gpu):
-            kwargs = dict(
-                output_dir=output_dir,
-                save_steps=freq,
-                deepspeed=ds_config_dict,
-            )
+            kwargs = {
+                "output_dir": output_dir,
+                "save_steps": freq,
+                "deepspeed": ds_config_dict,
+            }
             kwargs[dtype] = True
             trainer = get_regression_trainer(**kwargs)
             trainer.train()

@@ -600,7 +608,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         with mockenv_context(**self.dist_env_1_gpu):
             ds_config_dict = self.get_config_dict(stage)
             output_dir = self.get_auto_remove_tmp_dir()
-            kwargs = dict(output_dir=output_dir, deepspeed=ds_config_dict)
+            kwargs = {"output_dir": output_dir, "deepspeed": ds_config_dict}
             kwargs[dtype] = True
             trainer = get_regression_trainer(**kwargs)

@@ -632,7 +640,13 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         if stage == ZERO3:
             ds_config_dict["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = True

-        kwargs = dict(output_dir=output_dir, train_len=128, save_steps=5, learning_rate=0.1, deepspeed=ds_config_dict)
+        kwargs = {
+            "output_dir": output_dir,
+            "train_len": 128,
+            "save_steps": 5,
+            "learning_rate": 0.1,
+            "deepspeed": ds_config_dict,
+        }
         kwargs[dtype] = True

         with mockenv_context(**self.dist_env_1_gpu):

@@ -679,16 +693,16 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         ds_config_dict = self.get_config_dict(stage)

-        kwargs = dict(
-            output_dir=output_dir,
-            train_len=4,
-            per_device_train_batch_size=4,
-            num_train_epochs=1,
-            save_strategy="steps",
-            save_steps=1,
-            learning_rate=0.1,
-            deepspeed=ds_config_dict,
-        )
+        kwargs = {
+            "output_dir": output_dir,
+            "train_len": 4,
+            "per_device_train_batch_size": 4,
+            "num_train_epochs": 1,
+            "save_strategy": "steps",
+            "save_steps": 1,
+            "learning_rate": 0.1,
+            "deepspeed": ds_config_dict,
+        }
         kwargs[dtype] = True

         with mockenv_context(**self.dist_env_1_gpu):

@@ -710,7 +724,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         # test that we can switch from zero2 to zero3 in the same process for example
         # test is_zero, etc.
         output_dir = self.get_auto_remove_tmp_dir()
-        kwargs = dict(output_dir=output_dir, train_len=8, fp16=True)
+        kwargs = {"output_dir": output_dir, "train_len": 8, "fp16": True}

         ds_config_zero3_dict = self.get_config_dict(ZERO3)
         ds_config_zero2_dict = self.get_config_dict(ZERO2)

@@ -808,7 +822,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         def get_dataset():
             data_file = str(self.tests_dir / "fixtures/tests_samples/SQUAD/sample.json")
-            data_files = dict(train=data_file, validation=data_file)
+            data_files = {"train": data_file, "validation": data_file}
             raw_datasets = datasets.load_dataset("json", data_files=data_files, field="data")
             train_dataset = raw_datasets["train"].map(_add_eos_to_examples).map(_convert_to_features, batched=True)
             valid_dataset = deepcopy(train_dataset)

@@ -903,7 +917,14 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
         do_train = True
         do_eval = False
-        kwargs = dict(stage=stage, dtype=dtype, eval_steps=1, distributed=True, do_train=do_train, do_eval=do_eval)
+        kwargs = {
+            "stage": stage,
+            "dtype": dtype,
+            "eval_steps": 1,
+            "distributed": True,
+            "do_train": do_train,
+            "do_eval": do_eval,
+        }

         # 1. normal training
         output_dir = self.run_and_check(**kwargs)
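The test changes above also show the one practical difference between the two spellings: dict() with keyword arguments only accepts keys that are valid Python identifiers, while a dict literal takes any string key, which is one reason the literal form is the more general spelling and the one flake8-comprehensions prefers. A small illustrative sketch (not code from the commit):

env = dict(MASTER_ADDR="localhost", WORLD_SIZE="1")
assert env == {"MASTER_ADDR": "localhost", "WORLD_SIZE": "1"}  # identical result either way

# Only the literal form can spell keys that are not valid identifiers:
headers = {"Content-Type": "application/json"}  # dict(Content-Type=...) would be a SyntaxError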