chenpangpang / transformers · Commits

Commit 5e8c8eb5 (Unverified)
Authored Feb 22, 2023 by Aaron Gokaslan; committed by GitHub, Feb 22, 2023

Apply ruff flake8-comprehensions (#21694)

Parent: df06fb1f
Changes: 230 · Showing 20 changed files with 158 additions and 137 deletions (+158 -137)
src/transformers/models/xmod/convert_xmod_original_pytorch_checkpoint_to_pytorch.py  +1 -1
src/transformers/models/xmod/modeling_xmod.py  +1 -1
src/transformers/models/yolos/image_processing_yolos.py  +1 -1
src/transformers/onnx/convert.py  +1 -1
src/transformers/optimization.py  +12 -12
src/transformers/optimization_tf.py  +2 -2
src/transformers/pipelines/base.py  +1 -1
src/transformers/pipelines/question_answering.py  +1 -1
src/transformers/tokenization_utils.py  +4 -4
src/transformers/tokenization_utils_base.py  +21 -21
src/transformers/tokenization_utils_fast.py  +1 -1
src/transformers/trainer.py  +3 -3
src/transformers/trainer_pt_utils.py  +1 -1
src/transformers/trainer_utils.py  +12 -12
src/transformers/training_args.py  +1 -1
src/transformers/utils/doc.py  +13 -13
src/transformers/utils/hp_naming.py  +6 -6
src/transformers/utils/hub.py  +1 -1
src/transformers/utils/model_parallel_utils.py  +1 -1
tests/deepspeed/test_deepspeed.py  +74 -53
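The hunks below all follow a handful of flake8-comprehensions rewrites (ruff's C4xx rules). A minimal sketch of the recurring before/after patterns, with invented variable names (items, pairs) rather than anything from the diff:

# Each pair builds the same object; only the spelling changes.
items = ["b", "a", "c"]
pairs = [("a", 1), ("b", 2)]

d_old = dict(x=1, y=2)                      # C408: dict() call with keyword arguments
d_new = {"x": 1, "y": 2}

l_old = list(s.upper() for s in items)      # C400: generator passed to list()
l_new = [s.upper() for s in items]

m_old = dict((k, v) for k, v in pairs)      # C402: generator of pairs passed to dict()
m_new = {k: v for k, v in pairs}

copy_old = [x for x in items]               # C416: identity comprehension
copy_new = list(items)

sort_old = list(sorted(items))              # C413: list() around sorted()
sort_new = sorted(items)

assert d_old == d_new and l_old == l_new and m_old == m_new
assert copy_old == copy_new and sort_old == sort_new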
src/transformers/models/xmod/convert_xmod_original_pytorch_checkpoint_to_pytorch.py

@@ -142,7 +142,7 @@ def convert_xmod_checkpoint_to_pytorch(
         bert_output.adapter_layer_norm.weight = xmod_layer.adapter_layer_norm.weight
         bert_output.adapter_layer_norm.bias = xmod_layer.adapter_layer_norm.bias
-        if list(sorted(bert_output.adapter_modules.keys())) != list(sorted(xmod_layer.adapter_modules.keys())):
+        if sorted(bert_output.adapter_modules.keys()) != sorted(xmod_layer.adapter_modules.keys()):
             raise AssertionError("Lists of language adapters do not match.")
         for lang_code, adapter in xmod_layer.adapter_modules.items():
             to_adapter = bert_output.adapter_modules[lang_code]
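Note: the only change in this hunk is dropping list() around sorted(). sorted() already returns a new list, so the wrapper is a no-op; a tiny sketch with invented adapter keys:

keys_a = {"en_XX", "de_DE"}
keys_b = {"de_DE", "en_XX"}
assert list(sorted(keys_a)) == sorted(keys_a) == ["de_DE", "en_XX"]
assert sorted(keys_a) == sorted(keys_b)  # the equality check behaves exactly as before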
src/transformers/models/xmod/modeling_xmod.py

@@ -395,7 +395,7 @@ class XmodOutput(nn.Module):
         else:
             self.adapter_layer_norm = None
         self.adapter_reuse_layer_norm = config.adapter_reuse_layer_norm
-        self.adapter_modules = nn.ModuleDict(dict())
+        self.adapter_modules = nn.ModuleDict({})
         for language in config.languages:
             self.adapter_modules[str(language)] = XmodAdapter(config)
src/transformers/models/yolos/image_processing_yolos.py

@@ -515,7 +515,7 @@ def binary_mask_to_rle(mask):
     pixels = np.concatenate([[0], pixels, [0]])
     runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
     runs[1::2] -= runs[::2]
-    return [x for x in runs]
+    return list(runs)


 # Copied from transformers.models.detr.image_processing_detr.convert_segmentation_to_rle
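Note: [x for x in runs] is an identity comprehension (C416); list(runs) builds the same Python list, including for a NumPy array, where both yield a list of NumPy scalars. A quick check, assuming numpy is available:

import numpy as np

runs = np.array([2, 3, 7, 1])
assert [x for x in runs] == list(runs)  # same elements, same scalar types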
src/transformers/onnx/convert.py

@@ -145,7 +145,7 @@ def export_pytorch(
             device = torch.device(device)
             if device.type == "cuda" and torch.cuda.is_available():
                 model.to(device)
-                model_inputs_device = dict()
+                model_inputs_device = {}
                 for k, v in model_inputs.items():
                     if isinstance(v, Tuple):
                         model_inputs_device[k] = tuple(
src/transformers/optimization.py

@@ -358,7 +358,7 @@ class AdamW(Optimizer):
             raise ValueError(f"Invalid beta parameter: {betas[1]} - should be in [0.0, 1.0)")
         if not 0.0 <= eps:
             raise ValueError(f"Invalid epsilon value: {eps} - should be >= 0.0")
-        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, correct_bias=correct_bias)
+        defaults = {"lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay, "correct_bias": correct_bias}
         super().__init__(params, defaults)

     def step(self, closure: Callable = None):
@@ -527,17 +527,17 @@ class Adafactor(Optimizer):
         if warmup_init and not relative_step:
             raise ValueError("`warmup_init=True` requires `relative_step=True`")

-        defaults = dict(
-            lr=lr,
-            eps=eps,
-            clip_threshold=clip_threshold,
-            decay_rate=decay_rate,
-            beta1=beta1,
-            weight_decay=weight_decay,
-            scale_parameter=scale_parameter,
-            relative_step=relative_step,
-            warmup_init=warmup_init,
-        )
+        defaults = {
+            "lr": lr,
+            "eps": eps,
+            "clip_threshold": clip_threshold,
+            "decay_rate": decay_rate,
+            "beta1": beta1,
+            "weight_decay": weight_decay,
+            "scale_parameter": scale_parameter,
+            "relative_step": relative_step,
+            "warmup_init": warmup_init,
+        }
         super().__init__(params, defaults)

     @staticmethod
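Note: the AdamW/Adafactor changes are purely stylistic (C408): keyword arguments to dict() and string keys in a literal build identical mappings, with the literal skipping a name lookup and a function call. A sketch with invented hyperparameter values:

lr, eps, weight_decay = 1e-3, 1e-8, 0.0
assert dict(lr=lr, eps=eps, weight_decay=weight_decay) == {
    "lr": lr,
    "eps": eps,
    "weight_decay": weight_decay,
}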
src/transformers/optimization_tf.py

@@ -262,7 +262,7 @@ class AdamWeightDecay(Adam):
             coefficients = self._fallback_apply_state(var_device, var_dtype)
             apply_state[(var_device, var_dtype)] = coefficients

-        return coefficients["lr_t"], dict(apply_state=apply_state)
+        return coefficients["lr_t"], {"apply_state": apply_state}

     def _resource_apply_dense(self, grad, var, apply_state=None):
         lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
@@ -333,7 +333,7 @@ class GradientAccumulator(object):
         """The accumulated gradients on the current replica."""
         if not self._gradients:
             raise ValueError("The accumulator should be called first to initialize the gradients")
-        return list(gradient.value() if gradient is not None else gradient for gradient in self._gradients)
+        return [gradient.value() if gradient is not None else gradient for gradient in self._gradients]

     def __call__(self, gradients):
         """Accumulates `gradients` on the current replica."""
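Note: in the GradientAccumulator hunk, the generator expression fed to list() becomes a list comprehension (C400); the result is identical, the comprehension just skips the intermediate generator object. A stand-in sketch (g * 2 plays the role of gradient.value(), None a not-yet-created slot):

_gradients = [1.0, None, 3.0]
old = list(g * 2 if g is not None else g for g in _gradients)
new = [g * 2 if g is not None else g for g in _gradients]
assert old == new == [2.0, None, 6.0]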
src/transformers/pipelines/base.py

@@ -1083,7 +1083,7 @@ class Pipeline(_ScikitCompat):
             final_iterator = self.get_iterator(
                 inputs, num_workers, batch_size, preprocess_params, forward_params, postprocess_params
             )
-            outputs = [output for output in final_iterator]
+            outputs = list(final_iterator)
             return outputs
         else:
             return self.run_multi(inputs, preprocess_params, forward_params, postprocess_params)
src/transformers/pipelines/question_answering.py

@@ -210,7 +210,7 @@ class QuestionAnsweringArgumentHandler(ArgumentHandler):
                 inputs = [inputs]
             elif isinstance(inputs, Iterable):
                 # Copy to avoid overriding arguments
-                inputs = [i for i in inputs]
+                inputs = list(inputs)
             else:
                 raise ValueError(f"Invalid arguments {kwargs}")
src/transformers/tokenization_utils.py

@@ -425,7 +425,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
                 if self.verbose:
                     logger.info(f"Adding {token} to the vocabulary")

-        added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(tokens_to_add))
+        added_tok_encoder = {tok: len(self) + i for i, tok in enumerate(tokens_to_add)}
         added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
         self.added_tokens_encoder.update(added_tok_encoder)
         self.added_tokens_decoder.update(added_tok_decoder)
@@ -495,9 +495,9 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
             `List[str]`: The list of tokens.
         """
         # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors
-        all_special_tokens_extended = dict(
-            (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
-        )
+        all_special_tokens_extended = {
+            str(t): t for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
+        }

         text, kwargs = self.prepare_for_tokenization(text, **kwargs)
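Note: dict() over a generator of (key, value) tuples becomes a dict comprehension (C402), which reads more directly and avoids allocating a tuple per item. A sketch mirroring the added-token id assignment, with invented tokens and an assumed starting vocab size:

tokens_to_add = ["<tok_a>", "<tok_b>"]
vocab_size = 100  # stands in for len(self)

old = dict((tok, vocab_size + i) for i, tok in enumerate(tokens_to_add))
new = {tok: vocab_size + i for i, tok in enumerate(tokens_to_add)}
assert old == new == {"<tok_a>": 100, "<tok_b>": 101}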
src/transformers/tokenization_utils_base.py

@@ -1918,7 +1918,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
                 obj.pop("__type")
                 return AddedToken(**obj)
             elif isinstance(obj, (list, tuple)):
-                return list(convert_added_tokens(o) for o in obj)
+                return [convert_added_tokens(o) for o in obj]
             elif isinstance(obj, dict):
                 return {k: convert_added_tokens(v) for k, v in obj.items()}
             return obj
@@ -1992,7 +1992,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
             added_tok_encoder = json.load(added_tokens_handle)

             # Sort added tokens by index
-            added_tok_encoder_sorted = list(sorted(added_tok_encoder.items(), key=lambda x: x[1]))
+            added_tok_encoder_sorted = sorted(added_tok_encoder.items(), key=lambda x: x[1])

             # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for
             # individual tokens would repeatedly rebuild a trie, which can be slow.
@@ -2129,7 +2129,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
                 out["__type"] = "AddedToken"
                 return out
             elif isinstance(obj, (list, tuple)):
-                return list(convert_added_tokens(o, add_type_field=add_type_field) for o in obj)
+                return [convert_added_tokens(o, add_type_field=add_type_field) for o in obj]
             elif isinstance(obj, dict):
                 return {k: convert_added_tokens(v, add_type_field=add_type_field) for k, v in obj.items()}
             return obj
@@ -2502,23 +2502,23 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
            you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
        """
        # To avoid duplicating
-        all_kwargs = dict(
-            add_special_tokens=add_special_tokens,
-            padding=padding,
-            truncation=truncation,
-            max_length=max_length,
-            stride=stride,
-            is_split_into_words=is_split_into_words,
-            pad_to_multiple_of=pad_to_multiple_of,
-            return_tensors=return_tensors,
-            return_token_type_ids=return_token_type_ids,
-            return_attention_mask=return_attention_mask,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_offsets_mapping=return_offsets_mapping,
-            return_length=return_length,
-            verbose=verbose,
-        )
+        all_kwargs = {
+            "add_special_tokens": add_special_tokens,
+            "padding": padding,
+            "truncation": truncation,
+            "max_length": max_length,
+            "stride": stride,
+            "is_split_into_words": is_split_into_words,
+            "pad_to_multiple_of": pad_to_multiple_of,
+            "return_tensors": return_tensors,
+            "return_token_type_ids": return_token_type_ids,
+            "return_attention_mask": return_attention_mask,
+            "return_overflowing_tokens": return_overflowing_tokens,
+            "return_special_tokens_mask": return_special_tokens_mask,
+            "return_offsets_mapping": return_offsets_mapping,
+            "return_length": return_length,
+            "verbose": verbose,
+        }
         all_kwargs.update(kwargs)
         if text is None and text_target is None:
             raise ValueError("You need to specify either `text` or `text_target`.")
@@ -3010,7 +3010,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         batch_outputs = {}
         for i in range(batch_size):
-            inputs = dict((k, v[i]) for k, v in encoded_inputs.items())
+            inputs = {k: v[i] for k, v in encoded_inputs.items()}

             outputs = self._pad(
                 inputs,
                 max_length=max_length,
src/transformers/tokenization_utils_fast.py

@@ -162,7 +162,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         """
         base_vocab = self._tokenizer.get_vocab(with_added_tokens=False)
         full_vocab = self._tokenizer.get_vocab(with_added_tokens=True)
-        added_vocab = dict((tok, index) for tok, index in full_vocab.items() if tok not in base_vocab)
+        added_vocab = {tok: index for tok, index in full_vocab.items() if tok not in base_vocab}
         return added_vocab

     def __len__(self) -> int:
src/transformers/trainer.py

@@ -1081,7 +1081,7 @@ class Trainer:
                     skipped = 0
                     for module in opt_model.modules():
                         if isinstance(module, nn.Embedding):
-                            skipped += sum(dict((p.data_ptr(), p.numel()) for p in module.parameters()).values())
+                            skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values())
                             print(f"skipped {module}: {skipped/2**20}M params")
                             manager.register_module_override(module, "weight", {"optim_bits": 32})
                             logger.debug(f"bitsandbytes: will optimize {module} in fp32")
@@ -2564,12 +2564,12 @@ class Trainer:
         elif isinstance(data, (tuple, list)):
             return type(data)(self._prepare_input(v) for v in data)
         elif isinstance(data, torch.Tensor):
-            kwargs = dict(device=self.args.device)
+            kwargs = {"device": self.args.device}
             if self.deepspeed and data.dtype != torch.int64:
                 # NLP models inputs are int64 and those get adjusted to the right dtype of the
                 # embedding. Other models such as wav2vec2's inputs are already float and thus
                 # may need special handling to match the dtypes of the model
-                kwargs.update(dict(dtype=self.args.hf_deepspeed_config.dtype()))
+                kwargs.update({"dtype": self.args.hf_deepspeed_config.dtype()})
             return data.to(**kwargs)
         return data
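Note: the Trainer hunk only respells the dict; the underlying trick is unchanged: keying by p.data_ptr() collapses parameters that share storage (e.g. tied embeddings) so their sizes are counted once before summing. A small sketch, assuming torch is installed:

import torch

emb = torch.nn.Embedding(10, 4)
tied = emb.weight  # a second reference to the same storage, like tied input/output embeddings

params = [emb.weight, tied]
unique_numel = sum({p.data_ptr(): p.numel() for p in params}.values())
assert unique_numel == 40  # counted once, not 80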
...
src/transformers/trainer_pt_utils.py
View file @
5e8c8eb5
...
@@ -534,7 +534,7 @@ def get_length_grouped_indices(lengths, batch_size, mega_batch_mult=None, genera
...
@@ -534,7 +534,7 @@ def get_length_grouped_indices(lengths, batch_size, mega_batch_mult=None, genera
indices
=
torch
.
randperm
(
len
(
lengths
),
generator
=
generator
)
indices
=
torch
.
randperm
(
len
(
lengths
),
generator
=
generator
)
megabatch_size
=
mega_batch_mult
*
batch_size
megabatch_size
=
mega_batch_mult
*
batch_size
megabatches
=
[
indices
[
i
:
i
+
megabatch_size
].
tolist
()
for
i
in
range
(
0
,
len
(
lengths
),
megabatch_size
)]
megabatches
=
[
indices
[
i
:
i
+
megabatch_size
].
tolist
()
for
i
in
range
(
0
,
len
(
lengths
),
megabatch_size
)]
megabatches
=
[
list
(
sorted
(
megabatch
,
key
=
lambda
i
:
lengths
[
i
],
reverse
=
True
)
)
for
megabatch
in
megabatches
]
megabatches
=
[
sorted
(
megabatch
,
key
=
lambda
i
:
lengths
[
i
],
reverse
=
True
)
for
megabatch
in
megabatches
]
# The rest is to get the biggest batch first.
# The rest is to get the biggest batch first.
# Since each megabatch is sorted by descending length, the longest element is the first
# Since each megabatch is sorted by descending length, the longest element is the first
...
...
src/transformers/trainer_utils.py

@@ -505,21 +505,21 @@ class TrainerMemoryTracker:
         if self.torch is not None:
             self.gpu_mem_used_now = self.torch.cuda.memory_allocated()
             self.gpu_mem_used_peak = self.torch.cuda.max_memory_allocated()
-            self.gpu[self.cur_stage] = dict(
-                begin=self.gpu_mem_used_at_start,
-                end=self.gpu_mem_used_now,
-                alloc=(self.gpu_mem_used_now - self.gpu_mem_used_at_start),
-                peaked=max(0, self.gpu_mem_used_peak - self.gpu_mem_used_now),
-            )
+            self.gpu[self.cur_stage] = {
+                "begin": self.gpu_mem_used_at_start,
+                "end": self.gpu_mem_used_now,
+                "alloc": (self.gpu_mem_used_now - self.gpu_mem_used_at_start),
+                "peaked": max(0, self.gpu_mem_used_peak - self.gpu_mem_used_now),
+            }

         # cpu
         self.cpu_mem_used_now = self.cpu_mem_used()
-        self.cpu[self.cur_stage] = dict(
-            begin=self.cpu_mem_used_at_start,
-            end=self.cpu_mem_used_now,
-            alloc=(self.cpu_mem_used_now - self.cpu_mem_used_at_start),
-            peaked=max(0, self.cpu_mem_used_peak - self.cpu_mem_used_now),
-        )
+        self.cpu[self.cur_stage] = {
+            "begin": self.cpu_mem_used_at_start,
+            "end": self.cpu_mem_used_now,
+            "alloc": (self.cpu_mem_used_now - self.cpu_mem_used_at_start),
+            "peaked": max(0, self.cpu_mem_used_peak - self.cpu_mem_used_now),
+        }

         # reset - cycle finished
         self.cur_stage = None
src/transformers/training_args.py

@@ -1874,7 +1874,7 @@ class TrainingArguments:
        the token values by removing their value.
        """
        # filter out fields that are defined as field(init=False)
-        d = dict((field.name, getattr(self, field.name)) for field in fields(self) if field.init)
+        d = {field.name: getattr(self, field.name) for field in fields(self) if field.init}
        for k, v in d.items():
            if isinstance(v, Enum):
src/transformers/utils/doc.py

@@ -1085,19 +1085,19 @@ def add_code_sample_docstrings(
        # putting all kwargs for docstrings in a dict to be used
        # with the `.format(**doc_kwargs)`. Note that string might
        # be formatted with non-existing keys, which is fine.
-        doc_kwargs = dict(
-            model_class=model_class,
-            processor_class=processor_class,
-            checkpoint=checkpoint,
-            mask=mask,
-            qa_target_start_index=qa_target_start_index,
-            qa_target_end_index=qa_target_end_index,
-            expected_output=expected_output,
-            expected_loss=expected_loss,
-            real_checkpoint=real_checkpoint,
-            fake_checkpoint=checkpoint,
-            true="{true}",  # For <Tip warning={true}> syntax that conflicts with formatting.
-        )
+        doc_kwargs = {
+            "model_class": model_class,
+            "processor_class": processor_class,
+            "checkpoint": checkpoint,
+            "mask": mask,
+            "qa_target_start_index": qa_target_start_index,
+            "qa_target_end_index": qa_target_end_index,
+            "expected_output": expected_output,
+            "expected_loss": expected_loss,
+            "real_checkpoint": real_checkpoint,
+            "fake_checkpoint": checkpoint,
+            "true": "{true}",  # For <Tip warning={true}> syntax that conflicts with formatting.
+        }

        if ("SequenceClassification" in model_class or "AudioClassification" in model_class) and modality == "audio":
            code_sample = sample_docstrings["AudioClassification"]
src/transformers/utils/hp_naming.py

@@ -96,12 +96,12 @@ class TrialShortNamer:
        if cls.NAMING_INFO is not None:
            return

-        info = dict(
-            short_word={},
-            reverse_short_word={},
-            short_param={},
-            reverse_short_param={},
-        )
+        info = {
+            "short_word": {},
+            "reverse_short_word": {},
+            "short_param": {},
+            "reverse_short_param": {},
+        }

        field_keys = list(cls.DEFAULTS.keys())
src/transformers/utils/hub.py

@@ -902,7 +902,7 @@ def get_checkpoint_shard_files(
    with open(index_filename, "r") as f:
        index = json.loads(f.read())

-    shard_filenames = sorted(list(set(index["weight_map"].values())))
+    shard_filenames = sorted(set(index["weight_map"].values()))
    sharded_metadata = index["metadata"]
    sharded_metadata["all_checkpoint_keys"] = list(index["weight_map"].keys())
    sharded_metadata["weight_map"] = index["weight_map"].copy()
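Note: sorted() accepts any iterable and always returns a new list, so the inner list() around the set is redundant (C414). With an invented weight_map:

weight_map = {
    "layer.0.weight": "shard-1.bin",
    "layer.0.bias": "shard-1.bin",
    "lm_head.weight": "shard-2.bin",
}
assert sorted(list(set(weight_map.values()))) == sorted(set(weight_map.values())) == [
    "shard-1.bin",
    "shard-2.bin",
]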
src/transformers/utils/model_parallel_utils.py

@@ -51,6 +51,6 @@ def get_device_map(n_layers, devices):
    """Returns a dictionary of layers distributed evenly across all devices."""
    layers = list(range(n_layers))
    n_blocks = int(ceil(n_layers / len(devices)))
-    layers_list = list(layers[i : i + n_blocks] for i in range(0, n_layers, n_blocks))
+    layers_list = [layers[i : i + n_blocks] for i in range(0, n_layers, n_blocks)]
    return dict(zip(devices, layers_list))
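Note: swapping list(generator) for a list comprehension leaves get_device_map unchanged. A worked call, reproducing the small body shown in the hunk with invented layer/device counts:

from math import ceil

def get_device_map(n_layers, devices):
    """Chunk layer indices evenly across devices, as in the hunk above."""
    layers = list(range(n_layers))
    n_blocks = int(ceil(n_layers / len(devices)))
    layers_list = [layers[i : i + n_blocks] for i in range(0, n_layers, n_blocks)]
    return dict(zip(devices, layers_list))

assert get_device_map(6, [0, 1]) == {0: [0, 1, 2], 1: [3, 4, 5]}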
tests/deepspeed/test_deepspeed.py

@@ -157,9 +157,13 @@ class CoreIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
         super().setUp()

         master_port = get_master_port(real_launcher=False)
-        self.dist_env_1_gpu = dict(
-            MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
-        )
+        self.dist_env_1_gpu = {
+            "MASTER_ADDR": "localhost",
+            "MASTER_PORT": master_port,
+            "RANK": "0",
+            "LOCAL_RANK": "0",
+            "WORLD_SIZE": "1",
+        }

     def tearDown(self):
         super().tearDown()
@@ -212,14 +216,18 @@ class TrainerIntegrationDeepSpeedWithCustomConfig(TestCasePlus):
         self.batch_size = args.train_batch_size

         master_port = get_master_port(real_launcher=False)
-        self.dist_env_1_gpu = dict(
-            MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
-        )
+        self.dist_env_1_gpu = {
+            "MASTER_ADDR": "localhost",
+            "MASTER_PORT": master_port,
+            "RANK": "0",
+            "LOCAL_RANK": "0",
+            "WORLD_SIZE": "1",
+        }

-        self.ds_config_file = dict(
-            zero2=f"{self.test_file_dir_str}/ds_config_zero2.json",
-            zero3=f"{self.test_file_dir_str}/ds_config_zero3.json",
-        )
+        self.ds_config_file = {
+            "zero2": f"{self.test_file_dir_str}/ds_config_zero2.json",
+            "zero3": f"{self.test_file_dir_str}/ds_config_zero3.json",
+        }

         # use self.get_config_dict(stage) to use these to ensure the original is not modified
         with io.open(self.ds_config_file[ZERO2], "r", encoding="utf-8") as f:
@@ -230,10 +238,10 @@ class TrainerIntegrationDeepSpeedWithCustomConfig(TestCasePlus):
         # It's in the file as a demo for users since we want everything to work out of the box even if slower.
         config_zero3["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = False

-        self.ds_config_dict = dict(
-            zero2=config_zero2,
-            zero3=config_zero3,
-        )
+        self.ds_config_dict = {
+            "zero2": config_zero2,
+            "zero3": config_zero3,
+        }

     def tearDown(self):
         super().tearDown()
@@ -370,7 +378,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         # this actually doesn't have to be on NVMe, any storage will do since this test only
         # runs a simple check that we can use some directory as if it were NVMe
         nvme_path = self.get_auto_remove_tmp_dir()
-        nvme_config = dict(device="nvme", nvme_path=nvme_path)
+        nvme_config = {"device": "nvme", "nvme_path": nvme_path}
         ds_config_zero3_dict = self.get_config_dict(ZERO3)
         ds_config_zero3_dict["zero_optimization"]["offload_optimizer"] = nvme_config
         ds_config_zero3_dict["zero_optimization"]["offload_param"] = nvme_config
@@ -415,7 +423,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         # force cpu offload
         ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
         with mockenv_context(**self.dist_env_1_gpu):
-            kwargs = dict(local_rank=0, deepspeed=ds_config_dict)
+            kwargs = {"local_rank": 0, "deepspeed": ds_config_dict}
             kwargs[dtype] = True
             trainer = get_regression_trainer(**kwargs)
             with CaptureLogger(deepspeed_logger) as cl:
@@ -431,7 +439,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         # it's run not as a first test as `sys.stdout` will no longer be the same. So we either have
         # to reset `deepspeed_logger.handlers[0].setStream(sys.stdout)` or directly capture from the deepspeed_logger.
         with mockenv_context(**self.dist_env_1_gpu):
-            kwargs = dict(local_rank=0, deepspeed=self.get_config_dict(stage))
+            kwargs = {"local_rank": 0, "deepspeed": self.get_config_dict(stage)}
             kwargs[dtype] = True
             trainer = get_regression_trainer(**kwargs)
@@ -449,15 +457,15 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         # `self.lr_scheduler.get_last_lr()` and originally it'd fail on the very first step.
         with mockenv_context(**self.dist_env_1_gpu):
             a = b = 0.0
-            kwargs = dict(
-                a=a,
-                b=b,
-                local_rank=0,
-                train_len=8,
-                deepspeed=self.get_config_dict(stage),
-                per_device_train_batch_size=8,
-                logging_steps=1,
-            )
+            kwargs = {
+                "a": a,
+                "b": b,
+                "local_rank": 0,
+                "train_len": 8,
+                "deepspeed": self.get_config_dict(stage),
+                "per_device_train_batch_size": 8,
+                "logging_steps": 1,
+            }
             kwargs[dtype] = True
             trainer = get_regression_trainer(**kwargs)
@@ -494,13 +502,13 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         train_len = 64
         a = b = 0.0
-        kwargs = dict(
-            a=a,
-            b=b,
-            local_rank=0,
-            train_len=train_len,
-            deepspeed=self.get_config_dict(stage),
-        )
+        kwargs = {
+            "a": a,
+            "b": b,
+            "local_rank": 0,
+            "train_len": train_len,
+            "deepspeed": self.get_config_dict(stage),
+        }
         kwargs[dtype] = True

         with mockenv_context(**self.dist_env_1_gpu):
@@ -583,11 +591,11 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         # save checkpoints
         with mockenv_context(**self.dist_env_1_gpu):
-            kwargs = dict(
-                output_dir=output_dir,
-                save_steps=freq,
-                deepspeed=ds_config_dict,
-            )
+            kwargs = {
+                "output_dir": output_dir,
+                "save_steps": freq,
+                "deepspeed": ds_config_dict,
+            }
             kwargs[dtype] = True
             trainer = get_regression_trainer(**kwargs)
             trainer.train()
@@ -600,7 +608,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         with mockenv_context(**self.dist_env_1_gpu):
             ds_config_dict = self.get_config_dict(stage)
             output_dir = self.get_auto_remove_tmp_dir()
-            kwargs = dict(output_dir=output_dir, deepspeed=ds_config_dict)
+            kwargs = {"output_dir": output_dir, "deepspeed": ds_config_dict}
             kwargs[dtype] = True
             trainer = get_regression_trainer(**kwargs)
@@ -632,7 +640,13 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         if stage == ZERO3:
             ds_config_dict["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = True

-        kwargs = dict(output_dir=output_dir, train_len=128, save_steps=5, learning_rate=0.1, deepspeed=ds_config_dict)
+        kwargs = {
+            "output_dir": output_dir,
+            "train_len": 128,
+            "save_steps": 5,
+            "learning_rate": 0.1,
+            "deepspeed": ds_config_dict,
+        }
         kwargs[dtype] = True
         with mockenv_context(**self.dist_env_1_gpu):
@@ -679,16 +693,16 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         ds_config_dict = self.get_config_dict(stage)
-        kwargs = dict(
-            output_dir=output_dir,
-            train_len=4,
-            per_device_train_batch_size=4,
-            num_train_epochs=1,
-            save_strategy="steps",
-            save_steps=1,
-            learning_rate=0.1,
-            deepspeed=ds_config_dict,
-        )
+        kwargs = {
+            "output_dir": output_dir,
+            "train_len": 4,
+            "per_device_train_batch_size": 4,
+            "num_train_epochs": 1,
+            "save_strategy": "steps",
+            "save_steps": 1,
+            "learning_rate": 0.1,
+            "deepspeed": ds_config_dict,
+        }
         kwargs[dtype] = True
         with mockenv_context(**self.dist_env_1_gpu):
@@ -710,7 +724,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         # test that we can switch from zero2 to zero3 in the same process for example
         # test is_zero, etc.
         output_dir = self.get_auto_remove_tmp_dir()
-        kwargs = dict(output_dir=output_dir, train_len=8, fp16=True)
+        kwargs = {"output_dir": output_dir, "train_len": 8, "fp16": True}
         ds_config_zero3_dict = self.get_config_dict(ZERO3)
         ds_config_zero2_dict = self.get_config_dict(ZERO2)
@@ -808,7 +822,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         def get_dataset():
             data_file = str(self.tests_dir / "fixtures/tests_samples/SQUAD/sample.json")
-            data_files = dict(train=data_file, validation=data_file)
+            data_files = {"train": data_file, "validation": data_file}
             raw_datasets = datasets.load_dataset("json", data_files=data_files, field="data")
             train_dataset = raw_datasets["train"].map(_add_eos_to_examples).map(_convert_to_features, batched=True)
             valid_dataset = deepcopy(train_dataset)
@@ -903,7 +917,14 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
         do_train = True
         do_eval = False
-        kwargs = dict(stage=stage, dtype=dtype, eval_steps=1, distributed=True, do_train=do_train, do_eval=do_eval)
+        kwargs = {
+            "stage": stage,
+            "dtype": dtype,
+            "eval_steps": 1,
+            "distributed": True,
+            "do_train": do_train,
+            "do_eval": do_eval,
+        }

         # 1. normal training
         output_dir = self.run_and_check(**kwargs)
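Note: throughout these tests the dict(...) calls become string-keyed literals; when the dict is later unpacked with **, both spellings produce the same keyword arguments, and a literal also mixes cleanly with keys computed at run time (the kwargs[dtype] = True lines above). A self-contained sketch with a stand-in for get_regression_trainer and an invented output path:

def regression_stub(**kwargs):
    return kwargs

dtype = "fp16"  # chosen at run time in the real tests
kwargs = {"output_dir": "/tmp/out", "train_len": 8}
kwargs[dtype] = True

assert regression_stub(**kwargs) == {"output_dir": "/tmp/out", "train_len": 8, "fp16": True}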