Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
60e1d883
"tests/models/vscode:/vscode.git/clone" did not exist on "b3bbe1bdb6207b99db38144a7ed258a060f9e131"
Unverified
Commit
60e1d883
authored
Apr 27, 2022
by
Zachary Mueller
Committed by
GitHub
Apr 27, 2022
Browse files
Fixup no_trainer save logic (#16968)
* Fixup all examples
parent
c79bbc3b
Changes
12
Show whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
189 additions
and
121 deletions
+189
-121
examples/pytorch/image-classification/run_image_classification_no_trainer.py
...age-classification/run_image_classification_no_trainer.py
+17
-11
examples/pytorch/language-modeling/run_clm_no_trainer.py
examples/pytorch/language-modeling/run_clm_no_trainer.py
+17
-11
examples/pytorch/language-modeling/run_mlm_no_trainer.py
examples/pytorch/language-modeling/run_mlm_no_trainer.py
+17
-11
examples/pytorch/multiple-choice/run_swag_no_trainer.py
examples/pytorch/multiple-choice/run_swag_no_trainer.py
+17
-11
examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
...torch/question-answering/run_qa_beam_search_no_trainer.py
+16
-10
examples/pytorch/question-answering/run_qa_no_trainer.py
examples/pytorch/question-answering/run_qa_no_trainer.py
+17
-11
examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
...ntic-segmentation/run_semantic_segmentation_no_trainer.py
+17
-11
examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
...speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
+3
-1
examples/pytorch/summarization/run_summarization_no_trainer.py
...les/pytorch/summarization/run_summarization_no_trainer.py
+17
-11
examples/pytorch/text-classification/run_glue_no_trainer.py
examples/pytorch/text-classification/run_glue_no_trainer.py
+17
-11
examples/pytorch/token-classification/run_ner_no_trainer.py
examples/pytorch/token-classification/run_ner_no_trainer.py
+17
-11
examples/pytorch/translation/run_translation_no_trainer.py
examples/pytorch/translation/run_translation_no_trainer.py
+17
-11
No files found.
examples/pytorch/image-classification/run_image_classification_no_trainer.py
View file @
60e1d883
...
...
@@ -393,32 +393,38 @@ def main():
# Only show the progress bar once on each machine.
progress_bar
=
tqdm
(
range
(
args
.
max_train_steps
),
disable
=
not
accelerator
.
is_local_main_process
)
completed_steps
=
0
starting_epoch
=
0
# Potentially load in the weights and states from a previous save
if
args
.
resume_from_checkpoint
:
if
args
.
resume_from_checkpoint
is
not
None
or
args
.
resume_from_checkpoint
!=
""
:
accelerator
.
print
(
f
"Resumed from checkpoint:
{
args
.
resume_from_checkpoint
}
"
)
accelerator
.
load_state
(
args
.
resume_from_checkpoint
)
resume_step
=
None
path
=
args
.
resume_from_checkpoint
path
=
os
.
path
.
basename
(
args
.
resume_from_checkpoint
)
else
:
# Get the most recent checkpoint
dirs
=
[
f
.
name
for
f
in
os
.
scandir
(
os
.
getcwd
())
if
f
.
is_dir
()]
dirs
.
sort
(
key
=
os
.
path
.
getctime
)
path
=
dirs
[
-
1
]
# Sorts folders by date modified, most recent checkpoint is the last
if
"epoch"
in
path
:
args
.
num_train_epochs
-=
int
(
path
.
replace
(
"epoch_"
,
""
))
# Extract `epoch_{i}` or `step_{i}`
training_difference
=
os
.
path
.
splitext
(
path
)[
0
]
if
"epoch"
in
training_difference
:
starting_epoch
=
int
(
training_difference
.
replace
(
"epoch_"
,
""
))
+
1
resume_step
=
None
else
:
resume_step
=
int
(
path
.
replace
(
"step_"
,
""
))
args
.
num_tra
in_epoch
s
-
=
resume_step
//
len
(
train_dataloader
)
resume_step
=
(
args
.
num_tra
in_epoch
s
*
len
(
train_dataloader
)
)
-
resume_step
resume_step
=
int
(
training_difference
.
replace
(
"step_"
,
""
))
start
in
g
_epoch
=
resume_step
//
len
(
train_dataloader
)
resume_step
-
=
start
in
g
_epoch
*
len
(
train_dataloader
)
for
epoch
in
range
(
args
.
num_train_epochs
):
for
epoch
in
range
(
starting_epoch
,
args
.
num_train_epochs
):
model
.
train
()
if
args
.
with_tracking
:
total_loss
=
0
for
step
,
batch
in
enumerate
(
train_dataloader
):
# We need to skip steps until we reach the resumed step
if
args
.
resume_from_checkpoint
and
epoch
==
0
and
step
<
resume_step
:
if
args
.
resume_from_checkpoint
and
epoch
==
starting_epoch
:
if
resume_step
is
not
None
and
step
<
resume_step
:
completed_steps
+=
1
continue
outputs
=
model
(
**
batch
)
loss
=
outputs
.
loss
...
...
@@ -436,7 +442,7 @@ def main():
if
isinstance
(
checkpointing_steps
,
int
):
if
completed_steps
%
checkpointing_steps
==
0
:
output_dir
=
f
"step_
{
completed_steps
}
"
output_dir
=
f
"step_
{
completed_steps
}
"
if
args
.
output_dir
is
not
None
:
output_dir
=
os
.
path
.
join
(
args
.
output_dir
,
output_dir
)
accelerator
.
save_state
(
output_dir
)
...
...
examples/pytorch/language-modeling/run_clm_no_trainer.py
View file @
60e1d883
...
...
@@ -503,33 +503,39 @@ def main():
# Only show the progress bar once on each machine.
progress_bar
=
tqdm
(
range
(
args
.
max_train_steps
),
disable
=
not
accelerator
.
is_local_main_process
)
completed_steps
=
0
starting_epoch
=
0
# Potentially load in the weights and states from a previous save
if
args
.
resume_from_checkpoint
:
if
args
.
resume_from_checkpoint
is
not
None
or
args
.
resume_from_checkpoint
!=
""
:
accelerator
.
print
(
f
"Resumed from checkpoint:
{
args
.
resume_from_checkpoint
}
"
)
accelerator
.
load_state
(
args
.
resume_from_checkpoint
)
resume_step
=
None
path
=
args
.
resume_from_checkpoint
path
=
os
.
path
.
basename
(
args
.
resume_from_checkpoint
)
else
:
# Get the most recent checkpoint
dirs
=
[
f
.
name
for
f
in
os
.
scandir
(
os
.
getcwd
())
if
f
.
is_dir
()]
dirs
.
sort
(
key
=
os
.
path
.
getctime
)
path
=
dirs
[
-
1
]
# Sorts folders by date modified, most recent checkpoint is the last
if
"epoch"
in
path
:
args
.
num_train_epochs
-=
int
(
path
.
replace
(
"epoch_"
,
""
))
# Extract `epoch_{i}` or `step_{i}`
training_difference
=
os
.
path
.
splitext
(
path
)[
0
]
if
"epoch"
in
training_difference
:
starting_epoch
=
int
(
training_difference
.
replace
(
"epoch_"
,
""
))
+
1
resume_step
=
None
else
:
resume_step
=
int
(
path
.
replace
(
"step_"
,
""
))
args
.
num_tra
in_epoch
s
-
=
resume_step
//
len
(
train_dataloader
)
resume_step
=
(
args
.
num_tra
in_epoch
s
*
len
(
train_dataloader
)
)
-
resume_step
resume_step
=
int
(
training_difference
.
replace
(
"step_"
,
""
))
start
in
g
_epoch
=
resume_step
//
len
(
train_dataloader
)
resume_step
-
=
start
in
g
_epoch
*
len
(
train_dataloader
)
for
epoch
in
range
(
args
.
num_train_epochs
):
for
epoch
in
range
(
starting_epoch
,
args
.
num_train_epochs
):
model
.
train
()
if
args
.
with_tracking
:
total_loss
=
0
for
step
,
batch
in
enumerate
(
train_dataloader
):
# We need to skip steps until we reach the resumed step
if
args
.
resume_from_checkpoint
and
epoch
==
0
and
step
<
resume_step
:
if
args
.
resume_from_checkpoint
and
epoch
==
starting_epoch
:
if
resume_step
is
not
None
and
step
<
resume_step
:
completed_steps
+=
1
continue
outputs
=
model
(
**
batch
)
loss
=
outputs
.
loss
...
...
@@ -547,7 +553,7 @@ def main():
if
isinstance
(
checkpointing_steps
,
int
):
if
completed_steps
%
checkpointing_steps
==
0
:
output_dir
=
f
"step_
{
completed_steps
}
"
output_dir
=
f
"step_
{
completed_steps
}
"
if
args
.
output_dir
is
not
None
:
output_dir
=
os
.
path
.
join
(
args
.
output_dir
,
output_dir
)
accelerator
.
save_state
(
output_dir
)
...
...
examples/pytorch/language-modeling/run_mlm_no_trainer.py
View file @
60e1d883
...
...
@@ -549,33 +549,39 @@ def main():
# Only show the progress bar once on each machine.
progress_bar
=
tqdm
(
range
(
args
.
max_train_steps
),
disable
=
not
accelerator
.
is_local_main_process
)
completed_steps
=
0
starting_epoch
=
0
# Potentially load in the weights and states from a previous save
if
args
.
resume_from_checkpoint
:
if
args
.
resume_from_checkpoint
is
not
None
or
args
.
resume_from_checkpoint
!=
""
:
accelerator
.
print
(
f
"Resumed from checkpoint:
{
args
.
resume_from_checkpoint
}
"
)
accelerator
.
load_state
(
args
.
resume_from_checkpoint
)
resume_step
=
None
path
=
args
.
resume_from_checkpoint
path
=
os
.
path
.
basename
(
args
.
resume_from_checkpoint
)
else
:
# Get the most recent checkpoint
dirs
=
[
f
.
name
for
f
in
os
.
scandir
(
os
.
getcwd
())
if
f
.
is_dir
()]
dirs
.
sort
(
key
=
os
.
path
.
getctime
)
path
=
dirs
[
-
1
]
# Sorts folders by date modified, most recent checkpoint is the last
if
"epoch"
in
path
:
args
.
num_train_epochs
-=
int
(
path
.
replace
(
"epoch_"
,
""
))
# Extract `epoch_{i}` or `step_{i}`
training_difference
=
os
.
path
.
splitext
(
path
)[
0
]
if
"epoch"
in
training_difference
:
starting_epoch
=
int
(
training_difference
.
replace
(
"epoch_"
,
""
))
+
1
resume_step
=
None
else
:
resume_step
=
int
(
path
.
replace
(
"step_"
,
""
))
args
.
num_tra
in_epoch
s
-
=
resume_step
//
len
(
train_dataloader
)
resume_step
=
(
args
.
num_tra
in_epoch
s
*
len
(
train_dataloader
)
)
-
resume_step
resume_step
=
int
(
training_difference
.
replace
(
"step_"
,
""
))
start
in
g
_epoch
=
resume_step
//
len
(
train_dataloader
)
resume_step
-
=
start
in
g
_epoch
*
len
(
train_dataloader
)
for
epoch
in
range
(
args
.
num_train_epochs
):
for
epoch
in
range
(
starting_epoch
,
args
.
num_train_epochs
):
model
.
train
()
if
args
.
with_tracking
:
total_loss
=
0
for
step
,
batch
in
enumerate
(
train_dataloader
):
# We need to skip steps until we reach the resumed step
if
args
.
resume_from_checkpoint
and
epoch
==
0
and
step
<
resume_step
:
if
args
.
resume_from_checkpoint
and
epoch
==
starting_epoch
:
if
resume_step
is
not
None
and
step
<
resume_step
:
completed_steps
+=
1
continue
outputs
=
model
(
**
batch
)
loss
=
outputs
.
loss
...
...
@@ -593,7 +599,7 @@ def main():
if
isinstance
(
checkpointing_steps
,
int
):
if
completed_steps
%
checkpointing_steps
==
0
:
output_dir
=
f
"step_
{
completed_steps
}
"
output_dir
=
f
"step_
{
completed_steps
}
"
if
args
.
output_dir
is
not
None
:
output_dir
=
os
.
path
.
join
(
args
.
output_dir
,
output_dir
)
accelerator
.
save_state
(
output_dir
)
...
...
examples/pytorch/multiple-choice/run_swag_no_trainer.py
View file @
60e1d883
...
...
@@ -506,33 +506,39 @@ def main():
# Only show the progress bar once on each machine.
progress_bar
=
tqdm
(
range
(
args
.
max_train_steps
),
disable
=
not
accelerator
.
is_local_main_process
)
completed_steps
=
0
starting_epoch
=
0
# Potentially load in the weights and states from a previous save
if
args
.
resume_from_checkpoint
:
if
args
.
resume_from_checkpoint
is
not
None
or
args
.
resume_from_checkpoint
!=
""
:
accelerator
.
print
(
f
"Resumed from checkpoint:
{
args
.
resume_from_checkpoint
}
"
)
accelerator
.
load_state
(
args
.
resume_from_checkpoint
)
resume_step
=
None
path
=
args
.
resume_from_checkpoint
path
=
os
.
path
.
basename
(
args
.
resume_from_checkpoint
)
else
:
# Get the most recent checkpoint
dirs
=
[
f
.
name
for
f
in
os
.
scandir
(
os
.
getcwd
())
if
f
.
is_dir
()]
dirs
.
sort
(
key
=
os
.
path
.
getctime
)
path
=
dirs
[
-
1
]
# Sorts folders by date modified, most recent checkpoint is the last
if
"epoch"
in
path
:
args
.
num_train_epochs
-=
int
(
path
.
replace
(
"epoch_"
,
""
))
# Extract `epoch_{i}` or `step_{i}`
training_difference
=
os
.
path
.
splitext
(
path
)[
0
]
if
"epoch"
in
training_difference
:
starting_epoch
=
int
(
training_difference
.
replace
(
"epoch_"
,
""
))
+
1
resume_step
=
None
else
:
resume_step
=
int
(
path
.
replace
(
"step_"
,
""
))
args
.
num_tra
in_epoch
s
-
=
resume_step
//
len
(
train_dataloader
)
resume_step
=
(
args
.
num_tra
in_epoch
s
*
len
(
train_dataloader
)
)
-
resume_step
resume_step
=
int
(
training_difference
.
replace
(
"step_"
,
""
))
start
in
g
_epoch
=
resume_step
//
len
(
train_dataloader
)
resume_step
-
=
start
in
g
_epoch
*
len
(
train_dataloader
)
for
epoch
in
range
(
args
.
num_train_epochs
):
for
epoch
in
range
(
starting_epoch
,
args
.
num_train_epochs
):
model
.
train
()
if
args
.
with_tracking
:
total_loss
=
0
for
step
,
batch
in
enumerate
(
train_dataloader
):
# We need to skip steps until we reach the resumed step
if
args
.
resume_from_checkpoint
and
epoch
==
0
and
step
<
resume_step
:
if
args
.
resume_from_checkpoint
and
epoch
==
starting_epoch
:
if
resume_step
is
not
None
and
step
<
resume_step
:
completed_steps
+=
1
continue
outputs
=
model
(
**
batch
)
loss
=
outputs
.
loss
...
...
@@ -550,7 +556,7 @@ def main():
if
isinstance
(
checkpointing_steps
,
int
):
if
completed_steps
%
checkpointing_steps
==
0
:
output_dir
=
f
"step_
{
completed_steps
}
"
output_dir
=
f
"step_
{
completed_steps
}
"
if
args
.
output_dir
is
not
None
:
output_dir
=
os
.
path
.
join
(
args
.
output_dir
,
output_dir
)
accelerator
.
save_state
(
output_dir
)
...
...
examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
View file @
60e1d883
...
...
@@ -765,33 +765,39 @@ def main():
# Only show the progress bar once on each machine.
progress_bar
=
tqdm
(
range
(
args
.
max_train_steps
),
disable
=
not
accelerator
.
is_local_main_process
)
completed_steps
=
0
starting_epoch
=
0
# Potentially load in the weights and states from a previous save
if
args
.
resume_from_checkpoint
:
if
args
.
resume_from_checkpoint
is
not
None
or
args
.
resume_from_checkpoint
!=
""
:
accelerator
.
print
(
f
"Resumed from checkpoint:
{
args
.
resume_from_checkpoint
}
"
)
accelerator
.
load_state
(
args
.
resume_from_checkpoint
)
resume_step
=
None
path
=
args
.
resume_from_checkpoint
path
=
os
.
path
.
basename
(
args
.
resume_from_checkpoint
)
else
:
# Get the most recent checkpoint
dirs
=
[
f
.
name
for
f
in
os
.
scandir
(
os
.
getcwd
())
if
f
.
is_dir
()]
dirs
.
sort
(
key
=
os
.
path
.
getctime
)
path
=
dirs
[
-
1
]
# Sorts folders by date modified, most recent checkpoint is the last
if
"epoch"
in
path
:
args
.
num_train_epochs
-=
int
(
path
.
replace
(
"epoch_"
,
""
))
# Extract `epoch_{i}` or `step_{i}`
training_difference
=
os
.
path
.
splitext
(
path
)[
0
]
if
"epoch"
in
training_difference
:
starting_epoch
=
int
(
training_difference
.
replace
(
"epoch_"
,
""
))
+
1
resume_step
=
None
else
:
resume_step
=
int
(
path
.
replace
(
"step_"
,
""
))
args
.
num_tra
in_epoch
s
-
=
resume_step
//
len
(
train_dataloader
)
resume_step
=
(
args
.
num_tra
in_epoch
s
*
len
(
train_dataloader
)
)
-
resume_step
resume_step
=
int
(
training_difference
.
replace
(
"step_"
,
""
))
start
in
g
_epoch
=
resume_step
//
len
(
train_dataloader
)
resume_step
-
=
start
in
g
_epoch
*
len
(
train_dataloader
)
for
epoch
in
range
(
args
.
num_train_epochs
):
for
epoch
in
range
(
starting_epoch
,
args
.
num_train_epochs
):
model
.
train
()
if
args
.
with_tracking
:
total_loss
=
0
for
step
,
batch
in
enumerate
(
train_dataloader
):
# We need to skip steps until we reach the resumed step
if
args
.
resume_from_checkpoint
and
epoch
==
0
and
step
<
resume_step
:
if
args
.
resume_from_checkpoint
and
epoch
==
starting_epoch
:
if
resume_step
is
not
None
and
step
<
resume_step
:
completed_steps
+=
1
continue
outputs
=
model
(
**
batch
)
loss
=
outputs
.
loss
...
...
examples/pytorch/question-answering/run_qa_no_trainer.py
View file @
60e1d883
...
...
@@ -771,33 +771,39 @@ def main():
# Only show the progress bar once on each machine.
progress_bar
=
tqdm
(
range
(
args
.
max_train_steps
),
disable
=
not
accelerator
.
is_local_main_process
)
completed_steps
=
0
starting_epoch
=
0
# Potentially load in the weights and states from a previous save
if
args
.
resume_from_checkpoint
:
if
args
.
resume_from_checkpoint
is
not
None
or
args
.
resume_from_checkpoint
!=
""
:
accelerator
.
print
(
f
"Resumed from checkpoint:
{
args
.
resume_from_checkpoint
}
"
)
accelerator
.
load_state
(
args
.
resume_from_checkpoint
)
resume_step
=
None
path
=
args
.
resume_from_checkpoint
path
=
os
.
path
.
basename
(
args
.
resume_from_checkpoint
)
else
:
# Get the most recent checkpoint
dirs
=
[
f
.
name
for
f
in
os
.
scandir
(
os
.
getcwd
())
if
f
.
is_dir
()]
dirs
.
sort
(
key
=
os
.
path
.
getctime
)
path
=
dirs
[
-
1
]
# Sorts folders by date modified, most recent checkpoint is the last
if
"epoch"
in
path
:
args
.
num_train_epochs
-=
int
(
path
.
replace
(
"epoch_"
,
""
))
# Extract `epoch_{i}` or `step_{i}`
training_difference
=
os
.
path
.
splitext
(
path
)[
0
]
if
"epoch"
in
training_difference
:
starting_epoch
=
int
(
training_difference
.
replace
(
"epoch_"
,
""
))
+
1
resume_step
=
None
else
:
resume_step
=
int
(
path
.
replace
(
"step_"
,
""
))
args
.
num_tra
in_epoch
s
-
=
resume_step
//
len
(
train_dataloader
)
resume_step
=
(
args
.
num_tra
in_epoch
s
*
len
(
train_dataloader
)
)
-
resume_step
resume_step
=
int
(
training_difference
.
replace
(
"step_"
,
""
))
start
in
g
_epoch
=
resume_step
//
len
(
train_dataloader
)
resume_step
-
=
start
in
g
_epoch
*
len
(
train_dataloader
)
for
epoch
in
range
(
args
.
num_train_epochs
):
for
epoch
in
range
(
starting_epoch
,
args
.
num_train_epochs
):
model
.
train
()
if
args
.
with_tracking
:
total_loss
=
0
for
step
,
batch
in
enumerate
(
train_dataloader
):
# We need to skip steps until we reach the resumed step
if
args
.
resume_from_checkpoint
and
epoch
==
0
and
step
<
resume_step
:
if
args
.
resume_from_checkpoint
and
epoch
==
starting_epoch
:
if
resume_step
is
not
None
and
step
<
resume_step
:
completed_steps
+=
1
continue
outputs
=
model
(
**
batch
)
loss
=
outputs
.
loss
...
...
@@ -815,7 +821,7 @@ def main():
if
isinstance
(
checkpointing_steps
,
int
):
if
completed_steps
%
checkpointing_steps
==
0
:
output_dir
=
f
"step_
{
completed_steps
}
"
output_dir
=
f
"step_
{
completed_steps
}
"
if
args
.
output_dir
is
not
None
:
output_dir
=
os
.
path
.
join
(
args
.
output_dir
,
output_dir
)
accelerator
.
save_state
(
output_dir
)
...
...
examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
View file @
60e1d883
...
...
@@ -501,33 +501,39 @@ def main():
# Only show the progress bar once on each machine.
progress_bar
=
tqdm
(
range
(
args
.
max_train_steps
),
disable
=
not
accelerator
.
is_local_main_process
)
completed_steps
=
0
starting_epoch
=
0
# Potentially load in the weights and states from a previous save
if
args
.
resume_from_checkpoint
:
if
args
.
resume_from_checkpoint
is
not
None
or
args
.
resume_from_checkpoint
!=
""
:
accelerator
.
print
(
f
"Resumed from checkpoint:
{
args
.
resume_from_checkpoint
}
"
)
accelerator
.
load_state
(
args
.
resume_from_checkpoint
)
resume_step
=
None
path
=
args
.
resume_from_checkpoint
path
=
os
.
path
.
basename
(
args
.
resume_from_checkpoint
)
else
:
# Get the most recent checkpoint
dirs
=
[
f
.
name
for
f
in
os
.
scandir
(
os
.
getcwd
())
if
f
.
is_dir
()]
dirs
.
sort
(
key
=
os
.
path
.
getctime
)
path
=
dirs
[
-
1
]
# Sorts folders by date modified, most recent checkpoint is the last
if
"epoch"
in
path
:
args
.
num_train_epochs
-=
int
(
path
.
replace
(
"epoch_"
,
""
))
# Extract `epoch_{i}` or `step_{i}`
training_difference
=
os
.
path
.
splitext
(
path
)[
0
]
if
"epoch"
in
training_difference
:
starting_epoch
=
int
(
training_difference
.
replace
(
"epoch_"
,
""
))
+
1
resume_step
=
None
else
:
resume_step
=
int
(
path
.
replace
(
"step_"
,
""
))
args
.
num_tra
in_epoch
s
-
=
resume_step
//
len
(
train_dataloader
)
resume_step
=
(
args
.
num_tra
in_epoch
s
*
len
(
train_dataloader
)
)
-
resume_step
resume_step
=
int
(
training_difference
.
replace
(
"step_"
,
""
))
start
in
g
_epoch
=
resume_step
//
len
(
train_dataloader
)
resume_step
-
=
start
in
g
_epoch
*
len
(
train_dataloader
)
for
epoch
in
range
(
args
.
num_train_epochs
):
for
epoch
in
range
(
starting_epoch
,
args
.
num_train_epochs
):
if
args
.
with_tracking
:
total_loss
=
0
model
.
train
()
for
step
,
batch
in
enumerate
(
train_dataloader
):
# We need to skip steps until we reach the resumed step
if
args
.
resume_from_checkpoint
and
epoch
==
0
and
step
<
resume_step
:
if
args
.
resume_from_checkpoint
and
epoch
==
starting_epoch
:
if
resume_step
is
not
None
and
step
<
resume_step
:
completed_steps
+=
1
continue
outputs
=
model
(
**
batch
)
loss
=
outputs
.
loss
...
...
@@ -545,7 +551,7 @@ def main():
if
isinstance
(
checkpointing_steps
,
int
):
if
completed_steps
%
checkpointing_steps
==
0
:
output_dir
=
f
"step_
{
completed_steps
}
"
output_dir
=
f
"step_
{
completed_steps
}
"
if
args
.
output_dir
is
not
None
:
output_dir
=
os
.
path
.
join
(
args
.
output_dir
,
output_dir
)
accelerator
.
save_state
(
output_dir
)
...
...
examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
View file @
60e1d883
...
...
@@ -563,11 +563,13 @@ def main():
logger
.
info
(
f
" Gradient Accumulation steps =
{
args
.
gradient_accumulation_steps
}
"
)
logger
.
info
(
f
" Total optimization steps =
{
args
.
max_train_steps
}
"
)
completed_steps
=
0
starting_epoch
=
0
# Only show the progress bar once on each machine.
progress_bar
=
tqdm
(
range
(
args
.
max_train_steps
),
disable
=
not
accelerator
.
is_local_main_process
)
completed_steps
=
0
for
epoch
in
range
(
args
.
num_train_epochs
):
starting_epoch
=
0
for
epoch
in
range
(
starting_epoch
,
args
.
num_train_epochs
):
model
.
train
()
for
step
,
batch
in
enumerate
(
train_dataloader
):
# compute num of losses
...
...
examples/pytorch/summarization/run_summarization_no_trainer.py
View file @
60e1d883
...
...
@@ -569,32 +569,38 @@ def main():
# Only show the progress bar once on each machine.
progress_bar
=
tqdm
(
range
(
args
.
max_train_steps
),
disable
=
not
accelerator
.
is_local_main_process
)
completed_steps
=
0
starting_epoch
=
0
# Potentially load in the weights and states from a previous save
if
args
.
resume_from_checkpoint
:
if
args
.
resume_from_checkpoint
is
not
None
or
args
.
resume_from_checkpoint
!=
""
:
accelerator
.
print
(
f
"Resumed from checkpoint:
{
args
.
resume_from_checkpoint
}
"
)
accelerator
.
load_state
(
args
.
resume_from_checkpoint
)
resume_step
=
None
path
=
args
.
resume_from_checkpoint
path
=
os
.
path
.
basename
(
args
.
resume_from_checkpoint
)
else
:
# Get the most recent checkpoint
dirs
=
[
f
.
name
for
f
in
os
.
scandir
(
os
.
getcwd
())
if
f
.
is_dir
()]
dirs
.
sort
(
key
=
os
.
path
.
getctime
)
path
=
dirs
[
-
1
]
# Sorts folders by date modified, most recent checkpoint is the last
if
"epoch"
in
path
:
args
.
num_train_epochs
-=
int
(
path
.
replace
(
"epoch_"
,
""
))
# Extract `epoch_{i}` or `step_{i}`
training_difference
=
os
.
path
.
splitext
(
path
)[
0
]
if
"epoch"
in
training_difference
:
starting_epoch
=
int
(
training_difference
.
replace
(
"epoch_"
,
""
))
+
1
resume_step
=
None
else
:
resume_step
=
int
(
path
.
replace
(
"step_"
,
""
))
args
.
num_tra
in_epoch
s
-
=
resume_step
//
len
(
train_dataloader
)
resume_step
=
(
args
.
num_tra
in_epoch
s
*
len
(
train_dataloader
)
)
-
resume_step
resume_step
=
int
(
training_difference
.
replace
(
"step_"
,
""
))
start
in
g
_epoch
=
resume_step
//
len
(
train_dataloader
)
resume_step
-
=
start
in
g
_epoch
*
len
(
train_dataloader
)
for
epoch
in
range
(
args
.
num_train_epochs
):
for
epoch
in
range
(
starting_epoch
,
args
.
num_train_epochs
):
model
.
train
()
if
args
.
with_tracking
:
total_loss
=
0
for
step
,
batch
in
enumerate
(
train_dataloader
):
# We need to skip steps until we reach the resumed step
if
args
.
resume_from_checkpoint
and
epoch
==
0
and
step
<
resume_step
:
if
args
.
resume_from_checkpoint
and
epoch
==
starting_epoch
:
if
resume_step
is
not
None
and
step
<
resume_step
:
completed_steps
+=
1
continue
outputs
=
model
(
**
batch
)
loss
=
outputs
.
loss
...
...
@@ -612,7 +618,7 @@ def main():
if
isinstance
(
checkpointing_steps
,
int
):
if
completed_steps
%
checkpointing_steps
==
0
:
output_dir
=
f
"step_
{
completed_steps
}
"
output_dir
=
f
"step_
{
completed_steps
}
"
if
args
.
output_dir
is
not
None
:
output_dir
=
os
.
path
.
join
(
args
.
output_dir
,
output_dir
)
accelerator
.
save_state
(
output_dir
)
...
...
examples/pytorch/text-classification/run_glue_no_trainer.py
View file @
60e1d883
...
...
@@ -454,32 +454,38 @@ def main():
# Only show the progress bar once on each machine.
progress_bar
=
tqdm
(
range
(
args
.
max_train_steps
),
disable
=
not
accelerator
.
is_local_main_process
)
completed_steps
=
0
starting_epoch
=
0
# Potentially load in the weights and states from a previous save
if
args
.
resume_from_checkpoint
:
if
args
.
resume_from_checkpoint
is
not
None
or
args
.
resume_from_checkpoint
!=
""
:
accelerator
.
print
(
f
"Resumed from checkpoint:
{
args
.
resume_from_checkpoint
}
"
)
accelerator
.
load_state
(
args
.
resume_from_checkpoint
)
resume_step
=
None
path
=
args
.
resume_from_checkpoint
path
=
os
.
path
.
basename
(
args
.
resume_from_checkpoint
)
else
:
# Get the most recent checkpoint
dirs
=
[
f
.
name
for
f
in
os
.
scandir
(
os
.
getcwd
())
if
f
.
is_dir
()]
dirs
.
sort
(
key
=
os
.
path
.
getctime
)
path
=
dirs
[
-
1
]
# Sorts folders by date modified, most recent checkpoint is the last
if
"epoch"
in
path
:
args
.
num_train_epochs
-=
int
(
path
.
replace
(
"epoch_"
,
""
))
# Extract `epoch_{i}` or `step_{i}`
training_difference
=
os
.
path
.
splitext
(
path
)[
0
]
if
"epoch"
in
training_difference
:
starting_epoch
=
int
(
training_difference
.
replace
(
"epoch_"
,
""
))
+
1
resume_step
=
None
else
:
resume_step
=
int
(
path
.
replace
(
"step_"
,
""
))
args
.
num_tra
in_epoch
s
-
=
resume_step
//
len
(
train_dataloader
)
resume_step
=
(
args
.
num_tra
in_epoch
s
*
len
(
train_dataloader
)
)
-
resume_step
resume_step
=
int
(
training_difference
.
replace
(
"step_"
,
""
))
start
in
g
_epoch
=
resume_step
//
len
(
train_dataloader
)
resume_step
-
=
start
in
g
_epoch
*
len
(
train_dataloader
)
for
epoch
in
range
(
args
.
num_train_epochs
):
for
epoch
in
range
(
starting_epoch
,
args
.
num_train_epochs
):
model
.
train
()
if
args
.
with_tracking
:
total_loss
=
0
for
step
,
batch
in
enumerate
(
train_dataloader
):
# We need to skip steps until we reach the resumed step
if
args
.
resume_from_checkpoint
and
epoch
==
0
and
step
<
resume_step
:
if
args
.
resume_from_checkpoint
and
epoch
==
starting_epoch
:
if
resume_step
is
not
None
and
step
<
resume_step
:
completed_steps
+=
1
continue
outputs
=
model
(
**
batch
)
loss
=
outputs
.
loss
...
...
@@ -497,7 +503,7 @@ def main():
if
isinstance
(
checkpointing_steps
,
int
):
if
completed_steps
%
checkpointing_steps
==
0
:
output_dir
=
f
"step_
{
completed_steps
}
"
output_dir
=
f
"step_
{
completed_steps
}
"
if
args
.
output_dir
is
not
None
:
output_dir
=
os
.
path
.
join
(
args
.
output_dir
,
output_dir
)
accelerator
.
save_state
(
output_dir
)
...
...
examples/pytorch/token-classification/run_ner_no_trainer.py
View file @
60e1d883
...
...
@@ -606,32 +606,38 @@ def main():
# Only show the progress bar once on each machine.
progress_bar
=
tqdm
(
range
(
args
.
max_train_steps
),
disable
=
not
accelerator
.
is_local_main_process
)
completed_steps
=
0
starting_epoch
=
0
# Potentially load in the weights and states from a previous save
if
args
.
resume_from_checkpoint
:
if
args
.
resume_from_checkpoint
is
not
None
or
args
.
resume_from_checkpoint
!=
""
:
accelerator
.
print
(
f
"Resumed from checkpoint:
{
args
.
resume_from_checkpoint
}
"
)
accelerator
.
load_state
(
args
.
resume_from_checkpoint
)
resume_step
=
None
path
=
args
.
resume_from_checkpoint
path
=
os
.
path
.
basename
(
args
.
resume_from_checkpoint
)
else
:
# Get the most recent checkpoint
dirs
=
[
f
.
name
for
f
in
os
.
scandir
(
os
.
getcwd
())
if
f
.
is_dir
()]
dirs
.
sort
(
key
=
os
.
path
.
getctime
)
path
=
dirs
[
-
1
]
# Sorts folders by date modified, most recent checkpoint is the last
if
"epoch"
in
path
:
args
.
num_train_epochs
-=
int
(
path
.
replace
(
"epoch_"
,
""
))
# Extract `epoch_{i}` or `step_{i}`
training_difference
=
os
.
path
.
splitext
(
path
)[
0
]
if
"epoch"
in
training_difference
:
starting_epoch
=
int
(
training_difference
.
replace
(
"epoch_"
,
""
))
+
1
resume_step
=
None
else
:
resume_step
=
int
(
path
.
replace
(
"step_"
,
""
))
args
.
num_tra
in_epoch
s
-
=
resume_step
//
len
(
train_dataloader
)
resume_step
=
(
args
.
num_tra
in_epoch
s
*
len
(
train_dataloader
)
)
-
resume_step
resume_step
=
int
(
training_difference
.
replace
(
"step_"
,
""
))
start
in
g
_epoch
=
resume_step
//
len
(
train_dataloader
)
resume_step
-
=
start
in
g
_epoch
*
len
(
train_dataloader
)
for
epoch
in
range
(
args
.
num_train_epochs
):
for
epoch
in
range
(
starting_epoch
,
args
.
num_train_epochs
):
model
.
train
()
if
args
.
with_tracking
:
total_loss
=
0
for
step
,
batch
in
enumerate
(
train_dataloader
):
# We need to skip steps until we reach the resumed step
if
args
.
resume_from_checkpoint
and
epoch
==
0
and
step
<
resume_step
:
if
args
.
resume_from_checkpoint
and
epoch
==
starting_epoch
:
if
resume_step
is
not
None
and
step
<
resume_step
:
completed_steps
+=
1
continue
outputs
=
model
(
**
batch
)
loss
=
outputs
.
loss
...
...
@@ -649,7 +655,7 @@ def main():
if
isinstance
(
checkpointing_steps
,
int
):
if
completed_steps
%
checkpointing_steps
==
0
:
output_dir
=
f
"step_
{
completed_steps
}
"
output_dir
=
f
"step_
{
completed_steps
}
"
if
args
.
output_dir
is
not
None
:
output_dir
=
os
.
path
.
join
(
args
.
output_dir
,
output_dir
)
accelerator
.
save_state
(
output_dir
)
...
...
examples/pytorch/translation/run_translation_no_trainer.py
View file @
60e1d883
...
...
@@ -552,33 +552,39 @@ def main():
# Only show the progress bar once on each machine.
progress_bar
=
tqdm
(
range
(
args
.
max_train_steps
),
disable
=
not
accelerator
.
is_local_main_process
)
completed_steps
=
0
starting_epoch
=
0
# Potentially load in the weights and states from a previous save
if
args
.
resume_from_checkpoint
:
if
args
.
resume_from_checkpoint
is
not
None
or
args
.
resume_from_checkpoint
!=
""
:
accelerator
.
print
(
f
"Resumed from checkpoint:
{
args
.
resume_from_checkpoint
}
"
)
accelerator
.
load_state
(
args
.
resume_from_checkpoint
)
resume_step
=
None
path
=
args
.
resume_from_checkpoint
path
=
os
.
path
.
basename
(
args
.
resume_from_checkpoint
)
else
:
# Get the most recent checkpoint
dirs
=
[
f
.
name
for
f
in
os
.
scandir
(
os
.
getcwd
())
if
f
.
is_dir
()]
dirs
.
sort
(
key
=
os
.
path
.
getctime
)
path
=
dirs
[
-
1
]
# Sorts folders by date modified, most recent checkpoint is the last
if
"epoch"
in
path
:
args
.
num_train_epochs
-=
int
(
path
.
replace
(
"epoch_"
,
""
))
# Extract `epoch_{i}` or `step_{i}`
training_difference
=
os
.
path
.
splitext
(
path
)[
0
]
if
"epoch"
in
training_difference
:
starting_epoch
=
int
(
training_difference
.
replace
(
"epoch_"
,
""
))
+
1
resume_step
=
None
else
:
resume_step
=
int
(
path
.
replace
(
"step_"
,
""
))
args
.
num_tra
in_epoch
s
-
=
resume_step
//
len
(
train_dataloader
)
resume_step
=
(
args
.
num_tra
in_epoch
s
*
len
(
train_dataloader
)
)
-
resume_step
resume_step
=
int
(
training_difference
.
replace
(
"step_"
,
""
))
start
in
g
_epoch
=
resume_step
//
len
(
train_dataloader
)
resume_step
-
=
start
in
g
_epoch
*
len
(
train_dataloader
)
for
epoch
in
range
(
args
.
num_train_epochs
):
for
epoch
in
range
(
starting_epoch
,
args
.
num_train_epochs
):
model
.
train
()
if
args
.
with_tracking
:
total_loss
=
0
for
step
,
batch
in
enumerate
(
train_dataloader
):
# We need to skip steps until we reach the resumed step
if
args
.
resume_from_checkpoint
and
epoch
==
0
and
step
<
resume_step
:
if
args
.
resume_from_checkpoint
and
epoch
==
starting_epoch
:
if
resume_step
is
not
None
and
step
<
resume_step
:
completed_steps
+=
1
continue
outputs
=
model
(
**
batch
)
loss
=
outputs
.
loss
...
...
@@ -596,7 +602,7 @@ def main():
if
isinstance
(
checkpointing_steps
,
int
):
if
completed_steps
%
checkpointing_steps
==
0
:
output_dir
=
f
"step_
{
completed_steps
}
"
output_dir
=
f
"step_
{
completed_steps
}
"
if
args
.
output_dir
is
not
None
:
output_dir
=
os
.
path
.
join
(
args
.
output_dir
,
output_dir
)
accelerator
.
save_state
(
output_dir
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment