renzhc / diffusers_dcu · Commits

Commit 31336dae (unverified)
Authored Jan 24, 2023 by Pedro Cuenca; committed by GitHub on Jan 24, 2023
Fix resume epoch for all training scripts except textual_inversion (#2079)
Parent: 0e98e839
Showing 8 changed files with 119 additions and 61 deletions.
examples/dreambooth/train_dreambooth.py (+15 −8)
examples/dreambooth/train_dreambooth_lora.py (+15 −8)
examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint.py (+15 −8)
examples/research_projects/multi_subject_dreambooth/train_multi_subject_dreambooth.py (+15 −8)
examples/text_to_image/train_text_to_image.py (+14 −6)
examples/text_to_image/train_text_to_image_lora.py (+15 −8)
examples/unconditional_image_generation/train_unconditional.py (+15 −8)
examples/unconditional_image_generation/train_unconditional_ort.py (+15 −7)
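Each file receives the same two-part fix: the "latest" checkpoint lookup now falls back to a fresh run when no checkpoint-* directory exists instead of failing on dirs[-1], and first_epoch is computed from global_step (optimizer updates) rather than resume_global_step (raw batches), while resume_step stays in raw-batch units to match the dataloader skip logic. A minimal sketch with hypothetical numbers (not from the commit) shows why the old arithmetic overshot the resume epoch:

# Hypothetical resume scenario (values not from the commit):
gradient_accumulation_steps = 4   # raw batches per optimizer update
num_update_steps_per_epoch = 100  # optimizer updates per epoch
global_step = 250                 # parsed from a "checkpoint-250" directory

# global_step counts optimizer updates; resume_global_step counts raw batches.
resume_global_step = global_step * gradient_accumulation_steps      # 1000

# Old: divides a batch count by an update count, landing at epoch 10.
old_first_epoch = resume_global_step // num_update_steps_per_epoch  # 10

# New: epochs are counted in optimizer updates, landing at epoch 2.
new_first_epoch = global_step // num_update_steps_per_epoch         # 2

# New: the intra-epoch offset stays in batch units for the dataloader skip.
new_resume_step = resume_global_step % (num_update_steps_per_epoch * gradient_accumulation_steps)  # 200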
examples/dreambooth/train_dreambooth.py

@@ -757,14 +757,21 @@ def main(args):
             dirs = os.listdir(args.output_dir)
             dirs = [d for d in dirs if d.startswith("checkpoint")]
             dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
-            path = dirs[-1]
-        accelerator.print(f"Resuming from checkpoint {path}")
-        accelerator.load_state(os.path.join(args.output_dir, path))
-        global_step = int(path.split("-")[1])
-
-        resume_global_step = global_step * args.gradient_accumulation_steps
-        first_epoch = resume_global_step // num_update_steps_per_epoch
-        resume_step = resume_global_step % num_update_steps_per_epoch
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            resume_global_step = global_step * args.gradient_accumulation_steps
+            first_epoch = global_step // num_update_steps_per_epoch
+            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
 
     # Only show the progress bar once on each machine.
     progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
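The path-is-None branch changes what happens when --resume_from_checkpoint latest is passed against an empty output directory: instead of an IndexError from dirs[-1], training now starts from scratch. A minimal standalone sketch of the lookup (directory name and contents are hypothetical):

import os

output_dir = "model-output"  # hypothetical; may contain no checkpoints yet
os.makedirs(output_dir, exist_ok=True)

# Newest "checkpoint-<step>" directory, or None if none has been saved.
dirs = [d for d in os.listdir(output_dir) if d.startswith("checkpoint")]
dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
path = dirs[-1] if len(dirs) > 0 else None  # the old code's dirs[-1] raised IndexError here

if path is None:
    print("No checkpoint found. Starting a new training run.")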
examples/dreambooth/train_dreambooth_lora.py

@@ -814,14 +814,21 @@ def main(args):
             dirs = os.listdir(args.output_dir)
             dirs = [d for d in dirs if d.startswith("checkpoint")]
             dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
-            path = dirs[-1]
-        accelerator.print(f"Resuming from checkpoint {path}")
-        accelerator.load_state(os.path.join(args.output_dir, path))
-        global_step = int(path.split("-")[1])
-
-        resume_global_step = global_step * args.gradient_accumulation_steps
-        first_epoch = resume_global_step // num_update_steps_per_epoch
-        resume_step = resume_global_step % num_update_steps_per_epoch
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            resume_global_step = global_step * args.gradient_accumulation_steps
+            first_epoch = global_step // num_update_steps_per_epoch
+            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
 
     # Only show the progress bar once on each machine.
     progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint.py

@@ -660,14 +660,21 @@ def main():
             dirs = os.listdir(args.output_dir)
             dirs = [d for d in dirs if d.startswith("checkpoint")]
             dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
-            path = dirs[-1]
-        accelerator.print(f"Resuming from checkpoint {path}")
-        accelerator.load_state(os.path.join(args.output_dir, path))
-        global_step = int(path.split("-")[1])
-
-        resume_global_step = global_step * args.gradient_accumulation_steps
-        first_epoch = resume_global_step // num_update_steps_per_epoch
-        resume_step = resume_global_step % num_update_steps_per_epoch
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            resume_global_step = global_step * args.gradient_accumulation_steps
+            first_epoch = global_step // num_update_steps_per_epoch
+            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
 
     # Only show the progress bar once on each machine.
     progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
examples/research_projects/multi_subject_dreambooth/train_multi_subject_dreambooth.py

@@ -748,14 +748,21 @@ def main(args):
             dirs = os.listdir(args.output_dir)
             dirs = [d for d in dirs if d.startswith("checkpoint")]
             dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
-            path = dirs[-1]
-        accelerator.print(f"Resuming from checkpoint {path}")
-        accelerator.load_state(os.path.join(args.output_dir, path))
-        global_step = int(path.split("-")[1])
-
-        resume_global_step = global_step * args.gradient_accumulation_steps
-        first_epoch = resume_global_step // num_update_steps_per_epoch
-        resume_step = resume_global_step % num_update_steps_per_epoch
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            resume_global_step = global_step * args.gradient_accumulation_steps
+            first_epoch = global_step // num_update_steps_per_epoch
+            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
 
     # Only show the progress bar once on each machine.
     progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
examples/text_to_image/train_text_to_image.py

@@ -599,13 +599,21 @@ def main():
             dirs = os.listdir(args.output_dir)
             dirs = [d for d in dirs if d.startswith("checkpoint")]
             dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
-            path = dirs[-1]
-        accelerator.print(f"Resuming from checkpoint {path}")
-        accelerator.load_state(os.path.join(args.output_dir, path))
-        global_step = int(path.split("-")[1])
+            path = dirs[-1] if len(dirs) > 0 else None
 
-        first_epoch = global_step // num_update_steps_per_epoch
-        resume_step = global_step % num_update_steps_per_epoch
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+            resume_global_step = global_step * args.gradient_accumulation_steps
+            first_epoch = global_step // num_update_steps_per_epoch
+            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
 
     # Only show the progress bar once on each machine.
     progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
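Note that this script's previous code differed from the others: it never multiplied by gradient_accumulation_steps, so resume_step was an optimizer-update count while the training loop skips raw batches. Reusing the hypothetical numbers from the sketch above:

# Old (this script only): resume_step in optimizer-update units, so too few batches were skipped.
old_resume_step = 250 % 100              # 50
# New: resume_step in raw-batch units, matching the dataloader skip logic.
new_resume_step = (250 * 4) % (100 * 4)  # 200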
examples/text_to_image/train_text_to_image_lora.py

@@ -651,14 +651,21 @@ def main():
             dirs = os.listdir(args.output_dir)
             dirs = [d for d in dirs if d.startswith("checkpoint")]
             dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
-            path = dirs[-1]
-        accelerator.print(f"Resuming from checkpoint {path}")
-        accelerator.load_state(os.path.join(args.output_dir, path))
-        global_step = int(path.split("-")[1])
-
-        resume_global_step = global_step * args.gradient_accumulation_steps
-        first_epoch = resume_global_step // num_update_steps_per_epoch
-        resume_step = resume_global_step % num_update_steps_per_epoch
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            resume_global_step = global_step * args.gradient_accumulation_steps
+            first_epoch = global_step // num_update_steps_per_epoch
+            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
 
     # Only show the progress bar once on each machine.
     progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
examples/unconditional_image_generation/train_unconditional.py

@@ -439,14 +439,21 @@ def main(args):
             dirs = os.listdir(args.output_dir)
             dirs = [d for d in dirs if d.startswith("checkpoint")]
             dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
-            path = dirs[-1]
-        accelerator.print(f"Resuming from checkpoint {path}")
-        accelerator.load_state(os.path.join(args.output_dir, path))
-        global_step = int(path.split("-")[1])
-
-        resume_global_step = global_step * args.gradient_accumulation_steps
-        first_epoch = resume_global_step // num_update_steps_per_epoch
-        resume_step = resume_global_step % num_update_steps_per_epoch
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            resume_global_step = global_step * args.gradient_accumulation_steps
+            first_epoch = global_step // num_update_steps_per_epoch
+            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
 
     # Train!
     for epoch in range(first_epoch, args.num_epochs):
examples/unconditional_image_generation/train_unconditional_ort.py

@@ -396,13 +396,21 @@ def main(args):
             dirs = os.listdir(args.output_dir)
             dirs = [d for d in dirs if d.startswith("checkpoint")]
             dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
-            path = dirs[-1]
-        accelerator.print(f"Resuming from checkpoint {path}")
-        accelerator.load_state(os.path.join(args.output_dir, path))
-        global_step = int(path.split("-")[1])
-        resume_global_step = global_step * args.gradient_accumulation_steps
-        first_epoch = resume_global_step // num_update_steps_per_epoch
-        resume_step = resume_global_step % num_update_steps_per_epoch
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            resume_global_step = global_step * args.gradient_accumulation_steps
+            first_epoch = global_step // num_update_steps_per_epoch
+            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
 
     for epoch in range(first_epoch, args.num_epochs):
         model.train()
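For context, first_epoch and resume_step feed the batch-skipping logic at the top of each script's training loop. A simplified, runnable sketch of that consumer (paraphrased from the training scripts, not part of this diff) shows why resume_step must be in raw-batch units:

# Hypothetical resume state and a stand-in dataloader (not from the commit):
first_epoch, num_epochs, resume_step = 2, 5, 200
resume_from_checkpoint = True
train_dataloader = range(400)  # stand-in for 400 raw batches per epoch

for epoch in range(first_epoch, num_epochs):
    for step, batch in enumerate(train_dataloader):
        # When resuming mid-epoch, skip the raw batches already consumed;
        # step counts batches, so resume_step must count batches as well.
        if resume_from_checkpoint and epoch == first_epoch and step < resume_step:
            continue
        # forward/backward and the optimizer step would follow here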