transformers · Commits · Commit daecae1f (Unverified)

Authored May 02, 2022 by calpt; committed by GitHub on May 02, 2022.

[Trainer] Move logic for checkpoint loading into separate methods for easy overriding (#17043)

Parent: 2de2c9ec

Showing 1 changed file with 57 additions and 53 deletions.
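The motivation is the commit title: subclasses of `Trainer` can now replace the checkpoint-restore logic without copying the whole `train()` method, since that logic previously lived inline in `train()` (see the first hunk below). A minimal sketch of such an override, assuming only the two hook signatures this commit introduces, `_load_from_checkpoint(self, resume_from_checkpoint)` and `_load_best_model(self)`; the subclass name and the extra logging are illustrative, not part of the change:

```python
from transformers import Trainer


class CustomCheckpointTrainer(Trainer):
    """Illustrative subclass: customize how checkpoints are restored."""

    def _load_from_checkpoint(self, resume_from_checkpoint):
        # Room for custom behaviour (remapping keys, loading extra files, alternative weight formats, ...).
        print(f"Restoring weights from {resume_from_checkpoint}")
        # Delegate to the stock logic that this commit factors out of train().
        super()._load_from_checkpoint(resume_from_checkpoint)

    def _load_best_model(self):
        # Same idea for the best-checkpoint reload triggered by load_best_model_at_end.
        print(f"Reloading best checkpoint from {self.state.best_model_checkpoint}")
        super()._load_best_model()
```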
src/transformers/trainer.py
```diff
@@ -1193,32 +1193,7 @@ class Trainer:
                 raise ValueError(f"No valid checkpoint found in output directory ({args.output_dir})")
 
         if resume_from_checkpoint is not None:
-            if not os.path.isfile(os.path.join(resume_from_checkpoint, WEIGHTS_NAME)):
-                raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}")
-
-            logger.info(f"Loading model from {resume_from_checkpoint}).")
-
-            if os.path.isfile(os.path.join(resume_from_checkpoint, CONFIG_NAME)):
-                config = PretrainedConfig.from_json_file(os.path.join(resume_from_checkpoint, CONFIG_NAME))
-                checkpoint_version = config.transformers_version
-                if checkpoint_version is not None and checkpoint_version != __version__:
-                    logger.warning(
-                        f"You are resuming training from a checkpoint trained with {checkpoint_version} of "
-                        f"Transformers but your current version is {__version__}. This is not recommended and could "
-                        "yield to errors or unwanted behaviors."
-                    )
-
-            if args.deepspeed:
-                # will be resumed in deepspeed_init
-                pass
-            else:
-                # We load the model state dict on the CPU to avoid an OOM error.
-                state_dict = torch.load(os.path.join(resume_from_checkpoint, WEIGHTS_NAME), map_location="cpu")
-                # If the model is on the GPU, it still works!
-                self._load_state_dict_in_model(state_dict)
-
-                # release memory
-                del state_dict
+            self._load_from_checkpoint(resume_from_checkpoint)
 
         # If model was re-initialized, put it on the right device and update self.model_wrapped
         if model_reloaded:
```
```diff
@@ -1562,10 +1537,58 @@ class Trainer:
             elif args.local_rank != -1:
                 dist.barrier()
 
-            logger.info(
-                f"Loading best model from {self.state.best_model_checkpoint} (score: {self.state.best_metric})."
-            )
-            best_model_path = os.path.join(self.state.best_model_checkpoint, WEIGHTS_NAME)
-            if os.path.exists(best_model_path):
-                if self.deepspeed:
+            self._load_best_model()
+
+        # add remaining tr_loss
+        self._total_loss_scalar += tr_loss.item()
+        train_loss = self._total_loss_scalar / self.state.global_step
+
+        metrics = speed_metrics("train", start_time, num_samples=num_train_samples, num_steps=self.state.max_steps)
+        self.store_flos()
+        metrics["total_flos"] = self.state.total_flos
+        metrics["train_loss"] = train_loss
+
+        self.is_in_train = False
+
+        self._memory_tracker.stop_and_update_metrics(metrics)
+
+        self.log(metrics)
+
+        self.control = self.callback_handler.on_train_end(args, self.state, self.control)
+
+        return TrainOutput(self.state.global_step, train_loss, metrics)
+
+    def _load_from_checkpoint(self, resume_from_checkpoint):
+
+        if not os.path.isfile(os.path.join(resume_from_checkpoint, WEIGHTS_NAME)):
+            raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}")
+
+        logger.info(f"Loading model from {resume_from_checkpoint}).")
+
+        if os.path.isfile(os.path.join(resume_from_checkpoint, CONFIG_NAME)):
+            config = PretrainedConfig.from_json_file(os.path.join(resume_from_checkpoint, CONFIG_NAME))
+            checkpoint_version = config.transformers_version
+            if checkpoint_version is not None and checkpoint_version != __version__:
+                logger.warning(
+                    f"You are resuming training from a checkpoint trained with {checkpoint_version} of "
+                    f"Transformers but your current version is {__version__}. This is not recommended and could "
+                    "yield to errors or unwanted behaviors."
+                )
+
+        if self.args.deepspeed:
+            # will be resumed in deepspeed_init
+            pass
+        else:
+            # We load the model state dict on the CPU to avoid an OOM error.
+            state_dict = torch.load(os.path.join(resume_from_checkpoint, WEIGHTS_NAME), map_location="cpu")
+            # If the model is on the GPU, it still works!
+            self._load_state_dict_in_model(state_dict)
+
+            # release memory
+            del state_dict
+
+    def _load_best_model(self):
+        logger.info(
+            f"Loading best model from {self.state.best_model_checkpoint} (score: {self.state.best_metric})."
+        )
+        best_model_path = os.path.join(self.state.best_model_checkpoint, WEIGHTS_NAME)
+        if os.path.exists(best_model_path):
+            if self.deepspeed:
```
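The comments carried over into the new `_load_from_checkpoint` ("We load the model state dict on the CPU to avoid an OOM error" / "If the model is on the GPU, it still works!") rely on a plain PyTorch property: `Module.load_state_dict` copies tensor values into the existing parameters, so a state dict kept on the CPU can be loaded into a model whose parameters already live on the GPU, without holding a second full copy of the checkpoint in GPU memory. A small standalone sketch of that behaviour, not taken from `trainer.py` (the file name and toy module are made up):

```python
import torch
import torch.nn as nn

model = nn.Linear(4, 2)
torch.save(model.state_dict(), "weights.bin")  # stand-in for the checkpoint's WEIGHTS_NAME file

if torch.cuda.is_available():
    model = model.cuda()  # parameters now live on the GPU

# Keep the checkpoint tensors on the CPU; only the existing parameters occupy GPU memory.
state_dict = torch.load("weights.bin", map_location="cpu")
model.load_state_dict(state_dict)  # values are copied into the (possibly CUDA) parameters

print(next(model.parameters()).device)  # cuda:0 when a GPU is available, otherwise cpu
del state_dict  # mirror the "release memory" step in the diff
```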
```diff
@@ -1590,25 +1613,6 @@ class Trainer:
                     "on multiple nodes, you should activate `--save_on_each_node`."
                 )
 
-        # add remaining tr_loss
-        self._total_loss_scalar += tr_loss.item()
-        train_loss = self._total_loss_scalar / self.state.global_step
-
-        metrics = speed_metrics("train", start_time, num_samples=num_train_samples, num_steps=self.state.max_steps)
-        self.store_flos()
-        metrics["total_flos"] = self.state.total_flos
-        metrics["train_loss"] = train_loss
-
-        self.is_in_train = False
-
-        self._memory_tracker.stop_and_update_metrics(metrics)
-
-        self.log(metrics)
-
-        self.control = self.callback_handler.on_train_end(args, self.state, self.control)
-
-        return TrainOutput(self.state.global_step, train_loss, metrics)
-
     def _load_state_dict_in_model(self, state_dict):
         load_result = self.model.load_state_dict(state_dict, strict=False)
```
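The context lines closing the last hunk show that `_load_state_dict_in_model` calls `load_state_dict(state_dict, strict=False)` and keeps the returned `load_result`. A short standalone sketch of what the non-strict call reports, using a toy model and a deliberately incomplete state dict (neither comes from the diff):

```python
import torch.nn as nn

model = nn.Sequential(nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 2))

# Deliberately incomplete: only the first layer's weight is provided.
partial_state_dict = {"0.weight": model[0].weight.detach().clone()}

# strict=False does not raise on mismatches; it reports them instead.
load_result = model.load_state_dict(partial_state_dict, strict=False)
print(load_result.missing_keys)     # ['0.bias', '2.weight', '2.bias']
print(load_result.unexpected_keys)  # []
```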