chenpangpang / transformers / Commits / bf1f43fb

Unverified commit bf1f43fb, authored Mar 23, 2021 by Sylvain Gugger, committed by GitHub on Mar 23, 2021

Update the example template for a no Trainer option (#10865)
parent 2eb596f0

Showing 2 changed files with 409 additions and 3 deletions (+409 -3):

  templates/adding_a_new_example_script/cookiecutter.json (+2 -1)
  templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py (+407 -2)
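The script below is a cookiecutter template rendered with Jinja2, so the `{%- if cookiecutter.with_trainer == "True" %}` / `{%- elif cookiecutter.with_trainer == "False" %}` guards added by this commit decide which of the two complete script bodies ends up in the generated example. As a rough illustration only (not part of the commit, with two imports standing in for the two full scripts), such a guard behaves like this:

# Rough illustration: a Jinja2 guard keyed on the cookiecutter "with_trainer" answer
# keeps exactly one of the two code paths at generation time.
from jinja2 import Template

template = Template(
    '{%- if cookiecutter.with_trainer == "True" %}\n'
    "from transformers import Trainer\n"
    '{%- elif cookiecutter.with_trainer == "False" %}\n'
    "from accelerate import Accelerator\n"
    "{%- endif %}\n"
)

# Only the accelerate import survives when the user answers "False".
print(template.render(cookiecutter={"with_trainer": "False"}))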
templates/adding_a_new_example_script/cookiecutter.json

@@ -4,5 +4,6 @@
     "example_shortcut": "{{cookiecutter.directory_name}}",
     "model_class": "AutoModel",
     "authors": "The HuggingFace Team",
-    "can_train_from_scratch": ["True", "False"]
+    "can_train_from_scratch": ["True", "False"],
+    "with_trainer": ["True", "False"]
 }
\ No newline at end of file
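For context, generating an example directory from this template with the new `with_trainer` prompt answered non-interactively could look roughly like the sketch below. The invocation is an assumption based on the standard cookiecutter Python API, not a command documented in this commit; keys that are not overridden fall back to the defaults in cookiecutter.json.

# Hypothetical usage sketch (not part of the commit): render the example template
# with the new `with_trainer` choice set programmatically.
from cookiecutter.main import cookiecutter

cookiecutter(
    "templates/adding_a_new_example_script",  # path to this template inside the transformers repo
    no_input=True,                            # take cookiecutter.json defaults for everything else
    extra_context={
        "can_train_from_scratch": "False",
        "with_trainer": "False",              # the option added by this commit
    },
)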
templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py

@@ -14,10 +14,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Fine-tuning the library models for {{cookiecutter.example_name}}.
+Fine-tuning a 🤗 Transformers model on {{cookiecutter.example_name}}.
 """
 # You can also adapt this script on your own {{cookiecutter.example_name}} task. Pointers for this are left as comments.

+{%- if cookiecutter.with_trainer == "True" %}
+
 import logging
 import math
 import os
@@ -297,7 +299,7 @@ def main():
 {%- elif cookiecutter.can_train_from_scratch == "False" %}
     config = AutoConfig.from_pretrained(
         model_args.config_name if model_args.config_name else model_args.model_name_or_path,
-        num_labels=num_labels,
+        # num_labels=num_labels, Uncomment if you have a certain number of labels
         finetuning_task=data_args.task_name,
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
@@ -426,3 +428,406 @@ def _mp_fn(index):
 if __name__ == "__main__":
     main()

(the rest of this hunk, below, is entirely new code)
{%- elif cookiecutter.with_trainer == "False" %}
import argparse
import logging
import math
import os
import random

import datasets
from datasets import load_dataset, load_metric
from torch.utils.data.dataloader import DataLoader
from tqdm.auto import tqdm

import transformers
from accelerate import Accelerator
from transformers import (
    CONFIG_MAPPING,
    MODEL_MAPPING,
    AdamW,
    AutoConfig,
    {{cookiecutter.model_class}},
    AutoTokenizer,
    DataCollatorWithPadding,
    PretrainedConfig,
    SchedulerType,
    default_data_collator,
    get_scheduler,
    set_seed,
)


logger = logging.getLogger(__name__)

{%- if cookiecutter.can_train_from_scratch == "True" %}
# You should update this to your particular problem to have better documentation of `model_type`
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
{% endif %}

def parse_args():
    parser = argparse.ArgumentParser(description="Finetune a transformers model on a text classification task")
    parser.add_argument(
        "--dataset_name",
        type=str,
        default=None,
        help="The name of the dataset to use (via the datasets library).",
    )
    parser.add_argument(
        "--dataset_config_name",
        type=str,
        default=None,
        help="The configuration name of the dataset to use (via the datasets library).",
    )
    parser.add_argument(
        "--train_file", type=str, default=None, help="A csv or a json file containing the training data."
    )
    parser.add_argument(
        "--validation_file", type=str, default=None, help="A csv or a json file containing the validation data."
    )
    parser.add_argument(
        "--max_length",
        type=int,
        default=128,
        help=(
            "The maximum total input sequence length after tokenization. Sequences longer than this will be truncated,"
            " sequences shorter will be padded if `--pad_to_max_length` is passed."
        ),
    )
    parser.add_argument(
        "--pad_to_max_length",
        action="store_true",
        help="If passed, pad all samples to `max_length`. Otherwise, dynamic padding is used.",
    )
    parser.add_argument(
        "--model_name_or_path",
        type=str,
        help="Path to pretrained model or model identifier from huggingface.co/models.",
        required=True,
    )
    parser.add_argument(
        "--config_name",
        type=str,
        default=None,
        help="Pretrained config name or path if not the same as model_name",
    )
    parser.add_argument(
        "--tokenizer_name",
        type=str,
        default=None,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--use_slow_tokenizer",
        action="store_true",
        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
    )
    parser.add_argument(
        "--per_device_train_batch_size",
        type=int,
        default=8,
        help="Batch size (per device) for the training dataloader.",
    )
    parser.add_argument(
        "--per_device_eval_batch_size",
        type=int,
        default=8,
        help="Batch size (per device) for the evaluation dataloader.",
    )
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=5e-5,
        help="Initial learning rate (after the potential warmup period) to use.",
    )
    parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.")
    parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_train_steps",
        type=int,
        default=None,
        help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of update steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--lr_scheduler_type",
        type=SchedulerType,
        default="linear",
        help="The scheduler type to use.",
        choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"],
    )
    parser.add_argument(
        "--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler."
    )
    parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.")
    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
{%- if cookiecutter.can_train_from_scratch == "True" %}
    parser.add_argument(
        "--model_type",
        type=str,
        default=None,
        help="Model type to use if training from scratch.",
        choices=MODEL_TYPES,
    )
{% endif %}
    args = parser.parse_args()
    # Sanity checks
    if args.task_name is None and args.train_file is None and args.validation_file is None:
        raise ValueError("Need either a task name or a training/validation file.")
    else:
        if args.train_file is not None:
            extension = args.train_file.split(".")[-1]
            assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
        if args.validation_file is not None:
            extension = args.validation_file.split(".")[-1]
            assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."

    if args.output_dir is not None:
        os.makedirs(args.output_dir, exist_ok=True)

    return args

def main():
    args = parse_args()

    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    accelerator = Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)

    # Setup logging, we only want one process per machine to log things on the screen.
    # accelerator.is_local_main_process is only True for one process per machine.
    logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)
    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
    else:
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        extension = args.train_file.split(".")[-1]
        raw_datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
{%- if cookiecutter.can_train_from_scratch == "True" %}
    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
    elif args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if args.model_name_or_path:
        model = {{cookiecutter.model_class}}.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
        )
    else:
        logger.info("Training new model from scratch")
        model = {{cookiecutter.model_class}}.from_config(config)

    model.resize_token_embeddings(len(tokenizer))
{%- elif cookiecutter.can_train_from_scratch == "False" %}
    config = AutoConfig.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        # num_labels=num_labels, Uncomment if you have a certain number of labels
        finetuning_task=args.task_name,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        use_fast=not args.use_slow_tokenizer,
    )
    model = {{cookiecutter.model_class}}.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
    )
{% endif %}
    # Preprocessing the datasets.
    # First we tokenize all the texts.
    column_names = raw_datasets["train"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    padding = "max_length" if args.pad_to_max_length else False
    def tokenize_function(examples):
        result = tokenizer(examples[text_column_name], padding=padding, max_length=args.max_length, truncation=True)
        if "label" in examples:
            result["labels"] = examples["label"]
        return result

    processed_datasets = raw_datasets.map(
        tokenize_function, batched=True, remove_columns=raw_datasets["train"].column_names
    )
    train_dataset = processed_datasets["train"]
    eval_dataset = processed_datasets["validation"]

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
    # DataLoaders creation:
    if args.pad_to_max_length:
        # If padding was already done to max length, we use the default data collator that will just convert everything
        # to tensors.
        data_collator = default_data_collator
    else:
        # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
        # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiples
        # of 8, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None))

    train_dataloader = DataLoader(
        train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
    )
    eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size)
    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )
    # Note -> the training dataloader needs to be prepared before we grab its length below (because its length will be
    # shorter in multiprocess)
    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    # TODO Get the proper metric function
    # metric = load_metric(xxx)

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(f"  Instantaneous batch size per device = {args.per_device_train_batch_size}")
    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
    completed_steps = 0

    for epoch in range(args.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if completed_steps >= args.max_train_steps:
                break
        model.eval()
        for step, batch in enumerate(eval_dataloader):
            outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            metric.add_batch(
                predictions=accelerator.gather(predictions),
                references=accelerator.gather(batch["labels"]),
            )

        eval_metric = metric.compute()
        logger.info(f"epoch {epoch}: {eval_metric}")

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)


if __name__ == "__main__":
    main()
{% endif %}
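To see the pattern the new no-Trainer branch is built around in isolation, here is a minimal, self-contained sketch (not part of the commit): `Accelerator.prepare` takes care of device placement and distributed wrappers, and `accelerator.backward(loss)` replaces the usual `loss.backward()`. The toy linear model and random data are placeholders, not the template's model or dataset.

# Minimal sketch of training with accelerate instead of Trainer (illustrative only).
import torch
from torch.utils.data import DataLoader, TensorDataset

from accelerate import Accelerator

accelerator = Accelerator()

# Toy regression data and model standing in for the tokenized dataset and the
# {{cookiecutter.model_class}} used by the template.
dataset = TensorDataset(torch.randn(64, 4), torch.randn(64, 1))
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)
model = torch.nn.Linear(4, 1)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# After `prepare`, everything lives on the right device(s) and is wrapped for
# distributed runs started with `accelerate launch`.
model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

model.train()
for inputs, targets in dataloader:
    loss = torch.nn.functional.mse_loss(model(inputs), targets)
    accelerator.backward(loss)  # instead of loss.backward()
    optimizer.step()
    optimizer.zero_grad()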