chenpangpang / transformers · Commits · bf1f43fb
"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "4ad2f68e3439a98ba76317a6453f910c4a631284"
Unverified commit bf1f43fb, authored Mar 23, 2021 by Sylvain Gugger, committed by GitHub on Mar 23, 2021

Update the example template for a no Trainer option (#10865)
parent 2eb596f0

Showing 2 changed files with 409 additions and 3 deletions (+409 -3):

templates/adding_a_new_example_script/cookiecutter.json (+2 -1)
templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py (+407 -2)
templates/adding_a_new_example_script/cookiecutter.json

@@ -4,5 +4,6 @@
     "example_shortcut": "{{cookiecutter.directory_name}}",
     "model_class": "AutoModel",
     "authors": "The HuggingFace Team",
-    "can_train_from_scratch": ["True", "False"]
+    "can_train_from_scratch": ["True", "False"],
+    "with_trainer": ["True", "False"]
 }
 \ No newline at end of file
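The new "with_trainer" key adds a second prompt to the cookiecutter template, so a generated example can use either the Trainer-based script or the new Accelerate-based (no Trainer) script. Below is a minimal sketch, not part of this commit, of generating an example with the no-Trainer variant through cookiecutter's Python API; the extra_context values, and the keys not shown in the hunk above (example_name, directory_name), are hypothetical answers to the template's prompts, given only for illustration.

# Hypothetical usage sketch: generate an example from the updated template,
# answering the prompts from cookiecutter.json programmatically.
from cookiecutter.main import cookiecutter

cookiecutter(
    "templates/adding_a_new_example_script",  # path to the template inside the transformers repo
    no_input=True,                            # do not prompt interactively
    extra_context={
        "example_name": "text classification",            # hypothetical answers
        "directory_name": "text-classification",
        "example_shortcut": "text_classification",
        "model_class": "AutoModelForSequenceClassification",
        "authors": "The HuggingFace Team",
        "can_train_from_scratch": "False",
        "with_trainer": "False",              # pick the new Accelerate-based script
    },
)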
templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py

@@ -14,10 +14,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Fine-tuning the library models for {{cookiecutter.example_name}}.
+Fine-tuning a 🤗 Transformers model on {{cookiecutter.example_name}}.
 """
 # You can also adapt this script on your own {{cookiecutter.example_name}} task. Pointers for this are left as comments.
+
+{%- if cookiecutter.with_trainer == "True" %}
 import logging
 import math
 import os
@@ -297,7 +299,7 @@ def main():
 {%- elif cookiecutter.can_train_from_scratch == "False" %}
     config = AutoConfig.from_pretrained(
         model_args.config_name if model_args.config_name else model_args.model_name_or_path,
-        num_labels=num_labels,
+        # num_labels=num_labels, Uncomment if you have a certain number of labels
         finetuning_task=data_args.task_name,
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
@@ -426,3 +428,406 @@ def _mp_fn(index):
if __name__ == "__main__":
    main()
{%- elif cookiecutter.with_trainer == "False" %}
import argparse
import logging
import math
import os
import random

import datasets
from datasets import load_dataset, load_metric
from torch.utils.data.dataloader import DataLoader
from tqdm.auto import tqdm

import transformers
from accelerate import Accelerator
from transformers import (
    CONFIG_MAPPING,
    MODEL_MAPPING,
    AdamW,
    AutoConfig,
    {{cookiecutter.model_class}},
    AutoTokenizer,
    DataCollatorWithPadding,
    PretrainedConfig,
    SchedulerType,
    default_data_collator,
    get_scheduler,
    set_seed,
)


logger = logging.getLogger(__name__)

{%- if cookiecutter.can_train_from_scratch == "True" %}
# You should update this to your particular problem to have better documentation of `model_type`
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
{% endif %}


def parse_args():
    parser = argparse.ArgumentParser(description="Finetune a transformers model on a text classification task")
    parser.add_argument(
        "--dataset_name",
        type=str,
        default=None,
        help="The name of the dataset to use (via the datasets library).",
    )
    parser.add_argument(
        "--dataset_config_name",
        type=str,
        default=None,
        help="The configuration name of the dataset to use (via the datasets library).",
    )
    parser.add_argument(
        "--train_file", type=str, default=None, help="A csv or a json file containing the training data."
    )
    parser.add_argument(
        "--validation_file", type=str, default=None, help="A csv or a json file containing the validation data."
    )
    parser.add_argument(
        "--max_length",
        type=int,
        default=128,
        help=(
            "The maximum total input sequence length after tokenization. Sequences longer than this will be truncated,"
            " sequences shorter will be padded if `--pad_to_max_length` is passed."
        ),
    )
    parser.add_argument(
        "--pad_to_max_length",
        action="store_true",
        help="If passed, pad all samples to `max_length`. Otherwise, dynamic padding is used.",
    )
    parser.add_argument(
        "--model_name_or_path",
        type=str,
        help="Path to pretrained model or model identifier from huggingface.co/models.",
        required=True,
    )
    parser.add_argument(
        "--config_name",
        type=str,
        default=None,
        help="Pretrained config name or path if not the same as model_name",
    )
    parser.add_argument(
        "--tokenizer_name",
        type=str,
        default=None,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--use_slow_tokenizer",
        action="store_true",
        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
    )
    parser.add_argument(
        "--per_device_train_batch_size",
        type=int,
        default=8,
        help="Batch size (per device) for the training dataloader.",
    )
    parser.add_argument(
        "--per_device_eval_batch_size",
        type=int,
        default=8,
        help="Batch size (per device) for the evaluation dataloader.",
    )
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=5e-5,
        help="Initial learning rate (after the potential warmup period) to use.",
    )
    parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.")
    parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_train_steps",
        type=int,
        default=None,
        help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of update steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--lr_scheduler_type",
        type=SchedulerType,
        default="linear",
        help="The scheduler type to use.",
        choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"],
    )
    parser.add_argument(
        "--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler."
    )
    parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.")
    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
{%- if cookiecutter.can_train_from_scratch == "True" %}
    parser.add_argument(
        "--model_type",
        type=str,
        default=None,
        help="Model type to use if training from scratch.",
        choices=MODEL_TYPES,
    )
{% endif %}
    args = parser.parse_args()

    # Sanity checks
    if args.task_name is None and args.train_file is None and args.validation_file is None:
        raise ValueError("Need either a task name or a training/validation file.")
    else:
        if args.train_file is not None:
            extension = args.train_file.split(".")[-1]
            assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
        if args.validation_file is not None:
            extension = args.validation_file.split(".")[-1]
            assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."

    if args.output_dir is not None:
        os.makedirs(args.output_dir, exist_ok=True)

    return args


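# Hypothetical invocation of a script generated from this template, shown for illustration
# only (the dataset and model names are placeholders, not part of this commit):
#
#   python run_{{cookiecutter.example_shortcut}}.py \
#       --model_name_or_path bert-base-cased \
#       --dataset_name imdb \
#       --per_device_train_batch_size 16 \
#       --num_train_epochs 3 \
#       --output_dir /tmp/{{cookiecutter.example_shortcut}}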
def main():
    args = parse_args()

    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    accelerator = Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)

    # Setup logging, we only want one process per machine to log things on the screen.
    # accelerator.is_local_main_process is only True for one process per machine.
    logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
    else:
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        extension = args.train_file.split(".")[-1]
        raw_datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
{%- if cookiecutter.can_train_from_scratch == "True" %}
    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
    elif args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if args.model_name_or_path:
        model = {{cookiecutter.model_class}}.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
        )
    else:
        logger.info("Training new model from scratch")
        model = {{cookiecutter.model_class}}.from_config(config)

    model.resize_token_embeddings(len(tokenizer))
{%- elif cookiecutter.can_train_from_scratch == "False" %}
    config = AutoConfig.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        # num_labels=num_labels, Uncomment if you have a certain number of labels
        finetuning_task=args.task_name,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        use_fast=not args.use_slow_tokenizer,
    )
    model = {{cookiecutter.model_class}}.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
    )
{% endif %}

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    column_names = raw_datasets["train"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    padding = "max_length" if args.pad_to_max_length else False

    def tokenize_function(examples):
        result = tokenizer(examples[text_column_name], padding=padding, max_length=args.max_length, truncation=True)
        if "label" in examples:
            result["labels"] = examples["label"]
        return result

    processed_datasets = raw_datasets.map(
        tokenize_function, batched=True, remove_columns=raw_datasets["train"].column_names
    )

    train_dataset = processed_datasets["train"]
    eval_dataset = processed_datasets["validation"]

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

    # DataLoaders creation:
    if args.pad_to_max_length:
        # If padding was already done to max length, we use the default data collator that will just convert everything
        # to tensors.
        data_collator = default_data_collator
    else:
        # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
        # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiples
        # of 8, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None))

    train_dataloader = DataLoader(
        train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
    )
    eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size)

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )

    # Note -> the training dataloader needs to be prepared before we grab its length below (because its length will be
    # shorter in multiprocess)

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    # TODO Get the proper metric function
    # metric = load_metric(xxx)

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(f"  Instantaneous batch size per device = {args.per_device_train_batch_size}")
    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
    completed_steps = 0

    for epoch in range(args.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
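            # Only run an optimizer/scheduler step once every `gradient_accumulation_steps` batches
            # (and on the last batch of the epoch), so gradients from several batches are accumulated first.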
            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if completed_steps >= args.max_train_steps:
                break

        model.eval()
        for step, batch in enumerate(eval_dataloader):
            outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            metric.add_batch(
                predictions=accelerator.gather(predictions),
                references=accelerator.gather(batch["labels"]),
            )

        eval_metric = metric.compute()
        logger.info(f"epoch {epoch}: {eval_metric}")

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)


if __name__ == "__main__":
    main()
{% endif %}
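The evaluation loop in the no-Trainer branch uses a `metric` object that the template deliberately leaves as a TODO (`# metric = load_metric(xxx)`), since the right metric depends on the task the generated example targets. A minimal sketch of filling that placeholder, assuming a simple accuracy metric from the `datasets` library of this era, could look like the following; the metric name is an illustrative assumption, not part of this commit.

# Hypothetical completion of the template's TODO, for illustration only.
from datasets import load_metric

metric = load_metric("accuracy")  # any metric exposing add_batch()/compute() fits the loop above

# Inside the evaluation loop, exactly as in the template:
#     metric.add_batch(predictions=accelerator.gather(predictions), references=accelerator.gather(batch["labels"]))
# and once per epoch:
#     eval_metric = metric.compute()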