Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
569f61a7
Unverified
Commit
569f61a7
authored
Jul 26, 2021
by
Matt
Committed by
GitHub
Jul 26, 2021
Browse files
Add TF multiple choice example (#12865)
* Add new multiple-choice example, remove old one
parent
4f19881f
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
483 additions
and
812 deletions
+483
-812
examples/tensorflow/multiple-choice/README.md
examples/tensorflow/multiple-choice/README.md
+24
-19
examples/tensorflow/multiple-choice/run_swag.py
examples/tensorflow/multiple-choice/run_swag.py
+459
-0
examples/tensorflow/multiple-choice/run_tf_multiple_choice.py
...ples/tensorflow/multiple-choice/run_tf_multiple_choice.py
+0
-220
examples/tensorflow/multiple-choice/utils_multiple_choice.py
examples/tensorflow/multiple-choice/utils_multiple_choice.py
+0
-573
No files found.
examples/tensorflow/multiple-choice/README.md
View file @
569f61a7
<!---
Copyright 202
0
The HuggingFace Team. All rights reserved.
Copyright 202
1
The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
...
...
@@ -13,26 +13,31 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Multiple-choice training (e.g. SWAG)
# Multiple Choice
This folder contains the
`run_swag.py`
script, showing an example of
*multiple-choice answering*
with the
🤗 Transformers library. For straightforward use-cases you may be able to use these scripts without modification,
although we have also included comments in the code to indicate areas that you may need to adapt to your own projects.
##
Fine-tuning on SWAG
##
# Multi-GPU and TPU usage
By default, the script uses a
`MirroredStrategy`
and will use multiple GPUs effectively if they are available. TPUs
can also be used by passing the name of the TPU resource with the
`--tpu`
argument.
### Memory usage and data loading
One thing to note is that all data is loaded into memory in this script. Most multiple-choice datasets are small
enough that this is not an issue, but if you have a very large dataset you will need to modify the script to handle
data streaming. This is particularly challenging for TPUs, given the stricter requirements and the sheer volume of data
required to keep them fed. A full explanation of all the possible pitfalls is a bit beyond this example script and
README, but for more information you can see the 'Input Datasets' section of
[
this document
](
https://www.tensorflow.org/guide/tpu
)
.
### Example command
```
bash
export
SWAG_DIR
=
/path/to/swag_data_dir
python ./examples/multiple-choice/run_tf_multiple_choice.py
\
--task_name
swag
\
--model_name_or_path
bert-base-cased
\
--do_train
\
--do_eval
\
--data_dir
$SWAG_DIR
\
--learning_rate
5e-5
\
--num_train_epochs
3
\
--max_seq_length
80
\
--output_dir
models_bert/swag_base
\
--per_gpu_eval_batch_size
=
16
\
--per_device_train_batch_size
=
16
\
--logging-dir
logs
\
--gradient_accumulation_steps
2
\
--overwrite_output
python run_swag.py
\
--model_name_or_path
distilbert-base-cased
\
--output_dir
output
\
--do_eval
\
--do_train
```
examples/tensorflow/multiple-choice/run_swag.py
0 → 100644
View file @
569f61a7
#!/usr/bin/env python
# coding=utf-8
# Copyright The HuggingFace Team and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for multiple choice.
"""
# You can also adapt this script on your own multiple choice task. Pointers for this are left as comments.
import
logging
import
os
import
sys
from
dataclasses
import
dataclass
,
field
from
pathlib
import
Path
from
typing
import
Optional
import
datasets
import
numpy
as
np
import
tensorflow
as
tf
from
datasets
import
load_dataset
import
transformers
from
transformers
import
(
CONFIG_NAME
,
TF2_WEIGHTS_NAME
,
AutoConfig
,
AutoTokenizer
,
HfArgumentParser
,
TFAutoModelForMultipleChoice
,
TFTrainingArguments
,
create_optimizer
,
set_seed
,
)
from
transformers.utils
import
check_min_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version
(
"4.10.0.dev0"
)
logger
=
logging
.
getLogger
(
__name__
)
# region Helper classes and functions
class SavePretrainedCallback(tf.keras.callbacks.Callback):
    """Keras callback that snapshots the model after every epoch.

    Hugging Face models expose a ``save_pretrained()`` method that writes both the
    weights and the metadata needed to reload them later via ``from_pretrained()``.
    This callback simply invokes that method at the end of each epoch so that a
    loadable checkpoint always exists in ``output_dir``.
    """

    def __init__(self, output_dir, **kwargs):
        super().__init__()
        # Directory that save_pretrained() will write into after each epoch.
        self.output_dir = output_dir

    def on_epoch_end(self, epoch, logs=None):
        # `self.model` is attached automatically by Keras when the callback is
        # registered with `model.fit(...)`.
        self.model.save_pretrained(self.output_dir)
def convert_dataset_for_tensorflow(
    dataset, non_label_column_names, batch_size, dataset_mode="variable_batch", shuffle=True, drop_remainder=True
):
    """Converts a Hugging Face dataset to a Tensorflow Dataset. The dataset_mode controls whether we pad all batches
    to the maximum sequence length, or whether we only pad to the maximum length within that batch. The former
    is most useful when training on TPU, as a new graph compilation is required for each sequence length.
    """

    def densify_ragged_batch(features, label=None):
        # Convert each ragged feature tensor in the batch to a dense tensor.
        # With a `None` shape (variable_batch mode) each batch is padded only to
        # its own longest sequence; with a fixed shape (constant_batch mode) every
        # batch is padded to the same dimensions.
        features = {
            feature: ragged_tensor.to_tensor(shape=batch_shape[feature]) for feature, ragged_tensor in features.items()
        }
        if label is None:
            return features
        else:
            return features, label

    # Model-input columns are everything that is neither a raw-text column nor the label.
    feature_keys = list(set(dataset.features.keys()) - set(non_label_column_names + ["label"]))
    if dataset_mode == "variable_batch":
        # `None` shape -> to_tensor() pads each batch to its own bounding shape.
        batch_shape = {key: None for key in feature_keys}
        data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
    elif dataset_mode == "constant_batch":
        data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
        # Fixed shape: [batch_size] + per-feature bounding dims, so every batch
        # compiles to the same graph signature (important on TPU).
        batch_shape = {
            key: tf.concat(([batch_size], ragged_tensor.bounding_shape()[1:]), axis=0)
            for key, ragged_tensor in data.items()
        }
    else:
        raise ValueError("Unknown dataset mode!")

    if "label" in dataset.features:
        labels = tf.convert_to_tensor(np.array(dataset["label"]))
        tf_dataset = tf.data.Dataset.from_tensor_slices((data, labels))
    else:
        tf_dataset = tf.data.Dataset.from_tensor_slices(data)
    if shuffle:
        # Buffer the whole dataset: everything is already in memory at this point.
        tf_dataset = tf_dataset.shuffle(buffer_size=len(dataset))
    options = tf.data.Options()
    # Sharding is disabled because the dataset is built in-memory on every replica.
    options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
    tf_dataset = (
        tf_dataset.with_options(options)
        .batch(batch_size=batch_size, drop_remainder=drop_remainder)
        .map(densify_ragged_batch)
    )
    return tf_dataset
# endregion
# region Arguments
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    # Required: checkpoint path or hub identifier of the model to fine-tune.
    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    # Optional override when the config lives at a different name/path than the model.
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    # Optional override when the tokenizer lives at a different name/path than the model.
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    # Download cache location for pretrained artifacts.
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    # Fast (Rust-backed) tokenizers are the default; set False to force the Python implementation.
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    # Git revision (branch / tag / commit) of the checkpoint to load.
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    # Needed when loading private models from the hub.
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
            "with private models)."
        },
    )
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    # Local training file; when both train/validation files are None the SWAG hub dataset is used.
    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
    validation_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    max_seq_length: Optional[int] = field(
        default=None,
        metadata={
            "help": "The maximum total input sequence length after tokenization. If passed, sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    pad_to_max_length: bool = field(
        default=False,
        metadata={
            "help": "Whether to pad all samples to the maximum sentence length. "
            "If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
            "efficient on GPU but very bad for TPU."
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
            "value if set."
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
            "value if set."
        },
    )

    def __post_init__(self):
        """Validate that any provided data files have a supported extension.

        Raises:
            ValueError: if `train_file` or `validation_file` is not a .csv or .json file.
        """
        # Use explicit raises instead of `assert`: assertions are stripped when
        # Python runs with -O, which would silently skip this validation.
        if self.train_file is not None:
            extension = self.train_file.split(".")[-1]
            if extension not in ["csv", "json"]:
                raise ValueError("`train_file` should be a csv or a json file.")
        if self.validation_file is not None:
            extension = self.validation_file.split(".")[-1]
            if extension not in ["csv", "json"]:
                raise ValueError("`validation_file` should be a csv or a json file.")
# endregion
def main():
    """Fine-tune and/or evaluate a TF multiple-choice model (defaults to the SWAG dataset)."""
    # region Argument parsing
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    output_dir = Path(training_args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # endregion

    # region Logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    # Keep our logger, the datasets logger and the transformers logger at the
    # per-process verbosity chosen by the training arguments.
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()
    # endregion

    # region Checkpoints
    # If the output dir already holds a config + TF weights, resume from it;
    # otherwise refuse to clobber a non-empty directory unless told to.
    checkpoint = None
    if len(os.listdir(training_args.output_dir)) > 0 and not training_args.overwrite_output_dir:
        if (output_dir / CONFIG_NAME).is_file() and (output_dir / TF2_WEIGHTS_NAME).is_file():
            checkpoint = output_dir
            logger.info(
                f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this"
                " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )
        else:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to continue regardless."
            )
    # endregion

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # region Load datasets
    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).

    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).

    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if data_args.train_file is not None or data_args.validation_file is not None:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
    else:
        # Downloading and loading the swag dataset from the hub.
        raw_datasets = load_dataset("swag", "regular", cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # When using your own dataset or a different dataset from swag, you will probably need to change this.
    # SWAG schema: one context sentence, one question header, four candidate endings.
    ending_names = [f"ending{i}" for i in range(4)]
    context_name = "sent1"
    question_header_name = "sent2"
    # endregion

    # region Load model config and tokenizer
    if checkpoint is not None:
        config_path = training_args.output_dir
    elif model_args.config_name:
        config_path = model_args.config_name
    else:
        config_path = model_args.model_name_or_path

    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        config_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    # endregion

    # region Dataset preprocessing
    if data_args.max_seq_length is None:
        max_seq_length = tokenizer.model_max_length
        if max_seq_length > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
            )
            max_seq_length = 1024
    else:
        if data_args.max_seq_length > tokenizer.model_max_length:
            logger.warning(
                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
        max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    def preprocess_function(examples):
        # Each example becomes 4 (context, header+ending) pairs, one per choice.
        # When you use your own dataset or task, adapt the column handling here.
        first_sentences = [[context] * 4 for context in examples[context_name]]
        question_headers = examples[question_header_name]
        second_sentences = [
            [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers)
        ]

        # Flatten out
        first_sentences = sum(first_sentences, [])
        second_sentences = sum(second_sentences, [])

        # Tokenize
        tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True, max_length=max_seq_length)

        # Un-flatten: regroup the 4 tokenized choices of each example.
        data = {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()}
        return data

    if training_args.do_train:
        if "train" not in raw_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = raw_datasets["train"]
        non_label_columns = [feature for feature in train_dataset.features if feature not in ("label", "labels")]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
        with training_args.main_process_first(desc="train dataset map pre-processing"):
            train_dataset = train_dataset.map(
                preprocess_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                load_from_cache_file=not data_args.overwrite_cache,
            )

    if training_args.do_eval:
        if "validation" not in raw_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = raw_datasets["validation"]
        if not training_args.do_train:
            # Otherwise `non_label_columns` was already computed from the train split above.
            non_label_columns = [feature for feature in eval_dataset.features if feature not in ("label", "labels")]
        if data_args.max_eval_samples is not None:
            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
        with training_args.main_process_first(desc="validation dataset map pre-processing"):
            eval_dataset = eval_dataset.map(
                preprocess_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                load_from_cache_file=not data_args.overwrite_cache,
            )
    # endregion

    with training_args.strategy.scope():
        # region Build model
        if checkpoint is None:
            model_path = model_args.model_name_or_path
        else:
            model_path = checkpoint
        model = TFAutoModelForMultipleChoice.from_pretrained(
            model_path,
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )

        # Global batch sizes scale with the number of replicas in the strategy.
        num_replicas = training_args.strategy.num_replicas_in_sync
        total_train_batch_size = training_args.per_device_train_batch_size * num_replicas
        total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas
        if training_args.do_train:
            total_train_steps = (len(train_dataset) // total_train_batch_size) * int(training_args.num_train_epochs)
            optimizer, lr_schedule = create_optimizer(
                init_lr=training_args.learning_rate, num_train_steps=int(total_train_steps), num_warmup_steps=0
            )
        else:
            optimizer = "adam"  # Just put anything in here, since we're not using it anyway
        model.compile(
            optimizer=optimizer,
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")],
        )
        # endregion

        # region Training
        if training_args.do_train:
            tf_train_dataset = convert_dataset_for_tensorflow(
                train_dataset, non_label_column_names=non_label_columns, batch_size=total_train_batch_size
            )
            if training_args.do_eval:
                validation_data = convert_dataset_for_tensorflow(
                    eval_dataset, non_label_column_names=non_label_columns, batch_size=total_eval_batch_size
                )
            else:
                validation_data = None
            model.fit(
                tf_train_dataset,
                validation_data=validation_data,
                epochs=int(training_args.num_train_epochs),
                callbacks=[SavePretrainedCallback(output_dir=training_args.output_dir)],
            )
        # endregion

        # region Evaluation
        if training_args.do_eval and not training_args.do_train:
            # Do a standalone evaluation pass
            tf_eval_dataset = convert_dataset_for_tensorflow(
                eval_dataset, non_label_column_names=non_label_columns, batch_size=total_eval_batch_size
            )
            model.evaluate(tf_eval_dataset)
        # endregion

        # region Push to hub
        if training_args.push_to_hub:
            model.push_to_hub(
                finetuned_from=model_args.model_name_or_path,
                tasks="multiple-choice",
                dataset_tags="swag",
                dataset_args="regular",
                dataset="SWAG",
                language="en",
            )
        # endregion
# Script entry point: run fine-tuning/evaluation only when executed directly.
if __name__ == "__main__":
    main()
examples/tensorflow/multiple-choice/run_tf_multiple_choice.py
deleted
100755 → 0
View file @
4f19881f
#!/usr/bin/env python
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Finetuning the library models for multiple choice (Bert, Roberta, XLNet)."""
import
logging
import
os
from
dataclasses
import
dataclass
,
field
from
typing
import
Dict
,
Optional
import
numpy
as
np
from
transformers
import
(
AutoConfig
,
AutoTokenizer
,
EvalPrediction
,
HfArgumentParser
,
TFAutoModelForMultipleChoice
,
TFTrainer
,
TFTrainingArguments
,
set_seed
,
)
from
transformers.utils
import
logging
as
hf_logging
from
utils_multiple_choice
import
Split
,
TFMultipleChoiceDataset
,
processors
hf_logging
.
set_verbosity_info
()
hf_logging
.
enable_default_handler
()
hf_logging
.
enable_explicit_format
()
logger
=
logging
.
getLogger
(
__name__
)
def simple_accuracy(preds, labels):
    """Return the fraction of predictions that exactly match their labels.

    Both arguments are expected to support elementwise `==` and `.mean()`
    (e.g. numpy arrays of the same shape).
    """
    matches = preds == labels
    return matches.mean()
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    # Required: checkpoint path or hub identifier of the model to fine-tune.
    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    # Optional override when the config lives at a different name/path than the model.
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    # Optional override when the tokenizer lives at a different name/path than the model.
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    # Download cache location for pretrained artifacts.
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    # Required: key into the `processors` registry (imported from utils_multiple_choice);
    # note the help string enumerates the registry at class-creation time.
    task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(processors.keys())})
    # Required: directory holding the raw data files for the chosen task.
    data_dir: str = field(metadata={"help": "Should contain the data files for the task."})
    max_seq_length: int = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
def main():
    """Fine-tune/evaluate a TF multiple-choice model with the (legacy) TFTrainer API."""
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Refuse to train into a non-empty output directory unless explicitly allowed.
    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.warning(
        f"device: {training_args.device}, n_replicas: {training_args.n_replicas}, "
        f"16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed
    set_seed(training_args.seed)

    # Resolve the task processor; an unknown task name surfaces as a ValueError.
    try:
        processor = processors[data_args.task_name]()
        label_list = processor.get_labels()
        num_labels = len(label_list)
    except KeyError:
        raise ValueError(f"Task not found: {data_args.task_name}")

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    with training_args.strategy.scope():
        # `from_pt` heuristic: a ".bin" in the path indicates PyTorch weights to convert.
        model = TFAutoModelForMultipleChoice.from_pretrained(
            model_args.model_name_or_path,
            from_pt=bool(".bin" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )

    # Get datasets
    train_dataset = (
        TFMultipleChoiceDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            task=data_args.task_name,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.train,
        )
        if training_args.do_train
        else None
    )
    eval_dataset = (
        TFMultipleChoiceDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            task=data_args.task_name,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.dev,
        )
        if training_args.do_eval
        else None
    )

    def compute_metrics(p: EvalPrediction) -> Dict:
        # Highest logit across the choice axis is the predicted answer.
        preds = np.argmax(p.predictions, axis=1)
        return {"acc": simple_accuracy(preds, p.label_ids)}

    # Initialize our Trainer
    trainer = TFTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset.get_dataset() if train_dataset else None,
        eval_dataset=eval_dataset.get_dataset() if eval_dataset else None,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train()
        trainer.save_model()
        tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        result = trainer.evaluate()
        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")

        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key, value in result.items():
                logger.info(f"  {key} =  {value}")
                writer.write(f"{key} = {value}\n")

            results.update(result)

    return results
# Script entry point: run fine-tuning/evaluation only when executed directly.
if __name__ == "__main__":
    main()
examples/tensorflow/multiple-choice/utils_multiple_choice.py
deleted
100644 → 0
View file @
4f19881f
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension """
import
csv
import
glob
import
json
import
logging
import
os
from
dataclasses
import
dataclass
from
enum
import
Enum
from
typing
import
List
,
Optional
import
tqdm
from
filelock
import
FileLock
from
transformers
import
PreTrainedTokenizer
,
is_tf_available
,
is_torch_available
logger
=
logging
.
getLogger
(
__name__
)
@dataclass(frozen=True)
class InputExample:
    """
    A single training/test example for multiple choice

    Args:
        example_id: Unique id for the example.
        question: string. The untokenized text of the second sequence (question).
        contexts: list of str. The untokenized text of the first sequence (context of corresponding question).
        endings: list of str. multiple choice's options. Its length must be equal to contexts' length.
        label: (Optional) string. The label of the example. This should be
        specified for train and dev examples, but not for test examples.
    """

    # Unique identifier for this example.
    example_id: str
    # The untokenized question text (second sequence).
    question: str
    # One context string per answer option (first sequence).
    contexts: List[str]
    # The candidate answer strings; same length as `contexts`.
    endings: List[str]
    # Gold label; None for test examples.
    label: Optional[str]
@dataclass(frozen=True)
class InputFeatures:
    """
    A single set of features of data.
    Property names are the same names as the corresponding inputs to a model.
    """

    example_id: str
    # One token-id sequence per answer choice: shape [num_choices, seq_length].
    input_ids: List[List[int]]
    # None when the tokenizer does not return the corresponding field.
    attention_mask: Optional[List[List[int]]]
    token_type_ids: Optional[List[List[int]]]
    # Index into the task's label list; None for unlabeled examples.
    label: Optional[int]
class Split(Enum):
    # Dataset split identifiers. The string value is used verbatim in cache
    # file names (see MultipleChoiceDataset.__init__).
    train = "train"
    dev = "dev"
    test = "test"
if is_torch_available():
    import torch
    from torch.utils.data.dataset import Dataset

    class MultipleChoiceDataset(Dataset):
        """
        PyTorch map-style dataset that tokenizes a multiple-choice task once and
        caches the resulting `InputFeatures` on disk next to the data.

        This will be superseded by a framework-agnostic approach
        soon.
        """

        features: List[InputFeatures]

        def __init__(
            self,
            data_dir: str,
            tokenizer: PreTrainedTokenizer,
            task: str,
            max_seq_length: Optional[int] = None,
            overwrite_cache=False,
            mode: Split = Split.train,
        ):
            # Look up the task-specific processor from the module-level registry.
            processor = processors[task]()

            # Cache name encodes split, tokenizer class, length and task so that
            # changing any of them invalidates the cache.
            cached_features_file = os.path.join(
                data_dir,
                f"cached_{mode.value}_{tokenizer.__class__.__name__}_{max_seq_length}_{task}",
            )

            # Make sure only the first process in distributed training processes the dataset,
            # and the others will use the cache.
            lock_path = cached_features_file + ".lock"
            with FileLock(lock_path):

                if os.path.exists(cached_features_file) and not overwrite_cache:
                    logger.info(f"Loading features from cached file {cached_features_file}")
                    # NOTE(review): torch.load unpickles the cache file — only safe
                    # for caches this process (or a trusted one) wrote itself.
                    self.features = torch.load(cached_features_file)
                else:
                    logger.info(f"Creating features from dataset file at {data_dir}")
                    label_list = processor.get_labels()
                    if mode == Split.dev:
                        examples = processor.get_dev_examples(data_dir)
                    elif mode == Split.test:
                        examples = processor.get_test_examples(data_dir)
                    else:
                        examples = processor.get_train_examples(data_dir)
                    logger.info(f"Training examples: {len(examples)}")
                    self.features = convert_examples_to_features(
                        examples,
                        label_list,
                        max_seq_length,
                        tokenizer,
                    )
                    logger.info(f"Saving features into cached file {cached_features_file}")
                    torch.save(self.features, cached_features_file)

        def __len__(self):
            return len(self.features)

        def __getitem__(self, i) -> InputFeatures:
            return self.features[i]
if is_tf_available():
    import tensorflow as tf

    class TFMultipleChoiceDataset:
        """
        TensorFlow counterpart of `MultipleChoiceDataset`: tokenizes a
        multiple-choice task eagerly (no on-disk cache) and exposes the result
        as a `tf.data.Dataset` via `get_dataset()`.

        This will be superseded by a framework-agnostic approach
        soon.
        """

        features: List[InputFeatures]

        def __init__(
            self,
            data_dir: str,
            tokenizer: PreTrainedTokenizer,
            task: str,
            max_seq_length: Optional[int] = 128,
            overwrite_cache=False,
            mode: Split = Split.train,
        ):
            processor = processors[task]()

            logger.info(f"Creating features from dataset file at {data_dir}")
            label_list = processor.get_labels()
            if mode == Split.dev:
                examples = processor.get_dev_examples(data_dir)
            elif mode == Split.test:
                examples = processor.get_test_examples(data_dir)
            else:
                examples = processor.get_train_examples(data_dir)
            logger.info(f"Training examples: {len(examples)}")

            self.features = convert_examples_to_features(
                examples,
                label_list,
                max_seq_length,
                tokenizer,
            )

            def gen():
                # Yields (inputs_dict, label) pairs for tf.data.Dataset.from_generator.
                for (ex_index, ex) in tqdm.tqdm(enumerate(self.features), desc="convert examples to features"):
                    if ex_index % 10000 == 0:
                        logger.info(f"Writing example {ex_index} of {len(examples)}")

                    yield (
                        {
                            # NOTE(review): example_id is hard-coded to 0 here and the
                            # string id from the features is dropped — confirm intended.
                            "example_id": 0,
                            "input_ids": ex.input_ids,
                            "attention_mask": ex.attention_mask,
                            "token_type_ids": ex.token_type_ids,
                        },
                        ex.label,
                    )

            # Second/third arguments are the legacy (output_types, output_shapes)
            # signature of from_generator; [None, None] = (num_choices, seq_len).
            self.dataset = tf.data.Dataset.from_generator(
                gen,
                (
                    {
                        "example_id": tf.int32,
                        "input_ids": tf.int32,
                        "attention_mask": tf.int32,
                        "token_type_ids": tf.int32,
                    },
                    tf.int64,
                ),
                (
                    {
                        "example_id": tf.TensorShape([]),
                        "input_ids": tf.TensorShape([None, None]),
                        "attention_mask": tf.TensorShape([None, None]),
                        "token_type_ids": tf.TensorShape([None, None]),
                    },
                    tf.TensorShape([]),
                ),
            )

        def get_dataset(self):
            # Attach a known cardinality so downstream code (e.g. Keras) can
            # report progress/epoch length for the generator-backed dataset.
            self.dataset = self.dataset.apply(tf.data.experimental.assert_cardinality(len(self.features)))

            return self.dataset

        def __len__(self):
            return len(self.features)

        def __getitem__(self, i) -> InputFeatures:
            return self.features[i]
class DataProcessor:
    """Abstract interface for multiple-choice data set converters.

    Subclasses provide one loader per split, each returning a list of
    `InputExample`s, plus the task's label inventory.
    """

    def get_train_examples(self, data_dir):
        """Return the `InputExample`s for the training split under ``data_dir``."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Return the `InputExample`s for the dev split under ``data_dir``."""
        raise NotImplementedError()

    def get_test_examples(self, data_dir):
        """Return the `InputExample`s for the test split under ``data_dir``."""
        raise NotImplementedError()

    def get_labels(self):
        """Return the list of label strings for this data set."""
        raise NotImplementedError()
class RaceProcessor(DataProcessor):
    """Processor for the RACE data set (high-school + middle-school documents)."""

    def get_train_examples(self, data_dir):
        """See base class."""
        logger.info(f"LOOKING AT {data_dir} train")
        docs = self._read_txt(os.path.join(data_dir, "train/high")) + self._read_txt(
            os.path.join(data_dir, "train/middle")
        )
        return self._create_examples(docs, "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        logger.info(f"LOOKING AT {data_dir} dev")
        docs = self._read_txt(os.path.join(data_dir, "dev/high")) + self._read_txt(
            os.path.join(data_dir, "dev/middle")
        )
        return self._create_examples(docs, "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        logger.info(f"LOOKING AT {data_dir} test")
        docs = self._read_txt(os.path.join(data_dir, "test/high")) + self._read_txt(
            os.path.join(data_dir, "test/middle")
        )
        return self._create_examples(docs, "test")

    def get_labels(self):
        """See base class."""
        return ["0", "1", "2", "3"]

    def _read_txt(self, input_dir):
        """Load every ``*txt`` file in ``input_dir`` as JSON, tagging each with its path."""
        records = []
        for path in tqdm.tqdm(glob.glob(input_dir + "/*txt"), desc="read files"):
            with open(path, "r", encoding="utf-8") as fin:
                record = json.load(fin)
                record["race_id"] = path
                records.append(record)
        return records

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for record in lines:
            race_id = f"{set_type}-{record['race_id']}"
            article = record["article"]
            for idx in range(len(record["answers"])):
                # Map the letter answer ("A".."D") onto a string index ("0".."3").
                truth = str(ord(record["answers"][idx]) - ord("A"))
                question = record["questions"][idx]
                options = record["options"][idx]
                examples.append(
                    InputExample(
                        example_id=race_id,
                        question=question,
                        # The article is repeated per choice — wasteful but convenient.
                        contexts=[article, article, article, article],
                        endings=[options[0], options[1], options[2], options[3]],
                        label=truth,
                    )
                )
        return examples
class SynonymProcessor(DataProcessor):
    """Processor for the Synonym data set (5-way multiple choice, CSV input)."""

    def get_train_examples(self, data_dir):
        """See base class."""
        logger.info(f"LOOKING AT {data_dir} train")
        return self._create_examples(self._read_csv(os.path.join(data_dir, "mctrain.csv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        logger.info(f"LOOKING AT {data_dir} dev")
        return self._create_examples(self._read_csv(os.path.join(data_dir, "mchp.csv")), "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        # NOTE(review): log message says "dev" — copy-paste from get_dev_examples;
        # left unchanged here since it is a runtime string.
        logger.info(f"LOOKING AT {data_dir} dev")
        return self._create_examples(self._read_csv(os.path.join(data_dir, "mctest.csv")), "test")

    def get_labels(self):
        """See base class."""
        return ["0", "1", "2", "3", "4"]

    def _read_csv(self, input_file):
        # Returns the whole file as a list of row lists.
        with open(input_file, "r", encoding="utf-8") as f:
            return list(csv.reader(f))

    def _create_examples(self, lines: List[List[str]], type: str):
        """Creates examples for the training and dev sets."""

        # Row layout: 0=id, 1=shared context, 2-6=the five candidate endings,
        # 7=label.
        examples = [
            InputExample(
                example_id=line[0],
                question="",
                # The same context is repeated once per choice.
                contexts=[line[1], line[1], line[1], line[1], line[1]],
                endings=[line[2], line[3], line[4], line[5], line[6]],
                label=line[7],
            )
            for line in lines
            # NOTE(review): unlike SwagProcessor, the first row is NOT skipped here —
            # confirm these CSVs really have no header line.
        ]

        return examples
class SwagProcessor(DataProcessor):
    """Processor for the SWAG data set (4-way multiple choice, CSV input)."""

    def get_train_examples(self, data_dir):
        """See base class."""
        logger.info(f"LOOKING AT {data_dir} train")
        rows = self._read_csv(os.path.join(data_dir, "train.csv"))
        return self._create_examples(rows, "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        logger.info(f"LOOKING AT {data_dir} dev")
        rows = self._read_csv(os.path.join(data_dir, "val.csv"))
        return self._create_examples(rows, "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        logger.info(f"LOOKING AT {data_dir} dev")
        raise ValueError(
            "For swag testing, the input file does not contain a label column. It can not be tested in current codesetting!"
        )
        # Unreachable: kept to document what testing would look like if the
        # restriction above were ever lifted.
        return self._create_examples(self._read_csv(os.path.join(data_dir, "test.csv")), "test")

    def get_labels(self):
        """See base class."""
        return ["0", "1", "2", "3"]

    def _read_csv(self, input_file):
        """Parse ``input_file`` into a list of row lists."""
        with open(input_file, "r", encoding="utf-8") as f:
            return list(csv.reader(f))

    def _create_examples(self, lines: List[List[str]], type: str):
        """Build `InputExample`s from parsed CSV rows (header row excluded)."""
        if type == "train" and lines[0][-1] != "label":
            raise ValueError("For training, the input file must contain a label column.")

        # Column layout: 2=id, 4=sent1 (shared context), 5=sent2 (common start
        # of every choice), 7-10=the four endings, 11=label.
        examples = []
        for row in lines[1:]:  # skip the header row
            examples.append(
                InputExample(
                    example_id=row[2],
                    question=row[5],
                    contexts=[row[4], row[4], row[4], row[4]],
                    endings=[row[7], row[8], row[9], row[10]],
                    label=row[11],
                )
            )
        return examples
class ArcProcessor(DataProcessor):
    """Processor for the ARC data set (request from allennlp)."""

    def get_train_examples(self, data_dir):
        """See base class."""
        logger.info(f"LOOKING AT {data_dir} train")
        return self._create_examples(self._read_json(os.path.join(data_dir, "train.jsonl")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        logger.info(f"LOOKING AT {data_dir} dev")
        return self._create_examples(self._read_json(os.path.join(data_dir, "dev.jsonl")), "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        logger.info(f"LOOKING AT {data_dir} test")
        return self._create_examples(self._read_json(os.path.join(data_dir, "test.jsonl")), "test")

    def get_labels(self):
        """See base class."""
        return ["0", "1", "2", "3"]

    def _read_json(self, input_file):
        # Returns the raw JSONL lines; parsing happens in _create_examples.
        with open(input_file, "r", encoding="utf-8") as fin:
            lines = fin.readlines()
        return lines

    def _create_examples(self, lines, type):
        """Creates examples for the training and dev sets."""

        # There are two types of labels. They should be normalized
        def normalize(truth):
            # Maps "A".."D" or "1".."4" to 0..3; anything else logs and returns None.
            # NOTE(review): `"" in "ABCD"` is True, so an empty answerKey would
            # reach ord("") and raise — assumes answerKey is always non-empty.
            if truth in "ABCD":
                return ord(truth) - ord("A")
            elif truth in "1234":
                return int(truth) - 1
            else:
                logger.info(f"truth ERROR! {truth}")
                return None

        examples = []
        three_choice = 0
        four_choice = 0
        five_choice = 0
        other_choices = 0
        # we deleted example which has more than or less than four choices
        for line in tqdm.tqdm(lines, desc="read arc data"):
            data_raw = json.loads(line.strip("\n"))
            if len(data_raw["question"]["choices"]) == 3:
                three_choice += 1
                continue
            elif len(data_raw["question"]["choices"]) == 5:
                five_choice += 1
                continue
            elif len(data_raw["question"]["choices"]) != 4:
                other_choices += 1
                continue
            four_choice += 1
            truth = str(normalize(data_raw["answerKey"]))
            # `assert` guards against normalize() returning None (unknown label form).
            assert truth != "None"
            question_choices = data_raw["question"]
            question = question_choices["stem"]
            # NOTE(review): `id` shadows the builtin; left unchanged here.
            id = data_raw["id"]
            options = question_choices["choices"]
            if len(options) == 4:
                examples.append(
                    InputExample(
                        example_id=id,
                        question=question,
                        # "para" may embed underscores used as blanks; strip them.
                        contexts=[
                            options[0]["para"].replace("_", ""),
                            options[1]["para"].replace("_", ""),
                            options[2]["para"].replace("_", ""),
                            options[3]["para"].replace("_", ""),
                        ],
                        endings=[options[0]["text"], options[1]["text"], options[2]["text"], options[3]["text"]],
                        label=truth,
                    )
                )

        if type == "train":
            assert len(examples) > 1
            assert examples[0].label is not None
        logger.info(f"len examples: {len(examples)}")
        logger.info(f"Three choices: {three_choice}")
        logger.info(f"Five choices: {five_choice}")
        logger.info(f"Other choices: {other_choices}")
        logger.info(f"four choices: {four_choice}")
        return examples
def convert_examples_to_features(
    examples: List[InputExample],
    label_list: List[str],
    max_length: int,
    tokenizer: PreTrainedTokenizer,
) -> List[InputFeatures]:
    """
    Convert `InputExample`s into tokenized `InputFeatures`.

    Each example produces one tokenizer call per (context, ending) pair, so the
    resulting `input_ids` / masks are [num_choices, max_length].

    Args:
        examples: parsed examples from one of the task processors.
        label_list: all label strings for the task; position defines the int label.
        max_length: pad/truncate every choice sequence to this length.
        tokenizer: tokenizer applied to (context, question + ending) pairs.

    Returns:
        One `InputFeatures` per input example.
    """
    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for ex_index, example in tqdm.tqdm(enumerate(examples), desc="convert examples to features"):
        if ex_index % 10000 == 0:
            logger.info(f"Writing example {ex_index} of {len(examples)}")
        choices_inputs = []
        # (The original also tracked an unused ending index; dropped.)
        for context, ending in zip(example.contexts, example.endings):
            text_a = context
            if example.question.find("_") != -1:
                # this is for cloze question: substitute the ending into the blank
                text_b = example.question.replace("_", ending)
            else:
                text_b = example.question + " " + ending

            inputs = tokenizer(
                text_a,
                text_b,
                add_special_tokens=True,
                max_length=max_length,
                padding="max_length",
                truncation=True,
                return_overflowing_tokens=True,
            )
            if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0:
                logger.info(
                    "Attention! you are cropping tokens (swag task is ok). "
                    "If you are training ARC and RACE and you are poping question + options,"
                    "you need to try to use a bigger max seq length!"
                )

            choices_inputs.append(inputs)

        # NOTE(review): raises KeyError when example.label is None (test split) —
        # this helper assumes labeled examples only.
        label = label_map[example.label]

        input_ids = [x["input_ids"] for x in choices_inputs]
        attention_mask = (
            [x["attention_mask"] for x in choices_inputs] if "attention_mask" in choices_inputs[0] else None
        )
        token_type_ids = (
            [x["token_type_ids"] for x in choices_inputs] if "token_type_ids" in choices_inputs[0] else None
        )

        features.append(
            InputFeatures(
                example_id=example.example_id,
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                label=label,
            )
        )

    for f in features[:2]:
        logger.info("*** Example ***")
        # Bug fix: the original called logger.info("feature: {f}") without the
        # f-string prefix, logging the literal text instead of the feature.
        logger.info(f"feature: {f}")
    return features
# Registry mapping the task name (as passed on the command line) to the
# `DataProcessor` subclass that loads it.
processors = {"race": RaceProcessor, "swag": SwagProcessor, "arc": ArcProcessor, "syn": SynonymProcessor}
# Number of answer choices per task.
# Bug fix: the original literal used commas instead of colons
# ({"race", 4, "swag", 4, ...}), which created a `set` rather than the
# intended task -> num_labels mapping, so indexing by task name failed.
MULTIPLE_CHOICE_TASKS_NUM_LABELS = {"race": 4, "swag": 4, "arc": 4, "syn": 5}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment