Commit 1849aa7d authored by thomwolf

update readme and pretrained model weight files

parent 43e0e8fa
...@@ -119,6 +119,7 @@ with torch.no_grad():
# See the models' docstrings for the details of all the outputs
# In our case, the first element is the hidden state of the last layer of the Bert model
encoded_layers = outputs[0]
# We have encoded our input sequence in a FloatTensor of shape (batch size, sequence length, model hidden dimension)
assert tuple(encoded_layers.shape) == (1, len(indexed_tokens), model.config.hidden_size)
```
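As a small aside (not part of the README excerpt), the per-token vectors in that tensor can be used directly; the sketch below builds only on the names defined in the snippet above:

```python
# Minimal illustrative sketch using the names from the snippet above.
token_position = 0  # hypothetical position; any index < len(indexed_tokens) works
token_vector = encoded_layers[0, token_position]  # shape: (model.config.hidden_size,)

# Or average over the sequence to get a single fixed-size vector for the whole input.
sequence_vector = encoded_layers.mean(dim=1)      # shape: (1, model.config.hidden_size)
```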
...@@ -218,19 +219,27 @@ Before running any one of these GLUE tasks you should download the
[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
and unpack it to some directory `$GLUE_DIR`.
You should also install the additional packages required by the examples:
```shell
pip install -r ./examples/requirements.txt
```
```shell
export GLUE_DIR=/path/to/glue
export TASK_NAME=MRPC
python ./examples/run_glue.py \
--model_type bert \
--model_name_or_path bert-base-uncased \
--task_name $TASK_NAME \
--do_train \
--do_eval \
--do_lower_case \
--data_dir $GLUE_DIR/$TASK_NAME \
--max_seq_length 128 \
--per_gpu_eval_batch_size=8 \
--per_gpu_train_batch_size=8 \
--learning_rate 2e-5 \
--num_train_epochs 3.0 \
--output_dir /tmp/$TASK_NAME/
...@@ -243,7 +252,7 @@ The dev set results will be present within the text file 'eval_results.txt' in t
#### Fine-tuning XLNet model on the STS-B regression task
This example code fine-tunes XLNet on the STS-B corpus using parallel training on a server with 4 V100 GPUs.
Parallel training is a simple way to use several GPUs (but is slower and less flexible than distributed training, see below).
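For readers unfamiliar with the distinction, the following is a generic PyTorch sketch (not taken from the example scripts; the tiny `Linear` model is a stand-in for the XLNet model used above):

```python
import torch

# A tiny stand-in model; in the example above this would be the XLNet model.
model = torch.nn.Linear(16, 2)
batch = torch.randn(8, 16)

if torch.cuda.is_available():
    # Parallel training (single process): DataParallel replicates the model on every visible
    # GPU and splits each batch across them. Simple, but the replication happens on every
    # forward pass, which is part of why it is slower than distributed training.
    parallel_model = torch.nn.DataParallel(model.cuda())
    outputs = parallel_model(batch.cuda())

# Distributed training instead launches one process per GPU (e.g. with torch.distributed.launch,
# as in the MRPC and SQuAD examples below) and wraps the model in
# torch.nn.parallel.DistributedDataParallel, so only gradients are synchronized across processes.
```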
```shell
export GLUE_DIR=/path/to/glue
...@@ -252,6 +261,7 @@ python ./examples/run_glue.py \
--model_type xlnet \
--model_name_or_path xlnet-large-cased \
--do_train \
--do_eval \
--task_name=sts-b \
--data_dir=${GLUE_DIR}/STS-B \
--output_dir=./proc_data/sts-b-110 \
...@@ -266,15 +276,14 @@ python ./examples/run_glue.py \
--warmup_steps=120
```
On this machine we thus have a batch size of 32; please increase `gradient_accumulation_steps` to reach the same batch size if you have a smaller machine. These hyper-parameters should result in a Pearson correlation coefficient of `+0.917` on the development set.
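The batch-size bookkeeping is just a product of three factors; a quick illustrative check (the per-GPU batch size is not shown in this excerpt, so the value below is an assumption consistent with the text):

```python
# Effective batch size = per-GPU batch size * number of GPUs * gradient accumulation steps.
per_gpu_train_batch_size = 8   # assumed value for the 4-GPU server described above
n_gpu = 4
gradient_accumulation_steps = 1

effective_batch_size = per_gpu_train_batch_size * n_gpu * gradient_accumulation_steps
assert effective_batch_size == 32  # on a smaller machine, raise gradient_accumulation_steps instead
```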
#### Fine-tuning Bert model on the MRPC classification task
This example code fine-tunes the Bert Whole Word Masking model on the Microsoft Research Paraphrase Corpus (MRPC) using distributed training on 8 V100 GPUs to reach an F1 > 92.
```bash
python -m torch.distributed.launch --nproc_per_node 8 ./examples/run_glue.py \
--model_type bert \
--model_name_or_path bert-large-uncased-whole-word-masking \
--task_name MRPC \
...@@ -308,7 +317,7 @@ Training with these hyper-parameters gave us the following results:
This example code fine-tunes BERT on the SQuAD dataset using distributed training on 8 V100 GPUs and the Bert Whole Word Masking uncased model to reach an F1 > 93 on SQuAD:
```bash
python -m torch.distributed.launch --nproc_per_node=8 ./examples/run_squad.py \
--model_type bert \
--model_name_or_path bert-large-uncased-whole-word-masking \
--do_train \
...
tensorboardX
scikit-learn
\ No newline at end of file
...@@ -129,7 +129,19 @@ def create_and_check_required_methods_tokenizer(tester, input_text, output_text,
tester.assertNotEqual(len(tokens_2), 0)
tester.assertIsInstance(text_2, (str, unicode))
def create_and_check_pretrained_model_lists(tester, input_text, output_text, tokenizer_class, *inputs, **kwargs):
weights_list = list(tokenizer_class.max_model_input_sizes.keys())
weights_lists_2 = []
for file_id, map_list in tokenizer_class.pretrained_vocab_files_map.items():
weights_lists_2.append(list(map_list.keys()))
for weights_list_2 in weights_lists_2:
tester.assertListEqual(weights_list, weights_list_2)
def create_and_check_tokenizer_commons(tester, input_text, output_text, tokenizer_class, *inputs, **kwargs):
create_and_check_pretrained_model_lists(tester, input_text, output_text, tokenizer_class, *inputs, **kwargs)
create_and_check_required_methods_tokenizer(tester, input_text, output_text, tokenizer_class, *inputs, **kwargs)
create_and_check_add_tokens_tokenizer(tester, tokenizer_class, *inputs, **kwargs)
create_and_check_save_and_load_tokenizer(tester, tokenizer_class, *inputs, **kwargs)
...
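The new `create_and_check_pretrained_model_lists` helper above asserts that every file map in `pretrained_vocab_files_map` lists exactly the same model shortcut names as `max_model_input_sizes`. A standalone sketch of the same consistency check (assuming the `pytorch_transformers` package name in use at this point in the repository's history):

```python
from pytorch_transformers import BertTokenizer  # any tokenizer class exposing the two class maps

shortcut_names = list(BertTokenizer.max_model_input_sizes.keys())
for file_id, url_map in BertTokenizer.pretrained_vocab_files_map.items():
    # Every per-file map (vocab file, merges file, ...) should cover exactly the same checkpoints.
    assert list(url_map.keys()) == shortcut_names, file_id
```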
...@@ -138,7 +138,6 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
def save_vocabulary(self, vocab_path):
"""Save the tokenizer vocabulary to a directory or file."""
if os.path.isdir(vocab_path):
vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['pretrained_vocab_file'])
torch.save(self.__dict__, vocab_file)
...
...@@ -163,6 +163,11 @@ class PreTrainedTokenizer(object):
for file_id, map_list in cls.pretrained_vocab_files_map.items():
vocab_files[file_id] = map_list[pretrained_model_name_or_path]
else:
logger.info(
"Model name '{}' not found in model shortcut name list ({}). "
"Assuming '{}' is a path or url to a directory containing tokenizer files.".format(
pretrained_model_name_or_path, ', '.join(s3_models),
pretrained_model_name_or_path))
all_vocab_files_names = {'added_tokens_file': ADDED_TOKENS_FILE,
'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE}
all_vocab_files_names.update(cls.vocab_files_names)
...@@ -175,6 +180,14 @@ class PreTrainedTokenizer(object):
logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
full_file_name = None
vocab_files[file_id] = full_file_name
if all(full_file_name is None for full_file_name in vocab_files.values()):
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find tokenizer files"
"at this path or url.".format(
pretrained_model_name_or_path, ', '.join(s3_models),
pretrained_model_name_or_path, ))
return None
# Get files from url, cache, or disk depending on the case
try:
...
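For context, the resolution logic patched above backs tokenizer loading both by shortcut name and by local path. A hedged usage sketch (assuming the `pytorch_transformers` package name; the local directory is hypothetical):

```python
from pytorch_transformers import BertTokenizer

# Shortcut name: vocab files are resolved via pretrained_vocab_files_map and downloaded/cached.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Anything else is now treated, with an informative log message, as a path or url to a directory
# containing tokenizer files; if none of the expected files exist there, from_pretrained logs an
# error and returns None (see the added block above).
tokenizer = BertTokenizer.from_pretrained('./my_saved_tokenizer/')  # hypothetical directory
```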
...@@ -59,6 +59,13 @@ PRETRAINED_VOCAB_FILES_MAP = {
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'xlm-mlm-en-2048': 512,
'xlm-mlm-ende-1024': 512,
'xlm-mlm-enfr-1024': 512,
'xlm-mlm-enro-1024': 512,
'xlm-mlm-tlm-xnli15-1024': 512,
'xlm-mlm-xnli15-1024': 512,
'xlm-clm-enfr-1024': 512,
'xlm-clm-ende-1024': 512,
}
def get_pairs(word):
...
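These sizes bound the number of positions each XLM checkpoint can handle; as with the other tokenizers, `from_pretrained` is expected to use this table to cap the tokenizer's `max_len` (a hedged sketch, assuming the `pytorch_transformers` package name and that the wiring matches the other models):

```python
from pytorch_transformers import XLMTokenizer

tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-ende-1024')
# Expected to be 512, taken from the PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES entries added above.
print(tokenizer.max_len)
```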