Unverified Commit 1d4d0702 authored by Thomas Wolf, committed by GitHub

Merge pull request #1549 from hlums/master

Fix token order in xlnet preprocessing for SQuAD
parents 8a628355 9a3b173c
@@ -9,7 +9,7 @@ similar API between the different models.
| [Language Model fine-tuning](#language-model-fine-tuning) | Fine-tuning the library models for language modeling on a text dataset. Causal language modeling for GPT/GPT-2, masked language modeling for BERT/RoBERTa. |
| [Language Generation](#language-generation) | Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet. |
| [GLUE](#glue) | Examples running BERT/XLM/XLNet/RoBERTa on the 9 GLUE tasks. Examples feature distributed training as well as half-precision. |
| [SQuAD](#squad) | Using BERT/RoBERTa/XLNet/XLM for question answering, examples with distributed training. |
| [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks. |
| [Named Entity Recognition](#named-entity-recognition) | Using BERT for Named Entity Recognition (NER) on the CoNLL 2003 dataset, examples with distributed training. |
| [Abstractive summarization](#abstractive-summarization) | Fine-tuning the library models for abstractive summarization tasks on the CNN/Daily Mail dataset. |
@@ -415,6 +415,44 @@ exact_match = 86.91
This fine-tuned model is available as a checkpoint under the reference
`bert-large-uncased-whole-word-masking-finetuned-squad`.
#### Fine-tuning XLNet on SQuAD
This example code fine-tunes XLNet on the SQuAD 1.1 dataset. See above for instructions on downloading the SQuAD data.
```bash
export SQUAD_DIR=/path/to/SQUAD
python run_squad.py \
--model_type xlnet \
--model_name_or_path xlnet-large-cased \
--do_train \
--do_eval \
--do_lower_case \
--train_file $SQUAD_DIR/train-v1.1.json \
--predict_file $SQUAD_DIR/dev-v1.1.json \
--learning_rate 3e-5 \
--num_train_epochs 2 \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir ./xlnet_finetuned_squad/ \
--per_gpu_eval_batch_size=4 \
--per_gpu_train_batch_size=4 \
--save_steps 5000
```
Training with the previously defined hyper-parameters yields the following results:
```python
{
"exact": 85.45884578997162,
"f1": 92.5974600601065,
"total": 10570,
"HasAns_exact": 85.45884578997162,
"HasAns_f1": 92.59746006010651,
"HasAns_total": 10570
}
```
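Once training finishes, `run_squad.py` saves the fine-tuned model and tokenizer into the directory passed as `--output_dir`, so they can be reloaded with the usual `from_pretrained` API. A minimal sketch (the directory name below simply mirrors the command above):

```python
from transformers import XLNetForQuestionAnswering, XLNetTokenizer

# Directory written by run_squad.py via --output_dir in the command above
output_dir = "./xlnet_finetuned_squad/"

model = XLNetForQuestionAnswering.from_pretrained(output_dir)
tokenizer = XLNetTokenizer.from_pretrained(output_dir)
```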
## Named Entity Recognition
Based on the script [`run_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py).
@@ -542,4 +580,4 @@ python run_summarization_finetuning.py \
--model_name_or_path=bert2bert \
--do_train \
--data_path=$DATA_PATH \
```
\ No newline at end of file
@@ -305,7 +305,11 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
                max_seq_length=args.max_seq_length,
                doc_stride=args.doc_stride,
                max_query_length=args.max_query_length,
                is_training=not evaluate,
                cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
                pad_token_segment_id=3 if args.model_type in ['xlnet'] else 0,
                cls_token_at_end=True if args.model_type in ['xlnet'] else False,
                sequence_a_is_doc=True if args.model_type in ['xlnet'] else False)

        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)
...
@@ -192,7 +192,8 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
                                 cls_token='[CLS]', sep_token='[SEP]', pad_token=0,
                                 sequence_a_segment_id=0, sequence_b_segment_id=1,
                                 cls_token_segment_id=0, pad_token_segment_id=0,
                                 mask_padding_with_zero=True,
                                 sequence_a_is_doc=False):
    """Loads a data file into a list of `InputBatch`s."""

    unique_id = 1000000000
@@ -272,17 +273,19 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
                p_mask.append(0)
                cls_index = 0

            # XLNet: P SEP Q SEP CLS
            # Others: CLS Q SEP P SEP
            if not sequence_a_is_doc:
                # Query
                tokens += query_tokens
                segment_ids += [sequence_a_segment_id] * len(query_tokens)
                p_mask += [1] * len(query_tokens)

                # SEP token
                tokens.append(sep_token)
                segment_ids.append(sequence_a_segment_id)
                p_mask.append(1)

            # Paragraph
            for i in range(doc_span.length):
                split_token_index = doc_span.start + i
@@ -292,10 +295,23 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
                                                       split_token_index)
                token_is_max_context[len(tokens)] = is_max_context
                tokens.append(all_doc_tokens[split_token_index])
                if not sequence_a_is_doc:
                    segment_ids.append(sequence_b_segment_id)
                else:
                    segment_ids.append(sequence_a_segment_id)
                p_mask.append(0)
            paragraph_len = doc_span.length

            if sequence_a_is_doc:
                # SEP token
                tokens.append(sep_token)
                segment_ids.append(sequence_a_segment_id)
                p_mask.append(1)

                tokens += query_tokens
                segment_ids += [sequence_b_segment_id] * len(query_tokens)
                p_mask += [1] * len(query_tokens)

            # SEP token
            tokens.append(sep_token)
            segment_ids.append(sequence_b_segment_id)
@@ -342,7 +358,10 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
                    end_position = 0
                    span_is_impossible = True
                else:
                    if sequence_a_is_doc:
                        doc_offset = 0
                    else:
                        doc_offset = len(query_tokens) + 2
                    start_position = tok_start_position - doc_start + doc_offset
                    end_position = tok_end_position - doc_start + doc_offset
...
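The effect of the new `sequence_a_is_doc` flag is easier to see outside the diff. The sketch below is purely illustrative and not part of the patch: it uses toy tokens and a hypothetical `pack` helper, and it omits `p_mask`, padding and truncation, but it mirrors the two packing orders and shows why `doc_offset` drops to 0 when the paragraph comes first.

```python
def pack(query_tokens, doc_tokens, sequence_a_is_doc,
         cls_token="[CLS]", sep_token="[SEP]",
         seq_a_id=0, seq_b_id=1, cls_segment_id=0, cls_token_at_end=False):
    """Toy illustration of the two packing orders in convert_examples_to_features."""
    tokens, segment_ids = [], []
    if not cls_token_at_end:
        tokens, segment_ids = [cls_token], [cls_segment_id]
    if sequence_a_is_doc:
        # XLNet-style: P SEP Q SEP (CLS appended at the very end)
        doc_offset = len(tokens)  # paragraph tokens start the sequence
        tokens += doc_tokens + [sep_token] + query_tokens + [sep_token]
        segment_ids += [seq_a_id] * (len(doc_tokens) + 1) + [seq_b_id] * (len(query_tokens) + 1)
    else:
        # Default: CLS Q SEP P SEP
        doc_offset = len(tokens) + len(query_tokens) + 1  # CLS + query + SEP precede the paragraph
        tokens += query_tokens + [sep_token] + doc_tokens + [sep_token]
        segment_ids += [seq_a_id] * (len(query_tokens) + 1) + [seq_b_id] * (len(doc_tokens) + 1)
    if cls_token_at_end:
        tokens.append(cls_token)
        segment_ids.append(cls_segment_id)
    return tokens, segment_ids, doc_offset

query = ["Who", "lives", "here", "?"]
doc = ["Alice", "lives", "in", "Paris", "."]

# XLNet-style packing, as run_squad.py now requests for --model_type xlnet
print(pack(query, doc, sequence_a_is_doc=True, cls_token_at_end=True, cls_segment_id=2))
# Previous behaviour, unchanged for BERT/RoBERTa/XLM and friends
print(pack(query, doc, sequence_a_is_doc=False))
```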