"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "e566adc09c443af843e83b239c3a18b8e7bd422d"
Unverified commit 562f8640, authored by Thomas Wolf, committed by GitHub

Merge branch 'master' into fix-xlnet-squad2.0

parents ca99a2d5 8618bf15
version: 2
jobs:
  run_tests_py3_torch_and_tf:
    working_directory: ~/transformers
    docker:
      - image: circleci/python:3.5
    environment:
      OMP_NUM_THREADS: 1
    resource_class: xlarge
    parallelism: 1
    steps:
@@ -11,65 +13,67 @@ jobs:
      - run: sudo pip install torch
      - run: sudo pip install tensorflow
      - run: sudo pip install --progress-bar off .
      - run: sudo pip install pytest codecov pytest-cov pytest-xdist
      - run: sudo pip install tensorboardX scikit-learn
      - run: python -m pytest -n 8 --dist=loadfile -s -v ./transformers/tests/ --cov
      - run: codecov
  run_tests_py3_torch:
    working_directory: ~/transformers
    docker:
      - image: circleci/python:3.5
    environment:
      OMP_NUM_THREADS: 1
    resource_class: xlarge
    parallelism: 1
    steps:
      - checkout
      - run: sudo pip install torch
      - run: sudo pip install --progress-bar off .
      - run: sudo pip install pytest codecov pytest-cov pytest-xdist
      - run: sudo pip install tensorboardX scikit-learn
      - run: python -m pytest -n 8 --dist=loadfile -s -v ./transformers/tests/ --cov
      - run: codecov
  run_tests_py3_tf:
    working_directory: ~/transformers
    docker:
      - image: circleci/python:3.5
    environment:
      OMP_NUM_THREADS: 1
    resource_class: xlarge
    parallelism: 1
    steps:
      - checkout
      - run: sudo pip install tensorflow
      - run: sudo pip install --progress-bar off .
      - run: sudo pip install pytest codecov pytest-cov pytest-xdist
      - run: sudo pip install tensorboardX scikit-learn
      - run: python -m pytest -n 8 --dist=loadfile -s -v ./transformers/tests/ --cov
      - run: codecov
  run_tests_py3_custom_tokenizers:
    working_directory: ~/transformers
    docker:
      - image: circleci/python:3.5
    steps:
      - checkout
      - run: sudo pip install --progress-bar off .
      - run: sudo pip install pytest pytest-xdist
      - run: sudo pip install mecab-python3
      - run: RUN_CUSTOM_TOKENIZERS=1 python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py
  run_examples_py3_torch:
    working_directory: ~/transformers
    docker:
      - image: circleci/python:3.5
    environment:
      OMP_NUM_THREADS: 1
    resource_class: xlarge
    parallelism: 1
    steps:
      - checkout
      - run: sudo pip install torch
      - run: sudo pip install --progress-bar off .
      - run: sudo pip install pytest pytest-xdist
      - run: sudo pip install tensorboardX scikit-learn
      - run: python -m pytest -n 8 --dist=loadfile -s -v ./examples/
  deploy_doc:
    working_directory: ~/transformers
    docker:
@@ -82,6 +86,16 @@ jobs:
      - run: sudo pip install --progress-bar off -r docs/requirements.txt
      - run: sudo pip install --progress-bar off -r requirements.txt
      - run: ./.circleci/deploy.sh
  check_repository_consistency:
    working_directory: ~/transformers
    docker:
      - image: circleci/python:3.5
    resource_class: small
    parallelism: 1
    steps:
      - checkout
      - run: sudo pip install requests
      - run: python ./utils/link_tester.py
workflow_filters: &workflow_filters
  filters:
    branches:
@@ -91,9 +105,10 @@ workflows:
  version: 2
  build_and_test:
    jobs:
      - check_repository_consistency
      - run_examples_py3_torch
      - run_tests_py3_custom_tokenizers
      - run_tests_py3_torch_and_tf
      - run_tests_py3_torch
      - run_tests_py3_tf
      - deploy_doc: *workflow_filters
@@ -23,3 +23,4 @@ deploy_doc "fe02e45" v1.1.0
deploy_doc "89fd345" v1.2.0
deploy_doc "fc9faa8" v2.0.0
deploy_doc "3ddce1d" v2.1.1
deploy_doc "3616209" v2.2.0
@@ -106,7 +106,7 @@ Follow these steps to start contributing:
```bash
$ git clone git@github.com:<your Github handle>/transformers.git
$ cd transformers
$ git remote add upstream https://github.com/huggingface/transformers.git
```

3. Create a new branch to hold your development changes:
@@ -168,7 +168,7 @@ Follow these steps to start contributing:
   to be merged;
4. Make sure pre-existing tests still pass;
5. Add high-coverage tests. No quality test, no merge;
6. All public methods must have informative docstrings;

### Style guide
...
@@ -55,10 +55,12 @@ Choose the right framework for every part of a model's lifetime
| [Online demo](#online-demo) | Experimenting with this repo’s text generation capabilities |
| [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 |
| [Quick tour: TF 2.0 and PyTorch ](#Quick-tour-TF-20-training-and-PyTorch-interoperability) | Train a TF 2.0 model in 10 lines of code, load it in PyTorch |
| [Quick tour: pipelines](#quick-tour-of-pipelines) | Using Pipelines: Wrappers around tokenizers and models to use fine-tuned models |
| [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
| [Quick tour: Share your models ](#Quick-tour-of-model-sharing) | Upload and share your fine-tuned models with the community |
| [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-transformers to transformers |
| [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
| [Documentation](https://huggingface.co/transformers/) [(v2.3.0)](https://huggingface.co/transformers/v2.3.0) [(v2.2.0/v2.2.1/v2.2.2)](https://huggingface.co/transformers/v2.2.0) [(v2.1.1)](https://huggingface.co/transformers/v2.1.1) [(v2.0.0)](https://huggingface.co/transformers/v2.0.0) [(v1.2.0)](https://huggingface.co/transformers/v1.2.0) [(v1.1.0)](https://huggingface.co/transformers/v1.1.0) [(v1.0.0)](https://huggingface.co/transformers/v1.0.0) [(master)](https://huggingface.co/transformers) | Full API documentation and more |

## Installation
@@ -86,21 +88,41 @@ When TensorFlow 2.0 and/or PyTorch has been installed, you can install from source:
pip install [--editable] .
```
### Run the examples
Examples are included in the repository but are not shipped with the library.
Therefore, in order to run the latest versions of the examples you also need to install from source. To do so, create a new virtual environment and follow these steps:
```bash
git clone https://github.com/huggingface/transformers
cd transformers
pip install [--editable] .
```
### Tests

A series of tests are included for the library and the example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).

These tests can be run using `unittest` or `pytest` (install pytest if needed with `pip install pytest`).

Depending on which framework is installed (TensorFlow 2.0 and/or PyTorch), the irrelevant tests will be skipped. Ensure that both frameworks are installed if you want to execute all tests.

You can run the tests from the root of the cloned repository with the commands:
```bash
python -m unittest discover -s transformers/tests -p "*test.py" -t .
python -m unittest discover -s examples -p "*test.py" -t examples
```
or
```bash
python -m pytest -sv ./transformers/tests/
python -m pytest -sv ./examples/
```
By default, slow tests are skipped. Set the `RUN_SLOW` environment variable to `yes` to run them.
### Do you want to run a Transformer model on a mobile device?

You should check out our [`swift-coreml-transformers`](https://github.com/huggingface/swift-coreml-transformers) repo.
@@ -120,9 +142,13 @@ At some point in the future, you'll be able to seamlessly move from pre-training
5. **[XLNet](https://github.com/zihangdai/xlnet/)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
6. **[XLM](https://github.com/facebookresearch/XLM/)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
8. **[DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
9. **[CTRL](https://github.com/salesforce/ctrl/)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
10. **[CamemBERT](https://camembert-model.fr)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
11. **[ALBERT](https://github.com/google-research/ALBERT)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
12. **[T5](https://github.com/google-research/text-to-text-transfer-transformer)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
13. **[XLM-RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/xlmr)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
14. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedback before starting your PR.
These implementations have been tested on several datasets (see the example scripts) and should match the performance of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Pearson R coefficient on STS-B for XLNet). You can find more details on performance in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
@@ -144,7 +170,7 @@ import torch
from transformers import *

# Transformers has a unified API
# for 10 transformer architectures and 30 pretrained weights.
# Model | Tokenizer | Pretrained weights shortcut
MODELS = [(BertModel, BertTokenizer, 'bert-base-uncased'),
          (OpenAIGPTModel, OpenAIGPTTokenizer, 'openai-gpt'),
@@ -154,7 +180,9 @@ MODELS = [(BertModel, BertTokenizer, 'bert-base-uncased'),
          (XLNetModel, XLNetTokenizer, 'xlnet-base-cased'),
          (XLMModel, XLMTokenizer, 'xlm-mlm-enfr-1024'),
          (DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased'),
          (RobertaModel, RobertaTokenizer, 'roberta-base'),
          (XLMRobertaModel, XLMRobertaTokenizer, 'xlm-roberta-base'),
         ]

# To use TensorFlow 2.0 versions of the models, simply prefix the class names with 'TF', e.g. `TFRobertaModel` is the TF 2.0 counterpart of the PyTorch model `RobertaModel`
@@ -252,6 +280,11 @@ print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sen

## Quick tour of the fine-tuning/usage scripts
**Important**
Before running the fine-tuning scripts, please read the
[instructions](#run-the-examples) on how to
set up your environment to run the examples.
The library comprises several example scripts with SOTA performances for NLU and NLG tasks:

- `run_glue.py`: an example fine-tuning Bert, XLNet and XLM on nine different GLUE tasks (*sequence-level classification*)
@@ -418,6 +451,76 @@ python ./examples/run_generation.py \
    --repetition_penalty=1.2 \
```
## Quick tour of model sharing
New in `v2.2.2`: you can now upload and share your fine-tuned models with the community, using the <abbr title="Command-line interface">CLI</abbr> that's built-in to the library.
**First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Then:
```shell
transformers-cli login
# log in using the same credentials as on huggingface.co
```
Upload your model:
```shell
transformers-cli upload ./path/to/pretrained_model/
# ^^ Upload folder containing weights/tokenizer/config
# saved via `.save_pretrained()`
transformers-cli upload ./config.json [--filename folder/foobar.json]
# ^^ Upload a single file
# (you can optionally override its filename, which can be nested inside a folder)
```
Your model will then be accessible through its identifier, a concatenation of your username and the folder name above:
```python
"username/model_name"
```
Anyone can load it from code:
```python
tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")
model = AutoModel.from_pretrained("username/pretrained_model")
```
Finally, list all your files on S3:
```shell
transformers-cli ls
# List all your S3 objects.
```
## Quick tour of pipelines
New in version `v2.3`: `Pipeline`s are high-level objects which automatically handle tokenization, running your data through a transformers model and outputting the result in a structured object.
You can create `Pipeline` objects for the following down-stream tasks:
- `feature-extraction`: Generates a tensor representation for the input sequence
- `ner`: Generates named entity mapping for each word in the input sequence.
- `sentiment-analysis`: Gives the polarity (positive / negative) of the whole input sequence.
- `question-answering`: Provided some context and a question referring to the context, it will extract the answer to the question in the context.
```python
from transformers import pipeline
# Allocate a pipeline for sentiment-analysis
nlp = pipeline('sentiment-analysis')
nlp('We are very happy to include pipeline into the transformers repository.')
>>> {'label': 'POSITIVE', 'score': 0.99893874}
# Allocate a pipeline for question-answering
nlp = pipeline('question-answering')
nlp({
'question': 'What is the name of the repository ?',
'context': 'Pipeline have been included in the huggingface/transformers repository'
})
>>> {'score': 0.28756016668193496, 'start': 35, 'end': 59, 'answer': 'huggingface/transformers'}
```
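The `ner` and `feature-extraction` tasks listed above follow the same pattern. The snippet below is a sketch rather than documented output: the sentences are invented, and the exact fields returned for `ner` (assumed here to be `word`, `entity` and `score`) may differ between versions.

```python
from transformers import pipeline

# Allocate a pipeline for named entity recognition
nlp = pipeline('ner')
nlp('Hugging Face is a French company based in New York.')
# e.g. a list of entities such as {'word': 'New', 'entity': 'I-LOC', 'score': ...}

# Allocate a pipeline for feature extraction: returns a tensor representation
# (nested list of floats) with one vector per input token
nlp = pipeline('feature-extraction')
features = nlp('We are very happy to include pipeline into the transformers repository.')
```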
## Migrating from pytorch-transformers to transformers

Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to `transformers`.

...
cd docs
function deploy_doc(){
echo "Creating doc at commit $1 and pushing to folder $2"
git checkout $1
if [ ! -z "$2" ]
then
echo "Pushing version" $2
make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html $doc:$dir/$2
else
echo "Pushing master"
make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir
fi
}
deploy_doc "master"
deploy_doc "b33a385" v1.0.0
deploy_doc "fe02e45" v1.1.0
deploy_doc "89fd345" v1.2.0
deploy_doc "fc9faa8" v2.0.0
deploy_doc "3ddce1d" v2.1.1
deploy_doc "f2f3294" v2.2.0
function addIcon() {
  const huggingFaceLogo = "https://huggingface.co/landing/assets/transformers-docs/huggingface_logo.svg";
  const image = document.createElement("img");
  image.setAttribute("src", huggingFaceLogo);
@@ -24,10 +24,10 @@ function addCustomFooter() {
  social.classList.add("footer__Social");

  const imageDetails = [
    { link: "https://huggingface.co", imageLink: "https://huggingface.co/landing/assets/transformers-docs/website.svg" },
    { link: "https://twitter.com/huggingface", imageLink: "https://huggingface.co/landing/assets/transformers-docs/twitter.svg" },
    { link: "https://github.com/huggingface", imageLink: "https://huggingface.co/landing/assets/transformers-docs/github.svg" },
    { link: "https://www.linkedin.com/company/huggingface/", imageLink: "https://huggingface.co/landing/assets/transformers-docs/linkedin.svg" }
  ];

  imageDetails.forEach(imageLinks => {
...
@@ -26,7 +26,7 @@ author = u'huggingface'
# The short X.Y version
version = u''
# The full version, including alpha/beta/rc tags
release = u'2.3.0'

# -- General configuration ---------------------------------------------------
...
@@ -47,6 +47,10 @@ The library currently contains PyTorch and Tensorflow implementations, pre-trained
6. `XLM <https://github.com/facebookresearch/XLM>`_ (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_ by Guillaume Lample and Alexis Conneau.
7. `RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_ (from Facebook), released together with the paper `RoBERTa: A Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
8. `DistilBERT <https://huggingface.co/transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together with the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`_ by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2 <https://github.com/huggingface/transformers/tree/master/examples/distillation>`_.
9. `CTRL <https://github.com/pytorch/fairseq/tree/master/examples/ctrl>`_ (from Salesforce), released together with the paper `CTRL: A Conditional Transformer Language Model for Controllable Generation <https://www.github.com/salesforce/ctrl>`_ by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
10. `CamemBERT <https://huggingface.co/transformers/model_doc/camembert.html>`_ (from FAIR, Inria, Sorbonne Université) released together with the paper `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`_ by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suarez, Yoann Dupont, Laurent Romary, Eric Villemonte de la Clergerie, Djame Seddah, and Benoît Sagot.
11. `ALBERT <https://github.com/google-research/ALBERT>`_ (from Google Research), released together with the paper `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_ by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
12. `XLM-RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/xlmr>`_ (from Facebook AI), released together with the paper `Unsupervised Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`_ by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
.. toctree::
    :maxdepth: 2
@@ -55,6 +59,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-trained
    installation
    quickstart
    pretrained_models
    model_sharing
    examples
    notebooks
    serialization
@@ -89,3 +94,5 @@ The library currently contains PyTorch and Tensorflow implementations, pre-trained
    model_doc/roberta
    model_doc/distilbert
    model_doc/ctrl
    model_doc/camembert
    model_doc/albert
@@ -24,15 +24,24 @@ pip install [--editable] .

An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).

Tests can be run using `unittest` or `pytest` (install pytest if needed with `pip install pytest`).

Run all the tests from the root of the cloned repository with the commands:
```bash
python -m unittest discover -s transformers/tests -p "*test.py" -t .
python -m unittest discover -s examples -p "*test.py" -t examples
```
or
``` bash
python -m pytest -sv ./transformers/tests/
python -m pytest -sv ./examples/
```
By default, slow tests are skipped. Set the `RUN_SLOW` environment variable to `yes` to run them.
## OpenAI GPT original tokenization workflow

If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` (use version 4.4.3 if you are using Python 2) and `SpaCy`:

...
@@ -5,6 +5,7 @@ The ``.optimization`` module provides:
- an optimizer with weight decay fixed that can be used to fine-tune models, and
- several schedules in the form of schedule objects that inherit from ``_LRSchedule``:
- a gradient accumulation class to accumulate the gradients of multiple batches
``AdamW``
~~~~~~~~~~~~~~~~
@@ -12,6 +13,15 @@ The ``.optimization`` module provides:
.. autoclass:: transformers.AdamW
    :members:
``AdamWeightDecay``
~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AdamWeightDecay
:members:
.. autofunction:: transformers.create_optimizer
Schedules
----------------------------------------------------
@@ -49,3 +59,17 @@ Learning Rate Schedules
.. image:: /imgs/warmup_linear_schedule.png
    :target: /imgs/warmup_linear_schedule.png
    :alt:
``Warmup``
~~~~~~~~~~~~~~~~
.. autoclass:: transformers.Warmup
:members:
Gradient Strategies
----------------------------------------------------
``GradientAccumulator``
~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.GradientAccumulator
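
Example (a minimal sketch, not taken from the library documentation: it assumes the accumulator
can be called with a list of gradient tensors and that it exposes a ``gradients`` property and a
``reset()`` method)::

    import tensorflow as tf
    from transformers import GradientAccumulator

    accumulator = GradientAccumulator()
    variable = tf.Variable([1.0, 2.0])

    # accumulate the gradients of two (dummy) batches
    accumulator([tf.constant([0.1, 0.2])])
    accumulator([tf.constant([0.3, 0.4])])

    # apply the accumulated gradients once, then reset the accumulator for the next cycle
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
    optimizer.apply_gradients(zip(accumulator.gradients, [variable]))
    accumulator.reset()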
@@ -54,5 +54,100 @@ Additionally, the following method can be used to load values from a data file

Example usage
^^^^^^^^^^^^^^^^^^^^^^^^^
An example using these processors is given in the `run_glue.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py>`__ script.
XNLI
~~~~~~~~~~~~~~~~~~~~~
`The Cross-Lingual NLI Corpus (XNLI) <https://www.nyu.edu/projects/bowman/xnli/>`__ is a benchmark that evaluates
the quality of cross-lingual text representations.
XNLI is a crowd-sourced dataset based on `MultiNLI <http://www.nyu.edu/projects/bowman/multinli/>`__: pairs of text are labeled with textual entailment
annotations for 15 different languages (including both high-resource languages such as English and low-resource languages such as Swahili).
It was released together with the paper
`XNLI: Evaluating Cross-lingual Sentence Representations <https://arxiv.org/abs/1809.05053>`__.
This library hosts the processor to load the XNLI data:
- :class:`~transformers.data.processors.utils.XnliProcessor`
Please note that since the gold labels are available on the test set, evaluation is performed on the test set.
An example using these processors is given in the
`run_xnli.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_xnli.py>`__ script.
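
In the same spirit as the GLUE example above, here is a minimal sketch of the XNLI processor.
It is not taken from the library documentation: it assumes the constructor takes the target
language and that the XNLI/MultiNLI data has already been downloaded to ``xnli_data_dir``
(a hypothetical path).

Example::

    # Hypothetical usage; see run_xnli.py for a complete, runnable example
    processor = XnliProcessor(language="de")

    label_list = processor.get_labels()
    train_examples = processor.get_train_examples(xnli_data_dir)
    # gold labels are available for the test set, so evaluation is run on it
    test_examples = processor.get_test_examples(xnli_data_dir)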
SQuAD
~~~~~~~~~~~~~~~~~~~~~
`The Stanford Question Answering Dataset (SQuAD) <https://rajpurkar.github.io/SQuAD-explorer/>`__ is a benchmark that evaluates
the performance of models on question answering. Two versions are available, v1.1 and v2.0. The first version (v1.1) was released together with the paper
`SQuAD: 100,000+ Questions for Machine Comprehension of Text <https://arxiv.org/abs/1606.05250>`__. The second version (v2.0) was released alongside
the paper `Know What You Don't Know: Unanswerable Questions for SQuAD <https://arxiv.org/abs/1806.03822>`__.
This library hosts a processor for each of the two versions:
Processors
^^^^^^^^^^^^^^^^^^^^^^^^^
Those processors are:
- :class:`~transformers.data.processors.utils.SquadV1Processor`
- :class:`~transformers.data.processors.utils.SquadV2Processor`
They both inherit from the abstract class :class:`~transformers.data.processors.utils.SquadProcessor`
.. autoclass:: transformers.data.processors.squad.SquadProcessor
:members:
Additionally, the following method can be used to convert SQuAD examples into :class:`~transformers.data.processors.utils.SquadFeatures`
that can be used as model inputs.
.. automethod:: transformers.data.processors.squad.squad_convert_examples_to_features
These processors as well as the aforementioned method can be used with files containing the data as well as with the `tensorflow_datasets` package.
Examples are given below.
Example usage
^^^^^^^^^^^^^^^^^^^^^^^^^
Here is an example using the processors as well as the conversion method using data files:
Example::
# Loading a V2 processor
processor = SquadV2Processor()
examples = processor.get_dev_examples(squad_v2_data_dir)
# Loading a V1 processor
processor = SquadV1Processor()
examples = processor.get_dev_examples(squad_v1_data_dir)
features = squad_convert_examples_to_features(
examples=examples,
tokenizer=tokenizer,
max_seq_length=max_seq_length,
doc_stride=args.doc_stride,
max_query_length=max_query_length,
is_training=not evaluate,
)
Using `tensorflow_datasets` is as easy as using a data file:
Example::
# tensorflow_datasets only handles SQuAD V1.
tfds_examples = tfds.load("squad")
examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
features = squad_convert_examples_to_features(
examples=examples,
tokenizer=tokenizer,
max_seq_length=max_seq_length,
doc_stride=args.doc_stride,
max_query_length=max_query_length,
is_training=not evaluate,
)
Another example using these processors is given in the
`run_squad.py <https://github.com/huggingface/transformers/blob/master/examples/run_squad.py>`__ script.
@@ -104,6 +104,6 @@ for batch in train_data:
    loss = model(batch)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
    optimizer.step()
    scheduler.step()
```
ALBERT
----------------------------------------------------
``AlbertConfig``
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertConfig
:members:
``AlbertTokenizer``
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertTokenizer
:members:
``AlbertModel``
~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertModel
:members:
``AlbertForMaskedLM``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertForMaskedLM
:members:
``AlbertForSequenceClassification``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertForSequenceClassification
:members:
``AlbertForQuestionAnswering``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertForQuestionAnswering
:members:
``TFAlbertModel``
~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAlbertModel
:members:
``TFAlbertForMaskedLM``
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAlbertForMaskedLM
:members:
``TFAlbertForSequenceClassification``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAlbertForSequenceClassification
:members:
CamemBERT
----------------------------------------------------
``CamembertConfig``
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertConfig
:members:
``CamembertTokenizer``
~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertTokenizer
:members:
``CamembertModel``
~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertModel
:members:
``CamembertForMaskedLM``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertForMaskedLM
:members:
``CamembertForSequenceClassification``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertForSequenceClassification
:members:
``CamembertForMultipleChoice``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertForMultipleChoice
:members:
``CamembertForTokenClassification``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertForTokenClassification
:members:
# Model upload and sharing
Starting with `v2.2.2`, you can now upload and share your fine-tuned models with the community, using the <abbr title="Command-line interface">CLI</abbr> that's built-in to the library.
**First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Then:
```shell
transformers-cli login
# log in using the same credentials as on huggingface.co
```
Upload your model:
```shell
transformers-cli upload ./path/to/pretrained_model/
# ^^ Upload folder containing weights/tokenizer/config
# saved via `.save_pretrained()`
transformers-cli upload ./config.json [--filename folder/foobar.json]
# ^^ Upload a single file
# (you can optionally override its filename, which can be nested inside a folder)
```
Your model will then be accessible through its identifier, a concatenation of your username and the folder name above:
```python
"username/pretrained_model"
```
Anyone can load it from code:
```python
tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")
model = AutoModel.from_pretrained("username/pretrained_model")
```
Finally, list all your files on S3:
```shell
transformers-cli ls
# List all your S3 objects.
```
@@ -3,6 +3,7 @@ Pretrained models
Here is the full list of the currently provided pretrained models together with a short presentation of each model.
For a list that includes community-uploaded models, refer to `https://huggingface.co/models <https://huggingface.co/models>`__.
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| Architecture | Shortcut name | Details of the model |
@@ -61,6 +62,32 @@ Here is the full list of the currently provided pretrained models together with
| | ``bert-base-german-dbmdz-uncased`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on uncased German text by DBMDZ |
| | | (see `details on dbmdz repository <https://github.com/dbmdz/german-bert>`__). |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-base-japanese`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece. |
| | | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization. |
| | | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__). |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-base-japanese-whole-word-masking`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized with MeCab and WordPiece. |
| | | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization. |
| | | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__). |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-base-japanese-char`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on Japanese text. Text is tokenized into characters. |
| | | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__). |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-base-japanese-char-whole-word-masking`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized into characters. |
| | | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__). |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-base-finnish-cased-v1`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on cased Finnish text. |
| | | (see `details on turkunlp.org <http://turkunlp.org/FinBERT/>`__). |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-base-finnish-uncased-v1`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on uncased Finnish text. |
| | | (see `details on turkunlp.org <http://turkunlp.org/FinBERT/>`__). |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| GPT | ``openai-gpt`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | OpenAI GPT English model |
@@ -128,6 +155,10 @@ Here is the full list of the currently provided pretrained models together with
| | | | ``roberta-large`` fine-tuned on `MNLI <http://www.nyu.edu/projects/bowman/multinli/>`__. |
| | | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``distilroberta-base`` | | 6-layer, 768-hidden, 12-heads, 82M parameters |
| | | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint. |
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``roberta-base-openai-detector`` | | 12-layer, 768-hidden, 12-heads, 125M parameters |
| | | | ``roberta-base`` fine-tuned by OpenAI on the outputs of the 1.5B-parameter GPT-2 model. |
| | | (see `details <https://github.com/openai/gpt-2-output-dataset/tree/master/detector>`__) |
@@ -148,12 +179,74 @@ Here is the full list of the currently provided pretrained models together with
| | | | The DistilGPT2 model distilled from the GPT2 model `gpt2` checkpoint. |
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``distilbert-base-german-cased`` | | 6-layer, 768-hidden, 12-heads, 66M parameters |
| | | | The German DistilBERT model distilled from the German DBMDZ BERT model `bert-base-german-dbmdz-cased` checkpoint. |
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``distilbert-base-multilingual-cased`` | | 6-layer, 768-hidden, 12-heads, 134M parameters |
| | | | The multilingual DistilBERT model distilled from the Multilingual BERT model `bert-base-multilingual-cased` checkpoint. |
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| CTRL | ``ctrl`` | | 48-layer, 1280-hidden, 16-heads, 1.6B parameters |
| | | | Salesforce's Large-sized CTRL English model |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| CamemBERT | ``camembert-base`` | | 12-layer, 768-hidden, 12-heads, 110M parameters |
| | | | CamemBERT using the BERT-base architecture |
| | | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/camembert>`__) |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| ALBERT | ``albert-base-v1`` | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters |
| | | | ALBERT base model |
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``albert-large-v1`` | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters |
| | | | ALBERT large model |
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``albert-xlarge-v1`` | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters |
| | | | ALBERT xlarge model |
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``albert-xxlarge-v1`` | | 12 repeating layers, 128 embedding, 4096-hidden, 64-heads, 223M parameters |
| | | | ALBERT xxlarge model |
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``albert-base-v2`` | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters |
| | | | ALBERT base model with no dropout, additional training data and longer training |
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``albert-large-v2`` | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters |
| | | | ALBERT large model with no dropout, additional training data and longer training |
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``albert-xlarge-v2`` | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters |
| | | | ALBERT xlarge model with no dropout, additional training data and longer training |
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``albert-xxlarge-v2`` | | 12 repeating layers, 128 embedding, 4096-hidden, 64-heads, 223M parameters |
| | | | ALBERT xxlarge model with no dropout, additional training data and longer training |
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| T5 | ``t5-small`` | | ~60M parameters with 6-layers, 512-hidden-state, 2048 feed-forward hidden-state, 8-heads, |
| | | | Trained on English text: the Colossal Clean Crawled Corpus (C4) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``t5-base`` | | ~220M parameters with 12-layers, 768-hidden-state, 3072 feed-forward hidden-state, 12-heads, |
| | | | Trained on English text: the Colossal Clean Crawled Corpus (C4) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``t5-large`` | | ~770M parameters with 24-layers, 1024-hidden-state, 4096 feed-forward hidden-state, 16-heads, |
| | | | Trained on English text: the Colossal Clean Crawled Corpus (C4) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``t5-3B`` | | ~2.8B parameters with 24-layers, 1024-hidden-state, 16384 feed-forward hidden-state, 32-heads, |
| | | | Trained on English text: the Colossal Clean Crawled Corpus (C4) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``t5-11B`` | | ~11B parameters with 24-layers, 1024-hidden-state, 65536 feed-forward hidden-state, 128-heads, |
| | | | Trained on English text: the Colossal Clean Crawled Corpus (C4) |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| XLM-RoBERTa | ``xlm-roberta-base`` | | ~125M parameters with 12-layers, 768-hidden-state, 3072 feed-forward hidden-state, 12-heads, |
| | | | Trained on 2.5 TB of newly created clean CommonCrawl data in 100 languages |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``xlm-roberta-large`` | | ~355M parameters with 24-layers, 1024-hidden-state, 4096 feed-forward hidden-state, 16-heads, |
| | | | Trained on 2.5 TB of newly created clean CommonCrawl data in 100 languages |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
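Any shortcut name in the table above can be passed directly to ``from_pretrained``. A minimal sketch, assuming the ``AutoTokenizer``/``AutoModel`` classes cover the checkpoint you pick:

.. code-block:: python

    from transformers import AutoModel, AutoTokenizer

    # e.g. the ALBERT base v2 checkpoint listed above
    tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
    model = AutoModel.from_pretrained("albert-base-v2")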
.. <https://huggingface.co/transformers/examples.html>`__ .. <https://huggingface.co/transformers/examples.html>`__
\ No newline at end of file
...@@ -219,4 +219,97 @@ sequence = tokenizer.decode(generated) ...@@ -219,4 +219,97 @@ sequence = tokenizer.decode(generated)
print(sequence) print(sequence)
``` ```
The model only requires a single token as input as all the previous tokens' key/value pairs are contained in the `past`. The model only requires a single token as input as all the previous tokens' key/value pairs are contained in the `past`.
\ No newline at end of file
### Model2Model example
Encoder-decoder architectures require two tokenized inputs: one for the encoder and the other one for the decoder. Let's assume that we want to use `Model2Model` for generative question answering, and start by tokenizing the question and answer that will be fed to the model.
```python
import torch
from transformers import BertTokenizer, Model2Model
# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Encode the input to the encoder (the question)
question = "Who was Jim Henson?"
encoded_question = tokenizer.encode(question)
# Encode the input to the decoder (the answer)
answer = "Jim Henson was a puppeteer"
encoded_answer = tokenizer.encode(answer)
# Convert inputs to PyTorch tensors
question_tensor = torch.tensor([encoded_question])
answer_tensor = torch.tensor([encoded_answer])
```
Let's see how we can use `Model2Model` to get the value of the loss associated with this (question, answer) pair:
```python
# In order to compute the loss we need to provide language model
# labels (the token ids that the model should have produced) to
# the decoder.
lm_labels = encoded_answer
labels_tensor = torch.tensor([lm_labels])
# Load pre-trained model (weights)
model = Model2Model.from_pretrained('bert-base-uncased')
# Set the model in evaluation mode to deactivate the DropOut modules
# This is IMPORTANT to have reproducible results during evaluation!
model.eval()
# If you have a GPU, put everything on cuda
question_tensor = question_tensor.to('cuda')
answer_tensor = answer_tensor.to('cuda')
labels_tensor = labels_tensor.to('cuda')
model.to('cuda')
# Predict hidden states features for each layer
with torch.no_grad():
    # See the models docstrings for the detail of the inputs
    outputs = model(question_tensor, answer_tensor, decoder_lm_labels=labels_tensor)
# Transformers models always output tuples.
# See the models docstrings for the detail of all the outputs
# In our case, the first element is the value of the LM loss
lm_loss = outputs[0]
```
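In practice this loss would be backpropagated inside a training loop. Here is a minimal sketch of a single fine-tuning step; the optimizer choice and learning rate are illustrative assumptions, not part of the original example:

```python
from transformers import AdamW  # optimizer shipped with the library

optimizer = AdamW(model.parameters(), lr=5e-5)  # illustrative hyper-parameters

model.train()
outputs = model(question_tensor, answer_tensor, decoder_lm_labels=labels_tensor)
loss = outputs[0]
loss.backward()        # backpropagate through encoder and decoder
optimizer.step()
optimizer.zero_grad()
```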
This loss can be used to fine-tune `Model2Model` on the question answering task. Assuming that we fine-tuned the model, let us now see how to generate an answer:
```python
# Let's re-use the previous question
question = "Who was Jim Henson?"
encoded_question = tokenizer.encode(question)
question_tensor = torch.tensor([encoded_question])
# This time we try to generate the answer, so we start with an empty sequence
answer = "[CLS]"
encoded_answer = tokenizer.encode(answer, add_special_tokens=False)
answer_tensor = torch.tensor([encoded_answer])
# Load pre-trained model (weights)
model = Model2Model.from_pretrained('fine-tuned-weights')
model.eval()
# If you have a GPU, put everything on cuda
question_tensor = question_tensor.to('cuda')
answer_tensor = answer_tensor.to('cuda')
model.to('cuda')
# Predict all tokens
with torch.no_grad():
    outputs = model(question_tensor, answer_tensor)
    predictions = outputs[0]
# confirm we were able to predict 'jim'
predicted_index = torch.argmax(predictions[0, -1]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
assert predicted_token == 'jim'
```
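The snippet above only checks the first generated token. To produce a full answer one would typically loop, feeding each predicted token back into the decoder. A minimal greedy-decoding sketch under that assumption (the length limit and the `[SEP]` stopping criterion are illustrative):

```python
# Hypothetical greedy loop: grow the answer one token at a time
generated = encoded_answer[:]  # starts from the tokenized "[CLS]"
for _ in range(20):  # illustrative maximum answer length
    answer_tensor = torch.tensor([generated]).to('cuda')
    with torch.no_grad():
        predictions = model(question_tensor, answer_tensor)[0]
    next_token = torch.argmax(predictions[0, -1]).item()
    if next_token == tokenizer.sep_token_id:  # stop once the model emits [SEP]
        break
    generated.append(next_token)

print(tokenizer.decode(generated))
```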
...@@ -3,6 +3,17 @@ ...@@ -3,6 +3,17 @@
In this section a few examples are put together. All of these examples work for several models, making use of the very In this section a few examples are put together. All of these examples work for several models, making use of the very
similar API between the different models. similar API between the different models.
**Important**
To run the latest versions of the examples, you have to install the library from source and install some example-specific requirements.
Execute the following steps in a new virtual environment:
```bash
git clone https://github.com/huggingface/transformers
cd transformers
pip install [--editable] .
pip install -r ./examples/requirements.txt
```
| Section | Description | | Section | Description |
|----------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------| |----------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------|
| [TensorFlow 2.0 models on GLUE](#TensorFlow-2.0-Bert-models-on-GLUE) | Examples running BERT TensorFlow 2.0 model on the GLUE tasks. | [TensorFlow 2.0 models on GLUE](#TensorFlow-2.0-Bert-models-on-GLUE) | Examples running BERT TensorFlow 2.0 model on the GLUE tasks.
...@@ -12,7 +23,7 @@ similar API between the different models. ...@@ -12,7 +23,7 @@ similar API between the different models.
| [SQuAD](#squad) | Using BERT/RoBERTa/XLNet/XLM for question answering, examples with distributed training. | | [SQuAD](#squad) | Using BERT/RoBERTa/XLNet/XLM for question answering, examples with distributed training. |
| [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks. | [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks.
| [Named Entity Recognition](#named-entity-recognition) | Using BERT for Named Entity Recognition (NER) on the CoNLL 2003 dataset, examples with distributed training. | | [Named Entity Recognition](#named-entity-recognition) | Using BERT for Named Entity Recognition (NER) on the CoNLL 2003 dataset, examples with distributed training. |
| [Abstractive summarization](#abstractive-summarization) | Fine-tuning the library models for abstractive summarization tasks on the CNN/Daily Mail dataset. | | [XNLI](#xnli) | Examples running BERT/XLM on the XNLI benchmark. |
## TensorFlow 2.0 Bert models on GLUE ## TensorFlow 2.0 Bert models on GLUE
...@@ -506,7 +517,8 @@ Larger batch size may improve the performance while costing more memory. ...@@ -506,7 +517,8 @@ Larger batch size may improve the performance while costing more memory.
## Named Entity Recognition ## Named Entity Recognition
Based on the script [`run_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py). Based on the scripts [`run_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py) for PyTorch and
[`run_tf_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_tf_ner.py) for TensorFlow 2.
This example fine-tunes Bert Multilingual on GermEval 2014 (German NER). This example fine-tunes Bert Multilingual on GermEval 2014 (German NER).
Details and results for the fine-tuning provided by @stefan-it. Details and results for the fine-tuning provided by @stefan-it.
...@@ -551,7 +563,7 @@ The GermEval 2014 dataset has much more labels than CoNLL-2002/2003 datasets, so ...@@ -551,7 +563,7 @@ The GermEval 2014 dataset has much more labels than CoNLL-2002/2003 datasets, so
cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt
``` ```
### Training ### Prepare the run
Additional environment variables must be set: Additional environment variables must be set:
...@@ -563,6 +575,8 @@ export SAVE_STEPS=750 ...@@ -563,6 +575,8 @@ export SAVE_STEPS=750
export SEED=1 export SEED=1
``` ```
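For reference, a plausible full set of variables for this GermEval example (the values are illustrative assumptions):

```bash
export MAX_LENGTH=128
export BERT_MODEL=bert-base-multilingual-cased
export OUTPUT_DIR=germeval-model
export BATCH_SIZE=32
export NUM_EPOCHS=3
export SAVE_STEPS=750
export SEED=1
```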
### Run the PyTorch version
To start training, just run: To start training, just run:
```bash ```bash
...@@ -583,7 +597,7 @@ python3 run_ner.py --data_dir ./ \ ...@@ -583,7 +597,7 @@ python3 run_ner.py --data_dir ./ \
If your GPU supports half-precision training, just add the `--fp16` flag. After training, the model will be evaluated on both the development and test datasets. If your GPU supports half-precision training, just add the `--fp16` flag. After training, the model will be evaluated on both the development and test datasets.
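For completeness, the full PyTorch command presumably mirrors the TensorFlow 2 one shown further below; the flag names here are assumptions based on `run_ner.py`'s usual arguments:

```bash
python3 run_ner.py --data_dir ./ \
--model_type bert \
--labels ./labels.txt \
--model_name_or_path $BERT_MODEL \
--output_dir $OUTPUT_DIR \
--max_seq_length $MAX_LENGTH \
--num_train_epochs $NUM_EPOCHS \
--per_gpu_train_batch_size $BATCH_SIZE \
--save_steps $SAVE_STEPS \
--seed $SEED \
--do_train \
--do_eval \
--do_predict
```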
### Evaluation #### Evaluation
Evaluation on the development dataset outputs the following for our example: Evaluation on the development dataset outputs the following for our example:
...@@ -605,7 +619,7 @@ On the test dataset the following results could be achieved: ...@@ -605,7 +619,7 @@ On the test dataset the following results could be achieved:
10/04/2019 00:42:42 - INFO - __main__ - recall = 0.8624150210424085 10/04/2019 00:42:42 - INFO - __main__ - recall = 0.8624150210424085
``` ```
### Comparing BERT (large, cased), RoBERTa (large, cased) and DistilBERT (base, uncased) #### Comparing BERT (large, cased), RoBERTa (large, cased) and DistilBERT (base, uncased)
Here is a small comparison between BERT (large, cased), RoBERTa (large, cased) and DistilBERT (base, uncased) with the same hyperparameters as specified in the [example documentation](https://huggingface.co/transformers/examples.html#named-entity-recognition) (one run): Here is a small comparison between BERT (large, cased), RoBERTa (large, cased) and DistilBERT (base, uncased) with the same hyperparameters as specified in the [example documentation](https://huggingface.co/transformers/examples.html#named-entity-recognition) (one run):
...@@ -615,30 +629,108 @@ Here is a small comparison between BERT (large, cased), RoBERTa (large, cased) a ...@@ -615,30 +629,108 @@ Here is a small comparison between BERT (large, cased), RoBERTa (large, cased) a
| `roberta-large` | 95.96 | 91.87 | `roberta-large` | 95.96 | 91.87
| `distilbert-base-uncased` | 94.34 | 90.32 | `distilbert-base-uncased` | 94.34 | 90.32
## Abstractive summarization ### Run the TensorFlow 2 version
To start training, just run:
```bash
python3 run_tf_ner.py --data_dir ./ \
--model_type bert \
--labels ./labels.txt \
--model_name_or_path $BERT_MODEL \
--output_dir $OUTPUT_DIR \
--max_seq_length $MAX_LENGTH \
--num_train_epochs $NUM_EPOCHS \
--per_device_train_batch_size $BATCH_SIZE \
--save_steps $SAVE_STEPS \
--seed $SEED \
--do_train \
--do_eval \
--do_predict
```
Based on the script As with the PyTorch version, if your GPU supports half-precision training, just add the `--fp16` flag. After training, the model will be evaluated on both the development and test datasets.
[`run_summarization_finetuning.py`](https://github.com/huggingface/transformers/blob/master/examples/run_summarization_finetuning.py).
Before running this script you should download **both** CNN and Daily Mail #### Evaluation
datasets from [Kyunghyun Cho's website](https://cs.nyu.edu/~kcho/DMQA/) (the
links next to "Stories") in the same folder. Then uncompress the archives by running:
Evaluation on the development dataset outputs the following for our example:
```bash ```bash
tar -xvf cnn_stories.tgz && tar -xvf dailymail_stories.tgz precision recall f1-score support
LOCderiv 0.7619 0.6154 0.6809 52
PERpart 0.8724 0.8997 0.8858 4057
OTHpart 0.9360 0.9466 0.9413 711
ORGpart 0.7015 0.6989 0.7002 269
LOCpart 0.7668 0.8488 0.8057 496
LOC 0.8745 0.9191 0.8963 235
ORGderiv 0.7723 0.8571 0.8125 91
OTHderiv 0.4800 0.6667 0.5581 18
OTH 0.5789 0.6875 0.6286 16
PERderiv 0.5385 0.3889 0.4516 18
PER 0.5000 0.5000 0.5000 2
ORG 0.0000 0.0000 0.0000 3
micro avg 0.8574 0.8862 0.8715 5968
macro avg 0.8575 0.8862 0.8713 5968
``` ```
note that the finetuning script **will not work** if you do not download both On the test dataset the following results could be achieved:
datasets. We will refer as `$DATA_PATH` the path to where you uncompressed both ```bash
archive. precision recall f1-score support
PERpart 0.8847 0.8944 0.8896 9397
OTHpart 0.9376 0.9353 0.9365 1639
ORGpart 0.7307 0.7044 0.7173 697
LOC 0.9133 0.9394 0.9262 561
LOCpart 0.8058 0.8157 0.8107 1150
ORG 0.0000 0.0000 0.0000 8
OTHderiv 0.5882 0.4762 0.5263 42
PERderiv 0.6571 0.5227 0.5823 44
OTH 0.4906 0.6667 0.5652 39
ORGderiv 0.7016 0.7791 0.7383 172
LOCderiv 0.8256 0.6514 0.7282 109
PER 0.0000 0.0000 0.0000 11
micro avg 0.8722 0.8774 0.8748 13869
macro avg 0.8712 0.8774 0.8740 13869
```
## XNLI
Based on the script [`run_xnli.py`](https://github.com/huggingface/transformers/blob/master/examples/run_xnli.py).
[XNLI](https://www.nyu.edu/projects/bowman/xnli/) is a crowd-sourced dataset based on [MultiNLI](http://www.nyu.edu/projects/bowman/multinli/). It is an evaluation benchmark for cross-lingual text representations. Pairs of text are labeled with textual entailment annotations for 15 different languages (including both high-resource languages such as English and low-resource languages such as Swahili).
#### Fine-tuning on XNLI
This example code fine-tunes mBERT (multi-lingual BERT) on the XNLI dataset. It runs in 106 mins
on a single Tesla V100 16GB. The data for XNLI can be downloaded with the following links and should both be saved (and unzipped) in a
`$XNLI_DIR` directory.
* [XNLI 1.0](https://www.nyu.edu/projects/bowman/xnli/XNLI-1.0.zip)
* [XNLI-MT 1.0](https://www.nyu.edu/projects/bowman/xnli/XNLI-MT-1.0.zip)
```bash ```bash
export DATA_PATH=/path/to/dataset/ export XNLI_DIR=/path/to/XNLI
python run_summarization_finetuning.py \ python run_xnli.py \
--output_dir=output \ --model_type bert \
--model_type=bert2bert \ --model_name_or_path bert-base-multilingual-cased \
--model_name_or_path=bert2bert \ --language de \
--do_train \ --train_language en \
--data_path=$DATA_PATH \ --do_train \
--do_eval \
--data_dir $XNLI_DIR \
--per_gpu_train_batch_size 32 \
--learning_rate 5e-5 \
--num_train_epochs 2.0 \
--max_seq_length 128 \
--output_dir /tmp/debug_xnli/ \
--save_steps -1
```
Training with the previously defined hyper-parameters yields the following results on the **test** set:
```bash
acc = 0.7093812375249501
``` ```
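Since the run above trains on English and evaluates on German, evaluating the same checkpoint on another XNLI language only requires changing `--language`. A hedged sketch, assuming the script supports an evaluation-only run on a saved checkpoint:

```bash
export XNLI_DIR=/path/to/XNLI

python run_xnli.py \
--model_type bert \
--model_name_or_path /tmp/debug_xnli/ \
--language fr \
--do_eval \
--data_dir $XNLI_DIR \
--max_seq_length 128 \
--output_dir /tmp/debug_xnli/
```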
from pathlib import Path
import tarfile
import urllib.request
import torch
from transformers.tokenization_camembert import CamembertTokenizer
from transformers.modeling_camembert import CamembertForMaskedLM
def fill_mask(masked_input, model, tokenizer, topk=5):
    # Adapted from https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py
    assert masked_input.count('<mask>') == 1
    input_ids = torch.tensor(tokenizer.encode(masked_input, add_special_tokens=True)).unsqueeze(0)  # Batch size 1
    logits = model(input_ids)[0]  # The prediction scores are the first element of the output tuple
    masked_index = (input_ids.squeeze() == tokenizer.mask_token_id).nonzero().item()
    logits = logits[0, masked_index, :]
    prob = logits.softmax(dim=0)
    values, indices = prob.topk(k=topk, dim=0)
    topk_predicted_token_bpe = ' '.join([tokenizer.convert_ids_to_tokens(indices[i].item())
                                         for i in range(len(indices))])
    masked_token = tokenizer.mask_token
    topk_filled_outputs = []
    for index, predicted_token_bpe in enumerate(topk_predicted_token_bpe.split(' ')):
        predicted_token = predicted_token_bpe.replace('\u2581', ' ')
        if " {0}".format(masked_token) in masked_input:
            topk_filled_outputs.append((
                masked_input.replace(' {0}'.format(masked_token), predicted_token),
                values[index].item(),
                predicted_token,
            ))
        else:
            topk_filled_outputs.append((
                masked_input.replace(masked_token, predicted_token),
                values[index].item(),
                predicted_token,
            ))
    return topk_filled_outputs
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
model = CamembertForMaskedLM.from_pretrained('camembert-base')
model.eval()
masked_input = "Le camembert est <mask> :)"
print(fill_mask(masked_input, model, tokenizer, topk=3))
...@@ -2,6 +2,10 @@ ...@@ -2,6 +2,10 @@
This folder contains the original code used to train Distil* as well as examples showcasing how to use DistilBERT, DistilRoBERTa and DistilGPT2. This folder contains the original code used to train Distil* as well as examples showcasing how to use DistilBERT, DistilRoBERTa and DistilGPT2.
**December 6th, 2019 - Update** We release **DistilmBERT**: 92% of `bert-base-multilingual-cased`'s performance on XNLI. The model supports 104 different languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages).
**November 19th, 2019 - Update** We release German **DistilBERT**: 98.8% of `bert-base-german-dbmdz-cased`'s performance on NER tasks.
**October 23rd, 2019 - Update** We release **DistilRoBERTa**: 95% of `RoBERTa-base`'s performance on GLUE, twice as fast as RoBERTa while being 35% smaller. **October 23rd, 2019 - Update** We release **DistilRoBERTa**: 95% of `RoBERTa-base`'s performance on GLUE, twice as fast as RoBERTa while being 35% smaller.
**October 3rd, 2019 - Update** We release our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108) explaining our approach on **DistilBERT**. It includes updated results and further experiments. We applied the same method to GPT2 and release the weights of **DistilGPT2**. DistilGPT2 is two times faster and 33% smaller than GPT2. **The paper supersedes our [previous blogpost](https://medium.com/huggingface/distilbert-8cf3380435b5) with a different distillation loss and better performance. Please use the paper as a reference when comparing/reporting results on DistilBERT.** **October 3rd, 2019 - Update** We release our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108) explaining our approach on **DistilBERT**. It includes updated results and further experiments. We applied the same method to GPT2 and release the weights of **DistilGPT2**. DistilGPT2 is two times faster and 33% smaller than GPT2. **The paper supersedes our [previous blogpost](https://medium.com/huggingface/distilbert-8cf3380435b5) with a different distillation loss and better performance. Please use the paper as a reference when comparing/reporting results on DistilBERT.**
...@@ -15,8 +19,9 @@ Distil* is a class of compressed models that started with DistilBERT. DistilBERT ...@@ -15,8 +19,9 @@ Distil* is a class of compressed models that started with DistilBERT. DistilBERT
We have applied the same method to other Transformer architectures and released the weights: We have applied the same method to other Transformer architectures and released the weights:
- GPT2: on the [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) benchmark, GPT2 reaches a perplexity on the test set of 15.0 compared to 18.5 for **DistilGPT2** (after fine-tuning on the train set). - GPT2: on the [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) benchmark, GPT2 reaches a perplexity on the test set of 15.0 compared to 18.5 for **DistilGPT2** (after fine-tuning on the train set).
- RoBERTa: **DistilRoBERTa** reaches 95% of `RoBERTa-base` performance on GLUE while being twice faster and 35% smaller. - RoBERTa: **DistilRoBERTa** reaches 95% of `RoBERTa-base`'s performance on GLUE while being twice faster and 35% smaller.
- and more to come! 🤗🤗🤗 - German BERT: **German DistilBERT** reaches 99% of `bert-base-german-dbmdz-cased`'s performance on German NER (CoNLL-2003).
- Multilingual BERT: **DistilmBERT** reaches 92% of Multilingual BERT's performance on XNLI while being twice faster and 25% smaller. The model supports 104 languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages).
For more information on DistilBERT, please refer to our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108). For more information on DistilBERT, please refer to our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108).
...@@ -27,7 +32,7 @@ Here are the results on the dev sets of GLUE: ...@@ -27,7 +32,7 @@ Here are the results on the dev sets of GLUE:
| BERT-base | **77.6** | 48.9 | 84.3 | 88.6 | 89.3 | 89.5 | 71.3 | 91.7 | 91.2 | 43.7 | | BERT-base | **77.6** | 48.9 | 84.3 | 88.6 | 89.3 | 89.5 | 71.3 | 91.7 | 91.2 | 43.7 |
| DistilBERT | **76.8** | 49.1 | 81.8 | 90.2 | 90.2 | 89.2 | 62.9 | 92.7 | 90.7 | 44.4 | | DistilBERT | **76.8** | 49.1 | 81.8 | 90.2 | 90.2 | 89.2 | 62.9 | 92.7 | 90.7 | 44.4 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| RoBERTa-base (reported) | **83.2**/**86.4**<sup>2</sup> | 63.6 | 87.6 | 90.2 | 92.8 | 91.9 | 78.7 | 94.8 | 91.2 | 57.7<sup>3</sup> | | RoBERTa-base (reported) | **83.2**/**86.4**<sup>2</sup> | 63.6 | 87.6 | 90.2 | 92.8 | 91.9 | 78.7 | 94.8 | 91.2 | 57.7<sup>3</sup> |
| DistilRoBERTa<sup>1</sup> | **79.0**/**82.3**<sup>2</sup> | 59.4 | 83.9 | 86.6 | 90.8 | 89.4 | 67.9 | 92.5 | 88.3 | 52.1 | | DistilRoBERTa<sup>1</sup> | **79.0**/**82.3**<sup>2</sup> | 59.4 | 83.9 | 86.6 | 90.8 | 89.4 | 67.9 | 92.5 | 88.3 | 52.1 |
<sup>1</sup> We did not use the MNLI checkpoint for fine-tuning but directly perform transfer learning on the pre-trained DistilRoBERTa. <sup>1</sup> We did not use the MNLI checkpoint for fine-tuning but directly perform transfer learning on the pre-trained DistilRoBERTa.
...@@ -36,6 +41,14 @@ Here are the results on the dev sets of GLUE: ...@@ -36,6 +41,14 @@ Here are the results on the dev sets of GLUE:
<sup>3</sup> We compute this score ourselves for completeness. <sup>3</sup> We compute this score ourselves for completeness.
Here are the results on the *test* sets for 6 of the languages available in XNLI. The results are computed in the zero-shot setting (trained on the English portion and evaluated on the target language portion):
| Model | English | Spanish | Chinese | German | Arabic | Urdu |
| :---: | :---: | :---: | :---: | :---: | :---: | :---:|
| mBERT base cased (computed) | 82.1 | 74.6 | 69.1 | 72.3 | 66.4 | 58.5 |
| mBERT base uncased (reported)| 81.4 | 74.3 | 63.8 | 70.5 | 62.1 | 58.3 |
| DistilmBERT | 78.2 | 69.1 | 64.0 | 66.3 | 59.1 | 54.7 |
## Setup ## Setup
This part of the library has only been tested with Python 3.6+. There are a few specific dependencies to install before launching a distillation; you can install them with the command `pip install -r requirements.txt`. This part of the library has only been tested with Python 3.6+. There are a few specific dependencies to install before launching a distillation; you can install them with the command `pip install -r requirements.txt`.
...@@ -45,13 +58,14 @@ This part of the library has only be tested with Python3.6+. There are few speci ...@@ -45,13 +58,14 @@ This part of the library has only be tested with Python3.6+. There are few speci
## How to use DistilBERT ## How to use DistilBERT
Transformers includes two pre-trained Distil* models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DistilBERT): Transformers includes five pre-trained Distil* models, currently provided for English, German and multilingual use:
- `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 66M parameters. - `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 66M parameters.
- `distilbert-base-uncased-distilled-squad`: A finetuned version of `distilbert-base-uncased` finetuned using (a second step of) knowledge distillation on SQuAD 1.0. This model reaches an F1 score of 86.9 on the dev set (for comparison, the `bert-base-uncased` version of Bert reaches an 88.5 F1 score). - `distilbert-base-uncased-distilled-squad`: A finetuned version of `distilbert-base-uncased` finetuned using (a second step of) knowledge distillation on SQuAD 1.0. This model reaches an F1 score of 86.9 on the dev set (for comparison, the `bert-base-uncased` version of Bert reaches an 88.5 F1 score).
- `distilbert-base-german-cased`: DistilBERT German language model pretrained on 1/2 of the data used to pretrain Bert using distillation with the supervision of the `bert-base-german-dbmdz-cased` version of German DBMDZ Bert. For NER tasks the model reaches an F1 score of 83.49 on the CoNLL-2003 test set (for comparison, `bert-base-german-dbmdz-cased` reaches an 84.52 F1 score), and an F1 score of 85.23 on the GermEval 2014 test set (`bert-base-german-dbmdz-cased` reaches an 86.89 F1 score).
- `distilgpt2`: DistilGPT2 English language model pretrained with the supervision of `gpt2` (the smallest version of GPT2) on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset. The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 124M parameters for GPT2). On average, DistilGPT2 is two times faster than GPT2. - `distilgpt2`: DistilGPT2 English language model pretrained with the supervision of `gpt2` (the smallest version of GPT2) on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset. The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 124M parameters for GPT2). On average, DistilGPT2 is two times faster than GPT2.
- `distilroberta-base`: DistilRoBERTa English language model pretrained with the supervision of `roberta-base` solely on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset (it is ~4 times less training data than the teacher RoBERTa). The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 125M parameters for RoBERTa-base). On average DistilRoBERTa is twice as fast as Roberta-base. - `distilroberta-base`: DistilRoBERTa English language model pretrained with the supervision of `roberta-base` solely on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset (it is ~4 times less training data than the teacher RoBERTa). The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 125M parameters for RoBERTa-base). On average DistilRoBERTa is twice as fast as Roberta-base.
- and more to come! 🤗🤗🤗 - `distilbert-base-multilingual-cased`: DistilmBERT multilingual model pretrained with the supervision of `bert-base-multilingual-cased` on the concatenation of Wikipedia in 104 different languages. The model supports the 104 languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages). The model has 6 layers, 768 dimension and 12 heads, totalizing 134M parameters (compared to 177M parameters for mBERT-base). On average DistilmBERT is twice as fast as mBERT-base.
Using DistilBERT is very similar to using BERT. DistilBERT shares the same tokenizer as BERT's `bert-base-uncased`, even though we provide a link to this tokenizer under the `DistilBertTokenizer` name to keep the naming consistent across the library's models. Using DistilBERT is very similar to using BERT. DistilBERT shares the same tokenizer as BERT's `bert-base-uncased`, even though we provide a link to this tokenizer under the `DistilBertTokenizer` name to keep the naming consistent across the library's models.
...@@ -67,6 +81,7 @@ last_hidden_states = outputs[0] # The last hidden-state is the first element of ...@@ -67,6 +81,7 @@ last_hidden_states = outputs[0] # The last hidden-state is the first element of
Similarly, using the other Distil* models simply consists in calling the base classes with a different pretrained checkpoint: Similarly, using the other Distil* models simply consists in calling the base classes with a different pretrained checkpoint:
- DistilGPT2: `model = GPT2Model.from_pretrained('distilgpt2')` - DistilGPT2: `model = GPT2Model.from_pretrained('distilgpt2')`
- DistilRoBERTa: `model = RobertaModel.from_pretrained('distilroberta-base')` - DistilRoBERTa: `model = RobertaModel.from_pretrained('distilroberta-base')`
- DistilmBERT: `model = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')`
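As a quick sketch of the last point, DistilmBERT is used exactly like the English model; the example sentence below is arbitrary:

```python
import torch
from transformers import DistilBertModel, DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
model = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')

# Any of the 104 supported languages can be encoded directly
input_ids = torch.tensor([tokenizer.encode("Bonjour, le monde !", add_special_tokens=True)])
with torch.no_grad():
    last_hidden_states = model(input_ids)[0]  # (batch_size, sequence_length, 768)
```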
## How to train Distil* ## How to train Distil*