Commit 31c23bd5 authored by thomwolf

[BIG] pytorch-transformers => transformers

parent 2f071fcb
@@ -4,33 +4,33 @@ OpenAI GPT2

``GPT2Config``
~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.GPT2Config
    :members:

``GPT2Tokenizer``
~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.GPT2Tokenizer
    :members:

``GPT2Model``
~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.GPT2Model
    :members:

``GPT2LMHeadModel``
~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.GPT2LMHeadModel
    :members:

``GPT2DoubleHeadsModel``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.GPT2DoubleHeadsModel
    :members:
@@ -4,33 +4,33 @@ RoBERTa

``RobertaConfig``
~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.RobertaConfig
    :members:

``RobertaTokenizer``
~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.RobertaTokenizer
    :members:

``RobertaModel``
~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.RobertaModel
    :members:

``RobertaForMaskedLM``
~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.RobertaForMaskedLM
    :members:

``RobertaForSequenceClassification``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.RobertaForSequenceClassification
    :members:
@@ -5,26 +5,26 @@ Transformer XL

``TransfoXLConfig``
~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.TransfoXLConfig
    :members:

``TransfoXLTokenizer``
~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.TransfoXLTokenizer
    :members:

``TransfoXLModel``
~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.TransfoXLModel
    :members:

``TransfoXLLMHeadModel``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.TransfoXLLMHeadModel
    :members:
@@ -4,38 +4,38 @@ XLM

``XLMConfig``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.XLMConfig
    :members:

``XLMTokenizer``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.XLMTokenizer
    :members:

``XLMModel``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.XLMModel
    :members:

``XLMWithLMHeadModel``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.XLMWithLMHeadModel
    :members:

``XLMForSequenceClassification``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.XLMForSequenceClassification
    :members:

``XLMForQuestionAnswering``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.XLMForQuestionAnswering
    :members:
@@ -4,40 +4,40 @@ XLNet

``XLNetConfig``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.XLNetConfig
    :members:

``XLNetTokenizer``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.XLNetTokenizer
    :members:

``XLNetModel``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.XLNetModel
    :members:

``XLNetLMHeadModel``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.XLNetLMHeadModel
    :members:

``XLNetForSequenceClassification``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.XLNetForSequenceClassification
    :members:

``XLNetForQuestionAnswering``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.XLNetForQuestionAnswering
    :members:
Notebooks
================================================

We include `three Jupyter Notebooks <https://github.com/huggingface/transformers/tree/master/notebooks>`_ that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.

* The first notebook (\ `Comparing-TF-and-PT-models.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models.ipynb>`_\ ) extracts the hidden states of a full sequence on each layer of the TensorFlow and PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden states of the models.

* The second notebook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb>`_\ ) compares the loss computed by the TensorFlow and PyTorch models for an identical initialization of the fine-tuning layer of ``BertForQuestionAnswering`` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models.

* The third notebook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb>`_\ ) compares the predictions computed by the TensorFlow and PyTorch models for masked token language modeling using the pre-trained masked language modeling model.

Please follow the instructions given in the notebooks to run and modify them.
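
The comparison the notebooks perform boils down to aligning the two implementations' hidden states and measuring the element-wise deviation. The following is a minimal sketch of that idea (it is not code from the notebooks; ``tf_hidden_states`` and ``pt_hidden_states`` are placeholder NumPy arrays you would extract from the two models yourself):

.. code-block:: python

    import numpy as np

    def compare_hidden_states(tf_hidden_states, pt_hidden_states):
        """Print the per-layer deviation between two stacks of hidden states.

        Both arguments are lists with one array of shape (batch, seq_len, hidden) per layer.
        """
        for layer, (tf_h, pt_h) in enumerate(zip(tf_hidden_states, pt_hidden_states)):
            diff = np.abs(tf_h - pt_h)
            # The notebooks report values in the 1.5e-7 to 9e-7 range for BERT
            print("layer %d: max abs diff %.2e, std %.2e" % (layer, diff.max(), diff.std()))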
@@ -44,15 +44,15 @@ Here is the full list of the currently provided pretrained models together with

|                   +------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
|                   | ``bert-large-uncased-whole-word-masking-finetuned-squad``  | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                            |
|                   |                                                            | | The ``bert-large-uncased-whole-word-masking`` model fine-tuned on SQuAD                                                      |
|                   |                                                            |   (see details of fine-tuning in the `example section <https://github.com/huggingface/transformers/tree/master/examples>`__). |
|                   +------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
|                   | ``bert-large-cased-whole-word-masking-finetuned-squad``    | | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                             |
|                   |                                                            | | The ``bert-large-cased-whole-word-masking`` model fine-tuned on SQuAD                                                        |
|                   |                                                            |   (see `details of fine-tuning in the example section <https://huggingface.co/transformers/examples.html>`__)                  |
|                   +------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
|                   | ``bert-base-cased-finetuned-mrpc``                         | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                             |
|                   |                                                            | | The ``bert-base-cased`` model fine-tuned on MRPC                                                                             |
|                   |                                                            |   (see `details of fine-tuning in the example section <https://huggingface.co/transformers/examples.html>`__)                  |
+-------------------+------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
| GPT               | ``openai-gpt``                                             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                             |
|                   |                                                            | | OpenAI GPT English model                                                                                                     |
@@ -120,4 +120,4 @@ Here is the full list of the currently provided pretrained models together with

|                   |                                                            |   (see `details <https://medium.com/huggingface/distilbert-8cf3380435b5>`__)                                                   |
+-------------------+------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+

.. <https://huggingface.co/transformers/examples.html>`__
@@ -2,7 +2,7 @@

## Philosophy

Transformers is an opinionated library built for NLP researchers seeking to use/study/extend large-scale transformer models.

The library was designed with two strong goals in mind:

@@ -39,7 +39,7 @@ The library is built around three types of classes for each model:

All these classes can be instantiated from pretrained instances and saved locally using two methods:

- `from_pretrained()` lets you instantiate a model/configuration/tokenizer from a pretrained version either provided by the library itself (currently 27 models are provided, as listed [here](https://huggingface.co/transformers/pretrained_models.html)) or stored locally (or on a server) by the user,
- `save_pretrained()` lets you save a model/configuration/tokenizer locally so that it can be reloaded using `from_pretrained()`.
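
As a rough illustration of the round trip these two methods provide (a minimal sketch, not taken from the library's documentation; the model name and output directory are only examples):

```python
import os
from transformers import BertModel, BertTokenizer

# Download (or load from the cache) a pretrained model and its tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Save both locally (the target directory must exist)...
output_dir = './my-bert/'
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# ...and reload them later from that directory
model = BertModel.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir)
```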
We'll finish this quickstart tour with a few simple examples showing how to instantiate and use these classes. The rest of the documentation is organized in two parts:
@@ -59,7 +59,7 @@ Let's start by preparing a tokenized input (a list of token embeddings indices t

```python
import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM

# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
import logging
```
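
The excerpt cuts off right after `import logging`; presumably the logger is then enabled in the usual way, something like the following (an assumption about the elided lines, not part of the diff):

```python
import logging

# Emit the library's INFO-level messages (download progress, weight loading, ...)
logging.basicConfig(level=logging.INFO)
```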
@@ -106,7 +106,7 @@ model.to('cuda')

```python
with torch.no_grad():
    # See the models' docstrings for the details of the inputs
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    # Transformers models always output tuples.
    # See the models' docstrings for the details of all the outputs.
    # In our case, the first element is the hidden state of the last layer of the Bert model.
    encoded_layers = outputs[0]
```
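
To make the shape of that first output element concrete, here is a small self-contained variant of the same call (a minimal sketch; the input sentence is arbitrary and `bert-base-uncased` is only an example checkpoint):

```python
import torch
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()

tokens_tensor = torch.tensor([tokenizer.encode("Hello, my dog is cute")])
with torch.no_grad():
    encoded_layers = model(tokens_tensor)[0]

# (batch_size, sequence_length, hidden_size) -- hidden_size is 768 for bert-base models
print(encoded_layers.shape)
```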
@@ -145,7 +145,7 @@ First let's prepare a tokenized input from our text string using `GPT2Tokenizer`

```python
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
```
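
The hunk stops at the imports. As a rough sketch of where this example heads (a minimal next-token prediction using the documented `GPT2Tokenizer`/`GPT2LMHeadModel` API; the prompt string is just an example):

```python
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()

# Encode a prompt and predict the most likely next token
indexed_tokens = tokenizer.encode("Who was Jim Henson? Jim Henson was a")
tokens_tensor = torch.tensor([indexed_tokens])

with torch.no_grad():
    outputs = model(tokens_tensor)
    predictions = outputs[0]  # logits of shape (batch, seq_len, vocab_size)

predicted_index = torch.argmax(predictions[0, -1, :]).item()
print(tokenizer.decode(indexed_tokens + [predicted_index]))
```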
@@ -45,7 +45,7 @@ where

* ``bert_config.json`` or ``openai_gpt_config.json`` a configuration file for the model, and
* ``pytorch_model.bin`` a PyTorch dump of a pre-trained instance of ``BertForPreTraining``\ , ``OpenAIGPTModel``\ , ``TransfoXLModel``\ , ``GPT2LMHeadModel`` (saved with the usual ``torch.save()``\ )

If ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links `here <https://github.com/huggingface/transformers/blob/master/transformers/modeling_bert.py>`__\ ) and stored in a cache folder to avoid future downloads (the cache folder can be found at ``~/.pytorch_pretrained_bert/``\ ).

* ``cache_dir`` can be an optional path to a specific directory in which to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights, you can set for example ``cache_dir='./pretrained_model_{}'.format(args.local_rank)`` (see the section on distributed training for more information).
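
For instance, a per-process cache directory during distributed training could look like the following minimal sketch (``--local_rank`` is assumed to come from your own argument parsing, and the model name is only an example):

.. code-block:: python

    import argparse

    from transformers import BertForPreTraining

    parser = argparse.ArgumentParser()
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()

    # Each distributed worker caches the downloaded weights in its own directory,
    # so concurrent workers never compete for the same files.
    model = BertForPreTraining.from_pretrained(
        'bert-base-uncased',
        cache_dir='./pretrained_model_{}'.format(args.local_rank))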
@@ -122,7 +122,7 @@ Here is the recommended way of saving the model, configuration and vocabulary to

.. code-block:: python

    from transformers import WEIGHTS_NAME, CONFIG_NAME

    output_dir = "./models/"
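
The hunk ends just after ``output_dir`` is defined. For completeness, here is a sketch of the save-and-reload pattern this section builds towards (an illustration written against the library's public API, not the exact continuation of the file; the model name is only an example):

.. code-block:: python

    import os

    import torch
    from transformers import WEIGHTS_NAME, CONFIG_NAME, BertTokenizer, BertForSequenceClassification

    output_dir = "./models/"
    os.makedirs(output_dir, exist_ok=True)

    model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Save the weights, the configuration and the vocabulary
    torch.save(model.state_dict(), os.path.join(output_dir, WEIGHTS_NAME))
    model.config.to_json_file(os.path.join(output_dir, CONFIG_NAME))
    tokenizer.save_vocabulary(output_dir)

    # Reload them later from the same directory
    model = BertForSequenceClassification.from_pretrained(output_dir)
    tokenizer = BertTokenizer.from_pretrained(output_dir)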
@@ -12,7 +12,7 @@ According to Pytorch's documentation: "TorchScript is a way to create serializab

PyTorch's two modules `JIT and TRACE <https://pytorch.org/docs/stable/jit.html>`_ allow the developer to export
their model to be re-used in other programs, such as efficiency-oriented C++ programs.

We have provided an interface that allows the export of `transformers` models to TorchScript so that they can
be reused in a different environment than a PyTorch-based Python program. Here we explain how to use our models so that
they can be exported, and what to be mindful of when using these models with TorchScript.

@@ -74,7 +74,7 @@ according to a ``BertConfig`` class and then saved to disk under the filename ``

.. code-block:: python

    from transformers import BertModel, BertTokenizer, BertConfig
    import torch

    enc = BertTokenizer.from_pretrained("bert-base-uncased")
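
The excerpt stops at the tokenizer. As a hedged sketch of the tracing step this section describes (the ``torchscript=True`` flag and the dummy-input trace follow the pattern documented for this feature; the dummy sentence and file name are only examples):

.. code-block:: python

    import torch
    from transformers import BertModel, BertTokenizer

    enc = BertTokenizer.from_pretrained("bert-base-uncased")

    # Build a dummy batch of token ids to trace the model with
    text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
    input_ids = torch.tensor([enc.convert_tokens_to_ids(enc.tokenize(text))])

    # torchscript=True instantiates the model so that its forward pass can be traced
    model = BertModel.from_pretrained("bert-base-uncased", torchscript=True)
    model.eval()

    traced_model = torch.jit.trace(model, (input_ids,))
    torch.jit.save(traced_model, "traced_bert.pt")

    # The traced model can later be reloaded without the Python class definitions
    loaded_model = torch.jit.load("traced_bert.pt")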
@@ -13,7 +13,7 @@ similar API between the different models.

## Language model fine-tuning

Based on the script [`run_lm_finetuning.py`](https://github.com/huggingface/transformers/blob/master/examples/run_lm_finetuning.py).

Fine-tuning the library models for language modeling on a text dataset for GPT, GPT-2, BERT and RoBERTa (DistilBERT
to be added soon). GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa

@@ -75,7 +75,7 @@ python run_lm_finetuning.py \

## Language generation

Based on the script [`run_generation.py`](https://github.com/huggingface/transformers/blob/master/examples/run_generation.py).

Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet.
A similar script is used for our official demo [Write With Transformer](https://transformer.huggingface.co), where you

@@ -91,7 +91,7 @@ python run_generation.py \

## GLUE

Based on the script [`run_glue.py`](https://github.com/huggingface/transformers/blob/master/examples/run_glue.py).

Fine-tuning the library models for sequence classification on the GLUE benchmark: [General Language Understanding
Evaluation](https://gluebenchmark.com/). This script can fine-tune the following models: BERT, XLM, XLNet and RoBERTa.

@@ -319,7 +319,7 @@ eval_loss = 0.44457291918821606

## SQuAD

Based on the script [`run_squad.py`](https://github.com/huggingface/transformers/blob/master/examples/run_squad.py).

#### Fine-tuning on SQuAD
@@ -39,7 +39,7 @@ import torch

from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)

from transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
                          AdamW, cached_path, WEIGHTS_NAME, CONFIG_NAME,
                          WarmupLinearSchedule)
@@ -35,10 +35,10 @@ from tqdm import tqdm, trange

from tensorboardX import SummaryWriter

from transformers import (WEIGHTS_NAME, BertConfig,
                          BertForMultipleChoice, BertTokenizer)
from transformers import AdamW, WarmupLinearSchedule

logger = logging.getLogger(__name__)

@@ -365,7 +365,7 @@ def train(args, train_dataset, model, tokenizer):

            # inputs.update({'cls_index': batch[5],
            #                'p_mask':    batch[6]})
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuples in transformers (see the docs)
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training

@@ -647,7 +647,7 @@ def main():

        if args.eval_all_checkpoints:
            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
@@ -28,7 +28,7 @@ import math

import torch

from transformers import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer

logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
@@ -13,11 +13,11 @@ For more information on DistilBERT, please refer to our [detailed blog post](htt

This part of the library has only been tested with Python 3.6+. There are a few specific dependencies to install before launching a distillation; you can install them with the command `pip install -r requirements.txt`.

**Important note:** The training scripts have been updated to support PyTorch v1.2.0 (there are breaking changes compared to v1.1.0). Note that there is a small internal bug in the current version of PyTorch available on pip that causes a memory leak in our training/distillation. It has recently been fixed and will likely be integrated into the next release. For the moment, we recommend [compiling PyTorch from source](https://github.com/pytorch/pytorch#from-source). Please refer to [issue 1179](https://github.com/huggingface/transformers/issues/1179) for more details.

## How to use DistilBERT

Transformers includes two pre-trained DistilBERT models, currently only provided for English (we are investigating the possibility of training and releasing a multilingual version of DistilBERT):

- `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain BERT (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of BERT. The model has 6 layers, a hidden dimension of 768 and 12 heads, totaling 66M parameters.
- `distilbert-base-uncased-distilled-squad`: A version of `distilbert-base-uncased` fine-tuned with (a second step of) knowledge distillation on SQuAD 1.0. This model reaches an F1 score of 86.2 on the dev set (for comparison, the `bert-base-uncased` version of BERT reaches an F1 score of 88.5).
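
Loading either checkpoint follows the same `from_pretrained()` pattern as the other models in the library; a minimal sketch (the example sentence is arbitrary):

```python
import torch
from transformers import DistilBertTokenizer, DistilBertModel

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')
model.eval()

input_ids = torch.tensor([tokenizer.encode("Hello, my dog is cute")])
with torch.no_grad():
    last_hidden_state = model(input_ids)[0]  # shape (1, sequence_length, 768)
```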
@@ -26,7 +26,7 @@ import torch

import torch.nn as nn
import torch.nn.functional as F

from transformers import AdamW, WarmupLinearSchedule

from utils import logger
from dataset import Dataset
@@ -20,7 +20,7 @@ import pickle

import random
import time

import numpy as np
from transformers import BertTokenizer

import logging
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
@@ -15,7 +15,7 @@

"""
Preprocessing script before training DistilBERT.
"""
from transformers import BertForPreTraining
import torch
import argparse
@@ -23,8 +23,8 @@ import shutil

import numpy as np
import torch

from transformers import BertTokenizer, BertForMaskedLM
from transformers import DistilBertForMaskedLM, DistilBertConfig

from distiller import Distiller
from utils import git_log, logger, init_gpu_params, set_seed
@@ -32,7 +32,7 @@ from torch.utils.data import DataLoader, SequentialSampler, TensorDataset, Subse

from torch.utils.data.distributed import DistributedSampler
from torch.nn import CrossEntropyLoss, MSELoss

from transformers import (WEIGHTS_NAME,
                          BertConfig, BertForSequenceClassification, BertTokenizer,
                          XLMConfig, XLMForSequenceClassification, XLMTokenizer,
                          XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer)