from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer
from pytorch_transformers.modeling_openai import (
    OpenAIGPTModel,
    OpenAIGPTLMHeadModel,
    OpenAIGPTDoubleHeadsModel
)

# Dependencies that are not specified in the global hubconf.py
specific_dependencies = ['spacy', 'ftfy']

# A lot of models share the same param doc. Use a decorator
# to save typing
gpt_docstring = """
    OpenAI GPT uses a single embedding matrix to store the word and special embeddings.
    Special token embeddings are embeddings of additional tokens that are not pre-trained: [SEP], [CLS]...
    Special tokens need to be trained during fine-tuning if you use them.
    The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.

    The embeddings are ordered as follows in the token embedding matrix:
        [0,                                                         ----------------------
         ...                                                        -> word embeddings
         config.vocab_size - 1,                                     ______________________
         config.vocab_size,
         ...                                                        -> special embeddings
         config.vocab_size + config.n_special - 1]                  ______________________

    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
        total_tokens_embeddings = config.vocab_size + config.n_special
    You should use the associated indices to index the embeddings.

    Params:
        pretrained_model_name_or_path: either:
            - a str with the name of a pre-trained model to load, selected from the list:
                . `openai-gpt`
            - a path or url to a pretrained model archive containing:
                . `openai_gpt_config.json` a configuration file for the model
                . `pytorch_model.bin` a PyTorch dump of an OpenAIGPTModel instance
            - a path or url to a pretrained model archive containing:
                . `openai-gpt-config.json` a configuration file for the model
                . a series of NumPy files containing the OpenAI TensorFlow trained weights
        from_tf: whether to load the weights from a locally saved TensorFlow checkpoint
        cache_dir: an optional path to a folder in which the pre-trained models will be cached
        state_dict: an optional state dictionary (collections.OrderedDict object)
                    to use instead of the pre-trained models
        *inputs, **kwargs: additional inputs for the specific OpenAI-GPT class
"""


def _append_from_pretrained_docstring(docstr):
    def docstring_decorator(fn):
        fn.__doc__ = fn.__doc__ + docstr
        return fn
    return docstring_decorator


def openAIGPTTokenizer(*args, **kwargs):
    """
    Instantiate a BPE tokenizer for OpenAI GPT from a pre-trained/customized vocab file.
    Peculiarities:
        - lower-cases all inputs
        - uses the SpaCy tokenizer ('en' model) and ftfy for pre-BPE tokenization if they are installed, falling back to BERT's BasicTokenizer if not
        - the special_tokens argument and the set_special_tokens function
            can be used to add additional symbols (e.g. "__classify__") to the vocabulary.

    Args:
        pretrained_model_name_or_path: Path to a pretrained model archive
                                       or one of the pre-trained vocab configs below.
                                           * openai-gpt
    Keyword args:
        special_tokens: Special tokens in the vocabulary that are not pre-trained ([SEP], [CLS]...)
                        Default: None
        max_len: An artificial maximum length to truncate tokenized sequences to;
                 the effective maximum length is always the minimum of this
                 value (if specified) and the underlying model's maximum
                 sequence length.
                 Default: None

    Example:
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')

        >>> text = "Who was Jim Henson ? Jim Henson was a puppeteer"
        >>> tokenized_text = tokenizer.tokenize(text)
        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        >>> indexed_tokens
        [763, 509, 4265, 2298, 945, 257, 4265, 2298, 945, 509, 246, 10148, 39041, 483]
    """
    tokenizer = OpenAIGPTTokenizer.from_pretrained(*args, **kwargs)
    return tokenizer
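
# Hedged usage sketch (commented out): how the special_tokens / max_len keyword
# arguments documented above would typically be forwarded through torch.hub.
# "__classify__" is a hypothetical extra symbol and 512 is an illustrative limit,
# not values required by the library.
#
#   import torch
#   tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer',
#                              'openai-gpt', special_tokens=['__classify__'], max_len=512)
#   tokenizer.convert_tokens_to_ids(['__classify__'])  # index in the special-token range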


@_append_from_pretrained_docstring(gpt_docstring)
def openAIGPTModel(*args, **kwargs):
    """
    OpenAIGPTModel is the basic OpenAI GPT Transformer model based on
    identical stacked masked self-attention blocks and pre-trained
    on a large-scale dataset using a language modeling signal.

    Example:
        # Load the tokenizer
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')

        #  Prepare tokenized input
        >>> text = "Who was Jim Henson ? Jim Henson was a puppeteer"
        >>> tokenized_text = tokenizer.tokenize(text)
        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        >>> tokens_tensor = torch.tensor([indexed_tokens])

        # Load openAIGPTModel
        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTModel', 'openai-gpt')
        >>> model.eval()

        # Predict hidden states for the input tokens
        >>> with torch.no_grad():
                hidden_states = model(tokens_tensor)
    """
    model = OpenAIGPTModel.from_pretrained(*args, **kwargs)
    return model


@_append_from_pretrained_docstring(gpt_docstring)
def openAIGPTLMHeadModel(*args, **kwargs):
    """
    OpenAIGPTLMHeadModel is the OpenAI GPT Transformer model with the
    tied (pre-trained) language modeling head on top.

    Example:
        # Load the tokenizer
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')

        #  Prepare tokenized input
        >>> text = "Who was Jim Henson ? Jim Henson was a puppeteer"
        >>> tokenized_text = tokenizer.tokenize(text)
        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        >>> tokens_tensor = torch.tensor([indexed_tokens])

        # Load openAIGPTLMHeadModel
        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTLMHeadModel', 'openai-gpt')
        >>> model.eval()

        # Predict the language modeling logits
        >>> with torch.no_grad():
                predictions = model(tokens_tensor)

        # Get the predicted next token
        >>> predicted_index = torch.argmax(predictions[0, -1, :]).item()
        >>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
        >>> predicted_token
        '.</w>'
    """
    model = OpenAIGPTLMHeadModel.from_pretrained(*args, **kwargs)
    return model


@_append_from_pretrained_docstring(gpt_docstring)
def openAIGPTDoubleHeadsModel(*args, **kwargs):
    """
    OpenAIGPTDoubleHeadsModel is the OpenAI GPT Transformer model with the
    tied (pre-trained) language modeling head and a multiple choice
    classification head (only initialized, not pre-trained).

    Example:
        # Load the tokenizer
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')

        #  Prepare tokenized input
        >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
        >>> text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
        >>> tokenized_text1 = tokenizer.tokenize(text1)
        >>> tokenized_text2 = tokenizer.tokenize(text2)
        >>> indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
        >>> indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
        >>> tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
        >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])

        # Load openAIGPTDoubleHeadsModel
        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTDoubleHeadsModel', 'openai-gpt')
        >>> model.eval()

        # Predict the language modeling and multiple choice logits
        >>> with torch.no_grad():
                lm_logits, multiple_choice_logits = model(tokens_tensor, mc_token_ids)
    """
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(*args, **kwargs)
    return model
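
# A hedged note on the openAIGPTDoubleHeadsModel example above: stacking
# indexed_tokens1 and indexed_tokens2 into one tensor only works if both choices
# have the same tokenized length. A minimal padding sketch (the pad index 0 is an
# assumption for illustration, not a documented padding token):
#
#   max_len = max(len(indexed_tokens1), len(indexed_tokens2))
#   padded1 = indexed_tokens1 + [0] * (max_len - len(indexed_tokens1))
#   padded2 = indexed_tokens2 + [0] * (max_len - len(indexed_tokens2))
#   tokens_tensor = torch.tensor([[padded1, padded2]])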