from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer
from pytorch_pretrained_bert.modeling_gpt2 import (
    GPT2Model,
    GPT2LMHeadModel,
    GPT2DoubleHeadsModel
)

# A lot of models share the same param doc. Use a decorator
# to save typing
gpt2_docstring = """
    Params:
        pretrained_model_name_or_path: either:
            - a str with the name of a pre-trained model to load, selected from the list:
                . `gpt2`
            - a path or url to a pretrained model archive containing:
                . `gpt2_config.json` a configuration file for the model
                . `pytorch_model.bin` a PyTorch dump of a GPT2Model instance
            - a path or url to a pretrained model archive containing:
                . `gpt2_config.json` a configuration file for the model
                . a TensorFlow checkpoint with trained weights
        from_tf: should we load the weights from a locally saved TensorFlow checkpoint
        cache_dir: an optional path to a folder in which the pre-trained models will be cached.
        state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of the pre-trained weights
        *inputs, **kwargs: additional inputs for the specific GPT-2 class
"""


def _append_from_pretrained_docstring(docstr):
    def docstring_decorator(fn):
        fn.__doc__ = fn.__doc__ + docstr
        return fn
    return docstring_decorator
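
# A minimal sketch of the decorator's effect (illustrative only, not a hub
# entry point): it simply concatenates the shared parameter docstring onto the
# wrapped function's own docstring, so every from_pretrained-style entry point
# below documents the same arguments.
#
#     @_append_from_pretrained_docstring(gpt2_docstring)
#     def some_entry_point(*args, **kwargs):
#         """Model-specific summary."""
#         ...
#
#     # some_entry_point.__doc__ now ends with the contents of gpt2_docstring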


def gpt2Tokenizer(*args, **kwargs):
    """
    Instantiate a GPT-2 BPE tokenizer for OpenAI GPT-2 from a pre-trained/customized vocab file.
	Peculiarities:
        - Byte-level BPE

    Args:
    pretrained_model_name_or_path: Path to pretrained model archive
                                   or one of pre-trained vocab configs below.
                                       * openai-gpt
    Keyword args:
	special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...)
					Default: None
	max_len: An artificial maximum length to truncate tokenized sequences to;
        	 Effective maximum length is always the minimum of this
             value (if specified) and the underlying BERT model's
             sequence length.
			 Default: None

    Example:
		>>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')
		
		>>> text = "Who was Jim Henson ?"
        >>> indexed_tokens = tokenizer.encode(tokenized_text)
    """
    tokenizer = GPT2Tokenizer.from_pretrained(*args, **kwargs)
    return tokenizer


@_append_from_pretrained_docstring(gpt2_docstring)
def gpt2Model(*args, **kwargs):
    """
    gpt2Model is the basic OpenAI GPT-2 Transformer model based on
	identical stacked masked self-attention blocks and pre-trained
	on large scale dataset using language modeling signal.

    Example:
        # Load the tokenizer
		>>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')

        #  Prepare tokenized input
        >>> text_1 = "Who was Jim Henson ?"
		>>> text_2 = "Jim Henson was a puppeteer"
        >>> indexed_tokens_1 = tokenizer.encode(text_1)
        >>> indexed_tokens_2 = tokenizer.encode(text_2)
		>>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
		>>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])

        # Load gpt2Model
        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Model', 'gpt2')
        >>> model.eval()

        # Predict hidden states features for each layer
		# past can be used to reuse precomputed hidden state in a subsequent predictions
        >>> with torch.no_grad():
                hidden_states_1, past = model(tokens_tensor_1)
				hidden_states_2, past = model(tokens_tensor_2, past=past)
    """
    model = GPT2Model.from_pretrained(*args, **kwargs)
    return model


@_append_from_pretrained_docstring(gpt2_docstring)
def gpt2LMHeadModel(*args, **kwargs):
    """
    gpt2LMHeadModel is the OpenAI GPT-2 Transformer model with the
	tied (pre-trained) language modeling head on top.

	Example:
        # Load the tokenizer
		>>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')

        #  Prepare tokenized input
        >>> text_1 = "Who was Jim Henson ?"
		>>> text_2 = "Jim Henson was a puppeteer"
        >>> indexed_tokens_1 = tokenizer.encode(text_1)
        >>> indexed_tokens_2 = tokenizer.encode(text_2)
		>>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
		>>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])

        # Load gpt2LMHeadModel
        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2LMHeadModel', 'gpt2')
        >>> model.eval()

        # Predict hidden states features for each layer
		# past can be used to reuse precomputed hidden state in a subsequent predictions
        >>> with torch.no_grad():
				predictions_1, past = model(tokens_tensor_1)
				predictions_2, past = model(tokens_tensor_2, past=past)

		# Get the predicted last token
		>>> predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
		>>> predicted_token = tokenizer.decode([predicted_index])
        >>> assert predicted_token == ' who'
    """
    model = OpenAIGPTLMHeadModel.from_pretrained(*args, **kwargs)
    return model


@_append_from_pretrained_docstring(gpt2_docstring)
def gpt2DoubleHeadsModel(*args, **kwargs):
    """
    gpt2DoubleHeadsModel is the OpenAI GPT-2 Transformer model with the
    tied (pre-trained) language modeling head and a multiple choice
    classification head (only initialized, not pre-trained).

    Example:
        # Load the tokenizer
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')

        # Prepare tokenized input
        >>> text = "Who was Jim Henson ?"
        >>> indexed_tokens = tokenizer.encode(text)
        >>> tokens_tensor = torch.tensor([indexed_tokens])
        # mc_token_ids points at the classification token (here the last token of the sequence)
        >>> mc_token_ids = torch.LongTensor([[len(indexed_tokens) - 1]])

        # Load gpt2DoubleHeadsModel
        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2DoubleHeadsModel', 'gpt2')
        >>> model.eval()

        # Predict hidden states features for each layer
        >>> with torch.no_grad():
                lm_logits, multiple_choice_logits, presents = model(tokens_tensor, mc_token_ids)
    """
    model = GPT2DoubleHeadsModel.from_pretrained(*args, **kwargs)
    return model
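

# A minimal smoke test (a sketch, not part of the torch.hub interface): it
# calls the entry points above directly, assuming the `gpt2` weights can be
# downloaded to the default cache on first use.
if __name__ == "__main__":
    import torch

    tokenizer = gpt2Tokenizer('gpt2')
    model = gpt2LMHeadModel('gpt2')
    model.eval()

    text = "Who was Jim Henson ?"
    tokens_tensor = torch.tensor([tokenizer.encode(text)])

    # The LM head returns logits over the vocabulary for every position,
    # plus the cached key/value states (`past`).
    with torch.no_grad():
        predictions, past = model(tokens_tensor)

    # Decode the most likely continuation of the prompt
    predicted_index = torch.argmax(predictions[0, -1, :]).item()
    print(tokenizer.decode([predicted_index]))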