from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer
from pytorch_transformers.modeling_gpt2 import (
    GPT2Model,
    GPT2LMHeadModel,
    GPT2DoubleHeadsModel
)

# A lot of models share the same param doc. Use a decorator
# to save typing
gpt2_docstring = """
    Params:
        pretrained_model_name_or_path: either:
            - a str with the name of a pre-trained model to load selected in the list of:
                . `gpt2`, `gpt2-medium`
            - a path or url to a pretrained model archive containing:
                . `gpt2_config.json` a configuration file for the model
                . `pytorch_model.bin` a PyTorch dump of a GPT2Model instance
            - a path or url to a pretrained model archive containing:
                . `gpt2_config.json` a configuration file for the model
                . a TensorFlow checkpoint with trained weights
        from_tf: should we load the weights from a locally saved TensorFlow checkpoint
        cache_dir: an optional path to a folder in which the pre-trained models will be cached.
        state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
        *inputs, **kwargs: additional input for the specific GPT-2 class
"""


def _append_from_pretrained_docstring(docstr):
    def docstring_decorator(fn):
        fn.__doc__ = fn.__doc__ + docstr
        return fn
    return docstring_decorator
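
# For example, a minimal sketch of the decorator's effect (the function `f` is
# hypothetical): after decoration, `f.__doc__` is its own docstring with
# `gpt2_docstring` appended, so `help(f)` also lists the shared
# `from_pretrained` parameters.
#
#     @_append_from_pretrained_docstring(gpt2_docstring)
#     def f(*args, **kwargs):
#         """f-specific documentation."""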


def gpt2Tokenizer(*args, **kwargs):
    """
    Instantiate a GPT-2 BPE tokenizer for OpenAI GPT-2 from a pre-trained/customized vocab file.
    Peculiarities:
        - Byte-level BPE

    Args:
    pretrained_model_name_or_path: Path to pretrained model archive
                                   or one of pre-trained vocab configs below.
                                       * gpt2
    Keyword args:
    special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...)
                    Default: None
    max_len: An artificial maximum length to truncate tokenized sequences to;
             Effective maximum length is always the minimum of this
             value (if specified) and the underlying GPT-2 model's
             sequence length.
             Default: None

    Example:
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')

        text = "Who was Jim Henson ?"
        indexed_tokens = tokenizer.encode(text)
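
        # Byte-level BPE lets us map the ids back to a text string without an
        # external vocab file (a minimal sketch; exact whitespace handling
        # depends on the decode defaults)
        decoded_text = tokenizer.decode(indexed_tokens)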
    """
    tokenizer = GPT2Tokenizer.from_pretrained(*args, **kwargs)
    return tokenizer


@_append_from_pretrained_docstring(gpt2_docstring)
def gpt2Model(*args, **kwargs):
    """
    gpt2Model is the basic OpenAI GPT-2 Transformer model based on
    identical stacked masked self-attention blocks and pre-trained
    on a large-scale dataset using a language modeling signal.

    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')

        #  Prepare tokenized input
        text_1 = "Who was Jim Henson ?"
        text_2 = "Jim Henson was a puppeteer"
        indexed_tokens_1 = tokenizer.encode(text_1)
        indexed_tokens_2 = tokenizer.encode(text_2)
        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
        tokens_tensor_2 = torch.tensor([indexed_tokens_2])

        # Load gpt2Model
        model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Model', 'gpt2')
        model.eval()

        # Compute the hidden states for the input tokens
        # past can be used to reuse precomputed hidden states in subsequent predictions
        with torch.no_grad():
                hidden_states_1, past = model(tokens_tensor_1)
                hidden_states_2, past = model(tokens_tensor_2, past=past)
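                # hidden_states_* hold the final-layer features for every token,
                # with shape [batch_size, sequence_length, hidden_size]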
    """
    model = GPT2Model.from_pretrained(*args, **kwargs)
    return model


@_append_from_pretrained_docstring(gpt2_docstring)
def gpt2LMHeadModel(*args, **kwargs):
    """
    gpt2LMHeadModel is the OpenAI GPT-2 Transformer model with the
    tied (pre-trained) language modeling head on top.

    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')

        #  Prepare tokenized input
        text_1 = "Who was Jim Henson ?"
        text_2 = "Jim Henson was a puppeteer"
        indexed_tokens_1 = tokenizer.encode(text_1)
        indexed_tokens_2 = tokenizer.encode(text_2)
        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
        tokens_tensor_2 = torch.tensor([indexed_tokens_2])

        # Load gpt2LMHeadModel
        model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2LMHeadModel', 'gpt2')
        model.eval()

        # Predict the language modeling logits for each position
        # past can be used to reuse precomputed hidden states in subsequent predictions
        with torch.no_grad():
                predictions_1, past = model(tokens_tensor_1)
                predictions_2, past = model(tokens_tensor_2, past=past)
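                # predictions_* are the language modeling logits over the vocabulary,
                # with shape [batch_size, sequence_length, vocab_size]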

        # Get the predicted last token
        predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
        predicted_token = tokenizer.decode([predicted_index])
        assert predicted_token == ' who'
    """
    model = GPT2LMHeadModel.from_pretrained(*args, **kwargs)
    return model


@_append_from_pretrained_docstring(gpt2_docstring)
def gpt2DoubleHeadsModel(*args, **kwargs):
    """
    gpt2DoubleHeadsModel is the OpenAI GPT-2 Transformer model with the
    tied (pre-trained) language modeling head and a multiple choice
    classification head (only initialized, not pre-trained).

    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')

        #  Prepare tokenized input
        text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
        text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
        tokenized_text1 = tokenizer.tokenize(text1)
        tokenized_text2 = tokenizer.tokenize(text2)
        indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
        indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
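        # Both choices must contain the same number of token ids so that the
        # nested list below forms a rectangular tensor; pad the shorter one if needed.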
        tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
        mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])

        # Load gpt2DoubleHeadsModel
        model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2DoubleHeadsModel', 'gpt2')
        model.eval()

        # Compute the language modeling and multiple choice logits
        with torch.no_grad():
                lm_logits, multiple_choice_logits, presents = model(tokens_tensor, mc_token_ids)
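                # multiple_choice_logits scores each choice, shape [batch_size, num_choices];
                # lm_logits keep one set of vocabulary logits per position of each choice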
    """
    model = GPT2DoubleHeadsModel.from_pretrained(*args, **kwargs)
    return model