import torch
import transformers
from typing import Optional, Union
from lm_eval.base import BaseLM
import optimum
from optimum.intel.openvino import OVModelForCausalLM


def _get_dtype(
    dtype: Union[str, torch.dtype]
) -> torch.dtype:
    """Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig"""
    if isinstance(dtype, str) and dtype != "auto":
        # Convert `str` args torch dtype: `float16` -> `torch.float16`
        _torch_dtype = getattr(torch, dtype)
    else:
        _torch_dtype = dtype
    return _torch_dtype
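
# For example, _get_dtype("float16") resolves to torch.float16 via getattr, while
# _get_dtype("auto") and _get_dtype(torch.bfloat16) are returned unchanged so that
# from_pretrained(torch_dtype=...) can interpret them itself.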


class HFLM(BaseLM):
    def __init__(
        self,
        device="cuda",
        pretrained="gpt2",
        revision="main",
        low_cpu_mem_usage=None,
        subfolder=None,
        tokenizer=None,
        batch_size=1,
        load_in_8bit: Optional[bool] = False,
        trust_remote_code: Optional[bool] = False,
        dtype: Optional[Union[str, torch.dtype]] = "auto",
    ):
        super().__init__()

        assert isinstance(device, str)
        assert isinstance(pretrained, str)
        assert isinstance(batch_size, (int, str))

        device_list = set(
            ["cuda", "cpu"] + [f"cuda:{i}" for i in range(torch.cuda.device_count())]
        )
        if device and device in device_list:
            self._device = torch.device(device)
            print(f"Using device '{device}'")
        else:
            print("Device not specified")
            print(f"Cuda Available? {torch.cuda.is_available()}")
            self._device = (
                torch.device("cuda")
                if torch.cuda.is_available()
                else torch.device("cpu")
            )

        # TODO: update this to be less of a hack once subfolder is fixed in HF
        revision = revision + ("/" + subfolder if subfolder is not None else "")

        self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained(
            pretrained,
            load_in_8bit=load_in_8bit,
            low_cpu_mem_usage=low_cpu_mem_usage,
            revision=revision,
            torch_dtype=_get_dtype(dtype),
            trust_remote_code=trust_remote_code,
        ).to(self.device)
        self.gpt2.eval()

        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
            pretrained if tokenizer is None else tokenizer,
            revision=revision,
            trust_remote_code=trust_remote_code,
        )

        self.vocab_size = self.tokenizer.vocab_size

        if isinstance(
            self.tokenizer, (transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast)
        ):
            assert self.tokenizer.encode("hello\n\nhello") == [
                31373,
                198,
                198,
                31373,
            ], self.tokenizer.encode("hello\n\nhello")

        # setup for automatic batch size detection
        if batch_size == "auto":
            self.batch_size_per_gpu = batch_size
        else:
            self.batch_size_per_gpu = int(batch_size)

    @property
    def eot_token_id(self):
        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
        return self.tokenizer.eos_token_id

    @property
    def max_length(self):
        try:
            return self.gpt2.config.n_ctx
        except AttributeError:
            # gptneoconfig doesn't have n_ctx apparently
            return self.gpt2.config.max_position_embeddings
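        # e.g. GPT2Config exposes n_ctx (1024 for "gpt2"), whereas GPTNeoConfig only has
        # max_position_embeddings, hence the fallback above.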

    @property
    def max_gen_toks(self):
        return 256

    @property
    def batch_size(self):
        # TODO: fix multi-gpu
        return self.batch_size_per_gpu  # * gpus

    @property
    def device(self):
        # TODO: fix multi-gpu
        return self._device

    def tok_encode(self, string: str):
        return self.tokenizer.encode(string, add_special_tokens=False)

    def tok_decode(self, tokens):
        return self.tokenizer.decode(tokens)

    def _model_call(self, inps):
        """
        inps: a torch tensor of shape [batch, sequence]
        the size of sequence may vary from call to call

        returns: a torch tensor of shape [batch, sequence, vocab] with the
        logits returned from the model
        """
        with torch.no_grad():
            return self.gpt2(inps)[0]

    def _model_generate(self, context, max_length, eos_token_id):
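        # do_sample=False yields deterministic greedy decoding; note that max_length is the
        # total sequence length (prompt + generated tokens) as interpreted by generate().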
        generation_kwargs = {"do_sample": False, "max_length": max_length}
        if eos_token_id is not None:
            generation_kwargs["eos_token_id"] = eos_token_id
            generation_kwargs["pad_token_id"] = eos_token_id  # setting eos_token_id as pad token
        return self.gpt2.generate(context, **generation_kwargs)


# for backwards compatibility
GPT2LM = HFLM
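
# Illustrative usage (the model name, device, and prompt below are examples, not used
# anywhere in this file):
#
#     lm = HFLM(pretrained="gpt2", device="cpu", batch_size=1)
#     tokens = lm.tok_encode("hello\n\nhello")
#     logits = lm._model_call(torch.tensor([tokens], device=lm.device))
#     # -> shape [1, len(tokens), vocab], suitable for log-softmax scoring by the harness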

class OPTIMUMLM(BaseLM):
    def __init__(
        self,
        device="cpu",
        pretrained="gpt2",
        revision="main",
        low_cpu_mem_usage=None,
        subfolder=None,
        tokenizer=None,
        batch_size=1,
        load_in_8bit: Optional[bool] = False,
        trust_remote_code: Optional[bool] = False,
    ):
        super().__init__()

        assert isinstance(device, str)
        assert isinstance(pretrained, str)
        assert isinstance(batch_size, (int, str))

        device_list = set(
            ["cuda", "cpu"] + [f"cuda:{i}" for i in range(torch.cuda.device_count())]
        )
        if device and device in device_list:
            self._device = torch.device(device)
            print(f"Using device '{device}'")
        else:
            print("Device not specified")
            print(f"Cuda Available? {torch.cuda.is_available()}")
            self._device = (
                torch.device("cuda")
                if torch.cuda.is_available()
                else torch.device("cpu")
            )

        # TODO: update this to be less of a hack once subfolder is fixed in HF
        revision = revision + ("/" + subfolder if subfolder is not None else "")

        self.gpt2 = OVModelForCausalLM.from_pretrained(
            pretrained,
            # load_in_8bit and low_cpu_mem_usage are torch-loading options and are not
            # forwarded to the OpenVINO model:
            # load_in_8bit=load_in_8bit,
            # low_cpu_mem_usage=low_cpu_mem_usage,
            revision=revision,
            trust_remote_code=trust_remote_code,
            use_cache=True,  # request the decoder variant with KV-cache support
        )
        # The OpenVINO model is inference-only, so no .eval() call is made here.
        # self.gpt2.eval()

        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
            pretrained if tokenizer is None else tokenizer,
            revision=revision,
            trust_remote_code=trust_remote_code,
        )

        self.vocab_size = self.tokenizer.vocab_size

        # if isinstance(
        #     self.tokenizer, (transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast)
        # ):
        #     assert self.tokenizer.encode("hello\n\nhello") == [
        #         31373,
        #         198,
        #         198,
        #         31373,
        #     ], self.tokenizer.encode("hello\n\nhello")

        # setup for automatic batch size detection
        if batch_size == "auto":
            self.batch_size_per_gpu = batch_size
        else:
            self.batch_size_per_gpu = int(batch_size)

    @property
    def eot_token_id(self):
        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
        return self.tokenizer.eos_token_id

    @property
    def max_length(self):
        try:
            return self.gpt2.config.n_ctx
        except AttributeError:
            # gptneoconfig doesn't have n_ctx apparently
            return self.gpt2.config.max_position_embeddings

    @property
    def max_gen_toks(self):
        return 256

    @property
    def batch_size(self):
        # TODO: fix multi-gpu
        return self.batch_size_per_gpu  # * gpus

    @property
    def device(self):
        # TODO: fix multi-gpu
        return self._device

    def tok_encode(self, string: str):
        return self.tokenizer.encode(string, add_special_tokens=False)

    def tok_decode(self, tokens):
        return self.tokenizer.decode(tokens)

    def _model_call(self, inps):
        """
        inps: a torch tensor of shape [batch, sequence]
        the size of sequence may vary from call to call

        returns: a torch tensor of shape [batch, sequence, vocab] with the
        logits returned from the model
        """
        # No torch.no_grad() needed here: the OpenVINO runtime performs the inference, so
        # no torch autograd graph is built.
        # Pass an explicit all-ones attention mask (same shape as the input ids), matching
        # the default mask the PyTorch path gets when none is supplied.
        attention_mask = torch.ones_like(inps)
        return self.gpt2(inps, attention_mask=attention_mask)[0]

    def _model_generate(self, context, max_length, eos_token_id):
        generation_kwargs = {"do_sample": False, "max_length": max_length}
        if eos_token_id is not None:
            generation_kwargs["eos_token_id"] = eos_token_id
            generation_kwargs["pad_token_id"] = eos_token_id  # setting eos_token_id as pad token
        return self.gpt2.generate(context, **generation_kwargs)
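
# Illustrative usage (the path, prompt, and export command below are examples, not used by
# this file). `pretrained` must point at a model OVModelForCausalLM can load directly, e.g.
# a directory exported beforehand with `optimum-cli export openvino --model gpt2 <out_dir>`,
# since this wrapper does not pass export=True itself.
#
#     ov_lm = OPTIMUMLM(pretrained="./gpt2-openvino", device="cpu", batch_size=1)
#     context = torch.tensor([ov_lm.tok_encode("The quick brown fox")])
#     output = ov_lm._model_generate(context, max_length=16, eos_token_id=ov_lm.eot_token_id)
#     print(ov_lm.tok_decode(output[0].tolist()))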