import torch
import transformers
from typing import Optional, Union
from lm_eval.base import BaseLM
import optimum
from optimum.intel.openvino import OVModelForCausalLM


def _get_dtype(dtype: Union[str, torch.dtype]) -> Union[str, torch.dtype]:
    """Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig"""
    if isinstance(dtype, str) and dtype != "auto":
        # Convert `str` args torch dtype: `float16` -> `torch.float16`
        _torch_dtype = getattr(torch, dtype)
    else:
        _torch_dtype = dtype
    return _torch_dtype
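
# Illustrative sketch (not part of the original source): how `_get_dtype` resolves
# its argument. The string "auto" is passed through unchanged so callers can
# forward it directly to `from_pretrained(..., torch_dtype="auto")`.
#
#   _get_dtype("float16")     -> torch.float16
#   _get_dtype("bfloat16")    -> torch.bfloat16
#   _get_dtype("auto")        -> "auto"
#   _get_dtype(torch.float32) -> torch.float32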


class HFLM(BaseLM):

    _DEFAULT_MAX_LENGTH = 2048

    def __init__(
        self,
        device="cuda",
        pretrained="gpt2",
        revision="main",
        low_cpu_mem_usage=None,
        subfolder=None,
        tokenizer=None,
        batch_size=1,
        max_batch_size=512,
        max_length=None,
        load_in_8bit: Optional[bool] = False,
        trust_remote_code: Optional[bool] = False,
        dtype: Optional[Union[str, torch.dtype]] = "auto",
    ):
        super().__init__()

        # Initialize model
        if isinstance(pretrained, transformers.PreTrainedModel):
            self.model = pretrained
            self._device = self.model.device

            if tokenizer:
                assert isinstance(
                    tokenizer, transformers.PreTrainedTokenizer
                ) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
                self.tokenizer = tokenizer
            else:
                # Get tokenizer
                model_name = self.model.name_or_path
                self.tokenizer = transformers.AutoTokenizer.from_pretrained(
                    model_name,
                    revision=revision,
                    trust_remote_code=trust_remote_code,
                )

        elif isinstance(pretrained, str):

            # Initialize device
            assert isinstance(device, str)
            device_list = set(
                ["cuda", "cpu"]
                + [f"cuda:{i}" for i in range(torch.cuda.device_count())]
            )
            if device and device in device_list:
                self._device = torch.device(device)
                print(f"Using device '{device}'")
            else:
                print("Device not specified")
                print(f"Cuda Available? {torch.cuda.is_available()}")
                self._device = (
                    torch.device("cuda")
                    if torch.cuda.is_available()
                    else torch.device("cpu")
                )
            revision = revision + ("/" + subfolder if subfolder is not None else "")

            # Initialize new model and tokenizer instances
            self.model = transformers.AutoModelForCausalLM.from_pretrained(
                pretrained,
                load_in_8bit=load_in_8bit,
                low_cpu_mem_usage=low_cpu_mem_usage,
                revision=revision,
                torch_dtype=_get_dtype(dtype),
                trust_remote_code=trust_remote_code,
            ).to(self.device)
            self.tokenizer = transformers.AutoTokenizer.from_pretrained(
                tokenizer if tokenizer else pretrained,
                revision=revision,
                trust_remote_code=trust_remote_code,
            )

        else:
            raise TypeError(
                "Parameter pretrained should be of type str or transformers.PreTrainedModel"
            )

        self.model.eval()

        self.vocab_size = self.tokenizer.vocab_size

        # Validate batch_size
        assert isinstance(batch_size, (int, str))

        # setup for automatic batch size detection
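        # batch_size may be an int, the string "auto", or "auto:<N>"; the optional
        # <N> suffix is stored as self.batch_schedule.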
        if str(batch_size).startswith("auto"):
            batch_size = batch_size.split(":")
            self.batch_size_per_gpu = batch_size[0]
            self.batch_schedule = float(batch_size[1]) if len(batch_size) > 1 else 1
        else:
            self.batch_size_per_gpu = int(batch_size)
        self.max_batch_size = max_batch_size

        self._max_length = max_length

    @property
    def eot_token_id(self):
        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
        return self.tokenizer.eos_token_id

    @property
    def max_length(self):
        if self._max_length:  # if max length manually set, return it
            return self._max_length
        seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
        for attr in seqlen_config_attrs:
            if hasattr(self.model.config, attr):
                return getattr(self.model.config, attr)
        if hasattr(self.tokenizer, "model_max_length"):
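            # transformers sets model_max_length to a huge sentinel (int(1e30))
            # when the tokenizer config does not specify a maximum length.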
            if self.tokenizer.model_max_length == 1000000000000000019884624838656:
                return self._DEFAULT_MAX_LENGTH
            return self.tokenizer.model_max_length
        return self._DEFAULT_MAX_LENGTH

    @property
    def max_gen_toks(self):
        return 256

    @property
    def batch_size(self):
        # TODO: fix multi-gpu
        return self.batch_size_per_gpu  # * gpus

    @property
    def device(self):
        # TODO: fix multi-gpu
        return self._device

    def tok_encode(self, string: str):
        return self.tokenizer.encode(string, add_special_tokens=False)

    def tok_decode(self, tokens):
        return self.tokenizer.decode(tokens)

    def _model_call(self, inps):
        """
        inps: a torch tensor of shape [batch, sequence]
        the size of sequence may vary from call to call

        returns: a torch tensor of shape [batch, sequence, vocab] with the
        logits returned from the model
        """
        with torch.no_grad():
            return self.model(inps)[0]

    def _model_generate(self, context, max_length, eos_token_id):
        generation_kwargs = {"do_sample": False, "max_length": max_length}
        if eos_token_id is not None:
            generation_kwargs["eos_token_id"] = eos_token_id
            generation_kwargs[
                "pad_token_id"
            ] = eos_token_id  # setting eos_token_id as pad token
        return self.model.generate(context, **generation_kwargs)


# for backwards compatibility
GPT2LM = HFLM

class OPTIMUMLM(BaseLM):
    def __init__(
        self,
        device="cpu",
        pretrained="gpt2",
        revision="main",
        low_cpu_mem_usage=None,
        subfolder=None,
        tokenizer=None,
        batch_size=1,
        load_in_8bit: Optional[bool] = False,
        trust_remote_code: Optional[bool] = False,
    ):
        super().__init__()

        assert isinstance(device, str)
        assert isinstance(pretrained, str)
        assert isinstance(batch_size, (int, str))

        device_list = set(
            ["cuda", "cpu"] + [f"cuda:{i}" for i in range(torch.cuda.device_count())]
        )
        if device and device in device_list:
            self._device = torch.device(device)
            print(f"Using device '{device}'")
        else:
            print("Device not specified")
            print(f"Cuda Available? {torch.cuda.is_available()}")
            self._device = (
                torch.device("cuda")
                if torch.cuda.is_available()
                else torch.device("cpu")
            )

        # TODO: update this to be less of a hack once subfolder is fixed in HF
        revision = revision + ("/" + subfolder if subfolder is not None else "")

        self.gpt2 = OVModelForCausalLM.from_pretrained(
            pretrained,
            load_in_8bit=load_in_8bit,
            revision=revision,
            trust_remote_code=trust_remote_code,
            use_cache=True,
        )

        try:
            self.tokenizer = transformers.AutoTokenizer.from_pretrained(
                pretrained if tokenizer is None else tokenizer,
                revision=revision,
                trust_remote_code=trust_remote_code,
            )
        except Exception:
            print("Tokenizer is missing. Please save it in the same folder as the model.")
            raise

        self.vocab_size = self.tokenizer.vocab_size

        # setup for automatic batch size detection
        if batch_size == 'auto':
            self.batch_size_per_gpu = batch_size
        else:
            self.batch_size_per_gpu = int(batch_size)

    @property
    def eot_token_id(self):
        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
        return self.tokenizer.eos_token_id

    @property
    def max_length(self):
        try:
            return self.gpt2.config.n_ctx
        except AttributeError:
            # gptneoconfig doesn't have n_ctx apparently
            return self.gpt2.config.max_position_embeddings

    @property
    def max_gen_toks(self):
        return 256

    @property
    def batch_size(self):
        # TODO: fix multi-gpu
        return self.batch_size_per_gpu  # * gpus

    @property
    def device(self):
        # TODO: fix multi-gpu
        return self._device

    def tok_encode(self, string: str):
        return self.tokenizer.encode(string, add_special_tokens=False)

    def tok_decode(self, tokens):
        return self.tokenizer.decode(tokens)

    def _model_call(self, inps):
        """
        inps: a torch tensor of shape [batch, sequence]
        the size of sequence may vary from call to call

        returns: a torch tensor of shape [batch, sequence, vocab] with the
        logits returned from the model
        """
        # OpenVINO inference runs outside torch autograd, so no_grad is not needed.
        # Build an all-ones attention mask with the same shape and dtype as the inputs.
        attention_mask = torch.ones_like(inps)
        return self.gpt2(inps, attention_mask)[0]

    def _model_generate(self, context, max_length, eos_token_id):
        generation_kwargs = {'do_sample': False, 'max_length': max_length}
        if eos_token_id is not None:
            generation_kwargs['eos_token_id'] = eos_token_id
            generation_kwargs['pad_token_id'] = eos_token_id # setting eos_token_id as pad token
        return self.gpt2.generate(context, **generation_kwargs)
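

# Minimal usage sketch (illustrative only, not part of the original module).
# The model identifiers below ("gpt2" from the HF Hub and a hypothetical local
# OpenVINO export directory) are assumptions for the example.
if __name__ == "__main__":
    # HuggingFace backend: load gpt2 on CPU in float32 and tokenize a prompt.
    hf_lm = HFLM(pretrained="gpt2", device="cpu", dtype="float32")
    print(hf_lm.tok_encode("Hello world"))

    # OpenVINO backend: expects a directory containing an exported OpenVINO model
    # (for example one produced with `optimum-cli export openvino`).
    # ov_lm = OPTIMUMLM(pretrained="path/to/ov-gpt2", device="cpu")
    # print(ov_lm.tok_decode(ov_lm.tok_encode("Hello world")))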