import torch
import transformers
from typing import Optional, Union
from lm_eval.base import BaseLM


def _get_dtype(dtype: Union[str, torch.dtype]) -> torch.dtype:
    """Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig"""
    if isinstance(dtype, str) and dtype != "auto":
        # Convert `str` args torch dtype: `float16` -> `torch.float16`
        _torch_dtype = getattr(torch, dtype)
    else:
        _torch_dtype = dtype
    return _torch_dtype
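
# Illustrative behavior of _get_dtype (informal note, not part of the original
# module):
#   _get_dtype("float16")      -> torch.float16
#   _get_dtype("auto")         -> "auto" (passed through for HF to resolve)
#   _get_dtype(torch.bfloat16) -> torch.bfloat16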


class HFLM(BaseLM):

    _DEFAULT_MAX_LENGTH = 2048

    def __init__(
        self,
        device="cuda",
        pretrained="gpt2",
        revision="main",
        low_cpu_mem_usage=None,
        subfolder=None,
        tokenizer=None,
        batch_size=1,
        max_batch_size=512,
        max_length=None,
        load_in_8bit: Optional[bool] = False,
        trust_remote_code: Optional[bool] = False,
        dtype: Optional[Union[str, torch.dtype]] = "auto",
    ):
        super().__init__()

        # Initialize model
        if isinstance(pretrained, transformers.PreTrainedModel):
            self.model = pretrained
            self._device = self.model.device

            if tokenizer:
                assert isinstance(
                    tokenizer, transformers.PreTrainedTokenizer
                ) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
                self.tokenizer = tokenizer
            else:
                # Get tokenizer
                model_name = self.model.name_or_path
                self.tokenizer = transformers.AutoTokenizer.from_pretrained(
                    model_name,
                    revision=revision,
                    trust_remote_code=trust_remote_code,
                )

        elif isinstance(pretrained, str):

            # Initialize device
            assert isinstance(device, str)
            device_list = set(
                ["cuda", "cpu"]
                + [f"cuda:{i}" for i in range(torch.cuda.device_count())]
            )
            if device and device in device_list:
                self._device = torch.device(device)
                print(f"Using device '{device}'")
            else:
                print("Device not specified")
                print(f"Cuda Available? {torch.cuda.is_available()}")
                self._device = (
                    torch.device("cuda")
                    if torch.cuda.is_available()
                    else torch.device("cpu")
                )
            revision = revision + ("/" + subfolder if subfolder is not None else "")

            # Initialize new model and tokenizer instances
            self.model = transformers.AutoModelForCausalLM.from_pretrained(
                pretrained,
                load_in_8bit=load_in_8bit,
                low_cpu_mem_usage=low_cpu_mem_usage,
                revision=revision,
                torch_dtype=_get_dtype(dtype),
                trust_remote_code=trust_remote_code,
            ).to(self.device)
            self.tokenizer = transformers.AutoTokenizer.from_pretrained(
                tokenizer if tokenizer else pretrained,
                revision=revision,
                trust_remote_code=trust_remote_code,
            )

        else:
            raise TypeError(
                "Parameter pretrained should be of type str or transformers.PreTrainedModel"
            )

        self.model.eval()

        self.vocab_size = self.tokenizer.vocab_size

        # Validate batch_size
        assert isinstance(batch_size, (int, str))

        # setup for automatic batch size detection
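        # batch_size may be "auto" or "auto:<N>" (e.g. "auto:4"); the optional
        # number is stored as self.batch_schedule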
        if str(batch_size).startswith("auto"):
            batch_size = batch_size.split(":")
            self.batch_size_per_gpu = batch_size[0]
            self.batch_schedule = float(batch_size[1]) if len(batch_size) > 1 else 1
        else:
            self.batch_size_per_gpu = int(batch_size)
        self.max_batch_size = max_batch_size

        self._max_length = max_length

    @property
    def eot_token_id(self):
        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
        return self.tokenizer.eos_token_id

    @property
    def max_length(self):
        if self._max_length:  # if max length manually set, return it
            return self._max_length
        seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
        for attr in seqlen_config_attrs:
            if hasattr(self.model.config, attr):
                return getattr(self.model.config, attr)
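        # transformers sets model_max_length to int(1e30) (the value below)
        # when the tokenizer has no explicit limit, so fall back to the default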
        if hasattr(self.tokenizer, "model_max_length"):
            if self.tokenizer.model_max_length == 1000000000000000019884624838656:
                return self._DEFAULT_MAX_LENGTH
            return self.tokenizer.model_max_length
        return self._DEFAULT_MAX_LENGTH

    @property
    def max_gen_toks(self):
        return 256

    @property
    def batch_size(self):
        # TODO: fix multi-gpu
        return self.batch_size_per_gpu  # * gpus

    @property
    def device(self):
        # TODO: fix multi-gpu
        return self._device

    def tok_encode(self, string: str):
        return self.tokenizer.encode(string, add_special_tokens=False)

    def tok_decode(self, tokens):
        return self.tokenizer.decode(tokens)

    def _model_call(self, inps):
        """
        inps: a torch tensor of shape [batch, sequence]
        the size of sequence may vary from call to call

        returns: a torch tensor of shape [batch, sequence, vocab] with the
        logits returned from the model
        """
        with torch.no_grad():
            return self.model(inps)[0]

    def _model_generate(self, context, max_length, eos_token_id):
        generation_kwargs = {"do_sample": False, "max_length": max_length}
        if eos_token_id is not None:
            generation_kwargs["eos_token_id"] = eos_token_id
            generation_kwargs[
                "pad_token_id"
            ] = eos_token_id  # setting eos_token_id as pad token
        return self.model.generate(context, **generation_kwargs)


# for backwards compatibility
GPT2LM = HFLM
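
# Minimal usage sketch (illustrative only; assumes the public "gpt2" checkpoint
# can be downloaded and that lm_eval is installed):
#
#   lm = HFLM(pretrained="gpt2", device="cpu", batch_size=4)
#   ids = lm.tok_encode("The quick brown fox")
#   logits = lm._model_call(torch.tensor([ids], device=lm.device))
#   # logits has shape [1, len(ids), vocab] and can be sliced for scoring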

class OPTIMUMLM(BaseLM):
    def __init__(
        self,
        device="cpu",
        pretrained="gpt2",
        revision="main",
        low_cpu_mem_usage=None,
        subfolder=None,
        tokenizer=None,
        batch_size=1,
        load_in_8bit: Optional[bool] = False,
        trust_remote_code: Optional[bool] = False,
    ):
        super().__init__()

        import optimum
        from optimum.intel.openvino import OVModelForCausalLM

        assert isinstance(device, str)
        assert isinstance(pretrained, str)
        assert isinstance(batch_size, (int, str))

        device_list = set(
            ["cuda", "cpu"] + [f"cuda:{i}" for i in range(torch.cuda.device_count())]
        )
        if device and device in device_list:
            self._device = torch.device(device)
            print(f"Using device '{device}'")
        else:
            print("Device not specified")
            print(f"Cuda Available? {torch.cuda.is_available()}")
            self._device = (
                torch.device("cuda")
                if torch.cuda.is_available()
                else torch.device("cpu")
            )

        # TODO: update this to be less of a hack once subfolder is fixed in HF
        revision = revision + ("/" + subfolder if subfolder is not None else "")

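        # OpenVINO runtime options: optimize for latency, use a single
        # inference stream, and disable the on-disk model cache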
        ov_config = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""}

        self.gpt2 = OVModelForCausalLM.from_pretrained(
            pretrained,
            load_in_8bit=load_in_8bit,
            revision=revision,
            trust_remote_code=trust_remote_code,
            use_cache=True,
            ov_config=ov_config
        )

        try:
            self.tokenizer = transformers.AutoTokenizer.from_pretrained(
                pretrained if tokenizer is None else tokenizer,
                revision=revision,
                trust_remote_code=trust_remote_code,
            )
        except Exception:
            print(
                "Tokenizer is missing. Please save it in the same folder as the model."
            )
            raise

        self.vocab_size = self.tokenizer.vocab_size

        # setup for automatic batch size detection
        if batch_size == "auto":
            self.batch_size_per_gpu = batch_size
        else:
            self.batch_size_per_gpu = int(batch_size)

    @property
    def eot_token_id(self):
        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
        return self.tokenizer.eos_token_id

    @property
    def max_length(self):
        try:
            return self.gpt2.config.n_ctx
        except AttributeError:
            # gptneoconfig doesn't have n_ctx apparently
            return self.gpt2.config.max_position_embeddings

    @property
    def max_gen_toks(self):
        return 256

    @property
    def batch_size(self):
        # TODO: fix multi-gpu
        return self.batch_size_per_gpu  # * gpus

    @property
    def device(self):
        # TODO: fix multi-gpu
        return self._device

    def tok_encode(self, string: str):
        return self.tokenizer.encode(string, add_special_tokens=False)

    def tok_decode(self, tokens):
        return self.tokenizer.decode(tokens)

    def _model_call(self, inps):
        """
        inps: a torch tensor of shape [batch, sequence]
        the size of sequence may vary from call to call

        returns: a torch tensor of shape [batch, sequence, vocab] with the
        logits returned from the model
        """
        return self.gpt2(inps)[0]

    def _model_generate(self, context, max_length, eos_token_id):
        generation_kwargs = {"do_sample": False, "max_length": max_length}
        if eos_token_id is not None:
            generation_kwargs["eos_token_id"] = eos_token_id
            generation_kwargs[
                "pad_token_id"
            ] = eos_token_id  # setting eos_token_id as pad token
        return self.gpt2.generate(context, **generation_kwargs)
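

# Minimal usage sketch for the OpenVINO path (illustrative only; assumes a
# local directory with a model exported by optimum-intel, e.g. via
# `optimum-cli export openvino --model gpt2 ov_gpt2`):
#
#   lm = OPTIMUMLM(pretrained="ov_gpt2", device="cpu", batch_size=1)
#   context = torch.tensor([lm.tok_encode("Hello")])
#   out = lm._model_generate(context, max_length=16, eos_token_id=lm.eot_token_id)
#   print(lm.tok_decode(out[0]))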