"vscode:/vscode.git/clone" did not exist on "0375a63b548b77720c175152ba43c896b03f71b8"
Commit fbea4311 authored by jon-tow
Browse files

fix: remove tokenizer constraint in `gpt2`

parent 2d843472
...@@ -47,27 +47,27 @@ class HFLM(BaseLM): ...@@ -47,27 +47,27 @@ class HFLM(BaseLM):
revision=revision, revision=revision,
) )
assert isinstance( # assert isinstance(
self.tokenizer, # self.tokenizer,
( # (
transformers.GPT2Tokenizer, # transformers.GPT2Tokenizer,
transformers.GPT2TokenizerFast, # transformers.GPT2TokenizerFast,
transformers.T5Tokenizer, # transformers.T5Tokenizer,
transformers.T5TokenizerFast, # transformers.T5TokenizerFast,
), # ),
), "this tokenizer has not been checked for compatibility yet!" # ), "this tokenizer has not been checked for compatibility yet!"
self.vocab_size = self.tokenizer.vocab_size self.vocab_size = self.tokenizer.vocab_size
if isinstance( # if isinstance(
self.tokenizer, (transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast) # self.tokenizer, (transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast)
): # ):
assert self.tokenizer.encode("hello\n\nhello") == [ # assert self.tokenizer.encode("hello\n\nhello") == [
31373, # 31373,
198, # 198,
198, # 198,
31373, # 31373,
], self.tokenizer.encode("hello\n\nhello") # ], self.tokenizer.encode("hello\n\nhello")
# multithreading and batching # multithreading and batching
self.batch_size_per_gpu = batch_size # todo: adaptive batch size self.batch_size_per_gpu = batch_size # todo: adaptive batch size
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment