Commit fbea4311 authored by jon-tow

fix: remove tokenizer constraint in `gpt2`

parent 2d843472
@@ -47,27 +47,27 @@ class HFLM(BaseLM):
             revision=revision,
         )
-        assert isinstance(
-            self.tokenizer,
-            (
-                transformers.GPT2Tokenizer,
-                transformers.GPT2TokenizerFast,
-                transformers.T5Tokenizer,
-                transformers.T5TokenizerFast,
-            ),
-        ), "this tokenizer has not been checked for compatibility yet!"
+        # assert isinstance(
+        #     self.tokenizer,
+        #     (
+        #         transformers.GPT2Tokenizer,
+        #         transformers.GPT2TokenizerFast,
+        #         transformers.T5Tokenizer,
+        #         transformers.T5TokenizerFast,
+        #     ),
+        # ), "this tokenizer has not been checked for compatibility yet!"
         self.vocab_size = self.tokenizer.vocab_size
-        if isinstance(
-            self.tokenizer, (transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast)
-        ):
-            assert self.tokenizer.encode("hello\n\nhello") == [
-                31373,
-                198,
-                198,
-                31373,
-            ], self.tokenizer.encode("hello\n\nhello")
+        # if isinstance(
+        #     self.tokenizer, (transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast)
+        # ):
+        #     assert self.tokenizer.encode("hello\n\nhello") == [
+        #         31373,
+        #         198,
+        #         198,
+        #         31373,
+        #     ], self.tokenizer.encode("hello\n\nhello")
         # multithreading and batching
         self.batch_size_per_gpu = batch_size  # todo: adaptive batch size
......
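For context, the check commented out here asserted that a GPT-2-family tokenizer encodes "hello\n\nhello" to the token IDs [31373, 198, 198, 31373]. A minimal sketch of that sanity check in isolation, assuming the `transformers` package and the standard `gpt2` checkpoint are available:

from transformers import GPT2TokenizerFast

# Load the standard GPT-2 tokenizer (requires the "gpt2" files locally or via network).
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# This is the encoding the removed assertion verified:
# "hello" -> 31373, "\n" -> 198
ids = tokenizer.encode("hello\n\nhello")
assert ids == [31373, 198, 198, 31373], ids

With the isinstance and encoding constraints commented out, HFLM no longer rejects tokenizers outside the GPT-2/T5 families at construction time.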