Commit 77b44470 authored by Leo Gao
Browse files

Add gpt2/3 tokenizer sanity check

parent 7d5aa3f7
......@@ -305,6 +305,8 @@ class Request:
def __eq__(self, other):
    """Return whether ``other`` matches this request field by field.

    Two requests compare equal when their ``type``, ``args`` and ``index``
    attributes are all equal.

    Returns ``NotImplemented`` when ``other`` is missing any of those
    attributes (e.g. ``None`` or an unrelated object), so that Python can
    try the reflected comparison and fall back to ``False`` instead of
    raising ``AttributeError``.
    """
    # Duck-typed guard: anything exposing the three fields is still
    # comparable, exactly as before; everything else is "not comparable".
    if not all(hasattr(other, field) for field in ("type", "args", "index")):
        return NotImplemented
    return (
        self.type == other.type
        and self.args == other.args
        and self.index == other.index
    )
def __repr__(self):
    """Render the request as ``Req_<type><args>[<index>]`` plus a newline."""
    # The trailing newline is part of the representation and is kept as-is.
    template = "Req_{}{}[{}]\n"
    return template.format(self.type, self.args, self.index)
class RequestFactory:
def __getattr__(self, attr):
......
......@@ -16,6 +16,8 @@ class GPT2LM(LM):
self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained(pretrained)
self.tokenizer.pad_token = "<|endoftext|>"
assert self.tokenizer.encode('hello\n\nhello') == [31373, 198, 198, 31373]
@classmethod
def create_from_arg_string(cls, arg_string):
args = utils.simple_parse_args_string(arg_string)
......
......@@ -52,8 +52,10 @@ class GPT3LM(LM):
self.engine = engine
self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2')
# to make the annoying "Using pad_token, but it is not set yet." error go away
self.tokenizer.pad_token = "<|endoftext|>"
assert self.tokenizer.encode('hello\n\nhello') == [31373, 198, 198, 31373]
self.truncate = truncate
# Read from environment variable OPENAI_API_SECRET_KEY
......@@ -115,8 +117,12 @@ class GPT3LM(LM):
logprobs=10,
stop=until
)
s = response.choices[0]['text']
for term in until:
s = s.split(term)[0]
res.append(response.choices[0]['text'])
res.append(s)
return res
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment