gaoqiong / lm-evaluation-harness · Commits

Commit bfe69313
Authored Nov 21, 2023 by lintangsutawika

API update

Parent: 6ac42518
Showing 2 changed files with 39 additions and 318 deletions:

- lm_eval/models/openai_completions.py (+35, -316)
- tests/tests_master/test_models.py (+4, -2)
lm_eval/models/openai_completions.py (view file @ bfe69313)
```diff
@@ -6,59 +6,9 @@ from lm_eval import utils
 from lm_eval.api.model import LM
 from lm_eval.api.registry import register_model
+from openai import OpenAI
 
 
-def get_result(response: dict, ctxlen: int) -> Tuple[float, bool]:
-    """Process results from OpenAI API response.
-
-    :param response: dict
-        OpenAI API Response
-    :param ctxlen: int
-        Length of context (so we can slice them away and only keep the predictions)
-    :return:
-        continuation_logprobs: np.array
-            Log probabilities of continuation tokens
-        is_greedy: bool
-            whether argmax matches given continuation exactly
-    """
-    is_greedy = True
-    logprobs = response["logprobs"]["token_logprobs"]
-    continuation_logprobs = sum(logprobs[ctxlen:])
-
-    for i in range(ctxlen, len(response["logprobs"]["tokens"])):
-        token = response["logprobs"]["tokens"][i]
-        top_tokens = response["logprobs"]["top_logprobs"][i]
-        top_token = max(top_tokens.keys(), key=lambda x: top_tokens[x])
-        if top_token != token:
-            is_greedy = False
-            break
-
-    return continuation_logprobs, is_greedy
-
-
-def oa_completion(**kwargs):
-    """Query OpenAI API for completion.
-
-    Retry with back-off until they respond
-    """
-    try:
-        import openai, tiktoken  # noqa: E401
-    except ModuleNotFoundError:
-        raise Exception(
-            "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \
-please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`",
-        )
-
-    backoff_time = 3
-    while True:
-        try:
-            return openai.Completion.create(**kwargs)
-        except openai.error.OpenAIError:
-            import traceback
-
-            traceback.print_exc()
-            time.sleep(backoff_time)
-            backoff_time *= 1.5
+client = OpenAI()
 
 
 def oa_chat_completion(**kwargs):
     """Query OpenAI API for chat completion.
```
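The surrounding change tracks the openai-python 1.x migration: module-level calls such as `openai.Completion.create` give way to methods on a client instance created with `OpenAI()`. A minimal sketch of the two call styles, assuming an API key is available in the environment (the 1.x client reads `OPENAI_API_KEY` by default); the model names are placeholders:

```python
# Illustrative sketch (not part of the commit): the openai-python migration this
# commit tracks.

# pre-1.0 style (what oa_completion used):
#   import openai
#   openai.api_key = os.environ["OPENAI_API_SECRET_KEY"]
#   resp = openai.Completion.create(model="text-davinci-003", prompt="Hello", max_tokens=5)

# 1.x style (what this commit moves toward):
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment by default
resp = client.completions.create(
    model="gpt-3.5-turbo-instruct", prompt="Hello", max_tokens=5
)
print(resp.choices[0].text)
```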
```diff
@@ -76,8 +26,8 @@ please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`"
     backoff_time = 3
     while True:
         try:
-            return openai.ChatCompletion.create(**kwargs)
-        except openai.error.OpenAIError:
+            return client.chat.completions.create(**kwargs)
+        except openai.OpenAIError:
             import traceback
 
             traceback.print_exc()
```
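`oa_chat_completion` forwards its keyword arguments to `client.chat.completions.create` inside an unbounded retry loop: each `openai.OpenAIError` prints a traceback and then waits with a back-off that grows by 1.5x per failure (3 s, 4.5 s, 6.75 s, ...). A small usage sketch, assuming the module-level `client` defined above is configured:

```python
# Sketch only: exercising the retry wrapper shown above. The keyword arguments
# simply mirror client.chat.completions.create, since oa_chat_completion passes
# **kwargs through unchanged.
response = oa_chat_completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "ping"}],
    max_tokens=8,
)
print(response.choices[0].message.content)
```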
```diff
@@ -85,263 +35,17 @@ please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`"
             backoff_time *= 1.5
 
 
-@register_model("openai", "openai-completions", "gooseai")
-class OpenaiCompletionsLM(LM):
-    REQ_CHUNK_SIZE = 20
-
-    def __init__(
-        self,
-        engine: str = "text-davinci-003",
-        truncate: bool = False,
-        batch_size: int = 1,
-    ) -> None:
-        """
-        :param engine: str
-            OpenAI API engine (e.g. davinci)
-        :param truncate: bool
-            Truncate input if too long (if False and input is too long, throw error)
-        """
-        super().__init__()
-
-        try:
-            import openai, tiktoken  # noqa: E401
-        except ModuleNotFoundError:
-            raise Exception(
-                "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \
-please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`",
-            )
-
-        self.engine = engine
-        self.tokenizer = tiktoken.encoding_for_model(self.engine)
-        self.vocab_size = self.tokenizer.n_vocab
-        self.truncate = truncate
-        self.end_of_text_token_id = self.tokenizer.eot_token
-
-        # Read from environment variable OPENAI_API_SECRET_KEY
-        openai.api_key = os.environ["OPENAI_API_SECRET_KEY"]
-
-    @property
-    def eot_token_id(self):
-        return self.end_of_text_token_id
-
-    @property
-    def max_length(self) -> int:
-        # Note: the OpenAI API supports up to 2049 tokens, with the first token being the first input token
-        return 2048
-
-    @property
-    def max_gen_toks(self) -> int:
-        return 256
-
-    @property
-    def batch_size(self):
-        # Isn't used because we override _loglikelihood_tokens
-        raise NotImplementedError()
-
-    @property
-    def device(self):
-        # Isn't used because we override _loglikelihood_tokens
-        raise NotImplementedError()
-
-    def tok_encode(self, string: str) -> List[int]:
-        return self.tokenizer.encode(string)
-
-    def tok_decode(self, tokens: List[int]) -> str:
-        return self.tokenizer.decode(tokens)
-
-    def _encode_pair(
-        self, context: str, continuation: str
-    ) -> Tuple[List[int], List[int]]:
-        n_spaces = len(context) - len(context.rstrip())
-        if n_spaces > 0:
-            continuation = context[-n_spaces:] + continuation
-            context = context[:-n_spaces]
-        whole_enc = self.tok_encode(context + continuation)
-        context_enc = self.tok_encode(context)
-        context_enc_len = len(context_enc)
-        continuation_enc = whole_enc[context_enc_len:]
-        return context_enc, continuation_enc
```
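`_encode_pair` moves any trailing whitespace on the context onto the front of the continuation before tokenizing, so the context/continuation split is taken from the tokenization of the concatenated string rather than from two independent encodes. A standalone sketch of the same idea, assuming `tiktoken` is installed; the `cl100k_base` encoding is used here purely for illustration:

```python
# Illustrative sketch (not part of the commit): why _encode_pair strips trailing
# spaces from the context before encoding.
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")


def encode_pair(context: str, continuation: str):
    # Move trailing whitespace from the context onto the continuation so the
    # BPE merge at the boundary matches how the concatenated string tokenizes.
    n_spaces = len(context) - len(context.rstrip())
    if n_spaces > 0:
        continuation = context[-n_spaces:] + continuation
        context = context[:-n_spaces]
    whole = enc.encode(context + continuation)
    context_enc = enc.encode(context)
    return context_enc, whole[len(context_enc):]


ctx_enc, cont_enc = encode_pair("Question: 2+2= ", "4")
print(len(ctx_enc), len(cont_enc))  # continuation tokens start right after the context tokens
```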
```diff
-    def loglikelihood(self, requests) -> List[Tuple[float, bool]]:
-        new_reqs = []
-        for context, continuation in [req.args for req in requests]:
-            if context == "":
-                # end of text as context
-                context_enc, continuation_enc = [self.eot_token_id], self.tok_encode(
-                    continuation
-                )
-            else:
-                context_enc, continuation_enc = self._encode_pair(context, continuation)
-
-            new_reqs.append(((context, continuation), context_enc, continuation_enc))
-
-        return self._loglikelihood_tokens(new_reqs)
-
-    def _loglikelihood_tokens(
-        self, requests, disable_tqdm: bool = False
-    ) -> List[Tuple[float, bool]]:
-        res = []
-
-        def _collate(x):
-            # this doesn't efficiently handle last-token differences yet, but those are kinda annoying because
-            # it's not guaranteed that the 100 or so logprobs we get to see actually contain all the continuations
-            # we care about, and so we need some kind of backup for when it isn't
-            toks = x[1] + x[2]
-            return -len(toks), tuple(toks)
-
-        re_ord = utils.Reorderer(requests, _collate)
-
-        for chunk in tqdm(
-            list(utils.chunks(re_ord.get_reordered(), self.REQ_CHUNK_SIZE)),
-            disable=disable_tqdm,
-        ):
-            inps = []
-            ctxlens = []
-            for cache_key, context_enc, continuation_enc in chunk:
-                # max_length+1 because the API takes up to 2049 tokens, including the first context token
-                inp = (context_enc + continuation_enc)[-(self.max_length + 1) :]
-                # TODO: the logic is much simpler if we just look at the length of continuation tokens
-                ctxlen = len(context_enc) - max(
-                    0, len(context_enc) + len(continuation_enc) - (self.max_length + 1)
-                )
-
-                inps.append(inp)
-                ctxlens.append(ctxlen)
-
-            response = oa_completion(
-                engine=self.engine,
-                prompt=inps,
-                echo=True,
-                max_tokens=0,
-                temperature=0.0,
-                logprobs=10,
-            )
-
-            for resp, ctxlen, (cache_key, context_enc, continuation_enc) in zip(
-                response.choices, ctxlens, chunk
-            ):
-                answer = get_result(resp, ctxlen)
-
-                res.append(answer)
-
-                # partial caching
-                if cache_key is not None:
-                    self.cache_hook.add_partial("loglikelihood", cache_key, answer)
-
-        return re_ord.get_original(res)
```
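In `_loglikelihood_tokens`, the prompt sent to the API is the last `max_length + 1` tokens of context plus continuation, and `ctxlen` is reduced by however many tokens were cut from the front so that `get_result` still slices at the true context/continuation boundary. A small worked example of that arithmetic, with made-up lengths:

```python
# Illustrative sketch (not part of the commit): how ctxlen is adjusted when the
# prompt has to be truncated to the API window. The lengths are invented.
max_length = 2048        # window of max_length + 1 = 2049 tokens, echo included
context_len = 2000
continuation_len = 100   # 2100 tokens total, 51 over the window

overflow = max(0, context_len + continuation_len - (max_length + 1))
ctxlen = context_len - overflow
print(overflow, ctxlen)  # 51 1949: 51 tokens are dropped from the front, so only
                         # 1949 of the surviving tokens count as context; everything
                         # after index ctxlen is scored as continuation
```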
```diff
-    def generate_until(self, requests) -> List[str]:
-        if not requests:
-            return []
-        res = []
-        requests = [req.args for req in requests]
-
-        def _collate(x):
-            toks = self.tok_encode(x[0])
-            return len(toks), x[0]
-
-        re_ord = utils.Reorderer(requests, _collate)
-
-        def sameuntil_chunks(xs, size):
-            ret = []
-            lastuntil = xs[0][1]
-            for x in xs:
-                if len(ret) >= size or x[1] != lastuntil:
-                    yield ret, lastuntil
-                    ret = []
-                    lastuntil = x[1]
-                ret.append(x)
-
-            if ret:
-                yield ret, lastuntil
-
-        # todo: more intelligent batching for heterogeneous `until`
-        for chunk, request_args in tqdm(
-            list(sameuntil_chunks(re_ord.get_reordered(), self.REQ_CHUNK_SIZE))
-        ):
-            inps = []
-            for context, _ in chunk:
-                context_enc = self.tok_encode(context)
-                inp = context_enc[-(self.max_length - self.max_gen_toks) :]
-                inps.append(inp)
-
-            until = request_args.get("until", ["<|endoftext|>"])
-
-            response = oa_completion(
-                engine=self.engine,
-                prompt=inps,
-                max_tokens=self.max_gen_toks,
-                temperature=0.0,
-                logprobs=10,
-                stop=until,
-            )
-
-            for resp, (context, args_) in zip(response.choices, chunk):
-                s = resp["text"]
-
-                until_ = args_.get("until", ["<|endoftext|>"])
-
-                for term in until_:
-                    if len(term) > 0:
-                        s = s.split(term)[0]
-
-                # partial caching
-                self.cache_hook.add_partial(
-                    "generate_until", (context, {"until": until_}), s
-                )
-
-                res.append(s)
-
-        return re_ord.get_original(res)
-
-    def _model_call(self, inps):
-        # Isn't used because we override _loglikelihood_tokens
-        raise NotImplementedError()
-
-    def _model_generate(self, context, max_length, eos_token_id):
-        # Isn't used because we override generate_until
-        raise NotImplementedError()
```
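`sameuntil_chunks` batches consecutive requests only while they share the same `until` stop sequences, and also caps each batch at `REQ_CHUNK_SIZE`, since a single API call can carry only one `stop` value. A standalone sketch of the grouping behaviour with toy requests:

```python
# Illustrative sketch (not part of the commit): how sameuntil_chunks groups
# requests. Each request is a (context, until) pair; a new batch starts whenever
# the stop sequences change or the batch reaches `size` elements.
def sameuntil_chunks(xs, size):
    ret = []
    lastuntil = xs[0][1]
    for x in xs:
        if len(ret) >= size or x[1] != lastuntil:
            yield ret, lastuntil
            ret = []
            lastuntil = x[1]
        ret.append(x)
    if ret:
        yield ret, lastuntil


reqs = [("a", ["\n"]), ("b", ["\n"]), ("c", ["###"]), ("d", ["###"]), ("e", ["###"])]
for batch, until in sameuntil_chunks(reqs, size=2):
    print([ctx for ctx, _ in batch], until)
# ['a', 'b'] ['\n']
# ['c', 'd'] ['###']
# ['e'] ['###']
```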
```diff
-    def loglikelihood_rolling(self, requests) -> List[float]:
-        loglikelihoods = []
-
-        for (string,) in tqdm([req.args for req in requests]):
-            rolling_token_windows = list(
-                map(
-                    utils.make_disjoint_window,
-                    utils.get_rolling_token_windows(
-                        token_list=self.tok_encode(string),
-                        prefix_token=self.eot_token_id,
-                        max_seq_len=self.max_length,
-                        context_len=1,
-                    ),
-                )
-            )
-
-            # TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case
-            rolling_token_windows = [(None,) + x for x in rolling_token_windows]
-
-            string_nll = self._loglikelihood_tokens(
-                rolling_token_windows,
-                disable_tqdm=True,
-            )
-
-            # discard is_greedy
-            string_nll = [x[0] for x in string_nll]
-
-            string_nll = sum(string_nll)
-            loglikelihoods.append(string_nll)
-
-        return loglikelihoods
```
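`loglikelihood_rolling` scores an entire string by cutting its tokens into consecutive windows of at most `max_length` tokens, each conditioned on a single token of left context (the EOT token for the first window), and summing the per-window log-likelihoods. The sketch below is an illustrative reimplementation of that windowing idea only; it is not the harness's `utils.get_rolling_token_windows` / `utils.make_disjoint_window` pair:

```python
# Illustrative sketch (not part of the commit, and not the harness's own utility):
# the idea behind rolling windows with context_len=1. Each token is predicted
# exactly once; each window is conditioned on one token of left context.
def rolling_windows(tokens, prefix_token, max_seq_len):
    windows = []
    start = 0
    while start < len(tokens):
        end = min(start + max_seq_len, len(tokens))
        context = [prefix_token] if start == 0 else [tokens[start - 1]]
        windows.append((context, tokens[start:end]))  # (context, tokens to score)
        start = end
    return windows


print(rolling_windows(list(range(10)), prefix_token=-1, max_seq_len=4))
# [([-1], [0, 1, 2, 3]), ([3], [4, 5, 6, 7]), ([7], [8, 9])]
```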
```diff
 @register_model("openai-chat-completions")
 class OpenaiChatCompletionsLM(LM):
     REQ_CHUNK_SIZE = 20
 
     def __init__(
-        self,
-        engine: str = "gpt-3.5-turbo",
-        truncate: bool = False,
-        batch_size: int = 1
+        self,
+        model: str = "gpt-3.5-turbo",
+        truncate: bool = False,
+        batch_size: int = 1
     ) -> None:
         """
-        :param engine: str
-            OpenAI API engine (e.g. gpt-3.5-turbo)
+        :param model: str
+            OpenAI API model (e.g. gpt-3.5-turbo)
         :param truncate: bool
             Truncate input if too long (if False and input is too long, throw error)
         """
```
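After the rename, the chat model is selected with a `model` argument rather than `engine`. A minimal construction sketch, assuming `OPENAI_API_SECRET_KEY` is set in the environment as the class expects:

```python
# Sketch only: constructing the chat-completions LM with the renamed argument.
from lm_eval.models.openai_completions import OpenaiChatCompletionsLM

lm = OpenaiChatCompletionsLM(model="gpt-3.5-turbo", truncate=False)
```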
```diff
@@ -353,14 +57,20 @@ class OpenaiChatCompletionsLM(LM):
                 "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \
 please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`",
             )
 
-        self.engine = engine
-        self.tokenizer = tiktoken.encoding_for_model(self.engine)
+        self.model = model
+        self.frequency_penalty = 0
+        self.logit_bias = None
+        self.n = 1
+        self.presence_penalty = 0
+        self.temperature = 1
+        self.top_p = 1
+        self.tokenizer = tiktoken.encoding_for_model(self.model)
         self.vocab_size = self.tokenizer.n_vocab
         self.truncate = truncate
         self.end_of_text_token_id = self.tokenizer.eot_token
 
         # Read from environment variable OPENAI_API_SECRET_KEY
         openai.api_key = os.environ["OPENAI_API_SECRET_KEY"]
 
     @property
     def eot_token_id(self):
```
```diff
@@ -435,25 +145,34 @@ class OpenaiChatCompletionsLM(LM):
         ):
             inps = []
             for context, _ in chunk:
-                context_enc = self.tok_encode(context)
-                inp = context_enc[-(self.max_length - self.max_gen_toks):]
-                inps.append({"role": "user", "content": inp})
+                # context_enc = self.tok_encode(context)
+                # inp = context_enc[-(self.max_length - self.max_gen_toks):]
+                inps.append({"role": "user", "content": context})
 
-            until = request_args.get("until", ["<|endoftext|>"])
+            # until = request_args.get("until", ["<|endoftext|>"])
+            until = request_args.get("until", None)
 
             response = oa_chat_completion(
-                engine=self.engine,
-                prompt=inps,
+                messages=inps,
+                model=self.model,
+                frequency_penalty=self.frequency_penalty,
+                # logit_bias=self.logit_bias,
                 max_tokens=self.max_gen_toks,
-                temperature=0.0,
-                logprobs=10,
-                stop=until,
+                n=self.n,
+                presence_penalty=self.presence_penalty,
+                temperature=self.temperature,
+                top_p=self.top_p,
+                # stop=until,
             )
 
             for resp, (context, args_) in zip(response.choices, chunk):
+                print(resp)
+                import sys; sys.exit()
                 s = resp["text"]
 
-                until_ = args_.get("until", ["<|endoftext|>"])
+                # until_ = args_.get("until", ["<|endoftext|>"])
+                until_ = args_.get("until", "null")
 
                 for term in until_:
                     if len(term) > 0:
```
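The 1.x client returns a typed `ChatCompletion` object rather than a dict, so generated text is read from `choices[n].message.content` (the snippet above still indexes `resp["text"]`, which is the legacy dict style). A minimal sketch of reading a 1.x chat response, assuming `OPENAI_API_KEY` is configured:

```python
# Illustrative sketch (not part of the commit): the shape of a v1 ChatCompletion
# result returned by client.chat.completions.create.
from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Say hello."}],
    max_tokens=16,
)
choice = response.choices[0]
print(choice.message.content)  # generated text: attribute access, not resp["text"]
print(choice.finish_reason)    # e.g. "stop" when a stop sequence was hit
print(response.model_dump())   # dict view of the typed response object, if needed
```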
tests/tests_master/test_models.py (view file @ bfe69313)
```diff
 import hashlib
 import json
-import openai
+from openai import OpenAI
+
+client = OpenAI()
 import os
 import pickle
 import pytest
```
```diff
@@ -172,7 +174,7 @@ def openai_mock_completion(**kwargs):
     if os.path.exists(fname):
         with open(fname, "rb") as fh:
             return pickle.load(fh)
-    ret = openai.Completion.create(**kwargs)
+    ret = client.completions.create(**kwargs)
     ret.api_key = ""
     with open(fname, "wb") as fh:
         pickle.dump(ret, fh)
```
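`openai_mock_completion` replays a pickled response from disk when one exists and only hits the API on a cache miss, persisting the result for later runs. The filename construction sits outside this hunk; the hash-of-request naming in the sketch below is an assumption suggested by the `hashlib`/`json` imports, and `cached_completion` itself is a hypothetical helper, not the repository's:

```python
# Illustrative sketch (not part of the commit): a response cache in the spirit of
# openai_mock_completion. The hash-based filename is an assumption, and `client`
# is an openai.OpenAI() instance as in the test module.
import hashlib
import json
import os
import pickle


def cached_completion(client, cache_dir=".", **kwargs):
    key = hashlib.sha256(json.dumps(kwargs, sort_keys=True).encode()).hexdigest()
    fname = os.path.join(cache_dir, f"completion_{key}.pkl")
    if os.path.exists(fname):
        with open(fname, "rb") as fh:
            return pickle.load(fh)  # cache hit: no API call
    ret = client.completions.create(**kwargs)
    with open(fname, "wb") as fh:
        pickle.dump(ret, fh)  # cache miss: persist the live response for replay
    return ret
```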