gaoqiong / lm-evaluation-harness · Commits

Commit e58b8182, authored Aug 08, 2024 by lintangsutawika

    resolved merge conflict

Parents: d213a533, 0571eeb1

Showing 5 changed files with 166 additions and 6 deletions.
Changed files:

* lm_eval/tasks/truthfulqa/README.md (+2, -2)
* lm_eval/utils.py (+10, -0)
* pyproject.toml (+2, -2)
* scripts/model_comparator.py (+3, -2)
* tests/models/test_api.py (+149, -0)
lm_eval/tasks/truthfulqa/README.md

@@ -36,8 +36,8 @@ Homepage: `https://github.com/sylinrl/TruthfulQA`
 #### Tasks

 * `truthfulqa_mc1`: `Multiple-choice, single answer`
-* (MISSING) `truthfulqa_mc2`: `Multiple-choice, multiple answers`
-* (MISSING) `truthfulqa_gen`: `Answer generation`
+* `truthfulqa_mc2`: `Multiple-choice, multiple answers`
+* `truthfulqa_gen`: `Answer generation`

 ### Checklist
lm_eval/utils.py

@@ -487,3 +487,13 @@ def create_iterator(raw_iterator, *, rank=0, world_size=1, limit=None):
     among ranks in multigpu setting or only pulling a sample of documents
     """
     return islice(raw_iterator, rank, limit, world_size)
+
+
+def weighted_f1_score(items):
+    from sklearn.metrics import f1_score
+
+    unzipped_list = list(zip(*items))
+    golds = unzipped_list[0]
+    preds = unzipped_list[1]
+    fscore = f1_score(golds, preds, average="weighted")
+    return fscore
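The new `weighted_f1_score` aggregation treats `items` as a sequence of `(gold, pred)` pairs and hands the unzipped columns to `sklearn.metrics.f1_score` with support-weighted averaging. A minimal standalone usage sketch follows; the label values are invented for illustration and are not taken from the commit:

```python
# Hypothetical (gold, pred) pairs, e.g. collected per document by a task;
# the values below are invented purely for this example.
from sklearn.metrics import f1_score

items = [(0, 0), (1, 1), (2, 1), (1, 1), (0, 2)]

golds, preds = zip(*items)
score = f1_score(golds, preds, average="weighted")  # per-class F1, weighted by class support
print(round(score, 3))
```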
pyproject.toml

@@ -57,7 +57,7 @@ Homepage = "https://github.com/EleutherAI/lm-evaluation-harness"
Repository = "https://github.com/EleutherAI/lm-evaluation-harness"

[project.optional-dependencies]
anthropic = ["anthropic"]
api = ["requests", "aiohttp", "tenacity", "tqdm", "tiktoken"]
dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy"]
deepsparse = ["deepsparse-nightly[llm]>=1.8.0.20240404"]
gptq = ["auto-gptq[triton]>=0.6.0"]

@@ -67,7 +67,6 @@ neuronx = ["optimum[neuronx]"]
mamba = ["mamba_ssm", "causal-conv1d==1.0.2"]
math = ["sympy>=1.12", "antlr4-python3-runtime==4.11"]
multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
openai = ["openai==1.3.9", "tiktoken"]
optimum = ["optimum[openvino]"]
promptsource = ["promptsource>=0.2.3"]
sentencepiece = ["sentencepiece>=0.1.98"]

@@ -105,3 +104,4 @@ known-first-party = ["lm_eval"]
[tool.ruff.lint.extend-per-file-ignores]
"__init__.py" = ["F401","F402","F403"]
"utils.py" = ["F401"]
scripts/model_comparator.py

@@ -4,7 +4,6 @@ from typing import Dict, List, Tuple

 import numpy as np
 import pandas as pd
-import scipy.stats
 import torch

 import lm_eval.evaluator

@@ -23,11 +22,13 @@ def memory_stats():
 def calculate_z_value(res1: Dict, res2: Dict) -> Tuple[float, float]:
+    from scipy.stats.norm import sf
+
     acc1, acc2 = res1["acc,none"], res2["acc,none"]
     st_err1, st_err2 = res1["acc_stderr,none"], res2["acc_stderr,none"]
     Z = (acc1 - acc2) / np.sqrt((st_err1**2) + (st_err2**2))
     # Determining the p-value
-    p_value = 2 * scipy.stats.norm.sf(abs(Z))  # two-tailed test
+    p_value = 2 * sf(abs(Z))  # two-tailed test
     return Z, p_value
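One caveat worth flagging in the new hunk: `scipy.stats.norm` is a distribution object rather than an importable submodule, so `from scipy.stats.norm import sf` would raise `ModuleNotFoundError` the first time `calculate_z_value` runs. A working sketch of the same two-tailed z-test, using `from scipy.stats import norm` and made-up accuracy numbers (an illustration, not the commit's code):

```python
# Two-proportion z-test sketch with a valid SciPy import; the accuracies
# and standard errors below are invented example values.
import numpy as np
from scipy.stats import norm  # norm.sf is the survival function, 1 - CDF

acc1, st_err1 = 0.62, 0.015
acc2, st_err2 = 0.58, 0.016

Z = (acc1 - acc2) / np.sqrt(st_err1**2 + st_err2**2)
p_value = 2 * norm.sf(abs(Z))  # two-tailed p-value
print(f"Z={Z:.3f}, p={p_value:.4f}")
```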
tests/models/test_api.py (new file, mode 100644)

from unittest.mock import MagicMock, patch

import pytest

from lm_eval.models.openai_completions import LocalCompletionsAPI


@pytest.fixture
def api():
    return LocalCompletionsAPI(
        base_url="http://test-url.com", tokenizer_backend=None, model="gpt-3.5-turbo"
    )


@pytest.fixture
def api_tokenized():
    return LocalCompletionsAPI(
        base_url="http://test-url.com",
        model="EleutherAI/pythia-1b",
        tokenizer_backend="huggingface",
    )


def test_create_payload_generate(api):
    messages = ["Generate a story"]
    gen_kwargs = {
        "max_tokens": 100,
        "temperature": 0.7,
        "until": ["The End"],
        "do_sample": True,
        "seed": 1234,
    }
    payload = api._create_payload(messages, generate=True, gen_kwargs=gen_kwargs)

    assert payload == {
        "prompt": ["Generate a story"],
        "model": "gpt-3.5-turbo",
        "max_tokens": 100,
        "temperature": 0.7,
        "stop": ["The End"],
        "seed": 1234,
    }


def test_create_payload_loglikelihood(api):
    messages = ["The capital of France is"]
    payload = api._create_payload(messages, generate=False, gen_kwargs=None)

    assert payload == {
        "model": "gpt-3.5-turbo",
        "prompt": ["The capital of France is"],
        "max_tokens": 1,
        "logprobs": 1,
        "echo": True,
        "temperature": 0,
        "seed": 1234,
    }


@pytest.mark.parametrize(
    "input_messages, generate, gen_kwargs, expected_payload",
    [
        (
            ["Hello, how are"],
            True,
            {"max_gen_toks": 100, "temperature": 0.7},
            {
                "prompt": "Hello, how are",
                "model": "gpt-3.5-turbo",
                "max_tokens": 100,
                "temperature": 0.7,
                "stop": ["<|endoftext|>"],
                "seed": 1234,
            },
        ),
        (
            ["Hello, how are", "you"],
            True,
            {},
            {
                "prompt": "Hello, how are",
                "model": "gpt-3.5-turbo",
                "max_tokens": 256,
                "temperature": 0,
                "stop": ["<|endoftext|>"],
                "seed": 1234,
            },
        ),
    ],
)
def test_model_generate_call_usage(
    api, input_messages, generate, gen_kwargs, expected_payload
):
    with patch("requests.post") as mock_post:
        mock_response = MagicMock()
        mock_response.json.return_value = {"result": "success"}
        mock_post.return_value = mock_response

        # Act
        result = api.model_call(input_messages, generate=generate, gen_kwargs=gen_kwargs)

        # Assert
        mock_post.assert_called_once()
        _, kwargs = mock_post.call_args
        assert "json" in kwargs
        assert kwargs["json"] == expected_payload
        assert result == {"result": "success"}


@pytest.mark.parametrize(
    "input_messages, generate, gen_kwargs, expected_payload",
    [
        (
            [[1, 2, 3, 4, 5]],
            False,
            None,
            {
                "model": "EleutherAI/pythia-1b",
                "prompt": [[1, 2, 3, 4, 5]],
                "max_tokens": 1,
                "logprobs": 1,
                "echo": True,
                "seed": 1234,
                "temperature": 0,
            },
        ),
    ],
)
def test_model_tokenized_call_usage(
    api_tokenized, input_messages, generate, gen_kwargs, expected_payload
):
    with patch("requests.post") as mock_post:
        mock_response = MagicMock()
        mock_response.json.return_value = {"result": "success"}
        mock_post.return_value = mock_response

        # Act
        result = api_tokenized.model_call(
            input_messages, generate=generate, gen_kwargs=gen_kwargs
        )

        # Assert
        mock_post.assert_called_once()
        _, kwargs = mock_post.call_args
        assert "json" in kwargs
        assert kwargs["json"] == expected_payload
        assert result == {"result": "success"}
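Read together, the expected payloads in these tests spell out how a completions-style request is assembled: `until` maps to `stop` (defaulting to `["<|endoftext|>"]`), `max_gen_toks` maps to `max_tokens` (defaulting to 256), temperature defaults to 0, a `seed` of 1234 appears to be a default of the API class, and loglikelihood requests force `max_tokens=1`, `logprobs=1`, `echo=True`. The helper below restates that mapping for illustration only; it is a hypothetical sketch, not the actual `LocalCompletionsAPI._create_payload` implementation.

```python
# Hypothetical restatement of the payload mapping the tests assert;
# defaults (stop token, 256 max_tokens, temperature 0, seed 1234) are
# read off the expected payloads above, not from the library source.
from typing import Optional


def sketch_completions_payload(
    prompt, model: str, generate: bool, gen_kwargs: Optional[dict] = None
) -> dict:
    if not generate:
        # Loglikelihood-style request: echo the prompt and ask for logprobs.
        return {
            "model": model,
            "prompt": prompt,
            "max_tokens": 1,
            "logprobs": 1,
            "echo": True,
            "temperature": 0,
            "seed": 1234,
        }
    kwargs = dict(gen_kwargs or {})
    # max_tokens wins if present; otherwise fall back to max_gen_toks, then 256.
    max_tokens = kwargs.pop("max_tokens", kwargs.pop("max_gen_toks", 256))
    return {
        "prompt": prompt,
        "model": model,
        "max_tokens": max_tokens,
        "temperature": kwargs.pop("temperature", 0),
        "stop": kwargs.pop("until", ["<|endoftext|>"]),
        "seed": kwargs.pop("seed", 1234),
    }


# Reproduces the dict asserted in test_create_payload_generate.
print(
    sketch_completions_payload(
        ["Generate a story"],
        "gpt-3.5-turbo",
        generate=True,
        gen_kwargs={"max_tokens": 100, "temperature": 0.7, "until": ["The End"], "seed": 1234},
    )
)
```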