gaoqiong / lm-evaluation-harness · Commit d553e060

Merge pull request #693 from baberabb/big-refactor_fixfin

[Refactor] Fix tests

Authored Jul 24, 2023 by Hailey Schoelkopf; committed by GitHub on Jul 24, 2023 (unverified).
Parents: 2820042d, 4d8cee80

Showing 7 changed files with 244 additions and 83 deletions:

  .github/workflows/new_tasks.yml    +17  −8
  .github/workflows/unit_tests.yml   +7   −5
  .gitignore                         +11  −2
  tests/conftest.py                  +0   −6   (deleted)
  tests/extra/test_new_tasks.py      +128 −0   (new file)
  tests/extra/utilities_testing.py   +32  −0   (renamed from tests/extra/test_utils.py)
  tests/test_tasks.py                +49  −62
.github/workflows/new_tasks.yml

@@ -8,10 +8,11 @@ on:
     branches:
       - big-refactor
   workflow_dispatch:
 # comment/edit out the above to stop/change the triggers
 jobs:
   changed_files:
     runs-on: ubuntu-latest # windows-latest || macos-latest
     timeout-minutes: 120
     name: Scan for changed tasks
     steps:
       - name: checkout
@@ -19,11 +20,15 @@ jobs:
         with:
           fetch-depth: 0 # OR "2" -> To retrieve the preceding commit.
       # Example 1
       # Uses the tj-actions/changed-files@v37 action to check for changes.
       # Outputs provided here: https://github.com/tj-actions/changed-files#outputs
       # The `files_yaml` input optionally takes a yaml string to specify filters,
       # and prepends the filter name to the standard output names.
       - name: Check task folders
         id: changed-tasks
         uses: tj-actions/changed-files@v37.1.2
         with:
           # tasks checks the tasks folder and api checks the api folder for changes
           files_yaml: |
             tasks:
               - lm_eval/tasks/**
@@ -31,31 +36,35 @@ jobs:
               - lm_eval/api/**
           write_output_files: true
       # The next step is optional; the files are written to the workspace by default (above),
       # so it's just for debugging.
       - name: Run Tests
         if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
         run: |
           echo .github/outputs/tasks_all_changed_and_modified_files.txt >> 'GITHUB_ENV'
           echo "One or more test file(s) has changed."
           echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}"
       - name: Set up Python 3.9
         if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
         uses: actions/setup-python@v4
         with:
           python-version: 3.9
           cache: 'pip'
       - name: Install dependencies
         if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
         run: |
           python -m pip install --upgrade pip
           pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
           # Install optional git dependencies
           # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
           # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
       - name: Test with pytest
         # if new tasks are added, run tests on them
         if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
-        run: python -m pytest tests/test_tasks.py -s -vv -n=auto --new_task
+        run: python -m pytest tests/extra/test_new_tasks.py -s -vv -n=auto
       # if api is modified, run tests on it
       - name: Test more tasks with pytest
         env:
           API: true
         if: steps.changed-tasks.outputs.api_any_modified == 'true'
-        run: python -m pytest tests/test_api.py -s -vv -n=auto --new_task
+        run: python -m pytest tests/extra/test_new_tasks.py -s -vv -n=auto
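The `files_yaml` filters above are what produce the `tasks_any_modified` and `api_any_modified` outputs that later steps branch on: tj-actions/changed-files prepends each filter name to its standard output names. A rough Python sketch of that mapping, using hypothetical changed paths and a simplified glob (not the action's actual implementation):

    from fnmatch import fnmatch

    # filters as declared in files_yaml above
    filters = {"tasks": ["lm_eval/tasks/**"], "api": ["lm_eval/api/**"]}
    # hypothetical changed paths, for illustration only
    changed = ["lm_eval/tasks/arc/arc_easy.yaml", "README.md"]

    # each filter name is prepended to the output name, e.g. tasks_any_modified
    outputs = {
        f"{name}_any_modified": any(
            fnmatch(path, pattern.replace("**", "*"))
            for path in changed
            for pattern in patterns
        )
        for name, patterns in filters.items()
    }
    assert outputs == {"tasks_any_modified": True, "api_any_modified": False}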
.github/workflows/unit_tests.yml

 # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
 # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
 # just comment out unwanted steps to turn off the test.
 name: Unit Tests
 on:
@@ -11,7 +11,8 @@ on:
     branches:
       - big-refactor
   workflow_dispatch:
 # Jobs run concurrently and steps run sequentially within a job.
 # jobs: linter and cpu_tests. Add more jobs/steps as required.
 jobs:
   linter:
     name: Linters
@@ -35,9 +36,10 @@ jobs:
         flake8 . --count --select=F,E9,E71,E72,E501,E112,E113,W6 --extend-ignore=F541 --show-source --statistics --exit-zero
         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
-      - name: Lint with mypy
-        run: mypy . --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
+      # mypy turned off for now
+      # - name: Lint with mypy
+      #   run: mypy . --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
   # Job 2
   testcpu:
     name: CPU Tests
     runs-on: ubuntu-latest
.gitignore

@@ -3,6 +3,15 @@ env
 data/
 lm_cache
 .idea
+*.egg-info/
+build
+dist
+*.egg-info
+venv
+.vscode/
+temp
 __pycache__
 .ipynb_checkpoints
-temp
+# IPython
+profile_default/
+ipython_config.py
tests/conftest.py (deleted, 100644 → 0)

def pytest_addoption(parser):
    parser.addoption(
        "--new_task",
        action="store_true",
        help="new_tasks_found",
    )
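The flag this file registered was consumed in the old tests/test_tasks.py through pytest's built-in `request` fixture (see that file's diff below); with the new-task checks split out into tests/extra/test_new_tasks.py, the option is no longer needed. For reference, a minimal self-contained sketch of the register/read pair, with hypothetical file and test names:

    # conftest.py (sketch): register the flag, as the deleted file did
    def pytest_addoption(parser):
        parser.addoption("--new_task", action="store_true", help="new_tasks_found")


    # test_example.py (sketch): read the flag back via the built-in request fixture
    import pytest


    @pytest.fixture()
    def any_new_tasks(request) -> bool:
        return request.config.getoption("--new_task")


    def test_flag_is_boolean(any_new_tasks):
        # store_true options default to False when the flag is absent
        assert any_new_tasks in [True, False]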
tests/extra/test_new_tasks.py (new file, 0 → 100644)

import pytest
from itertools import islice
import lm_eval.tasks as tasks
from .utilities_testing import load_changed_files, parser
from typing import List
from lm_eval.api.task import ConfigurableTask
import os


# GitHub CI
def new_tasks() -> List[str]:
    FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt"
    if os.path.exists(FILENAME):
        # If tasks folder has changed then we get the list of files from FILENAME
        # and parse the yaml files to get the task names.
        return parser(load_changed_files(FILENAME))
    elif os.getenv("API") is not None:
        # Or if API has changed then we set the ENV variable API to True
        # and run given tasks.
        return ["arc_easy", "hellaswag", "piqa", "wikitext"]
    # if both not true just do arc_easy
    else:
        return ["arc_easy"]


def get_task_class() -> List[ConfigurableTask]:
    task_name = new_tasks()
    x = [cls for name, cls in tasks.TASK_REGISTRY.items() if name in task_name]
    return x


@pytest.fixture()
def limit() -> int:
    return 10


# Tests
@pytest.mark.parametrize("task_class", get_task_class())
class TestNewTasks:
    def test_download(self, task_class: ConfigurableTask):
        task_class().download()
        assert task_class().dataset is not None

    def test_has_training_docs(self, task_class: ConfigurableTask):
        assert task_class().has_training_docs() in [True, False]

    def test_check_training_docs(self, task_class: ConfigurableTask):
        task = task_class()
        if task.has_training_docs():
            assert task._config["training_split"] is not None

    def test_has_validation_docs(self, task_class):
        assert task_class().has_validation_docs() in [True, False]

    def test_check_validation_docs(self, task_class):
        task = task_class()
        if task.has_validation_docs():
            assert task._config["validation_split"] is not None

    def test_has_test_docs(self, task_class):
        assert task_class().has_test_docs() in [True, False]

    def test_check_test_docs(self, task_class):
        task = task_class()
        if task.has_test_docs():
            assert task._config["test_split"] is not None

    def test_should_decontaminate(self, task_class):
        task = task_class()
        assert task.should_decontaminate() in [True, False]
        if task.should_decontaminate():
            assert task._config["doc_to_decontamination_query"] is not None

    def test_doc_to_text(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        _array = [task.doc_to_text(doc) for doc in arr]
        # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
        assert all(
            isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True)
            for x in _array
        )

    def test_create_choices(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        if "multiple_choice" in task._config.group:
            _array = [task.doc_to_choice(doc) for doc in arr]
            # assert all(len(x) == 4 for x in _array)
            assert all(isinstance(x, list) for x in _array)
            assert all(isinstance(x[0], str) for x in _array)

    def test_doc_to_target(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        _array_target = [task.doc_to_target(doc) for doc in arr]
        assert all(isinstance(label, int) for label in _array_target)
        assert len(_array_target) == limit if limit else True
        # _array_text = [task.doc_to_text(doc) for doc in arr]
        # Not working
        # assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))

    def test_build_all_requests(self, task_class, limit):
        task_class().build_all_requests(rank=1, limit=limit, world_size=1)
        assert task_class.instances is not None

    def test_construct_requests(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
        assert all(isinstance(doc, list) for doc in requests)
        assert len(requests) == limit if limit else True
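A note on the structure of this file: decorating the class with `pytest.mark.parametrize` parametrizes every test method in it, so each task returned by `get_task_class()` gets the full battery of checks. A minimal sketch of that behavior with toy values (not from this repo):

    import pytest


    # each value produces one instance of every test method in the class,
    # i.e. six tests are collected here (3 values x 2 methods)
    @pytest.mark.parametrize("value", [1, 2, 3])
    class TestExample:
        def test_positive(self, value):
            assert value > 0

        def test_is_int(self, value):
            assert isinstance(value, int)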
tests/extra/test_utils.py → tests/extra/utilities_testing.py (renamed)

@@ -2,16 +2,25 @@ import json
 from typing import List
 from lm_eval.utils import load_yaml_config
 from pathlib import Path
+import sys

 # This is the path where the output for the changed files for the tasks folder is stored
 FILE_PATH = file_path = ".github/outputs/tasks_all_changed_and_modified_files.txt"


-def load_changed_files(file_path: str = FILE_PATH) -> List[str]:
+# reads a text file and returns a list of words
+# used to read the output of the changed txt from tj-actions/changed-files
+def load_changed_files(file_path: str) -> List[str]:
     with open(file_path, "r") as f:
-        return [line.strip() for line in f.readlines()]
+        content = f.read()
+        words_list = [x for x in content.split()]
+    sys.stdout.write(f"list of files: {words_list}")
+    return words_list


 # checks the txt file for list of changed files.
 # if file ends with .yaml then check yaml for task name
 # if file ends with .py then parse the folder for all yaml files
 def parser(full_path: List[str]) -> List[str]:
     _output = set()
     for x in full_path:
         ...
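One detail behind the rewritten `load_changed_files`: `str.split()` with no arguments splits on any run of whitespace, so the function now tolerates both newline-separated and space-separated output from tj-actions/changed-files. A quick illustration with hypothetical file contents:

    # hypothetical contents of tasks_all_changed_and_modified_files.txt
    content = "lm_eval/tasks/a.yaml\nlm_eval/tasks/b.yaml lm_eval/api/task.py\n"

    # split() with no separator treats newlines and spaces alike
    words_list = [x for x in content.split()]
    assert words_list == [
        "lm_eval/tasks/a.yaml",
        "lm_eval/tasks/b.yaml",
        "lm_eval/api/task.py",
    ]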
tests/test_tasks.py

-import pytest
-from tests.extra.test_utils import load_changed_files, parser
-from typing import List, ClassVar
-import os
+from itertools import islice
+import pytest
+from typing import List
+import lm_eval.tasks as tasks
 from lm_eval.api.task import ConfigurableTask


-# Using fixtures to get the task class and limit
-@pytest.fixture()
-def any_new_tasks(request) -> bool:
-    return request.config.getoption("--new_task")
-
-
-# ["arc_easy] else get list of new tasks
-def new_tasks(any_new_tasks: bool) -> List[str]:
-    FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt"
-    if any_new_tasks and os.path.exists(FILENAME):
-        return [parser(load_changed_files(FILENAME))]
-    elif os.getenv("API") is not None:
-        return ["arc_easy", "hellaswag", "piqa", "wikitext"]
-    else:
-        return ["arc_easy"]
-
-
-@pytest.fixture(params=new_tasks(any_new_tasks))
-def task_class(request):
-    task_name = request.param
-    return [cls for name, cls in tasks.TASK_REGISTRY.items() if name in task_name][0]
+def task_class() -> ConfigurableTask:
+    task_name = ["arc_easy"]
+    x = [cls for name, cls in tasks.TASK_REGISTRY.items() if name in task_name]
+    return x[0]


 @pytest.fixture()
-def limit(any_new_tasks: bool) -> int:
-    return 100 if any_new_tasks else 10
+def limit() -> int:
+    return 10


 # Tests
-def test_download(task_class):
+def test_download(task_class: ConfigurableTask):
     task_class().download()
     assert task_class().dataset is not None


-def test_has_training_docs(task_class):
+def test_has_training_docs(task_class: ConfigurableTask):
     assert task_class().has_training_docs() in [True, False]


-def test_check_training_docs(task_class):
+def test_check_training_docs(task_class: ConfigurableTask):
     task = task_class()
-    assert task.has_training_docs() if task._config["training_split"] else True
+    if task.has_training_docs():
+        assert task._config["training_split"] is not None


 def test_has_validation_docs(task_class):
-    assert task_class().has_training_docs() in [True, False]
+    assert task_class().has_validation_docs() in [True, False]


 def test_check_validation_docs(task_class):
     task = task_class()
-    assert (task_class().has_training_docs() if task._config["validation_split"] else True)
+    if task.has_validation_docs():
+        assert task._config["validation_split"] is not None


 def test_has_test_docs(task_class):
-    assert task_class().has_training_docs() in [True, False]
+    assert task_class().has_test_docs() in [True, False]


 def test_check_test_docs(task_class):
     task = task_class()
-    assert task_class().has_training_docs() if task._config["test_split"] else True
+    if task.has_test_docs():
+        assert task._config["test_split"] is not None


 def test_should_decontaminate(task_class):
-    task_class = task_class()
-    assert task_class.should_decontaminate() in [True, False]
-    if task_class.should_decontaminate():
-        assert task_class._config["doc_to_decontamination_query"] is not None
+    task = task_class()
+    assert task.should_decontaminate() in [True, False]
+    if task.should_decontaminate():
+        assert task._config["doc_to_decontamination_query"] is not None


 def test_doc_to_text(task_class, limit):
+    task = task_class()
     arr = (
-        list(islice(task_class().test_docs(), limit))
-        if limit
-        else list(task_class().test_docs())
+        list(islice(task.test_docs(), limit))
+        if task.has_test_docs()
+        else list(islice(task.validation_docs(), limit))
     )
-    _array = [task_class().doc_to_text(doc) for doc in arr]
+    _array = [task.doc_to_text(doc) for doc in arr]
     # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
     assert all(
         isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True)
         for x in _array

@@ -91,24 +77,27 @@ def test_doc_to_text(task_class, limit):
 def test_create_choices(task_class, limit):
+    task = task_class()
     arr = (
-        list(islice(task_class().test_docs(), limit))
-        if limit
-        else list(task_class().test_docs())
+        list(islice(task.test_docs(), limit))
+        if task.has_test_docs()
+        else list(islice(task.validation_docs(), limit))
     )
-    _array = [task_class().doc_to_choice(doc) for doc in arr]
-    # assert all(len(x) == 4 for x in _array)
-    assert all(isinstance(x, list) for x in _array)
-    assert all(isinstance(x[0], str) for x in _array)
+    if "multiple_choice" in task._config.group:
+        _array = [task.doc_to_choice(doc) for doc in arr]
+        # assert all(len(x) == 4 for x in _array)
+        assert all(isinstance(x, list) for x in _array)
+        assert all(isinstance(x[0], str) for x in _array)


 def test_doc_to_target(task_class, limit):
+    task = task_class()
     arr = (
-        list(islice(task_class().test_docs(), limit))
-        if limit
-        else list(task_class().test_target())
+        list(islice(task.test_docs(), limit))
+        if task.has_test_docs()
+        else list(islice(task.validation_docs(), limit))
     )
-    _array_target = [task_class().doc_to_target(doc) for doc in arr]
+    _array_target = [task.doc_to_target(doc) for doc in arr]
     assert all(isinstance(label, int) for label in _array_target)
     assert len(_array_target) == limit if limit else True
     # _array_text = [task.doc_to_text(doc) for doc in arr]

@@ -122,15 +111,13 @@ def test_build_all_requests(task_class, limit):
 def test_construct_requests(task_class, limit):
+    task = task_class()
     arr = (
-        list(islice(task_class().test_docs(), limit))
-        if limit
-        else list(task_class().test_docs())
+        list(islice(task.test_docs(), limit))
+        if task.has_test_docs()
+        else list(islice(task.validation_docs(), limit))
     )
-    requests = [task_class().construct_requests(doc, task_class().doc_to_text(doc)) for doc in arr]
+    requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
     assert all(isinstance(doc, list) for doc in requests)
     assert len(requests) == limit if limit else True
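For context on the `arr` pattern both test files now share: `itertools.islice` draws at most `limit` documents from a dataset iterator without materializing the whole split, which keeps these checks cheap on large datasets. A minimal sketch:

    from itertools import islice

    # a generator far too large to materialize in full
    squares = (i * i for i in range(10**9))

    # islice stops after `limit` items, consuming only what it needs
    limit = 3
    assert list(islice(squares, limit)) == [0, 1, 4]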