Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
a3252ed7
Unverified
Commit
a3252ed7
authored
Sep 19, 2023
by
Hailey Schoelkopf
Committed by
GitHub
Sep 19, 2023
Browse files
Merge pull request #862 from EleutherAI/bump-deps
[Refactor] Set python3.8 as allowed version
parents
54a53d6f
fa2ae334
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
13 additions
and
14 deletions
+13
-14
.github/workflows/unit_tests.yml
.github/workflows/unit_tests.yml
+2
-3
lm_eval/decontamination/janitor.py
lm_eval/decontamination/janitor.py
+9
-9
mypy.ini
mypy.ini
+1
-1
pyproject.toml
pyproject.toml
+1
-1
No files found.
.github/workflows/unit_tests.yml
View file @
a3252ed7
...
...
@@ -40,7 +40,7 @@ jobs:
flake8 . --count --select=F,E9,E71,E72,E501,E112,E113,W6 --extend-ignore=F541 --show-source --statistics --exit-zero
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
# mypy turned off for now
#
# mypy turned off for now
# - name: Lint with mypy
# run: mypy . --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
# Job 2
...
...
@@ -49,9 +49,8 @@ jobs:
# runs-on: ubuntu-latest
# strategy:
# matrix:
# python-version: [ "3.9", "3.10", "3.11" ]
# python-version: [
"3.8",
"3.9", "3.10", "3.11" ]
# timeout-minutes: 30
# steps:
# - name: Checkout Code
# uses: actions/checkout@v3
...
...
lm_eval/decontamination/janitor.py
View file @
a3252ed7
...
...
@@ -3,7 +3,7 @@ import string
import
pickle
import
traceback
from
pprint
import
pprint
from
typing
import
Iterator
,
Sequence
,
TypeVar
from
typing
import
Iterator
,
Sequence
,
TypeVar
,
List
,
Tuple
# This is a cpp module. Compile janitor_util.cpp with:
# c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) -undefined dynamic_lookup
...
...
@@ -21,7 +21,7 @@ T = TypeVar("T")
# Implementation from nltk source
# https://www.nltk.org/_modules/nltk/util.html
def
form_ngrams
(
sequence
:
Iterator
[
T
],
n
:
int
)
->
Iterator
[
t
uple
[
T
,
...]]:
def
form_ngrams
(
sequence
:
Iterator
[
T
],
n
:
int
)
->
Iterator
[
T
uple
[
T
,
...]]:
history
=
[]
while
n
>
1
:
# PEP 479, prevent RuntimeError from being raised when StopIteration bubbles out of generator
...
...
@@ -70,14 +70,14 @@ def word_ngrams(s: str, n: int) -> Iterator[str]:
# https://stackoverflow.com/questions/13734451/string-split-with-indices-in-python
def
split_indices
(
s
:
str
)
->
Iterator
[
t
uple
[
str
,
t
uple
[
int
,
int
]]]:
def
split_indices
(
s
:
str
)
->
Iterator
[
T
uple
[
str
,
T
uple
[
int
,
int
]]]:
"""Splits a string on whitespaces and records the indices of each in the original string.
@:return generator((word, (start_idx, end_idx)), ...)
"""
return
((
m
.
group
(
0
),
(
m
.
start
(),
m
.
end
()
-
1
))
for
m
in
re
.
finditer
(
r
"\S+"
,
s
))
def
word_ngrams_indices
(
s
:
str
,
n
:
int
)
->
Iterator
[
t
uple
[
str
,
t
uple
[
int
,
int
]]]:
def
word_ngrams_indices
(
s
:
str
,
n
:
int
)
->
Iterator
[
T
uple
[
str
,
T
uple
[
int
,
int
]]]:
"""Splits a string into pairs of (ngram words, their start/end indices)"""
tokens_with_indices
=
split_indices
(
s
)
...
...
@@ -157,7 +157,7 @@ class Janitor:
print
(
"WARNING: Janitor running in python mode"
)
return
self
.
register_contaminant_python
(
dirt_string
)
def
clean
(
self
,
dirty_string
:
str
)
->
l
ist
[
str
]:
def
clean
(
self
,
dirty_string
:
str
)
->
L
ist
[
str
]:
"""Clean a string (e.g. a training set) by removing all ngrams previously
registered as contaminants. Returns a list of clean chunks, or empty if
the string was too dirty"""
...
...
@@ -168,8 +168,8 @@ class Janitor:
return
self
.
clean_python
(
dirty_string
)
def
_split_chunks
(
self
,
dirty_string
:
str
,
dirty_parts
:
Sequence
[
t
uple
]
)
->
l
ist
[
str
]:
self
,
dirty_string
:
str
,
dirty_parts
:
Sequence
[
T
uple
]
)
->
L
ist
[
str
]:
clean_chunks
=
[]
splice_idx
=
0
end
=
-
1
...
...
@@ -197,7 +197,7 @@ class Janitor:
janitor_util
.
clean_ngram
(
dirt_string
,
self
.
delete_chars
,
self
.
ngram_n
)
)
def
clean_cpp
(
self
,
dirty_string
:
str
)
->
l
ist
[
str
]:
def
clean_cpp
(
self
,
dirty_string
:
str
)
->
L
ist
[
str
]:
contamination_indices
=
janitor_util
.
clean_ngram_with_indices
(
dirty_string
,
self
.
delete_chars
,
self
.
ngram_n
)
...
...
@@ -215,7 +215,7 @@ class Janitor:
word_ngrams
(
self
.
normalize_string
(
dirt_string
),
self
.
ngram_n
)
)
def
clean_python
(
self
,
dirty_string
:
str
)
->
l
ist
[
str
]:
def
clean_python
(
self
,
dirty_string
:
str
)
->
L
ist
[
str
]:
contamination_indices
=
(
(
None
,
*
idx_pair
)
for
dirty_ngram
,
idx_pair
in
word_ngrams_indices
(
dirty_string
,
self
.
ngram_n
)
...
...
mypy.ini
View file @
a3252ed7
[mypy]
python_version
=
3.
9
python_version
=
3.
8
show_traceback
=
True
check_untyped_defs
=
True
no_implicit_reexport
=
True
...
...
pyproject.toml
View file @
a3252ed7
...
...
@@ -16,7 +16,7 @@ classifiers = [
"License :: OSI Approved :: MIT License"
,
"Operating System :: OS Independent"
,
]
requires-python
=
">=3.
9
"
requires-python
=
">=3.
8
"
license
=
{
"text"
=
"MIT"
}
dependencies
=
[
"accelerate>=0.21.0"
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment