Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
7c2687cb
Commit
7c2687cb
authored
Sep 15, 2023
by
haileyschoelkopf
Browse files
update typehints in decontam tool
parent
4c139701
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
9 additions
and
9 deletions
+9
-9
lm_eval/decontamination/janitor.py
lm_eval/decontamination/janitor.py
+9
-9
No files found.
lm_eval/decontamination/janitor.py
View file @
7c2687cb
...
...
@@ -3,7 +3,7 @@ import string
import
pickle
import
traceback
from
pprint
import
pprint
from
typing
import
Iterator
,
Sequence
,
TypeVar
from
typing
import
Iterator
,
Sequence
,
TypeVar
,
List
,
Tuple
# This is a cpp module. Compile janitor_util.cpp with:
# c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) -undefined dynamic_lookup
...
...
@@ -21,7 +21,7 @@ T = TypeVar("T")
# Implementation from nltk source
# https://www.nltk.org/_modules/nltk/util.html
def
form_ngrams
(
sequence
:
Iterator
[
T
],
n
:
int
)
->
Iterator
[
t
uple
[
T
,
...]]:
def
form_ngrams
(
sequence
:
Iterator
[
T
],
n
:
int
)
->
Iterator
[
T
uple
[
T
,
...]]:
history
=
[]
while
n
>
1
:
# PEP 479, prevent RuntimeError from being raised when StopIteration bubbles out of generator
...
...
@@ -70,14 +70,14 @@ def word_ngrams(s: str, n: int) -> Iterator[str]:
# https://stackoverflow.com/questions/13734451/string-split-with-indices-in-python
def
split_indices
(
s
:
str
)
->
Iterator
[
t
uple
[
str
,
t
uple
[
int
,
int
]]]:
def
split_indices
(
s
:
str
)
->
Iterator
[
T
uple
[
str
,
T
uple
[
int
,
int
]]]:
"""Splits a string on whitespaces and records the indices of each in the original string.
@:return generator((word, (start_idx, end_idx)), ...)
"""
return
((
m
.
group
(
0
),
(
m
.
start
(),
m
.
end
()
-
1
))
for
m
in
re
.
finditer
(
r
"\S+"
,
s
))
def
word_ngrams_indices
(
s
:
str
,
n
:
int
)
->
Iterator
[
t
uple
[
str
,
t
uple
[
int
,
int
]]]:
def
word_ngrams_indices
(
s
:
str
,
n
:
int
)
->
Iterator
[
T
uple
[
str
,
T
uple
[
int
,
int
]]]:
"""Splits a string into pairs of (ngram words, their start/end indices)"""
tokens_with_indices
=
split_indices
(
s
)
...
...
@@ -157,7 +157,7 @@ class Janitor:
print
(
"WARNING: Janitor running in python mode"
)
return
self
.
register_contaminant_python
(
dirt_string
)
def
clean
(
self
,
dirty_string
:
str
)
->
l
ist
[
str
]:
def
clean
(
self
,
dirty_string
:
str
)
->
L
ist
[
str
]:
"""Clean a string (e.g. a training set) by removing all ngrams previously
registered as contaminants. Returns a list of clean chunks, or empty if
the string was too dirty"""
...
...
@@ -168,8 +168,8 @@ class Janitor:
return
self
.
clean_python
(
dirty_string
)
def
_split_chunks
(
self
,
dirty_string
:
str
,
dirty_parts
:
Sequence
[
t
uple
]
)
->
l
ist
[
str
]:
self
,
dirty_string
:
str
,
dirty_parts
:
Sequence
[
T
uple
]
)
->
L
ist
[
str
]:
clean_chunks
=
[]
splice_idx
=
0
end
=
-
1
...
...
@@ -197,7 +197,7 @@ class Janitor:
janitor_util
.
clean_ngram
(
dirt_string
,
self
.
delete_chars
,
self
.
ngram_n
)
)
def
clean_cpp
(
self
,
dirty_string
:
str
)
->
l
ist
[
str
]:
def
clean_cpp
(
self
,
dirty_string
:
str
)
->
L
ist
[
str
]:
contamination_indices
=
janitor_util
.
clean_ngram_with_indices
(
dirty_string
,
self
.
delete_chars
,
self
.
ngram_n
)
...
...
@@ -215,7 +215,7 @@ class Janitor:
word_ngrams
(
self
.
normalize_string
(
dirt_string
),
self
.
ngram_n
)
)
def
clean_python
(
self
,
dirty_string
:
str
)
->
l
ist
[
str
]:
def
clean_python
(
self
,
dirty_string
:
str
)
->
L
ist
[
str
]:
contamination_indices
=
(
(
None
,
*
idx_pair
)
for
dirty_ngram
,
idx_pair
in
word_ngrams_indices
(
dirty_string
,
self
.
ngram_n
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment