Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
ee4252ff
Commit
ee4252ff
authored
Dec 20, 2024
by
Baber
Browse files
nit
parent
28abec9f
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
12 additions
and
9 deletions
+12
-9
lm_eval/tasks/ruler/essays.py
lm_eval/tasks/ruler/essays.py
+6
-3
lm_eval/tasks/ruler/prepare.py
lm_eval/tasks/ruler/prepare.py
+5
-6
lm_eval/tasks/ruler/utils.py
lm_eval/tasks/ruler/utils.py
+1
-0
No files found.
lm_eval/tasks/ruler/essays.py
View file @
ee4252ff
...
@@ -14,7 +14,6 @@
...
@@ -14,7 +14,6 @@
import
asyncio
import
asyncio
import
glob
import
glob
import
os
import
os
import
shutil
from
functools
import
cache
from
functools
import
cache
from
typing
import
Dict
from
typing
import
Dict
...
@@ -34,6 +33,8 @@ async def process_html_essay(
...
@@ -34,6 +33,8 @@ async def process_html_essay(
client
:
httpx
.
AsyncClient
,
url
:
str
,
h
:
html2text
.
HTML2Text
,
temp_folder
:
str
client
:
httpx
.
AsyncClient
,
url
:
str
,
h
:
html2text
.
HTML2Text
,
temp_folder
:
str
)
->
None
:
)
->
None
:
filename
=
url
.
split
(
"/"
)[
-
1
].
replace
(
".html"
,
".txt"
)
filename
=
url
.
split
(
"/"
)[
-
1
].
replace
(
".html"
,
".txt"
)
if
os
.
path
.
exists
(
os
.
path
.
join
(
temp_folder
,
filename
)):
return
None
try
:
try
:
content
=
await
fetch_url
(
client
,
url
)
content
=
await
fetch_url
(
client
,
url
)
soup
=
BeautifulSoup
(
content
,
"html.parser"
)
soup
=
BeautifulSoup
(
content
,
"html.parser"
)
...
@@ -53,6 +54,8 @@ async def process_text_essay(
...
@@ -53,6 +54,8 @@ async def process_text_essay(
client
:
httpx
.
AsyncClient
,
url
:
str
,
temp_folder
:
str
client
:
httpx
.
AsyncClient
,
url
:
str
,
temp_folder
:
str
)
->
None
:
)
->
None
:
filename
=
url
.
split
(
"/"
)[
-
1
]
filename
=
url
.
split
(
"/"
)[
-
1
]
if
os
.
path
.
exists
(
os
.
path
.
join
(
temp_folder
,
filename
)):
return
None
try
:
try
:
content
=
await
fetch_url
(
client
,
url
)
content
=
await
fetch_url
(
client
,
url
)
with
open
(
os
.
path
.
join
(
temp_folder
,
filename
),
"w"
,
encoding
=
"utf-8"
)
as
file
:
with
open
(
os
.
path
.
join
(
temp_folder
,
filename
),
"w"
,
encoding
=
"utf-8"
)
as
file
:
...
@@ -113,8 +116,8 @@ async def get_essays() -> Dict[str, str]:
...
@@ -113,8 +116,8 @@ async def get_essays() -> Dict[str, str]:
text
+=
f
.
read
()
text
+=
f
.
read
()
# Cleanup
# Cleanup
shutil
.
rmtree
(
temp_folder_repo
)
#
shutil.rmtree(temp_folder_repo)
shutil
.
rmtree
(
temp_folder_html
)
#
shutil.rmtree(temp_folder_html)
return
{
"text"
:
text
}
return
{
"text"
:
text
}
...
...
lm_eval/tasks/ruler/prepare.py
View file @
ee4252ff
...
@@ -15,8 +15,6 @@ from importlib.metadata import version
...
@@ -15,8 +15,6 @@ from importlib.metadata import version
from
tqdm
import
tqdm
from
tqdm
import
tqdm
COUNT
=
0
NUM_SAMPLES
=
500
NUM_SAMPLES
=
500
REMOVE_NEWLINE_TAB
=
""
REMOVE_NEWLINE_TAB
=
""
STOP_WORDS
=
""
STOP_WORDS
=
""
...
@@ -217,7 +215,6 @@ def generate_samples(
...
@@ -217,7 +215,6 @@ def generate_samples(
TOKENIZER
=
None
,
TOKENIZER
=
None
,
):
):
assert
TOKENIZER
is
not
None
,
"TOKENIZER is not defined."
assert
TOKENIZER
is
not
None
,
"TOKENIZER is not defined."
print
(
"using tokenizer "
,
TOKENIZER
.
name_or_path
)
num_needle_k
=
max
(
num_needle_k
,
num_needle_q
)
num_needle_k
=
max
(
num_needle_k
,
num_needle_q
)
write_jsons
=
[]
write_jsons
=
[]
tokens_to_generate
=
tokens_to_generate
tokens_to_generate
=
tokens_to_generate
...
@@ -263,10 +260,13 @@ def generate_samples(
...
@@ -263,10 +260,13 @@ def generate_samples(
num_haystack
+=
incremental
num_haystack
+=
incremental
print
(
"Num haystack:"
,
num_haystack
)
#
print("Num haystack:", num_haystack)
# Generate samples
# Generate samples
for
index
in
tqdm
(
range
(
num_samples
)):
for
index
in
tqdm
(
range
(
num_samples
),
desc
=
f
"Generating synthetic samples:
{
type_haystack
}
|
{
max_seq_length
}
"
,
):
used_haystack
=
num_haystack
used_haystack
=
num_haystack
while
True
:
while
True
:
try
:
try
:
...
@@ -307,5 +307,4 @@ def generate_samples(
...
@@ -307,5 +307,4 @@ def generate_samples(
False
False
),
f
"Needle not in input:
{
formatted_output
}
. Something went wrong."
),
f
"Needle not in input:
{
formatted_output
}
. Something went wrong."
write_jsons
.
append
(
formatted_output
)
write_jsons
.
append
(
formatted_output
)
print
(
COUNT
)
return
write_jsons
return
write_jsons
lm_eval/tasks/ruler/utils.py
View file @
ee4252ff
...
@@ -13,6 +13,7 @@ from lm_eval.tasks.ruler.prepare import generate_samples
...
@@ -13,6 +13,7 @@ from lm_eval.tasks.ruler.prepare import generate_samples
@
cache
@
cache
def
get_tokenizer
(
pretrained
):
def
get_tokenizer
(
pretrained
):
print
(
"using tokenizer "
,
pretrained
)
return
AutoTokenizer
.
from_pretrained
(
pretrained
,
trust_remote_code
=
True
)
return
AutoTokenizer
.
from_pretrained
(
pretrained
,
trust_remote_code
=
True
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment