Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
ee4252ff
Commit
ee4252ff
authored
Dec 20, 2024
by
Baber
Browse files
nit
parent
28abec9f
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
12 additions
and
9 deletions
+12
-9
lm_eval/tasks/ruler/essays.py
lm_eval/tasks/ruler/essays.py
+6
-3
lm_eval/tasks/ruler/prepare.py
lm_eval/tasks/ruler/prepare.py
+5
-6
lm_eval/tasks/ruler/utils.py
lm_eval/tasks/ruler/utils.py
+1
-0
No files found.
lm_eval/tasks/ruler/essays.py
View file @
ee4252ff
...
...
@@ -14,7 +14,6 @@
import
asyncio
import
glob
import
os
import
shutil
from
functools
import
cache
from
typing
import
Dict
...
...
@@ -34,6 +33,8 @@ async def process_html_essay(
client
:
httpx
.
AsyncClient
,
url
:
str
,
h
:
html2text
.
HTML2Text
,
temp_folder
:
str
)
->
None
:
filename
=
url
.
split
(
"/"
)[
-
1
].
replace
(
".html"
,
".txt"
)
if
os
.
path
.
exists
(
os
.
path
.
join
(
temp_folder
,
filename
)):
return
None
try
:
content
=
await
fetch_url
(
client
,
url
)
soup
=
BeautifulSoup
(
content
,
"html.parser"
)
...
...
@@ -53,6 +54,8 @@ async def process_text_essay(
client
:
httpx
.
AsyncClient
,
url
:
str
,
temp_folder
:
str
)
->
None
:
filename
=
url
.
split
(
"/"
)[
-
1
]
if
os
.
path
.
exists
(
os
.
path
.
join
(
temp_folder
,
filename
)):
return
None
try
:
content
=
await
fetch_url
(
client
,
url
)
with
open
(
os
.
path
.
join
(
temp_folder
,
filename
),
"w"
,
encoding
=
"utf-8"
)
as
file
:
...
...
@@ -113,8 +116,8 @@ async def get_essays() -> Dict[str, str]:
text
+=
f
.
read
()
# Cleanup
shutil
.
rmtree
(
temp_folder_repo
)
shutil
.
rmtree
(
temp_folder_html
)
#
shutil.rmtree(temp_folder_repo)
#
shutil.rmtree(temp_folder_html)
return
{
"text"
:
text
}
...
...
lm_eval/tasks/ruler/prepare.py
View file @
ee4252ff
...
...
@@ -15,8 +15,6 @@ from importlib.metadata import version
from
tqdm
import
tqdm
COUNT
=
0
NUM_SAMPLES
=
500
REMOVE_NEWLINE_TAB
=
""
STOP_WORDS
=
""
...
...
@@ -217,7 +215,6 @@ def generate_samples(
TOKENIZER
=
None
,
):
assert
TOKENIZER
is
not
None
,
"TOKENIZER is not defined."
print
(
"using tokenizer "
,
TOKENIZER
.
name_or_path
)
num_needle_k
=
max
(
num_needle_k
,
num_needle_q
)
write_jsons
=
[]
tokens_to_generate
=
tokens_to_generate
...
...
@@ -263,10 +260,13 @@ def generate_samples(
num_haystack
+=
incremental
print
(
"Num haystack:"
,
num_haystack
)
#
print("Num haystack:", num_haystack)
# Generate samples
for
index
in
tqdm
(
range
(
num_samples
)):
for
index
in
tqdm
(
range
(
num_samples
),
desc
=
f
"Generating synthetic samples:
{
type_haystack
}
|
{
max_seq_length
}
"
,
):
used_haystack
=
num_haystack
while
True
:
try
:
...
...
@@ -307,5 +307,4 @@ def generate_samples(
False
),
f
"Needle not in input:
{
formatted_output
}
. Something went wrong."
write_jsons
.
append
(
formatted_output
)
print
(
COUNT
)
return
write_jsons
lm_eval/tasks/ruler/utils.py
View file @
ee4252ff
...
...
@@ -13,6 +13,7 @@ from lm_eval.tasks.ruler.prepare import generate_samples
@
cache
def
get_tokenizer
(
pretrained
):
print
(
"using tokenizer "
,
pretrained
)
return
AutoTokenizer
.
from_pretrained
(
pretrained
,
trust_remote_code
=
True
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment