"profiler/vscode:/vscode.git/clone" did not exist on "c44818e756397cb3aff898d832d2451a1961dc48"
Commit ee4252ff authored by Baber

nit

parent 28abec9f
@@ -14,7 +14,6 @@
 import asyncio
 import glob
 import os
-import shutil
 from functools import cache
 from typing import Dict
@@ -34,6 +33,8 @@ async def process_html_essay(
     client: httpx.AsyncClient, url: str, h: html2text.HTML2Text, temp_folder: str
 ) -> None:
     filename = url.split("/")[-1].replace(".html", ".txt")
+    if os.path.exists(os.path.join(temp_folder, filename)):
+        return None
     try:
         content = await fetch_url(client, url)
         soup = BeautifulSoup(content, "html.parser")
@@ -53,6 +54,8 @@ async def process_text_essay(
     client: httpx.AsyncClient, url: str, temp_folder: str
 ) -> None:
     filename = url.split("/")[-1]
+    if os.path.exists(os.path.join(temp_folder, filename)):
+        return None
     try:
         content = await fetch_url(client, url)
         with open(os.path.join(temp_folder, filename), "w", encoding="utf-8") as file:
@@ -113,8 +116,8 @@ async def get_essays() -> Dict[str, str]:
             text += f.read()
     # Cleanup
-    shutil.rmtree(temp_folder_repo)
-    shutil.rmtree(temp_folder_html)
+    # shutil.rmtree(temp_folder_repo)
+    # shutil.rmtree(temp_folder_html)
     return {"text": text}
......
@@ -15,8 +15,6 @@ from importlib.metadata import version
 from tqdm import tqdm
-COUNT = 0
-NUM_SAMPLES = 500
 REMOVE_NEWLINE_TAB = ""
 STOP_WORDS = ""
@@ -217,7 +215,6 @@ def generate_samples(
     TOKENIZER=None,
 ):
     assert TOKENIZER is not None, "TOKENIZER is not defined."
-    print("using tokenizer ", TOKENIZER.name_or_path)
     num_needle_k = max(num_needle_k, num_needle_q)
     write_jsons = []
     tokens_to_generate = tokens_to_generate
@@ -263,10 +260,13 @@ def generate_samples(
         num_haystack += incremental
-    print("Num haystack:", num_haystack)
+    # print("Num haystack:", num_haystack)
     # Generate samples
-    for index in tqdm(range(num_samples)):
+    for index in tqdm(
+        range(num_samples),
+        desc=f"Generating synthetic samples: {type_haystack} | {max_seq_length}",
+    ):
         used_haystack = num_haystack
         while True:
             try:
@@ -307,5 +307,4 @@ def generate_samples(
             False
         ), f"Needle not in input: {formatted_output}. Something went wrong."
         write_jsons.append(formatted_output)
-        print(COUNT)
     return write_jsons
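Note on the hunks above: the bare `tqdm(range(num_samples))` loop gains a `desc` label so the progress bar reports which haystack type and sequence length is being generated, while the leftover debug output (`Num haystack`, `print(COUNT)`, the "using tokenizer" print inside `generate_samples`) is silenced. A small self-contained illustration of `tqdm`'s `desc` parameter; the values below are illustrative stand-ins, not taken from the diff:

```python
from tqdm import tqdm

# Illustrative stand-ins for generate_samples' arguments.
type_haystack = "essay"
max_seq_length = 4096
num_samples = 500

write_jsons = []
for index in tqdm(
    range(num_samples),
    desc=f"Generating synthetic samples: {type_haystack} | {max_seq_length}",
):
    # The real code builds a needle-in-a-haystack sample here; this sketch
    # just records the index so the loop is runnable.
    write_jsons.append({"index": index})
```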
@@ -13,6 +13,7 @@ from lm_eval.tasks.ruler.prepare import generate_samples
 @cache
 def get_tokenizer(pretrained):
+    print("using tokenizer ", pretrained)
     return AutoTokenizer.from_pretrained(pretrained, trust_remote_code=True)
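Note on the hunk above: the "using tokenizer" message moves out of `generate_samples` and into the `@cache`-decorated `get_tokenizer`, so it is printed once per model name rather than on every call. A sketch of that memoization behaviour, assuming `transformers` is installed; the "gpt2" name is only an example:

```python
from functools import cache

from transformers import AutoTokenizer


@cache
def get_tokenizer(pretrained):
    # Runs (and prints) only on the first call for a given `pretrained` value;
    # functools.cache returns the memoized tokenizer on later calls.
    print("using tokenizer ", pretrained)
    return AutoTokenizer.from_pretrained(pretrained, trust_remote_code=True)


tok = get_tokenizer("gpt2")
tok_again = get_tokenizer("gpt2")  # no second print; same cached object returned
assert tok is tok_again
```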
......