"test/vscode:/vscode.git/clone" did not exist on "bf1214a922d80c84bd2450d544542f9d9cc91aaf"
Commit ee4252ff authored by Baber
Browse files

nit

parent 28abec9f
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
import asyncio import asyncio
import glob import glob
import os import os
import shutil
from functools import cache from functools import cache
from typing import Dict from typing import Dict
...@@ -34,6 +33,8 @@ async def process_html_essay( ...@@ -34,6 +33,8 @@ async def process_html_essay(
client: httpx.AsyncClient, url: str, h: html2text.HTML2Text, temp_folder: str client: httpx.AsyncClient, url: str, h: html2text.HTML2Text, temp_folder: str
) -> None: ) -> None:
filename = url.split("/")[-1].replace(".html", ".txt") filename = url.split("/")[-1].replace(".html", ".txt")
if os.path.exists(os.path.join(temp_folder, filename)):
return None
try: try:
content = await fetch_url(client, url) content = await fetch_url(client, url)
soup = BeautifulSoup(content, "html.parser") soup = BeautifulSoup(content, "html.parser")
...@@ -53,6 +54,8 @@ async def process_text_essay( ...@@ -53,6 +54,8 @@ async def process_text_essay(
client: httpx.AsyncClient, url: str, temp_folder: str client: httpx.AsyncClient, url: str, temp_folder: str
) -> None: ) -> None:
filename = url.split("/")[-1] filename = url.split("/")[-1]
if os.path.exists(os.path.join(temp_folder, filename)):
return None
try: try:
content = await fetch_url(client, url) content = await fetch_url(client, url)
with open(os.path.join(temp_folder, filename), "w", encoding="utf-8") as file: with open(os.path.join(temp_folder, filename), "w", encoding="utf-8") as file:
...@@ -113,8 +116,8 @@ async def get_essays() -> Dict[str, str]: ...@@ -113,8 +116,8 @@ async def get_essays() -> Dict[str, str]:
text += f.read() text += f.read()
# Cleanup # Cleanup
shutil.rmtree(temp_folder_repo) # shutil.rmtree(temp_folder_repo)
shutil.rmtree(temp_folder_html) # shutil.rmtree(temp_folder_html)
return {"text": text} return {"text": text}
......
...@@ -15,8 +15,6 @@ from importlib.metadata import version ...@@ -15,8 +15,6 @@ from importlib.metadata import version
from tqdm import tqdm from tqdm import tqdm
COUNT = 0
NUM_SAMPLES = 500 NUM_SAMPLES = 500
REMOVE_NEWLINE_TAB = "" REMOVE_NEWLINE_TAB = ""
STOP_WORDS = "" STOP_WORDS = ""
...@@ -217,7 +215,6 @@ def generate_samples( ...@@ -217,7 +215,6 @@ def generate_samples(
TOKENIZER=None, TOKENIZER=None,
): ):
assert TOKENIZER is not None, "TOKENIZER is not defined." assert TOKENIZER is not None, "TOKENIZER is not defined."
print("using tokenizer ", TOKENIZER.name_or_path)
num_needle_k = max(num_needle_k, num_needle_q) num_needle_k = max(num_needle_k, num_needle_q)
write_jsons = [] write_jsons = []
tokens_to_generate = tokens_to_generate tokens_to_generate = tokens_to_generate
...@@ -263,10 +260,13 @@ def generate_samples( ...@@ -263,10 +260,13 @@ def generate_samples(
num_haystack += incremental num_haystack += incremental
print("Num haystack:", num_haystack) # print("Num haystack:", num_haystack)
# Generate samples # Generate samples
for index in tqdm(range(num_samples)): for index in tqdm(
range(num_samples),
desc=f"Generating synthetic samples: {type_haystack} | {max_seq_length}",
):
used_haystack = num_haystack used_haystack = num_haystack
while True: while True:
try: try:
...@@ -307,5 +307,4 @@ def generate_samples( ...@@ -307,5 +307,4 @@ def generate_samples(
False False
), f"Needle not in input: {formatted_output}. Something went wrong." ), f"Needle not in input: {formatted_output}. Something went wrong."
write_jsons.append(formatted_output) write_jsons.append(formatted_output)
print(COUNT)
return write_jsons return write_jsons
...@@ -13,6 +13,7 @@ from lm_eval.tasks.ruler.prepare import generate_samples ...@@ -13,6 +13,7 @@ from lm_eval.tasks.ruler.prepare import generate_samples
@cache @cache
def get_tokenizer(pretrained): def get_tokenizer(pretrained):
print("using tokenizer ", pretrained)
return AutoTokenizer.from_pretrained(pretrained, trust_remote_code=True) return AutoTokenizer.from_pretrained(pretrained, trust_remote_code=True)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment