# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import asyncio
import glob
import os
import shutil
from typing import Dict

import html2text
import httpx
from bs4 import BeautifulSoup
from tqdm.asyncio import tqdm as async_tqdm


async def fetch_url(client: httpx.AsyncClient, url: str) -> str:
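    """Fetch ``url`` with the shared client and return the response body, raising on HTTP error statuses."""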
    response = await client.get(url)
    response.raise_for_status()
    return response.text


async def process_html_essay(
    client: httpx.AsyncClient, url: str, h: html2text.HTML2Text, temp_folder: str
) -> None:
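    """Download one HTML essay, extract the <font> block that carries the essay
    body on paulgraham.com pages, convert it to plain text, and write it to
    ``temp_folder`` as a .txt file."""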
    filename = url.split("/")[-1].replace(".html", ".txt")
    try:
        content = await fetch_url(client, url)
        soup = BeautifulSoup(content, "html.parser")
        specific_tag = soup.find("font")
        if specific_tag:
            parsed = h.handle(str(specific_tag))

            with open(
                os.path.join(temp_folder, filename), "w", encoding="utf-8"
            ) as file:
                file.write(parsed)
    except Exception as e:
        print(f"Failed to download {filename}: {str(e)}")


async def process_text_essay(
    client: httpx.AsyncClient, url: str, temp_folder: str
) -> None:
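    """Download one plain-text essay and write it verbatim to ``temp_folder``."""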
    filename = url.split("/")[-1]
    try:
        content = await fetch_url(client, url)
        with open(os.path.join(temp_folder, filename), "w", encoding="utf-8") as file:
            file.write(content)
    except Exception as e:
        print(f"Failed to download {filename}: {str(e)}")


async def get_essays() -> Dict[str, str]:
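    """Download every Paul Graham essay listed in the RULER URL manifest and
    return the concatenated text under the "text" key."""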
    temp_folder_repo = "essay_repo"
    temp_folder_html = "essay_html"
    os.makedirs(temp_folder_repo, exist_ok=True)
    os.makedirs(temp_folder_html, exist_ok=True)

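    # Configure the HTML-to-text conversion: plain prose with inline links, no images or tables.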
    h = html2text.HTML2Text()
    h.ignore_images = True
    h.ignore_tables = True
    h.escape_all = True
    h.reference_links = False
    h.mark_code = False

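    # Manifest of essay URLs maintained in the NVIDIA RULER repository.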
    url_list = "https://raw.githubusercontent.com/NVIDIA/RULER/main/scripts/data/synthetic/json/PaulGrahamEssays_URLs.txt"

    async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
        # Fetch URL list
        content = await fetch_url(client, url_list)
        urls = [line.strip() for line in content.splitlines() if line.strip()]  # skip blank manifest lines

        # Separate HTML and text URLs
        html_urls = [url for url in urls if url.endswith(".html")]
        text_urls = [url for url in urls if not url.endswith(".html")]

        # Process HTML essays
        html_tasks = [
            process_html_essay(client, url, h, temp_folder_html) for url in html_urls
        ]
        await async_tqdm.gather(*html_tasks, desc="Downloading HTML essays")

        # Process text essays
        text_tasks = [
            process_text_essay(client, url, temp_folder_repo) for url in text_urls
        ]
        await async_tqdm.gather(*text_tasks, desc="Downloading text essays")

    # Collect results
    files_repo = sorted(glob.glob(os.path.join(temp_folder_repo, "*.txt")))
    files_html = sorted(glob.glob(os.path.join(temp_folder_html, "*.txt")))

    # print(
    #     f"Downloaded {len(files_repo)} essays from `https://github.com/gkamradt/LLMTest_NeedleInAHaystack/`"
    # )
    # print(f"Downloaded {len(files_html)} essays from `http://www.paulgraham.com/`")

    # Combine all texts
    text = ""
    for file in files_repo + files_html:
        with open(file, "r", encoding="utf-8") as f:
            text += f.read()

    # Cleanup
    shutil.rmtree(temp_folder_repo)
    shutil.rmtree(temp_folder_html)

    return {"text": text}


def get_all_essays() -> Dict[str, str]:
    """Synchronous wrapper for get_essays()"""
    return asyncio.run(get_essays())
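

# Minimal usage sketch: running the module directly downloads the essays and
# reports the size of the combined corpus.
if __name__ == "__main__":
    corpus = get_all_essays()
    print(f"Downloaded corpus: {len(corpus['text'])} characters")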