# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import glob
import os
import shutil
import urllib.request
from functools import cache

import html2text
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


@cache
def get_essays():
    temp_folder_repo = "essay_repo"
    temp_folder_html = "essay_html"
    os.makedirs(temp_folder_repo, exist_ok=True)
    os.makedirs(temp_folder_html, exist_ok=True)

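    # Configure html2text to emit plain text: skip images and tables, escape
    # special characters, and use inline rather than reference-style links.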
    h = html2text.HTML2Text()
    h.ignore_images = True
    h.ignore_tables = True
    h.escape_all = True
    h.reference_links = False
    h.mark_code = False

    url = "https://raw.githubusercontent.com/NVIDIA/RULER/main/scripts/data/synthetic/json/PaulGrahamEssays_URLs.txt"
    response = requests.get(url)
    response.raise_for_status()

    # The response body is the URL list; keep it in memory and split it into
    # one URL per line.
    content = response.text
    urls = content.splitlines()

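    # URLs ending in .html point at paulgraham.com pages and need HTML-to-text
    # conversion; the remaining entries are pre-extracted .txt files mirrored in
    # the LLMTest_NeedleInAHaystack repository.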
    for url in tqdm(urls):
        if ".html" in url:
            filename = url.split("/")[-1].replace(".html", ".txt")
            try:
                with urllib.request.urlopen(url) as website:
                    content = website.read().decode("unicode_escape", "utf-8")
                    soup = BeautifulSoup(content, "html.parser")
                    specific_tag = soup.find("font")
                    parsed = h.handle(str(specific_tag))

                    with open(os.path.join(temp_folder_html, filename), "w") as file:
                        file.write(parsed)

            except Exception as e:
                print(f"Fail download {filename}, ({e})")

        else:
            filename = url.split("/")[-1]
            try:
                with urllib.request.urlopen(url) as website:
                    content = website.read().decode("utf-8")

                with open(os.path.join(temp_folder_repo, filename), "w") as file:
                    file.write(content)

            except Exception as e:
                print(f"Fail download {filename}, ({e})")

    files_repo = sorted(glob.glob(os.path.join(temp_folder_repo, "*.txt")))
    files_html = sorted(glob.glob(os.path.join(temp_folder_html, "*.txt")))
    print(
        f"Downloaded {len(files_repo)} essays from `https://github.com/gkamradt/LLMTest_NeedleInAHaystack/`"
    )
    print(f"Downloaded {len(files_html)} essays from `http://www.paulgraham.com/`")

    text = ""
    for file in files_repo + files_html:
        with open(file, "r") as f:
            text += f.read()

    shutil.rmtree(temp_folder_repo)
    shutil.rmtree(temp_folder_html)
    return {"text": text}

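
if __name__ == "__main__":
    # Illustrative usage sketch (an assumption, not part of the upstream module):
    # download the essays once and report the size of the concatenated text.
    # `get_essays` is wrapped in functools.cache, so repeated calls in the same
    # process reuse the first result instead of re-downloading.
    essays = get_essays()
    print(f"Collected {len(essays['text'])} characters of essay text")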