#!/usr/bin/env python3
# This script fetches the arXiv listing page at
# https://arxiv.org/list/math/recent?skip=0&show=2000
# and downloads each paper's PDF along with its LaTeX source, saving both
# into a math_data folder. Paper IDs are scraped from links of the form:
# <a href="/pdf/2503.08675" title="Download PDF" id="pdf-2503.08675" aria-labelledby="pdf-2503.08675">pdf</a>
import argparse
import os
import re
import time
import io
import tarfile
import requests
from tqdm import tqdm

def download_and_extract_source(paper_id, data_dir):
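    """Fetch the source archive for paper_id from export.arxiv.org.

    If the response is a tar archive containing exactly one .tex file, save
    that file to data_dir as {paper_id}.tex; if the response is not an
    archive at all, save the raw bytes as the .tex file instead.
    Returns True on success, False otherwise.
    """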
    source_url = f"https://export.arxiv.org/src/{paper_id}"
    print(f"Downloading source for {paper_id} from {source_url}...")
    response = requests.get(source_url, timeout=30)
    if response.status_code != 200:
        print(f"Error downloading source for {paper_id}: HTTP {response.status_code}")
        return False

    # Try to open as a tar archive; mode 'r:*' transparently handles
    # gzip/bzip2/xz compression.
    try:
        file_obj = io.BytesIO(response.content)
        with tarfile.open(fileobj=file_obj, mode='r:*') as tar:
            # Filter for regular .tex files.
            members = [m for m in tar.getmembers() if m.isfile() and m.name.endswith('.tex')]
            print("Found TeX files:", [m.name for m in members])
            if len(members) == 1:
                member = members[0]
                extracted = tar.extractfile(member)
                if extracted is None:
                    print(f"Error extracting {paper_id}: Could not read the file from the archive.")
                    return False
                content = extracted.read()
                out_path = os.path.join(data_dir, f"{paper_id}.tex")
                with open(out_path, "wb") as f:
                    f.write(content)
                print(f"Saved tex source for {paper_id} as {out_path}")
                return True
            else:
                print(f"Error: {paper_id} contains multiple .tex files or none. Skipping extraction.")
                return False
    except tarfile.ReadError:
        # Not a tar archive; assume it's a single file.
        out_path = os.path.join(data_dir, f"{paper_id}.tex")
        with open(out_path, "wb") as f:
            f.write(response.content)
        print(f"Saved non-archive tex source for {paper_id} as {out_path}")
        return True

def download_pdf(paper_id, data_dir):
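    """Fetch the PDF for paper_id and save it to data_dir as {paper_id}.pdf.

    Returns True on success, False on a non-200 response.
    """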
    pdf_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
    print(f"Downloading PDF for {paper_id} from {pdf_url}...")
    response = requests.get(pdf_url, timeout=30)
    if response.status_code != 200:
        print(f"Error downloading PDF for {paper_id}: HTTP {response.status_code}")
        return False
    out_path = os.path.join(data_dir, f"{paper_id}.pdf")
    with open(out_path, "wb") as f:
        f.write(response.content)
    print(f"Saved PDF for {paper_id} as {out_path}")
    return True

def main():
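    """Scrape paper IDs from an arXiv listing page, then download the LaTeX
    source and PDF for each paper, keeping files only when both succeed."""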
    parser = argparse.ArgumentParser(
        description="Download and extract arXiv LaTeX source files and PDFs only if both succeed."
    )
    parser.add_argument(
        "--url",
        type=str,
        default="https://arxiv.org/list/math/recent?skip=0&show=2000",
        help="URL of the arXiv list page to scrape (default: %(default)s)"
    )
    parser.add_argument(
        "--data_dir",
        type=str,
        default="math_data/pdfs",
        help="Directory to save downloaded files (default: %(default)s)"
    )
    args = parser.parse_args()

    os.makedirs(args.data_dir, exist_ok=True)

    print(f"Downloading list page from {args.url}...")
    response = requests.get(args.url, timeout=30)
    if response.status_code != 200:
        print(f"Error downloading list page: HTTP {response.status_code}")
        return

    # Find all pdf links in the form: <a href="/pdf/2503.08675" ...>pdf</a>
    pattern = re.compile(r'href="/pdf/(\d+\.\d+)"')
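    # Note: this matches only new-style arXiv IDs (e.g. 2503.08675);
    # old-style IDs such as math/0601001 are not captured.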
    paper_ids = pattern.findall(response.text)
    print(f"Found {len(paper_ids)} papers.")

    # For each paper, only keep the files if both the tex extraction and pdf download succeed.
    for paper_id in tqdm(paper_ids):
        tex_success = download_and_extract_source(paper_id, args.data_dir)
        if not tex_success:
            print(f"Skipping PDF download for {paper_id} because tex extraction failed.")
            continue
        
        pdf_success = download_pdf(paper_id, args.data_dir)
        if not pdf_success:
            # Remove the tex file if the PDF download fails.
            tex_path = os.path.join(args.data_dir, f"{paper_id}.tex")
            if os.path.exists(tex_path):
                os.remove(tex_path)
                print(f"Removed tex file for {paper_id} because PDF download failed.")
        time.sleep(1)

if __name__ == "__main__":
    main()
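
# Example invocations (illustrative; the category-specific URL below is an
# assumption, any arXiv listing page with /pdf/... links should work):
#   python download_math.py
#   python download_math.py --url "https://arxiv.org/list/math.NT/recent?skip=0&show=500" \
#       --data_dir math_data/nt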