#!/usr/bin/env python3
# This script goes to
#   https://arxiv.org/list/math/recent?skip=0&show=2000
# and downloads all the PDFs, as well as their LaTeX source equivalents,
# and puts them together into a math_data folder.

import argparse
import io
import os
import re
import tarfile
import time

import requests
from tqdm import tqdm


def download_and_extract_source(paper_id, data_dir):
    source_url = f"https://export.arxiv.org/src/{paper_id}"
    print(f"Downloading source for {paper_id} from {source_url}...")
    response = requests.get(source_url)
    if response.status_code != 200:
        print(f"Error downloading source for {paper_id}: HTTP {response.status_code}")
        return False

    # Try to open as a tar archive.
    try:
        file_obj = io.BytesIO(response.content)
        with tarfile.open(fileobj=file_obj, mode='r:*') as tar:
            # Filter for regular .tex files.
            members = [m for m in tar.getmembers() if m.isfile() and m.name.endswith('.tex')]
            print("Found TeX files:", [m.name for m in members])
            if len(members) == 1:
                member = members[0]
                extracted = tar.extractfile(member)
                if extracted is None:
                    print(f"Error extracting {paper_id}: Could not read the file from the archive.")
                    return False
                content = extracted.read()
                out_path = os.path.join(data_dir, f"{paper_id}.tex")
                with open(out_path, "wb") as f:
                    f.write(content)
                print(f"Saved tex source for {paper_id} as {out_path}")
                return True
            else:
                print(f"Error: {paper_id} contains multiple .tex files or none. Skipping extraction.")
                return False
    except tarfile.ReadError:
        # Not a tar archive; assume it's a single file.
        out_path = os.path.join(data_dir, f"{paper_id}.tex")
        with open(out_path, "wb") as f:
            f.write(response.content)
        print(f"Saved non-archive tex source for {paper_id} as {out_path}")
        return True


def download_pdf(paper_id, data_dir):
    pdf_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
    print(f"Downloading PDF for {paper_id} from {pdf_url}...")
    response = requests.get(pdf_url)
    if response.status_code != 200:
        print(f"Error downloading PDF for {paper_id}: HTTP {response.status_code}")
        return False
    out_path = os.path.join(data_dir, f"{paper_id}.pdf")
    with open(out_path, "wb") as f:
        f.write(response.content)
    print(f"Saved PDF for {paper_id} as {out_path}")
    return True


def main():
    parser = argparse.ArgumentParser(
        description="Download and extract arXiv LaTeX source files and PDFs only if both succeed."
    )
    parser.add_argument(
        "--url",
        type=str,
        default="https://arxiv.org/list/math/recent?skip=0&show=2000",
        help="URL of the arXiv list page to scrape (default: %(default)s)",
    )
    parser.add_argument(
        "--data_dir",
        type=str,
        default="math_data/pdfs",
        help="Directory to save downloaded files (default: %(default)s)",
    )
    args = parser.parse_args()

    if not os.path.exists(args.data_dir):
        os.makedirs(args.data_dir)

    print(f"Downloading list page from {args.url}...")
    response = requests.get(args.url)
    if response.status_code != 200:
        print(f"Error downloading list page: HTTP {response.status_code}")
        return

    # Find all PDF links of the form href="/pdf/<paper_id>".
    pattern = re.compile(r'href="/pdf/(\d+\.\d+)"')
    paper_ids = pattern.findall(response.text)
    print(f"Found {len(paper_ids)} papers.")

    # For each paper, only keep the files if both the tex extraction and the
    # PDF download succeed.
    for paper_id in tqdm(paper_ids):
        tex_success = download_and_extract_source(paper_id, args.data_dir)
        if not tex_success:
            print(f"Skipping PDF download for {paper_id} because tex extraction failed.")
            continue
        pdf_success = download_pdf(paper_id, args.data_dir)
        if not pdf_success:
            # Remove the tex file if the PDF download fails.
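            # (This rollback keeps the corpus paired: every saved .tex has a
            # matching .pdf and vice versa.)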
            tex_path = os.path.join(args.data_dir, f"{paper_id}.tex")
            if os.path.exists(tex_path):
                os.remove(tex_path)
            print(f"Removed tex file for {paper_id} because PDF download failed.")
        # Pause between papers to avoid hammering the arXiv servers.
        time.sleep(1)


if __name__ == "__main__":
    main()
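
# Example invocation (a sketch; "download_arxiv_math.py" is a hypothetical
# filename for this script, and both flags are the argparse options defined
# above, shown with their default values):
#
#   python download_arxiv_math.py \
#       --url "https://arxiv.org/list/math/recent?skip=0&show=2000" \
#       --data_dir math_data/pdfs
#
# On success, each paper ends up as a matched pair <data_dir>/<id>.pdf and
# <data_dir>/<id>.tex; papers whose source archive contains zero or several
# .tex files are skipped entirely.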