Commit 89e60e48 authored by wanglch

Initial commit
set -ex
export NCCL_DEBUG=INFO NCCL_SOCKET_IFNAME=ib NCCL_IB_HCA="^=mlx5_1,mlx5_2"
"""Benchmark offline inference throughput."""
import argparse
import base64
import json
import random
import time
from io import BytesIO
from typing import List, Optional, Tuple
import torch
import uvloop
from PIL import Image
from tqdm import tqdm
from transformers import (
AutoModelForCausalLM,
AutoProcessor,
AutoTokenizer,
PreTrainedTokenizerBase,
)
from vllm import TokensPrompt
from vllm.engine.arg_utils import DEVICE_OPTIONS, AsyncEngineArgs, EngineArgs
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args,
)
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.sampling_params import BeamSearchParams
from vllm.utils import FlexibleArgumentParser, merge_async_iterators
def sample_requests(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int],
) -> List[Tuple[str, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
# Load the dataset.
with open(dataset_path) as f:
dataset = json.load(f)
# Filter out the conversations with less than 2 turns.
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
# Only keep the first two turns of each conversation.
dataset = [(data["conversations"][0]["value"], data["conversations"][1]["value"]) for data in dataset]
# Shuffle the dataset.
random.shuffle(dataset)
# Filter out sequences that are too long or too short
filtered_dataset: List[Tuple[str, int, int]] = []
for i in range(len(dataset)):
if len(filtered_dataset) == num_requests:
break
# Tokenize the prompts and completions.
prompt = dataset[i][0]
prompt_token_ids = tokenizer(prompt).input_ids
completion = dataset[i][1]
completion_token_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids)
output_len = len(completion_token_ids) if fixed_output_len is None else fixed_output_len
if prompt_len < 4 or output_len < 4:
# Prune too short sequences.
continue
if prompt_len > 1024 or prompt_len + output_len > 2048:
# Prune too long sequences.
continue
filtered_dataset.append((prompt, prompt_len, output_len))
return filtered_dataset
def sample_mm_requests_qwen2vl(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int],
):
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
with open(dataset_path, "r") as f:
json_data = [json.loads(line) for line in f.readlines() if len(line.strip()) > 0]
result = []
for data in tqdm(json_data):
text = processor.apply_chat_template(data["chat_messages"], tokenize=False, add_generation_prompt=True)
raw_b64 = data["chat_messages"][0]["content"][1]["image_url"]["url"]
_main_image = Image.open(BytesIO(base64.b64decode(raw_b64[raw_b64.find(",") + 1 :])))
# Process inputs using processor
inputs = processor(
text=[text],
# images=[_main_image], # Don't pad out the image tokens yet, since that happens later inside of birr
padding=True,
return_tensors="np",
)
# print(inputs)
tokens = inputs["input_ids"][0]
prompt_len = len(tokens)
result.append(
(
TokensPrompt(
dict(
prompt_token_ids=tokens,
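# NOTE: the random tensor below stands in for precomputed vision embeddings rather than a real encoding of the image;
# 1036 = 74 * 56 / 4 patch tokens after the 2x2 spatial merge implied by the grid, and 3584 is assumed to be the Qwen2-VL-7B hidden size.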
multi_modal_data=dict(image=dict(image_embeds=torch.randn(1036, 3584), image_grid_thw=torch.tensor([[1, 74, 56]]))),
# multi_modal_data=dict(image=main_image)
)
),
prompt_len,
fixed_output_len,
)
)
if len(result) >= num_requests:
break
return result
def sample_mm_requests_phi3(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int],
):
processor = AutoProcessor.from_pretrained("microsoft/Phi-3.5-vision-instruct", trust_remote_code=True)
with open(dataset_path, "r") as f:
json_data = [json.loads(line) for line in f.readlines() if len(line.strip()) > 0]
result = []
for data in tqdm(json_data):
inputs = processor.tokenizer.apply_chat_template(
[{"role": "user", "content": "<|image_1|>\n" + data["chat_messages"][0]["content"][0]["text"]}], tokenize=True, add_generation_prompt=True
)
raw_b64 = data["chat_messages"][0]["content"][1]["image_url"]["url"]
main_image = Image.open(BytesIO(base64.b64decode(raw_b64[raw_b64.find(",") + 1 :])))
# tokens = inputs["input_ids"][0]
tokens = inputs
prompt_len = len(tokens)
result.append(
(
TokensPrompt(
dict(
prompt_token_ids=tokens,
multi_modal_data=dict(image=main_image),
)
),
prompt_len,
fixed_output_len,
)
)
if len(result) >= num_requests:
break
return result
def sample_mm_requests_molmo(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int],
):
processor = AutoProcessor.from_pretrained("allenai/Molmo-7B-D-0924", trust_remote_code=True, torch_dtype="auto", device_map="auto")
with open(dataset_path, "r") as f:
json_data = [json.loads(line) for line in f.readlines() if len(line.strip()) > 0]
result = []
for data in tqdm(json_data):
raw_b64 = data["chat_messages"][0]["content"][1]["image_url"]["url"]
main_image = Image.open(BytesIO(base64.b64decode(raw_b64[raw_b64.find(",") + 1 :])))
inputs = processor.process(images=[main_image], text=data["chat_messages"][0]["content"][0]["text"])
# print(inputs)
# Molmo has max size of 4096 which is lower than our dataset was generated for
tokens = inputs["input_ids"][:2000]
# tokens = inputs
prompt_len = len(tokens)
result.append(
(
TokensPrompt(
dict(
prompt_token_ids=tokens,
multi_modal_data=dict(image=main_image),
)
),
prompt_len,
fixed_output_len,
)
)
if len(result) >= num_requests:
break
return result
def run_vllm(
requests: List[Tuple[str, int, int]],
model: str,
tokenizer: str,
quantization: Optional[str],
tensor_parallel_size: int,
seed: int,
n: int,
trust_remote_code: bool,
dtype: str,
max_model_len: Optional[int],
enforce_eager: bool,
kv_cache_dtype: str,
quantization_param_path: Optional[str],
device: str,
enable_prefix_caching: bool,
enable_chunked_prefill: bool,
max_num_batched_tokens: int,
distributed_executor_backend: Optional[str],
gpu_memory_utilization: float = 0.9,
num_scheduler_steps: int = 1,
download_dir: Optional[str] = None,
load_format: str = EngineArgs.load_format,
disable_async_output_proc: bool = False,
) -> float:
from vllm import LLM, SamplingParams
llm = LLM(
model=model,
tokenizer=tokenizer,
quantization=quantization,
tensor_parallel_size=tensor_parallel_size,
seed=seed,
trust_remote_code=trust_remote_code,
dtype=dtype,
# speculative_model="[ngram]",
# num_speculative_tokens=1,
# ngram_prompt_lookup_max=5,
max_model_len=max_model_len,
gpu_memory_utilization=gpu_memory_utilization,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
quantization_param_path=quantization_param_path,
device=device,
enable_prefix_caching=enable_prefix_caching,
download_dir=download_dir,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
load_format=load_format,
num_scheduler_steps=num_scheduler_steps,
disable_async_output_proc=disable_async_output_proc,
)
# Add the requests to the engine.
prompts: List[str] = []
sampling_params: List[SamplingParams] = []
for prompt, _, output_len in requests:
prompts.append(prompt)
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
max_tokens=output_len,
)
)
use_beam_search = False
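# Hard-coded to plain sampling; the beam-search branch below is kept for reference but not exercised.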
if not use_beam_search:
start = time.perf_counter()
llm.generate(prompts, sampling_params, use_tqdm=True)
end = time.perf_counter()
else:
prompts = [prompt for prompt, _, _ in requests]
# output_len should be the same for all requests.
output_len = requests[0][2]
for prompt, input_len, _output_len in requests:
assert _output_len == output_len
start = time.perf_counter()
llm.beam_search(
prompts,
BeamSearchParams(
beam_width=n,
max_tokens=output_len,
ignore_eos=True,
),
)
end = time.perf_counter()
return end - start
async def run_vllm_async(
requests: List[Tuple[str, int, int]],
model: str,
tokenizer: str,
quantization: Optional[str],
tensor_parallel_size: int,
seed: int,
n: int,
trust_remote_code: bool,
dtype: str,
max_model_len: Optional[int],
enforce_eager: bool,
kv_cache_dtype: str,
quantization_param_path: Optional[str],
device: str,
enable_prefix_caching: bool,
enable_chunked_prefill: bool,
max_num_batched_tokens: int,
distributed_executor_backend: Optional[str],
gpu_memory_utilization: float = 0.9,
num_scheduler_steps: int = 1,
download_dir: Optional[str] = None,
load_format: str = EngineArgs.load_format,
disable_async_output_proc: bool = False,
disable_frontend_multiprocessing: bool = False,
) -> float:
from vllm import SamplingParams
engine_args = AsyncEngineArgs(
model=model,
tokenizer=tokenizer,
quantization=quantization,
tensor_parallel_size=tensor_parallel_size,
seed=seed,
trust_remote_code=trust_remote_code,
dtype=dtype,
max_model_len=max_model_len,
gpu_memory_utilization=gpu_memory_utilization,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
quantization_param_path=quantization_param_path,
device=device,
enable_prefix_caching=enable_prefix_caching,
download_dir=download_dir,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
load_format=load_format,
num_scheduler_steps=num_scheduler_steps,
disable_async_output_proc=disable_async_output_proc,
worker_use_ray=False,
disable_log_requests=True,
)
async with build_async_engine_client_from_engine_args(engine_args, disable_frontend_multiprocessing) as llm:
# Add the requests to the engine.
prompts: List[str] = []
sampling_params: List[SamplingParams] = []
for prompt, _, output_len in requests:
prompts.append(prompt)
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
max_tokens=output_len,
)
)
generators = []
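# Submit every request up front, then drain the merged output streams to completion before stopping the timer.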
start = time.perf_counter()
for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
generator = llm.generate(prompt, sp, request_id=f"test{i}")
generators.append(generator)
all_gens = merge_async_iterators(*generators)
async for i, res in all_gens:
pass
end = time.perf_counter()
return end - start
def run_hf(
requests: List[Tuple[str, int, int]],
model: str,
tokenizer: PreTrainedTokenizerBase,
n: int,
max_batch_size: int,
trust_remote_code: bool,
) -> float:
llm = AutoModelForCausalLM.from_pretrained(model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
if llm.config.model_type == "llama":
# To enable padding in the HF backend.
tokenizer.pad_token = tokenizer.eos_token
llm = llm.cuda()
pbar = tqdm(total=len(requests))
start = time.perf_counter()
batch: List[str] = []
max_prompt_len = 0
max_output_len = 0
for i in range(len(requests)):
prompt, prompt_len, output_len = requests[i]
# Add the prompt to the batch.
batch.append(prompt)
max_prompt_len = max(max_prompt_len, prompt_len)
max_output_len = max(max_output_len, output_len)
if len(batch) < max_batch_size and i != len(requests) - 1:
# Check if we can add more requests to the batch.
_, next_prompt_len, next_output_len = requests[i + 1]
if (max(max_prompt_len, next_prompt_len) + max(max_output_len, next_output_len)) <= 2048:
# We can add more requests to the batch.
continue
# Generate the sequences.
input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids
llm_outputs = llm.generate(
input_ids=input_ids.cuda(),
do_sample=True,
num_return_sequences=n,
temperature=1.0,
top_p=1.0,
use_cache=True,
max_new_tokens=max_output_len,
)
# Include the decoding time.
tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
pbar.update(len(batch))
# Clear the batch.
batch = []
max_prompt_len = 0
max_output_len = 0
end = time.perf_counter()
return end - start
def run_mii(
requests: List[Tuple[str, int, int]],
model: str,
tensor_parallel_size: int,
output_len: int,
) -> float:
from mii import client, serve
llm = serve(model, tensor_parallel=tensor_parallel_size)
prompts = [prompt for prompt, _, _ in requests]
start = time.perf_counter()
llm.generate(prompts, max_new_tokens=output_len)
end = time.perf_counter()
client = client(model)
client.terminate_server()
return end - start
def main(args: argparse.Namespace):
print(args)
random.seed(args.seed)
# Sample the requests.
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, trust_remote_code=args.trust_remote_code)
if args.dataset is None:
# Synthesize a prompt with the given input length.
prompt = "hi" * (args.input_len - 1)
requests = [(prompt, args.input_len, args.output_len) for _ in range(args.num_prompts)]
else:
# requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
# args.output_len)
requests = sample_mm_requests_qwen2vl(args.dataset, args.num_prompts, tokenizer, args.output_len)
if args.backend == "vllm":
run_args = [
requests,
args.model,
args.tokenizer,
args.quantization,
args.tensor_parallel_size,
args.seed,
args.n,
args.trust_remote_code,
args.dtype,
args.max_model_len,
args.enforce_eager,
args.kv_cache_dtype,
args.quantization_param_path,
args.device,
args.enable_prefix_caching,
args.enable_chunked_prefill,
args.max_num_batched_tokens,
args.distributed_executor_backend,
args.gpu_memory_utilization,
args.num_scheduler_steps,
args.download_dir,
args.load_format,
args.disable_async_output_proc,
]
if args.async_engine:
run_args.append(args.disable_frontend_multiprocessing)
elapsed_time = uvloop.run(run_vllm_async(*run_args))
else:
elapsed_time = run_vllm(*run_args)
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
elapsed_time = run_hf(requests, args.model, tokenizer, args.n, args.hf_max_batch_size, args.trust_remote_code)
elif args.backend == "mii":
elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size, args.output_len)
else:
raise ValueError(f"Unknown backend: {args.backend}")
total_num_tokens = sum(prompt_len + output_len for _, prompt_len, output_len in requests)
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " f"{total_num_tokens / elapsed_time:.2f} tokens/s")
# Output JSON results if specified
if args.output_json:
results = {
"elapsed_time": elapsed_time,
"num_requests": len(requests),
"total_num_tokens": total_num_tokens,
"requests_per_second": len(requests) / elapsed_time,
"tokens_per_second": total_num_tokens / elapsed_time,
}
with open(args.output_json, "w") as f:
json.dump(results, f, indent=4)
if __name__ == "__main__":
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
parser.add_argument("--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm")
parser.add_argument("--dataset", type=str, default=None, help="Path to the dataset.")
parser.add_argument("--input-len", type=int, default=None, help="Input prompt length for each request")
parser.add_argument("--output-len", type=int, default=None, help="Output length for each request. Overrides the " "output length from the dataset.")
parser.add_argument("--model", type=str, default="facebook/opt-125m")
parser.add_argument("--tokenizer", type=str, default=None)
parser.add_argument("--quantization", "-q", choices=[*QUANTIZATION_METHODS, None], default=None)
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
parser.add_argument("--n", type=int, default=1, help="Number of generated sequences per prompt.")
parser.add_argument("--num-prompts", type=int, default=1000, help="Number of prompts to process.")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--hf-max-batch-size", type=int, default=None, help="Maximum batch size for HF backend.")
parser.add_argument("--trust-remote-code", action="store_true", help="trust remote code from huggingface")
parser.add_argument(
"--max-model-len",
type=int,
default=None,
help="Maximum length of a sequence (including prompt and output). " "If None, will be derived from the model.",
)
parser.add_argument(
"--dtype",
type=str,
default="auto",
choices=["auto", "half", "float16", "bfloat16", "float", "float32"],
help="data type for model weights and activations. "
'The "auto" option will use FP16 precision '
"for FP32 and FP16 models, and BF16 precision "
"for BF16 models.",
)
parser.add_argument(
"--gpu-memory-utilization",
type=float,
default=0.9,
help="the fraction of GPU memory to be used for "
"the model executor, which can range from 0 to 1."
"If unspecified, will use the default value of 0.9.",
)
parser.add_argument("--enforce-eager", action="store_true", help="enforce eager execution")
parser.add_argument(
"--kv-cache-dtype",
type=str,
choices=["auto", "fp8", "fp8_e5m2", "fp8_e4m3"],
default="auto",
help='Data type for kv cache storage. If "auto", will use model '
"data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. "
"ROCm (AMD GPU) supports fp8 (=fp8_e4m3)",
)
parser.add_argument(
"--quantization-param-path",
type=str,
default=None,
help="Path to the JSON file containing the KV cache scaling factors. "
"This should generally be supplied, when KV cache dtype is FP8. "
"Otherwise, KV cache scaling factors default to 1.0, which may cause "
"accuracy issues. FP8_E5M2 (without scaling) is only supported on "
"cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is "
"instead supported for common inference criteria.",
)
parser.add_argument("--device", type=str, default="auto", choices=DEVICE_OPTIONS, help="device type for vLLM execution")
parser.add_argument("--num-scheduler-steps", type=int, default=1, help="Maximum number of forward steps per scheduler call.")
parser.add_argument("--enable-prefix-caching", action="store_true", help="Enable automatic prefix caching for vLLM backend.")
parser.add_argument("--enable-chunked-prefill", action="store_true", help="enable chunked prefill for vLLM backend.")
parser.add_argument("--max-num-batched-tokens", type=int, default=None, help="maximum number of batched tokens per " "iteration")
parser.add_argument(
"--download-dir", type=str, default=None, help="directory to download and load the weights, " "default to the default cache dir of huggingface"
)
parser.add_argument("--output-json", type=str, default=None, help="Path to save the throughput results in JSON format.")
parser.add_argument(
"--distributed-executor-backend",
choices=["ray", "mp"],
default=None,
help="Backend to use for distributed serving. When more than 1 GPU "
'is used, will be automatically set to "ray" if installed '
'or "mp" (multiprocessing) otherwise.',
)
parser.add_argument(
"--load-format",
type=str,
default=EngineArgs.load_format,
choices=["auto", "pt", "safetensors", "npcache", "dummy", "tensorizer", "bitsandbytes"],
help="The format of the model weights to load.\n\n"
'* "auto" will try to load the weights in the safetensors format '
"and fall back to the pytorch bin format if safetensors format "
"is not available.\n"
'* "pt" will load the weights in the pytorch bin format.\n'
'* "safetensors" will load the weights in the safetensors format.\n'
'* "npcache" will load the weights in pytorch format and store '
"a numpy cache to speed up the loading.\n"
'* "dummy" will initialize the weights with random values, '
"which is mainly for profiling.\n"
'* "tensorizer" will load the weights using tensorizer from '
"CoreWeave. See the Tensorize vLLM Model script in the Examples"
"section for more information.\n"
'* "bitsandbytes" will load the weights using bitsandbytes '
"quantization.\n",
)
parser.add_argument("--disable-async-output-proc", action="store_true", default=False, help="Disable async output processor for vLLM backend.")
parser.add_argument("--async-engine", action="store_true", default=False, help="Use vLLM async engine rather than LLM class.")
parser.add_argument("--disable-frontend-multiprocessing", action="store_true", default=False, help="Disable decoupled async engine frontend.")
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
if args.dataset is None:
assert args.input_len is not None
assert args.output_len is not None
else:
assert args.input_len is None
if args.backend == "vllm":
if args.hf_max_batch_size is not None:
raise ValueError("HF max batch size is only for HF backend.")
elif args.backend == "hf":
if args.hf_max_batch_size is None:
raise ValueError("HF max batch size is required for HF backend.")
if args.quantization is not None:
raise ValueError("Quantization is only for vLLM backend.")
elif args.backend == "mii":
if args.dtype != "auto":
raise ValueError("dtype must be auto for MII backend.")
if args.n != 1:
raise ValueError("n must be 1 for MII backend.")
if args.quantization is not None:
raise ValueError("Quantization is only for vLLM backend.")
if args.hf_max_batch_size is not None:
raise ValueError("HF max batch size is only for HF backend.")
if args.tokenizer != args.model:
raise ValueError("Tokenizer must be the same as the model for MII " "backend.")
main(args)
model:
# full fine tune
name_or_path: weka://oe-data-default/jakep/Qwen_Qwen2-VL-7B-Instruct-e4ecf8-01JAH8GMWHTJ376S2N7ETXRXH4/best_bf16/
#name_or_path: s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/jakep/Qwen_Qwen2-VL-7B-Instruct-e4ecf8-01JAH8GMWHTJ376S2N7ETXRXH4/checkpoint-9500/bf16/
vlm: true
# necessary to prevent random crashes, until vllm fixes some bugs
num_scheduler_steps: 1
format:
add_generation_prompt: true
generate:
# The model's max context length is 8096, but around 1500 tokens are reserved for the image itself
max_context_length: 6500
temperature: 0.8
top_p: 1.0
drop_long_outputs: false
pipeline:
sqs_queue_name: jake-pdf
num_workers: 3
generation_batch_size: 256
tokenization_batch_size: 64
output_serializer: default
target_bucket: ai2-oe-data
target_object_prefix: [your username]/pdfworkspaces/s2orc_3200k_v2/inference_outputs
allowed_restarts_per_predictor: 10
task:
budget: ai2/oe-data
workspace: ai2/oe-data-model-based-cleanup
name: qwen2vl-schedsteps-bg
replicas: 128
priority: LOW
gpu_count: 1
cluster:
- ai2/jupiter-cirrascale-2
- ai2/saturn-cirrascale
#!/bin/bash
set -e
VERSION=$(python -c 'import olmocr.version; print(olmocr.version.VERSION)')
echo "$VERSION"
docker build --platform linux/amd64 -f ./scripts/beaker/Dockerfile-inference -t olmocr-inference-$VERSION .
beaker image create --workspace ai2/oe-data-pdf --name olmocr-inference-$VERSION olmocr-inference-$VERSION
# Elo rating
Calculates Elo ratings of olmOCR vs. other tools.
## Data
The pairwise judgment data is stored in `ratings.csv` as win/loss counts:
```
MethodA,MethodB,A_wins,B_wins,A_rate(%),B_rate(%)
marker,mineru,53,26,67.1,32.9
mineru,pdelf,22,55,28.6,71.4
gotocr_format,marker,26,45,36.6,63.4
marker,pdelf,31,49,38.8,61.3
gotocr_format,pdelf,29,41,41.4,58.6
gotocr_format,mineru,38,37,50.7,49.3
```
*Note*: `pdelf` is olmOCR.
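Ratings are computed by replaying each recorded win as an individual Elo match. As a minimal sketch of the update rule (mirroring `update_single_match` in `calculate_elo_ratings.py`, with the script's defaults of K = 32 and an initial rating of 1500):

```python
def update_single_match(rating_a, rating_b, actual_score, k_factor=32):
    # Standard Elo update: expected score from the rating gap, then a K-weighted correction.
    expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
    new_a = rating_a + k_factor * (actual_score - expected_a)
    new_b = rating_b + k_factor * ((1 - actual_score) - (1 - expected_a))
    return new_a, new_b

# Two methods start at 1500; a single win for method A moves the pair to (1516.0, 1484.0).
print(update_single_match(1500, 1500, actual_score=1))
```

The reported numbers additionally bootstrap over the rows of `ratings.csv` to obtain confidence intervals and pairwise p-values.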
## Usage
To calculate Elo ratings, run the following command:
```bash
python calculate_elo_ratings.py ratings.csv --num-bootstrap 5000 --num-elo-sims 100 --confidence-level 95 --seed 123
```
It should print something like:
```
Bootstrapped Elo Ratings (95% CI):
--------------------------------------------------
pdelf 1813.0 ± 84.9 [1605.9, 1930.0]
mineru 1545.2 ± 99.7 [1336.7, 1714.1]
marker 1429.1 ± 100.7 [1267.6, 1645.5]
gotocr_format 1212.7 ± 82.0 [1097.3, 1408.3]
Pairwise Significance Tests:
--------------------------------------------------
gotocr_format vs marker Δ = -216.3 [-470.8, 135.0] p = 0.218
gotocr_format vs mineru Δ = -332.5 [-567.5, 19.3] p = 0.051
gotocr_format vs pdelf Δ = -600.3 [-826.1, -344.3] p = 0.000*
marker vs mineru Δ = -116.1 [-365.4, 246.5] p = 0.430
marker vs pdelf Δ = -383.9 [-610.6, -10.9] p = 0.044*
mineru vs pdelf Δ = -267.8 [-517.3, 104.0] p = 0.135
```
which is also saved in `results.txt`.
To generate boxplots of the Elo ratings, run the following command:
```bash
python draw_boxplots.py results.txt boxplots.png
```
which should save boxplots as `boxplots.png`.
"""
Elo ratings for olmOCR vs baselines.
See data at scripts/elo/ratings.csv
MethodA,MethodB,A_wins,B_wins,A_rate(%),B_rate(%)
marker,mineru,53,26,67.1,32.9
mineru,pdelf,22,55,28.6,71.4
gotocr_format,marker,26,45,36.6,63.4
marker,pdelf,31,49,38.8,61.3
gotocr_format,pdelf,29,41,41.4,58.6
gotocr_format,mineru,38,37,50.7,49.3
Invoke via
python calculate_elo_ratings.py ratings.csv --num-bootstrap 5000 --num-elo-sims 100 --confidence-level 95 --seed 123
Output:
Bootstrapped Elo Ratings (95% CI):
--------------------------------------------------
pdelf 1813.0 ± 84.9 [1605.9, 1930.0]
mineru 1545.2 ± 99.7 [1336.7, 1714.1]
marker 1429.1 ± 100.7 [1267.6, 1645.5]
gotocr_format 1212.7 ± 82.0 [1097.3, 1408.3]
Pairwise Significance Tests:
--------------------------------------------------
gotocr_format vs marker Δ = -216.3 [-470.8, 135.0] p = 0.218
gotocr_format vs mineru Δ = -332.5 [-567.5, 19.3] p = 0.051
gotocr_format vs pdelf Δ = -600.3 [-826.1, -344.3] p = 0.000*
marker vs mineru Δ = -116.1 [-365.4, 246.5] p = 0.430
marker vs pdelf Δ = -383.9 [-610.6, -10.9] p = 0.044*
mineru vs pdelf Δ = -267.8 [-517.3, 104.0] p = 0.135
@kylel
"""
import random
from itertools import combinations
import click
import numpy as np
import pandas as pd
from tqdm import tqdm
def calculate_elo(matches_data, all_methods, k_factor=32, initial_rating=1500, n_replications=10, random_state=None):
"""Calculate Elo ratings with multiple replications per dataset"""
all_ratings = {method: [] for method in all_methods}
for _ in range(n_replications):
matches = matches_data.sample(frac=1, replace=False, random_state=random_state).reset_index(drop=True)
ratings = {method: initial_rating for method in all_methods}
for _, row in matches.iterrows():
method_a, method_b = row["MethodA"], row["MethodB"]
a_wins, b_wins = row["A_wins"], row["B_wins"]
for _ in range(int(a_wins)):
ra, rb = update_single_match(ratings[method_a], ratings[method_b], 1, k_factor)
ratings[method_a], ratings[method_b] = ra, rb
for _ in range(int(b_wins)):
ra, rb = update_single_match(ratings[method_a], ratings[method_b], 0, k_factor)
ratings[method_a], ratings[method_b] = ra, rb
for method in all_methods:
all_ratings[method].append(ratings[method])
return {method: np.mean(ratings) for method, ratings in all_ratings.items()}
def update_single_match(rating_a, rating_b, actual_score, k_factor):
"""Update ratings for a single match"""
expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
new_rating_a = rating_a + k_factor * (actual_score - expected_a)
new_rating_b = rating_b + k_factor * ((1 - actual_score) - (1 - expected_a))
return new_rating_a, new_rating_b
def bootstrap_elo_and_tests(df, num_bootstrap=1000, num_elo_sims=10, confidence_level=95, k_factor=32, initial_rating=1500, random_state=None):
"""Calculate bootstrapped Elo ratings with confidence intervals and perform pairwise significance tests"""
ci_lower = (100 - confidence_level) / 2
ci_upper = 100 - ci_lower
all_methods = set(df["MethodA"].unique()) | set(df["MethodB"].unique())
bootstrap_ratings = {method: [] for method in all_methods}
for _ in tqdm(range(num_bootstrap)):
bootstrap_sample = df.sample(n=len(df), replace=True, random_state=random_state)
ratings = calculate_elo(bootstrap_sample, all_methods, k_factor, initial_rating, num_elo_sims)
for method in all_methods:
bootstrap_ratings[method].append(ratings[method])
# Calculate statistics and perform significance tests
results = {}
# Basic statistics
for method in all_methods:
ratings_array = np.array(bootstrap_ratings[method])
results[method] = {
"mean": np.mean(ratings_array),
"std": np.std(ratings_array),
"ci_lower": np.percentile(ratings_array, ci_lower),
"ci_upper": np.percentile(ratings_array, ci_upper),
"bootstrap_samples": ratings_array, # Store for significance testing
}
# Pairwise significance tests
significance_tests = {}
for method1, method2 in combinations(all_methods, 2):
# Calculate difference distribution
diff_distribution = results[method1]["bootstrap_samples"] - results[method2]["bootstrap_samples"]
# Calculate p-value (two-tailed test)
p_value = 2 * min(np.mean(diff_distribution >= 0), np.mean(diff_distribution <= 0))
# Store results
significance_tests[(method1, method2)] = {
"diff_mean": np.mean(diff_distribution),
"diff_ci_lower": np.percentile(diff_distribution, ci_lower),
"diff_ci_upper": np.percentile(diff_distribution, ci_upper),
"p_value": p_value,
}
return results, significance_tests
@click.command()
@click.argument("ratings_file", type=click.Path(exists=True))
@click.option("--num-bootstrap", default=1000, help="Number of bootstrap iterations")
@click.option("--num-elo-sims", default=10, help="Number of ELO simulations per bootstrap")
@click.option("--confidence-level", default=95, help="Confidence level for intervals (in percent)")
@click.option("--seed", default=42, help="Random seed for reproducibility")
def main(ratings_file, num_bootstrap, num_elo_sims, confidence_level, seed):
# Set random seed
random.seed(seed)
np.random.seed(seed)
# Load data
df = pd.read_csv(ratings_file)
# Calculate bootstrapped Elo ratings
results, significance_tests = bootstrap_elo_and_tests(df, num_bootstrap=num_bootstrap, num_elo_sims=num_elo_sims)
# Sort and display results
print(f"\nBootstrapped Elo Ratings ({confidence_level}% CI):")
print("-" * 50)
sorted_results = dict(sorted(results.items(), key=lambda x: x[1]["mean"], reverse=True))
for method, stats in sorted_results.items():
print(f"{method:12} {stats['mean']:6.1f} ± {stats['std']:4.1f} [{stats['ci_lower']:6.1f}, {stats['ci_upper']:6.1f}]")
# Display pairwise significance tests
print("\nPairwise Significance Tests:")
print("-" * 50)
for (method1, method2), stats in significance_tests.items():
sig_marker = "*" if stats["p_value"] < (1 - confidence_level / 100) else " "
print(
f"{method1:12} vs {method2:12} Δ = {stats['diff_mean']:6.1f} "
+ f"[{stats['diff_ci_lower']:6.1f}, {stats['diff_ci_upper']:6.1f}] "
+ f"p = {stats['p_value']:.3f}{sig_marker}"
)
if __name__ == "__main__":
main()
"""
Boxplots of Elo ratings with 95% confidence intervals for each method.
Invocation:
python draw_boxplots.py results.txt boxplots.png
@kylel
"""
import hashlib
import re
from pathlib import Path
import click
import matplotlib.font_manager as font_manager
import matplotlib.pyplot as plt
import numpy as np
import requests
# AI2 Colors
AI2_PINK = "#f0529c"
AI2_DARK_TEAL = "#0a3235"
AI2_TEAL = "#105257"
# Name mappings
NAME_DISPLAY_MAP = {"pdelf": "olmOCR", "mineru": "MinerU", "marker": "Marker", "gotocr_format": "GOTOCR"}
def download_and_cache_file(url, cache_dir=None):
"""Download a file and cache it locally."""
if cache_dir is None:
cache_dir = Path.home() / ".cache" / "elo_plot"
cache_dir = Path(cache_dir)
cache_dir.mkdir(parents=True, exist_ok=True)
# Create filename from URL hash
url_hash = hashlib.sha256(url.encode()).hexdigest()[:12]
file_name = url.split("/")[-1]
cached_path = cache_dir / f"{url_hash}_{file_name}"
if not cached_path.exists():
response = requests.get(url, stream=True)
response.raise_for_status()
with open(cached_path, "wb") as f:
for chunk in response.iter_content(chunk_size=8192):
f.write(chunk)
return str(cached_path)
def parse_elo_data(file_path):
"""Parse Elo ratings data from a text file."""
with open(file_path, "r") as f:
content = f.read()
# Regular expression to match the data lines
pattern = r"(\w+)\s+(\d+\.\d+)\s*±\s*(\d+\.\d+)\s*\[(\d+\.\d+),\s*(\d+\.\d+)\]"
matches = re.finditer(pattern, content)
# Initialize lists to store data
names = []
medians = []
errors = []
ci_low = []
ci_high = []
for match in matches:
names.append(match.group(1))
medians.append(float(match.group(2)))
errors.append(float(match.group(3)))
ci_low.append(float(match.group(4)))
ci_high.append(float(match.group(5)))
return names, medians, errors, ci_low, ci_high
def create_boxplot(names, medians, errors, ci_low, ci_high, output_path, font_path):
"""Create and save a boxplot of Elo ratings."""
# Set up Manrope font
font_manager.fontManager.addfont(font_path)
plt.rcParams["font.family"] = "Manrope"
plt.rcParams["font.weight"] = "medium"
# Define colors - pdelf in pink, others in shades of teal/grey based on performance
max_median = max(medians)
colors = []
for i, median in enumerate(medians):
if names[i] == "pdelf":
colors.append(AI2_PINK)
else:
# Calculate a shade between dark teal and grey based on performance
performance_ratio = (median - min(medians)) / (max_median - min(medians))
base_color = np.array(tuple(int(AI2_DARK_TEAL[i : i + 2], 16) for i in (1, 3, 5))) / 255.0
grey = np.array([0.7, 0.7, 0.7]) # Light grey
color = tuple(np.clip(base_color * performance_ratio + grey * (1 - performance_ratio), 0, 1))
colors.append(color)
# Create box plot data
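# Each method gets five summary points (CI low, mean - std, mean, mean + std, CI high);
# plt.boxplot then derives its box and whiskers from these five values.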
box_data = []
for i in range(len(names)):
q1 = medians[i] - errors[i]
q3 = medians[i] + errors[i]
box_data.append([ci_low[i], q1, medians[i], q3, ci_high[i]])
# Create box plot with smaller width and spacing
plt.figure(figsize=(4, 4))
bp = plt.boxplot(
box_data,
labels=[NAME_DISPLAY_MAP[name] for name in names],
whis=1.5,
patch_artist=True,
widths=0.15, # Make boxes much narrower
medianprops=dict(color="black"), # Make median line black
positions=np.arange(len(names)) * 0.25,
) # Reduce spacing between boxes significantly
# Color each box
for patch, color in zip(bp["boxes"], colors):
patch.set_facecolor(color)
patch.set_alpha(0.8)
# Style the plot
# plt.ylabel("Elo Rating", fontsize=12, color=AI2_DARK_TEAL)
plt.xticks(
np.arange(len(names)) * 0.25, # Match positions from boxplot
[NAME_DISPLAY_MAP[name] for name in names],
rotation=45,
ha="right",
color=AI2_DARK_TEAL,
)
plt.yticks(color=AI2_DARK_TEAL)
# Set x-axis limits to maintain proper spacing
plt.xlim(-0.1, (len(names) - 1) * 0.25 + 0.1)
# Remove the title and adjust the layout
plt.tight_layout()
# Remove spines
for spine in plt.gca().spines.values():
spine.set_visible(False)
# Add left spine only
plt.gca().spines["left"].set_visible(True)
plt.gca().spines["left"].set_color(AI2_DARK_TEAL)
plt.gca().spines["left"].set_linewidth(0.5)
# Add bottom spine only
plt.gca().spines["bottom"].set_visible(True)
plt.gca().spines["bottom"].set_color(AI2_DARK_TEAL)
plt.gca().spines["bottom"].set_linewidth(0.5)
plt.savefig(output_path, dpi=300, bbox_inches="tight", transparent=True)
plt.close()
@click.command()
@click.argument("input_file", type=click.Path(exists=True))
@click.argument("output_file", type=click.Path())
@click.option(
"--manrope-medium-font-path",
type=str,
default="https://dolma-artifacts.org/Manrope-Medium.ttf",
help="Path to the Manrope Medium font file (local path or URL)",
)
def main(input_file, output_file, manrope_medium_font_path):
"""Generate a boxplot from Elo ratings data.
INPUT_FILE: Path to the text file containing Elo ratings data
OUTPUT_FILE: Path where the plot should be saved
"""
try:
# Handle font path - download and cache if it's a URL
if manrope_medium_font_path.startswith(("http://", "https://")):
font_path = download_and_cache_file(manrope_medium_font_path)
else:
font_path = manrope_medium_font_path
# Parse the data
names, medians, errors, ci_low, ci_high = parse_elo_data(input_file)
# Create and save the plot
create_boxplot(names, medians, errors, ci_low, ci_high, output_file, font_path)
click.echo(f"Plot successfully saved to {output_file}")
except Exception as e:
click.echo(f"Error: {str(e)}", err=True)
raise click.Abort()
if __name__ == "__main__":
main()
MethodA,MethodB,A_wins,B_wins,A_rate(%),B_rate(%)
marker,mineru,53,26,67.1,32.9
mineru,pdelf,22,55,28.6,71.4
gotocr_format,marker,26,45,36.6,63.4
marker,pdelf,31,49,38.8,61.3
gotocr_format,pdelf,29,41,41.4,58.6
gotocr_format,mineru,38,37,50.7,49.3
Bootstrapped Elo Ratings (95% CI):
--------------------------------------------------
pdelf 1813.0 ± 84.9 [1605.9, 1930.0]
mineru 1545.2 ± 99.7 [1336.7, 1714.1]
marker 1429.1 ± 100.7 [1267.6, 1645.5]
gotocr_format 1212.7 ± 82.0 [1097.3, 1408.3]
Pairwise Significance Tests:
--------------------------------------------------
gotocr_format vs marker Δ = -216.3 [-470.8, 135.0] p = 0.218
gotocr_format vs mineru Δ = -332.5 [-567.5, 19.3] p = 0.051
gotocr_format vs pdelf Δ = -600.3 [-826.1, -344.3] p = 0.000*
marker vs mineru Δ = -116.1 [-365.4, 246.5] p = 0.430
marker vs pdelf Δ = -383.9 [-610.6, -10.9] p = 0.044*
mineru vs pdelf Δ = -267.8 [-517.3, 104.0] p = 0.135
#!/usr/bin/env python3
import argparse
import json
import random
import re
import time
import boto3
import requests
from tqdm import tqdm
from transformers import AutoTokenizer
# Allowed characters: alphanumeric, space, and basic punctuation ".,!?()"
ALLOWED_RE = re.compile(r"^[A-Za-z0-9\.,!?() ]+$")
def get_random_line_from_s3(bucket, key):
"""
Reads an S3 object line-by-line and returns a random line using reservoir sampling.
"""
s3 = boto3.client("s3")
response = s3.get_object(Bucket=bucket, Key=key)
random_line = None
count = 0
for line in response["Body"].iter_lines():
if not line:
continue
line_str = line.decode("utf-8")
count += 1
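# Reservoir sampling: keep the current line with probability 1/count, so every
# line in the object is equally likely to be returned.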
if random.randint(1, count) == 1:
random_line = line_str
return random_line
def query_infinigram(ngram, index="v4_rpj_llama_s4", retries=3):
"""
Sends a count query to the infini-gram API for the given n-gram.
Retries a few times in case of network issues.
"""
url = "https://api.infini-gram.io/"
payload = {
"index": index,
"query_type": "count",
"query": ngram,
}
for i in range(retries):
try:
response = requests.post(url, json=payload, timeout=10)
if response.status_code == 200:
result = response.json()
if "count" in result:
return result["count"]
except Exception: # type: ignore
time.sleep(1)
return 0
def process_document(doc, tokenizer, ngram_size, num_samples, index="v4_rpj_llama_s4"):
"""
Tokenizes the document using the Llama2 tokenizer and samples random n-grams.
Each n-gram is chosen such that:
1. It starts on a word-split boundary (using the offset mapping and a check on the preceding character).
2. Its decoded string contains only alphanumeric characters, spaces, and the punctuation marks ".,!?()".
Each valid n-gram is then queried using the infini-gram API.
The function returns the document id, the number of matching n-grams (i.e. API count > 0),
the total number of valid n-grams sampled, and a list of tuples (flag, ngram_string).
"""
text = doc.get("text", "")
doc_id = doc.get("id", "Unknown")
# Get tokenized representation with offset mapping to determine word boundaries.
tokenized = tokenizer(text, add_special_tokens=False, return_offsets_mapping=True)
token_ids = tokenized["input_ids"]
# offsets = tokenized["offset_mapping"]
if len(token_ids) < ngram_size:
return doc_id, 0, 0, []
# Determine valid starting indices based on word-split boundaries.
valid_positions = []
# for i in range(len(token_ids) - ngram_size + 1):
# start_offset = offsets[i][0]
# if start_offset == 0 or (start_offset > 0 and text[start_offset - 1] == " "):
# valid_positions.append(i)
if not valid_positions:
# Fallback: if no valid positions are found, use all possible positions.
valid_positions = list(range(len(token_ids) - ngram_size + 1))
valid_ngram_details = []
attempts = 0
max_attempts = num_samples * 10 # Limit to prevent infinite loops.
while len(valid_ngram_details) < num_samples and attempts < max_attempts:
idx = random.choice(valid_positions)
ngram_token_ids = token_ids[idx : idx + ngram_size]
ngram_str = tokenizer.decode(ngram_token_ids, clean_up_tokenization_spaces=True)
# Only accept n-grams that contain only allowed characters.
if ALLOWED_RE.fullmatch(ngram_str) and len(ngram_str.strip()) > ngram_size * 3:
count = query_infinigram(ngram_str, index=index)
flag = "YES" if count > 0 else "NO"
valid_ngram_details.append((flag, ngram_str))
attempts += 1
match_count = sum(1 for flag, _ in valid_ngram_details if flag == "YES")
sample_count = len(valid_ngram_details)
return doc_id, match_count, sample_count, valid_ngram_details
def main():
parser = argparse.ArgumentParser(description="Infini-gram n-gram matching script with Llama2 tokenization.")
parser.add_argument("N", type=int, help="Number of random .jsonl files to process")
parser.add_argument("s3_path", type=str, help="S3 path to a prefix containing .jsonl files (e.g., s3://my-bucket/my-prefix/)")
parser.add_argument("--index", type=str, default="v4_dolma-v1_7_llama", help="Infini-gram index to use (default: v4_rpj_llama_s4)")
parser.add_argument("--ngram_size", type=int, default=10, help="Size of the n-gram to sample (default: 10)")
parser.add_argument("--num_ngrams", type=int, default=100, help="Number of random n-grams to sample from each document (default: 100)")
args = parser.parse_args()
if not args.s3_path.startswith("s3://"):
print("Error: s3_path must start with 's3://'")
return
path_without_scheme = args.s3_path[5:]
parts = path_without_scheme.split("/", 1)
bucket = parts[0]
prefix = parts[1] if len(parts) > 1 else ""
print("Listing .jsonl files from S3...")
s3 = boto3.client("s3")
response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
files = [obj["Key"] for obj in response.get("Contents", []) if obj["Key"].endswith(".jsonl")]
if not files:
print("No .jsonl files found in the given prefix.")
return
if args.N > len(files):
print(f"Requested {args.N} files, but only found {len(files)}. Processing all available files.")
args.N = len(files)
random_files = random.sample(files, args.N)
print("Loading Llama2 tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
total_matches = 0
total_ngrams_sampled = 0
for key in tqdm(random_files, desc="Processing files"):
line = get_random_line_from_s3(bucket, key)
if not line:
print(f"Skipping {key}: No valid lines found.")
continue
try:
doc = json.loads(line)
except Exception as e:
print(f"Error parsing JSON in {key}: {e}")
continue
doc_id, match_count, sample_count, details = process_document(doc, tokenizer, args.ngram_size, args.num_ngrams, index=args.index)
# Print per-document n-gram summary
print(f"\nDocument ID: {doc_id}")
for flag, ngram in details:
# Print the flag in a fixed-width field (4 characters) followed by the n-gram representation.
print(f"{flag:4} {repr(ngram)}")
percentage = (match_count / sample_count * 100) if sample_count else 0
print(f"Matched n-grams: {match_count}/{sample_count} ({percentage:.2f}%)")
total_matches += match_count
total_ngrams_sampled += sample_count
overall_percentage = (total_matches / total_ngrams_sampled * 100) if total_ngrams_sampled else 0
print(f"\nTotal matched n-grams: {total_matches}/{total_ngrams_sampled} ({overall_percentage:.2f}%)")
if __name__ == "__main__":
main()
#!/usr/bin/env bash
set -ex
# check if jq is installed
if ! command -v jq &> /dev/null
then
echo "jq could not be found. Please install it."
exit 1
fi
EXTRA_ARGS="-c olmocr/train/config/molmo-o-lora-8192.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/molmo-pdf/v1/models/\${BEAKER_USER_ID}\""
run_name=$(basename "$0" .sh)
# --cluster 'ai2/jupiter*' \
# --cluster 'ai2/pluto*' \
# --cluster 'ai2/allennlp-cirrascale' \
# --priority high \
CLUSTER='jupiter'
gantry run \
--description "${run_name}-8192"\
--task-name "${run_name}-8192"\
--allow-dirty \
--host-networking \
--workspace ai2/oe-data-model-based-cleanup \
--beaker-image 'jakep/jakep-pdf-finetunev1.2' \
--venv 'base' \
--pip gantry-requirements.txt \
--priority high \
--gpus 8 \
--cluster "ai2/${CLUSTER}*" \
--budget ai2/oe-data \
--weka "oe-data-default:/data" \
--env LOG_FILTER_TYPE=local_rank0_only \
--env OMP_NUM_THREADS=8 \
--env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \
--env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \
--env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \
--env-secret DS_AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \
--env-secret DS_AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \
--env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \
--shared-memory 10GiB \
--yes \
-- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && python -m olmocr.train.loaddataset ${EXTRA_ARGS} && accelerate launch --multi_gpu --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --mixed_precision bf16 -m olmocr.train.train ${EXTRA_ARGS}"
#!/usr/bin/env python3
import argparse
import json
import os
from urllib.parse import urlparse
import boto3
def parse_args():
parser = argparse.ArgumentParser(description="Read JSONL files from an S3 prefix, extract text, and write to local .md files.")
parser.add_argument(
"--s3-prefix",
default="s3://ai2-oe-data/jakep/pdfworkspaces/pdelfin_testset/results/",
help="S3 prefix containing the JSONL files (default: s3://ai2-oe-data/jakep/pdfworkspaces/pdelfin_testset/results/)",
)
parser.add_argument("--output-dir", default="output_md", help="Local directory to store output .md files (default: output_md)")
return parser.parse_args()
def main():
args = parse_args()
# Parse the s3-prefix into bucket and prefix
parsed_s3 = urlparse(args.s3_prefix)
# e.g. netloc = 'ai2-oe-data', path = '/jakep/pdfworkspaces/pdelfin_testset/results/'
bucket_name = parsed_s3.netloc
# Remove leading '/' from parsed_s3.path
prefix = parsed_s3.path.lstrip("/")
# Ensure local output directory exists
os.makedirs(args.output_dir, exist_ok=True)
# Initialize S3 client
s3 = boto3.client("s3")
# List all objects under the prefix
paginator = s3.get_paginator("list_objects_v2")
pages = paginator.paginate(Bucket=bucket_name, Prefix=prefix)
for page in pages:
if "Contents" not in page:
continue
for obj in page["Contents"]:
key = obj["Key"]
# Skip non-jsonl files
if not key.endswith(".jsonl"):
continue
print(f"Processing S3 object: s3://{bucket_name}/{key}")
# Read the S3 object
s3_object = s3.get_object(Bucket=bucket_name, Key=key)
# s3_object['Body'] is a StreamingBody, so we can read it line-by-line
body_stream = s3_object["Body"].iter_lines()
for line in body_stream:
if not line.strip():
continue
try:
record = json.loads(line)
except json.JSONDecodeError:
print("Warning: Failed to decode JSON line.")
continue
# Extract text
text_content = record.get("text", "")
if not text_content.strip():
# If there's no text, skip
continue
# Derive the output filename based on the "Source-File" metadata
metadata = record.get("metadata", {})
source_file = metadata.get("Source-File", "")
# Example: source_file = 's3://ai2-oe-data/jakep/pdfdata/pdelfin_testset/fcffd2dd327d4e58d3c6d1d22ba62531c863_page8.pdf'
# We want to end up with: 'fcffd2dd327d4e58d3c6d1d22ba62531c863_page8_pdelf.md'
# 1) Extract just the filename from the path
# 2) Remove '.pdf'
# 3) Append '_pdelf.md'
source_filename = os.path.basename(source_file) # e.g. 'fcffd2dd327d4e58d3c6d1d22ba62531c863_page8.pdf'
if source_filename.lower().endswith(".pdf"):
source_filename = source_filename[:-4] # remove .pdf
output_filename = f"{source_filename}_pdelf.md"
output_path = os.path.join(args.output_dir, output_filename)
# Append the text to the corresponding file
# If you want to overwrite instead, change mode to 'w'
with open(output_path, "a", encoding="utf-8") as f:
f.write(text_content + "\n")
# Optional: Print or log what you've written
# print(f"Appended text to {output_path}")
print("Done processing all JSONL files.")
if __name__ == "__main__":
main()
from datetime import datetime
from pathlib import Path
from olmocr.version import VERSION
def main():
changelog = Path("CHANGELOG.md")
with changelog.open() as f:
lines = f.readlines()
insert_index: int = -1
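# Find the '## Unreleased' header; bail out early if an entry for this version already exists.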
for i in range(len(lines)):
line = lines[i]
if line.startswith("## Unreleased"):
insert_index = i + 1
elif line.startswith(f"## [v{VERSION}]"):
print("CHANGELOG already up-to-date")
return
elif line.startswith("## [v"):
break
if insert_index < 0:
raise RuntimeError("Couldn't find 'Unreleased' section")
lines.insert(insert_index, "\n")
lines.insert(
insert_index + 1,
f"## [v{VERSION}](https://github.com/allenai/olmocr/releases/tag/v{VERSION}) - " f"{datetime.now().strftime('%Y-%m-%d')}\n",
)
with changelog.open("w") as f:
f.writelines(lines)
if __name__ == "__main__":
main()
#!/usr/bin/env bash
set -ex
# check if jq is installed
if ! command -v jq &> /dev/null
then
echo "jq could not be found. Please install it."
exit 1
fi
EXTRA_ARGS="-c olmocr/train/config/qwen2vl-2b.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/\${BEAKER_USER_ID}\""
run_name=$(basename "$0" .sh)
# --cluster 'ai2/jupiter*' \
# --cluster 'ai2/pluto*' \
# --cluster 'ai2/allennlp-cirrascale' \
# --priority high \
CLUSTER='jupiter'
gantry run \
--description "${run_name}"\
--task-name "${run_name}"\
--allow-dirty \
--host-networking \
--workspace ai2/oe-data-pdf \
--beaker-image 'jakep/jakep-pdf-finetunev1.2' \
--venv 'base' \
--pip gantry-requirements.txt \
--priority normal \
--gpus 8 \
--preemptible \
--cluster "ai2/${CLUSTER}*" \
--budget ai2/oe-data \
--env LOG_FILTER_TYPE=local_rank0_only \
--env OMP_NUM_THREADS=8 \
--env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \
--env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \
--env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \
--env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \
--shared-memory 10GiB \
--yes \
-- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && accelerate launch --multi_gpu --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --mixed_precision bf16 -m olmocr.train.train ${EXTRA_ARGS}"
#!/usr/bin/env bash
set -ex
# check if jq is installed
if ! command -v jq &> /dev/null
then
echo "jq could not be found. Please install it."
exit 1
fi
EXTRA_ARGS="-c olmocr/train/config/qwen2vl-7b.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/\${BEAKER_USER_ID}\""
run_name=$(basename "$0" .sh)
# --cluster 'ai2/jupiter*' \
# --cluster 'ai2/pluto*' \
# --cluster 'ai2/allennlp-cirrascale' \
# --priority high \
CLUSTER='jupiter'
gantry run \
--description "${run_name}"\
--task-name "${run_name}"\
--allow-dirty \
--host-networking \
--workspace ai2/oe-data-model-based-cleanup \
--beaker-image 'jakep/jakep-pdf-finetunev1.2' \
--venv 'base' \
--pip gantry-requirements.txt \
--priority high \
--gpus 8 \
--preemptible \
--cluster "ai2/${CLUSTER}*" \
--budget ai2/oe-data \
--weka "oe-data-default:/data" \
--env LOG_FILTER_TYPE=local_rank0_only \
--env OMP_NUM_THREADS=8 \
--env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \
--env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \
--env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \
--env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \
--shared-memory 10GiB \
--yes \
-- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && python -m olmocr.train.loaddataset ${EXTRA_ARGS} && accelerate launch --use_fsdp --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --fsdp_offload_params false --fsdp_sharding_strategy FULL_SHARD --fsdp_auto_wrap_policy TRANSFORMER_BASED_WRAP --mixed_precision bf16 -m olmocr.train.train ${EXTRA_ARGS}"
#!/usr/bin/env bash
set -ex
# check if jq is installed
if ! command -v jq &> /dev/null
then
echo "jq could not be found. Please install it."
exit 1
fi
EXTRA_ARGS="-c olmocr/train/config/qwen2vl-7b-lora.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/\${BEAKER_USER_ID}\""
run_name=$(basename "$0" .sh)
# --cluster 'ai2/jupiter*' \
# --cluster 'ai2/pluto*' \
# --cluster 'ai2/allennlp-cirrascale' \
# --priority high \
CLUSTER='jupiter'
gantry run \
--description "${run_name}"\
--task-name "${run_name}"\
--allow-dirty \
--host-networking \
--workspace ai2/oe-data-model-based-cleanup \
--beaker-image 'jakep/jakep-pdf-finetunev1.2' \
--venv 'base' \
--pip gantry-requirements.txt \
--priority high \
--gpus 8 \
--preemptible \
--cluster "ai2/${CLUSTER}*" \
--budget ai2/oe-data \
--weka "oe-data-default:/data" \
--env LOG_FILTER_TYPE=local_rank0_only \
--env OMP_NUM_THREADS=8 \
--env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \
--env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \
--env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \
--env-secret DS_AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \
--env-secret DS_AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \
--env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \
--shared-memory 10GiB \
--yes \
-- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && python -m olmocr.train.loaddataset ${EXTRA_ARGS} && accelerate launch --multi_gpu --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --mixed_precision bf16 -m olmocr.train.train ${EXTRA_ARGS}"
#!/bin/bash
set -e
# Function to extract version components from version.py using regex
get_version_from_file() {
VERSION_FILE="olmocr/version.py"
if [[ ! -f "$VERSION_FILE" ]]; then
echo "Error: $VERSION_FILE does not exist."
exit 1
fi
# Extract _MAJOR
_MAJOR=$(grep -E '^_MAJOR\s*=\s*"([^"]+)"' "$VERSION_FILE" | sed -E 's/_MAJOR\s*=\s*"([^"]+)"/\1/')
if [[ -z "$_MAJOR" ]]; then
echo "Error: Could not extract _MAJOR from $VERSION_FILE."
exit 1
fi
# Extract _MINOR
_MINOR=$(grep -E '^_MINOR\s*=\s*"([^"]+)"' "$VERSION_FILE" | sed -E 's/_MINOR\s*=\s*"([^"]+)"/\1/')
if [[ -z "$_MINOR" ]]; then
echo "Error: Could not extract _MINOR from $VERSION_FILE."
exit 1
fi
# Extract _PATCH
_PATCH=$(grep -E '^_PATCH\s*=\s*"([^"]+)"' "$VERSION_FILE" | sed -E 's/_PATCH\s*=\s*"([^"]+)"/\1/')
if [[ -z "$_PATCH" ]]; then
echo "Error: Could not extract _PATCH from $VERSION_FILE."
exit 1
fi
# Extract _SUFFIX (optional)
_SUFFIX=$(grep -E '^_SUFFIX\s*=\s*"([^"]*)"' "$VERSION_FILE" | sed -E 's/_SUFFIX\s*=\s*"([^"]*)"/\1/')
if [[ -z "$_SUFFIX" ]]; then
_SUFFIX=""
fi
# Construct VERSION
VERSION_PY="${_MAJOR}.${_MINOR}.${_PATCH}${_SUFFIX}"
echo "$VERSION_PY"
}
TAG=$(python -c 'from olmocr.version import VERSION; print("v" + VERSION)')
# Get the VERSION from version.py
VERSION_PY=$(get_version_from_file)
# Compare the two versions
if [[ "v$VERSION_PY" != "$TAG" ]]; then
echo "Version mismatch detected:"
echo " Python reported version: $TAG"
echo " version.py contains: v$VERSION_PY"
echo
read -p "The versions do not match. Please run 'pip install -e .' to synchronize versions. Do you want to continue? [Y/n] " prompt
if [[ ! "$prompt" =~ ^([yY][eE][sS]|[yY])$ ]]; then
echo "Release process aborted due to version mismatch."
exit 1
else
echo "Proceeding with the release despite the version mismatch."
fi
fi
read -p "Creating new release for $TAG. Do you want to continue? [Y/n] " prompt
if [[ $prompt == "y" || $prompt == "Y" || $prompt == "yes" || $prompt == "Yes" ]]; then
python scripts/prepare_changelog.py
git add -A
git commit -m "Bump version to $TAG for release" || true && git push
echo "Creating new git tag $TAG"
git tag "$TAG" -m "$TAG"
git push --tags
else
echo "Cancelled"
exit 1
fi
# encoding: utf-8
"""
Prepares markdown release notes for GitHub releases.
"""
import os
from typing import List, Optional
import packaging.version
TAG = os.environ["TAG"]
ADDED_HEADER = "### Added 🎉"
CHANGED_HEADER = "### Changed ⚠️"
FIXED_HEADER = "### Fixed ✅"
REMOVED_HEADER = "### Removed 👋"
def get_change_log_notes() -> str:
in_current_section = False
current_section_notes: List[str] = []
with open("CHANGELOG.md") as changelog:
for line in changelog:
if line.startswith("## "):
if line.startswith("## Unreleased"):
continue
if line.startswith(f"## [{TAG}]"):
in_current_section = True
continue
break
if in_current_section:
if line.startswith("### Added"):
line = ADDED_HEADER + "\n"
elif line.startswith("### Changed"):
line = CHANGED_HEADER + "\n"
elif line.startswith("### Fixed"):
line = FIXED_HEADER + "\n"
elif line.startswith("### Removed"):
line = REMOVED_HEADER + "\n"
current_section_notes.append(line)
assert current_section_notes
return "## What's new\n\n" + "".join(current_section_notes).strip() + "\n"
def get_commit_history() -> str:
new_version = packaging.version.parse(TAG)
# Pull all tags.
os.popen("git fetch --tags")
# Get all tags sorted by version, latest first.
all_tags = os.popen("git tag -l --sort=-version:refname 'v*'").read().split("\n")
# Out of `all_tags`, find the latest previous version so that we can collect all
# commits between that version and the new version we're about to publish.
# Note that we ignore pre-releases unless the new version is also a pre-release.
last_tag: Optional[str] = None
for tag in all_tags:
if not tag.strip(): # could be blank line
continue
version = packaging.version.parse(tag)
if new_version.pre is None and version.pre is not None:
continue
if version < new_version:
last_tag = tag
break
if last_tag is not None:
commits = os.popen(f"git log {last_tag}..{TAG} --oneline --first-parent").read()
else:
commits = os.popen("git log --oneline --first-parent").read()
return "## Commits\n\n" + commits
def main():
print(get_change_log_notes())
print(get_commit_history())
if __name__ == "__main__":
main()
#!/bin/bash
# Define the output file for the metadata.sha1 fields
OUTPUT_FILE="s2orc_pdfs_v2.txt"
# Clear the output file if it already exists
> "$OUTPUT_FILE"
# Create a temporary directory for partial outputs
temp_output_dir=$(mktemp -d)
# Ensure the temporary directory is cleaned up on exit or error
trap 'rm -rf "$temp_output_dir"' EXIT
# Export the temporary output directory variable for use in xargs
export temp_output_dir
echo "temp dir $temp_output_dir"
# Find all .gz files recursively from the current directory
find 'split=train' -type f -name "*.gz" | \
xargs -P 30 -I{} bash -c '
gz_file="$1"
partial_output="$temp_output_dir/$(basename "$gz_file").txt"
# Stream uncompressed data directly into jq and format the output
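# The quote sequence wrapping the jq program below closes and reopens the outer single-quoted
# bash -c string, so the jq filter itself can stay single-quoted inside it.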
gunzip -c "$gz_file" | jq -r '"'"'
select(.metadata.sha1 != null) |
"s3://ai2-s2-pdfs/" + (.metadata.sha1[:4]) + "/" + (.metadata.sha1[4:]) + ".pdf"
'"'"' >> "$partial_output"
' _ {}
# Concatenate all partial outputs into the final output file
cat "$temp_output_dir"/*.txt >> "$OUTPUT_FILE"
echo "All metadata.sha1 fields have been extracted to $OUTPUT_FILE."