Commit 89e60e48 authored by wanglch

Initial commit
set -ex
export NCCL_DEBUG=INFO NCCL_SOCKET_IFNAME=ib NCCL_IB_HCA="^=mlx5_1,mlx5_2"
"""Benchmark offline inference throughput."""
import argparse
import base64
import json
import random
import time
from io import BytesIO
from typing import List, Optional, Tuple
import torch
import uvloop
from PIL import Image
from tqdm import tqdm
from transformers import (
AutoModelForCausalLM,
AutoProcessor,
AutoTokenizer,
PreTrainedTokenizerBase,
)
from vllm import TokensPrompt
from vllm.engine.arg_utils import DEVICE_OPTIONS, AsyncEngineArgs, EngineArgs
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args,
)
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.sampling_params import BeamSearchParams
from vllm.utils import FlexibleArgumentParser, merge_async_iterators
def sample_requests(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int],
) -> List[Tuple[str, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
# Load the dataset.
with open(dataset_path) as f:
dataset = json.load(f)
# Filter out the conversations with less than 2 turns.
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
# Only keep the first two turns of each conversation.
dataset = [(data["conversations"][0]["value"], data["conversations"][1]["value"]) for data in dataset]
# Shuffle the dataset.
random.shuffle(dataset)
# Filter out sequences that are too long or too short
filtered_dataset: List[Tuple[str, int, int]] = []
for i in range(len(dataset)):
if len(filtered_dataset) == num_requests:
break
# Tokenize the prompts and completions.
prompt = dataset[i][0]
prompt_token_ids = tokenizer(prompt).input_ids
completion = dataset[i][1]
completion_token_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids)
output_len = len(completion_token_ids) if fixed_output_len is None else fixed_output_len
if prompt_len < 4 or output_len < 4:
# Prune too short sequences.
continue
if prompt_len > 1024 or prompt_len + output_len > 2048:
# Prune too long sequences.
continue
filtered_dataset.append((prompt, prompt_len, output_len))
return filtered_dataset
def sample_mm_requests_qwen2vl(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int],
):
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
with open(dataset_path, "r") as f:
json_data = [json.loads(line) for line in f.readlines() if len(line.strip()) > 0]
result = []
for data in tqdm(json_data):
text = processor.apply_chat_template(data["chat_messages"], tokenize=False, add_generation_prompt=True)
raw_b64 = data["chat_messages"][0]["content"][1]["image_url"]["url"]
_main_image = Image.open(BytesIO(base64.b64decode(raw_b64[raw_b64.find(",") + 1 :])))
# Process inputs using processor
inputs = processor(
text=[text],
# images=[_main_image], # Don't pad out the image tokens yet, since that happens later inside of birr
padding=True,
return_tensors="np",
)
# print(inputs)
tokens = inputs["input_ids"][0]
prompt_len = len(tokens)
result.append(
(
TokensPrompt(
dict(
prompt_token_ids=tokens,
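# NOTE: the random tensor below stands in for precomputed vision embeddings rather than a real encoding of the image;
# 1036 = 74 * 56 / 4 patch tokens after the 2x2 spatial merge implied by the grid, and 3584 is assumed to be the Qwen2-VL-7B hidden size.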
multi_modal_data=dict(image=dict(image_embeds=torch.randn(1036, 3584), image_grid_thw=torch.tensor([[1, 74, 56]]))),
# multi_modal_data=dict(image=main_image)
)
),
prompt_len,
fixed_output_len,
)
)
if len(result) >= num_requests:
break
return result
def sample_mm_requests_phi3(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int],
):
processor = AutoProcessor.from_pretrained("microsoft/Phi-3.5-vision-instruct", trust_remote_code=True)
with open(dataset_path, "r") as f:
json_data = [json.loads(line) for line in f.readlines() if len(line.strip()) > 0]
result = []
for data in tqdm(json_data):
inputs = processor.tokenizer.apply_chat_template(
[{"role": "user", "content": "<|image_1|>\n" + data["chat_messages"][0]["content"][0]["text"]}], tokenize=True, add_generation_prompt=True
)
raw_b64 = data["chat_messages"][0]["content"][1]["image_url"]["url"]
main_image = Image.open(BytesIO(base64.b64decode(raw_b64[raw_b64.find(",") + 1 :])))
# tokens = inputs["input_ids"][0]
tokens = inputs
prompt_len = len(tokens)
result.append(
(
TokensPrompt(
dict(
prompt_token_ids=tokens,
multi_modal_data=dict(image=main_image),
)
),
prompt_len,
fixed_output_len,
)
)
if len(result) >= num_requests:
break
return result
def sample_mm_requests_molmo(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int],
):
processor = AutoProcessor.from_pretrained("allenai/Molmo-7B-D-0924", trust_remote_code=True, torch_dtype="auto", device_map="auto")
with open(dataset_path, "r") as f:
json_data = [json.loads(line) for line in f.readlines() if len(line.strip()) > 0]
result = []
for data in tqdm(json_data):
raw_b64 = data["chat_messages"][0]["content"][1]["image_url"]["url"]
main_image = Image.open(BytesIO(base64.b64decode(raw_b64[raw_b64.find(",") + 1 :])))
inputs = processor.process(images=[main_image], text=data["chat_messages"][0]["content"][0]["text"])
# print(inputs)
# Molmo has max size of 4096 which is lower than our dataset was generated for
tokens = inputs["input_ids"][:2000]
# tokens = inputs
prompt_len = len(tokens)
result.append(
(
TokensPrompt(
dict(
prompt_token_ids=tokens,
multi_modal_data=dict(image=main_image),
)
),
prompt_len,
fixed_output_len,
)
)
if len(result) >= num_requests:
break
return result
def run_vllm(
requests: List[Tuple[str, int, int]],
model: str,
tokenizer: str,
quantization: Optional[str],
tensor_parallel_size: int,
seed: int,
n: int,
trust_remote_code: bool,
dtype: str,
max_model_len: Optional[int],
enforce_eager: bool,
kv_cache_dtype: str,
quantization_param_path: Optional[str],
device: str,
enable_prefix_caching: bool,
enable_chunked_prefill: bool,
max_num_batched_tokens: int,
distributed_executor_backend: Optional[str],
gpu_memory_utilization: float = 0.9,
num_scheduler_steps: int = 1,
download_dir: Optional[str] = None,
load_format: str = EngineArgs.load_format,
disable_async_output_proc: bool = False,
) -> float:
from vllm import LLM, SamplingParams
llm = LLM(
model=model,
tokenizer=tokenizer,
quantization=quantization,
tensor_parallel_size=tensor_parallel_size,
seed=seed,
trust_remote_code=trust_remote_code,
dtype=dtype,
# speculative_model="[ngram]",
# num_speculative_tokens=1,
# ngram_prompt_lookup_max=5,
max_model_len=max_model_len,
gpu_memory_utilization=gpu_memory_utilization,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
quantization_param_path=quantization_param_path,
device=device,
enable_prefix_caching=enable_prefix_caching,
download_dir=download_dir,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
load_format=load_format,
num_scheduler_steps=num_scheduler_steps,
disable_async_output_proc=disable_async_output_proc,
)
# Add the requests to the engine.
prompts: List[str] = []
sampling_params: List[SamplingParams] = []
for prompt, _, output_len in requests:
prompts.append(prompt)
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
max_tokens=output_len,
)
)
use_beam_search = False
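# Hard-coded to plain sampling; the beam-search branch below is kept for reference but not exercised.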
if not use_beam_search:
start = time.perf_counter()
llm.generate(prompts, sampling_params, use_tqdm=True)
end = time.perf_counter()
else:
prompts = [prompt for prompt, _, _ in requests]
# output_len should be the same for all requests.
output_len = requests[0][2]
for prompt, input_len, _output_len in requests:
assert _output_len == output_len
start = time.perf_counter()
llm.beam_search(
prompts,
BeamSearchParams(
beam_width=n,
max_tokens=output_len,
ignore_eos=True,
),
)
end = time.perf_counter()
return end - start
async def run_vllm_async(
requests: List[Tuple[str, int, int]],
model: str,
tokenizer: str,
quantization: Optional[str],
tensor_parallel_size: int,
seed: int,
n: int,
trust_remote_code: bool,
dtype: str,
max_model_len: Optional[int],
enforce_eager: bool,
kv_cache_dtype: str,
quantization_param_path: Optional[str],
device: str,
enable_prefix_caching: bool,
enable_chunked_prefill: bool,
max_num_batched_tokens: int,
distributed_executor_backend: Optional[str],
gpu_memory_utilization: float = 0.9,
num_scheduler_steps: int = 1,
download_dir: Optional[str] = None,
load_format: str = EngineArgs.load_format,
disable_async_output_proc: bool = False,
disable_frontend_multiprocessing: bool = False,
) -> float:
from vllm import SamplingParams
engine_args = AsyncEngineArgs(
model=model,
tokenizer=tokenizer,
quantization=quantization,
tensor_parallel_size=tensor_parallel_size,
seed=seed,
trust_remote_code=trust_remote_code,
dtype=dtype,
max_model_len=max_model_len,
gpu_memory_utilization=gpu_memory_utilization,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
quantization_param_path=quantization_param_path,
device=device,
enable_prefix_caching=enable_prefix_caching,
download_dir=download_dir,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
load_format=load_format,
num_scheduler_steps=num_scheduler_steps,
disable_async_output_proc=disable_async_output_proc,
worker_use_ray=False,
disable_log_requests=True,
)
async with build_async_engine_client_from_engine_args(engine_args, disable_frontend_multiprocessing) as llm:
# Add the requests to the engine.
prompts: List[str] = []
sampling_params: List[SamplingParams] = []
for prompt, _, output_len in requests:
prompts.append(prompt)
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
max_tokens=output_len,
)
)
generators = []
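# Submit every request up front, then drain the merged output streams to completion before stopping the timer.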
start = time.perf_counter()
for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
generator = llm.generate(prompt, sp, request_id=f"test{i}")
generators.append(generator)
all_gens = merge_async_iterators(*generators)
async for i, res in all_gens:
pass
end = time.perf_counter()
return end - start
def run_hf(
requests: List[Tuple[str, int, int]],
model: str,
tokenizer: PreTrainedTokenizerBase,
n: int,
max_batch_size: int,
trust_remote_code: bool,
) -> float:
llm = AutoModelForCausalLM.from_pretrained(model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
if llm.config.model_type == "llama":
# To enable padding in the HF backend.
tokenizer.pad_token = tokenizer.eos_token
llm = llm.cuda()
pbar = tqdm(total=len(requests))
start = time.perf_counter()
batch: List[str] = []
max_prompt_len = 0
max_output_len = 0
for i in range(len(requests)):
prompt, prompt_len, output_len = requests[i]
# Add the prompt to the batch.
batch.append(prompt)
max_prompt_len = max(max_prompt_len, prompt_len)
max_output_len = max(max_output_len, output_len)
if len(batch) < max_batch_size and i != len(requests) - 1:
# Check if we can add more requests to the batch.
_, next_prompt_len, next_output_len = requests[i + 1]
if (max(max_prompt_len, next_prompt_len) + max(max_output_len, next_output_len)) <= 2048:
# We can add more requests to the batch.
continue
# Generate the sequences.
input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids
llm_outputs = llm.generate(
input_ids=input_ids.cuda(),
do_sample=True,
num_return_sequences=n,
temperature=1.0,
top_p=1.0,
use_cache=True,
max_new_tokens=max_output_len,
)
# Include the decoding time.
tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
pbar.update(len(batch))
# Clear the batch.
batch = []
max_prompt_len = 0
max_output_len = 0
end = time.perf_counter()
return end - start
def run_mii(
requests: List[Tuple[str, int, int]],
model: str,
tensor_parallel_size: int,
output_len: int,
) -> float:
from mii import client, serve
llm = serve(model, tensor_parallel=tensor_parallel_size)
prompts = [prompt for prompt, _, _ in requests]
start = time.perf_counter()
llm.generate(prompts, max_new_tokens=output_len)
end = time.perf_counter()
client = client(model)
client.terminate_server()
return end - start
def main(args: argparse.Namespace):
print(args)
random.seed(args.seed)
# Sample the requests.
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, trust_remote_code=args.trust_remote_code)
if args.dataset is None:
# Synthesize a prompt with the given input length.
prompt = "hi" * (args.input_len - 1)
requests = [(prompt, args.input_len, args.output_len) for _ in range(args.num_prompts)]
else:
# requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
# args.output_len)
requests = sample_mm_requests_qwen2vl(args.dataset, args.num_prompts, tokenizer, args.output_len)
if args.backend == "vllm":
run_args = [
requests,
args.model,
args.tokenizer,
args.quantization,
args.tensor_parallel_size,
args.seed,
args.n,
args.trust_remote_code,
args.dtype,
args.max_model_len,
args.enforce_eager,
args.kv_cache_dtype,
args.quantization_param_path,
args.device,
args.enable_prefix_caching,
args.enable_chunked_prefill,
args.max_num_batched_tokens,
args.distributed_executor_backend,
args.gpu_memory_utilization,
args.num_scheduler_steps,
args.download_dir,
args.load_format,
args.disable_async_output_proc,
]
if args.async_engine:
run_args.append(args.disable_frontend_multiprocessing)
elapsed_time = uvloop.run(run_vllm_async(*run_args))
else:
elapsed_time = run_vllm(*run_args)
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
elapsed_time = run_hf(requests, args.model, tokenizer, args.n, args.hf_max_batch_size, args.trust_remote_code)
elif args.backend == "mii":
elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size, args.output_len)
else:
raise ValueError(f"Unknown backend: {args.backend}")
total_num_tokens = sum(prompt_len + output_len for _, prompt_len, output_len in requests)
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " f"{total_num_tokens / elapsed_time:.2f} tokens/s")
# Output JSON results if specified
if args.output_json:
results = {
"elapsed_time": elapsed_time,
"num_requests": len(requests),
"total_num_tokens": total_num_tokens,
"requests_per_second": len(requests) / elapsed_time,
"tokens_per_second": total_num_tokens / elapsed_time,
}
with open(args.output_json, "w") as f:
json.dump(results, f, indent=4)
if __name__ == "__main__":
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
parser.add_argument("--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm")
parser.add_argument("--dataset", type=str, default=None, help="Path to the dataset.")
parser.add_argument("--input-len", type=int, default=None, help="Input prompt length for each request")
parser.add_argument("--output-len", type=int, default=None, help="Output length for each request. Overrides the " "output length from the dataset.")
parser.add_argument("--model", type=str, default="facebook/opt-125m")
parser.add_argument("--tokenizer", type=str, default=None)
parser.add_argument("--quantization", "-q", choices=[*QUANTIZATION_METHODS, None], default=None)
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
parser.add_argument("--n", type=int, default=1, help="Number of generated sequences per prompt.")
parser.add_argument("--num-prompts", type=int, default=1000, help="Number of prompts to process.")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--hf-max-batch-size", type=int, default=None, help="Maximum batch size for HF backend.")
parser.add_argument("--trust-remote-code", action="store_true", help="trust remote code from huggingface")
parser.add_argument(
"--max-model-len",
type=int,
default=None,
help="Maximum length of a sequence (including prompt and output). " "If None, will be derived from the model.",
)
parser.add_argument(
"--dtype",
type=str,
default="auto",
choices=["auto", "half", "float16", "bfloat16", "float", "float32"],
help="data type for model weights and activations. "
'The "auto" option will use FP16 precision '
"for FP32 and FP16 models, and BF16 precision "
"for BF16 models.",
)
parser.add_argument(
"--gpu-memory-utilization",
type=float,
default=0.9,
help="the fraction of GPU memory to be used for "
"the model executor, which can range from 0 to 1."
"If unspecified, will use the default value of 0.9.",
)
parser.add_argument("--enforce-eager", action="store_true", help="enforce eager execution")
parser.add_argument(
"--kv-cache-dtype",
type=str,
choices=["auto", "fp8", "fp8_e5m2", "fp8_e4m3"],
default="auto",
help='Data type for kv cache storage. If "auto", will use model '
"data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. "
"ROCm (AMD GPU) supports fp8 (=fp8_e4m3)",
)
parser.add_argument(
"--quantization-param-path",
type=str,
default=None,
help="Path to the JSON file containing the KV cache scaling factors. "
"This should generally be supplied, when KV cache dtype is FP8. "
"Otherwise, KV cache scaling factors default to 1.0, which may cause "
"accuracy issues. FP8_E5M2 (without scaling) is only supported on "
"cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is "
"instead supported for common inference criteria.",
)
parser.add_argument("--device", type=str, default="auto", choices=DEVICE_OPTIONS, help="device type for vLLM execution")
parser.add_argument("--num-scheduler-steps", type=int, default=1, help="Maximum number of forward steps per scheduler call.")
parser.add_argument("--enable-prefix-caching", action="store_true", help="Enable automatic prefix caching for vLLM backend.")
parser.add_argument("--enable-chunked-prefill", action="store_true", help="enable chunked prefill for vLLM backend.")
parser.add_argument("--max-num-batched-tokens", type=int, default=None, help="maximum number of batched tokens per " "iteration")
parser.add_argument(
"--download-dir", type=str, default=None, help="directory to download and load the weights, " "default to the default cache dir of huggingface"
)
parser.add_argument("--output-json", type=str, default=None, help="Path to save the throughput results in JSON format.")
parser.add_argument(
"--distributed-executor-backend",
choices=["ray", "mp"],
default=None,
help="Backend to use for distributed serving. When more than 1 GPU "
'is used, will be automatically set to "ray" if installed '
'or "mp" (multiprocessing) otherwise.',
)
parser.add_argument(
"--load-format",
type=str,
default=EngineArgs.load_format,
choices=["auto", "pt", "safetensors", "npcache", "dummy", "tensorizer", "bitsandbytes"],
help="The format of the model weights to load.\n\n"
'* "auto" will try to load the weights in the safetensors format '
"and fall back to the pytorch bin format if safetensors format "
"is not available.\n"
'* "pt" will load the weights in the pytorch bin format.\n'
'* "safetensors" will load the weights in the safetensors format.\n'
'* "npcache" will load the weights in pytorch format and store '
"a numpy cache to speed up the loading.\n"
'* "dummy" will initialize the weights with random values, '
"which is mainly for profiling.\n"
'* "tensorizer" will load the weights using tensorizer from '
"CoreWeave. See the Tensorize vLLM Model script in the Examples"
"section for more information.\n"
'* "bitsandbytes" will load the weights using bitsandbytes '
"quantization.\n",
)
parser.add_argument("--disable-async-output-proc", action="store_true", default=False, help="Disable async output processor for vLLM backend.")
parser.add_argument("--async-engine", action="store_true", default=False, help="Use vLLM async engine rather than LLM class.")
parser.add_argument("--disable-frontend-multiprocessing", action="store_true", default=False, help="Disable decoupled async engine frontend.")
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
if args.dataset is None:
assert args.input_len is not None
assert args.output_len is not None
else:
assert args.input_len is None
if args.backend == "vllm":
if args.hf_max_batch_size is not None:
raise ValueError("HF max batch size is only for HF backend.")
elif args.backend == "hf":
if args.hf_max_batch_size is None:
raise ValueError("HF max batch size is required for HF backend.")
if args.quantization is not None:
raise ValueError("Quantization is only for vLLM backend.")
elif args.backend == "mii":
if args.dtype != "auto":
raise ValueError("dtype must be auto for MII backend.")
if args.n != 1:
raise ValueError("n must be 1 for MII backend.")
if args.quantization is not None:
raise ValueError("Quantization is only for vLLM backend.")
if args.hf_max_batch_size is not None:
raise ValueError("HF max batch size is only for HF backend.")
if args.tokenizer != args.model:
raise ValueError("Tokenizer must be the same as the model for MII " "backend.")
main(args)
model:
# full fine tune
name_or_path: weka://oe-data-default/jakep/Qwen_Qwen2-VL-7B-Instruct-e4ecf8-01JAH8GMWHTJ376S2N7ETXRXH4/best_bf16/
#name_or_path: s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/jakep/Qwen_Qwen2-VL-7B-Instruct-e4ecf8-01JAH8GMWHTJ376S2N7ETXRXH4/checkpoint-9500/bf16/
vlm: true
# necessary to prevent random crashes, until vllm fixes some bugs
num_scheduler_steps: 1
format:
add_generation_prompt: true
generate:
# The model's max context length is 8096, but around 1500 tokens are reserved for the image itself
max_context_length: 6500
temperature: 0.8
top_p: 1.0
drop_long_outputs: false
pipeline:
sqs_queue_name: jake-pdf
num_workers: 3
generation_batch_size: 256
tokenization_batch_size: 64
output_serializer: default
target_bucket: ai2-oe-data
target_object_prefix: [your username]/pdfworkspaces/s2orc_3200k_v2/inference_outputs
allowed_restarts_per_predictor: 10
task:
budget: ai2/oe-data
workspace: ai2/oe-data-model-based-cleanup
name: qwen2vl-schedsteps-bg
replicas: 128
priority: LOW
gpu_count: 1
cluster:
- ai2/jupiter-cirrascale-2
- ai2/saturn-cirrascale
#!/bin/bash
set -e
VERSION=$(python -c 'import olmocr.version; print(olmocr.version.VERSION)')
echo "$VERSION"
docker build --platform linux/amd64 -f ./scripts/beaker/Dockerfile-inference -t olmocr-inference-$VERSION .
beaker image create --workspace ai2/oe-data-pdf --name olmocr-inference-$VERSION olmocr-inference-$VERSION
# Elo rating
Calculates Elo ratings of olmOCR vs. other tools.
## Data
The pairwise judgment data is stored in `ratings.csv` as win/loss counts:
```
MethodA,MethodB,A_wins,B_wins,A_rate(%),B_rate(%)
marker,mineru,53,26,67.1,32.9
mineru,pdelf,22,55,28.6,71.4
gotocr_format,marker,26,45,36.6,63.4
marker,pdelf,31,49,38.8,61.3
gotocr_format,pdelf,29,41,41.4,58.6
gotocr_format,mineru,38,37,50.7,49.3
```
*Note*: `pdelf` is olmOCR.
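Ratings are computed by replaying each recorded win as an individual Elo match. As a minimal sketch of the update rule (mirroring `update_single_match` in `calculate_elo_ratings.py`, with the script's defaults of K = 32 and an initial rating of 1500):

```python
def update_single_match(rating_a, rating_b, actual_score, k_factor=32):
    # Standard Elo update: expected score from the rating gap, then a K-weighted correction.
    expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
    new_a = rating_a + k_factor * (actual_score - expected_a)
    new_b = rating_b + k_factor * ((1 - actual_score) - (1 - expected_a))
    return new_a, new_b

# Two methods start at 1500; a single win for method A moves the pair to (1516.0, 1484.0).
print(update_single_match(1500, 1500, actual_score=1))
```

The reported numbers additionally bootstrap over the rows of `ratings.csv` to obtain confidence intervals and pairwise p-values.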
## Usage
To calculate Elo ratings, run the following command:
```bash
python calculate_elo_ratings.py ratings.csv --num-bootstrap 5000 --num-elo-sims 100 --confidence-level 95 --seed 123
```
It should print something like:
```
Bootstrapped Elo Ratings (95% CI):
--------------------------------------------------
pdelf 1813.0 ± 84.9 [1605.9, 1930.0]
mineru 1545.2 ± 99.7 [1336.7, 1714.1]
marker 1429.1 ± 100.7 [1267.6, 1645.5]
gotocr_format 1212.7 ± 82.0 [1097.3, 1408.3]
Pairwise Significance Tests:
--------------------------------------------------
gotocr_format vs marker Δ = -216.3 [-470.8, 135.0] p = 0.218
gotocr_format vs mineru Δ = -332.5 [-567.5, 19.3] p = 0.051
gotocr_format vs pdelf Δ = -600.3 [-826.1, -344.3] p = 0.000*
marker vs mineru Δ = -116.1 [-365.4, 246.5] p = 0.430
marker vs pdelf Δ = -383.9 [-610.6, -10.9] p = 0.044*
mineru vs pdelf Δ = -267.8 [-517.3, 104.0] p = 0.135
```
which is also saved in `results.txt`.
To generate boxplots of the Elo ratings, run the following command:
```bash
python draw_boxplots.py results.txt boxplots.png
```
which should save boxplots as `boxplots.png`.
"""
Elo ratings for olmOCR vs baselines.
See data at scripts/elo/ratings.csv
MethodA,MethodB,A_wins,B_wins,A_rate(%),B_rate(%)
marker,mineru,53,26,67.1,32.9
mineru,pdelf,22,55,28.6,71.4
gotocr_format,marker,26,45,36.6,63.4
marker,pdelf,31,49,38.8,61.3
gotocr_format,pdelf,29,41,41.4,58.6
gotocr_format,mineru,38,37,50.7,49.3
Invoke via
python calculate_elo_ratings.py ratings.csv --num-bootstrap 5000 --num-elo-sims 100 --confidence-level 95 --seed 123
Output:
Bootstrapped Elo Ratings (95% CI):
--------------------------------------------------
pdelf 1813.0 ± 84.9 [1605.9, 1930.0]
mineru 1545.2 ± 99.7 [1336.7, 1714.1]
marker 1429.1 ± 100.7 [1267.6, 1645.5]
gotocr_format 1212.7 ± 82.0 [1097.3, 1408.3]
Pairwise Significance Tests:
--------------------------------------------------
gotocr_format vs marker Δ = -216.3 [-470.8, 135.0] p = 0.218
gotocr_format vs mineru Δ = -332.5 [-567.5, 19.3] p = 0.051
gotocr_format vs pdelf Δ = -600.3 [-826.1, -344.3] p = 0.000*
marker vs mineru Δ = -116.1 [-365.4, 246.5] p = 0.430
marker vs pdelf Δ = -383.9 [-610.6, -10.9] p = 0.044*
mineru vs pdelf Δ = -267.8 [-517.3, 104.0] p = 0.135
@kylel
"""
import random
from itertools import combinations
import click
import numpy as np
import pandas as pd
from tqdm import tqdm
def calculate_elo(matches_data, all_methods, k_factor=32, initial_rating=1500, n_replications=10, random_state=None):
"""Calculate Elo ratings with multiple replications per dataset"""
all_ratings = {method: [] for method in all_methods}
for _ in range(n_replications):
matches = matches_data.sample(frac=1, replace=False, random_state=random_state).reset_index(drop=True)
ratings = {method: initial_rating for method in all_methods}
for _, row in matches.iterrows():
method_a, method_b = row["MethodA"], row["MethodB"]
a_wins, b_wins = row["A_wins"], row["B_wins"]
for _ in range(int(a_wins)):
ra, rb = update_single_match(ratings[method_a], ratings[method_b], 1, k_factor)
ratings[method_a], ratings[method_b] = ra, rb
for _ in range(int(b_wins)):
ra, rb = update_single_match(ratings[method_a], ratings[method_b], 0, k_factor)
ratings[method_a], ratings[method_b] = ra, rb
for method in all_methods:
all_ratings[method].append(ratings[method])
return {method: np.mean(ratings) for method, ratings in all_ratings.items()}
def update_single_match(rating_a, rating_b, actual_score, k_factor):
"""Update ratings for a single match"""
expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
new_rating_a = rating_a + k_factor * (actual_score - expected_a)
new_rating_b = rating_b + k_factor * ((1 - actual_score) - (1 - expected_a))
return new_rating_a, new_rating_b
def bootstrap_elo_and_tests(df, num_bootstrap=1000, num_elo_sims=10, confidence_level=95, k_factor=32, initial_rating=1500, random_state=None):
"""Calculate bootstrapped Elo ratings with confidence intervals and perform pairwise significance tests"""
ci_lower = (100 - confidence_level) / 2
ci_upper = 100 - ci_lower
all_methods = set(df["MethodA"].unique()) | set(df["MethodB"].unique())
bootstrap_ratings = {method: [] for method in all_methods}
for _ in tqdm(range(num_bootstrap)):
bootstrap_sample = df.sample(n=len(df), replace=True, random_state=random_state)
ratings = calculate_elo(bootstrap_sample, all_methods, k_factor, initial_rating, num_elo_sims)
for method in all_methods:
bootstrap_ratings[method].append(ratings[method])
# Calculate statistics and perform significance tests
results = {}
# Basic statistics
for method in all_methods:
ratings_array = np.array(bootstrap_ratings[method])
results[method] = {
"mean": np.mean(ratings_array),
"std": np.std(ratings_array),
"ci_lower": np.percentile(ratings_array, ci_lower),
"ci_upper": np.percentile(ratings_array, ci_upper),
"bootstrap_samples": ratings_array, # Store for significance testing
}
# Pairwise significance tests
significance_tests = {}
for method1, method2 in combinations(all_methods, 2):
# Calculate difference distribution
diff_distribution = results[method1]["bootstrap_samples"] - results[method2]["bootstrap_samples"]
# Calculate p-value (two-tailed test)
p_value = 2 * min(np.mean(diff_distribution >= 0), np.mean(diff_distribution <= 0))
# Store results
significance_tests[(method1, method2)] = {
"diff_mean": np.mean(diff_distribution),
"diff_ci_lower": np.percentile(diff_distribution, ci_lower),
"diff_ci_upper": np.percentile(diff_distribution, ci_upper),
"p_value": p_value,
}
return results, significance_tests
@click.command()
@click.argument("ratings_file", type=click.Path(exists=True))
@click.option("--num-bootstrap", default=1000, help="Number of bootstrap iterations")
@click.option("--num-elo-sims", default=10, help="Number of ELO simulations per bootstrap")
@click.option("--confidence-level", default=95, help="Confidence level for intervals (in percent)")
@click.option("--seed", default=42, help="Random seed for reproducibility")
def main(ratings_file, num_bootstrap, num_elo_sims, confidence_level, seed):
# Set random seed
random.seed(seed)
np.random.seed(seed)
# Load data
df = pd.read_csv(ratings_file)
# Calculate bootstrapped Elo ratings
results, significance_tests = bootstrap_elo_and_tests(df, num_bootstrap=num_bootstrap, num_elo_sims=num_elo_sims)
# Sort and display results
print(f"\nBootstrapped Elo Ratings ({confidence_level}% CI):")
print("-" * 50)
sorted_results = dict(sorted(results.items(), key=lambda x: x[1]["mean"], reverse=True))
for method, stats in sorted_results.items():
print(f"{method:12} {stats['mean']:6.1f} ± {stats['std']:4.1f} [{stats['ci_lower']:6.1f}, {stats['ci_upper']:6.1f}]")
# Display pairwise significance tests
print("\nPairwise Significance Tests:")
print("-" * 50)
for (method1, method2), stats in significance_tests.items():
sig_marker = "*" if stats["p_value"] < (1 - confidence_level / 100) else " "
print(
f"{method1:12} vs {method2:12} Δ = {stats['diff_mean']:6.1f} "
+ f"[{stats['diff_ci_lower']:6.1f}, {stats['diff_ci_upper']:6.1f}] "
+ f"p = {stats['p_value']:.3f}{sig_marker}"
)
if __name__ == "__main__":
main()
"""
Boxplots of Elo ratings with 95% confidence intervals for each method.
Invocation:
python draw_boxplots.py results.txt boxplots.png
@kylel
"""
import hashlib
import re
from pathlib import Path
import click
import matplotlib.font_manager as font_manager
import matplotlib.pyplot as plt
import numpy as np
import requests
# AI2 Colors
AI2_PINK = "#f0529c"
AI2_DARK_TEAL = "#0a3235"
AI2_TEAL = "#105257"
# Name mappings
NAME_DISPLAY_MAP = {"pdelf": "olmOCR", "mineru": "MinerU", "marker": "Marker", "gotocr_format": "GOTOCR"}
def download_and_cache_file(url, cache_dir=None):
"""Download a file and cache it locally."""
if cache_dir is None:
cache_dir = Path.home() / ".cache" / "elo_plot"
cache_dir = Path(cache_dir)
cache_dir.mkdir(parents=True, exist_ok=True)
# Create filename from URL hash
url_hash = hashlib.sha256(url.encode()).hexdigest()[:12]
file_name = url.split("/")[-1]
cached_path = cache_dir / f"{url_hash}_{file_name}"
if not cached_path.exists():
response = requests.get(url, stream=True)
response.raise_for_status()
with open(cached_path, "wb") as f:
for chunk in response.iter_content(chunk_size=8192):
f.write(chunk)
return str(cached_path)
def parse_elo_data(file_path):
"""Parse Elo ratings data from a text file."""
with open(file_path, "r") as f:
content = f.read()
# Regular expression to match the data lines
pattern = r"(\w+)\s+(\d+\.\d+)\s*±\s*(\d+\.\d+)\s*\[(\d+\.\d+),\s*(\d+\.\d+)\]"
matches = re.finditer(pattern, content)
# Initialize lists to store data
names = []
medians = []
errors = []
ci_low = []
ci_high = []
for match in matches:
names.append(match.group(1))
medians.append(float(match.group(2)))
errors.append(float(match.group(3)))
ci_low.append(float(match.group(4)))
ci_high.append(float(match.group(5)))
return names, medians, errors, ci_low, ci_high
def create_boxplot(names, medians, errors, ci_low, ci_high, output_path, font_path):
"""Create and save a boxplot of Elo ratings."""
# Set up Manrope font
font_manager.fontManager.addfont(font_path)
plt.rcParams["font.family"] = "Manrope"
plt.rcParams["font.weight"] = "medium"
# Define colors - pdelf in pink, others in shades of teal/grey based on performance
max_median = max(medians)
colors = []
for i, median in enumerate(medians):
if names[i] == "pdelf":
colors.append(AI2_PINK)
else:
# Calculate a shade between dark teal and grey based on performance
performance_ratio = (median - min(medians)) / (max_median - min(medians))
base_color = np.array(tuple(int(AI2_DARK_TEAL[i : i + 2], 16) for i in (1, 3, 5))) / 255.0
grey = np.array([0.7, 0.7, 0.7]) # Light grey
color = tuple(np.clip(base_color * performance_ratio + grey * (1 - performance_ratio), 0, 1))
colors.append(color)
# Create box plot data
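# Each method gets five summary points (CI low, mean - std, mean, mean + std, CI high);
# plt.boxplot then derives its box and whiskers from these five values.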
box_data = []
for i in range(len(names)):
q1 = medians[i] - errors[i]
q3 = medians[i] + errors[i]
box_data.append([ci_low[i], q1, medians[i], q3, ci_high[i]])
# Create box plot with smaller width and spacing
plt.figure(figsize=(4, 4))
bp = plt.boxplot(
box_data,
labels=[NAME_DISPLAY_MAP[name] for name in names],
whis=1.5,
patch_artist=True,
widths=0.15, # Make boxes much narrower
medianprops=dict(color="black"), # Make median line black
positions=np.arange(len(names)) * 0.25,
) # Reduce spacing between boxes significantly
# Color each box
for patch, color in zip(bp["boxes"], colors):
patch.set_facecolor(color)
patch.set_alpha(0.8)
# Style the plot
# plt.ylabel("Elo Rating", fontsize=12, color=AI2_DARK_TEAL)
plt.xticks(
np.arange(len(names)) * 0.25, # Match positions from boxplot
[NAME_DISPLAY_MAP[name] for name in names],
rotation=45,
ha="right",
color=AI2_DARK_TEAL,
)
plt.yticks(color=AI2_DARK_TEAL)
# Set x-axis limits to maintain proper spacing
plt.xlim(-0.1, (len(names) - 1) * 0.25 + 0.1)
# Remove the title and adjust the layout
plt.tight_layout()
# Remove spines
for spine in plt.gca().spines.values():
spine.set_visible(False)
# Add left spine only
plt.gca().spines["left"].set_visible(True)
plt.gca().spines["left"].set_color(AI2_DARK_TEAL)
plt.gca().spines["left"].set_linewidth(0.5)
# Add bottom spine only
plt.gca().spines["bottom"].set_visible(True)
plt.gca().spines["bottom"].set_color(AI2_DARK_TEAL)
plt.gca().spines["bottom"].set_linewidth(0.5)
plt.savefig(output_path, dpi=300, bbox_inches="tight", transparent=True)
plt.close()
@click.command()
@click.argument("input_file", type=click.Path(exists=True))
@click.argument("output_file", type=click.Path())
@click.option(
"--manrope-medium-font-path",
type=str,
default="https://dolma-artifacts.org/Manrope-Medium.ttf",
help="Path to the Manrope Medium font file (local path or URL)",
)
def main(input_file, output_file, manrope_medium_font_path):
"""Generate a boxplot from Elo ratings data.
INPUT_FILE: Path to the text file containing Elo ratings data
OUTPUT_FILE: Path where the plot should be saved
"""
try:
# Handle font path - download and cache if it's a URL
if manrope_medium_font_path.startswith(("http://", "https://")):
font_path = download_and_cache_file(manrope_medium_font_path)
else:
font_path = manrope_medium_font_path
# Parse the data
names, medians, errors, ci_low, ci_high = parse_elo_data(input_file)
# Create and save the plot
create_boxplot(names, medians, errors, ci_low, ci_high, output_file, font_path)
click.echo(f"Plot successfully saved to {output_file}")
except Exception as e:
click.echo(f"Error: {str(e)}", err=True)
raise click.Abort()
if __name__ == "__main__":
main()
MethodA,MethodB,A_wins,B_wins,A_rate(%),B_rate(%)
marker,mineru,53,26,67.1,32.9
mineru,pdelf,22,55,28.6,71.4
gotocr_format,marker,26,45,36.6,63.4
marker,pdelf,31,49,38.8,61.3
gotocr_format,pdelf,29,41,41.4,58.6
gotocr_format,mineru,38,37,50.7,49.3
Bootstrapped Elo Ratings (95% CI):
--------------------------------------------------
pdelf 1813.0 ± 84.9 [1605.9, 1930.0]
mineru 1545.2 ± 99.7 [1336.7, 1714.1]
marker 1429.1 ± 100.7 [1267.6, 1645.5]
gotocr_format 1212.7 ± 82.0 [1097.3, 1408.3]
Pairwise Significance Tests:
--------------------------------------------------
gotocr_format vs marker Δ = -216.3 [-470.8, 135.0] p = 0.218
gotocr_format vs mineru Δ = -332.5 [-567.5, 19.3] p = 0.051
gotocr_format vs pdelf Δ = -600.3 [-826.1, -344.3] p = 0.000*
marker vs mineru Δ = -116.1 [-365.4, 246.5] p = 0.430
marker vs pdelf Δ = -383.9 [-610.6, -10.9] p = 0.044*
mineru vs pdelf Δ = -267.8 [-517.3, 104.0] p = 0.135
#!/usr/bin/env python3
import argparse
import json
import random
import re
import time
import boto3
import requests
from tqdm import tqdm
from transformers import AutoTokenizer
# Allowed characters: alphanumeric, space, and basic punctuation ".,!?()"
ALLOWED_RE = re.compile(r"^[A-Za-z0-9\.,!?() ]+$")
def get_random_line_from_s3(bucket, key):
"""
Reads an S3 object line-by-line and returns a random line using reservoir sampling.
"""
s3 = boto3.client("s3")
response = s3.get_object(Bucket=bucket, Key=key)
random_line = None
count = 0
for line in response["Body"].iter_lines():
if not line:
continue
line_str = line.decode("utf-8")
count += 1
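# Reservoir sampling: keep the current line with probability 1/count, so every
# line in the object is equally likely to be returned.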
if random.randint(1, count) == 1:
random_line = line_str
return random_line
def query_infinigram(ngram, index="v4_rpj_llama_s4", retries=3):
"""
Sends a count query to the infini-gram API for the given n-gram.
Retries a few times in case of network issues.
"""
url = "https://api.infini-gram.io/"
payload = {
"index": index,
"query_type": "count",
"query": ngram,
}
for i in range(retries):
try:
response = requests.post(url, json=payload, timeout=10)
if response.status_code == 200:
result = response.json()
if "count" in result:
return result["count"]
except Exception: # type: ignore
time.sleep(1)
return 0
def process_document(doc, tokenizer, ngram_size, num_samples, index="v4_rpj_llama_s4"):
"""
Tokenizes the document using the Llama2 tokenizer and samples random n-grams.
Each n-gram is chosen such that:
1. It starts on a word-split boundary (using the offset mapping and a check on the preceding character).
2. Its decoded string contains only alphanumeric characters, spaces, and the punctuation marks ".,!?()".
Each valid n-gram is then queried using the infini-gram API.
The function returns the document id, the number of matching n-grams (i.e. API count > 0),
the total number of valid n-grams sampled, and a list of tuples (flag, ngram_string).
"""
text = doc.get("text", "")
doc_id = doc.get("id", "Unknown")
# Get tokenized representation with offset mapping to determine word boundaries.
tokenized = tokenizer(text, add_special_tokens=False, return_offsets_mapping=True)
token_ids = tokenized["input_ids"]
# offsets = tokenized["offset_mapping"]
if len(token_ids) < ngram_size:
return doc_id, 0, 0, []
# Determine valid starting indices based on word-split boundaries.
valid_positions = []
# for i in range(len(token_ids) - ngram_size + 1):
# start_offset = offsets[i][0]
# if start_offset == 0 or (start_offset > 0 and text[start_offset - 1] == " "):
# valid_positions.append(i)
if not valid_positions:
# Fallback: if no valid positions are found, use all possible positions.
valid_positions = list(range(len(token_ids) - ngram_size + 1))
valid_ngram_details = []
attempts = 0
max_attempts = num_samples * 10 # Limit to prevent infinite loops.
while len(valid_ngram_details) < num_samples and attempts < max_attempts:
idx = random.choice(valid_positions)
ngram_token_ids = token_ids[idx : idx + ngram_size]
ngram_str = tokenizer.decode(ngram_token_ids, clean_up_tokenization_spaces=True)
# Only accept n-grams that contain only allowed characters.
if ALLOWED_RE.fullmatch(ngram_str) and len(ngram_str.strip()) > ngram_size * 3:
count = query_infinigram(ngram_str, index=index)
flag = "YES" if count > 0 else "NO"
valid_ngram_details.append((flag, ngram_str))
attempts += 1
match_count = sum(1 for flag, _ in valid_ngram_details if flag == "YES")
sample_count = len(valid_ngram_details)
return doc_id, match_count, sample_count, valid_ngram_details
def main():
parser = argparse.ArgumentParser(description="Infini-gram n-gram matching script with Llama2 tokenization.")
parser.add_argument("N", type=int, help="Number of random .jsonl files to process")
parser.add_argument("s3_path", type=str, help="S3 path to a prefix containing .jsonl files (e.g., s3://my-bucket/my-prefix/)")
parser.add_argument("--index", type=str, default="v4_dolma-v1_7_llama", help="Infini-gram index to use (default: v4_rpj_llama_s4)")
parser.add_argument("--ngram_size", type=int, default=10, help="Size of the n-gram to sample (default: 10)")
parser.add_argument("--num_ngrams", type=int, default=100, help="Number of random n-grams to sample from each document (default: 100)")
args = parser.parse_args()
if not args.s3_path.startswith("s3://"):
print("Error: s3_path must start with 's3://'")
return
path_without_scheme = args.s3_path[5:]
parts = path_without_scheme.split("/", 1)
bucket = parts[0]
prefix = parts[1] if len(parts) > 1 else ""
print("Listing .jsonl files from S3...")
s3 = boto3.client("s3")
response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
files = [obj["Key"] for obj in response.get("Contents", []) if obj["Key"].endswith(".jsonl")]
if not files:
print("No .jsonl files found in the given prefix.")
return
if args.N > len(files):
print(f"Requested {args.N} files, but only found {len(files)}. Processing all available files.")
args.N = len(files)
random_files = random.sample(files, args.N)
print("Loading Llama2 tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
total_matches = 0
total_ngrams_sampled = 0
for key in tqdm(random_files, desc="Processing files"):
line = get_random_line_from_s3(bucket, key)
if not line:
print(f"Skipping {key}: No valid lines found.")
continue
try:
doc = json.loads(line)
except Exception as e:
print(f"Error parsing JSON in {key}: {e}")
continue
doc_id, match_count, sample_count, details = process_document(doc, tokenizer, args.ngram_size, args.num_ngrams, index=args.index)
# Print per-document n-gram summary
print(f"\nDocument ID: {doc_id}")
for flag, ngram in details:
# Print the flag in a fixed-width field (4 characters) followed by the n-gram representation.
print(f"{flag:4} {repr(ngram)}")
percentage = (match_count / sample_count * 100) if sample_count else 0
print(f"Matched n-grams: {match_count}/{sample_count} ({percentage:.2f}%)")
total_matches += match_count
total_ngrams_sampled += sample_count
overall_percentage = (total_matches / total_ngrams_sampled * 100) if total_ngrams_sampled else 0
print(f"\nTotal matched n-grams: {total_matches}/{total_ngrams_sampled} ({overall_percentage:.2f}%)")
if __name__ == "__main__":
main()
#!/usr/bin/env bash
set -ex
# check if jq is installed
if ! command -v jq &> /dev/null
then
echo "jq could not be found. Please install it."
exit 1
fi
EXTRA_ARGS="-c olmocr/train/config/molmo-o-lora-8192.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/molmo-pdf/v1/models/\${BEAKER_USER_ID}\""
run_name=$(basename "$0" .sh)
# --cluster 'ai2/jupiter*' \
# --cluster 'ai2/pluto*' \
# --cluster 'ai2/allennlp-cirrascale' \
# --priority high \
CLUSTER='jupiter'
gantry run \
--description "${run_name}-8192"\
--task-name "${run_name}-8192"\
--allow-dirty \
--host-networking \
--workspace ai2/oe-data-model-based-cleanup \
--beaker-image 'jakep/jakep-pdf-finetunev1.2' \
--venv 'base' \
--pip gantry-requirements.txt \
--priority high \
--gpus 8 \
--cluster "ai2/${CLUSTER}*" \
--budget ai2/oe-data \
--weka "oe-data-default:/data" \
--env LOG_FILTER_TYPE=local_rank0_only \
--env OMP_NUM_THREADS=8 \
--env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \
--env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \
--env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \
--env-secret DS_AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \
--env-secret DS_AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \
--env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \
--shared-memory 10GiB \
--yes \
-- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && python -m olmocr.train.loaddataset ${EXTRA_ARGS} && accelerate launch --multi_gpu --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --mixed_precision bf16 -m olmocr.train.train ${EXTRA_ARGS}"
#!/usr/bin/env python3
import argparse
import json
import os
from urllib.parse import urlparse
import boto3
def parse_args():
parser = argparse.ArgumentParser(description="Read JSONL files from an S3 prefix, extract text, and write to local .md files.")
parser.add_argument(
"--s3-prefix",
default="s3://ai2-oe-data/jakep/pdfworkspaces/pdelfin_testset/results/",
help="S3 prefix containing the JSONL files (default: s3://ai2-oe-data/jakep/pdfworkspaces/pdelfin_testset/results/)",
)
parser.add_argument("--output-dir", default="output_md", help="Local directory to store output .md files (default: output_md)")
return parser.parse_args()
def main():
args = parse_args()
# Parse the s3-prefix into bucket and prefix
parsed_s3 = urlparse(args.s3_prefix)
# e.g. netloc = 'ai2-oe-data', path = '/jakep/pdfworkspaces/pdelfin_testset/results/'
bucket_name = parsed_s3.netloc
# Remove leading '/' from parsed_s3.path
prefix = parsed_s3.path.lstrip("/")
# Ensure local output directory exists
os.makedirs(args.output_dir, exist_ok=True)
# Initialize S3 client
s3 = boto3.client("s3")
# List all objects under the prefix
paginator = s3.get_paginator("list_objects_v2")
pages = paginator.paginate(Bucket=bucket_name, Prefix=prefix)
for page in pages:
if "Contents" not in page:
continue
for obj in page["Contents"]:
key = obj["Key"]
# Skip non-jsonl files
if not key.endswith(".jsonl"):
continue
print(f"Processing S3 object: s3://{bucket_name}/{key}")
# Read the S3 object
s3_object = s3.get_object(Bucket=bucket_name, Key=key)
# s3_object['Body'] is a StreamingBody, so we can read it line-by-line
body_stream = s3_object["Body"].iter_lines()
for line in body_stream:
if not line.strip():
continue
try:
record = json.loads(line)
except json.JSONDecodeError:
print("Warning: Failed to decode JSON line.")
continue
# Extract text
text_content = record.get("text", "")
if not text_content.strip():
# If there's no text, skip
continue
# Derive the output filename based on the "Source-File" metadata
metadata = record.get("metadata", {})
source_file = metadata.get("Source-File", "")
# Example: source_file = 's3://ai2-oe-data/jakep/pdfdata/pdelfin_testset/fcffd2dd327d4e58d3c6d1d22ba62531c863_page8.pdf'
# We want to end up with: 'fcffd2dd327d4e58d3c6d1d22ba62531c863_page8_pdelf.md'
# 1) Extract just the filename from the path
# 2) Remove '.pdf'
# 3) Append '_pdelf.md'
source_filename = os.path.basename(source_file) # e.g. 'fcffd2dd327d4e58d3c6d1d22ba62531c863_page8.pdf'
if source_filename.lower().endswith(".pdf"):
source_filename = source_filename[:-4] # remove .pdf
output_filename = f"{source_filename}_pdelf.md"
output_path = os.path.join(args.output_dir, output_filename)
# Append the text to the corresponding file
# If you want to overwrite instead, change mode to 'w'
with open(output_path, "a", encoding="utf-8") as f:
f.write(text_content + "\n")
# Optional: Print or log what you've written
# print(f"Appended text to {output_path}")
print("Done processing all JSONL files.")
if __name__ == "__main__":
main()
from datetime import datetime
from pathlib import Path
from olmocr.version import VERSION
def main():
changelog = Path("CHANGELOG.md")
with changelog.open() as f:
lines = f.readlines()
insert_index: int = -1
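# Find the '## Unreleased' header; bail out early if an entry for this version already exists.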
for i in range(len(lines)):
line = lines[i]
if line.startswith("## Unreleased"):
insert_index = i + 1
elif line.startswith(f"## [v{VERSION}]"):
print("CHANGELOG already up-to-date")
return
elif line.startswith("## [v"):
break
if insert_index < 0:
raise RuntimeError("Couldn't find 'Unreleased' section")
lines.insert(insert_index, "\n")
lines.insert(
insert_index + 1,
f"## [v{VERSION}](https://github.com/allenai/olmocr/releases/tag/v{VERSION}) - " f"{datetime.now().strftime('%Y-%m-%d')}\n",
)
with changelog.open("w") as f:
f.writelines(lines)
if __name__ == "__main__":
main()
#!/usr/bin/env bash
set -ex
# check if jq is installed
if ! command -v jq &> /dev/null
then
echo "jq could not be found. Please install it."
exit 1
fi
EXTRA_ARGS="-c olmocr/train/config/qwen2vl-2b.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/\${BEAKER_USER_ID}\""
run_name=$(basename "$0" .sh)
# --cluster 'ai2/jupiter*' \
# --cluster 'ai2/pluto*' \
# --cluster 'ai2/allennlp-cirrascale' \
# --priority high \
CLUSTER='jupiter'
gantry run \
--description "${run_name}"\
--task-name "${run_name}"\
--allow-dirty \
--host-networking \
--workspace ai2/oe-data-pdf \
--beaker-image 'jakep/jakep-pdf-finetunev1.2' \
--venv 'base' \
--pip gantry-requirements.txt \
--priority normal \
--gpus 8 \
--preemptible \
--cluster "ai2/${CLUSTER}*" \
--budget ai2/oe-data \
--env LOG_FILTER_TYPE=local_rank0_only \
--env OMP_NUM_THREADS=8 \
--env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \
--env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \
--env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \
--env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \
--shared-memory 10GiB \
--yes \
-- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && accelerate launch --multi_gpu --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --mixed_precision bf16 -m olmocr.train.train ${EXTRA_ARGS}"
#!/usr/bin/env bash
set -ex
# check if jq is installed
if ! command -v jq &> /dev/null
then
echo "jq could not be found. Please install it."
exit 1
fi
EXTRA_ARGS="-c olmocr/train/config/qwen2vl-7b.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/\${BEAKER_USER_ID}\""
run_name=$(basename "$0" .sh)
# --cluster 'ai2/jupiter*' \
# --cluster 'ai2/pluto*' \
# --cluster 'ai2/allennlp-cirrascale' \
# --priority high \
CLUSTER='jupiter'
gantry run \
--description "${run_name}"\
--task-name "${run_name}"\
--allow-dirty \
--host-networking \
--workspace ai2/oe-data-model-based-cleanup \
--beaker-image 'jakep/jakep-pdf-finetunev1.2' \
--venv 'base' \
--pip gantry-requirements.txt \
--priority high \
--gpus 8 \
--preemptible \
--cluster "ai2/${CLUSTER}*" \
--budget ai2/oe-data \
--weka "oe-data-default:/data" \
--env LOG_FILTER_TYPE=local_rank0_only \
--env OMP_NUM_THREADS=8 \
--env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \
--env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \
--env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \
--env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \
--shared-memory 10GiB \
--yes \
-- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && python -m olmocr.train.loaddataset ${EXTRA_ARGS} && accelerate launch --use_fsdp --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --fsdp_offload_params false --fsdp_sharding_strategy FULL_SHARD --fsdp_auto_wrap_policy TRANSFORMER_BASED_WRAP --mixed_precision bf16 -m olmocr.train.train ${EXTRA_ARGS}"
#!/usr/bin/env bash
set -ex
# check if jq is installed
if ! command -v jq &> /dev/null
then
echo "jq could not be found. Please install it."
exit 1
fi
EXTRA_ARGS="-c olmocr/train/config/qwen2vl-7b-lora.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/\${BEAKER_USER_ID}\""
run_name=$(basename "$0" .sh)
# --cluster 'ai2/jupiter*' \
# --cluster 'ai2/pluto*' \
# --cluster 'ai2/allennlp-cirrascale' \
# --priority high \
CLUSTER='jupiter'
gantry run \
--description "${run_name}"\
--task-name "${run_name}"\
--allow-dirty \
--host-networking \
--workspace ai2/oe-data-model-based-cleanup \
--beaker-image 'jakep/jakep-pdf-finetunev1.2' \
--venv 'base' \
--pip gantry-requirements.txt \
--priority high \
--gpus 8 \
--preemptible \
--cluster "ai2/${CLUSTER}*" \
--budget ai2/oe-data \
--weka "oe-data-default:/data" \
--env LOG_FILTER_TYPE=local_rank0_only \
--env OMP_NUM_THREADS=8 \
--env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \
--env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \
--env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \
--env-secret DS_AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \
--env-secret DS_AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \
--env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \
--shared-memory 10GiB \
--yes \
-- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && python -m olmocr.train.loaddataset ${EXTRA_ARGS} && accelerate launch --multi_gpu --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --mixed_precision bf16 -m olmocr.train.train ${EXTRA_ARGS}"
#!/bin/bash
set -e
# Function to extract version components from version.py using regex
get_version_from_file() {
VERSION_FILE="olmocr/version.py"
if [[ ! -f "$VERSION_FILE" ]]; then
echo "Error: $VERSION_FILE does not exist."
exit 1
fi
# Extract _MAJOR
_MAJOR=$(grep -E '^_MAJOR\s*=\s*"([^"]+)"' "$VERSION_FILE" | sed -E 's/_MAJOR\s*=\s*"([^"]+)"/\1/')
if [[ -z "$_MAJOR" ]]; then
echo "Error: Could not extract _MAJOR from $VERSION_FILE."
exit 1
fi
# Extract _MINOR
_MINOR=$(grep -E '^_MINOR\s*=\s*"([^"]+)"' "$VERSION_FILE" | sed -E 's/_MINOR\s*=\s*"([^"]+)"/\1/')
if [[ -z "$_MINOR" ]]; then
echo "Error: Could not extract _MINOR from $VERSION_FILE."
exit 1
fi
# Extract _PATCH
_PATCH=$(grep -E '^_PATCH\s*=\s*"([^"]+)"' "$VERSION_FILE" | sed -E 's/_PATCH\s*=\s*"([^"]+)"/\1/')
if [[ -z "$_PATCH" ]]; then
echo "Error: Could not extract _PATCH from $VERSION_FILE."
exit 1
fi
# Extract _SUFFIX (optional)
_SUFFIX=$(grep -E '^_SUFFIX\s*=\s*"([^"]*)"' "$VERSION_FILE" | sed -E 's/_SUFFIX\s*=\s*"([^"]*)"/\1/')
if [[ -z "$_SUFFIX" ]]; then
_SUFFIX=""
fi
# Construct VERSION
VERSION_PY="${_MAJOR}.${_MINOR}.${_PATCH}${_SUFFIX}"
echo "$VERSION_PY"
}
TAG=$(python -c 'from olmocr.version import VERSION; print("v" + VERSION)')
# Get the VERSION from version.py
VERSION_PY=$(get_version_from_file)
# Compare the two versions
if [[ "v$VERSION_PY" != "$TAG" ]]; then
echo "Version mismatch detected:"
echo " Python reported version: $TAG"
echo " version.py contains: v$VERSION_PY"
echo
read -p "The versions do not match. Please run 'pip install -e .' to synchronize versions. Do you want to continue? [Y/n] " prompt
if [[ ! "$prompt" =~ ^([yY][eE][sS]|[yY])$ ]]; then
echo "Release process aborted due to version mismatch."
exit 1
else
echo "Proceeding with the release despite the version mismatch."
fi
fi
read -p "Creating new release for $TAG. Do you want to continue? [Y/n] " prompt
if [[ $prompt == "y" || $prompt == "Y" || $prompt == "yes" || $prompt == "Yes" ]]; then
python scripts/prepare_changelog.py
git add -A
git commit -m "Bump version to $TAG for release" || true && git push
echo "Creating new git tag $TAG"
git tag "$TAG" -m "$TAG"
git push --tags
else
echo "Cancelled"
exit 1
fi
# encoding: utf-8
"""
Prepares markdown release notes for GitHub releases.
"""
import os
from typing import List, Optional
import packaging.version
TAG = os.environ["TAG"]
ADDED_HEADER = "### Added 🎉"
CHANGED_HEADER = "### Changed ⚠️"
FIXED_HEADER = "### Fixed ✅"
REMOVED_HEADER = "### Removed 👋"
def get_change_log_notes() -> str:
in_current_section = False
current_section_notes: List[str] = []
with open("CHANGELOG.md") as changelog:
for line in changelog:
if line.startswith("## "):
if line.startswith("## Unreleased"):
continue
if line.startswith(f"## [{TAG}]"):
in_current_section = True
continue
break
if in_current_section:
if line.startswith("### Added"):
line = ADDED_HEADER + "\n"
elif line.startswith("### Changed"):
line = CHANGED_HEADER + "\n"
elif line.startswith("### Fixed"):
line = FIXED_HEADER + "\n"
elif line.startswith("### Removed"):
line = REMOVED_HEADER + "\n"
current_section_notes.append(line)
assert current_section_notes
return "## What's new\n\n" + "".join(current_section_notes).strip() + "\n"
def get_commit_history() -> str:
new_version = packaging.version.parse(TAG)
# Pull all tags.
os.popen("git fetch --tags")
# Get all tags sorted by version, latest first.
all_tags = os.popen("git tag -l --sort=-version:refname 'v*'").read().split("\n")
# Out of `all_tags`, find the latest previous version so that we can collect all
# commits between that version and the new version we're about to publish.
# Note that we ignore pre-releases unless the new version is also a pre-release.
last_tag: Optional[str] = None
for tag in all_tags:
if not tag.strip(): # could be blank line
continue
version = packaging.version.parse(tag)
if new_version.pre is None and version.pre is not None:
continue
if version < new_version:
last_tag = tag
break
if last_tag is not None:
commits = os.popen(f"git log {last_tag}..{TAG} --oneline --first-parent").read()
else:
commits = os.popen("git log --oneline --first-parent").read()
return "## Commits\n\n" + commits
def main():
print(get_change_log_notes())
print(get_commit_history())
if __name__ == "__main__":
main()
#!/bin/bash
# Define the output file for the metadata.sha1 fields
OUTPUT_FILE="s2orc_pdfs_v2.txt"
# Clear the output file if it already exists
> "$OUTPUT_FILE"
# Create a temporary directory for partial outputs
temp_output_dir=$(mktemp -d)
# Ensure the temporary directory is cleaned up on exit or error
trap 'rm -rf "$temp_output_dir"' EXIT
# Export the temporary output directory variable for use in xargs
export temp_output_dir
echo "temp dir $temp_output_dir"
# Find all .gz files recursively from the current directory
find 'split=train' -type f -name "*.gz" | \
xargs -P 30 -I{} bash -c '
gz_file="$1"
partial_output="$temp_output_dir/$(basename "$gz_file").txt"
# Stream uncompressed data directly into jq and format the output
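# The quote sequence wrapping the jq program below closes and reopens the outer single-quoted
# bash -c string, so the jq filter itself can stay single-quoted inside it.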
gunzip -c "$gz_file" | jq -r '"'"'
select(.metadata.sha1 != null) |
"s3://ai2-s2-pdfs/" + (.metadata.sha1[:4]) + "/" + (.metadata.sha1[4:]) + ".pdf"
'"'"' >> "$partial_output"
' _ {}
# Concatenate all partial outputs into the final output file
cat "$temp_output_dir"/*.txt >> "$OUTPUT_FILE"
echo "All metadata.sha1 fields have been extracted to $OUTPUT_FILE."