Unverified commit 32ec044c, authored by Qi Wang, committed by GitHub
Browse files

feat(benchmarks): add sliding-window JSONL generation strategy (#8201)


Signed-off-by: furionw <qiwa@nvidia.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
parent f8208f8d
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""CLI argument parsing for request generation scripts."""
"""CLI argument parsing for JSONL benchmark generation."""
import argparse
import sys
from pathlib import Path
DEFAULT_IMAGES_PER_REQUEST = 3
......@@ -11,77 +12,140 @@ USER_TEXT_TOKENS = 300
COCO_ANNOTATIONS = Path(__file__).parent / "annotations" / "image_info_test2017.json"
def parse_args(description: str = "") -> argparse.Namespace:
parser = argparse.ArgumentParser(
description=description,
formatter_class=argparse.RawDescriptionHelpFormatter,
def _positive_int(value: str) -> int:
    """Convert a command-line token to an integer that must be greater than zero.

    Intended for use as an argparse ``type=`` callable.

    Args:
        value: Raw string taken from the command line.

    Returns:
        The parsed integer (always >= 1).

    Raises:
        argparse.ArgumentTypeError: If the parsed integer is zero or negative.
        ValueError: If ``value`` is not an integer literal (propagated from
            ``int()``).
    """
    iv = int(value)
    # Happy path first; anything else falls through to the rejection below.
    if iv > 0:
        return iv
    raise argparse.ArgumentTypeError(f"must be a positive integer, got {iv}")
def _common_parser() -> argparse.ArgumentParser:
    """Build the parent parser holding the options shared by all strategies.

    The parser is created with ``add_help=False`` because it is only meant to
    be handed to a subcommand parser through ``parents=[...]``; the subcommand
    parser supplies its own ``-h/--help``.

    Returns:
        A parser that defines ``-o/--output``, ``--user-text-tokens`` and
        ``--seed``.
    """
    p = argparse.ArgumentParser(add_help=False)
    p.add_argument(
        "-o",
        "--output",
        type=Path,
        default=None,
        help="Output .jsonl path (default: auto-generated from parameters)",
    )
    p.add_argument(
        "--user-text-tokens",
        type=int,
        default=USER_TEXT_TOKENS,
        help=f"Target user text tokens per request (default: {USER_TEXT_TOKENS})",
    )
    p.add_argument(
        "--seed",
        type=int,
        default=None,
        help="Random seed for reproducible generation (default: time-based)",
    )
    return p
def _image_parser() -> argparse.ArgumentParser:
    """Build the parent parser holding the image-generation options.

    Kept separate from the common options so that it can be reused (for
    example by future video/audio parsers). Created with ``add_help=False``
    because it is only meant to be passed via ``parents=[...]``.

    Returns:
        A parser that defines ``--image-size``, ``--image-dir``,
        ``--image-mode`` and ``--coco-annotations``.
    """
    p = argparse.ArgumentParser(add_help=False)
    p.add_argument(
        "--image-size",
        type=int,
        nargs=2,
        default=[512, 512],
        metavar=("WIDTH", "HEIGHT"),
        help="Size of generated PNG images in pixels (default: 512 512)",
    )
    p.add_argument(
        "--image-dir",
        type=Path,
        default=Path("/tmp/bench_images"),
        help="Directory to save generated PNG images (default: /tmp/bench_images)",
    )
    p.add_argument(
        "--image-mode",
        choices=["base64", "http"],
        default="base64",
        help="'base64' generates local PNGs (default); 'http' uses COCO URLs",
    )
    p.add_argument(
        "--coco-annotations",
        type=Path,
        default=COCO_ANNOTATIONS,
        help=f"Path to COCO image_info JSON for --image-mode http (default: {COCO_ANNOTATIONS})",
    )
    return p
def parse_args(
    description: str = "",
    argv: "list[str] | None" = None,
) -> argparse.Namespace:
    """Parse the command line for the JSONL generation strategies.

    Two subcommands are registered, ``single-turn`` and ``sliding-window``;
    both inherit the common and image options. When the first argument is not
    a known strategy name, ``single-turn`` is prepended so that invocations
    without a subcommand keep working.

    Args:
        description: Text shown at the top of the top-level ``--help`` output.
        argv: Arguments to parse, excluding the program name. ``None`` (the
            default) means ``sys.argv[1:]``.

    Returns:
        The parsed namespace. Its ``strategy`` attribute holds the selected
        subcommand name.
    """
    common = _common_parser()
    image = _image_parser()

    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    sub = parser.add_subparsers(dest="strategy")

    # --- single-turn (default) ---
    st = sub.add_parser(
        "single-turn",
        parents=[common, image],
        help="Independent requests with random image sampling (default)",
    )
    st.add_argument(
        "-n",
        "--num-requests",
        type=int,
        default=500,
        help="Number of requests to generate (default: 500)",
    )
    st.add_argument(
        "--images-per-request",
        type=int,
        default=DEFAULT_IMAGES_PER_REQUEST,
        help=f"Number of images per request (default: {DEFAULT_IMAGES_PER_REQUEST})",
    )
    st.add_argument(
        "--images-pool",
        type=int,
        default=None,
        help="Unique images in pool. Smaller pool = more cross-request reuse. "
        "Default: num_requests * images_per_request (all unique).",
    )

    # --- sliding-window ---
    sw = sub.add_parser(
        "sliding-window",
        parents=[common, image],
        help="Causal sessions with sliding-window image overlap",
    )
    sw.add_argument(
        "--num-users",
        type=_positive_int,
        default=10,
        help="Number of concurrent user sessions (default: 10)",
    )
    sw.add_argument(
        "--turns-per-user",
        type=_positive_int,
        default=20,
        help="Number of requests per user (default: 20)",
    )
    sw.add_argument(
        "--window-size",
        type=_positive_int,
        default=5,
        help="Sliding window width — each turn sees this many images, "
        "with window_size-1 overlap between consecutive turns (default: 5)",
    )

    # Default to single-turn when no subcommand given, but let top-level
    # `-h`/`--help` flow through the main parser so users see both
    # subcommands and the module description.
    known_strategies = {"single-turn", "sliding-window"}
    # Copy so that the caller's list is never mutated.
    argv = list(sys.argv[1:] if argv is None else argv)
    help_requested = bool(argv) and argv[0] in {"-h", "--help"}
    if not help_requested and (not argv or argv[0] not in known_strategies):
        argv = ["single-turn", *argv]
    return parser.parse_args(argv)
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Generate a .jsonl benchmark file for aiperf (single-turn, text + images).
"""Generate .jsonl benchmark files for aiperf.
Images are drawn from a fixed pool; a smaller pool produces more cross-request
reuse. Supports base64 (local PNGs) and http (COCO URLs) image modes.
Strategies:
single-turn Independent requests with random image sampling (default)
sliding-window Causal sessions with sliding-window image overlap
Usage:
python main.py
python main.py --image-mode http
python main.py -n 200 --images-pool 100
python main.py single-turn --image-mode http
python main.py sliding-window --num-users 10 --turns-per-user 20 --window-size 5
"""
import argparse
import json
import random
import time
......@@ -27,31 +29,32 @@ from generate_images import (
from generate_input_text import generate_filler
def main() -> None:
args = parse_args(__doc__)
def _make_pool(
    args: argparse.Namespace,
    pool_size: int,
    np_rng: np.random.Generator,
    py_rng: random.Random,
) -> list[str]:
    """Create the image pool for the image mode selected on the command line.

    Args:
        args: Parsed CLI namespace; reads ``image_mode`` and, depending on the
            mode, ``coco_annotations`` or ``image_dir`` / ``image_size``.
        pool_size: Number of images requested from the pool generator.
        np_rng: NumPy generator, passed to the base64 pool generator.
        py_rng: Python ``random.Random``, passed to the http pool generator.

    Returns:
        Whatever the selected pool generator returns (annotated as a list of
        strings).
    """
    if args.image_mode != "http":
        # Any non-http mode uses locally generated PNG files.
        size = tuple(args.image_size)
        return generate_image_pool_base64(np_rng, pool_size, args.image_dir, size)
    return generate_image_pool_http(py_rng, pool_size, args.coco_annotations)
def run_single_turn(
args: argparse.Namespace,
np_rng: np.random.Generator,
py_rng: random.Random,
) -> None:
num_requests: int = args.num_requests
images_per_request: int = args.images_per_request
image_pool: int = args.images_pool or (num_requests * images_per_request)
seed: int = (
args.seed if args.seed is not None else int(time.time() * 1000) % (2**32)
)
print(f"Using seed: {seed}")
np_rng = np.random.default_rng(seed)
py_rng = random.Random(seed)
if args.image_mode == "http":
pool = generate_image_pool_http(py_rng, image_pool, args.coco_annotations)
else:
pool = generate_image_pool_base64(
np_rng, image_pool, args.image_dir, tuple(args.image_size)
)
pool = _make_pool(args, image_pool, np_rng, py_rng)
slot_refs = sample_slots(py_rng, pool, num_requests, images_per_request)
output_path = args.output
if output_path is None:
output_path = (
output_path = args.output or (
Path(__file__).parent
/ f"{num_requests}req_{images_per_request}img_{image_pool}pool_{args.user_text_tokens}word_{args.image_mode}.jsonl"
)
......@@ -69,5 +72,66 @@ def main() -> None:
print(f"Wrote {num_requests} requests to {output_path}")
def run_sliding_window(
    args: argparse.Namespace,
    np_rng: np.random.Generator,
    py_rng: random.Random,
) -> None:
    """Write a JSONL file of per-user sessions whose image windows slide by one.

    Each user is given a contiguous run of ``window_size + turns_per_user - 1``
    pool entries. On turn ``t`` the user's request carries entries
    ``[t, t + window_size)`` of that run, so consecutive turns of the same user
    share ``window_size - 1`` images. Records are written turn by turn, with all
    users emitted within each turn.

    Args:
        args: Parsed CLI namespace; reads ``num_users``, ``turns_per_user``,
            ``window_size``, ``output``, ``user_text_tokens`` and
            ``image_mode`` (plus whatever ``_make_pool`` reads).
        np_rng: NumPy generator forwarded to ``_make_pool``.
        py_rng: Python RNG forwarded to ``_make_pool`` and used for the filler
            text of every record.
    """
    num_users: int = args.num_users
    turns_per_user: int = args.turns_per_user
    window_size: int = args.window_size

    # First turn needs a full window; every later turn adds one new image.
    images_per_user = window_size + turns_per_user - 1
    total_images = num_users * images_per_user
    total_requests = num_users * turns_per_user

    print(
        f"Sliding window: {num_users} users × {turns_per_user} turns, "
        f"window={window_size}, {images_per_user} images/user, "
        f"{total_images} total images"
    )

    # NOTE(review): assumes the pool holds at least total_images entries;
    # shorter pools would silently yield shorter windows — confirm.
    pool = _make_pool(args, total_images, np_rng, py_rng)

    output_path = args.output
    if not output_path:
        output_path = (
            Path(__file__).parent
            / f"{num_users}u_{turns_per_user}t_{window_size}w_{args.user_text_tokens}word_{args.image_mode}.jsonl"
        )

    # Start index of each user's private run inside the pool.
    user_starts = [user * images_per_user for user in range(num_users)]

    with open(output_path, "w") as sink:
        for turn in range(turns_per_user):
            for user_idx, start in enumerate(user_starts):
                first = start + turn
                record = {
                    "session_id": f"user_{user_idx}",
                    "text": generate_filler(py_rng, args.user_text_tokens),
                    "images": pool[first : first + window_size],
                }
                serialized = json.dumps(record, separators=(",", ":"))
                sink.write(serialized + "\n")

    print(f"Wrote {total_requests} requests ({num_users} sessions) to {output_path}")
# Dispatch table mapping each CLI strategy name to the function that runs it.
# main() looks the runner up by the parsed ``strategy`` value and calls it
# with (args, np_rng, py_rng).
STRATEGIES = {
    "single-turn": run_single_turn,
    "sliding-window": run_sliding_window,
}
def main() -> None:
    """Parse the CLI, seed both random generators and run the chosen strategy.

    When ``--seed`` is not given, a seed is derived from the current time in
    milliseconds, reduced modulo 2**32. The seed in use is always printed.
    """
    args = parse_args(__doc__)

    seed: int
    if args.seed is None:
        # No explicit seed: fall back to a wall-clock-derived 32-bit value.
        seed = int(time.time() * 1000) % (2**32)
    else:
        seed = args.seed
    print(f"Using seed: {seed}")

    # Both generators are seeded with the same value.
    py_rng = random.Random(seed)
    np_rng = np.random.default_rng(seed)

    run_strategy = STRATEGIES[args.strategy]
    run_strategy(args, np_rng, py_rng)
if __name__ == "__main__":
main()
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import json
import sys
from pathlib import Path
from unittest.mock import patch
import pytest
pytest.importorskip("PIL", reason="Pillow required for image generation benchmarks")
# The generators use relative imports, so add the source dir to sys.path
JSONL_DIR = Path(__file__).resolve().parents[4] / "benchmarks" / "multimodal" / "jsonl"
sys.path.insert(0, str(JSONL_DIR))
from main import main # noqa: E402
pytestmark = [pytest.mark.unit, pytest.mark.pre_merge, pytest.mark.gpu_0]
def _run_main(tmp_path: Path, argv: list[str]) -> list[dict]:
    """Call ``main()`` with a patched ``sys.argv`` and load the JSONL it produced.

    Args:
        tmp_path: Directory expected to contain exactly one ``*.jsonl`` file
            once ``main()`` has finished (only its top level is searched).
        argv: Command-line arguments, without the program name.

    Returns:
        One decoded JSON object per non-blank line of the output file.
    """
    with patch("sys.argv", ["main.py", *argv]):
        main()

    jsonl_files = list(tmp_path.glob("*.jsonl"))
    assert len(jsonl_files) == 1, f"Expected 1 JSONL file, got {jsonl_files}"

    records: list[dict] = []
    with open(jsonl_files[0]) as handle:
        for raw in handle:
            if raw.strip():
                records.append(json.loads(raw))
    return records
class TestSingleTurnDefault:
    """Omitting the subcommand must behave like ``single-turn``."""

    def test_default_produces_independent_requests(self, tmp_path: Path) -> None:
        """Four requests, two images each, and no session grouping."""
        image_dir = tmp_path / "imgs"
        output = tmp_path / "out.jsonl"
        argv = [
            "-n",
            "4",
            "--images-per-request",
            "2",
            "--image-size",
            "32",
            "32",
            "--image-dir",
            str(image_dir),
            "--seed",
            "1",
            "-o",
            str(output),
        ]

        lines = _run_main(tmp_path, argv)

        assert len(lines) == 4
        for line in lines:
            assert "text" in line
            assert len(line["images"]) == 2
            # Single-turn records carry no session identifier.
            assert "session_id" not in line
class TestSlidingWindow:
    """The sliding-window strategy emits per-user sessions with overlapping images."""

    @staticmethod
    def _argv(
        tmp_path: Path,
        *,
        num_users: int,
        turns_per_user: int,
        window_size: int,
        seed: int,
        output_name: str,
    ) -> list[str]:
        """Assemble a sliding-window command line using tiny 32x32 images."""
        return [
            "sliding-window",
            "--num-users",
            str(num_users),
            "--turns-per-user",
            str(turns_per_user),
            "--window-size",
            str(window_size),
            "--image-size",
            "32",
            "32",
            "--image-dir",
            str(tmp_path / "imgs"),
            "--seed",
            str(seed),
            "-o",
            str(tmp_path / output_name),
        ]

    def test_output_structure(self, tmp_path: Path) -> None:
        """Two users over three turns give six records, interleaved by user."""
        argv = self._argv(
            tmp_path,
            num_users=2,
            turns_per_user=3,
            window_size=5,
            seed=42,
            output_name="sw.jsonl",
        )
        lines = _run_main(tmp_path, argv)

        assert len(lines) == 6  # 2 users x 3 turns

        # Round-robin interleaving: user_0, user_1, user_0, user_1, ...
        session_ids = [row["session_id"] for row in lines]
        assert session_ids == ["user_0", "user_1"] * 3

        for line in lines:
            assert len(line["images"]) == 5

    def test_image_overlap(self, tmp_path: Path) -> None:
        """Consecutive turns of one user share all but one image."""
        argv = self._argv(
            tmp_path,
            num_users=1,
            turns_per_user=3,
            window_size=4,
            seed=7,
            output_name="overlap.jsonl",
        )
        lines = _run_main(tmp_path, argv)

        assert len(lines) == 3

        # Consecutive turns should share window_size-1 images
        for i, (earlier, later) in enumerate(zip(lines, lines[1:])):
            prev = earlier["images"]
            curr = later["images"]
            # Sliding by 1: prev[1:] == curr[:-1]
            assert (
                prev[1:] == curr[:-1]
            ), f"Turn {i} and {i + 1} should share 3/4 images"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment