Unverified Commit 5a4c96db authored by Qi Wang's avatar Qi Wang Committed by GitHub
Browse files

test: introduce multimodal benchmark toolkit (#6330)

parent fcdf6610
# Generate aiperf Source Images
aiperf's built-in image generator ships with very few source images. When
benchmarking with `--image-mode base64`, aiperf picks from its
`assets/source_images/` directory — a small set means every request sends
nearly identical images, which doesn't stress the multimodal pipeline
realistically.
This script populates that directory with random-noise PNGs (200 by default;
configurable with `--images-pool` and `--image-size`) so aiperf has a larger pool to sample from.
## Usage
```bash
python main.py
```
Images are written directly into aiperf's installed `source_images/` directory.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""CLI argument parsing for aiperf image generation."""
import argparse
def parse_args() -> argparse.Namespace:
    """Parse the command line of the noise-image generator.

    Reads ``sys.argv`` and returns a namespace with two attributes:

    * ``images_pool`` -- number of images to generate (int, default 200).
    * ``image_size`` -- two ints given as ``WIDTH HEIGHT`` (default ``[512, 512]``).
    """
    cli = argparse.ArgumentParser(
        description="Generate random-noise PNGs into aiperf's source_images directory."
    )

    # Each entry is (flag, keyword arguments for add_argument).
    option_specs = (
        (
            "--images-pool",
            {
                "type": int,
                "default": 200,
                "help": "Number of unique images to generate (default: 200)",
            },
        ),
        (
            "--image-size",
            {
                "type": int,
                "nargs": 2,
                "default": [512, 512],
                "metavar": ("WIDTH", "HEIGHT"),
                "help": "Size of generated PNG images in pixels (default: 512 512)",
            },
        ),
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)

    return cli.parse_args()
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from pathlib import Path
import aiperf.dataset.generator.image as _img_mod
import numpy as np
from args import parse_args
from PIL import Image
# Output directory: the "assets/source_images" folder that sits next to the
# imported aiperf image-generator module file.
TARGET_DIR = Path(_img_mod.__file__).parent / "assets" / "source_images"
def main() -> None:
    """Write random-noise PNG files into ``TARGET_DIR``.

    The number of images and their ``WIDTH HEIGHT`` come from the command
    line. Pixels are drawn from a NumPy generator seeded with 42, files are
    named ``noise_0000.png``, ``noise_0001.png``, ..., and a progress line is
    printed after every 100 images.
    """
    cli = parse_args()
    width, height = cli.image_size
    total: int = cli.images_pool

    TARGET_DIR.mkdir(parents=True, exist_ok=True)
    noise_rng = np.random.default_rng(42)

    for index in range(total):
        destination = TARGET_DIR / f"noise_{index:04d}.png"
        # Array layout is (rows, columns, channels), i.e. (height, width, RGB).
        noise = noise_rng.integers(0, 256, (height, width, 3), dtype=np.uint8)
        Image.fromarray(noise).save(destination)

        completed = index + 1
        if completed % 100 == 0:
            print(f"{completed}/{total}")

    print(f"\n{total} unique {width}x{height} images saved to {TARGET_DIR}")


if __name__ == "__main__":
    main()
# Multimodal JSONL Request Generator
Generates `.jsonl` benchmark files for [aiperf](https://github.com/NVIDIA/aiperf) with single-turn multimodal requests (text + images).
## Key concept: image pool reuse
Each request samples images from a fixed pool. A smaller pool relative to total
image slots produces more cross-request image reuse — useful for benchmarking
embedding cache hit rates.
For example, 500 requests x 3 images each = 1500 image slots. With
`--images-pool 200`, many requests will share the same images.
## Image modes
| Mode | `--image-mode` | What goes in the JSONL | Who fetches the image |
|------|---------------|------------------------|----------------------|
| base64 (default) | `base64` | Absolute file paths to local PNGs | aiperf reads and base64-encodes before sending |
| HTTP | `http` | COCO test2017 URLs | The LLM server downloads images itself |
For `http` mode, download COCO annotations first:
```bash
mkdir -p annotations && cd annotations
wget http://images.cocodataset.org/annotations/image_info_test2017.zip
unzip image_info_test2017.zip
```
## Usage
```bash
# Defaults: 500 requests, 3 images each, pool of 1500 images (one per image slot), base64 mode
python main.py
# HTTP mode with COCO URLs
python main.py --image-mode http
# Control reuse: 200 requests, pool of 100 unique images
python main.py -n 200 --images-pool 100
# More images per request
python main.py -n 100 --images-per-request 20 --images-pool 500
```
Output filename encodes the parameters, e.g. `500req_3img_200pool_300word_http.jsonl`. The `pool` field is the number of distinct images actually referenced by the generated requests.
## Running with aiperf
```bash
aiperf profile \
--model Qwen/Qwen3-VL-30B-A3B-Instruct-FP8 \
--input-file 500req_3img_200pool_300word_http.jsonl \
--custom-dataset-type single_turn \
--shared-system-prompt-length 1000 \
--extra-inputs "max_tokens:500" \
--extra-inputs "min_tokens:500" \
--extra-inputs "ignore_eos:true"
```
Note: the JSONL contains actual content (text + image references), not token
counts. Do not pass `--isl` — it only applies to synthetic data generation.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""CLI argument parsing for request generation scripts."""
import argparse
from pathlib import Path
# Default number of images attached to each generated request.
DEFAULT_IMAGES_PER_REQUEST = 3
# Default number of filler words (target text tokens) per request.
USER_TEXT_TOKENS = 300
# Default location of the COCO image_info file, relative to this module.
COCO_ANNOTATIONS = Path(__file__).parent / "annotations" / "image_info_test2017.json"


def parse_args(description: str = "") -> argparse.Namespace:
    """Parse the command line of the multimodal JSONL request generator.

    Args:
        description: Text shown at the top of ``--help``; it is printed
            verbatim (``RawDescriptionHelpFormatter``).

    Returns:
        Namespace with ``num_requests``, ``images_pool`` (``None`` when not
        given), ``images_per_request``, ``output`` (``None`` when not given),
        ``image_dir``, ``user_text_tokens``, ``image_mode``,
        ``coco_annotations`` and ``image_size`` (two ints, ``WIDTH HEIGHT``).
    """
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "-n",
        "--num-requests",
        type=int,
        default=500,
        help="Number of requests to generate (default: 500)",
    )
    parser.add_argument(
        "--images-pool",
        type=int,
        default=None,
        # Requests sample from the pool independently, so even a pool as large
        # as the slot count does not guarantee that every slot is unique.
        help="Number of unique images in the pool. Each request samples from this pool, "
        "so a smaller pool means more cross-request reuse. "
        "Default: num_requests * images_per_request (one pool image per image slot; "
        "requests sample independently, so some reuse can still occur).",
    )
    parser.add_argument(
        "--images-per-request",
        type=int,
        default=DEFAULT_IMAGES_PER_REQUEST,
        help=f"Number of images per request (default: {DEFAULT_IMAGES_PER_REQUEST})",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=Path,
        default=None,
        help="Output .jsonl path (default: {n}req_{img}img_{pool}pool_{word}word_{mode}.jsonl, e.g. 100req_20img_1000pool_4000word_base64.jsonl)",
    )
    parser.add_argument(
        "--image-dir",
        type=Path,
        default=Path("/tmp/bench_images"),
        help="Directory to save generated PNG images (default: /tmp/bench_images)",
    )
    parser.add_argument(
        # The first long option determines the destination (user_text_tokens);
        # --isl is registered as the alias that the help text advertises.
        "--user-text-tokens",
        "--isl",
        type=int,
        default=USER_TEXT_TOKENS,
        help=f"Target user text tokens per request (default: {USER_TEXT_TOKENS}). --isl is an alias.",
    )
    parser.add_argument(
        "--image-mode",
        choices=["base64", "http"],
        default="base64",
        help="Image loading mode: 'base64' generates local PNGs and puts file paths in "
        "the JSONL so aiperf reads and base64-encodes them before sending (default); "
        "'http' puts COCO HTTP URLs in the JSONL so the LLM server downloads images itself",
    )
    parser.add_argument(
        "--coco-annotations",
        type=Path,
        default=COCO_ANNOTATIONS,
        help=f"Path to COCO image_info JSON for --image-mode http (default: {COCO_ANNOTATIONS})",
    )
    parser.add_argument(
        "--image-size",
        type=int,
        nargs=2,
        default=[512, 512],
        metavar=("WIDTH", "HEIGHT"),
        help="Size of generated PNG images in pixels (default: 512 512)",
    )
    return parser.parse_args()
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Utilities for generating and sampling image pools."""
import json
import random
from pathlib import Path
import numpy as np
from PIL import Image
def generate_image_pool_base64(
    np_rng: np.random.Generator,
    pool_size: int,
    image_dir: Path,
    image_size: tuple[int, int] = (512, 512),
) -> list[str]:
    """Generate ``pool_size`` random-noise PNG files and return their paths.

    Args:
        np_rng: NumPy generator used to draw the pixel values.
        pool_size: Number of PNG files to write.
        image_dir: Directory that receives the files; created if missing.
        image_size: Image size as ``(width, height)`` in pixels.

    Returns:
        Resolved (absolute) paths of the written files as strings, in
        generation order (``img_0000.png``, ``img_0001.png``, ...).
    """
    width, height = image_size
    image_dir.mkdir(parents=True, exist_ok=True)
    pool: list[str] = []
    for idx in range(pool_size):
        path = image_dir / f"img_{idx:04d}.png"
        # Image arrays are laid out (rows, columns, channels), so the shape is
        # (height, width, 3). Passing (width, height, 3) would transpose any
        # non-square image relative to the requested WIDTH x HEIGHT.
        pixels = np_rng.integers(0, 256, (height, width, 3), dtype=np.uint8)
        Image.fromarray(pixels).save(path)
        pool.append(str(path.resolve()))
    print(f" {pool_size} unique {width}x{height} images saved to {image_dir}")
    return pool
def generate_image_pool_http(
    py_rng: random.Random,
    pool_size: int,
    coco_annotations: Path,
) -> list[str]:
    """Select ``pool_size`` image URLs from a COCO image_info JSON file.

    The file's ``images`` entries are read, their ``coco_url`` values are
    shuffled with ``py_rng``, and the first ``pool_size`` of them are returned.

    Raises:
        RuntimeError: If ``pool_size`` is larger than the number of URLs in
            the annotations file.
    """
    with open(coco_annotations) as handle:
        annotations = json.load(handle)

    urls: list[str] = []
    for entry in annotations["images"]:
        urls.append(entry["coco_url"])
    available = len(urls)

    if pool_size > available:
        raise RuntimeError(
            f"--images-pool ({pool_size}) exceeds available COCO images ({available}). "
            f"Reduce --images-pool."
        )

    # Shuffle in place, then keep the leading slice as the pool.
    py_rng.shuffle(urls)
    selected = urls[:pool_size]
    print(
        f" {pool_size} URLs sampled from {coco_annotations.name} ({available} available)"
    )
    return selected
def sample_slots(
    py_rng: random.Random,
    pool: list[str],
    num_requests: int,
    images_per_request: int,
) -> list[str]:
    """Sample image references for every request from a fixed pool.

    Each request draws ``images_per_request`` distinct entries from ``pool``;
    different requests draw independently, so the same entry may appear in
    several requests. A summary of the resulting reuse is printed.

    Args:
        py_rng: Random generator used for sampling.
        pool: Candidate image references (paths or URLs).
        num_requests: Number of requests to sample for.
        images_per_request: Number of references drawn per request.

    Returns:
        A flat list of ``num_requests * images_per_request`` references;
        request ``i`` owns the slice starting at ``i * images_per_request``.

    Raises:
        ValueError: If ``pool`` has fewer entries than ``images_per_request``.
    """
    # Raise explicitly instead of asserting: asserts are stripped under -O.
    if len(pool) < images_per_request:
        raise ValueError(
            f"images-pool ({len(pool)}) must be >= images-per-request ({images_per_request})"
        )

    total_slots = num_requests * images_per_request
    slot_refs: list[str] = []
    for _ in range(num_requests):
        slot_refs.extend(py_rng.sample(pool, images_per_request))

    num_unique = len(set(slot_refs))
    num_duplicates = total_slots - num_unique
    # Avoid dividing by zero when no slots were requested at all.
    reuse_ratio = num_duplicates / total_slots if total_slots > 0 else 0.0
    print(
        f"Generated {total_slots} image slots from pool of {len(pool)}: "
        f"{num_unique} unique in use, "
        f"{num_duplicates} duplicate references "
        f"({reuse_ratio:.1%} reuse)"
    )
    return slot_refs
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Utilities for generating filler text that tokenizes predictably."""
import random
# Vocabulary for filler text: 208 common English words. They were chosen with
# the expectation that each maps to a single BPE token on most LLM tokenizers
# (not verified here). Order matters for reproducibility with a seeded RNG.
ENGLISH_VOCAB = """
the be to of and a in that have I
it for not on with he as you do at
this but his by from they we say her she
or an will my one all would there their what
so up out if about who get which go me
when make can like time no just him know take
people into year your good some could them see other
than then now look only come its over think also
back after use two how our work first well way
even new want because any these give day most us
great world still own find here thing many long hand
high keep place start might old home big end while
last turn ask need too feel seem call head put
lot run every play small set live try tell few
part change help show house both side point such name
each right move must real left same much open near
line build power water city tree earth plan food dark
cold sure car face nice state fact night hard read
idea stand class body book word best done case four
fire front rest game war air eye true top area
boy girl color oil song note low bed
""".split()
def generate_filler(rng: random.Random, num_tokens: int) -> str:
    """Build filler text of ``num_tokens`` words drawn from ``ENGLISH_VOCAB``.

    Words are picked one at a time with ``rng.choice`` and joined with single
    spaces; a non-positive ``num_tokens`` yields an empty string.
    """
    picked: list[str] = []
    for _ in range(num_tokens):
        picked.append(rng.choice(ENGLISH_VOCAB))
    return " ".join(picked)
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Generate a .jsonl benchmark file for aiperf (single-turn, text + images).
Images are drawn from a fixed pool; a smaller pool produces more cross-request
reuse. Supports base64 (local PNGs) and http (COCO URLs) image modes.
Usage:
python main.py
python main.py --image-mode http
python main.py -n 200 --images-pool 100
"""
import json
import random
import time
from pathlib import Path
import numpy as np
from args import parse_args
from generate_images import (
generate_image_pool_base64,
generate_image_pool_http,
sample_slots,
)
from generate_input_text import generate_filler
# Seed for the NumPy and stdlib RNGs created in main(): the current wall-clock
# time in milliseconds reduced modulo 2**32, so it varies from run to run.
SEED = int(time.time() * 1000) % (2**32)
def main() -> None:
    """Build the image pool, sample images per request, and write the JSONL file.

    Steps:
      1. Parse the command line.
      2. Create the pool: COCO URLs in ``http`` mode, otherwise local
         random-noise PNGs.
      3. Sample ``images_per_request`` references for each request.
      4. Write one JSON object per line with ``text`` and ``images`` keys.

    When ``--output`` is not given, the file is written next to this script
    and its name is built from the request count, images per request, the
    number of distinct images actually referenced, the text-token target and
    the image mode.
    """
    args = parse_args(__doc__)
    num_requests: int = args.num_requests
    images_per_request: int = args.images_per_request

    # Compare against None explicitly: `or` would treat an explicit
    # `--images-pool 0` as "not given" and silently substitute the default.
    if args.images_pool is None:
        image_pool: int = num_requests * images_per_request
    else:
        image_pool = args.images_pool

    # Both generators share the same seed.
    np_rng = np.random.default_rng(SEED)
    py_rng = random.Random(SEED)

    if args.image_mode == "http":
        pool = generate_image_pool_http(py_rng, image_pool, args.coco_annotations)
    else:
        pool = generate_image_pool_base64(
            np_rng, image_pool, args.image_dir, tuple(args.image_size)
        )

    slot_refs = sample_slots(py_rng, pool, num_requests, images_per_request)
    unique_images = len(set(slot_refs))

    output_path = args.output
    if output_path is None:
        output_path = (
            Path(__file__).parent
            / f"{num_requests}req_{images_per_request}img_{unique_images}pool_{args.user_text_tokens}word_{args.image_mode}.jsonl"
        )

    with open(output_path, "w") as f:
        for i in range(num_requests):
            user_text = generate_filler(py_rng, args.user_text_tokens)
            # Request i owns a contiguous run of images_per_request slots.
            start = i * images_per_request
            images = slot_refs[start : start + images_per_request]
            line = json.dumps(
                {"text": user_text, "images": images}, separators=(",", ":")
            )
            f.write(line + "\n")

    print(f"Wrote {num_requests} requests to {output_path}")


if __name__ == "__main__":
    main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment