"launch/dynamo-run/vscode:/vscode.git/clone" did not exist on "0c4c4d1de3fc19caf21bf50352e4dd5c0a94139a"
Unverified Commit 5a4c96db authored by Qi Wang's avatar Qi Wang Committed by GitHub
Browse files

test: introduce multimodal benchmark toolkit (#6330)

parent fcdf6610
# Generate aiperf Source Images
aiperf's built-in image generator ships with very few source images. When
benchmarking with `--image-mode base64`, aiperf picks from its
`assets/source_images/` directory — a small set means every request sends
nearly identical images, which doesn't stress the multimodal pipeline
realistically.
This script populates that directory with 200 random-noise PNGs so aiperf
has a larger pool to sample from.
## Usage
```bash
python main.py
```
Images are written directly into aiperf's installed `source_images/` directory.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""CLI argument parsing for aiperf image generation."""
import argparse
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the aiperf source-image generator.

    Returns:
        Namespace carrying ``images_pool`` (int, default 200) and
        ``image_size`` (two ints ``[WIDTH, HEIGHT]``, default ``[512, 512]``).
    """
    cli = argparse.ArgumentParser(
        description="Generate random-noise PNGs into aiperf's source_images directory.",
    )

    # Option table: (flag, keyword arguments forwarded to add_argument).
    option_specs = (
        (
            "--images-pool",
            {
                "type": int,
                "default": 200,
                "help": "Number of unique images to generate (default: 200)",
            },
        ),
        (
            "--image-size",
            {
                "type": int,
                "nargs": 2,
                "default": [512, 512],
                "metavar": ("WIDTH", "HEIGHT"),
                "help": "Size of generated PNG images in pixels (default: 512 512)",
            },
        ),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    return cli.parse_args()
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from pathlib import Path
import aiperf.dataset.generator.image as _img_mod
import numpy as np
from args import parse_args
from PIL import Image
# Destination for the generated PNGs: the "assets/source_images" directory
# located next to the installed aiperf image-generator module's file.
TARGET_DIR = Path(_img_mod.__file__).parent / "assets" / "source_images"
def main() -> None:
    """Write seeded random-noise PNGs into ``TARGET_DIR``.

    The number of images and their size come from the command line (see
    ``parse_args``). Files are named ``noise_0000.png``, ``noise_0001.png``, ...
    """
    cli = parse_args()
    num_images: int = cli.images_pool
    width, height = cli.image_size

    TARGET_DIR.mkdir(parents=True, exist_ok=True)

    # Fixed seed, so the generator is initialised identically on every run.
    noise = np.random.default_rng(42)
    for i in range(num_images):
        # Array layout is (rows, cols, channels), i.e. (height, width, RGB).
        frame = noise.integers(0, 256, (height, width, 3), dtype=np.uint8)
        destination = TARGET_DIR / f"noise_{i:04d}.png"
        Image.fromarray(frame).save(destination)

        # Report progress after every 100th image.
        completed = i + 1
        if completed % 100 == 0:
            print(f"{completed}/{num_images}")

    print(f"\n{num_images} unique {width}x{height} images saved to {TARGET_DIR}")


if __name__ == "__main__":
    main()
# Multimodal JSONL Request Generator
Generates `.jsonl` benchmark files for [aiperf](https://github.com/NVIDIA/aiperf) with single-turn multimodal requests (text + images).
## Key concept: image pool reuse
Each request samples images from a fixed pool. A smaller pool relative to total
image slots produces more cross-request image reuse — useful for benchmarking
embedding cache hit rates.
For example, 500 requests x 3 images each = 1500 image slots. With
`--images-pool 200`, many requests will share the same images.
## Image modes
| Mode | `--image-mode` | What goes in the JSONL | Who fetches the image |
|------|---------------|------------------------|----------------------|
| base64 (default) | `base64` | Absolute file paths to local PNGs | aiperf reads and base64-encodes before sending |
| HTTP | `http` | COCO test2017 URLs | The LLM server downloads images itself |
For `http` mode, download COCO annotations first:
```bash
mkdir -p annotations && cd annotations
wget http://images.cocodataset.org/annotations/image_info_test2017.zip
unzip image_info_test2017.zip
```
## Usage
```bash
# Defaults: 500 requests, 3 images each, pool of 1500 images (requests sample independently, so some images may still repeat across requests), base64 mode
python main.py
# HTTP mode with COCO URLs
python main.py --image-mode http
# Control reuse: 200 requests, pool of 100 unique images
python main.py -n 200 --images-pool 100
# More images per request
python main.py -n 100 --images-per-request 20 --images-pool 500
```
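Output filename encodes the parameters, e.g. `500req_3img_200pool_300word_http.jsonl`. The `pool` field is the number of distinct images actually referenced in the file, which can be smaller than `--images-pool` when not every pooled image gets sampled.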
## Running with aiperf
```bash
aiperf profile \
--model Qwen/Qwen3-VL-30B-A3B-Instruct-FP8 \
--input-file 500req_3img_200pool_300word_http.jsonl \
--custom-dataset-type single_turn \
--shared-system-prompt-length 1000 \
--extra-inputs "max_tokens:500" \
--extra-inputs "min_tokens:500" \
--extra-inputs "ignore_eos:true"
```
Note: the JSONL contains actual content (text + image references), not token
counts. Do not pass `--isl` — it only applies to synthetic data generation.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""CLI argument parsing for request generation scripts."""
import argparse
from pathlib import Path
DEFAULT_IMAGES_PER_REQUEST = 3
USER_TEXT_TOKENS = 300
COCO_ANNOTATIONS = Path(__file__).parent / "annotations" / "image_info_test2017.json"


def parse_args(description: str = "") -> argparse.Namespace:
    """Parse the command-line options of the JSONL request generator.

    Args:
        description: Text shown at the top of ``--help``; rendered verbatim
            because the parser uses ``RawDescriptionHelpFormatter``.

    Returns:
        Namespace with ``num_requests``, ``images_pool`` (``None`` when the
        option is omitted), ``images_per_request``, ``output`` (``None`` when
        omitted), ``image_dir``, ``user_text_tokens``, ``image_mode``,
        ``coco_annotations`` and ``image_size`` (``[WIDTH, HEIGHT]``).
    """
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "-n",
        "--num-requests",
        type=int,
        default=500,
        help="Number of requests to generate (default: 500)",
    )
    parser.add_argument(
        "--images-pool",
        type=int,
        default=None,
        help="Number of unique images in the pool. Each request samples from this pool, "
        "so a smaller pool means more cross-request reuse. "
        "Default: num_requests * images_per_request (all unique, no reuse).",
    )
    parser.add_argument(
        "--images-per-request",
        type=int,
        default=DEFAULT_IMAGES_PER_REQUEST,
        help=f"Number of images per request (default: {DEFAULT_IMAGES_PER_REQUEST})",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=Path,
        default=None,
        help="Output .jsonl path (default: {n}req_{img}img_{pool}pool_{word}word_{mode}.jsonl, e.g. 100req_20img_1000pool_4000word_base64.jsonl)",
    )
    parser.add_argument(
        "--image-dir",
        type=Path,
        default=Path("/tmp/bench_images"),
        help="Directory to save generated PNG images (default: /tmp/bench_images)",
    )
    # The help text advertises --isl as an alias, so register it as a second
    # option string. argparse derives the destination from the first long
    # option, so the value still lands in ``user_text_tokens``.
    parser.add_argument(
        "--user-text-tokens",
        "--isl",
        type=int,
        default=USER_TEXT_TOKENS,
        help=f"Target user text tokens per request (default: {USER_TEXT_TOKENS}). --isl is an alias.",
    )
    parser.add_argument(
        "--image-mode",
        choices=["base64", "http"],
        default="base64",
        help="Image loading mode: 'base64' generates local PNGs and puts file paths in "
        "the JSONL so aiperf reads and base64-encodes them before sending (default); "
        "'http' puts COCO HTTP URLs in the JSONL so the LLM server downloads images itself",
    )
    parser.add_argument(
        "--coco-annotations",
        type=Path,
        default=COCO_ANNOTATIONS,
        help=f"Path to COCO image_info JSON for --image-mode http (default: {COCO_ANNOTATIONS})",
    )
    parser.add_argument(
        "--image-size",
        type=int,
        nargs=2,
        default=[512, 512],
        metavar=("WIDTH", "HEIGHT"),
        help="Size of generated PNG images in pixels (default: 512 512)",
    )
    return parser.parse_args()
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Utilities for generating and sampling image pools."""
import json
import random
from pathlib import Path
import numpy as np
from PIL import Image
def generate_image_pool_base64(
    np_rng: np.random.Generator,
    pool_size: int,
    image_dir: Path,
    image_size: tuple[int, int] = (512, 512),
) -> list[str]:
    """Generate ``pool_size`` random-noise PNG files and return their paths.

    Args:
        np_rng: NumPy generator used to draw the pixel values.
        pool_size: Number of images to create.
        image_dir: Directory receiving the files; created if missing.
        image_size: ``(width, height)`` of each image in pixels.

    Returns:
        Resolved (absolute) path strings of the written files, in creation
        order (``img_0000.png``, ``img_0001.png``, ...).
    """
    width, height = image_size
    image_dir.mkdir(parents=True, exist_ok=True)
    pool: list[str] = []
    for idx in range(pool_size):
        path = image_dir / f"img_{idx:04d}.png"
        # Image arrays are laid out (rows, cols, channels), i.e.
        # (height, width, 3). Passing (width, height, 3) would transpose
        # every non-square image.
        pixels = np_rng.integers(0, 256, (height, width, 3), dtype=np.uint8)
        Image.fromarray(pixels).save(path)
        pool.append(str(path.resolve()))
    print(f" {pool_size} unique {width}x{height} images saved to {image_dir}")
    return pool
def generate_image_pool_http(
    py_rng: random.Random,
    pool_size: int,
    coco_annotations: Path,
) -> list[str]:
    """Pick ``pool_size`` image URLs from a COCO image_info annotations file.

    Args:
        py_rng: Random generator used to shuffle the candidate URLs.
        pool_size: Number of URLs to return.
        coco_annotations: Path to a JSON file whose top-level ``"images"``
            list holds objects with a ``"coco_url"`` entry.

    Returns:
        The first ``pool_size`` URLs of the shuffled candidate list.

    Raises:
        RuntimeError: If ``pool_size`` is negative or exceeds the number of
            URLs available in the annotations file.
    """
    if pool_size < 0:
        # A negative value would otherwise slice from the end of the list and
        # silently return the wrong number of URLs.
        raise RuntimeError(f"--images-pool ({pool_size}) must not be negative.")

    # Explicit encoding so the result does not depend on the platform default.
    with open(coco_annotations, encoding="utf-8") as f:
        data = json.load(f)
    all_urls = [img["coco_url"] for img in data["images"]]
    if pool_size > len(all_urls):
        raise RuntimeError(
            f"--images-pool ({pool_size}) exceeds available COCO images ({len(all_urls)}). "
            f"Reduce --images-pool."
        )
    py_rng.shuffle(all_urls)
    pool = all_urls[:pool_size]
    print(
        f" {pool_size} URLs sampled from {coco_annotations.name} ({len(all_urls)} available)"
    )
    return pool
def sample_slots(
    py_rng: random.Random,
    pool: list[str],
    num_requests: int,
    images_per_request: int,
) -> list[str]:
    """Sample image slots from a fixed pool, no duplicates within each request.

    Args:
        py_rng: Random generator used for sampling.
        pool: Image references (paths or URLs) to draw from.
        num_requests: Number of requests to fill.
        images_per_request: Number of images drawn for each request.

    Returns:
        Flat list of ``num_requests * images_per_request`` references; the
        slots of request ``i`` occupy
        ``[i * images_per_request, (i + 1) * images_per_request)``.

    Raises:
        ValueError: If the pool holds fewer images than one request needs.
    """
    # Explicit check rather than ``assert`` so it still runs under ``python -O``.
    if len(pool) < images_per_request:
        raise ValueError(
            f"images-pool ({len(pool)}) must be >= images-per-request ({images_per_request})"
        )

    total_slots = num_requests * images_per_request
    slot_refs: list[str] = []
    for _ in range(num_requests):
        # sample() draws without replacement, so one request never repeats an image.
        slot_refs.extend(py_rng.sample(pool, images_per_request))

    num_unique = len(set(slot_refs))
    duplicate_refs = total_slots - num_unique
    # Zero requests or zero images per request leaves nothing to divide by.
    reuse_ratio = duplicate_refs / total_slots if total_slots > 0 else 0.0
    print(
        f"Generated {total_slots} image slots from pool of {len(pool)}: "
        f"{num_unique} unique in use, "
        f"{duplicate_refs} duplicate references "
        f"({reuse_ratio:.1%} reuse)"
    )
    return slot_refs
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Utilities for generating filler text that tokenizes predictably."""
import random
# Vocabulary for filler text: common English words, each of which is meant to
# tokenize to a single BPE token on most LLMs. Stored as whitespace-separated
# text and split into a list at import time; word order is significant because
# seeded random choices index into this list.
ENGLISH_VOCAB = (
    "the be to of and a in that have I "
    "it for not on with he as you do at "
    "this but his by from they we say her she "
    "or an will my one all would there their what "
    "so up out if about who get which go me "
    "when make can like time no just him know take "
    "people into year your good some could them see other "
    "than then now look only come its over think also "
    "back after use two how our work first well way "
    "even new want because any these give day most us "
    "great world still own find here thing many long hand "
    "high keep place start might old home big end while "
    "last turn ask need too feel seem call head put "
    "lot run every play small set live try tell few "
    "part change help show house both side point such name "
    "each right move must real left same much open near "
    "line build power water city tree earth plan food dark "
    "cold sure car face nice state fact night hard read "
    "idea stand class body book word best done case four "
    "fire front rest game war air eye true top area "
    "boy girl color oil song note low bed"
).split()
def generate_filler(rng: random.Random, num_tokens: int) -> str:
    """Build filler text of ``num_tokens`` words drawn from ``ENGLISH_VOCAB``.

    One ``rng.choice`` draw is made per word, in order, and the words are
    joined with single spaces. The token count is approximate: it relies on
    each vocabulary word mapping to one token.
    """
    words: list[str] = []
    for _ in range(num_tokens):
        words.append(rng.choice(ENGLISH_VOCAB))
    return " ".join(words)
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Generate a .jsonl benchmark file for aiperf (single-turn, text + images).
Images are drawn from a fixed pool; a smaller pool produces more cross-request
reuse. Supports base64 (local PNGs) and http (COCO URLs) image modes.
Usage:
python main.py
python main.py --image-mode http
python main.py -n 200 --images-pool 100
"""
import json
import random
import time
from pathlib import Path
import numpy as np
from args import parse_args
from generate_images import (
generate_image_pool_base64,
generate_image_pool_http,
sample_slots,
)
from generate_input_text import generate_filler
# Seed taken from the wall clock in milliseconds, reduced modulo 2**32, so it
# changes from run to run. It is evaluated once, when the module is loaded.
SEED = int(time.time() * 1000) % (2**32)
def main() -> None:
    """Generate the image pool and write one JSON request per line.

    Each output line is a compact JSON object with a ``"text"`` field (filler
    words) and an ``"images"`` field (the image references sampled for that
    request). When ``--output`` is omitted, the file is created next to this
    script with a name built from the run parameters.
    """
    args = parse_args(__doc__)
    num_requests: int = args.num_requests
    images_per_request: int = args.images_per_request
    # Fall back to the default pool size only when the option was omitted.
    # ``or`` would also replace an explicit ``--images-pool 0`` with the
    # default instead of letting the pool-size check report it.
    image_pool: int = (
        args.images_pool
        if args.images_pool is not None
        else num_requests * images_per_request
    )

    # Both generators share the module-level seed.
    np_rng = np.random.default_rng(SEED)
    py_rng = random.Random(SEED)

    if args.image_mode == "http":
        pool = generate_image_pool_http(py_rng, image_pool, args.coco_annotations)
    else:
        pool = generate_image_pool_base64(
            np_rng, image_pool, args.image_dir, tuple(args.image_size)
        )

    slot_refs = sample_slots(py_rng, pool, num_requests, images_per_request)
    # Number of distinct images actually referenced; used in the default filename.
    unique_images = len(set(slot_refs))

    output_path = args.output
    if output_path is None:
        output_path = (
            Path(__file__).parent
            / f"{num_requests}req_{images_per_request}img_{unique_images}pool_{args.user_text_tokens}word_{args.image_mode}.jsonl"
        )

    with open(output_path, "w", encoding="utf-8") as f:
        for i in range(num_requests):
            user_text = generate_filler(py_rng, args.user_text_tokens)
            # Request i owns a contiguous run of images_per_request slots.
            start = i * images_per_request
            images = slot_refs[start : start + images_per_request]
            line = json.dumps(
                {"text": user_text, "images": images}, separators=(",", ":")
            )
            f.write(line + "\n")

    print(f"Wrote {num_requests} requests to {output_path}")


if __name__ == "__main__":
    main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment