test: introduce multimodal benchmark toolkit (#6330)

5a4c96db · Qi Wang · GitHub · fcdf6610 · 5a4c96db · 5a4c96db
Unverified Commit 5a4c96db authored Feb 24, 2026 by Qi Wang Committed by GitHub Feb 24, 2026
11 changed files
--- a/benchmarks/multimodal/generate_aiperf_images/README.md
+++ b/benchmarks/multimodal/generate_aiperf_images/README.md
+# Generate aiperf Source Images
+aiperf's built-in image generator ships with very few source images. When
+benchmarking with `--image-mode base64`, aiperf picks from its
+`assets/source_images/` directory — a small set means every request sends
+nearly identical images, which doesn't stress the multimodal pipeline
+realistically.
+This script populates that directory with 200 random-noise PNGs so aiperf
+has a larger pool to sample from.
+## Usage
+```bash
+python main.py
+```
+Images are written directly into aiperf's installed `source_images/` directory.
--- a/benchmarks/multimodal/generate_aiperf_images/args.py
+++ b/benchmarks/multimodal/generate_aiperf_images/args.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""CLI argument parsing for aiperf image generation."""
+import argparse
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Generate random-noise PNGs into aiperf's source_images directory.",
+    )
+    parser.add_argument(
+        "--images-pool",
+        type=int,
+        default=200,
+        help="Number of unique images to generate (default: 200)",
+    )
+    parser.add_argument(
+        "--image-size",
+        type=int,
+        nargs=2,
+        default=[512, 512],
+        metavar=("WIDTH", "HEIGHT"),
+        help="Size of generated PNG images in pixels (default: 512 512)",
+    )
+    return parser.parse_args()
--- a/benchmarks/multimodal/generate_aiperf_images/main.py
+++ b/benchmarks/multimodal/generate_aiperf_images/main.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+from pathlib import Path
+import aiperf.dataset.generator.image as _img_mod
+import numpy as np
+from args import parse_args
+from PIL import Image
+TARGET_DIR = Path(_img_mod.__file__).parent / "assets" / "source_images"  # assets/source_images beside aiperf's image generator module
+def main() -> None:
+    args = parse_args()
+    num_images: int = args.images_pool
+    width, height = args.image_size
+    TARGET_DIR.mkdir(parents=True, exist_ok=True)
+    rng = np.random.default_rng(42)
+    for i in range(num_images):
+        pixels = rng.integers(0, 256, (height, width, 3), dtype=np.uint8)
+        Image.fromarray(pixels).save(TARGET_DIR / f"noise_{i:04d}.png")
+        if (i + 1) % 100 == 0:
+            print(f"{i + 1}/{num_images}")
+    print(f"\n{num_images} unique {width}x{height} images saved to {TARGET_DIR}")
+if __name__ == "__main__":
+    main()
--- a/benchmarks/multimodal/jsonl/.gitignore
+++ b/benchmarks/multimodal/jsonl/.gitignore
+*.jsonl
--- a/benchmarks/multimodal/jsonl/README.md
+++ b/benchmarks/multimodal/jsonl/README.md
+# Multimodal JSONL Request Generator
+Generates `.jsonl` benchmark files for [aiperf](https://github.com/NVIDIA/aiperf) with single-turn multimodal requests (text + images).
+## Key concept: image pool reuse
+Each request samples images from a fixed pool. A smaller pool relative to total
+image slots produces more cross-request image reuse — useful for benchmarking
+embedding cache hit rates.
+For example, 500 requests x 3 images each = 1500 image slots. With
+`--images-pool 200`, many requests will share the same images.
+## Image modes
+| Mode | `--image-mode` | What goes in the JSONL | Who fetches the image |
+|------|---------------|------------------------|----------------------|
+| base64 (default) | `base64` | Absolute file paths to local PNGs | aiperf reads and base64-encodes before sending |
+| HTTP | `http` | COCO test2017 URLs | The LLM server downloads images itself |
+For `http` mode, download COCO annotations first:
+```bash
+mkdir -p annotations && cd annotations
+wget http://images.cocodataset.org/annotations/image_info_test2017.zip
+unzip image_info_test2017.zip
+```
+## Usage
+```bash
+# Defaults: 500 requests, 3 images each, all unique, base64 mode
+python main.py
+# HTTP mode with COCO URLs
+python main.py --image-mode http
+# Control reuse: 200 requests, pool of 100 unique images
+python main.py -n 200 --images-pool 100
+# More images per request
+python main.py -n 100 --images-per-request 20 --images-pool 500
+```
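+Output filename encodes the parameters, e.g. `500req_3img_200pool_300word_http.jsonl`. The `pool` field is the number of unique images actually referenced by the generated requests, which can be smaller than `--images-pool`; the `word` field is `--user-text-tokens`.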
+## Running with aiperf
+```bash
+aiperf profile \
+  --model Qwen/Qwen3-VL-30B-A3B-Instruct-FP8 \
+  --input-file 500req_3img_200pool_300word_http.jsonl \
+  --custom-dataset-type single_turn \
+  --shared-system-prompt-length 1000 \
+  --extra-inputs "max_tokens:500" \
+  --extra-inputs "min_tokens:500" \
+  --extra-inputs "ignore_eos:true"
+```
+Note: the JSONL contains actual content (text + image references), not token
+counts. Do not pass `--isl` — it only applies to synthetic data generation.
--- a/benchmarks/multimodal/jsonl/annotations/image_info_test-dev2017.json
+++ b/benchmarks/multimodal/jsonl/annotations/image_info_test-dev2017.json
--- a/benchmarks/multimodal/jsonl/annotations/image_info_test2017.json
+++ b/benchmarks/multimodal/jsonl/annotations/image_info_test2017.json
--- a/benchmarks/multimodal/jsonl/args.py
+++ b/benchmarks/multimodal/jsonl/args.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""CLI argument parsing for request generation scripts."""
+import argparse
+from pathlib import Path
+DEFAULT_IMAGES_PER_REQUEST = 3  # default for --images-per-request
+USER_TEXT_TOKENS = 300  # default for --user-text-tokens
+COCO_ANNOTATIONS = Path(__file__).parent / "annotations" / "image_info_test2017.json"  # default for --coco-annotations
+def parse_args(description: str = "") -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description=description,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument(
+        "-n",
+        "--num-requests",
+        type=int,
+        default=500,
+        help="Number of requests to generate (default: 500)",
+    )
+    parser.add_argument(
+        "--images-pool",
+        type=int,
+        default=None,
+        help="Number of unique images in the pool. Each request samples from this pool, "
+        "so a smaller pool means more cross-request reuse. "
+        "Default: num_requests * images_per_request (all unique, no reuse).",
+    )
+    parser.add_argument(
+        "--images-per-request",
+        type=int,
+        default=DEFAULT_IMAGES_PER_REQUEST,
+        help=f"Number of images per request (default: {DEFAULT_IMAGES_PER_REQUEST})",
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        type=Path,
+        default=None,
+        help="Output .jsonl path (default: {n}req_{img}img_{pool}pool_{word}word_{mode}.jsonl, e.g. 100req_20img_1000pool_4000word_base64.jsonl)",
+    )
+    parser.add_argument(
+        "--image-dir",
+        type=Path,
+        default=Path("/tmp/bench_images"),
+        help="Directory to save generated PNG images (default: /tmp/bench_images)",
+    )
+    parser.add_argument(
+        "--user-text-tokens",
+        type=int,
+        default=USER_TEXT_TOKENS,
+        help=f"Target user text tokens per request (default: {USER_TEXT_TOKENS}). --isl is an alias.",
+    )
+    parser.add_argument(
+        "--image-mode",
+        choices=["base64", "http"],
+        default="base64",
+        help="Image loading mode: 'base64' generates local PNGs and puts file paths in "
+        "the JSONL so aiperf reads and base64-encodes them before sending (default); "
+        "'http' puts COCO HTTP URLs in the JSONL so the LLM server downloads images itself",
+    )
+    parser.add_argument(
+        "--coco-annotations",
+        type=Path,
+        default=COCO_ANNOTATIONS,
+        help=f"Path to COCO image_info JSON for --image-mode http (default: {COCO_ANNOTATIONS})",
+    )
+    parser.add_argument(
+        "--image-size",
+        type=int,
+        nargs=2,
+        default=[512, 512],
+        metavar=("WIDTH", "HEIGHT"),
+        help="Size of generated PNG images in pixels (default: 512 512)",
+    )
+    return parser.parse_args()
--- a/benchmarks/multimodal/jsonl/generate_images.py
+++ b/benchmarks/multimodal/jsonl/generate_images.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Utilities for generating and sampling image pools."""
+import json
+import random
+from pathlib import Path
+import numpy as np
+from PIL import Image
+def generate_image_pool_base64(
+    np_rng: np.random.Generator,
+    pool_size: int,
+    image_dir: Path,
+    image_size: tuple[int, int] = (512, 512),
+) -> list[str]:
+    """Generate pool_size random PNG files and return their paths."""
+    image_dir.mkdir(parents=True, exist_ok=True)
+    pool: list[str] = []
+    for idx in range(pool_size):
+        path = image_dir / f"img_{idx:04d}.png"
+        pixels = np_rng.integers(0, 256, (*image_size, 3), dtype=np.uint8)
+        Image.fromarray(pixels).save(path)
+        pool.append(str(path.resolve()))
+    print(
+        f"  {pool_size} unique {image_size[0]}x{image_size[1]} images saved to {image_dir}"
+    )
+    return pool
+def generate_image_pool_http(
+    py_rng: random.Random,
+    pool_size: int,
+    coco_annotations: Path,
+) -> list[str]:
+    """Pick pool_size unique COCO test2017 URLs."""
+    with open(coco_annotations) as f:
+        data = json.load(f)
+    all_urls = [img["coco_url"] for img in data["images"]]
+    if pool_size > len(all_urls):
+        raise RuntimeError(
+            f"--images-pool ({pool_size}) exceeds available COCO images ({len(all_urls)}). "
+            f"Reduce --images-pool."
+        )
+    py_rng.shuffle(all_urls)
+    pool = all_urls[:pool_size]
+    print(
+        f"  {pool_size} URLs sampled from {coco_annotations.name} ({len(all_urls)} available)"
+    )
+    return pool
+def sample_slots(
+    py_rng: random.Random,
+    pool: list[str],
+    num_requests: int,
+    images_per_request: int,
+) -> list[str]:
+    """Sample image slots from a fixed pool, no duplicates within each request."""
+    assert (
+        len(pool) >= images_per_request
+    ), f"images-pool ({len(pool)}) must be >= images-per-request ({images_per_request})"
+    total_slots = num_requests * images_per_request
+    slot_refs: list[str] = []
+    for _ in range(num_requests):
+        slot_refs.extend(py_rng.sample(pool, images_per_request))
+    num_unique = len(set(slot_refs))
+    print(
+        f"Generated {total_slots} image slots from pool of {len(pool)}: "
+        f"{num_unique} unique in use, "
+        f"{total_slots - num_unique} duplicate references "
+        f"({(total_slots - num_unique) / total_slots:.1%} reuse)"
+    )
+    return slot_refs
--- a/benchmarks/multimodal/jsonl/generate_input_text.py
+++ b/benchmarks/multimodal/jsonl/generate_input_text.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Utilities for generating filler text that tokenizes predictably."""
+import random
+# Common English words, each of which typically tokenizes to a single BPE token on most LLMs, so the word count approximates the token count.
+ENGLISH_VOCAB = [
+    "the",
+    "be",
+    "to",
+    "of",
+    "and",
+    "a",
+    "in",
+    "that",
+    "have",
+    "I",
+    "it",
+    "for",
+    "not",
+    "on",
+    "with",
+    "he",
+    "as",
+    "you",
+    "do",
+    "at",
+    "this",
+    "but",
+    "his",
+    "by",
+    "from",
+    "they",
+    "we",
+    "say",
+    "her",
+    "she",
+    "or",
+    "an",
+    "will",
+    "my",
+    "one",
+    "all",
+    "would",
+    "there",
+    "their",
+    "what",
+    "so",
+    "up",
+    "out",
+    "if",
+    "about",
+    "who",
+    "get",
+    "which",
+    "go",
+    "me",
+    "when",
+    "make",
+    "can",
+    "like",
+    "time",
+    "no",
+    "just",
+    "him",
+    "know",
+    "take",
+    "people",
+    "into",
+    "year",
+    "your",
+    "good",
+    "some",
+    "could",
+    "them",
+    "see",
+    "other",
+    "than",
+    "then",
+    "now",
+    "look",
+    "only",
+    "come",
+    "its",
+    "over",
+    "think",
+    "also",
+    "back",
+    "after",
+    "use",
+    "two",
+    "how",
+    "our",
+    "work",
+    "first",
+    "well",
+    "way",
+    "even",
+    "new",
+    "want",
+    "because",
+    "any",
+    "these",
+    "give",
+    "day",
+    "most",
+    "us",
+    "great",
+    "world",
+    "still",
+    "own",
+    "find",
+    "here",
+    "thing",
+    "many",
+    "long",
+    "hand",
+    "high",
+    "keep",
+    "place",
+    "start",
+    "might",
+    "old",
+    "home",
+    "big",
+    "end",
+    "while",
+    "last",
+    "turn",
+    "ask",
+    "need",
+    "too",
+    "feel",
+    "seem",
+    "call",
+    "head",
+    "put",
+    "lot",
+    "run",
+    "every",
+    "play",
+    "small",
+    "set",
+    "live",
+    "try",
+    "tell",
+    "few",
+    "part",
+    "change",
+    "help",
+    "show",
+    "house",
+    "both",
+    "side",
+    "point",
+    "such",
+    "name",
+    "each",
+    "right",
+    "move",
+    "must",
+    "real",
+    "left",
+    "same",
+    "much",
+    "open",
+    "near",
+    "line",
+    "build",
+    "power",
+    "water",
+    "city",
+    "tree",
+    "earth",
+    "plan",
+    "food",
+    "dark",
+    "cold",
+    "sure",
+    "car",
+    "face",
+    "nice",
+    "state",
+    "fact",
+    "night",
+    "hard",
+    "read",
+    "idea",
+    "stand",
+    "class",
+    "body",
+    "book",
+    "word",
+    "best",
+    "done",
+    "case",
+    "four",
+    "fire",
+    "front",
+    "rest",
+    "game",
+    "war",
+    "air",
+    "eye",
+    "true",
+    "top",
+    "area",
+    "boy",
+    "girl",
+    "color",
+    "oil",
+    "song",
+    "note",
+    "low",
+    "bed",
+]
+def generate_filler(rng: random.Random, num_tokens: int) -> str:
+    """Return ~num_tokens worth of space-separated common English words."""
+    return " ".join(rng.choice(ENGLISH_VOCAB) for _ in range(num_tokens))
--- a/benchmarks/multimodal/jsonl/main.py
+++ b/benchmarks/multimodal/jsonl/main.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Generate a .jsonl benchmark file for aiperf (single-turn, text + images).
+Images are drawn from a fixed pool; a smaller pool produces more cross-request
+reuse. Supports base64 (local PNGs) and http (COCO URLs) image modes.
+Usage:
+    python main.py
+    python main.py --image-mode http
+    python main.py -n 200 --images-pool 100
+"""
+import json
+import random
+import time
+from pathlib import Path
+import numpy as np
+from args import parse_args
+from generate_images import (
+    generate_image_pool_base64,
+    generate_image_pool_http,
+    sample_slots,
+)
+from generate_input_text import generate_filler
+SEED = int(time.time() * 1000) % (2**32)  # millisecond clock folded into 32 bits, so each run gets a different seed
+def main() -> None:
+    args = parse_args(__doc__)
+    num_requests: int = args.num_requests
+    images_per_request: int = args.images_per_request
+    image_pool: int = args.images_pool or (num_requests * images_per_request)
+    np_rng = np.random.default_rng(SEED)
+    py_rng = random.Random(SEED)
+    if args.image_mode == "http":
+        pool = generate_image_pool_http(py_rng, image_pool, args.coco_annotations)
+    else:
+        pool = generate_image_pool_base64(
+            np_rng, image_pool, args.image_dir, tuple(args.image_size)
+        )
+    slot_refs = sample_slots(py_rng, pool, num_requests, images_per_request)
+    unique_images = len(set(slot_refs))
+    output_path = args.output
+    if output_path is None:
+        output_path = (
+            Path(__file__).parent
+            / f"{num_requests}req_{images_per_request}img_{unique_images}pool_{args.user_text_tokens}word_{args.image_mode}.jsonl"
+        )
+    with open(output_path, "w") as f:
+        for i in range(num_requests):
+            user_text = generate_filler(py_rng, args.user_text_tokens)
+            start = i * images_per_request
+            images = slot_refs[start : start + images_per_request]
+            line = json.dumps(
+                {"text": user_text, "images": images}, separators=(",", ":")
+            )
+            f.write(line + "\n")
+    print(f"Wrote {num_requests} requests to {output_path}")
+if __name__ == "__main__":
+    main()