feat: add TxtSlicesDataset to allow sampling slices from txt file for benchmarking (#30156)

Signed-off-by: jdebache <jdebache@nvidia.com>

feat: add TxtSlicesDataset to allow sampling slices from txt file for benchmarking (#30156)
Signed-off-by: jdebache <jdebache@nvidia.com>
893b2aff · Julien Debache · GitHub · 80118853 · 893b2aff · 893b2aff
Unverified Commit 893b2aff authored Apr 14, 2026 by Julien Debache Committed by GitHub Apr 14, 2026
6 changed files
--- a/tests/benchmarks/test_sampling_params.py
+++ b/tests/benchmarks/test_sampling_params.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import numpy as np
+import pytest
+from vllm.benchmarks.datasets.utils import get_sampling_params
+from vllm.tokenizers import TokenizerLike
+class _FakeTokenizer(TokenizerLike):
+    """Minimal tokenizer implementing the TokenizerLike protocol
+    for testing get_sampling_params.
+    Only ``vocab_size`` and the value returned by
+    ``num_special_tokens_to_add()`` are configurable. The remaining
+    properties and the vocab getters return fixed placeholder values, while
+    ``__call__``, ``encode``, ``decode``, ``apply_chat_template`` and the
+    ``convert_*`` methods raise ``NotImplementedError``.
+    """
+    def __init__(self, vocab_size: int = 1000, num_special_tokens: int = 0) -> None:
+        self._vocab_size = vocab_size
+        self._num_special_tokens = num_special_tokens
+    # -- Alternate constructor (arguments are ignored, defaults are used) --
+    @classmethod
+    def from_pretrained(cls, path_or_repo_id, *a, **kw):  # type: ignore[override]
+        return cls()
+    # -- Properties required by TokenizerLike --
+    @property
+    def vocab_size(self) -> int:
+        return self._vocab_size
+    @property
+    def all_special_tokens(self) -> list[str]:
+        return []
+    @property
+    def all_special_ids(self) -> list[int]:
+        return []
+    @property
+    def bos_token_id(self) -> int:
+        return 0
+    @property
+    def eos_token_id(self) -> int:
+        return 1
+    @property
+    def pad_token_id(self) -> int:
+        return 2
+    @property
+    def is_fast(self) -> bool:
+        return False
+    @property
+    def max_token_id(self) -> int:
+        # Largest valid id for a vocabulary whose ids start at 0.
+        return self._vocab_size - 1
+    @property
+    def max_chars_per_token(self) -> int:
+        return 4
+    @property
+    def truncation_side(self) -> str:
+        return "right"
+    # -- Methods --
+    def num_special_tokens_to_add(self) -> int:
+        # Configurable so tests can control how many tokens
+        # get_sampling_params subtracts from the requested input length.
+        return self._num_special_tokens
+    def __call__(self, text, text_pair=None, **kw):  # type: ignore[override]
+        raise NotImplementedError
+    def get_vocab(self) -> dict[str, int]:
+        return {}
+    def get_added_vocab(self) -> dict[str, int]:
+        return {}
+    def encode(self, text, **kw) -> list[int]:  # type: ignore[override]
+        raise NotImplementedError
+    def apply_chat_template(self, messages, **kw):  # type: ignore[override]
+        raise NotImplementedError
+    def convert_tokens_to_ids(self, tokens):  # type: ignore[override]
+        raise NotImplementedError
+    def convert_tokens_to_string(self, tokens: list[str]) -> str:
+        raise NotImplementedError
+    def decode(self, ids, skip_special_tokens: bool = False) -> str:  # type: ignore[override]
+        raise NotImplementedError
+    def convert_ids_to_tokens(  # type: ignore[override]
+        self, ids, skip_special_tokens: bool = False
+    ) -> list[str]:
+        raise NotImplementedError
+class TestGetSamplingParams:
+    """Tests for ``get_sampling_params`` in ``vllm.benchmarks.datasets.utils``."""
+    # -- helpers --
+    @staticmethod
+    def _tok(vocab_size: int = 1000, num_special: int = 0) -> _FakeTokenizer:
+        # Shorthand for building the fake tokenizer used by every test.
+        return _FakeTokenizer(vocab_size=vocab_size, num_special_tokens=num_special)
+    # -- return shape / dtype --
+    def test_returns_three_arrays(self):
+        rng = np.random.default_rng(0)
+        result = get_sampling_params(rng, 5, 0.0, 100, 50, self._tok())
+        assert len(result) == 3
+        for arr in result:
+            assert isinstance(arr, np.ndarray)
+    @pytest.mark.parametrize("n", [1, 10, 100])
+    def test_output_length_matches_num_requests(self, n: int):
+        rng = np.random.default_rng(42)
+        input_lens, output_lens, offsets = get_sampling_params(
+            rng, n, 0.0, 64, 32, self._tok()
+        )
+        assert input_lens.shape == (n,)
+        assert output_lens.shape == (n,)
+        assert offsets.shape == (n,)
+    # -- fixed lengths (range_ratio = 0) --
+    def test_zero_range_ratio_gives_constant_lengths(self):
+        rng = np.random.default_rng(7)
+        input_lens, output_lens, _ = get_sampling_params(
+            rng, 20, 0.0, 128, 64, self._tok()
+        )
+        assert np.all(input_lens == 128)
+        assert np.all(output_lens == 64)
+    def test_special_tokens_subtracted_from_input_only(self):
+        rng = np.random.default_rng(7)
+        input_lens, output_lens, _ = get_sampling_params(
+            rng, 10, 0.0, 100, 50, self._tok(num_special=4)
+        )
+        # real_input_len = 100 - 4 = 96, range_ratio 0 → all 96
+        assert np.all(input_lens == 96)
+        # special tokens are not subtracted from output length
+        assert np.all(output_lens == 50)
+    # -- range ratios --
+    def test_input_range_bounds(self):
+        rng = np.random.default_rng(0)
+        ratio = 0.5
+        base = 200
+        input_lens, _, _ = get_sampling_params(
+            rng, 500, {"input": ratio, "output": 0.0}, base, 50, self._tok()
+        )
+        # Expected inclusive bounds: floor/ceil of base * (1 -/+ ratio).
+        lo = int(np.floor(base * (1 - ratio)))
+        hi = int(np.ceil(base * (1 + ratio)))
+        assert np.all(input_lens >= lo)
+        assert np.all(input_lens <= hi)
+    def test_output_range_bounds(self):
+        rng = np.random.default_rng(0)
+        ratio = 0.3
+        base = 100
+        _, output_lens, _ = get_sampling_params(
+            rng, 500, {"input": 0.0, "output": ratio}, 50, base, self._tok()
+        )
+        # The output lower bound is additionally clamped to at least 1.
+        lo = max(1, int(np.floor(base * (1 - ratio))))
+        hi = int(np.ceil(base * (1 + ratio)))
+        assert np.all(output_lens >= lo)
+        assert np.all(output_lens <= hi)
+    def test_output_low_clamped_to_one(self):
+        """Even with a high ratio that would push output_low to 0,
+        the function clamps it to 1."""
+        rng = np.random.default_rng(0)
+        # output_len=1, ratio=0.99 → floor(1*0.01)=0, should clamp to 1
+        _, output_lens, _ = get_sampling_params(
+            rng, 50, {"input": 0.0, "output": 0.99}, 100, 1, self._tok()
+        )
+        assert np.all(output_lens >= 1)
+    # -- offsets bounded by vocab_size --
+    @pytest.mark.parametrize("vocab", [100, 32000, 128256])
+    def test_offsets_within_vocab(self, vocab: int):
+        rng = np.random.default_rng(0)
+        _, _, offsets = get_sampling_params(
+            rng, 200, 0.0, 64, 32, self._tok(vocab_size=vocab)
+        )
+        assert np.all(offsets >= 0)
+        assert np.all(offsets < vocab)
+    # -- reproducibility --
+    def test_same_seed_same_results(self):
+        tok = self._tok()
+        rr = {"input": 0.3, "output": 0.2}
+        a = get_sampling_params(np.random.default_rng(42), 50, rr, 256, 64, tok)
+        b = get_sampling_params(np.random.default_rng(42), 50, rr, 256, 64, tok)
+        for arr_a, arr_b in zip(a, b):
+            np.testing.assert_array_equal(arr_a, arr_b)
+    def test_different_seed_different_results(self):
+        tok = self._tok()
+        rr = {"input": 0.3, "output": 0.2}
+        a = get_sampling_params(np.random.default_rng(0), 50, rr, 256, 64, tok)
+        b = get_sampling_params(np.random.default_rng(1), 50, rr, 256, 64, tok)
+        # Extremely unlikely all three arrays match with different seeds
+        assert not all(np.array_equal(arr_a, arr_b) for arr_a, arr_b in zip(a, b))
+    # -- validation / error paths --
+    @pytest.mark.parametrize("bad_ratio", [-0.1, 1.0, 1.5])
+    def test_invalid_input_range_ratio(self, bad_ratio: float):
+        rng = np.random.default_rng(0)
+        with pytest.raises(ValueError, match="input_range_ratio"):
+            get_sampling_params(
+                rng, 10, {"input": bad_ratio, "output": 0.0}, 100, 50, self._tok()
+            )
+    @pytest.mark.parametrize("bad_ratio", [-0.1, 1.0, 1.5])
+    def test_invalid_output_range_ratio(self, bad_ratio: float):
+        rng = np.random.default_rng(0)
+        with pytest.raises(ValueError, match="output_range_ratio"):
+            get_sampling_params(
+                rng, 10, {"input": 0.0, "output": bad_ratio}, 100, 50, self._tok()
+            )
+    def test_invalid_dict_missing_keys(self):
+        rng = np.random.default_rng(0)
+        with pytest.raises(ValueError, match="input.*output"):
+            get_sampling_params(rng, 10, {"input": 0.1}, 100, 50, self._tok())
+    def test_input_len_zero_with_special_tokens(self):
+        """input_len < num_special_tokens → real_input_len = 0, which is fine
+        (range [0, 0])."""
+        rng = np.random.default_rng(0)
+        input_lens, _, _ = get_sampling_params(
+            rng, 5, 0.0, 5, 50, self._tok(num_special=10)
+        )
+        # real_input_len = max(0, 5 - 10) = 0
+        assert np.all(input_lens == 0)
+    # -- edge cases --
+    def test_single_request(self):
+        rng = np.random.default_rng(0)
+        i, o, off = get_sampling_params(rng, 1, 0.0, 100, 50, self._tok())
+        assert i.shape == (1,)
+        assert o.shape == (1,)
+        assert off.shape == (1,)
+    def test_large_num_requests(self):
+        rng = np.random.default_rng(0)
+        i, o, off = get_sampling_params(rng, 10_000, 0.5, 512, 128, self._tok())
+        assert i.shape == (10_000,)
+        assert o.shape == (10_000,)
+        assert off.shape == (10_000,)
--- a/tests/benchmarks/test_txt_slices_dataset.py
+++ b/tests/benchmarks/test_txt_slices_dataset.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import json
+from pathlib import Path
+import pytest
+from transformers import AutoTokenizer, PreTrainedTokenizerBase
+from vllm.benchmarks.datasets import CustomDataset
+from vllm.benchmarks.datasets.create_txt_slices_dataset import create_txt_slices_jsonl
+@pytest.fixture(scope="session")
+def hf_tokenizer() -> PreTrainedTokenizerBase:
+    """Return the ``gpt2`` tokenizer, created once per test session."""
+    # Use a small, commonly available tokenizer
+    return AutoTokenizer.from_pretrained("gpt2")
+# Sample corpus that the test below writes to the input text file.
+text_content = """
+Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
+incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud
+exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
+Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat
+nulla pariatur. Excepteur sint occaecat cupidatat non proident,
+sunt in culpa qui officia deserunt mollit anim id est laborum.
+"""
+@pytest.mark.benchmark
+def test_create_txt_slices_jsonl(
+    hf_tokenizer: PreTrainedTokenizerBase, tmp_path: Path
+) -> None:
+    """Test that create_txt_slices_jsonl produces valid JSONL for CustomDataset.
+    The sample text is written to a temporary file, converted into ten
+    prompts, and the resulting JSONL is checked both record by record and by
+    loading it through ``CustomDataset.sample``.
+    """
+    txt_path = tmp_path / "input.txt"
+    jsonl_path = tmp_path / "input.txt.jsonl"
+    # The converter reads and writes UTF-8 explicitly, so the test does the
+    # same instead of relying on the platform's default encoding.
+    txt_path.write_text(text_content, encoding="utf-8")
+    create_txt_slices_jsonl(
+        input_path=str(txt_path),
+        output_path=str(jsonl_path),
+        tokenizer_name="gpt2",
+        num_prompts=10,
+        input_len=10,
+        output_len=10,
+    )
+    # Verify the JSONL file is valid and has the expected structure
+    records = [
+        json.loads(line)
+        for line in jsonl_path.read_text(encoding="utf-8").splitlines()
+    ]
+    assert len(records) == 10
+    for record in records:
+        assert "prompt" in record
+        assert "output_tokens" in record
+        assert isinstance(record["prompt"], str)
+        assert record["output_tokens"] == 10
+    # Verify the JSONL file can be loaded by CustomDataset
+    dataset = CustomDataset(dataset_path=str(jsonl_path))
+    samples = dataset.sample(
+        tokenizer=hf_tokenizer,
+        num_requests=10,
+        output_len=10,
+        skip_chat_template=True,
+    )
+    assert len(samples) == 10
+    assert all(sample.expected_output_len == 10 for sample in samples)
--- a/vllm/benchmarks/datasets/__init__.py
+++ b/vllm/benchmarks/datasets/__init__.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm.benchmarks.datasets.datasets import (
+    DEFAULT_NUM_PROMPTS,
+    AIMODataset,
+    ASRDataset,
+    BenchmarkDataset,
+    BlazeditDataset,
+    BurstGPTDataset,
+    ConversationDataset,
+    CustomDataset,
+    CustomMMDataset,
+    HuggingFaceDataset,
+    InstructCoderDataset,
+    MLPerfDataset,
+    MMStarDataset,
+    MMVUDataset,
+    MTBenchDataset,
+    MultiModalConversationDataset,
+    NextEditPredictionDataset,
+    PrefixRepetitionRandomDataset,
+    RandomDataset,
+    RandomDatasetForReranking,
+    RandomMultiModalDataset,
+    SampleRequest,
+    ShareGPTDataset,
+    SonnetDataset,
+    SpecBench,
+    VisionArenaDataset,
+    add_dataset_parser,
+    add_random_dataset_base_args,
+    add_random_multimodal_dataset_args,
+    gen_prompt_decode_to_target_len,
+    get_samples,
+    is_valid_sequence,
+    lora_path_on_disk,
+    lora_tokenizer_cache,
+    process_image,
+    process_video,
+    zeta_prompt,
+)
+from vllm.benchmarks.datasets.utils import RangeRatio
+# Public API of the package. Entries are kept sorted: constants first, then
+# classes and type aliases, then functions.
+__all__ = [
+    "DEFAULT_NUM_PROMPTS",
+    "AIMODataset",
+    "ASRDataset",
+    "BenchmarkDataset",
+    "BlazeditDataset",
+    "BurstGPTDataset",
+    "ConversationDataset",
+    "CustomDataset",
+    "CustomMMDataset",
+    "HuggingFaceDataset",
+    "InstructCoderDataset",
+    "MLPerfDataset",
+    "MMStarDataset",
+    "MMVUDataset",
+    "MTBenchDataset",
+    "MultiModalConversationDataset",
+    "NextEditPredictionDataset",
+    "PrefixRepetitionRandomDataset",
+    "RandomDataset",
+    "RandomDatasetForReranking",
+    "RandomMultiModalDataset",
+    "RangeRatio",
+    "SampleRequest",
+    "ShareGPTDataset",
+    "SonnetDataset",
+    "SpecBench",
+    "VisionArenaDataset",
+    "add_dataset_parser",
+    "add_random_dataset_base_args",
+    "add_random_multimodal_dataset_args",
+    "gen_prompt_decode_to_target_len",
+    "get_samples",
+    "is_valid_sequence",
+    "lora_path_on_disk",
+    "lora_tokenizer_cache",
+    "process_image",
+    "process_video",
+    "zeta_prompt",
+]
--- a/vllm/benchmarks/datasets/create_txt_slices_dataset.py
+++ b/vllm/benchmarks/datasets/create_txt_slices_dataset.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Convert a plain-text file (local path or URL) into a JSONL dataset
+compatible with ``CustomDataset`` (``--dataset-name custom``), by
+randomly slicing the tokenized text into prompts.
+Each line of the output JSONL contains a ``prompt`` (decoded from a random
+slice of the tokenized source text) and an ``output_tokens`` count.
+Usage
+-----
+::
+    python -m vllm.benchmarks.datasets.create_txt_slices_dataset \\
+        --input  sonnet.txt \\
+        --output sonnet_dataset.jsonl \\
+        --tokenizer gpt2 \\
+        --num-prompts 1000 \\
+        --input-len 1024 \\
+        --output-len 128
+The resulting JSONL file can then be used with the serving benchmark::
+    python -m vllm.benchmarks.serve \\
+        --dataset-name custom \\
+        --dataset-path sonnet_dataset.jsonl \\
+        ...
+"""
+from __future__ import annotations
+import argparse
+import json
+import logging
+import random
+import urllib.request
+import numpy as np
+from transformers import AutoTokenizer
+from vllm.benchmarks.datasets.utils import RangeRatio, get_sampling_params
+logger = logging.getLogger(__name__)
+def load_text(path: str) -> str:
+    """Load text from a local file or URL.
+    Args:
+        path: A filesystem path, or a URL starting with ``http://`` or
+            ``https://``.
+    Returns:
+        The full contents as a string. Local files are read as UTF-8.
+        Remote content is decoded with the charset declared in the response's
+        ``Content-Type`` header, falling back to UTF-8 when the header
+        declares none or names a codec Python does not know.
+    """
+    if path.startswith(("http://", "https://")):
+        with urllib.request.urlopen(path) as response:
+            raw = response.read()
+            # Servers may deliver plain text in a non-UTF-8 encoding; trust
+            # the declared charset before assuming UTF-8.
+            charset = response.headers.get_content_charset() or "utf-8"
+        try:
+            return raw.decode(charset)
+        except LookupError:
+            # Unknown codec name in the header: keep the UTF-8 default.
+            return raw.decode("utf-8")
+    with open(path, encoding="utf-8") as f:
+        return f.read()
+def create_txt_slices_jsonl(
+    *,
+    input_path: str,
+    output_path: str,
+    tokenizer_name: str,
+    num_prompts: int,
+    input_len: int,
+    output_len: int,
+    range_ratio: RangeRatio = 0.0,
+    seed: int = 0,
+    trust_remote_code: bool = False,
+) -> None:
+    """Slice the text at *input_path* into random prompts and write them as
+    JSONL to *output_path*.
+    The source text is tokenized once without special tokens. For each of
+    the *num_prompts* records a start position is drawn at random and
+    consecutive token ids are taken from there, wrapping around to the
+    beginning of the token list when the end is reached. The slice is decoded
+    back to text and stored under ``"prompt"`` next to the sampled
+    ``"output_tokens"`` count.
+    Raises:
+        ValueError: If the source text is empty or tokenizes to no tokens.
+    """
+    tokenizer = AutoTokenizer.from_pretrained(
+        tokenizer_name, trust_remote_code=trust_remote_code
+    )
+    source_text = load_text(input_path)
+    if not source_text:
+        raise ValueError("The text file is empty and cannot be sampled from.")
+    corpus_ids = tokenizer(source_text, add_special_tokens=False).input_ids
+    if not corpus_ids:
+        raise ValueError("Tokenizing the text produced zero tokens; cannot sample.")
+    # Two generators seeded identically: NumPy draws the lengths, the stdlib
+    # generator draws the start positions.
+    length_rng = np.random.default_rng(seed)
+    position_rng = random.Random(seed)
+    input_lens, output_lens, _ = get_sampling_params(
+        length_rng,
+        num_prompts,
+        range_ratio,
+        input_len,
+        output_len,
+        tokenizer,
+    )
+    corpus_size = len(corpus_ids)
+    records: list[dict[str, object]] = []
+    for sampled_in, sampled_out in zip(input_lens, output_lens):
+        prompt_len = int(sampled_in)
+        completion_len = int(sampled_out)
+        # Randomly select a start position and slice with cycling
+        first = position_rng.randint(0, corpus_size - 1)
+        window = [
+            corpus_ids[position % corpus_size]
+            for position in range(first, first + prompt_len)
+        ]
+        decoded = tokenizer.decode(window, skip_special_tokens=False)
+        records.append({"prompt": decoded, "output_tokens": completion_len})
+    with open(output_path, "w", encoding="utf-8") as f:
+        f.writelines(
+            json.dumps(record, ensure_ascii=False) + "\n" for record in records
+        )
+    logger.info(
+        "Wrote %d prompts to %s",
+        len(records),
+        output_path,
+    )
+def _parse_range_ratio(value: str) -> RangeRatio:
+    """Convert a ``--range-ratio`` argument into a float or a dict.
+    A plain number is tried first; anything else must be a JSON object.
+    Raises ``argparse.ArgumentTypeError`` so argparse reports a usage error
+    instead of the caller seeing a raw JSON traceback.
+    """
+    try:
+        return float(value)
+    except ValueError:
+        pass
+    try:
+        parsed = json.loads(value)
+    except json.JSONDecodeError as exc:
+        raise argparse.ArgumentTypeError(
+            f"expected a float or a JSON dict, got {value!r}"
+        ) from exc
+    if not isinstance(parsed, dict):
+        raise argparse.ArgumentTypeError(
+            f"expected a float or a JSON dict, got {value!r}"
+        )
+    return parsed
+def main(argv: list[str] | None = None) -> None:
+    """Command-line entry point: parse arguments and write the JSONL dataset.
+    Args:
+        argv: Argument list to parse; ``None`` makes argparse read
+            ``sys.argv``.
+    """
+    parser = argparse.ArgumentParser(
+        description="Convert a plain-text file into a JSONL dataset "
+        "for CustomDataset (--dataset-name custom).",
+    )
+    parser.add_argument(
+        "--input",
+        required=True,
+        help="Path or URL to the source text file.",
+    )
+    parser.add_argument(
+        "--output",
+        required=True,
+        help="Path for the output JSONL file.",
+    )
+    parser.add_argument(
+        "--tokenizer",
+        required=True,
+        help="HuggingFace tokenizer name or path.",
+    )
+    parser.add_argument(
+        "--num-prompts",
+        type=int,
+        default=1000,
+        help="Number of prompt samples to generate (default: 1000).",
+    )
+    parser.add_argument(
+        "--input-len",
+        type=int,
+        default=1024,
+        help="Target number of input tokens per prompt (default: 1024).",
+    )
+    parser.add_argument(
+        "--output-len",
+        type=int,
+        default=128,
+        help="Target number of output tokens per prompt (default: 128).",
+    )
+    parser.add_argument(
+        "--range-ratio",
+        # argparse also runs string defaults through ``type``, so the
+        # default below reaches the callee as the float 0.0.
+        type=_parse_range_ratio,
+        default="0.0",
+        help="Range ratio for input/output length sampling (default: 0.0). "
+        "A single float applies to both ISL and OSL. "
+        'A JSON dict like \'{"input": 0.3, "output": 0.5}\' sets them '
+        "independently. Values must be in [0, 1).",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=0,
+        help="Random seed for reproducibility (default: 0).",
+    )
+    parser.add_argument(
+        "--trust-remote-code",
+        action="store_true",
+        help="Trust remote code from HuggingFace.",
+    )
+    args = parser.parse_args(argv)
+    logging.basicConfig(level=logging.INFO)
+    create_txt_slices_jsonl(
+        input_path=args.input,
+        output_path=args.output,
+        tokenizer_name=args.tokenizer,
+        num_prompts=args.num_prompts,
+        input_len=args.input_len,
+        output_len=args.output_len,
+        range_ratio=args.range_ratio,
+        seed=args.seed,
+        trust_remote_code=args.trust_remote_code,
+    )
+if __name__ == "__main__":
+    main()
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
--- a/vllm/benchmarks/datasets/utils.py
+++ b/vllm/benchmarks/datasets/utils.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Shared utilities for benchmark dataset sampling.
+"""
+import logging
+import math
+import numpy as np
+from vllm.tokenizers import TokenizerLike
+logger = logging.getLogger(__name__)
+# Type alias: a single float applies to both ISL and OSL; a dict allows
+# specifying them independently via ``{"input": …, "output": …}``.
+RangeRatio = float | dict[str, float]
+def _resolve_range_ratios(
+    range_ratio: RangeRatio,
+) -> tuple[float, float]:
+    """Split *range_ratio* into ``(input_range_ratio, output_range_ratio)``.
+    A scalar is applied to both sides. A dict must provide ``"input"`` and
+    ``"output"`` entries, which are converted to ``float`` individually.
+    Raises:
+        ValueError: If a dict is given without both required keys.
+    """
+    if not isinstance(range_ratio, dict):
+        shared = float(range_ratio)
+        return shared, shared
+    try:
+        input_ratio = float(range_ratio["input"])
+        output_ratio = float(range_ratio["output"])
+    except KeyError as exc:
+        raise ValueError(
+            "When range_ratio is a dict it must contain 'input' and "
+            f"'output' keys, got: {sorted(range_ratio)}"
+        ) from exc
+    return input_ratio, output_ratio
+def get_sampling_params(
+    rng: np.random.Generator,
+    num_requests: int,
+    range_ratio: RangeRatio,
+    input_len: int,
+    output_len: int,
+    tokenizer: TokenizerLike,
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """
+    Draw per-request input lengths, output lengths and vocab offsets.
+    Each length is sampled uniformly from an inclusive integer interval
+    ``[floor(len * (1 - r)), ceil(len * (1 + r))]`` where ``r`` comes from
+    *range_ratio*: one ``float`` shared by input and output, or a ``dict``
+    with separate ``"input"`` and ``"output"`` values. Before the input
+    interval is computed, the number of special tokens the tokenizer adds is
+    subtracted from ``input_len`` (never going below zero). Both output
+    bounds are raised to at least 1. Offsets are drawn from
+    ``[0, tokenizer.vocab_size)``.
+    Returns:
+        (input_lens, output_lens, offsets) – three 1-D ``np.ndarray`` of
+        shape ``(num_requests,)``.
+    Raises:
+        ValueError: If either ratio lies outside ``[0, 1)`` or a sampling
+            interval ends up with its low bound above its high bound.
+    """
+    in_ratio, out_ratio = _resolve_range_ratios(range_ratio)
+    if not (0.0 <= in_ratio < 1.0):
+        raise ValueError("input_range_ratio must be in [0, 1).")
+    if not (0.0 <= out_ratio < 1.0):
+        raise ValueError("output_range_ratio must be in [0, 1).")
+    # Leave room for the special tokens the tokenizer will add on its own.
+    special_count = int(tokenizer.num_special_tokens_to_add())
+    effective_input_len = max(0, int(input_len) - special_count)
+    input_low = math.floor(effective_input_len * (1 - in_ratio))
+    input_high = math.ceil(effective_input_len * (1 + in_ratio))
+    # Ensure the bounds for output length are at least 1 to
+    # prevent sampling 0 tokens.
+    output_low = max(math.floor(output_len * (1 - out_ratio)), 1)
+    output_high = max(math.ceil(output_len * (1 + out_ratio)), 1)
+    if input_low > input_high:
+        raise ValueError(
+            f"Invalid input sampling interval: low={input_low} > high={input_high}"
+        )
+    if output_low > output_high:
+        raise ValueError(
+            f"Invalid output sampling interval: low={output_low} > high={output_high}"
+        )
+    logger.info(
+        "Sampling input_len from [%s, %s] and output_len from [%s, %s]",
+        input_low,
+        input_high,
+        output_low,
+        output_high,
+    )
+    # Draw order (inputs, outputs, offsets) determines the values produced
+    # for a given seed, so it must stay fixed.
+    sampled_inputs = rng.integers(input_low, input_high + 1, size=num_requests)
+    sampled_outputs = rng.integers(output_low, output_high + 1, size=num_requests)
+    sampled_offsets = rng.integers(0, tokenizer.vocab_size, size=num_requests)
+    return sampled_inputs, sampled_outputs, sampled_offsets