Unverified commit 893b2aff, authored by Julien Debache and committed by GitHub
Browse files

feat: add TxtSlicesDataset to allow sampling slices from txt file for benchmarking (#30156)


Signed-off-by: jdebache <jdebache@nvidia.com>
parent 80118853
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import numpy as np
import pytest
from vllm.benchmarks.datasets.utils import get_sampling_params
from vllm.tokenizers import TokenizerLike
class _FakeTokenizer(TokenizerLike):
    """Minimal tokenizer implementing the TokenizerLike protocol
    for testing get_sampling_params.

    Only the vocabulary size and the number of special tokens reported by
    ``num_special_tokens_to_add()`` are configurable. Every method that would
    process text raises ``NotImplementedError``, so a test fails loudly if the
    code under test ever tries to tokenize or decode.
    """

    def __init__(self, vocab_size: int = 1000, num_special_tokens: int = 0) -> None:
        # Value reported by the ``vocab_size`` property.
        self._vocab_size = vocab_size
        # Value reported by ``num_special_tokens_to_add()``.
        self._num_special_tokens = num_special_tokens

    # -- Properties required by TokenizerLike --
    @classmethod
    def from_pretrained(cls, path_or_repo_id, *a, **kw):  # type: ignore[override]
        # All arguments are ignored; a default-configured fake is returned.
        return cls()

    @property
    def vocab_size(self) -> int:
        return self._vocab_size

    @property
    def all_special_tokens(self) -> list[str]:
        return []

    @property
    def all_special_ids(self) -> list[int]:
        return []

    # Fixed placeholder ids for the BOS / EOS / PAD tokens.
    @property
    def bos_token_id(self) -> int:
        return 0

    @property
    def eos_token_id(self) -> int:
        return 1

    @property
    def pad_token_id(self) -> int:
        return 2

    @property
    def is_fast(self) -> bool:
        return False

    @property
    def max_token_id(self) -> int:
        # Largest id for a vocabulary indexed from 0.
        return self._vocab_size - 1

    @property
    def max_chars_per_token(self) -> int:
        return 4

    @property
    def truncation_side(self) -> str:
        return "right"

    def num_special_tokens_to_add(self) -> int:
        return self._num_special_tokens

    # -- Text-processing methods: deliberately unimplemented stubs --
    def __call__(self, text, text_pair=None, **kw):  # type: ignore[override]
        raise NotImplementedError

    def get_vocab(self) -> dict[str, int]:
        return {}

    def get_added_vocab(self) -> dict[str, int]:
        return {}

    def encode(self, text, **kw) -> list[int]:  # type: ignore[override]
        raise NotImplementedError

    def apply_chat_template(self, messages, **kw):  # type: ignore[override]
        raise NotImplementedError

    def convert_tokens_to_ids(self, tokens):  # type: ignore[override]
        raise NotImplementedError

    def convert_tokens_to_string(self, tokens: list[str]) -> str:
        raise NotImplementedError

    def decode(self, ids, skip_special_tokens: bool = False) -> str:  # type: ignore[override]
        raise NotImplementedError

    def convert_ids_to_tokens(  # type: ignore[override]
        self, ids, skip_special_tokens: bool = False
    ) -> list[str]:
        raise NotImplementedError
class TestGetSamplingParams:
    """Tests for ``get_sampling_params`` in ``vllm.benchmarks.datasets.utils``.

    Every call passes its arguments positionally in the order
    ``(rng, num_requests, range_ratio, input_len, output_len, tokenizer)``
    and unpacks the result as ``(input_lens, output_lens, offsets)``.
    ``range_ratio`` is given either as a single float or as a dict with
    ``"input"`` and ``"output"`` keys.
    """

    # -- helpers --
    @staticmethod
    def _tok(vocab_size: int = 1000, num_special: int = 0) -> _FakeTokenizer:
        # Shorthand for building the fake tokenizer used by every test.
        return _FakeTokenizer(vocab_size=vocab_size, num_special_tokens=num_special)

    # -- return shape / dtype --
    def test_returns_three_arrays(self):
        """The result is a 3-tuple of NumPy arrays."""
        rng = np.random.default_rng(0)
        result = get_sampling_params(rng, 5, 0.0, 100, 50, self._tok())
        assert len(result) == 3
        for arr in result:
            assert isinstance(arr, np.ndarray)

    @pytest.mark.parametrize("n", [1, 10, 100])
    def test_output_length_matches_num_requests(self, n: int):
        """Each returned array is 1-D with one entry per request."""
        rng = np.random.default_rng(42)
        input_lens, output_lens, offsets = get_sampling_params(
            rng, n, 0.0, 64, 32, self._tok()
        )
        assert input_lens.shape == (n,)
        assert output_lens.shape == (n,)
        assert offsets.shape == (n,)

    # -- fixed lengths (range_ratio = 0) --
    def test_zero_range_ratio_gives_constant_lengths(self):
        """A ratio of 0 leaves no spread: every length equals the requested one."""
        rng = np.random.default_rng(7)
        input_lens, output_lens, _ = get_sampling_params(
            rng, 20, 0.0, 128, 64, self._tok()
        )
        assert np.all(input_lens == 128)
        assert np.all(output_lens == 64)

    def test_special_tokens_subtracted_from_input_only(self):
        """Special tokens reduce the sampled input length but not the output."""
        rng = np.random.default_rng(7)
        input_lens, output_lens, _ = get_sampling_params(
            rng, 10, 0.0, 100, 50, self._tok(num_special=4)
        )
        # real_input_len = 100 - 4 = 96, range_ratio 0 → all 96
        assert np.all(input_lens == 96)
        # special tokens are not subtracted from output length
        assert np.all(output_lens == 50)

    # -- range ratios --
    def test_input_range_bounds(self):
        """Input lengths stay within [floor(base*(1-r)), ceil(base*(1+r))]."""
        rng = np.random.default_rng(0)
        ratio = 0.5
        base = 200
        input_lens, _, _ = get_sampling_params(
            rng, 500, {"input": ratio, "output": 0.0}, base, 50, self._tok()
        )
        lo = int(np.floor(base * (1 - ratio)))
        hi = int(np.ceil(base * (1 + ratio)))
        assert np.all(input_lens >= lo)
        assert np.all(input_lens <= hi)

    def test_output_range_bounds(self):
        """Output lengths stay within the same style of bounds, floored at 1."""
        rng = np.random.default_rng(0)
        ratio = 0.3
        base = 100
        _, output_lens, _ = get_sampling_params(
            rng, 500, {"input": 0.0, "output": ratio}, 50, base, self._tok()
        )
        lo = max(1, int(np.floor(base * (1 - ratio))))
        hi = int(np.ceil(base * (1 + ratio)))
        assert np.all(output_lens >= lo)
        assert np.all(output_lens <= hi)

    def test_output_low_clamped_to_one(self):
        """Even with a high ratio that would push output_low to 0,
        the function clamps it to 1."""
        rng = np.random.default_rng(0)
        # output_len=1, ratio=0.99 → floor(1*0.01)=0, should clamp to 1
        _, output_lens, _ = get_sampling_params(
            rng, 50, {"input": 0.0, "output": 0.99}, 100, 1, self._tok()
        )
        assert np.all(output_lens >= 1)

    # -- offsets bounded by vocab_size --
    @pytest.mark.parametrize("vocab", [100, 32000, 128256])
    def test_offsets_within_vocab(self, vocab: int):
        """Offsets fall in the half-open range [0, vocab_size)."""
        rng = np.random.default_rng(0)
        _, _, offsets = get_sampling_params(
            rng, 200, 0.0, 64, 32, self._tok(vocab_size=vocab)
        )
        assert np.all(offsets >= 0)
        assert np.all(offsets < vocab)

    # -- reproducibility --
    def test_same_seed_same_results(self):
        """Two generators seeded identically yield identical arrays."""
        tok = self._tok()
        rr = {"input": 0.3, "output": 0.2}
        a = get_sampling_params(np.random.default_rng(42), 50, rr, 256, 64, tok)
        b = get_sampling_params(np.random.default_rng(42), 50, rr, 256, 64, tok)
        for arr_a, arr_b in zip(a, b):
            np.testing.assert_array_equal(arr_a, arr_b)

    def test_different_seed_different_results(self):
        """Different seeds should change at least one of the three arrays."""
        tok = self._tok()
        rr = {"input": 0.3, "output": 0.2}
        a = get_sampling_params(np.random.default_rng(0), 50, rr, 256, 64, tok)
        b = get_sampling_params(np.random.default_rng(1), 50, rr, 256, 64, tok)
        # Extremely unlikely all three arrays match with different seeds
        assert not all(np.array_equal(arr_a, arr_b) for arr_a, arr_b in zip(a, b))

    # -- validation / error paths --
    @pytest.mark.parametrize("bad_ratio", [-0.1, 1.0, 1.5])
    def test_invalid_input_range_ratio(self, bad_ratio: float):
        """Input ratios outside [0, 1) raise a ValueError naming the input ratio."""
        rng = np.random.default_rng(0)
        with pytest.raises(ValueError, match="input_range_ratio"):
            get_sampling_params(
                rng, 10, {"input": bad_ratio, "output": 0.0}, 100, 50, self._tok()
            )

    @pytest.mark.parametrize("bad_ratio", [-0.1, 1.0, 1.5])
    def test_invalid_output_range_ratio(self, bad_ratio: float):
        """Output ratios outside [0, 1) raise a ValueError naming the output ratio."""
        rng = np.random.default_rng(0)
        with pytest.raises(ValueError, match="output_range_ratio"):
            get_sampling_params(
                rng, 10, {"input": 0.0, "output": bad_ratio}, 100, 50, self._tok()
            )

    def test_invalid_dict_missing_keys(self):
        """A ratio dict lacking the ``"output"`` key is rejected."""
        rng = np.random.default_rng(0)
        with pytest.raises(ValueError, match="input.*output"):
            get_sampling_params(rng, 10, {"input": 0.1}, 100, 50, self._tok())

    def test_input_len_zero_with_special_tokens(self):
        """input_len < num_special_tokens → real_input_len = 0, which is fine
        (range [0, 0])."""
        # NOTE: despite sitting in the validation section, this case expects
        # a successful call rather than an exception.
        rng = np.random.default_rng(0)
        input_lens, _, _ = get_sampling_params(
            rng, 5, 0.0, 5, 50, self._tok(num_special=10)
        )
        # real_input_len = max(0, 5 - 10) = 0
        assert np.all(input_lens == 0)

    # -- edge cases --
    def test_single_request(self):
        """A single request still yields length-1 arrays."""
        rng = np.random.default_rng(0)
        i, o, off = get_sampling_params(rng, 1, 0.0, 100, 50, self._tok())
        assert i.shape == (1,)
        assert o.shape == (1,)
        assert off.shape == (1,)

    def test_large_num_requests(self):
        """Shapes are correct for a large batch with a scalar ratio of 0.5."""
        rng = np.random.default_rng(0)
        i, o, off = get_sampling_params(rng, 10_000, 0.5, 512, 128, self._tok())
        assert i.shape == (10_000,)
        assert o.shape == (10_000,)
        assert off.shape == (10_000,)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from pathlib import Path
import pytest
from transformers import AutoTokenizer, PreTrainedTokenizerBase
from vllm.benchmarks.datasets import CustomDataset
from vllm.benchmarks.datasets.create_txt_slices_dataset import create_txt_slices_jsonl
@pytest.fixture(scope="session")
def hf_tokenizer() -> PreTrainedTokenizerBase:
    """Session-scoped fixture providing the ``gpt2`` Hugging Face tokenizer."""
    # gpt2 is small and commonly available, which keeps the fixture cheap.
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    return tokenizer
# Sample corpus that ``test_create_txt_slices_jsonl`` writes to disk and feeds
# to ``create_txt_slices_jsonl`` as the source text to slice prompts from.
text_content = """
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud
exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat
nulla pariatur. Excepteur sint occaecat cupidatat non proident,
sunt in culpa qui officia deserunt mollit anim id est laborum.
"""
@pytest.mark.benchmark
def test_create_txt_slices_jsonl(
    hf_tokenizer: PreTrainedTokenizerBase, tmp_path: Path
) -> None:
    """Test that create_txt_slices_jsonl produces valid JSONL for CustomDataset.

    The test writes a small text file, converts it into 10 prompts, checks the
    structure of every JSONL record, and finally loads the file through
    ``CustomDataset.sample`` to confirm the two components agree on the format.
    """
    txt_path = tmp_path / "input.txt"
    jsonl_path = tmp_path / "input.txt.jsonl"
    # The converter reads and writes UTF-8 explicitly, so the test does the
    # same instead of depending on the platform's locale encoding.
    txt_path.write_text(text_content, encoding="utf-8")

    create_txt_slices_jsonl(
        input_path=str(txt_path),
        output_path=str(jsonl_path),
        tokenizer_name="gpt2",
        num_prompts=10,
        input_len=10,
        output_len=10,
    )

    # Verify the JSONL file is valid and has the expected structure
    records = [
        json.loads(line)
        for line in jsonl_path.read_text(encoding="utf-8").splitlines()
    ]
    assert len(records) == 10
    for record in records:
        assert "prompt" in record
        assert "output_tokens" in record
        assert isinstance(record["prompt"], str)
        assert record["output_tokens"] == 10

    # Verify the JSONL file can be loaded by CustomDataset
    dataset = CustomDataset(dataset_path=str(jsonl_path))
    samples = dataset.sample(
        tokenizer=hf_tokenizer,
        num_requests=10,
        output_len=10,
        skip_chat_template=True,
    )
    assert len(samples) == 10
    assert all(sample.expected_output_len == 10 for sample in samples)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.benchmarks.datasets.datasets import (
DEFAULT_NUM_PROMPTS,
AIMODataset,
ASRDataset,
BenchmarkDataset,
BlazeditDataset,
BurstGPTDataset,
ConversationDataset,
CustomDataset,
CustomMMDataset,
HuggingFaceDataset,
InstructCoderDataset,
MLPerfDataset,
MMStarDataset,
MMVUDataset,
MTBenchDataset,
MultiModalConversationDataset,
NextEditPredictionDataset,
PrefixRepetitionRandomDataset,
RandomDataset,
RandomDatasetForReranking,
RandomMultiModalDataset,
SampleRequest,
ShareGPTDataset,
SonnetDataset,
SpecBench,
VisionArenaDataset,
add_dataset_parser,
add_random_dataset_base_args,
add_random_multimodal_dataset_args,
gen_prompt_decode_to_target_len,
get_samples,
is_valid_sequence,
lora_path_on_disk,
lora_tokenizer_cache,
process_image,
process_video,
zeta_prompt,
)
from vllm.benchmarks.datasets.utils import RangeRatio
# Public API of the datasets package. Entries are kept sorted: constants
# first, then CamelCase names, then lowercase functions.
__all__ = [
    "DEFAULT_NUM_PROMPTS",
    "AIMODataset",
    "ASRDataset",
    "BenchmarkDataset",
    "BlazeditDataset",
    "BurstGPTDataset",
    "ConversationDataset",
    "CustomDataset",
    "CustomMMDataset",
    "HuggingFaceDataset",
    "InstructCoderDataset",
    "MLPerfDataset",
    "MMStarDataset",
    "MMVUDataset",
    "MTBenchDataset",
    "MultiModalConversationDataset",
    "NextEditPredictionDataset",
    "PrefixRepetitionRandomDataset",
    "RandomDataset",
    "RandomDatasetForReranking",
    "RandomMultiModalDataset",
    "RangeRatio",
    "SampleRequest",
    "ShareGPTDataset",
    "SonnetDataset",
    "SpecBench",
    "VisionArenaDataset",
    "add_dataset_parser",
    "add_random_dataset_base_args",
    "add_random_multimodal_dataset_args",
    "gen_prompt_decode_to_target_len",
    "get_samples",
    "is_valid_sequence",
    "lora_path_on_disk",
    "lora_tokenizer_cache",
    "process_image",
    "process_video",
    "zeta_prompt",
]
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Convert a plain-text file (local path or URL) into a JSONL dataset
compatible with ``CustomDataset`` (``--dataset-name custom``), by
randomly slicing the tokenized text into prompts.
Each line of the output JSONL contains a ``prompt`` (decoded from a random
slice of the tokenized source text) and an ``output_tokens`` count.
Usage
-----
::
python -m vllm.benchmarks.datasets.create_txt_slices_dataset \\
--input sonnet.txt \\
--output sonnet_dataset.jsonl \\
--tokenizer gpt2 \\
--num-prompts 1000 \\
--input-len 1024 \\
--output-len 128
The resulting JSONL file can then be used with the serving benchmark::
python -m vllm.benchmarks.serve \\
--dataset-name custom \\
--dataset-path sonnet_dataset.jsonl \\
...
"""
from __future__ import annotations
import argparse
import json
import logging
import random
import urllib.request
import numpy as np
from transformers import AutoTokenizer
from vllm.benchmarks.datasets.utils import RangeRatio, get_sampling_params
logger = logging.getLogger(__name__)
def load_text(path: str) -> str:
    """Return the UTF-8 decoded contents of *path*.

    *path* is fetched over the network when it starts with ``http://`` or
    ``https://``; anything else is opened as a local file.
    """
    is_remote = path.startswith("http://") or path.startswith("https://")
    if not is_remote:
        with open(path, encoding="utf-8") as local_file:
            return local_file.read()

    with urllib.request.urlopen(path) as response:
        payload = response.read()
        return payload.decode("utf-8")
def create_txt_slices_jsonl(
    *,
    input_path: str,
    output_path: str,
    tokenizer_name: str,
    num_prompts: int,
    input_len: int,
    output_len: int,
    range_ratio: RangeRatio = 0.0,
    seed: int = 0,
    trust_remote_code: bool = False,
) -> None:
    """Slice the text at *input_path* into prompts and write them as JSONL.

    The source text is tokenized once without special tokens. For each of the
    *num_prompts* records a random start position is drawn and the sampled
    number of tokens is taken from there, wrapping around to the beginning of
    the token list when the end is reached. Each output line is a JSON object
    with a ``prompt`` string and an ``output_tokens`` integer.

    Raises:
        ValueError: If the source text is empty or tokenizes to nothing.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_name, trust_remote_code=trust_remote_code
    )

    text = load_text(input_path)
    if not text:
        raise ValueError("The text file is empty and cannot be sampled from.")

    token_ids = tokenizer(text, add_special_tokens=False).input_ids
    if not token_ids:
        raise ValueError("Tokenizing the text produced zero tokens; cannot sample.")

    # Two independent generators share the seed: NumPy draws the lengths,
    # the stdlib generator draws the slice start positions.
    rng_np = np.random.default_rng(seed)
    rng_py = random.Random(seed)

    input_lens, output_lens, _ = get_sampling_params(
        rng_np,
        num_prompts,
        range_ratio,
        input_len,
        output_len,
        tokenizer,
    )

    corpus_size = len(token_ids)
    records: list[dict[str, object]] = []
    for sampled_in, sampled_out in zip(input_lens, output_lens):
        prompt_len = int(sampled_in)
        # Randomly select a start position and slice with cycling
        first = rng_py.randint(0, corpus_size - 1)
        window = [
            token_ids[(first + step) % corpus_size] for step in range(prompt_len)
        ]
        records.append(
            {
                "prompt": tokenizer.decode(window, skip_special_tokens=False),
                "output_tokens": int(sampled_out),
            }
        )

    # The file is opened only after every record has been built, so a failure
    # during sampling never leaves a partial file behind.
    with open(output_path, "w", encoding="utf-8") as out_file:
        out_file.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in records
        )

    logger.info(
        "Wrote %d prompts to %s",
        len(records),
        output_path,
    )
def main(argv: list[str] | None = None) -> None:
parser = argparse.ArgumentParser(
description="Convert a plain-text file into a JSONL dataset "
"for CustomDataset (--dataset-name custom).",
)
parser.add_argument(
"--input",
required=True,
help="Path or URL to the source text file.",
)
parser.add_argument(
"--output",
required=True,
help="Path for the output JSONL file.",
)
parser.add_argument(
"--tokenizer",
required=True,
help="HuggingFace tokenizer name or path.",
)
parser.add_argument(
"--num-prompts",
type=int,
default=1000,
help="Number of prompt samples to generate (default: 1000).",
)
parser.add_argument(
"--input-len",
type=int,
default=1024,
help="Target number of input tokens per prompt (default: 1024).",
)
parser.add_argument(
"--output-len",
type=int,
default=128,
help="Target number of output tokens per prompt (default: 128).",
)
parser.add_argument(
"--range-ratio",
type=str,
default="0.0",
help="Range ratio for input/output length sampling (default: 0.0). "
"A single float applies to both ISL and OSL. "
'A JSON dict like \'{"input": 0.3, "output": 0.5}\' sets them '
"independently. Values must be in [0, 1).",
)
parser.add_argument(
"--seed",
type=int,
default=0,
help="Random seed for reproducibility (default: 0).",
)
parser.add_argument(
"--trust-remote-code",
action="store_true",
help="Trust remote code from HuggingFace.",
)
args = parser.parse_args(argv)
logging.basicConfig(level=logging.INFO)
# Parse --range-ratio: try float first, then JSON dict.
range_ratio: RangeRatio
try:
range_ratio = float(args.range_ratio)
except ValueError:
import json as _json
range_ratio = _json.loads(args.range_ratio)
create_txt_slices_jsonl(
input_path=args.input,
output_path=args.output,
tokenizer_name=args.tokenizer,
num_prompts=args.num_prompts,
input_len=args.input_len,
output_len=args.output_len,
range_ratio=range_ratio,
seed=args.seed,
trust_remote_code=args.trust_remote_code,
)
if __name__ == "__main__":
main()
......@@ -22,8 +22,7 @@ import random
from abc import ABC, abstractmethod
from collections.abc import Callable, Iterator, Mapping
from contextlib import suppress
from copy import deepcopy
from dataclasses import dataclass
from dataclasses import dataclass, replace
from functools import cache
from io import BytesIO
from tempfile import NamedTemporaryFile
......@@ -35,6 +34,11 @@ from huggingface_hub import snapshot_download
from PIL import Image
from typing_extensions import deprecated
from vllm.benchmarks.datasets.utils import (
RangeRatio,
_resolve_range_ratios,
get_sampling_params,
)
from vllm.inputs import MultiModalDataDict
from vllm.lora.request import LoRARequest
from vllm.lora.utils import get_adapter_absolute_path
......@@ -60,10 +64,6 @@ logger = logging.getLogger(__name__)
DEFAULT_NUM_PROMPTS = 1000
# -----------------------------------------------------------------------------
# Data Classes
# -----------------------------------------------------------------------------
@dataclass
class SampleRequest:
......@@ -71,9 +71,9 @@ class SampleRequest:
Represents a single inference request for benchmarking.
"""
prompt: str | list[str]
prompt: str | list[str] | list[dict]
prompt_len: int
expected_output_len: int
expected_output_len: int | None
multi_modal_data: MultiModalDataDict | dict | list[dict] | None = None
lora_request: LoRARequest | None = None
request_id: str | None = None
......@@ -110,7 +110,7 @@ class BenchmarkDataset(ABC):
# default seed.
self.random_seed = random_seed if random_seed is not None else self.DEFAULT_SEED
self.disable_shuffle = disable_shuffle
self.data = None
self.data: Any | None = None
def apply_multimodal_chat_transformation(
self,
......@@ -249,6 +249,7 @@ class BenchmarkDataset(ABC):
num_requests: int,
request_id_prefix: str = "",
no_oversample: bool = False,
**kwargs,
) -> list[SampleRequest]:
"""
Abstract method to generate sample requests from the dataset.
......@@ -296,8 +297,10 @@ class BenchmarkDataset(ABC):
needed = num_requests - len(requests)
additional = []
for i in range(needed):
req = deepcopy(random.choice(requests))
req.request_id = request_id_prefix + str(len(requests) + i)
req = replace(
random.choice(requests),
request_id=request_id_prefix + str(len(requests) + i),
)
additional.append(req)
requests.extend(additional)
logger.info("Oversampled requests to reach %d total samples.", num_requests)
......@@ -533,7 +536,7 @@ class RandomDataset(BenchmarkDataset):
request_id_prefix: str = "",
no_oversample: bool = False,
prefix_len: int = DEFAULT_PREFIX_LEN,
range_ratio: float = DEFAULT_RANGE_RATIO,
range_ratio: RangeRatio = DEFAULT_RANGE_RATIO,
input_len: int = DEFAULT_INPUT_LEN,
output_len: int = DEFAULT_OUTPUT_LEN,
batchsize: int = 1,
......@@ -542,24 +545,33 @@ class RandomDataset(BenchmarkDataset):
lora_assignment: str = "random",
**kwargs,
) -> list[SampleRequest]:
# validate total input tokens (prefix + sampled) is at least 1.
resolved_input_rr, _ = _resolve_range_ratios(range_ratio)
num_special = int(tokenizer.num_special_tokens_to_add())
real_input_len = max(0, int(input_len) - num_special)
min_sampled_input = math.floor(real_input_len * (1.0 - float(range_ratio)))
min_sampled_input = math.floor(
real_input_len * (1.0 - float(resolved_input_rr))
)
min_total_input = int(prefix_len) + min_sampled_input
if min_total_input < 1:
raise ValueError(
"--random-input-len is too small: with tokenizer special "
f"tokens {num_special} and --random-range-ratio {range_ratio}, "
f"tokens {num_special} and "
f"input range ratio {resolved_input_rr}, "
"the minimum possible total input tokens (prefix + sampled) is "
f"{min_total_input}. Increase --random-input-len and/or "
"--random-prefix-len, or decrease --random-range-ratio so that "
"prefix_len + floor(max(0, random_input_len - num_special)) "
"* (1 - range_ratio) >= 1."
)
input_lens, output_lens, offsets = self.get_sampling_params(
num_requests, range_ratio, input_len, output_len, tokenizer
"--random-prefix-len, or decrease the input range ratio "
"so that prefix_len + floor(max(0, random_input_len - "
"num_special)) * (1 - input_range_ratio) >= 1."
)
input_lens, output_lens, offsets = get_sampling_params(
self._rng,
num_requests,
range_ratio,
input_len,
output_len,
tokenizer,
)
vocab_size = tokenizer.vocab_size
......@@ -661,55 +673,6 @@ class RandomDataset(BenchmarkDataset):
)
return adjusted_tokens
def get_sampling_params(
self,
num_requests: int,
range_ratio: float,
input_len: int,
output_len: int,
tokenizer: TokenizerLike,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
"""
Get the sampling parameters for the dataset.
"""
# Enforce range_ratio < 1
if not (0.0 <= range_ratio < 1.0):
raise ValueError("range_ratio must be in [0, 1).")
num_special_tokens = int(tokenizer.num_special_tokens_to_add())
real_input_len = max(0, int(input_len) - num_special_tokens)
# Bounds use floor for low and ceil for high
input_low = math.floor(real_input_len * (1 - range_ratio))
input_high = math.ceil(real_input_len * (1 + range_ratio))
output_low = math.floor(output_len * (1 - range_ratio))
output_high = math.ceil(output_len * (1 + range_ratio))
# Ensure the lower bound for output length is at least 1 to
# prevent sampling 0 tokens.
output_low = max(output_low, 1)
output_high = max(output_high, 1)
if input_low > input_high:
raise ValueError(
f"Invalid input sampling interval: low={input_low} > high={input_high}"
)
if output_low > output_high:
raise ValueError(
"Invalid output sampling interval: "
f"low={output_low} > high={output_high}"
)
logger.info(
"Sampling input_len from [%s, %s] and output_len from [%s, %s]",
input_low,
input_high,
output_low,
output_high,
)
input_lens = self._rng.integers(input_low, input_high + 1, size=num_requests)
output_lens = self._rng.integers(output_low, output_high + 1, size=num_requests)
offsets = self._rng.integers(0, tokenizer.vocab_size, size=num_requests)
return input_lens, output_lens, offsets
def generate_token_sequence(
self,
*,
......@@ -776,8 +739,11 @@ class RandomDatasetForReranking(RandomDataset):
tokenizer: TokenizerLike,
num_requests: int,
request_id_prefix: str = "",
range_ratio: float = RandomDataset.DEFAULT_RANGE_RATIO,
no_oversample: bool = False,
prefix_len: int = RandomDataset.DEFAULT_PREFIX_LEN,
range_ratio: RangeRatio = RandomDataset.DEFAULT_RANGE_RATIO,
input_len: int = RandomDataset.DEFAULT_INPUT_LEN,
output_len: int = RandomDataset.DEFAULT_OUTPUT_LEN,
batchsize: int = 1,
is_reranker: bool = True,
**kwargs,
......@@ -786,8 +752,13 @@ class RandomDatasetForReranking(RandomDataset):
query_len_param = (input_len // 2) - n_sep_tokens if is_reranker else input_len
query_lens, _, query_offsets = self.get_sampling_params(
1, range_ratio, query_len_param, 0, tokenizer
query_lens, _, query_offsets = get_sampling_params(
self._rng,
1,
range_ratio,
query_len_param,
0,
tokenizer,
)
query_len = int(query_lens[0])
......@@ -800,8 +771,13 @@ class RandomDatasetForReranking(RandomDataset):
else:
doc_len_param = input_len - query_len - n_sep_tokens
doc_lens, _, doc_offsets = self.get_sampling_params(
num_requests, range_ratio, doc_len_param, 0, tokenizer
doc_lens, _, doc_offsets = get_sampling_params(
self._rng,
num_requests,
range_ratio,
doc_len_param,
0,
tokenizer,
)
vocab_size = tokenizer.vocab_size
......@@ -1175,9 +1151,10 @@ class RandomMultiModalDataset(RandomDataset):
request_id_prefix: str = "",
no_oversample: bool = False,
prefix_len: int = RandomDataset.DEFAULT_PREFIX_LEN,
range_ratio: float = RandomDataset.DEFAULT_RANGE_RATIO,
range_ratio: RangeRatio = RandomDataset.DEFAULT_RANGE_RATIO,
input_len: int = RandomDataset.DEFAULT_INPUT_LEN,
output_len: int = RandomDataset.DEFAULT_OUTPUT_LEN,
batchsize: int = 1,
limit_mm_per_prompt: dict[str, int] = DEFAULT_LIMIT_MM_PER_PROMPT,
base_items_per_request: int = DEFAULT_BASE_ITEMS_PER_REQUEST,
num_mm_items_range_ratio: float = DEFAULT_NUM_MM_ITEMS_RANGE_RATIO,
......@@ -1187,9 +1164,18 @@ class RandomMultiModalDataset(RandomDataset):
enable_multimodal_chat: bool = DEFAULT_ENABLE_MULTIMODAL_CHAT,
**kwargs,
) -> list[SampleRequest]:
# Get the sampling parameters for the dataset
input_lens, output_lens, offsets = self.get_sampling_params(
num_requests, range_ratio, input_len, output_len, tokenizer
if batchsize != 1:
raise NotImplementedError(
"batchsize > 1 is not supported for RandomMultiModalDataset."
)
input_lens, output_lens, offsets = get_sampling_params(
self._rng,
num_requests,
range_ratio,
input_len,
output_len,
tokenizer,
)
(
......@@ -1326,16 +1312,16 @@ class ShareGPTDataset(BenchmarkDataset):
self,
tokenizer: TokenizerLike,
num_requests: int,
request_id_prefix: str = "",
no_oversample: bool = False,
lora_path: str | None = None,
max_loras: int | None = None,
output_len: int | None = None,
enable_multimodal_chat: bool = False,
request_id_prefix: str = "",
no_oversample: bool = False,
lora_assignment: str = "random",
**kwargs,
) -> list:
samples: list = []
) -> list[SampleRequest]:
samples: list[SampleRequest] = []
ind = 0
for entry in self.data:
if len(samples) >= num_requests:
......@@ -1449,8 +1435,8 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
type=str,
default=None,
action=_ValidateDatasetArgs,
help="Path to the sharegpt/sonnet dataset. "
"Or the huggingface dataset ID if using HF dataset.",
help="Path to the sharegpt/sonnet dataset or the HF dataset ID if "
"using HF dataset.",
)
parser.add_argument(
"--no-oversample",
......@@ -1648,12 +1634,12 @@ def add_random_dataset_base_args(
)
parser_or_group.add_argument(
"--random-range-ratio",
type=float,
default=0.0,
type=str,
default="0.0",
help="Range ratio for sampling input/output length, "
"used only for random sampling. Must be in the range [0, 1) to define "
"a symmetric sampling range"
"[length * (1 - range_ratio), length * (1 + range_ratio)].",
"used only for random sampling. A single float applies to both "
'ISL and OSL. A JSON dict like \'{"input": 0.3, "output": 0.5}\' '
"sets them independently. Values must be in [0, 1).",
)
parser_or_group.add_argument(
"--random-prefix-len",
......@@ -1786,10 +1772,25 @@ def add_random_multimodal_dataset_args(
)
def _parse_range_ratio(value: str) -> RangeRatio:
"""Parse a ``--random-range-ratio`` CLI string.
Accepts either a plain float (``"0.3"``) or a JSON dict
(``'{"input": 0.3, "output": 0.5}'``).
"""
try:
return float(value)
except ValueError:
return json.loads(value)
def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]:
if not hasattr(args, "request_id_prefix"):
args.request_id_prefix = ""
if hasattr(args, "random_range_ratio") and isinstance(args.random_range_ratio, str):
args.random_range_ratio = _parse_range_ratio(args.random_range_ratio)
if args.dataset_name == "custom":
dataset = CustomDataset(
dataset_path=args.dataset_path, disable_shuffle=args.disable_shuffle
......@@ -2120,7 +2121,7 @@ class CustomDataset(BenchmarkDataset):
# This will be the standardized format which load_data()
# has to convert into depending on the filetype of dataset_path.
# sample() will assume this standardized format of self.data
self.data = []
self.data: list[dict] = []
# Load the JSONL file
if self.dataset_path.endswith(".jsonl"):
......@@ -2149,15 +2150,15 @@ class CustomDataset(BenchmarkDataset):
self,
tokenizer: TokenizerLike,
num_requests: int,
request_id_prefix: str = "",
no_oversample: bool = False,
lora_path: str | None = None,
max_loras: int | None = None,
output_len: int | None = None,
enable_multimodal_chat: bool = False,
skip_chat_template: bool = False,
request_id_prefix: str = "",
no_oversample: bool = False,
**kwargs,
) -> list:
) -> list[SampleRequest]:
# load all data if needed
self.num_available_samples = len(self.data)
if num_requests <= 0:
......@@ -2168,7 +2169,7 @@ class CustomDataset(BenchmarkDataset):
num_requests,
)
sampled_requests = []
sampled_requests: list[SampleRequest] = []
for i, item in enumerate(self.data):
if len(sampled_requests) >= num_requests:
break
......@@ -2252,7 +2253,7 @@ class CustomMMDataset(CustomDataset):
request_id_prefix: str = "",
no_oversample: bool = False,
**kwargs,
) -> list:
) -> list[SampleRequest]:
# load all data if needed
self.num_available_samples = len(self.data)
if num_requests <= 0:
......@@ -2340,9 +2341,13 @@ class SpecBench(CustomDataset):
if not getattr(self, "disable_shuffle", False):
random.shuffle(self.data)
def sample(self, **kwargs) -> list[SampleRequest]:
    """Sample requests by delegating to the parent class's ``sample``.

    ``self`` must be the first parameter: without it, calling the method on
    an instance raises ``TypeError`` and the zero-argument ``super()`` below
    has no instance to bind to.
    """
    # leverage CustomDataset sample
    return super().sample(**kwargs)
# -----------------------------------------------------------------------------
......@@ -2381,14 +2386,14 @@ class SonnetDataset(BenchmarkDataset):
self,
tokenizer: TokenizerLike,
num_requests: int,
request_id_prefix: str = "",
no_oversample: bool = False,
prefix_len: int = DEFAULT_PREFIX_LEN,
input_len: int = DEFAULT_INPUT_LEN,
output_len: int = DEFAULT_OUTPUT_LEN,
return_prompt_formatted: bool = False,
request_id_prefix: str = "",
no_oversample: bool = False,
**kwargs,
) -> list:
) -> list[SampleRequest]:
# Calculate average token length for a poem line.
tokenized_lines = [tokenizer(line).input_ids for line in self.data]
avg_len = sum(len(tokens) for tokens in tokenized_lines) / len(tokenized_lines)
......@@ -2411,7 +2416,7 @@ class SonnetDataset(BenchmarkDataset):
num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0)
prefix_lines = self.data[:num_prefix_lines]
samples = []
samples: list[SampleRequest] = []
ind = 0
while len(samples) < num_requests:
extra_lines = random.choices(
......@@ -2482,11 +2487,11 @@ class BurstGPTDataset(BenchmarkDataset):
self,
tokenizer: TokenizerLike,
num_requests: int,
max_loras: int | None = None,
lora_path: str | None = None,
request_id_prefix: str = "",
no_oversample: bool = False,
lora_assignment: str = "random",
max_loras: int | None = None,
lora_path: str | None = None,
**kwargs,
) -> list[SampleRequest]:
samples = []
......@@ -2574,15 +2579,15 @@ class ConversationDataset(HuggingFaceDataset):
self,
tokenizer: TokenizerLike,
num_requests: int,
output_len: int | None = None,
enable_multimodal_chat: bool = False,
request_id_prefix: str = "",
no_oversample: bool = False,
output_len: int | None = None,
enable_multimodal_chat: bool = False,
**kwargs,
) -> list:
) -> list[SampleRequest]:
# Filter examples with at least 2 conversations
filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2)
sampled_requests = []
sampled_requests: list[SampleRequest] = []
ind = 0
dynamic_output = output_len is None
......@@ -2634,15 +2639,15 @@ class MultiModalConversationDataset(HuggingFaceDataset):
self,
tokenizer: TokenizerLike,
num_requests: int,
output_len: int | None = None,
enable_multimodal_chat: bool = False,
request_id_prefix: str = "",
no_oversample: bool = False,
output_len: int | None = None,
enable_multimodal_chat: bool = False,
**kwargs,
) -> list:
) -> list[SampleRequest]:
# Filter examples with at least 2 conversations
filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2)
sampled_requests = []
sampled_requests: list[SampleRequest] = []
ind = 0
dynamic_output = output_len is None
......@@ -2703,12 +2708,12 @@ class VisionArenaDataset(HuggingFaceDataset):
self,
tokenizer: TokenizerLike,
num_requests: int,
output_len: int | None = None,
enable_multimodal_chat: bool = False,
request_id_prefix: str = "",
no_oversample: bool = False,
output_len: int | None = None,
enable_multimodal_chat: bool = False,
**kwargs,
) -> list:
) -> list[SampleRequest]:
parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.hf_name)
if parser_fn is None:
raise ValueError(f"Unsupported dataset path: {self.hf_name}")
......@@ -2753,9 +2758,11 @@ class MMVUDataset(HuggingFaceDataset):
DEFAULT_OUTPUT_LEN = 128
SUPPORTED_DATASET_PATHS = {
"yale-nlp/MMVU": lambda x: x["question"]
+ " "
+ (" ".join(f"{k}.{v}" for k, v in x["choices"].items())),
"yale-nlp/MMVU": lambda x: (
x["question"]
+ " "
+ (" ".join(f"{k}.{v}" for k, v in x["choices"].items()))
),
}
def __init__(self, **kwargs) -> None:
......@@ -2770,12 +2777,12 @@ class MMVUDataset(HuggingFaceDataset):
self,
tokenizer: TokenizerLike,
num_requests: int,
output_len: int | None = None,
enable_multimodal_chat: bool = False,
request_id_prefix: str = "",
no_oversample: bool = False,
output_len: int | None = None,
enable_multimodal_chat: bool = False,
**kwargs,
) -> list:
) -> list[SampleRequest]:
parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.hf_name)
if parser_fn is None:
raise ValueError(f"Unsupported dataset path: {self.hf_name}")
......@@ -2838,15 +2845,15 @@ class InstructCoderDataset(HuggingFaceDataset):
self,
tokenizer: TokenizerLike,
num_requests: int,
request_id_prefix: str = "",
no_oversample: bool = False,
output_len: int | None = None,
enable_multimodal_chat: bool = False,
skip_chat_template: bool = False,
request_id_prefix: str = "",
no_oversample: bool = False,
**kwargs,
) -> list[SampleRequest]:
output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
sampled_requests = []
sampled_requests: list[SampleRequest] = []
for i, prompt in enumerate(self.sample_prompts(n=num_requests)):
# apply template
if not skip_chat_template:
......@@ -2903,15 +2910,15 @@ class MTBenchDataset(HuggingFaceDataset):
self,
tokenizer: TokenizerLike,
num_requests: int,
request_id_prefix: str = "",
no_oversample: bool = False,
output_len: int | None = None,
enable_multimodal_chat: bool = False,
skip_chat_template: bool = False,
request_id_prefix: str = "",
no_oversample: bool = False,
**kwargs,
) -> list:
) -> list[SampleRequest]:
output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
sampled_requests = []
sampled_requests: list[SampleRequest] = []
for i, item in enumerate(self.data):
if len(sampled_requests) >= num_requests:
......@@ -2976,7 +2983,7 @@ class BlazeditDataset(HuggingFaceDataset):
min_distance: float = 0.0,
max_distance: float = 1.0,
**kwargs,
) -> list:
) -> list[SampleRequest]:
output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
sampled_requests = []
......@@ -3050,12 +3057,12 @@ class AIMODataset(HuggingFaceDataset):
self,
tokenizer: TokenizerLike,
num_requests: int,
output_len: int | None = None,
request_id_prefix: str = "",
no_oversample: bool = False,
output_len: int | None = None,
**kwargs,
) -> list:
sampled_requests = []
) -> list[SampleRequest]:
sampled_requests: list[SampleRequest] = []
ind = 0
dynamic_output = output_len is None
......@@ -3228,18 +3235,18 @@ class ASRDataset(HuggingFaceDataset):
self,
tokenizer: TokenizerLike,
num_requests: int,
output_len: int | None = None,
request_id_prefix: str = "",
no_oversample: bool = False,
output_len: int | None = None,
**kwargs,
) -> list:
) -> list[SampleRequest]:
output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
if "openai" in getattr(tokenizer, "name_or_path", ""):
prompt = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
else:
prompt = ""
prompt_len = len(tokenizer(prompt).input_ids)
sampled_requests = []
sampled_requests: list[SampleRequest] = []
ind = 0
skipped = 0
asr_min_audio_len_sec = kwargs.get("asr_min_audio_len_sec")
......@@ -3326,9 +3333,9 @@ class MLPerfDataset(HuggingFaceDataset):
self,
tokenizer: TokenizerLike,
num_requests: int,
output_len: int | None = None,
request_id_prefix: str = "",
no_oversample: bool = False,
output_len: int | None = None,
**kwargs,
) -> list[SampleRequest]:
# Force dynamic output length based on reference completion.
......@@ -3405,12 +3412,12 @@ class PrefixRepetitionRandomDataset(BenchmarkDataset):
self,
tokenizer: TokenizerLike,
num_requests: int,
request_id_prefix: str = "",
no_oversample: bool = False,
prefix_len: int = DEFAULT_PREFIX_LEN,
suffix_len: int = DEFAULT_SUFFIX_LEN,
num_prefixes: int = DEFAULT_NUM_PREFIXES,
output_len: int = DEFAULT_OUTPUT_LEN,
request_id_prefix: str = "",
no_oversample: bool = False,
**kwargs,
) -> list[SampleRequest]:
vocab_size = tokenizer.vocab_size
......@@ -3421,7 +3428,7 @@ class PrefixRepetitionRandomDataset(BenchmarkDataset):
f"to num_prefixes ({num_prefixes})"
)
def _generate_exact_length_tokens(target_length: int) -> list[int]:
def _generate_exact_length_tokens(target_length: int) -> tuple[list[int], int]:
"""Generate tokens that decode and re-encode to exactly
target_length."""
# Generate random tokens
......@@ -3491,10 +3498,10 @@ class MMStarDataset(HuggingFaceDataset):
self,
tokenizer: TokenizerLike,
num_requests: int,
output_len: int | None = None,
enable_multimodal_chat: bool = False,
request_id_prefix: str = "",
no_oversample: bool = False,
output_len: int | None = None,
enable_multimodal_chat: bool = False,
**kwargs,
) -> list[SampleRequest]:
# If --hf-output-len is not set, use the default output length.
......@@ -3516,6 +3523,7 @@ class MMStarDataset(HuggingFaceDataset):
# if enable_multimodal_chat is False).
prompt_len = len(tokenizer(question_text).input_ids)
prompt: str | list[dict]
if enable_multimodal_chat:
# If multimodal content should be embedded in the chat message,
# convert to [{"role":"user","content":[...]}]
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Shared utilities for benchmark dataset sampling.
"""
import logging
import math
import numpy as np
from vllm.tokenizers import TokenizerLike
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Type alias for the ``range_ratio`` argument accepted by the helpers below.
# A single float is applied to both the input and the output length
# (ISL / OSL -- presumably "input/output sequence length"); a dict allows
# specifying them independently via ``{"input": …, "output": …}``.
RangeRatio = float | dict[str, float]
def _resolve_range_ratios(
    range_ratio: RangeRatio,
) -> tuple[float, float]:
    """Split *range_ratio* into an ``(input, output)`` pair of floats.

    Args:
        range_ratio: Either one number, which is used for both the input
            and the output ratio, or a dict holding the two values under
            the ``"input"`` and ``"output"`` keys.

    Returns:
        ``(input_range_ratio, output_range_ratio)``, each passed through
        ``float()``.

    Raises:
        ValueError: If *range_ratio* is a dict that lacks the ``"input"``
            or ``"output"`` key.
    """
    if not isinstance(range_ratio, dict):
        # Scalar form: one ratio shared by input and output.
        shared_ratio = float(range_ratio)
        return shared_ratio, shared_ratio

    try:
        # Look up and convert "input" before touching "output" so that
        # failures surface in the same order as the keys are listed.
        input_ratio = float(range_ratio["input"])
        output_ratio = float(range_ratio["output"])
    except KeyError as missing_key:
        raise ValueError(
            "When range_ratio is a dict it must contain 'input' and "
            f"'output' keys, got: {sorted(range_ratio)}"
        ) from missing_key
    return input_ratio, output_ratio
def get_sampling_params(
    rng: np.random.Generator,
    num_requests: int,
    range_ratio: RangeRatio,
    input_len: int,
    output_len: int,
    tokenizer: TokenizerLike,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Draw per-request input lengths, output lengths and vocab offsets.

    Each length is sampled uniformly from an inclusive integer interval
    ``[floor(mean * (1 - ratio)), ceil(mean * (1 + ratio))]``. The ratio
    comes from *range_ratio*, which is either one ``float`` shared by
    input and output or a ``dict`` with separate ``"input"`` and
    ``"output"`` entries.

    The number of special tokens reported by
    ``tokenizer.num_special_tokens_to_add()`` is removed from
    ``input_len`` (never going below zero) before the input interval is
    built. Both output bounds are clamped to at least 1.

    Args:
        rng: NumPy generator used for all three draws, in the order
            input lengths, output lengths, offsets.
        num_requests: Number of values to draw for each array.
        range_ratio: Relative half-width of the sampling intervals; each
            resolved ratio must lie in ``[0, 1)``.
        input_len: Target input length, special tokens included.
        output_len: Target output length.
        tokenizer: Supplies ``num_special_tokens_to_add()`` and
            ``vocab_size``.

    Returns:
        (input_lens, output_lens, offsets) – three 1-D ``np.ndarray`` of
        shape ``(num_requests,)``. Offsets are drawn from
        ``[0, tokenizer.vocab_size)``.

    Raises:
        ValueError: If a resolved ratio is outside ``[0, 1)`` or a
            computed interval has its lower bound above its upper bound.
    """
    in_ratio, out_ratio = _resolve_range_ratios(range_ratio)

    # The "not (a <= x < b)" form is deliberate: it also rejects NaN.
    for label, ratio in (("input", in_ratio), ("output", out_ratio)):
        if not (0.0 <= ratio < 1.0):
            raise ValueError(f"{label}_range_ratio must be in [0, 1).")

    # Budget left for real content once special tokens are accounted for.
    special_token_count = int(tokenizer.num_special_tokens_to_add())
    content_len = max(0, int(input_len) - special_token_count)

    min_input = math.floor(content_len * (1 - in_ratio))
    max_input = math.ceil(content_len * (1 + in_ratio))

    # Clamp both output bounds to at least 1 so that a request is never
    # asked to generate 0 tokens.
    min_output = max(math.floor(output_len * (1 - out_ratio)), 1)
    max_output = max(math.ceil(output_len * (1 + out_ratio)), 1)

    intervals = (
        ("input", min_input, max_input),
        ("output", min_output, max_output),
    )
    for label, lower, upper in intervals:
        if lower > upper:
            raise ValueError(
                f"Invalid {label} sampling interval: low={lower} > high={upper}"
            )

    logger.info(
        "Sampling input_len from [%s, %s] and output_len from [%s, %s]",
        min_input,
        max_input,
        min_output,
        max_output,
    )

    # ``Generator.integers`` excludes ``high``, hence the ``+ 1`` to make
    # the upper bounds inclusive.
    sampled_input_lens = rng.integers(
        min_input, high=max_input + 1, size=num_requests
    )
    sampled_output_lens = rng.integers(
        min_output, high=max_output + 1, size=num_requests
    )
    vocab_offsets = rng.integers(0, high=tokenizer.vocab_size, size=num_requests)
    return sampled_input_lens, sampled_output_lens, vocab_offsets
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment