Unverified commit 893b2aff, authored by Julien Debache and committed by GitHub
Browse files

feat: add TxtSlicesDataset to allow sampling slices from txt file for benchmarking (#30156)


Signed-off-by: jdebache <jdebache@nvidia.com>
parent 80118853
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import numpy as np
import pytest
from vllm.benchmarks.datasets.utils import get_sampling_params
from vllm.tokenizers import TokenizerLike
class _FakeTokenizer(TokenizerLike):
    """Stub tokenizer exposing the ``TokenizerLike`` surface for these tests.

    Only the vocabulary size and the number of special tokens are
    configurable; every method that would perform real tokenization
    raises ``NotImplementedError``.
    """

    def __init__(self, vocab_size: int = 1000, num_special_tokens: int = 0) -> None:
        self._num_special_tokens = num_special_tokens
        self._vocab_size = vocab_size

    # -- Construction --

    @classmethod
    def from_pretrained(cls, path_or_repo_id, *a, **kw):  # type: ignore[override]
        """Ignore all arguments and build an instance with default settings."""
        return cls()

    # -- Vocabulary-related properties --

    @property
    def vocab_size(self) -> int:
        return self._vocab_size

    @property
    def max_token_id(self) -> int:
        # Largest id when ids run from 0 to vocab_size - 1.
        return self._vocab_size - 1

    @property
    def max_chars_per_token(self) -> int:
        return 4

    # -- Special-token properties (fixed placeholder values) --

    @property
    def all_special_tokens(self) -> list[str]:
        return []

    @property
    def all_special_ids(self) -> list[int]:
        return []

    @property
    def bos_token_id(self) -> int:
        return 0

    @property
    def eos_token_id(self) -> int:
        return 1

    @property
    def pad_token_id(self) -> int:
        return 2

    # -- Miscellaneous flags --

    @property
    def is_fast(self) -> bool:
        return False

    @property
    def truncation_side(self) -> str:
        return "right"

    # -- Methods returning configured or empty values --

    def num_special_tokens_to_add(self) -> int:
        """Return the special-token count given to the constructor."""
        return self._num_special_tokens

    def get_vocab(self) -> dict[str, int]:
        return {}

    def get_added_vocab(self) -> dict[str, int]:
        return {}

    # -- Tokenization entry points: deliberately unimplemented --

    def __call__(self, text, text_pair=None, **kw):  # type: ignore[override]
        raise NotImplementedError

    def encode(self, text, **kw) -> list[int]:  # type: ignore[override]
        raise NotImplementedError

    def decode(self, ids, skip_special_tokens: bool = False) -> str:  # type: ignore[override]
        raise NotImplementedError

    def apply_chat_template(self, messages, **kw):  # type: ignore[override]
        raise NotImplementedError

    def convert_tokens_to_ids(self, tokens):  # type: ignore[override]
        raise NotImplementedError

    def convert_tokens_to_string(self, tokens: list[str]) -> str:
        raise NotImplementedError

    def convert_ids_to_tokens(  # type: ignore[override]
        self, ids, skip_special_tokens: bool = False
    ) -> list[str]:
        raise NotImplementedError
class TestGetSamplingParams:
    """Tests for ``get_sampling_params`` in ``vllm.benchmarks.datasets.utils``."""

    # -- helpers --

    @staticmethod
    def _tok(vocab_size: int = 1000, num_special: int = 0) -> _FakeTokenizer:
        """Build a fake tokenizer with the given vocab size and special-token count."""
        return _FakeTokenizer(vocab_size=vocab_size, num_special_tokens=num_special)

    # -- return shape / dtype --

    def test_returns_three_arrays(self):
        rng = np.random.default_rng(0)
        result = get_sampling_params(rng, 5, 0.0, 100, 50, self._tok())
        assert len(result) == 3
        for arr in result:
            assert isinstance(arr, np.ndarray)

    @pytest.mark.parametrize("n", [1, 10, 100])
    def test_output_length_matches_num_requests(self, n: int):
        rng = np.random.default_rng(42)
        input_lens, output_lens, offsets = get_sampling_params(
            rng, n, 0.0, 64, 32, self._tok()
        )
        assert input_lens.shape == (n,)
        assert output_lens.shape == (n,)
        assert offsets.shape == (n,)

    # -- fixed lengths (range_ratio = 0) --

    def test_zero_range_ratio_gives_constant_lengths(self):
        rng = np.random.default_rng(7)
        input_lens, output_lens, _ = get_sampling_params(
            rng, 20, 0.0, 128, 64, self._tok()
        )
        assert np.all(input_lens == 128)
        assert np.all(output_lens == 64)

    def test_special_tokens_subtracted_from_input_only(self):
        rng = np.random.default_rng(7)
        input_lens, output_lens, _ = get_sampling_params(
            rng, 10, 0.0, 100, 50, self._tok(num_special=4)
        )
        # real_input_len = 100 - 4 = 96, range_ratio 0 → all 96
        assert np.all(input_lens == 96)
        # special tokens are not subtracted from output length
        assert np.all(output_lens == 50)

    # -- range ratios --

    def test_input_range_bounds(self):
        rng = np.random.default_rng(0)
        ratio = 0.5
        base = 200
        input_lens, _, _ = get_sampling_params(
            rng, 500, {"input": ratio, "output": 0.0}, base, 50, self._tok()
        )
        lo = int(np.floor(base * (1 - ratio)))
        hi = int(np.ceil(base * (1 + ratio)))
        assert np.all(input_lens >= lo)
        assert np.all(input_lens <= hi)

    def test_output_range_bounds(self):
        rng = np.random.default_rng(0)
        ratio = 0.3
        base = 100
        _, output_lens, _ = get_sampling_params(
            rng, 500, {"input": 0.0, "output": ratio}, 50, base, self._tok()
        )
        lo = max(1, int(np.floor(base * (1 - ratio))))
        hi = int(np.ceil(base * (1 + ratio)))
        assert np.all(output_lens >= lo)
        assert np.all(output_lens <= hi)

    def test_output_low_clamped_to_one(self):
        """Even with a high ratio that would push output_low to 0,
        the function clamps it to 1."""
        rng = np.random.default_rng(0)
        # output_len=1, ratio=0.99 → floor(1*0.01)=0, should clamp to 1
        _, output_lens, _ = get_sampling_params(
            rng, 50, {"input": 0.0, "output": 0.99}, 100, 1, self._tok()
        )
        assert np.all(output_lens >= 1)

    # -- offsets bounded by vocab_size --

    @pytest.mark.parametrize("vocab", [100, 32000, 128256])
    def test_offsets_within_vocab(self, vocab: int):
        rng = np.random.default_rng(0)
        _, _, offsets = get_sampling_params(
            rng, 200, 0.0, 64, 32, self._tok(vocab_size=vocab)
        )
        assert np.all(offsets >= 0)
        assert np.all(offsets < vocab)

    # -- reproducibility --

    def test_same_seed_same_results(self):
        tok = self._tok()
        rr = {"input": 0.3, "output": 0.2}
        a = get_sampling_params(np.random.default_rng(42), 50, rr, 256, 64, tok)
        b = get_sampling_params(np.random.default_rng(42), 50, rr, 256, 64, tok)
        for arr_a, arr_b in zip(a, b):
            np.testing.assert_array_equal(arr_a, arr_b)

    def test_different_seed_different_results(self):
        tok = self._tok()
        rr = {"input": 0.3, "output": 0.2}
        a = get_sampling_params(np.random.default_rng(0), 50, rr, 256, 64, tok)
        b = get_sampling_params(np.random.default_rng(1), 50, rr, 256, 64, tok)
        # Extremely unlikely all three arrays match with different seeds
        assert not all(np.array_equal(arr_a, arr_b) for arr_a, arr_b in zip(a, b))

    # -- validation / error paths --

    @pytest.mark.parametrize("bad_ratio", [-0.1, 1.0, 1.5])
    def test_invalid_input_range_ratio(self, bad_ratio: float):
        rng = np.random.default_rng(0)
        with pytest.raises(ValueError, match="input_range_ratio"):
            get_sampling_params(
                rng, 10, {"input": bad_ratio, "output": 0.0}, 100, 50, self._tok()
            )

    @pytest.mark.parametrize("bad_ratio", [-0.1, 1.0, 1.5])
    def test_invalid_output_range_ratio(self, bad_ratio: float):
        rng = np.random.default_rng(0)
        with pytest.raises(ValueError, match="output_range_ratio"):
            get_sampling_params(
                rng, 10, {"input": 0.0, "output": bad_ratio}, 100, 50, self._tok()
            )

    def test_invalid_dict_missing_keys(self):
        rng = np.random.default_rng(0)
        with pytest.raises(ValueError, match="input.*output"):
            get_sampling_params(rng, 10, {"input": 0.1}, 100, 50, self._tok())

    # -- edge cases --

    def test_input_len_zero_with_special_tokens(self):
        """input_len < num_special_tokens → real_input_len = 0, which is fine
        (range [0, 0])."""
        rng = np.random.default_rng(0)
        input_lens, _, _ = get_sampling_params(
            rng, 5, 0.0, 5, 50, self._tok(num_special=10)
        )
        # real_input_len = max(0, 5 - 10) = 0
        assert np.all(input_lens == 0)

    def test_single_request(self):
        rng = np.random.default_rng(0)
        i, o, off = get_sampling_params(rng, 1, 0.0, 100, 50, self._tok())
        assert i.shape == (1,)
        assert o.shape == (1,)
        assert off.shape == (1,)

    def test_large_num_requests(self):
        rng = np.random.default_rng(0)
        i, o, off = get_sampling_params(rng, 10_000, 0.5, 512, 128, self._tok())
        assert i.shape == (10_000,)
        assert o.shape == (10_000,)
        assert off.shape == (10_000,)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from pathlib import Path
import pytest
from transformers import AutoTokenizer, PreTrainedTokenizerBase
from vllm.benchmarks.datasets import CustomDataset
from vllm.benchmarks.datasets.create_txt_slices_dataset import create_txt_slices_jsonl
@pytest.fixture(scope="session")
def hf_tokenizer() -> PreTrainedTokenizerBase:
    """Load the ``gpt2`` tokenizer once per test session."""
    # GPT-2 is a small, commonly available tokenizer.
    gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")
    return gpt2_tokenizer
# Short "lorem ipsum" passage that the test below writes to a temporary file
# and uses as the source text to be sliced into prompts.
text_content = """
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud
exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat
nulla pariatur. Excepteur sint occaecat cupidatat non proident,
sunt in culpa qui officia deserunt mollit anim id est laborum.
"""
@pytest.mark.benchmark
def test_create_txt_slices_jsonl(
    hf_tokenizer: PreTrainedTokenizerBase, tmp_path: Path
) -> None:
    """Test that create_txt_slices_jsonl produces valid JSONL for CustomDataset.

    The source text is written to a temporary file, converted to JSONL with
    fixed input/output lengths, and the result is checked both structurally
    and by loading it through ``CustomDataset``.
    """
    txt_path = tmp_path / "input.txt"
    jsonl_path = tmp_path / "input.txt.jsonl"
    # The converter reads and writes UTF-8 explicitly, so pin the same
    # encoding here instead of relying on the platform's locale default.
    txt_path.write_text(text_content, encoding="utf-8")

    create_txt_slices_jsonl(
        input_path=str(txt_path),
        output_path=str(jsonl_path),
        tokenizer_name="gpt2",
        num_prompts=10,
        input_len=10,
        output_len=10,
    )

    # Verify the JSONL file is valid and has the expected structure
    records = [
        json.loads(line)
        for line in jsonl_path.read_text(encoding="utf-8").splitlines()
    ]
    assert len(records) == 10
    for record in records:
        assert "prompt" in record
        assert "output_tokens" in record
        assert isinstance(record["prompt"], str)
        assert record["output_tokens"] == 10

    # Verify the JSONL file can be loaded by CustomDataset
    dataset = CustomDataset(dataset_path=str(jsonl_path))
    samples = dataset.sample(
        tokenizer=hf_tokenizer,
        num_requests=10,
        output_len=10,
        skip_chat_template=True,
    )
    assert len(samples) == 10
    assert all(sample.expected_output_len == 10 for sample in samples)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.benchmarks.datasets.datasets import (
DEFAULT_NUM_PROMPTS,
AIMODataset,
ASRDataset,
BenchmarkDataset,
BlazeditDataset,
BurstGPTDataset,
ConversationDataset,
CustomDataset,
CustomMMDataset,
HuggingFaceDataset,
InstructCoderDataset,
MLPerfDataset,
MMStarDataset,
MMVUDataset,
MTBenchDataset,
MultiModalConversationDataset,
NextEditPredictionDataset,
PrefixRepetitionRandomDataset,
RandomDataset,
RandomDatasetForReranking,
RandomMultiModalDataset,
SampleRequest,
ShareGPTDataset,
SonnetDataset,
SpecBench,
VisionArenaDataset,
add_dataset_parser,
add_random_dataset_base_args,
add_random_multimodal_dataset_args,
gen_prompt_decode_to_target_len,
get_samples,
is_valid_sequence,
lora_path_on_disk,
lora_tokenizer_cache,
process_image,
process_video,
zeta_prompt,
)
from vllm.benchmarks.datasets.utils import RangeRatio
# Public API of the package. Kept sorted: constants first, then classes and
# type aliases, then functions.
__all__ = [
    "DEFAULT_NUM_PROMPTS",
    "AIMODataset",
    "ASRDataset",
    "BenchmarkDataset",
    "BlazeditDataset",
    "BurstGPTDataset",
    "ConversationDataset",
    "CustomDataset",
    "CustomMMDataset",
    "HuggingFaceDataset",
    "InstructCoderDataset",
    "MLPerfDataset",
    "MMStarDataset",
    "MMVUDataset",
    "MTBenchDataset",
    "MultiModalConversationDataset",
    "NextEditPredictionDataset",
    "PrefixRepetitionRandomDataset",
    "RandomDataset",
    "RandomDatasetForReranking",
    "RandomMultiModalDataset",
    "RangeRatio",
    "SampleRequest",
    "ShareGPTDataset",
    "SonnetDataset",
    "SpecBench",
    "VisionArenaDataset",
    "add_dataset_parser",
    "add_random_dataset_base_args",
    "add_random_multimodal_dataset_args",
    "gen_prompt_decode_to_target_len",
    "get_samples",
    "is_valid_sequence",
    "lora_path_on_disk",
    "lora_tokenizer_cache",
    "process_image",
    "process_video",
    "zeta_prompt",
]
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Convert a plain-text file (local path or URL) into a JSONL dataset
compatible with ``CustomDataset`` (``--dataset-name custom``), by
randomly slicing the tokenized text into prompts.
Each line of the output JSONL contains a ``prompt`` (decoded from a random
slice of the tokenized source text) and an ``output_tokens`` count.
Usage
-----
::
python -m vllm.benchmarks.datasets.create_txt_slices_dataset \\
--input sonnet.txt \\
--output sonnet_dataset.jsonl \\
--tokenizer gpt2 \\
--num-prompts 1000 \\
--input-len 1024 \\
--output-len 128
The resulting JSONL file can then be used with the serving benchmark::
python -m vllm.benchmarks.serve \\
--dataset-name custom \\
--dataset-path sonnet_dataset.jsonl \\
...
"""
from __future__ import annotations
import argparse
import json
import logging
import random
import urllib.request
import numpy as np
from transformers import AutoTokenizer
from vllm.benchmarks.datasets.utils import RangeRatio, get_sampling_params
logger = logging.getLogger(__name__)
def load_text(path: str) -> str:
    """Return the UTF-8 decoded contents of *path*.

    A *path* beginning with ``http://`` or ``https://`` is fetched with
    ``urllib``; anything else is opened as a local file.
    """
    is_remote = path.startswith(("http://", "https://"))
    if not is_remote:
        with open(path, encoding="utf-8") as local_file:
            return local_file.read()

    with urllib.request.urlopen(path) as response:
        raw_bytes = response.read()
    return raw_bytes.decode("utf-8")
def create_txt_slices_jsonl(
    *,
    input_path: str,
    output_path: str,
    tokenizer_name: str,
    num_prompts: int,
    input_len: int,
    output_len: int,
    range_ratio: RangeRatio = 0.0,
    seed: int = 0,
    trust_remote_code: bool = False,
) -> None:
    """Slice the text at *input_path* into prompts and write them as JSONL.

    The text is tokenized once (without special tokens). For each of the
    *num_prompts* records, a start position is drawn at random and the
    sampled number of token ids is taken from there, wrapping around to the
    beginning of the token sequence when the end is reached. The slice is
    decoded back to a string and stored with its sampled output length.

    Raises:
        ValueError: if the loaded text is empty or tokenizes to no tokens.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_name, trust_remote_code=trust_remote_code
    )

    text = load_text(input_path)
    if not text:
        raise ValueError("The text file is empty and cannot be sampled from.")

    token_ids = tokenizer(text, add_special_tokens=False).input_ids
    if not token_ids:
        raise ValueError("Tokenizing the text produced zero tokens; cannot sample.")
    total_tokens = len(token_ids)

    # Both generators are seeded identically: NumPy samples the lengths,
    # the stdlib generator picks the slice start positions.
    length_rng = np.random.default_rng(seed)
    start_rng = random.Random(seed)

    input_lens, output_lens, _ = get_sampling_params(
        length_rng,
        num_prompts,
        range_ratio,
        input_len,
        output_len,
        tokenizer,
    )

    records: list[dict[str, object]] = []
    for idx in range(num_prompts):
        prompt_len = int(input_lens[idx])
        completion_len = int(output_lens[idx])

        # Pick a random start and read ``prompt_len`` ids, cycling past the end.
        first = start_rng.randint(0, total_tokens - 1)
        slice_ids = [
            token_ids[(first + step) % total_tokens] for step in range(prompt_len)
        ]

        records.append(
            {
                "prompt": tokenizer.decode(slice_ids, skip_special_tokens=False),
                "output_tokens": completion_len,
            }
        )

    with open(output_path, "w", encoding="utf-8") as out_file:
        out_file.writelines(
            json.dumps(entry, ensure_ascii=False) + "\n" for entry in records
        )

    logger.info(
        "Wrote %d prompts to %s",
        len(records),
        output_path,
    )
def main(argv: list[str] | None = None) -> None:
parser = argparse.ArgumentParser(
description="Convert a plain-text file into a JSONL dataset "
"for CustomDataset (--dataset-name custom).",
)
parser.add_argument(
"--input",
required=True,
help="Path or URL to the source text file.",
)
parser.add_argument(
"--output",
required=True,
help="Path for the output JSONL file.",
)
parser.add_argument(
"--tokenizer",
required=True,
help="HuggingFace tokenizer name or path.",
)
parser.add_argument(
"--num-prompts",
type=int,
default=1000,
help="Number of prompt samples to generate (default: 1000).",
)
parser.add_argument(
"--input-len",
type=int,
default=1024,
help="Target number of input tokens per prompt (default: 1024).",
)
parser.add_argument(
"--output-len",
type=int,
default=128,
help="Target number of output tokens per prompt (default: 128).",
)
parser.add_argument(
"--range-ratio",
type=str,
default="0.0",
help="Range ratio for input/output length sampling (default: 0.0). "
"A single float applies to both ISL and OSL. "
'A JSON dict like \'{"input": 0.3, "output": 0.5}\' sets them '
"independently. Values must be in [0, 1).",
)
parser.add_argument(
"--seed",
type=int,
default=0,
help="Random seed for reproducibility (default: 0).",
)
parser.add_argument(
"--trust-remote-code",
action="store_true",
help="Trust remote code from HuggingFace.",
)
args = parser.parse_args(argv)
logging.basicConfig(level=logging.INFO)
# Parse --range-ratio: try float first, then JSON dict.
range_ratio: RangeRatio
try:
range_ratio = float(args.range_ratio)
except ValueError:
import json as _json
range_ratio = _json.loads(args.range_ratio)
create_txt_slices_jsonl(
input_path=args.input,
output_path=args.output,
tokenizer_name=args.tokenizer,
num_prompts=args.num_prompts,
input_len=args.input_len,
output_len=args.output_len,
range_ratio=range_ratio,
seed=args.seed,
trust_remote_code=args.trust_remote_code,
)
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Shared utilities for benchmark dataset sampling.
"""
import logging
import math
import numpy as np
from vllm.tokenizers import TokenizerLike
logger = logging.getLogger(__name__)
# Type alias: a single float applies to both ISL and OSL; a dict allows
# specifying them independently via ``{"input": …, "output": …}``.
RangeRatio = float | dict[str, float]
def _resolve_range_ratios(
range_ratio: RangeRatio,
) -> tuple[float, float]:
"""Return ``(input_range_ratio, output_range_ratio)`` from *range_ratio*.
*range_ratio* is either a single float (used for both input and output)
or a dict with ``"input"`` and ``"output"`` keys.
"""
if isinstance(range_ratio, dict):
try:
return float(range_ratio["input"]), float(range_ratio["output"])
except KeyError as exc:
raise ValueError(
"When range_ratio is a dict it must contain 'input' and "
f"'output' keys, got: {sorted(range_ratio)}"
) from exc
ratio = float(range_ratio)
return ratio, ratio
def get_sampling_params(
    rng: np.random.Generator,
    num_requests: int,
    range_ratio: RangeRatio,
    input_len: int,
    output_len: int,
    tokenizer: TokenizerLike,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Draw per-request input lengths, output lengths and vocab offsets.

    Each length is sampled uniformly from an inclusive integer interval
    ``[floor(len * (1 - ratio)), ceil(len * (1 + ratio))]``. *range_ratio*
    is either one ``float`` used for both lengths or a ``dict`` with
    ``"input"`` and ``"output"`` keys.

    The tokenizer's special-token count is subtracted from ``input_len``
    (never going below zero) before the input interval is computed. Both
    output bounds are raised to at least 1. Offsets are drawn from
    ``[0, tokenizer.vocab_size)``.

    Returns:
        (input_lens, output_lens, offsets) – three 1-D ``np.ndarray`` of
        shape ``(num_requests,)``.

    Raises:
        ValueError: if either ratio is outside ``[0, 1)`` or a computed
            interval has its lower bound above its upper bound.
    """
    input_range_ratio, output_range_ratio = _resolve_range_ratios(range_ratio)
    for label, ratio in (
        ("input", input_range_ratio),
        ("output", output_range_ratio),
    ):
        if not (0.0 <= ratio < 1.0):
            raise ValueError(f"{label}_range_ratio must be in [0, 1).")

    # Leave room for the special tokens counted against the input length.
    special_count = int(tokenizer.num_special_tokens_to_add())
    real_input_len = max(0, int(input_len) - special_count)

    input_shrink = 1 - input_range_ratio
    input_grow = 1 + input_range_ratio
    input_low = math.floor(real_input_len * input_shrink)
    input_high = math.ceil(real_input_len * input_grow)

    output_shrink = 1 - output_range_ratio
    output_grow = 1 + output_range_ratio
    # Clamp both output bounds to 1 so that a request never asks for
    # zero output tokens.
    output_low = max(math.floor(output_len * output_shrink), 1)
    output_high = max(math.ceil(output_len * output_grow), 1)

    if input_high < input_low:
        raise ValueError(
            f"Invalid input sampling interval: low={input_low} > high={input_high}"
        )
    if output_high < output_low:
        raise ValueError(
            f"Invalid output sampling interval: low={output_low} > high={output_high}"
        )

    logger.info(
        "Sampling input_len from [%s, %s] and output_len from [%s, %s]",
        input_low,
        input_high,
        output_low,
        output_high,
    )

    # ``Generator.integers`` excludes the upper bound, hence the ``+ 1``.
    # The draw order (input, output, offsets) fixes the seeded results.
    input_lens = rng.integers(input_low, input_high + 1, size=num_requests)
    output_lens = rng.integers(output_low, output_high + 1, size=num_requests)
    offsets = rng.integers(0, tokenizer.vocab_size, size=num_requests)
    return input_lens, output_lens, offsets
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment