# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Utilities for generating filler text that tokenizes predictably."""

import random

# Common English words that each tokenize to a single BPE token on most LLMs.
# Order matters: a seeded ``random.Random`` picks by index, so reordering or
# editing this list changes the text produced for a given seed.
ENGLISH_VOCAB = [
    "the", "be", "to", "of", "and", "a", "in", "that", "have", "I",
    "it", "for", "not", "on", "with", "he", "as", "you", "do", "at",
    "this", "but", "his", "by", "from", "they", "we", "say", "her", "she",
    "or", "an", "will", "my", "one", "all", "would", "there", "their", "what",
    "so", "up", "out", "if", "about", "who", "get", "which", "go", "me",
    "when", "make", "can", "like", "time", "no", "just", "him", "know", "take",
    "people", "into", "year", "your", "good", "some", "could", "them", "see", "other",
    "than", "then", "now", "look", "only", "come", "its", "over", "think", "also",
    "back", "after", "use", "two", "how", "our", "work", "first", "well", "way",
    "even", "new", "want", "because", "any", "these", "give", "day", "most", "us",
    "great", "world", "still", "own", "find", "here", "thing", "many", "long", "hand",
    "high", "keep", "place", "start", "might", "old", "home", "big", "end", "while",
    "last", "turn", "ask", "need", "too", "feel", "seem", "call", "head", "put",
    "lot", "run", "every", "play", "small", "set", "live", "try", "tell", "few",
    "part", "change", "help", "show", "house", "both", "side", "point", "such", "name",
    "each", "right", "move", "must", "real", "left", "same", "much", "open", "near",
    "line", "build", "power", "water", "city", "tree", "earth", "plan", "food", "dark",
    "cold", "sure", "car", "face", "nice", "state", "fact", "night", "hard", "read",
    "idea", "stand", "class", "body", "book", "word", "best", "done", "case", "four",
    "fire", "front", "rest", "game", "war", "air", "eye", "true", "top", "area",
    "boy", "girl", "color", "oil", "song", "note", "low", "bed",
]


def generate_filler(rng: random.Random, num_tokens: int) -> str:
    """Return ~num_tokens worth of space-separated common English words.

    Exactly ``num_tokens`` words are drawn from ``ENGLISH_VOCAB``, independently
    and with replacement, using one ``rng.choice`` call per word. The result is
    therefore reproducible for a given generator state. The token count is only
    approximate because it depends on the tokenizer applied downstream.

    Args:
        rng: Random generator used for every draw; its state is advanced.
        num_tokens: Number of words to emit. Zero or a negative value yields
            an empty string.

    Returns:
        The chosen words joined by single spaces, with no leading or trailing
        whitespace.
    """
    return " ".join(rng.choice(ENGLISH_VOCAB) for _ in range(num_tokens))