# common.py
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Shared constants and utilities for trace converters."""

from aiperf.common.tokenizer import Tokenizer
from aiperf.dataset.synthesis.rolling_hasher import RollingHasher

# Default tokenizer identifier. The value looks like a Hugging Face model ID;
# NOTE(review): confirm how callers resolve this string into a Tokenizer.
DEFAULT_TOKENIZER = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
# Default block size. Presumably the number of tokens per hashed block (the
# `block_size` argument of texts_to_hashes_and_lengths) — verify against callers.
DEFAULT_BLOCK_SIZE = 64


def texts_to_hashes_and_lengths(
    tokenizer: Tokenizer,
    texts: list[str],
    block_size: int,
) -> tuple[list[list[int]], list[int]]:
    """
    Convert texts to hash IDs and token lengths.

    Each text is encoded with ``tokenizer.encode``. Its tokens are split into
    consecutive chunks of at most ``block_size`` tokens (the last chunk may be
    shorter) and the chunks are passed to ``RollingHasher.hash_token_blocks``.
    A single ``RollingHasher`` instance is reused for every text in the call.

    Args:
        tokenizer: Tokenizer used to encode each text into tokens.
        texts: Input texts to convert.
        block_size: Maximum number of tokens per block. Must be positive.

    Returns:
        Tuple of (hash_ids_list, token_lengths) where:
        - hash_ids_list: List of hash ID sequences, one per input text
          (an empty list for a text that encodes to no tokens)
        - token_lengths: List of token counts, one per input text

    Raises:
        ValueError: If ``block_size`` is not positive.
    """
    # Validate up front: a zero step makes range() fail with an unrelated
    # message, and a negative step yields no blocks at all, which would
    # silently report empty hash lists for non-empty texts.
    if block_size <= 0:
        raise ValueError(f"block_size must be a positive integer, got {block_size}")

    hasher = RollingHasher(block_size=block_size)
    hash_results: list[list[int]] = []
    length_results: list[int] = []

    for text in texts:
        tokens = tokenizer.encode(text)
        length_results.append(len(tokens))

        blocks: list[list[int]] = [
            tokens[start : start + block_size]
            for start in range(0, len(tokens), block_size)
        ]
        # Only call the hasher when there is something to hash; a text with
        # no tokens maps to an empty hash list.
        hash_results.append(hasher.hash_token_blocks(blocks) if blocks else [])

    return hash_results, length_results