# test_hasher.py

# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest
from data_generator.hasher import texts_to_hashes
from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers
from transformers import PreTrainedTokenizerFast


@pytest.fixture(scope="module")
def dummy_tokenizer():
    """Provide a tiny word-level tokenizer shared by every test in this module.

    The vocabulary is the 26 lowercase ASCII letters (ids 0-25) followed by
    ``[UNK]`` (id 26). Input is NFD-normalized and lowercased, then split with
    the ``Whitespace`` pre-tokenizer before the word-level lookup.

    Returns:
        A ``PreTrainedTokenizerFast`` wrapping the tokenizer built here, with
        ``[UNK]``, ``[PAD]``, ``[BOS]`` and ``[EOS]`` declared as its special
        tokens.
    """
    # Letters first, then the unknown marker, so ids follow list position.
    alphabet = [*map(chr, range(ord("a"), ord("z") + 1)), "[UNK]"]
    token_ids = dict(zip(alphabet, range(len(alphabet))))

    backend = Tokenizer(models.WordLevel(vocab=token_ids, unk_token="[UNK]"))
    backend.normalizer = normalizers.Sequence(
        [normalizers.NFD(), normalizers.Lowercase()]
    )
    backend.pre_tokenizer = pre_tokenizers.Whitespace()
    backend.decoder = decoders.WordPiece(prefix="")

    special_tokens = {
        "unk_token": "[UNK]",
        "pad_token": "[PAD]",
        "bos_token": "[BOS]",
        "eos_token": "[EOS]",
    }
    return PreTrainedTokenizerFast(tokenizer_object=backend, **special_tokens)


def test_texts_to_hashes_blocks(dummy_tokenizer):
    """Check the block ids ``texts_to_hashes`` assigns with ``block_size=4``.

    Each building block below is four whitespace-separated letters, which the
    fixture tokenizer is expected to turn into four tokens (one block).
    """
    block_a, block_b, block_c = "a b c d", "e f g h", "i j k l"

    texts = [
        block_a,
        f"{block_a} {block_b}",
        f"{block_a} {block_c}",
        f"{block_b} {block_a}",
    ]
    # Texts that start with the same block share their first id (0). The last
    # text puts block_a after a different leading block and expects a fresh id
    # (4) for it rather than a reuse of 0.
    expected = [[0], [0, 1], [0, 2], [3, 4]]

    result = texts_to_hashes(dummy_tokenizer, texts, block_size=4)

    assert result == expected, f"Expected {expected}, got {result}"