test_deduplication.py 1.67 KB
Newer Older
chenzk's avatar
v1.0  
chenzk committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromDicts, MinHashDedup


def test_minhash_deduplication() -> None:
    """Run ``MinHashDedup`` inside a pipeline and check how many rows survive.

    Five short sentences are repeated until the dataset holds 1000 rows. The
    rows are loaded in batches of 500 and passed to a ``MinHashDedup`` step
    configured with ``tokenizer="ngrams"``, ``n=2``, ``threshold=0.9`` and
    ``storage="disk"``. After the run, exactly four rows of the
    ``default``/``train`` split must have a truthy
    ``keep_row_after_minhash_filtering`` value.
    """
    total_rows = 1000
    rows_per_batch = 500
    sentences = (
        "This is a test document.",
        "This document is a test.",
        "Test document for duplication.",
        "Document for duplication test.",
        "This is another unique document.",
    )
    # Repeat the five base records until the dataset reaches ``total_rows``.
    records = [{"text": sentence} for sentence in sentences] * (total_rows // 5)

    with Pipeline() as pipeline:
        loader = LoadDataFromDicts(data=records, batch_size=rows_per_batch)
        dedup_step = MinHashDedup(
            tokenizer="ngrams",
            n=2,
            threshold=0.9,
            storage="disk",
            input_batch_size=rows_per_batch,
        )
        # Wire the loader's output into the deduplication step.
        loader >> dedup_step

    # Disable the cache so the pipeline is actually executed on every run.
    result = pipeline.run(use_cache=False)
    train_split = result["default"]["train"]
    kept_rows = train_split.filter(
        lambda row: row["keep_row_after_minhash_filtering"]
    )
    assert len(kept_rows) == 4


# Allow running this test directly as a script, without a test runner.
if __name__ == "__main__":
    test_minhash_deduplication()