create_doc_index.py 1.09 KB
Newer Older
Mostofa Patwary's avatar
Mostofa Patwary committed
1
import os
Neel Kant's avatar
Neel Kant committed
2
import sys
Mostofa Patwary's avatar
Mostofa Patwary committed
3
4
# Append the parent of this script's directory to the module search path.
# NOTE(review): presumably this lets the `megatron` imports below resolve
# when the script is run from a source checkout -- confirm.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
                                             os.path.pardir)))
Neel Kant's avatar
Neel Kant committed
5

6
from megatron import print_rank_0
Neel Kant's avatar
Neel Kant committed
7
8
9
10
11
12
13
14
15
16
17
18
from megatron.indexer import IndexBuilder
from megatron.initialize import initialize_megatron


def main():
    """Build a document index with an IndexBuilder and save it.

    Initializes Megatron (defaulting ``tokenizer_type`` to
    ``BertWordPieceLowerCase``), then creates a BlockData data structure by
    running an IndexBuilder over an ICT Dataset.

    Include all args needed for the initial model specification.

    Other key args:
        --block-data-path: path to write to
        --ict-load or --realm-load: path to checkpoint with which to embed
        --data-path and --titles-data-path: paths for dataset
        --indexer-log-interval: reporting interval
        --indexer-batch-size: size specific for indexer jobs

    Check README.md for example script
    """
    # Megatron must be initialized before the IndexBuilder is constructed.
    initialize_megatron(extra_args_provider=None,
                        args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'})
    index_builder = IndexBuilder()
    index_builder.build_and_save_index()
    print_rank_0("Build and save indices: done!")
Neel Kant's avatar
Neel Kant committed
30
31
32
33

# Run the index build only when executed as a script, not when imported.
if __name__ == "__main__":
    main()