import os
import sys

# Append the parent of this script's directory to the module search path
# before the `megatron` imports below run, so they can resolve when this file
# is executed directly as a script.
# NOTE(review): assumes the `megatron` package lives in that parent
# directory -- confirm against the repository layout.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
                                             os.path.pardir)))

from megatron.indexer import IndexBuilder
from megatron.initialize import initialize_megatron


def main():
    """Create a BlockData data structure by running an IndexBuilder over an ICT Dataset.

    Initializes Megatron (defaulting ``tokenizer_type`` to
    ``BertWordPieceLowerCase``), constructs an ``IndexBuilder`` and calls its
    ``build_and_save_index()`` method.

    - Include all args needed for initial model specification

    Other key args:
        --block-data-path: path to write to
        --ict-load or --realm-load: path to checkpoint with which to embed
        --data-path and --titles-data-path: paths for dataset
        --indexer-log-interval: reporting interval
        --indexer-batch-size: size specific for indexer jobs

    Check README.md for example script
    """
    # No extra argument provider is registered; only the tokenizer default is
    # overridden here.
    initialize_megatron(extra_args_provider=None,
                        args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'})

    index_builder = IndexBuilder()
    # A stray `sys.exit()` used to sit between construction and this call,
    # which terminated the script before any index was built or saved.
    index_builder.build_and_save_index()


# Run the indexing job only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()