# create_doc_index.py
import sys
sys.path.append('../')

from megatron.indexer import IndexBuilder
from megatron.initialize import initialize_megatron


def main():
    """Create a BlockData data structure by running an IndexBuilder over an ICT dataset.

    Include all args needed for the initial model specification.

    Other key args:
        --block-data-path: path to write to
        --ict-load or --realm-load: path to checkpoint with which to embed
        --data-path and --titles-data-path: paths for dataset
        --indexer-log-interval: reporting interval
        --indexer-batch-size: size specific for indexer jobs

    Check README.md for an example script.
    """
    # No extra argument provider is registered; the tokenizer type is the
    # only default supplied from here.
    defaults = {'tokenizer_type': 'BertWordPieceLowerCase'}
    initialize_megatron(extra_args_provider=None, args_defaults=defaults)

    IndexBuilder().build_and_save_index()


# Run the index build only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()