"""
A script which prints the training data according to the given sample index.

Note that it's crucial to pass exactly the same corresponding arguments as in the training script, including the seed. Only then will the sample sequence read from the Megatron-generated .npy index files match the one used during training.

Here is how to decipher the index file name:

meg-gpt2_oscar-combined_text_document_train_indexmap_100ns_1024sl_42s_sample_idx.npy

100ns  = --train-samples 100
1024sl = --seq-length 1024
42s    = --seed 42

So these 3 have to match the training to get the correct output from this script.
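
For instance, here is a minimal sketch of a hypothetical helper (not part of this script) that pulls those three fields out of such an index file name so they can be checked against the training arguments:

```
def parse_index_map_name(fname):
    # hypothetical helper: "..._100ns_1024sl_42s_sample_idx.npy" -> (100, 1024, 42)
    # (str.removesuffix needs Python 3.9+)
    parts = fname.rsplit("_", 5)
    train_samples = int(parts[1].removesuffix("ns"))
    seq_length    = int(parts[2].removesuffix("sl"))
    seed          = int(parts[3].removesuffix("s"))
    return train_samples, seq_length, seed

# parse_index_map_name("meg-gpt2_oscar-combined_text_document_train_indexmap_100ns_1024sl_42s_sample_idx.npy")
# -> (100, 1024, 42)
```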

If you're working on the same machine where the indices were already generated during training, you can also sanity-check that this script doesn't regenerate the 3 train .npy files (it will still generate the 3 valid and 3 test .npy files, since we feed it a hardcoded setup of size 0 for both the valid and test datasets).

`--sample-id-range` is for consumed samples, so if the gap of interest is between these 2 iterations:

 iteration     3792/  159576 | consumed samples:        74096 | elapsed time per iteration (ms): 16474.9 | learning rate: 2.052E-05 | global batch size:    32 | lm loss: 6.404737E+00 | loss scale: 32768.0 | grad norm: 214321.235 | num zeros: 0.0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 iteration     3848/  159576 | consumed samples:        75888 | elapsed time per iteration (ms): 14308.9 | learning rate: 2.102E-05 | global batch size:    32 | lm loss: 6.452862E+00 | loss scale: 32768.0 | grad norm: 262044.694 | num zeros: 0.0 | number of skipped iterations:   0 | number of nan iterations:   0 |

You'd then use:

`--sample-id-range 74096 75888`

The larger the global batch size, the larger the consumed-samples gap between two iterations will be.
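
As a quick sanity check of that arithmetic, using the numbers from the log excerpt above:

```
# within this window the global batch size is a constant 32, so 56 iterations
# account for the full 1792-sample gap (with batch-size ramp-up the
# per-iteration increment would vary instead)
iterations = 3848 - 3792                 # 56
global_batch_size = 32
assert iterations * global_batch_size == 75888 - 74096   # 1792 samples
```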

Below is an example bash script to print the data in sample index range 5-15:

```
source $six_ALL_CCFRWORK/code/tr1-13B/bigscience/train/tr1-13B-base/start-tr1-13B
MEGATRON_DEEPSPEED_REPO=$six_ALL_CCFRWORK/code/tr1-13B/Megatron-DeepSpeed-tr1-13B/

cd $MEGATRON_DEEPSPEED_REPO

VOCAB_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-vocab.json
MERGE_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-merges.txt
DATA_PATH=$six_ALL_CCFRWORK/datasets-custom/oscar-en/meg-gpt2_text_document

SEQ_LEN=2048

python tools/sample_idxs_to_text.py \
    --print-text \
    --sample-id-range 5 15 \
    --seed 42 \
    --train-samples 100 \
    --seq-length $SEQ_LEN \
    --data-path $DATA_PATH \
    --data-impl mmap \
    --tokenizer-type GPT2BPETokenizer \
    --vocab-file $VOCAB_FILE \
    --merge-file $MERGE_FILE
```

If you want tokens instead of text, replace `--print-text` with `--print-tokens` (you can also pass both). If you want full token dumps, add `--all-tokens` as well.

If you want the data written to a file add:

    --output-file output.txt

This script can be extended to support the valid and test datasets as well, but currently it ignores those.

Again, the 3 key args to get right are:

    --seed 42 \
    --train-samples 100 \
    --seq-length $SEQ_LEN \

"""

import sys
import torch

from megatron import get_args
from megatron import get_tokenizer
from megatron import initialize_megatron
from megatron.data.data_samplers import build_pretraining_data_loader
from megatron.data.gpt_dataset import build_train_valid_test_datasets
from megatron.training import update_train_iters


def _add_sample_idx_args(parser):
    group = parser.add_argument_group(title='Get text from sample idxs.')
    group.add_argument('--sample-id-range', type=int, nargs=2, required=True,
                       help='Start and end sample ids (in consumed samples) to dump. ex) --sample-id-range 1024 2048')
    group.add_argument('--all-tokens', action='store_true', help='Whether to dump all tokens per record')
    group.add_argument('--print-tokens', action='store_true', help='Whether to print tokens')
    group.add_argument('--print-text', action='store_true', help='Whether to print text')
    group.add_argument('--output-file', help='path to file if the dump should be saved into a file')

    return parser


if __name__ == "__main__":

    # megatron requires args which are irrelevant to the task at hand, but w/o which it won't start,
    # so prefill those here rather than requiring the user to enter them.
    required_irrelevant_args = """
    --num-layers 1
    --hidden-size 1
    --num-attention-heads 1
    --max-position-embeddings 1000000
    --eval-interval 1
    --eval-iters 1
    --micro-batch-size 1
    --global-batch-size 1
    """.split()
    sys.argv.extend(required_irrelevant_args)

    initialize_megatron(extra_args_provider=_add_sample_idx_args)

    args = get_args()
    tokenizer = get_tokenizer()
    update_train_iters(args)

    if not (args.print_tokens or args.print_text):
        raise ValueError("Need to specify either --print_tokens or --print_text or both")

    if args.all_tokens and not args.print_tokens:
        raise ValueError("--all_tokens requires --print_tokens")

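    # only the train dataset is needed; valid and test are requested with 0 samples
    # (see the module docstring), so their .npy index files will still be regenerated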
    train_ds, _, _ = build_train_valid_test_datasets(
        data_prefix=args.data_path,
        data_impl=args.data_impl,
        splits_string=args.split,
        train_valid_test_num_samples=[args.train_samples, 0, 0],
        seq_length=args.seq_length,
        seed=args.seed,
        skip_warmup=(not args.mmap_warmup)
    )

    # fast forward to where we want to start sampling
    train_dataloader = build_pretraining_data_loader(train_ds, args.sample_id_range[0])
    data_iterator = iter(train_dataloader)

    if args.all_tokens:
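        # raise torch's print threshold so long token tensors are printed in full
        # instead of being elided with '...'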
        torch.set_printoptions(threshold=2**20)

    if args.output_file is not None:
        print(f"*** Saving to {args.output_file}")
        fh = open(args.output_file, "w")
    else:
        print(f"*** Dumping to stdout")

    def write(msg):
        if args.output_file:
            fh.write(msg+"\n")
        else:
            print(msg)

    for i in range(args.sample_id_range[0], args.sample_id_range[1]):
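        # successive next() calls yield samples in order, starting from the
        # sample_id_range[0] that the dataloader was fast-forwarded to above;
        # with --micro-batch-size 1 each batch holds a single sample, hence the [0]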
        tokens = next(data_iterator)["text"][0]

        if args.print_tokens:
            write(f"{i} {tokens}")

        if args.print_text:
            text = tokenizer.detokenize(tokens.tolist())
            write(f"{i} {text}")

    if args.output_file is not None:
        print(f"*** Output saved in {args.output_file}")
        fh.close()

    print(f"*** {args.sample_id_range[1]-args.sample_id_range[0]} records dumped")