Commit 1eccfc94 authored by Neel Kant

Add test_retriever.sh

parent d7022c72
@@ -118,10 +118,11 @@ class HashedIndex(object):
 def test_retriever():
     initialize_megatron(extra_args_provider=None,
                         args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'})
+    args = get_args()
     model = load_ict_checkpoint()
     model.eval()
     dataset = get_ict_dataset()
-    hashed_index = HashedIndex.load_from_file('block_hash_data.pkl')
+    hashed_index = HashedIndex.load_from_file(args.hash_data_path)
     retriever = REALMRetriever(model, dataset, hashed_index)
     strs = [
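HashedIndex.load_from_file itself is not part of this hunk; as context for the change above, here is a minimal sketch of a pickle-backed loader of this shape. The method body below is an assumption for illustration, not the repository's code:

import pickle

class HashedIndex(object):
    @classmethod
    def load_from_file(cls, path):
        # Assumed sketch: restore a previously pickled HashedIndex,
        # e.g. the ict_best.pkl built ahead of time and now supplied
        # via args.hash_data_path instead of a hardcoded filename.
        with open(path, 'rb') as f:
            return pickle.load(f)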
@@ -298,6 +298,8 @@ def _add_data_args(parser):
                        help='Path to combined dataset to split.')
     group.add_argument('--titles-data-path', type=str, default=None,
                        help='Path to titles dataset used for ICT')
+    group.add_argument('--hash-data-path', type=str, default=None,
+                       help='Path to pickled HashedIndex data structure')
     group.add_argument('--split', type=str, default='969, 30, 1',
                        help='Comma-separated list of proportions for training,'
                        ' validation, and test split. For example the split '
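A quick, self-contained check of how the new flag reaches test_retriever through get_args(). Plain argparse stands in here; Megatron's parser wiring is reduced to the one argument group:

import argparse

parser = argparse.ArgumentParser()
group = parser.add_argument_group(title='data')
group.add_argument('--hash-data-path', type=str, default=None,
                   help='Path to pickled HashedIndex data structure')

args = parser.parse_args(['--hash-data-path', '/tmp/ict_best.pkl'])
# argparse maps dashes to underscores, hence args.hash_data_path.
assert args.hash_data_path == '/tmp/ict_best.pkl'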
@@ -80,7 +80,7 @@ class InverseClozeDataset(Dataset):
     def decode_tokens(self, token_ids):
         tokens = self.tokenizer.tokenizer.convert_ids_to_tokens(token_ids)
-        return ' '.join(tokens)
+        return ' '.join(token for token in tokens if token != '[PAD]')

     def get_block(self, start_idx, end_idx, doc_idx):
         """Get the IDs for an evidence block plus the title of the corresponding document"""
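The decode_tokens change is easy to verify in isolation; the token list below stands in for the tokenizer's convert_ids_to_tokens output:

tokens = ['the', 'cat', 'sat', '[PAD]', '[PAD]']

before = ' '.join(tokens)
after = ' '.join(token for token in tokens if token != '[PAD]')

print(before)  # the cat sat [PAD] [PAD]
print(after)   # the cat sat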
@@ -295,9 +295,9 @@ class REALMRetriever(MegatronModule):
         query_pad_mask = torch.cuda.LongTensor(np.array(query_pad_mask).reshape(1, -1))

         top5_block_tokens, _ = self.retrieve_evidence_blocks(query_tokens, query_pad_mask)
-        for i, block in enumerate(top5_block_tokens):
+        for i, block in enumerate(top5_block_tokens[0]):
             block_text = self.ict_dataset.decode_tokens(block)
-            print(' > Block {}: {}'.format(i, block_text))
+            print('\n > Block {}: {}'.format(i, block_text))

     def retrieve_evidence_blocks(self, query_tokens, query_pad_mask):
         query_embeds = self.ict_model.module.module.embed_query(query_tokens, query_pad_mask)
@@ -321,6 +321,7 @@ class REALMRetriever(MegatronModule):
             all_top5_tokens.append(np.array(top5_tokens))
             all_top5_pad_masks.append(np.array(top5_pad_masks))

+        # [batch_size x 5 x seq_length]
         return np.array(all_top5_tokens), np.array(all_top5_pad_masks)
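Why the loop now indexes top5_block_tokens[0]: per the new comment, retrieve_evidence_blocks returns arrays shaped [batch_size x 5 x seq_length], and this caller reshapes a single query to batch size 1, so its five evidence blocks sit in row 0. A shapes-only sketch with numpy (288 mirrors --seq-length in the script below):

import numpy as np

batch_size, seq_length = 1, 288
top5_block_tokens = np.zeros((batch_size, 5, seq_length), dtype=np.int64)

# The old loop iterated the batch axis (a single item); the fixed loop
# iterates the five retrieved blocks for that one query.
for i, block in enumerate(top5_block_tokens[0]):
    assert block.shape == (seq_length,)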
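# test_retriever.sh: launch hashed_index.py on a single interactive GPU
# node via mp_launch. The ICT checkpoint (--ict-load), Wikipedia text and
# titles (--data-path, --titles-data-path), and the pickled HashedIndex
# (--hash-data-path) are all read from the cluster mounts passed to submit_job.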
COMMAND="/home/scratch.gcf/adlr-utils/release/cluster-interface/latest/mp_launch python hashed_index.py \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--batch-size 8 \
--checkpoint-activations \
--seq-length 288 \
--max-position-embeddings 288 \
--train-iters 100000 \
--load /home/dcg-adlr-nkant-output.cosmos1203/chkpts/realm_debug \
--ict-load /home/dcg-adlr-nkant-output.cosmos1203/chkpts/ict_best \
--save /home/dcg-adlr-nkant-output.cosmos1203/chkpts/realm_debug \
--data-path /home/universal-lm-data.cosmos549/datasets/wiki-indexed/wikipedia_lines \
--titles-data-path /home/universal-lm-data.cosmos549/datasets/wiki-indexed/wikipedia_lines-titles \
--hash-data-path /home/dcg-adlr-nkant-data.cosmos1202/hash_data/ict_best.pkl \
--vocab-file /home/universal-lm-data.cosmos549/scratch/mshoeybi/data/albert/vocab.txt \
--split 58,1,1 \
--distributed-backend nccl \
--lr 0.0001 \
--num-workers 2 \
--lr-decay-style linear \
--warmup .01 \
--save-interval 3000 \
--fp16 \
--adlr-autoresume \
--adlr-autoresume-interval 100"
submit_job --image 'http://gitlab-master.nvidia.com/adlr/megatron-lm/megatron:20.03' --mounts /home/universal-lm-data.cosmos549,/home/dcg-adlr-nkant-source.cosmos1204,/home/dcg-adlr-nkant-data.cosmos1202,/home/dcg-adlr-nkant-output.cosmos1203,/home/nkant --name test_retriever --partition interactive --gpu 1 --nodes 1 --autoresume_timer 300 -c "${COMMAND}"