Commit 1eccfc94 authored by Neel Kant

Add test_retriever.sh

parent d7022c72
@@ -118,10 +118,11 @@ class HashedIndex(object):
 def test_retriever():
     initialize_megatron(extra_args_provider=None,
                         args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'})
+    args = get_args()
     model = load_ict_checkpoint()
     model.eval()
     dataset = get_ict_dataset()
-    hashed_index = HashedIndex.load_from_file('block_hash_data.pkl')
+    hashed_index = HashedIndex.load_from_file(args.hash_data_path)
     retriever = REALMRetriever(model, dataset, hashed_index)
     strs = [
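HashedIndex.load_from_file itself is not part of this hunk; as context for the change above, here is a minimal sketch of a pickle-backed loader of this shape. The method body below is an assumption for illustration, not the repository's code:

import pickle

class HashedIndex(object):
    @classmethod
    def load_from_file(cls, path):
        # Assumed sketch: restore a previously pickled HashedIndex,
        # e.g. the ict_best.pkl built ahead of time and now supplied
        # via args.hash_data_path instead of a hardcoded filename.
        with open(path, 'rb') as f:
            return pickle.load(f)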
@@ -298,6 +298,8 @@ def _add_data_args(parser):
                        help='Path to combined dataset to split.')
     group.add_argument('--titles-data-path', type=str, default=None,
                        help='Path to titles dataset used for ICT')
+    group.add_argument('--hash-data-path', type=str, default=None,
+                       help='Path to pickled HashedIndex data structure')
     group.add_argument('--split', type=str, default='969, 30, 1',
                        help='Comma-separated list of proportions for training,'
                        ' validation, and test split. For example the split '
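A quick, self-contained check of how the new flag reaches test_retriever through get_args(). Plain argparse stands in here; Megatron's parser wiring is reduced to the one argument group:

import argparse

parser = argparse.ArgumentParser()
group = parser.add_argument_group(title='data')
group.add_argument('--hash-data-path', type=str, default=None,
                   help='Path to pickled HashedIndex data structure')

args = parser.parse_args(['--hash-data-path', '/tmp/ict_best.pkl'])
# argparse maps dashes to underscores, hence args.hash_data_path.
assert args.hash_data_path == '/tmp/ict_best.pkl'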
@@ -80,7 +80,7 @@ class InverseClozeDataset(Dataset):
     def decode_tokens(self, token_ids):
         tokens = self.tokenizer.tokenizer.convert_ids_to_tokens(token_ids)
-        return ' '.join(tokens)
+        return ' '.join(token for token in tokens if token != '[PAD]')

     def get_block(self, start_idx, end_idx, doc_idx):
         """Get the IDs for an evidence block plus the title of the corresponding document"""
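The decode_tokens change is easy to verify in isolation; the token list below stands in for the tokenizer's convert_ids_to_tokens output:

tokens = ['the', 'cat', 'sat', '[PAD]', '[PAD]']

before = ' '.join(tokens)
after = ' '.join(token for token in tokens if token != '[PAD]')

print(before)  # the cat sat [PAD] [PAD]
print(after)   # the cat sat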
@@ -295,9 +295,9 @@ class REALMRetriever(MegatronModule):
         query_pad_mask = torch.cuda.LongTensor(np.array(query_pad_mask).reshape(1, -1))

         top5_block_tokens, _ = self.retrieve_evidence_blocks(query_tokens, query_pad_mask)
-        for i, block in enumerate(top5_block_tokens):
+        for i, block in enumerate(top5_block_tokens[0]):
             block_text = self.ict_dataset.decode_tokens(block)
-            print(' > Block {}: {}'.format(i, block_text))
+            print('\n > Block {}: {}'.format(i, block_text))

     def retrieve_evidence_blocks(self, query_tokens, query_pad_mask):
         query_embeds = self.ict_model.module.module.embed_query(query_tokens, query_pad_mask)
@@ -321,6 +321,7 @@ class REALMRetriever(MegatronModule):
             all_top5_tokens.append(np.array(top5_tokens))
             all_top5_pad_masks.append(np.array(top5_pad_masks))

+        # [batch_size x 5 x seq_length]
         return np.array(all_top5_tokens), np.array(all_top5_pad_masks)
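Why the loop now indexes top5_block_tokens[0]: per the new comment, retrieve_evidence_blocks returns arrays shaped [batch_size x 5 x seq_length], and this caller reshapes a single query to batch size 1, so its five evidence blocks sit in row 0. A shapes-only sketch with numpy (288 mirrors --seq-length in the script below):

import numpy as np

batch_size, seq_length = 1, 288
top5_block_tokens = np.zeros((batch_size, 5, seq_length), dtype=np.int64)

# The old loop iterated the batch axis (a single item); the fixed loop
# iterates the five retrieved blocks for that one query.
for i, block in enumerate(top5_block_tokens[0]):
    assert block.shape == (seq_length,)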
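# test_retriever.sh: launch hashed_index.py on a single interactive GPU
# node via mp_launch. The ICT checkpoint (--ict-load), Wikipedia text and
# titles (--data-path, --titles-data-path), and the pickled HashedIndex
# (--hash-data-path) are all read from the cluster mounts passed to submit_job.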
COMMAND="/home/scratch.gcf/adlr-utils/release/cluster-interface/latest/mp_launch python hashed_index.py \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--batch-size 8 \
--checkpoint-activations \
--seq-length 288 \
--max-position-embeddings 288 \
--train-iters 100000 \
--load /home/dcg-adlr-nkant-output.cosmos1203/chkpts/realm_debug \
--ict-load /home/dcg-adlr-nkant-output.cosmos1203/chkpts/ict_best \
--save /home/dcg-adlr-nkant-output.cosmos1203/chkpts/realm_debug \
--data-path /home/universal-lm-data.cosmos549/datasets/wiki-indexed/wikipedia_lines \
--titles-data-path /home/universal-lm-data.cosmos549/datasets/wiki-indexed/wikipedia_lines-titles \
--hash-data-path /home/dcg-adlr-nkant-data.cosmos1202/hash_data/ict_best.pkl \
--vocab-file /home/universal-lm-data.cosmos549/scratch/mshoeybi/data/albert/vocab.txt \
--split 58,1,1 \
--distributed-backend nccl \
--lr 0.0001 \
--num-workers 2 \
--lr-decay-style linear \
--warmup .01 \
--save-interval 3000 \
--fp16 \
--adlr-autoresume \
--adlr-autoresume-interval 100"
submit_job --image 'http://gitlab-master.nvidia.com/adlr/megatron-lm/megatron:20.03' --mounts /home/universal-lm-data.cosmos549,/home/dcg-adlr-nkant-source.cosmos1204,/home/dcg-adlr-nkant-data.cosmos1202,/home/dcg-adlr-nkant-output.cosmos1203,/home/nkant --name test_retriever --partition interactive --gpu 1 --nodes 1 --autoresume_timer 300 -c "${COMMAND}"