OpenDAS / Megatron-LM

Commit 1eccfc94, authored Apr 24, 2020 by Neel Kant

    Add test_retriever.sh

parent d7022c72
Showing 5 changed files with 36 additions and 4 deletions:

hashed_index.py                 +2   -1
megatron/arguments.py           +2   -0
megatron/data/ict_dataset.py    +1   -1
megatron/model/bert_model.py    +3   -2
test_retriever.sh               +28  -0
hashed_index.py

@@ -118,10 +118,11 @@ class HashedIndex(object):

 def test_retriever():
     initialize_megatron(extra_args_provider=None,
                         args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'})
     args = get_args()
     model = load_ict_checkpoint()
     model.eval()
     dataset = get_ict_dataset()
-    hashed_index = HashedIndex.load_from_file('block_hash_data.pkl')
+    hashed_index = HashedIndex.load_from_file(args.hash_data_path)
     retriever = REALMRetriever(model, dataset, hashed_index)

     strs = [
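For reference, the help text for the new --hash-data-path flag in the next file describes the argument as a path to a pickled HashedIndex, so load_from_file is presumably a thin pickle wrapper. A minimal sketch under that assumption (the attribute layout and the save_to_file counterpart are hypothetical, not the repository's actual code):

import pickle

class HashedIndex(object):
    """Sketch of a pickle-backed index; the real hashing state is omitted."""

    def save_to_file(self, path):
        # Hypothetical counterpart: persist the index state with pickle.
        with open(path, 'wb') as f:
            pickle.dump(self.__dict__, f)

    @classmethod
    def load_from_file(cls, path):
        # Rebuild an index instance from a pickled state dict.
        with open(path, 'rb') as f:
            state = pickle.load(f)
        index = cls.__new__(cls)  # skip __init__; restore attributes directly
        index.__dict__.update(state)
        return index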
megatron/arguments.py

@@ -298,6 +298,8 @@ def _add_data_args(parser):
                        help='Path to combined dataset to split.')
     group.add_argument('--titles-data-path', type=str, default=None,
                        help='Path to titles dataset used for ICT')
+    group.add_argument('--hash-data-path', type=str, default=None,
+                       help='Path to pickled HashedIndex data structure')
     group.add_argument('--split', type=str, default='969, 30, 1',
                        help='Comma-separated list of proportions for training,'
                             ' validation, and test split. For example the split '
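The new flag follows the surrounding argparse pattern; because argparse converts dashes to underscores, the value surfaces as the args.hash_data_path attribute that test_retriever() reads above. A self-contained illustration (the parser and group here are stand-ins, not Megatron's actual argument setup):

import argparse

parser = argparse.ArgumentParser()
group = parser.add_argument_group('data')
group.add_argument('--hash-data-path', type=str, default=None,
                   help='Path to pickled HashedIndex data structure')

# Dashes in the flag name become underscores on the parsed namespace.
args = parser.parse_args(['--hash-data-path', '/tmp/ict_best.pkl'])
print(args.hash_data_path)  # /tmp/ict_best.pkl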
megatron/data/ict_dataset.py

@@ -80,7 +80,7 @@ class InverseClozeDataset(Dataset):

     def decode_tokens(self, token_ids):
         tokens = self.tokenizer.tokenizer.convert_ids_to_tokens(token_ids)
-        return ' '.join(tokens)
+        return ' '.join(token for token in tokens if token != '[PAD]')

     def get_block(self, start_idx, end_idx, doc_idx):
         """Get the IDs for an evidence block plus the title of the corresponding document"""
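The one-line change filters padding out of the decoded string, so the blocks printed by test_retriever no longer end in runs of [PAD]. A toy demonstration of the before/after behavior (the token list is made up):

tokens = ['the', 'cat', 'sat', '[PAD]', '[PAD]']

print(' '.join(tokens))
# the cat sat [PAD] [PAD]

print(' '.join(token for token in tokens if token != '[PAD]'))
# the cat sat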
megatron/model/bert_model.py

@@ -295,9 +295,9 @@ class REALMRetriever(MegatronModule):
         query_pad_mask = torch.cuda.LongTensor(np.array(query_pad_mask).reshape(1, -1))

         top5_block_tokens, _ = self.retrieve_evidence_blocks(query_tokens, query_pad_mask)
-        for i, block in enumerate(top5_block_tokens):
+        for i, block in enumerate(top5_block_tokens[0]):
             block_text = self.ict_dataset.decode_tokens(block)
-            print(' > Block {}: {}'.format(i, block_text))
+            print('\n> Block {}: {}'.format(i, block_text))

     def retrieve_evidence_blocks(self, query_tokens, query_pad_mask):
         query_embeds = self.ict_model.module.module.embed_query(query_tokens, query_pad_mask)

@@ -321,6 +321,7 @@ class REALMRetriever(MegatronModule):
             all_top5_tokens.append(np.array(top5_tokens))
             all_top5_pad_masks.append(np.array(top5_pad_masks))

+        # [batch_size x 5 x seq_length]
         return np.array(all_top5_tokens), np.array(all_top5_pad_masks)
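The loop fix above is a shape issue: per the new comment, retrieve_evidence_blocks returns arrays of shape [batch_size x 5 x seq_length], so for a single query the outer batch dimension must be indexed away before iterating over the five blocks. A sketch of the indexing (seq_length 288 matches the launch script below; the zero-filled array stands in for real token IDs):

import numpy as np

# Stand-in for the retriever output on a single query: [batch_size, 5, seq_length].
seq_length = 288
top5_block_tokens = np.zeros((1, 5, seq_length), dtype=np.int64)

# Iterating over the array itself would yield one item (the batch dimension);
# indexing [0] walks the five retrieved blocks instead.
for i, block in enumerate(top5_block_tokens[0]):
    assert block.shape == (seq_length,)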
test_retriever.sh (new file, mode 100755)

COMMAND="/home/scratch.gcf/adlr-utils/release/cluster-interface/latest/mp_launch python hashed_index.py \
        --num-layers 12 \
        --hidden-size 768 \
        --num-attention-heads 12 \
        --batch-size 8 \
        --checkpoint-activations \
        --seq-length 288 \
        --max-position-embeddings 288 \
        --train-iters 100000 \
        --load /home/dcg-adlr-nkant-output.cosmos1203/chkpts/realm_debug \
        --ict-load /home/dcg-adlr-nkant-output.cosmos1203/chkpts/ict_best \
        --save /home/dcg-adlr-nkant-output.cosmos1203/chkpts/realm_debug \
        --data-path /home/universal-lm-data.cosmos549/datasets/wiki-indexed/wikipedia_lines \
        --titles-data-path /home/universal-lm-data.cosmos549/datasets/wiki-indexed/wikipedia_lines-titles \
        --hash-data-path /home/dcg-adlr-nkant-data.cosmos1202/hash_data/ict_best.pkl \
        --vocab-file /home/universal-lm-data.cosmos549/scratch/mshoeybi/data/albert/vocab.txt \
        --split 58,1,1 \
        --distributed-backend nccl \
        --lr 0.0001 \
        --num-workers 2 \
        --lr-decay-style linear \
        --warmup .01 \
        --save-interval 3000 \
        --fp16 \
        --adlr-autoresume \
        --adlr-autoresume-interval 100"

submit_job --image 'http://gitlab-master.nvidia.com/adlr/megatron-lm/megatron:20.03' \
           --mounts /home/universal-lm-data.cosmos549,/home/dcg-adlr-nkant-source.cosmos1204,/home/dcg-adlr-nkant-data.cosmos1202,/home/dcg-adlr-nkant-output.cosmos1203,/home/nkant \
           --name test_retriever \
           --partition interactive \
           --gpu 1 \
           --nodes 1 \
           --autoresume_timer 300 \
           -c "${COMMAND}"