Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Megatron-LM
Commits
1eccfc94
Commit
1eccfc94
authored
Apr 24, 2020
by
Neel Kant
Browse files
Add test_retriever.sh
parent
d7022c72
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
36 additions
and
4 deletions
+36
-4
hashed_index.py
hashed_index.py
+2
-1
megatron/arguments.py
megatron/arguments.py
+2
-0
megatron/data/ict_dataset.py
megatron/data/ict_dataset.py
+1
-1
megatron/model/bert_model.py
megatron/model/bert_model.py
+3
-2
test_retriever.sh
test_retriever.sh
+28
-0
No files found.
hashed_index.py
View file @
1eccfc94
...
@@ -118,10 +118,11 @@ class HashedIndex(object):
...
@@ -118,10 +118,11 @@ class HashedIndex(object):
def
test_retriever
():
def
test_retriever
():
initialize_megatron
(
extra_args_provider
=
None
,
initialize_megatron
(
extra_args_provider
=
None
,
args_defaults
=
{
'tokenizer_type'
:
'BertWordPieceLowerCase'
})
args_defaults
=
{
'tokenizer_type'
:
'BertWordPieceLowerCase'
})
args
=
get_args
()
model
=
load_ict_checkpoint
()
model
=
load_ict_checkpoint
()
model
.
eval
()
model
.
eval
()
dataset
=
get_ict_dataset
()
dataset
=
get_ict_dataset
()
hashed_index
=
HashedIndex
.
load_from_file
(
'block_
hash_data
.pkl'
)
hashed_index
=
HashedIndex
.
load_from_file
(
args
.
hash_data
_path
)
retriever
=
REALMRetriever
(
model
,
dataset
,
hashed_index
)
retriever
=
REALMRetriever
(
model
,
dataset
,
hashed_index
)
strs
=
[
strs
=
[
...
...
megatron/arguments.py
View file @
1eccfc94
...
@@ -298,6 +298,8 @@ def _add_data_args(parser):
...
@@ -298,6 +298,8 @@ def _add_data_args(parser):
help
=
'Path to combined dataset to split.'
)
help
=
'Path to combined dataset to split.'
)
group
.
add_argument
(
'--titles-data-path'
,
type
=
str
,
default
=
None
,
group
.
add_argument
(
'--titles-data-path'
,
type
=
str
,
default
=
None
,
help
=
'Path to titles dataset used for ICT'
)
help
=
'Path to titles dataset used for ICT'
)
group
.
add_argument
(
'--hash-data-path'
,
type
=
str
,
default
=
None
,
help
=
'Path to pickled HashedIndex data structure'
)
group
.
add_argument
(
'--split'
,
type
=
str
,
default
=
'969, 30, 1'
,
group
.
add_argument
(
'--split'
,
type
=
str
,
default
=
'969, 30, 1'
,
help
=
'Comma-separated list of proportions for training,'
help
=
'Comma-separated list of proportions for training,'
' validation, and test split. For example the split '
' validation, and test split. For example the split '
...
...
megatron/data/ict_dataset.py
View file @
1eccfc94
...
@@ -80,7 +80,7 @@ class InverseClozeDataset(Dataset):
...
@@ -80,7 +80,7 @@ class InverseClozeDataset(Dataset):
def decode_tokens(self, token_ids):
    """Render a sequence of token IDs as a human-readable string.

    Maps *token_ids* back to wordpiece tokens via the wrapped HuggingFace
    tokenizer and joins them with single spaces, dropping any '[PAD]'
    tokens so padded blocks print cleanly.
    """
    # self.tokenizer is the Megatron wrapper; .tokenizer is the underlying
    # HuggingFace BertTokenizer that owns the id -> token vocabulary.
    wordpieces = self.tokenizer.tokenizer.convert_ids_to_tokens(token_ids)
    visible = [piece for piece in wordpieces if piece != '[PAD]']
    return ' '.join(visible)
def
get_block
(
self
,
start_idx
,
end_idx
,
doc_idx
):
def
get_block
(
self
,
start_idx
,
end_idx
,
doc_idx
):
"""Get the IDs for an evidence block plus the title of the corresponding document"""
"""Get the IDs for an evidence block plus the title of the corresponding document"""
...
...
megatron/model/bert_model.py
View file @
1eccfc94
...
@@ -295,9 +295,9 @@ class REALMRetriever(MegatronModule):
...
@@ -295,9 +295,9 @@ class REALMRetriever(MegatronModule):
query_pad_mask
=
torch
.
cuda
.
LongTensor
(
np
.
array
(
query_pad_mask
).
reshape
(
1
,
-
1
))
query_pad_mask
=
torch
.
cuda
.
LongTensor
(
np
.
array
(
query_pad_mask
).
reshape
(
1
,
-
1
))
top5_block_tokens
,
_
=
self
.
retrieve_evidence_blocks
(
query_tokens
,
query_pad_mask
)
top5_block_tokens
,
_
=
self
.
retrieve_evidence_blocks
(
query_tokens
,
query_pad_mask
)
for
i
,
block
in
enumerate
(
top5_block_tokens
):
for
i
,
block
in
enumerate
(
top5_block_tokens
[
0
]
):
block_text
=
self
.
ict_dataset
.
decode_tokens
(
block
)
block_text
=
self
.
ict_dataset
.
decode_tokens
(
block
)
print
(
' > Block {}: {}'
.
format
(
i
,
block_text
))
print
(
'
\n
> Block {}: {}'
.
format
(
i
,
block_text
))
def
retrieve_evidence_blocks
(
self
,
query_tokens
,
query_pad_mask
):
def
retrieve_evidence_blocks
(
self
,
query_tokens
,
query_pad_mask
):
query_embeds
=
self
.
ict_model
.
module
.
module
.
embed_query
(
query_tokens
,
query_pad_mask
)
query_embeds
=
self
.
ict_model
.
module
.
module
.
embed_query
(
query_tokens
,
query_pad_mask
)
...
@@ -321,6 +321,7 @@ class REALMRetriever(MegatronModule):
...
@@ -321,6 +321,7 @@ class REALMRetriever(MegatronModule):
all_top5_tokens
.
append
(
np
.
array
(
top5_tokens
))
all_top5_tokens
.
append
(
np
.
array
(
top5_tokens
))
all_top5_pad_masks
.
append
(
np
.
array
(
top5_pad_masks
))
all_top5_pad_masks
.
append
(
np
.
array
(
top5_pad_masks
))
# [batch_size x 5 x seq_length]
return
np
.
array
(
all_top5_tokens
),
np
.
array
(
all_top5_pad_masks
)
return
np
.
array
(
all_top5_tokens
),
np
.
array
(
all_top5_pad_masks
)
...
...
test_retriever.sh
0 → 100755
View file @
1eccfc94
COMMAND
=
"/home/scratch.gcf/adlr-utils/release/cluster-interface/latest/mp_launch python hashed_index.py
\
--num-layers 12
\
--hidden-size 768
\
--num-attention-heads 12
\
--batch-size 8
\
--checkpoint-activations
\
--seq-length 288
\
--max-position-embeddings 288
\
--train-iters 100000
\
--load /home/dcg-adlr-nkant-output.cosmos1203/chkpts/realm_debug
\
--ict-load /home/dcg-adlr-nkant-output.cosmos1203/chkpts/ict_best
\
--save /home/dcg-adlr-nkant-output.cosmos1203/chkpts/realm_debug
\
--data-path /home/universal-lm-data.cosmos549/datasets/wiki-indexed/wikipedia_lines
\
--titles-data-path /home/universal-lm-data.cosmos549/datasets/wiki-indexed/wikipedia_lines-titles
\
--hash-data-path /home/dcg-adlr-nkant-data.cosmos1202/hash_data/ict_best.pkl
\
--vocab-file /home/universal-lm-data.cosmos549/scratch/mshoeybi/data/albert/vocab.txt
\
--split 58,1,1
\
--distributed-backend nccl
\
--lr 0.0001
\
--num-workers 2
\
--lr-decay-style linear
\
--warmup .01
\
--save-interval 3000
\
--fp16
\
--adlr-autoresume
\
--adlr-autoresume-interval 100"
submit_job
--image
'http://gitlab-master.nvidia.com/adlr/megatron-lm/megatron:20.03'
--mounts
/home/universal-lm-data.cosmos549,/home/dcg-adlr-nkant-source.cosmos1204,/home/dcg-adlr-nkant-data.cosmos1202,/home/dcg-adlr-nkant-output.cosmos1203,/home/nkant
--name
test_retriever
--partition
interactive
--gpu
1
--nodes
1
--autoresume_timer
300
-c
"
${
COMMAND
}
"
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment