OpenDAS / Megatron-LM · Commits · f332d7e1
"...git@developer.sourcefind.cn:OpenDAS/llama-factory.git" did not exist on "c8d12c061ba3861f8a5a73e78763a0de361f556c"
Commit f332d7e1, authored Apr 23, 2020 by Neel Kant
Rename fns to be more precise
Parent: ac79d374
Showing 1 changed file with 9 additions and 6 deletions: hashed_index.py
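In short: two module-local helpers in hashed_index.py are renamed so their names say what they handle. load_checkpoint becomes load_ict_checkpoint, and get_dataset becomes get_ict_dataset (ICT being the inverse cloze task used for REALM-style retriever pretraining). Call sites in test_retriever() and main() are updated to match, and main() gains two planning comments.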
hashed_index.py (view file @ f332d7e1)
@@ -118,9 +118,9 @@ class HashedIndex(object):
 def test_retriever():
     initialize_megatron(extra_args_provider=None,
                         args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'})
-    model = load_checkpoint()
+    model = load_ict_checkpoint()
     model.eval()
-    dataset = get_dataset()
+    dataset = get_ict_dataset()
     hashed_index = HashedIndex.load_from_file('block_hash_data.pkl')
     retriever = REALMRetriever(model, dataset, hashed_index)
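A plausible motivation for the rename (an inference; the commit message only says "more precise"): Megatron-LM also provides a generic load_checkpoint in megatron.checkpointing, and a module-level helper with the same name shadows any such import. A minimal sketch of that hazard, assuming the generic loader lives at that path:

from megatron.checkpointing import load_checkpoint  # generic library loader

def load_checkpoint():  # module-local helper with the same name
    ...                 # rebinds the name, shadowing the import above

model = load_checkpoint()  # from here on this always hits the local helper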
@@ -151,12 +151,15 @@ def main():
     # allocate the resources well. Have to subsequently assign the correct gpus to the indexing job
     # consider initializing everything in a single group and break off processes based on the ranks
+    # for debugging purposes, make it so that the training process group checks every some number of intervals
+    # and if it isn't ready, then wait so that it's consistent. Start with using the filesystem
     initialize_megatron(extra_args_provider=None,
                         args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'})
     args = get_args()
-    model = load_checkpoint()
+    model = load_ict_checkpoint()
     model.eval()
-    dataset = get_dataset()
+    dataset = get_ict_dataset()
     data_iter = iter(get_dataloader(dataset))
     hashed_index = HashedIndex(embed_size=128, num_buckets=2048)
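The call HashedIndex(embed_size=128, num_buckets=2048) above suggests an index that buckets 128-dimensional block embeddings into 2048 bins. For orientation only, here is a toy random-projection (LSH-style) index with the same constructor signature; the hashing scheme is an assumption, not the code in hashed_index.py:

import numpy as np

class ToyHashedIndex:
    """Toy LSH index: embeddings on the same side of each random hyperplane
    share a bucket, so similar directions tend to collide (by design)."""

    def __init__(self, embed_size=128, num_buckets=2048, seed=0):
        rng = np.random.default_rng(seed)
        self.num_bits = int(np.log2(num_buckets))  # 11 bits -> 2048 buckets
        self.planes = rng.standard_normal((self.num_bits, embed_size))
        self.buckets = {}                          # bucket id -> list of keys

    def bucket_of(self, embedding):
        bits = (self.planes @ embedding) > 0       # side of each hyperplane
        return sum(int(b) << i for i, b in enumerate(bits))

    def add(self, key, embedding):
        self.buckets.setdefault(self.bucket_of(embedding), []).append(key)

    def clear(self):                               # mirrors hashed_index.clear()
        self.buckets = {}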
@@ -189,7 +192,7 @@ def main():
     hashed_index.clear()


-def load_checkpoint():
+def load_ict_checkpoint():
     args = get_args()
     model = get_model(model_provider)
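The visible lines of load_ict_checkpoint() fetch the global args and build the model via get_model(model_provider); the rest of the body is elided from the diff. A hypothetical skeleton of the elided part, assuming a plain torch.load of a checkpoint path taken from args (the import paths, the path attribute, and the state-dict layout are all guesses; model_provider is defined elsewhere in the module):

import torch
from megatron import get_args               # import paths assumed
from megatron.training import get_model

def load_ict_checkpoint():
    args = get_args()                        # shown in the diff
    model = get_model(model_provider)        # shown in the diff
    state = torch.load(args.load, map_location='cpu')  # assumed checkpoint path
    model.load_state_dict(state['model'])              # assumed state-dict layout
    return model                             # shown in the diff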
@@ -215,7 +218,7 @@ def load_checkpoint():
     return model


-def get_dataset():
+def get_ict_dataset():
     args = get_args()
     block_dataset = get_indexed_dataset_(args.data_path, 'mmap', True)
     titles_dataset = get_indexed_dataset_(args.data_path + '-titles', 'mmap', True)
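get_ict_dataset() reads two mmap-indexed datasets from sibling prefixes, args.data_path and args.data_path + '-titles'. Assuming Megatron's usual .bin/.idx pairing for mmap indexed datasets, a small pre-flight check of that layout might look like:

from pathlib import Path

def check_ict_data_layout(data_path):
    """Verify the on-disk layout implied by get_ict_dataset(); the .bin/.idx
    naming is an assumption about Megatron's mmap indexed datasets."""
    for prefix in (data_path, data_path + '-titles'):
        for ext in ('.bin', '.idx'):
            if not Path(prefix + ext).is_file():
                raise FileNotFoundError(prefix + ext)

# e.g. check_ict_data_layout('/data/wikipedia') expects /data/wikipedia.bin,
# /data/wikipedia.idx, /data/wikipedia-titles.bin, /data/wikipedia-titles.idx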