Megatron-LM (OpenDAS) · Commits

Commit 8eff2a99, authored Feb 03, 2021 by Mostofa Patwary
remove the function get_one_epoch_dataloader and also added assert
Parent: 38898931
Showing 2 changed files with 3 additions and 27 deletions:

  megatron/data/biencoder_dataset_utils.py   +0 -27
  pretrain_ict.py                            +3  -0
megatron/data/biencoder_dataset_utils.py
@@ -9,33 +9,6 @@ from megatron.data.dataset_utils import create_masked_lm_predictions, pad_and_co
 from megatron import get_args, get_tokenizer, print_rank_0, mpu
 
-def get_one_epoch_dataloader(dataset, micro_batch_size=None):
-    """Specifically one epoch to be used in an indexing job."""
-    args = get_args()
-
-    world_size = mpu.get_data_parallel_world_size()
-    rank = mpu.get_data_parallel_rank()
-    if micro_batch_size is None:
-        micro_batch_size = args.micro_batch_size
-    global_batch_size = micro_batch_size * world_size
-    num_workers = args.num_workers
-
-    sampler = torch.utils.data.SequentialSampler(dataset)
-    # importantly, drop_last must be False to get all the data.
-    assert False, 'DistributedBatchSampler deprecated, change the implementation'
-    from megatron.data.samplers import DistributedBatchSampler
-    batch_sampler = DistributedBatchSampler(sampler,
-                                            batch_size=global_batch_size,
-                                            drop_last=False,
-                                            rank=rank,
-                                            world_size=world_size)
-
-    return torch.utils.data.DataLoader(dataset,
-                                       batch_sampler=batch_sampler,
-                                       num_workers=num_workers,
-                                       pin_memory=True)
-
-
 def get_ict_batch(data_iterator):
     # Items and their type.
     keys = ['query_tokens', 'query_mask',
...
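The deleted helper built its one-epoch, data-parallel loader on megatron.data.samplers.DistributedBatchSampler, which the in-code assert had already marked as deprecated. For context, below is a minimal sketch of how similar behaviour could be recovered with PyTorch's built-in torch.utils.data.distributed.DistributedSampler; this is an illustrative assumption, not code from this commit or the repository, and it reuses the removed function's name purely for comparison.

import torch

from megatron import get_args, mpu


def get_one_epoch_dataloader(dataset, micro_batch_size=None):
    """Sketch: one sequential pass over `dataset`, sharded across data-parallel ranks."""
    args = get_args()
    if micro_batch_size is None:
        micro_batch_size = args.micro_batch_size

    # Shard the dataset across data-parallel ranks. shuffle=False keeps the
    # sequential order of the old SequentialSampler; drop_last=False keeps
    # every sample, matching the old "drop_last must be False" comment.
    sampler = torch.utils.data.distributed.DistributedSampler(
        dataset,
        num_replicas=mpu.get_data_parallel_world_size(),
        rank=mpu.get_data_parallel_rank(),
        shuffle=False,
        drop_last=False)

    # Note: DistributedSampler pads the final batches with repeated samples so
    # every rank sees the same count; an indexing job may need to deduplicate.
    return torch.utils.data.DataLoader(dataset,
                                       batch_size=micro_batch_size,
                                       sampler=sampler,
                                       num_workers=args.num_workers,
                                       pin_memory=True)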
pretrain_ict.py
@@ -99,6 +99,9 @@ def forward_step(data_iterator, model, input_tensor):
     micro_batch_size = query_logits.shape[0]
     # recall we assert that tensor_model_parallel_size == 1
+    assert mpu.get_tensor_model_parallel_world_size() == 1, \
+        "Model parallel size > 1 not supported for ICT"
+
     global_batch_size = dist.get_world_size() * micro_batch_size
     all_query_logits = AllgatherFromDataParallelRegion.apply(query_logits)
     all_context_logits = AllgatherFromDataParallelRegion.apply(context_logits)
...
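The added assert protects the batch-size arithmetic right below it: global_batch_size = dist.get_world_size() * micro_batch_size treats every rank in the default process group as a separate data-parallel replica, which only holds when the tensor-model-parallel world size is 1. A small arithmetic sketch with hypothetical sizes (not taken from the repository) makes the failure mode concrete:

# Hypothetical sizes, for illustration only.
micro_batch_size = 32
world_size = 8                       # torch.distributed.get_world_size()
tensor_model_parallel_size = 2       # the case the new assert rules out
data_parallel_size = world_size // tensor_model_parallel_size   # 4 model replicas

true_global_batch  = data_parallel_size * micro_batch_size       # 128 distinct samples
naive_global_batch = world_size * micro_batch_size                # 256 -- overcounts by 2x

# With tensor_model_parallel_size == 1 (what the assert enforces),
# data_parallel_size == world_size and the two values coincide.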