OpenDAS / Megatron-LM

Commit 242770dd, authored Dec 29, 2020 by mshoeybi

    moved compile helper to initialize

parent a495871b
Showing 5 changed files with 10 additions and 15 deletions (+10 -15)
megatron/data/bert_dataset.py         +0  -2
megatron/data/blendable_dataset.py    +0  -7
megatron/data/gpt2_dataset.py         +0  -2
megatron/data/realm_dataset_utils.py  +0  -4
megatron/initialize.py                +10 -0
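Every hunk below touches compile_helper, which builds the C++ helpers extension for megatron.data at runtime. Its implementation is not part of this diff; the following is only an assumed sketch of what such a runtime compile step looks like in this codebase: shell out to make in the megatron/data directory on a single process and abort if the build fails.

# Assumed sketch only; the real compile_helper lives in
# megatron/data/dataset_utils.py and is not shown in this commit.
import os
import subprocess
import sys

def compile_helper():
    """Build the dataset C++ helpers at runtime (call from one process only)."""
    path = os.path.abspath(os.path.dirname(__file__))
    ret = subprocess.run(['make', '-C', path])  # build the helpers extension in-place
    if ret.returncode != 0:
        print('Making C++ dataset helpers module failed, exiting.')
        sys.exit(1)

Because compiling from several ranks at once would race on the build artifacts, the call sites below either guard the call with a rank-0 check or rely on the caller to do so; this commit consolidates that guard in one place.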
megatron/data/bert_dataset.py
@@ -128,8 +128,6 @@ def get_samples_mapping_(indexed_dataset,
         print_rank_0(' > building sapmles index mapping for {} ...'.format(
             name))
         # First compile and then import.
-        from megatron.data.dataset_utils import compile_helper
-        compile_helper()
         from megatron.data import helpers
         samples_mapping = helpers.build_mapping(
             indexed_dataset.doc_idx,
megatron/data/blendable_dataset.py
@@ -49,13 +49,6 @@ class BlendableDataset(torch.utils.data.Dataset):
         self.dataset_index = np.zeros(self.size, dtype=np.uint8)
         self.dataset_sample_index = np.zeros(self.size, dtype=np.int64)
-        if torch.distributed.get_rank() == 0:
-            from megatron.data.dataset_utils import compile_helper
-            compile_helper()
-        # Simple barrier
-        tmp = torch.cuda.LongTensor([1])
-        torch.distributed.all_reduce(tmp,
-                                     group=mpu.get_data_parallel_group())
         from megatron.data import helpers
         helpers.build_blending_indices(self.dataset_index,
                                        self.dataset_sample_index,
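A side effect of the move: the removed block above synchronized ranks with an all_reduce on a one-element CUDA tensor scoped to the data-parallel group, whereas the replacement in megatron/initialize.py (last hunk below) uses an explicit torch.distributed.barrier() on the default group. Both are standard PyTorch distributed calls; the snippet below only contrasts the two synchronization styles and assumes a process group (and CUDA, for the first variant) is already initialized.

import torch
import torch.distributed as dist

def barrier_via_allreduce(group=None):
    # Old style (removed here): reduce a dummy tensor so every rank in
    # the group waits for the others before continuing.
    tmp = torch.cuda.LongTensor([1])
    dist.all_reduce(tmp, group=group)

def barrier_direct():
    # New style (added in megatron/initialize.py): an explicit barrier
    # across the default process group.
    dist.barrier()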
megatron/data/gpt2_dataset.py
@@ -269,8 +269,6 @@ def _build_index_mappings(name, data_prefix, documents, sizes,
         start_time = time.time()
         # Use C++ implementation for speed.
         # First compile and then import.
-        from megatron.data.dataset_utils import compile_helper
-        compile_helper()
         from megatron.data import helpers
         assert doc_idx.dtype == np.int32
         assert sizes.dtype == np.int32
megatron/data/realm_dataset_utils.py
@@ -152,10 +152,6 @@ def get_block_samples_mapping(block_dataset, title_dataset, data_prefix, num_epo
         print_rank_0(' > building samples index mapping for {} ...'.format(
             name))
-        # compile/bind the C++ helper code
-        from megatron.data.dataset_utils import compile_helper
-        compile_helper()
-
         from megatron.data import helpers
         mapping_array = helpers.build_blocks_mapping(
             block_dataset.doc_idx,
megatron/initialize.py
@@ -78,6 +78,16 @@ def initialize_megatron(extra_args_provider=None, args_defaults={},
     # Autoresume.
     _init_autoresume()

+    # Compile dataset C++ code.
+    try:
+        from megatron.data import helpers
+    except:
+        if torch.distributed.get_rank() == 0:
+            from megatron.data.dataset_utils import compile_helper
+            compile_helper()
+        # Simple barrier
+        torch.distributed.barrier()
+
     # No continuation function
     return None
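Net effect: the four dataset files stop compiling the extension themselves, and initialize_megatron() tries to import megatron.data.helpers once at startup; if the import fails, rank 0 compiles it and every rank waits at a barrier before continuing. Below is a minimal standalone sketch of that pattern, assuming torch.distributed is already initialized (the commit itself uses a bare except rather than catching ImportError specifically):

import torch

def ensure_dataset_helpers():
    # Try the already-built extension first; only compile if it is missing.
    try:
        from megatron.data import helpers  # noqa: F401
    except ImportError:
        if torch.distributed.get_rank() == 0:
            from megatron.data.dataset_utils import compile_helper
            compile_helper()
        # Simple barrier: no rank imports the module before rank 0 has built it.
        torch.distributed.barrier()

Downstream code such as gpt2_dataset._build_index_mappings can then import helpers directly, as the hunks above show.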