OpenDAS / Megatron-LM / Commits

Commit 690291dd
Authored Nov 20, 2019 by Mohammad Shoeybi
Parent: 1a1cfcff

    fixed couple of bugs in helpers

Showing 3 changed files with 194 additions and 116 deletions (+194 -116).
megatron/data/albert_dataset.py   +27  -10
megatron/data/helpers.cpp        +157  -97
megatron/data/indexed_dataset.py  +10   -9
megatron/data/albert_dataset.py  (view file @ 690291dd)
@@ -12,6 +12,7 @@ from .dataset_utils import build_training_sample
 from . import helpers
 from megatron.data import FullBertTokenizer, indexed_dataset
+from megatron.utils import print_rank_0


 class AlbertDataset(Dataset):
...
@@ -31,11 +32,19 @@ class AlbertDataset(Dataset):
         # Build the samples mapping.
         if not num_epochs:
             if not max_num_samples:
-                raise ValueError("Need to specify either max_num_samples or num_epochs")
-            num_epochs = int(max_num_samples / len(indexed_dataset)) + 1
+                raise ValueError("Need to specify either max_num_samples "
+                                 "or num_epochs")
+            num_epochs = np.iinfo(np.int32).max - 1
         if not max_num_samples:
-            max_num_samples = len(indexed_dataset) * num_epochs
-        print(f"Building the sample map for {num_epochs} epochs or {max_num_samples} samples.")
+            max_num_samples = np.iinfo(np.int64).max - 1
+
+        # Make sure the types match the helpers input types.
+        assert indexed_dataset.doc_idx.dtype == np.int64
+        assert indexed_dataset.sizes.dtype == np.int32
+
+        # Build samples mapping
+        verbose = torch.distributed.get_rank() == 0
+        start_time = time.time()
         self.samples_mapping = helpers.build_mapping(
             indexed_dataset.doc_idx,
             indexed_dataset.sizes,
...
@@ -43,7 +52,14 @@ class AlbertDataset(Dataset):
             max_num_samples,
             self.max_seq_length - 3,  # account for added tokens
             short_seq_prob,
-            self.seed)
+            self.seed,
+            verbose)
+        # Make sure all the ranks have built the mapping
+        torch.distributed.barrier()
+        print_rank_0('> elasped time to build samples mapping (seconds): '
+                     '{:2f}'.format(time.time() - start_time))
+        exit()

         # Vocab stuff.
         self.vocab_id_list = list(tokenizer.inv_vocab.keys())
...
@@ -59,11 +75,12 @@ class AlbertDataset(Dataset):
                     num_epochs, max_num_samples, masked_lm_prob,
                     max_seq_length, short_seq_prob, seed, skip_warmup=False):
         tokenizer = FullBertTokenizer(vocab, do_lower_case=True)
-        print("> Reading dataset index")
-        idx_ds = indexed_dataset.make_dataset(data_prefix, data_impl, skip_warmup)
-        print("> Finished creating indexed dataset")
-        return cls(idx_ds, tokenizer, num_epochs, max_num_samples, masked_lm_prob,
-                   max_seq_length, short_seq_prob, seed)
+        print_rank_0("> Reading dataset index ...")
+        idx_ds = indexed_dataset.make_dataset(data_prefix, data_impl, skip_warmup)
+        print_rank_0("> Finished creating indexed dataset")
+        return cls(idx_ds, tokenizer, num_epochs, max_num_samples, masked_lm_prob,
+                   max_seq_length, short_seq_prob, seed)

     def num_tokens(self):
         return self.tokenizer.vocab_size()
...
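Note: the dtype asserts added above exist because the rewritten C++ entry point now takes int64 document offsets and int32 sentence sizes. Below is a minimal, illustrative call of the compiled extension; the array values and argument constants are made up for the sketch, only the dtypes and argument order come from this diff.

    import numpy as np
    from megatron.data import helpers  # compiled pybind11 extension

    # doc_idx: sentence offsets per document (int64); sizes: per-sentence lengths (int32).
    doc_idx = np.array([0, 3, 5], dtype=np.int64)
    sizes = np.array([60, 40, 30, 80, 20], dtype=np.int32)

    samples = helpers.build_mapping(
        doc_idx, sizes,
        3,        # num_epochs
        1 << 16,  # max_num_samples
        509,      # max_seq_length (e.g. 512 minus 3 added tokens)
        0.1,      # short_seq_prob
        1234,     # seed
        True)     # verbose

    # Each row of the result is (start sentence index, end sentence index, target length).
    print(samples.shape, samples[0])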
megatron/data/helpers.cpp  (view file @ 690291dd)

 /* Helper methods for fast index mapping builds */
 #include <algorithm>
 #include <iostream>
...
@@ -6,46 +7,61 @@
 #include <stdexcept>
 #include <pybind11/pybind11.h>
 #include <pybind11/numpy.h>
+#include <random>

 namespace py = pybind11;
 using namespace std;

-inline uint32_t get_sample_len(const int short_seq_ratio,
-                               const uint32_t max_length) {
+inline int32_t get_target_sample_len(const int32_t short_seq_ratio,
+                                     const int32_t max_length,
+                                     std::mt19937& rand32_gen) {
     /* Training sample length. */
-    const auto random_number = rand();
+    const auto random_number = rand32_gen();
     if ((random_number % short_seq_ratio) == 0) {
         return 2 + random_number % (max_length - 1);
     }
     return max_length;
 }

 template<typename DocIdx>
-py::array build_mapping_impl(const py::array_t<uint32_t>& docs_,
-                             const py::array_t<uint16_t>& sizes_,
-                             const int num_epochs,
+py::array build_mapping_impl(const py::array_t<int64_t>& docs_,
+                             const py::array_t<int32_t>& sizes_,
+                             const int32_t num_epochs,
                              const uint64_t max_num_samples,
-                             const int max_seq_length,
+                             const int32_t max_seq_length,
                              const double short_seq_prob,
-                             const int seed) {
+                             const int32_t seed,
+                             const bool verbose) {
     /* Build a mapping of (start-index, end-index, sequence-length) where
        start and end index are the indices of the sentences in the sample
        and sequence-length is the target sequence length.
     */

-    cout << "> building dataset mapping for " << docs_.shape(0) - 1 \
-         << " documents with " << sizes_.shape(0) << " sentences ..."
-         << std::flush << endl;
+    if (verbose) {
+        cout << " > using " << docs_.shape(0) - 1 << " documents with "
+             << sizes_.shape(0) << " sentences ..." << endl << std::flush;
+    }

     // Consistency checks.
     assert(num_epochs > 0);
     assert(max_seq_length > 1);
     assert(short_seq_prob > 0.0);
     assert(short_seq_prob <= 1.0);
     assert(seed > 0);

-    // For efficiency, convert probability to ratio.
-    const auto short_seq_ratio = static_cast<int>(round(1.0 / short_seq_prob));
+    // For efficiency, convert probability to ratio. Note: rand() generates int.
+    const auto short_seq_ratio = static_cast<int32_t>(round(1.0 / short_seq_prob));

     // Remove bound checks.
     auto docs = docs_.unchecked<1>();
     auto sizes = sizes_.unchecked<1>();

     // Check for consistency.
     if (docs[docs.shape(0) - 1] != sizes.shape(0)) {
         cout << "document values is not consistent with length of sizes: "
              << docs[docs.shape(0) - 1] << " != " << sizes.shape(0) << endl;
-        throw(-1);
+        throw std::length_error("docs and sizes");
     }

     // Mapping and it's length (1D).
...
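The docstring above pins down the contract: each map row is (start-index, end-index, sequence-length) over the flattened sentence list, with the sentence range half-open. As a rough illustration of how such a row could be consumed on the Python side (a hedged sketch only, not the actual AlbertDataset logic):

    import numpy as np

    def gather_sample(sample_row, sizes, tokens):
        # sizes: per-sentence lengths; tokens: the flat token stream.
        start_sent, end_sent, target_len = (int(x) for x in sample_row)
        offsets = np.concatenate(([0], np.cumsum(sizes)))   # sentence -> token offset
        span = tokens[offsets[start_sent]:offsets[end_sent]]  # [start, end) sentences
        return span, target_len  # target_len drives truncation / masking downstream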
@@ -55,36 +71,39 @@ py::array build_mapping_impl(const py::array_t<uint32_t>& docs_,
     // Perform two iterations, in the first iteration get the size
     // and allocate memory and in the second iteration populate the map.
     bool second = false;
-    for (int iteration = 0; iteration < 2; ++iteration) {
+    for (int32_t iteration = 0; iteration < 2; ++iteration) {

         // Set the seed so both iterations produce the same results.
-        srand(seed);
+        std::mt19937 rand32_gen(seed);

         // Set the flag on second iteration.
-        second = iteration == 1;
+        second = (iteration == 1);

         // Counters:
-        uint32_t empty_docs = 0;
-        uint32_t one_sent_docs = 0;
+        uint64_t empty_docs = 0;
+        uint64_t one_sent_docs = 0;

         // Current map index.
         uint64_t map_index = 0;

         // For each epoch:
-        for (int epoch = 0; epoch < num_epochs; ++epoch) {
-            if (map_index >= max_num_samples && !second) {
-                cout << " > reached " << max_num_samples << " samples after "
-                     << epoch << " epochs ..." << std::flush << endl;
+        for (int32_t epoch = 0; epoch < num_epochs; ++epoch) {
+            if (map_index >= max_num_samples) {
+                if (verbose && (!second)) {
+                    cout << " > reached " << max_num_samples << " samples after "
+                         << epoch << " epochs ..." << endl << std::flush;
+                }
                 break;
             }
             // For each document:
-            for (int doc = 0; doc < (docs.shape(0) - 1); ++doc) {
+            for (int32_t doc = 0; doc < (docs.shape(0) - 1); ++doc) {

-                // Document sentences are in [sent_index_first, sent_index_last).
+                // Document sentences are in [sent_index_first, sent_index_last)
                 const auto sent_index_first = docs[doc];
                 const auto sent_index_last = docs[doc + 1];

-                // At the begining of the document previous index is the start index.
+                // At the begining of the document previous index is the
+                // start index.
                 auto prev_start_index = sent_index_first;

                 // Remaining documents.
...
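The two-iteration comment above is the heart of the allocation strategy: pass one only counts samples so the map can be sized exactly, pass two fills it, and reseeding the generator at the top of each pass keeps both passes making identical sampling decisions. A small Python sketch of the same pattern, purely for illustration:

    import random

    def two_pass_build(items, seed):
        out, counted = None, 0
        for iteration in range(2):
            rng = random.Random(seed)      # same seed -> identical decisions both passes
            second = (iteration == 1)
            index = 0
            for item in items:
                if rng.random() < 0.5:     # stand-in for the sample-boundary test
                    if second:
                        out[index] = item
                    index += 1
            if not second:
                counted = index
                out = [None] * counted     # allocate exactly once, sized by pass one
        return out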
@@ -93,13 +112,10 @@ py::array build_mapping_impl(const py::array_t<uint32_t>& docs_,
                 // Some bookkeeping
                 if ((epoch == 0) && (!second)) {
                     if (num_remain_sent == 0) {
-                        cout << "***WARNING*** document " << doc << " is empty" << endl;
-                        empty_docs += 1;
+                        ++empty_docs;
                     }
                     if (num_remain_sent == 1) {
-                        // cout << "***WARNING*** document " << doc <<
-                        //  " has one sentence" << endl;
-                        one_sent_docs += 1;
+                        ++one_sent_docs;
                     }
                 }
...
@@ -107,110 +123,154 @@ py::array build_mapping_impl(const py::array_t<uint32_t>& docs_,
                 if (num_remain_sent > 1) {

                     // Set values.
-                    auto size = uint32_t{0};
-                    auto num_sent = uint32_t{0};
-                    auto seq_len = get_sample_len(short_seq_ratio, max_seq_length);
+                    auto seq_len = int32_t{0};
+                    auto num_sent = int32_t{0};
+                    auto target_seq_len = get_target_sample_len(short_seq_ratio,
+                                                                max_seq_length,
+                                                                rand32_gen);

                     // Loop through sentences.
                     for (auto sent_index = sent_index_first;
                          sent_index < sent_index_last; ++sent_index) {

                         // Add the size and number of sentences.
-                        size += sizes[sent_index];
-                        num_sent += 1;
-                        num_remain_sent -= 1;
-
-                        // If we have reached the target length.
-                        // and if not only one sentence is left in the document.
-                        // and if we have at least two sentneces.
-                        // and if we have reached end of the document.
-                        if (((size >= seq_len) && (num_remain_sent > 1) &&
-                             (num_sent > 1)) || (num_remain_sent == 0)) {
-
-                            // Populate the map.
-                            if (second) {
-                                const auto map_index_0 = 3 * map_index;
-                                maps[map_index_0] = prev_start_index;
-                                maps[map_index_0 + 1] = sent_index + 1;
-                                maps[map_index_0 + 2] = seq_len;
-                            }
-
-                            // Update indices / counters.
-                            // check for overflow
-                            if (map_index == std::numeric_limits<DocIdx>::max()) {
-                                cout << "number of samples exceeded maximum allowed by type: "
-                                     << std::numeric_limits<DocIdx>::max() << endl;
-                                throw std::overflow_error("Number of samples");
-                            }
-                            map_index += 1;
-                            prev_start_index = sent_index + 1;
-                            seq_len = get_sample_len(short_seq_ratio, max_seq_length);
-                            size = 0;
-                            num_sent = 0;
-                        }
-                    }
-                }
-            }
-        }
+                        seq_len += sizes[sent_index];
+                        ++num_sent;
+                        --num_remain_sent;
+
+                        // If we have reached the target length.
+                        // and if not only one sentence is left in the document.
+                        // and if we have at least two sentneces.
+                        // and if we have reached end of the document.
+                        if (((seq_len >= target_seq_len) && (num_remain_sent > 1) &&
+                             (num_sent > 1)) || (num_remain_sent == 0)) {
+
+                            // Check for overflow.
+                            if ((3 * map_index + 2) >
+                                std::numeric_limits<int64_t>::max()) {
+                                cout << "number of samples exceeded maximum "
+                                     << "allowed by type int64: "
+                                     << std::numeric_limits<int64_t>::max() << endl;
+                                throw std::overflow_error("Number of samples");
+                            }
+
+                            // Populate the map.
+                            if (second) {
+                                const auto map_index_0 = 3 * map_index;
+                                maps[map_index_0] = static_cast<DocIdx>(prev_start_index);
+                                maps[map_index_0 + 1] = static_cast<DocIdx>(sent_index + 1);
+                                maps[map_index_0 + 2] = static_cast<DocIdx>(target_seq_len);
+                            }
+
+                            // Update indices / counters.
+                            ++map_index;
+                            prev_start_index = sent_index + 1;
+                            target_seq_len = get_target_sample_len(short_seq_ratio,
+                                                                   max_seq_length,
+                                                                   rand32_gen);
+                            seq_len = 0;
+                            num_sent = 0;
+                        }
+                    } // for (auto sent_index=sent_index_first; ...
+                } // if (num_remain_sent > 1) {
+            } // for (int doc=0; doc < num_docs; ++doc) {
+        } // for (int epoch=0; epoch < num_epochs; ++epoch) {

         if (!second) {
-            cout << "   number of samples:                     " << map_index << endl;
-            cout << "   number of empty documents:             " << empty_docs << endl;
-            cout << "   number of documents with one sentence: " << one_sent_docs << endl;
+            if (verbose) {
+                cout << " > number of empty documents: " << empty_docs
+                     << endl << std::flush;
+                cout << " > number of documents with one sentence: "
+                     << one_sent_docs << endl << std::flush;
+                cout << " > will create mapping for " << map_index
+                     << " samples" << endl << std::flush;
+            }
             assert(maps == NULL);
             assert(num_samples < 0);
             maps = new DocIdx[3 * map_index];
-            num_samples = map_index;
+            num_samples = static_cast<int64_t>(map_index);
         }

-    }
+    } // for (int iteration=0; iteration < 2; ++iteration) {

     // Shuffle.
+    // We need a 64 bit random number generator as we might have more
+    // than 2 billion samples.
+    std::mt19937_64 rand64_gen(seed + 1);
     for (auto i = (num_samples - 1); i > 0; --i) {
-        const auto j = rand() % (i + 1);
+        const auto j = static_cast<int64_t>(rand64_gen() % (i + 1));
         const auto i0 = 3 * i;
         const auto j0 = 3 * j;
         // Swap values.
         swap(maps[i0], maps[j0]);
         swap(maps[i0 + 1], maps[j0 + 1]);
         swap(maps[i0 + 2], maps[j0 + 2]);
     }

-    cout << " > done building the mapping." << endl;
+    if (verbose) {
+        cout << "> done building the mapping." << endl;
+    }

     // Method to deallocate memory.
     py::capsule free_when_done(maps, [](void *mem_) {
             DocIdx *mem = reinterpret_cast<DocIdx*>(mem_);
-            cout << "freeing memory for the dataset mapping" << endl;
             delete[] mem;
         });

     // Return the numpy array.
     const auto byte_size = sizeof(DocIdx);
     return py::array(std::vector<int64_t>{num_samples, 3},  // shape
-                     {3*4, 4},                  // C-style contiguous strides
+                     {3*byte_size, byte_size},  // C-style contiguous strides
                      maps,                      // the data pointer
                      free_when_done);           // numpy array references

 }

-py::array build_mapping(const py::array& docs_,
-                        const py::array& sizes_,
+py::array build_mapping(const py::array_t<int64_t>& docs_,
+                        const py::array_t<int>& sizes_,
                         const int num_epochs,
                         const uint64_t max_num_samples,
                         const int max_seq_length,
                         const double short_seq_prob,
-                        const int seed) {
+                        const int seed,
+                        const bool verbose) {

+    if (verbose) {
+        cout << "> building sample map using: " << endl << std::flush;
+        cout << "    number of epochs:           " << num_epochs << endl << std::flush;
+        cout << "    maximum number of samples:  " << max_num_samples << endl << std::flush;
+        cout << "    maximum sequence length:    " << max_seq_length << endl << std::flush;
+        cout << "    short sequence probability: " << short_seq_prob << endl << std::flush;
+        cout << "    seed:                       " << seed << endl << std::flush;
+    }
+
     if (sizes_.size() > std::numeric_limits<uint32_t>::max()) {
-        return build_mapping_impl<uint64_t>(docs_, sizes_, num_epochs,
-                                            max_num_samples, max_seq_length,
-                                            short_seq_prob, seed);
+        if (verbose) {
+            cout << " > using uint64 for data mapping..." << endl << std::flush;
+        }
+        return build_mapping_impl<uint64_t>(docs_, sizes_, num_epochs,
+                                            max_num_samples, max_seq_length,
+                                            short_seq_prob, seed, verbose);
     }
     else {
-        return build_mapping_impl<uint32_t>(docs_, sizes_, num_epochs,
-                                            max_num_samples, max_seq_length,
-                                            short_seq_prob, seed);
+        if (verbose) {
+            cout << " > using uint32 for data mapping..." << endl << std::flush;
+        }
+        return build_mapping_impl<uint32_t>(docs_, sizes_, num_epochs,
+                                            max_num_samples, max_seq_length,
+                                            short_seq_prob, seed, verbose);
     }
 }


 PYBIND11_MODULE(helpers, m) {
     m.def("build_mapping", &build_mapping);
 }
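Since helpers is a plain pybind11 module, it has to be compiled before `from . import helpers` in albert_dataset.py can succeed. One possible build recipe is sketched below; this is an assumption for illustration (pybind11 installed, a C++11 compiler available), not the build step this repository actually uses:

    # setup_helpers.py -- illustrative build script, not part of this commit.
    from setuptools import setup, Extension
    import pybind11

    ext = Extension(
        'megatron.data.helpers',
        sources=['megatron/data/helpers.cpp'],
        include_dirs=[pybind11.get_include()],
        extra_compile_args=['-O3', '-std=c++11'],
        language='c++',
    )

    setup(name='helpers', ext_modules=[ext])

Running `python setup_helpers.py build_ext --inplace` would then drop the compiled shared object into megatron/data/ so the relative import resolves.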
megatron/data/indexed_dataset.py  (view file @ 690291dd)
...
@@ -18,6 +18,7 @@ from itertools import accumulate
 import numpy as np
 import torch
+from megatron.utils import print_rank_0


 def __best_fitting_dtype(vocab_size=None):
     if vocab_size is not None and vocab_size < 65500:
...
@@ -317,7 +318,7 @@ class IndexedDatasetBuilder(object):

 def _warmup_mmap_file(path):
     with open(path, 'rb') as stream:
-        while stream.read(1 * 1024 * 1024):
+        while stream.read(100 * 1024 * 1024):
             pass
...
@@ -390,17 +391,17 @@ class MMapIndexedDataset(torch.utils.data.Dataset):
                 offset = stream.tell()

             if not skip_warmup:
-                print("> Warming up index mmap file...")
+                print_rank_0("> Warming up index mmap file...")
                 _warmup_mmap_file(path)

             self._bin_buffer_mmap = np.memmap(path, mode='r', order='C')
             self._bin_buffer = memoryview(self._bin_buffer_mmap)
-            print("> Reading sizes...")
+            print_rank_0("> Reading sizes...")
             self._sizes = np.frombuffer(self._bin_buffer, dtype=np.int32,
                                         count=self._len, offset=offset)
-            print("> Reading pointers...")
+            print_rank_0("> Reading pointers...")
             self._pointers = np.frombuffer(self._bin_buffer, dtype=np.int64,
                                            count=self._len,
                                            offset=offset + self._sizes.nbytes)
-            print("> Reading document index...")
+            print_rank_0("> Reading document index...")
             self._doc_idx = np.frombuffer(self._bin_buffer, dtype=np.int64,
                                           count=self._doc_count,
                                           offset=offset + self._sizes.nbytes + self._pointers.nbytes)

         def __del__(self):
...
@@ -446,13 +447,13 @@ class MMapIndexedDataset(torch.utils.data.Dataset):
         self._index = self.Index(index_file_path(self._path), skip_warmup)

         if not skip_warmup:
-            print("> Warming up data mmap file...")
+            print_rank_0("> Warming up data mmap file...")
             _warmup_mmap_file(data_file_path(self._path))
-        print("> Creating numpy buffer of mmap...")
+        print_rank_0("> Creating numpy buffer of mmap...")
         self._bin_buffer_mmap = np.memmap(data_file_path(self._path), mode='r', order='C')
-        print("> Creating memory view of numpy buffer...")
+        print_rank_0("> Creating memory view of numpy buffer...")
         self._bin_buffer = memoryview(self._bin_buffer_mmap)
-        print("> Done")
+        print_rank_0("> Done")

     def __del__(self):
         self._bin_buffer_mmap._mmap.close()
...
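All the bare print calls replaced above now go through print_rank_0 from megatron.utils, so a multi-process data-parallel job logs each message once instead of once per rank. A minimal sketch of such a rank-guarded helper (the real implementation in megatron.utils may differ in details):

    import torch

    def print_rank_0(message):
        # Only global rank 0 prints; fall back to a plain print when
        # torch.distributed has not been initialized.
        if torch.distributed.is_initialized():
            if torch.distributed.get_rank() == 0:
                print(message, flush=True)
        else:
            print(message, flush=True)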