Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Megatron-LM
Commits
7bd2a3c5
Commit
7bd2a3c5
authored
Apr 13, 2020
by
Neel Kant
Browse files
Add build_blocks_mapping code to helpers.cpp
parent
064bdc46
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
209 additions
and
0 deletions
+209
-0
megatron/data/helpers.cpp
megatron/data/helpers.cpp
+209
-0
No files found.
megatron/data/helpers.cpp
View file @
7bd2a3c5
...
@@ -304,7 +304,216 @@ py::array build_mapping(const py::array_t<int64_t>& docs_,
...
@@ -304,7 +304,216 @@ py::array build_mapping(const py::array_t<int64_t>& docs_,
}
}
}
}
template
<
typename
DocIdx
>
py
::
array
build_blocks_mapping_impl
(
const
py
::
array_t
<
int64_t
>&
docs_
,
const
py
::
array_t
<
int32_t
>&
sizes_
,
const
py
::
array_t
<
int32_t
>&
titles_sizes_
,
const
int32_t
num_epochs
,
const
uint64_t
max_num_samples
,
const
int32_t
max_seq_length
,
const
int32_t
seed
,
const
bool
verbose
)
{
/* Build a mapping of (start-index, end-index, sequence-length) where
start and end index are the indices of the sentences in the sample
and sequence-length is the target sequence length.
*/
// Consistency checks.
assert
(
num_epochs
>
0
);
assert
(
max_seq_length
>
1
);
assert
(
seed
>
0
);
// Remove bound checks.
auto
docs
=
docs_
.
unchecked
<
1
>
();
auto
sizes
=
sizes_
.
unchecked
<
1
>
();
auto
titles_sizes
=
titles_sizes_
.
unchecked
<
1
>
();
if
(
verbose
)
{
const
auto
sent_start_index
=
docs
[
0
];
const
auto
sent_end_index
=
docs
[
docs_
.
shape
(
0
)
-
1
];
const
auto
num_sentences
=
sent_end_index
-
sent_start_index
;
cout
<<
" using:"
<<
endl
<<
std
::
flush
;
cout
<<
" number of documents: "
<<
docs_
.
shape
(
0
)
-
1
<<
endl
<<
std
::
flush
;
cout
<<
" sentences range: ["
<<
sent_start_index
<<
", "
<<
sent_end_index
<<
")"
<<
endl
<<
std
::
flush
;
cout
<<
" total number of sentences: "
<<
num_sentences
<<
endl
<<
std
::
flush
;
cout
<<
" number of epochs: "
<<
num_epochs
<<
endl
<<
std
::
flush
;
cout
<<
" maximum number of samples: "
<<
max_num_samples
<<
endl
<<
std
::
flush
;
cout
<<
" maximum sequence length: "
<<
max_seq_length
<<
endl
<<
std
::
flush
;
cout
<<
" seed: "
<<
seed
<<
endl
<<
std
::
flush
;
}
// Mapping and its length (1D).
int64_t
num_samples
=
-
1
;
DocIdx
*
maps
=
NULL
;
// Perform two iterations, in the first iteration get the size
// and allocate memory and in the second iteration populate the map.
bool
second
=
false
;
for
(
int32_t
iteration
=
0
;
iteration
<
2
;
++
iteration
)
{
// Set the flag on second iteration.
second
=
(
iteration
==
1
);
// Counters:
uint64_t
empty_docs
=
0
;
uint64_t
one_sent_docs
=
0
;
uint64_t
long_sent_docs
=
0
;
// Current map index.
uint64_t
map_index
=
0
;
// For each epoch:
for
(
int32_t
epoch
=
0
;
epoch
<
num_epochs
;
++
epoch
)
{
if
(
map_index
>=
max_num_samples
)
{
if
(
verbose
&&
(
!
second
))
{
cout
<<
" reached "
<<
max_num_samples
<<
" samples after "
<<
epoch
<<
" epochs ..."
<<
endl
<<
std
::
flush
;
}
break
;
}
// For each document:
for
(
int32_t
doc
=
0
;
doc
<
(
docs
.
shape
(
0
)
-
1
);
++
doc
)
{
// Document sentences are in [sent_index_first, sent_index_last)
const
auto
sent_index_first
=
docs
[
doc
];
const
auto
sent_index_last
=
docs
[
doc
+
1
];
const
auto
target_seq_len
=
max_seq_length
-
titles_sizes
[
doc
]
// At the begining of the document previous index is the
// start index.
auto
prev_start_index
=
sent_index_first
;
// Remaining documents.
auto
num_remain_sent
=
sent_index_last
-
sent_index_first
;
// Detect documents with long sentences.
bool
contains_long_sentence
=
false
;
if
(
num_remain_sent
>
1
)
{
for
(
auto
sent_index
=
sent_index_first
;
sent_index
<
sent_index_last
;
++
sent_index
)
{
if
(
sizes
[
sent_index
]
>
LONG_SENTENCE_LEN
){
contains_long_sentence
=
true
;
break
;
}
}
}
// If we have more than two sentences.
if
((
num_remain_sent
>
1
)
&&
(
!
contains_long_sentence
))
{
// Set values.
auto
seq_len
=
int32_t
{
0
};
auto
num_sent
=
int32_t
{
0
};
// Loop through sentences.
for
(
auto
sent_index
=
sent_index_first
;
sent_index
<
sent_index_last
;
++
sent_index
)
{
// Add the size and number of sentences.
seq_len
+=
sizes
[
sent_index
];
++
num_sent
;
--
num_remain_sent
;
// If we have reached the target length.
// and if not only one sentence is left in the document.
// and if we have at least two sentneces.
// or if we have reached end of the document.
if
(((
seq_len
>=
target_seq_len
)
&&
(
num_remain_sent
>
1
)
&&
(
num_sent
>
1
)
)
||
(
num_remain_sent
==
0
))
{
// Populate the map.
if
(
second
)
{
const
auto
map_index_0
=
3
*
map_index
;
maps
[
map_index_0
]
=
static_cast
<
DocIdx
>
(
prev_start_index
);
maps
[
map_index_0
+
1
]
=
static_cast
<
DocIdx
>
(
sent_index
+
1
);
maps
[
map_index_0
+
2
]
=
static_cast
<
DocIdx
>
(
target_seq_len
);
}
// Update indices / counters.
++
map_index
;
prev_start_index
=
sent_index
+
1
;
seq_len
=
0
;
num_sent
=
0
;
}
}
// for (auto sent_index=sent_index_first; ...
}
// if (num_remain_sent > 1) {
}
// for (int doc=0; doc < num_docs; ++doc) {
}
// for (int epoch=0; epoch < num_epochs; ++epoch) {
if
(
!
second
)
{
if
(
verbose
)
{
cout
<<
" will create mapping for "
<<
map_index
<<
" samples"
<<
endl
<<
std
::
flush
;
}
assert
(
maps
==
NULL
);
assert
(
num_samples
<
0
);
maps
=
new
DocIdx
[
3
*
map_index
];
num_samples
=
static_cast
<
int64_t
>
(
map_index
);
}
}
// for (int iteration=0; iteration < 2; ++iteration) {
// Shuffle.
// We need a 64 bit random number generator as we might have more
// than 2 billion samples.
std
::
mt19937_64
rand64_gen
(
seed
+
1
);
for
(
auto
i
=
(
num_samples
-
1
);
i
>
0
;
--
i
)
{
const
auto
j
=
static_cast
<
int64_t
>
(
rand64_gen
()
%
(
i
+
1
));
const
auto
i0
=
3
*
i
;
const
auto
j0
=
3
*
j
;
// Swap values.
swap
(
maps
[
i0
],
maps
[
j0
]);
swap
(
maps
[
i0
+
1
],
maps
[
j0
+
1
]);
swap
(
maps
[
i0
+
2
],
maps
[
j0
+
2
]);
}
// Method to deallocate memory.
py
::
capsule
free_when_done
(
maps
,
[](
void
*
mem_
)
{
DocIdx
*
mem
=
reinterpret_cast
<
DocIdx
*>
(
mem_
);
delete
[]
mem
;
});
// Return the numpy array.
const
auto
byte_size
=
sizeof
(
DocIdx
);
return
py
::
array
(
std
::
vector
<
int64_t
>
{
num_samples
,
3
},
// shape
{
3
*
byte_size
,
byte_size
},
// C-style contiguous strides
maps
,
// the data pointer
free_when_done
);
// numpy array references
}
py
::
array
build_blocks_mapping
(
const
py
::
array_t
<
int64_t
>&
docs_
,
const
py
::
array_t
<
int
>&
sizes_
,
const
py
::
array_t
<
int
>&
titles_sizes_
,
const
int
num_epochs
,
const
uint64_t
max_num_samples
,
const
int
max_seq_length
,
const
int
seed
,
const
bool
verbose
)
{
if
(
sizes_
.
size
()
>
std
::
numeric_limits
<
uint32_t
>::
max
())
{
if
(
verbose
)
{
cout
<<
" using uint64 for data mapping..."
<<
endl
<<
std
::
flush
;
}
return
build_blocks_mapping_impl
<
uint64_t
>
(
docs_
,
sizes_
,
titles_sizes_
,
num_epochs
,
max_num_samples
,
max_seq_length
,
seed
,
verbose
);
}
else
{
if
(
verbose
)
{
cout
<<
" using uint32 for data mapping..."
<<
endl
<<
std
::
flush
;
}
return
build_blocks_mapping_impl
<
uint32_t
>
(
docs_
,
sizes_
,
titles_sizes_
,
num_epochs
,
max_num_samples
,
max_seq_length
,
seed
,
verbose
);
}
}
PYBIND11_MODULE
(
helpers
,
m
)
{
PYBIND11_MODULE
(
helpers
,
m
)
{
m
.
def
(
"build_mapping"
,
&
build_mapping
);
m
.
def
(
"build_mapping"
,
&
build_mapping
);
m
.
def
(
"build_blocks_mapping"
,
&
build_blocks_mapping
)
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment