Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Megatron-LM
Commits
2c58c9b0
Commit
2c58c9b0
authored
Dec 24, 2019
by
Mohammad Shoeybi
Browse files
added filtering based on sentence length
parent
75a86a1d
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
21 additions
and
1 deletion
+21
-1
megatron/data/helpers.cpp
megatron/data/helpers.cpp
+21
-1
No files found.
megatron/data/helpers.cpp
View file @
2c58c9b0
...
@@ -30,6 +30,8 @@
...
@@ -30,6 +30,8 @@
namespace
py
=
pybind11
;
namespace
py
=
pybind11
;
using
namespace
std
;
using
namespace
std
;
const
int32_t
LONG_SENTENCE_LEN
=
256
;
inline
int32_t
get_target_sample_len
(
const
int32_t
short_seq_ratio
,
inline
int32_t
get_target_sample_len
(
const
int32_t
short_seq_ratio
,
const
int32_t
max_length
,
const
int32_t
max_length
,
...
@@ -114,6 +116,7 @@ py::array build_mapping_impl(const py::array_t<int64_t>& docs_,
...
@@ -114,6 +116,7 @@ py::array build_mapping_impl(const py::array_t<int64_t>& docs_,
// Counters:
// Counters:
uint64_t
empty_docs
=
0
;
uint64_t
empty_docs
=
0
;
uint64_t
one_sent_docs
=
0
;
uint64_t
one_sent_docs
=
0
;
uint64_t
long_sent_docs
=
0
;
// Current map index.
// Current map index.
uint64_t
map_index
=
0
;
uint64_t
map_index
=
0
;
...
@@ -151,8 +154,23 @@ py::array build_mapping_impl(const py::array_t<int64_t>& docs_,
...
@@ -151,8 +154,23 @@ py::array build_mapping_impl(const py::array_t<int64_t>& docs_,
}
}
}
}
// Detect documents with long sentences
bool
contains_long_sentence
=
false
;
if
(
num_remain_sent
>
1
)
{
for
(
auto
sent_index
=
sent_index_first
;
sent_index
<
sent_index_last
;
++
sent_index
)
{
if
(
sizes
[
sent_index
]
>
LONG_SENTENCE_LEN
){
if
((
epoch
==
0
)
&&
(
!
second
))
{
++
long_sent_docs
;
}
contains_long_sentence
=
true
;
break
;
}
}
}
// If we have more than one sentence.
// If we have more than one sentence.
if
(
num_remain_sent
>
1
)
{
if
(
(
num_remain_sent
>
1
)
&&
(
!
contains_long_sentence
))
{
// Set values.
// Set values.
auto
seq_len
=
int32_t
{
0
};
auto
seq_len
=
int32_t
{
0
};
...
@@ -217,6 +235,8 @@ py::array build_mapping_impl(const py::array_t<int64_t>& docs_,
...
@@ -217,6 +235,8 @@ py::array build_mapping_impl(const py::array_t<int64_t>& docs_,
endl
<<
std
::
flush
;
endl
<<
std
::
flush
;
cout
<<
" number of documents with one sentence: "
<<
cout
<<
" number of documents with one sentence: "
<<
one_sent_docs
<<
endl
<<
std
::
flush
;
one_sent_docs
<<
endl
<<
std
::
flush
;
cout
<<
" number of documents with long sentences: "
<<
long_sent_docs
<<
endl
<<
std
::
flush
;
cout
<<
" will create mapping for "
<<
map_index
<<
cout
<<
" will create mapping for "
<<
map_index
<<
" samples"
<<
endl
<<
std
::
flush
;
" samples"
<<
endl
<<
std
::
flush
;
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment