chenpangpang / transformers · Commit 594202a9

Authored Oct 02, 2019 by VictorSanh; committed by Victor SANH on Oct 03, 2019

lm_seqs_dataset

parent 38084507
Showing 1 changed file with 37 additions and 87 deletions.

examples/distillation/dataset.py → examples/distillation/lm_seqs_dataset.py  (+37, -87)
@@ -12,30 +12,33 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Dataloaders to train DistilBERT
+""" Dataset to distilled models
 adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
 """
-from typing import List
-import math
-from itertools import chain
-from collections import Counter
-import numpy as np
 import torch
+from torch.utils.data import Dataset
+
+import numpy as np
 
 from utils import logger
 
 
-class Dataset:
+class LmSeqsDataset(Dataset):
+    """Custom Dataset wrapping language modeling sequences.
+
+    Each sample will be retrieved by indexing the list of token_ids and their corresponding lengths.
+
+    Input:
+    ------
+        params: `NameSpace` parameters
+        data: `List[np.array[int]]
+    """
     def __init__(self,
                  params,
                  data):
         self.params = params
-        self.tokens_per_batch = params.tokens_per_batch
-        self.batch_size = params.batch_size
-        self.shuffle = params.shuffle
-        self.group_by_size = params.group_by_size
 
         self.token_ids = np.array(data)
-        self.lengths = np.uint16([len(t) for t in data])
+        self.lengths = np.array([len(t) for t in data])
 
         self.check()
         self.remove_long_sequences()
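The class is renamed from Dataset to LmSeqsDataset and now subclasses torch.utils.data.Dataset, so the batching knobs (tokens_per_batch, batch_size, shuffle, group_by_size) move off the dataset and into whatever sampler or loader drives it. A minimal construction sketch, assuming the examples/distillation directory is on the path; the params namespace and token sequences below are hypothetical stand-ins for what the training script actually builds:

from types import SimpleNamespace
import numpy as np
from lm_seqs_dataset import LmSeqsDataset

# Hypothetical parameters; the real ones come from the training script's
# argument parser and carry many more fields.
params = SimpleNamespace(
    mlm=True,
    is_master=False,  # assumption: print_statistics() only logs on the master process
    max_model_input_size=512,
    special_tok_ids={'cls_token': 101, 'sep_token': 102, 'pad_token': 0},
)

# Hypothetical pre-binarized corpus: each sequence starts with [CLS] (101)
# and ends with [SEP] (102), as remove_long_sequences() now asserts.
data = [np.array([101, 7592, 2088, 102]),
        np.array([101, 2023, 2003, 1037, 2936, 6251, 102])]

dataset = LmSeqsDataset(params=params, data=data)
print(len(dataset))  # 2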
@@ -43,6 +46,9 @@ class Dataset:
         self.check()
         self.print_statistics()
 
+    def __getitem__(self, index):
+        return (self.token_ids[index], self.lengths[index])
+
     def __len__(self):
         return len(self.lengths)
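With __getitem__ and __len__ in place, every sample is a (token_ids, length) pair, which is exactly what lets a standard torch DataLoader replace the hand-rolled get_iterator/get_batches_iterator machinery removed at the bottom of this diff: batch_sequences becomes the collate_fn. A sketch of the intended wiring, assuming a dataset built as above (the actual training script may use different sampler settings):

from torch.utils.data import DataLoader, RandomSampler

# batch_sequences pads a list of (token_ids, length) samples into a
# (bs, max_seq_len_) token tensor and a (bs,) length tensor.
dataloader = DataLoader(dataset=dataset,
                        sampler=RandomSampler(dataset),
                        batch_size=32,
                        collate_fn=dataset.batch_sequences)

for token_ids, lengths in dataloader:
    pass  # token_ids: (bs, max_seq_len_), lengths: (bs,)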
@@ -51,12 +57,14 @@ class Dataset:
         Some sanity checks
         """
         assert len(self.token_ids) == len(self.lengths)
+        assert all(self.lengths[i] == len(self.token_ids[i]) for i in range(len(self.lengths)))
 
     def remove_long_sequences(self):
         """
-        Sequences that are too long are splitted by chunk of max_position_embeddings.
+        Sequences that are too long are splitted by chunk of max_model_input_size.
         """
-        indices = self.lengths >= self.params.max_position_embeddings
+        max_len = self.params.max_model_input_size
+        indices = self.lengths > max_len
         logger.info(f'Splitting {sum(indices)} too long sequences.')
 
         def divide_chunks(l, n):
@@ -64,10 +72,13 @@ class Dataset:
         new_tok_ids = []
         new_lengths = []
         if self.params.mlm:
             cls_id, sep_id = self.params.special_tok_ids['cls_token'], self.params.special_tok_ids['sep_token']
-            max_len = self.params.max_position_embeddings
+        else:
+            cls_id, sep_id = self.params.special_tok_ids['bos_token'], self.params.special_tok_ids['eos_token']
 
         for seq_, len_ in zip(self.token_ids, self.lengths):
+            assert (seq_[0] == cls_id) and (seq_[-1] == sep_id), seq_
             if len_ <= max_len:
                 new_tok_ids.append(seq_)
                 new_lengths.append(len_)
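The new else branch makes the splitter work for causal-LM students as well: BERT-style (MLM) corpora are bounded by [CLS]/[SEP], while GPT-2-style corpora use BOS/EOS, so the boundary ids are picked per mode. One plausible way to build the special_tok_ids mapping this code expects from a transformers tokenizer (an assumption; the training script may construct it differently):

from transformers import BertTokenizer  # or GPT2Tokenizer for the CLM case

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Map each special-token name ('cls_token', 'sep_token', 'pad_token', ...)
# to its vocabulary id.
special_tok_ids = {name: tokenizer.convert_tokens_to_ids(tok)
                   for name, tok in tokenizer.special_tokens_map.items()}
print(special_tok_ids['cls_token'], special_tok_ids['sep_token'])  # 101 102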
@@ -79,6 +90,7 @@ class Dataset:
                     if sub_s[-1] != sep_id:
                         sub_s = np.insert(sub_s, len(sub_s), sep_id)
                     assert len(sub_s) <= max_len
+                    assert (sub_s[0] == cls_id) and (sub_s[-1] == sep_id), sub_s
                     sub_seqs.append(sub_s)
                 new_tok_ids.extend(sub_seqs)
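The assert added above guards the invariant that every chunk produced by the (elided) splitting loop still starts with the cls/bos id and ends with the sep/eos id. A standalone sketch of that behavior under those assumptions, illustrative rather than the file's exact elided code:

import numpy as np

def divide_chunks(l, n):
    # Yield successive n-sized chunks of l, as the elided helper presumably does.
    for i in range(0, len(l), n):
        yield l[i:i + n]

cls_id, sep_id, max_len = 101, 102, 6  # hypothetical ids and limit
seq = np.array([101, 11, 12, 13, 14, 15, 16, 17, 102])  # length 9 > max_len

sub_seqs = []
for sub_s in divide_chunks(seq, max_len - 2):  # leave room for cls/sep
    if sub_s[0] != cls_id:
        sub_s = np.insert(sub_s, 0, cls_id)
    if sub_s[-1] != sep_id:
        sub_s = np.insert(sub_s, len(sub_s), sep_id)
    assert len(sub_s) <= max_len
    assert (sub_s[0] == cls_id) and (sub_s[-1] == sep_id), sub_s
    sub_seqs.append(sub_s)

print([s.tolist() for s in sub_seqs])
# [[101, 11, 12, 13, 102], [101, 14, 15, 16, 17, 102], [101, 102]]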
@@ -113,89 +125,27 @@ class Dataset:
         # nb_unkown = sum([(t==unk_idx).sum() for t in self.token_ids])
         # logger.info(f'{nb_unkown} unknown tokens (covering {100*nb_unkown/data_len:.2f}% of the data)')
 
-    def select_data(self, a: int, b: int):
-        """
-        Select a subportion of the data.
-        """
-        n_sequences = len(self)
-        assert 0 <= a < b <= n_sequences, ValueError(f'`0 <= a < b <= n_sequences` is not met with a={a} and b={b}')
-
-        logger.info(f'Selecting sequences from {a} to {b} (excluded).')
-        self.token_ids = self.token_ids[a:b]
-        self.lengths = self.lengths[a:b]
-
-        self.check()
-
-    def split(self):
-        """
-        Distributed training: split the data accross the processes.
-        """
-        assert self.params.n_gpu > 1
-        logger.info('Splitting the data accross the processuses.')
-        n_seq = len(self)
-        n_seq_per_procesus = n_seq // self.params.world_size
-        a = n_seq_per_procesus * self.params.global_rank
-        b = a + n_seq_per_procesus
-        self.select_data(a=a, b=b)
-
     def batch_sequences(self,
-                        token_ids: List[List[int]],
-                        lengths: List[int]):
+                        batch):
         """
         Do the padding and transform into torch.tensor.
         """
+        token_ids = [t[0] for t in batch]
+        lengths = [t[1] for t in batch]
         assert len(token_ids) == len(lengths)
 
         # Max for paddings
         max_seq_len_ = max(lengths)
 
         # Pad token ids
         if self.params.mlm:
             pad_idx = self.params.special_tok_ids['pad_token']
+        else:
+            pad_idx = self.params.special_tok_ids['unk_token']
         tk_ = [list(t.astype(int)) + [pad_idx]*(max_seq_len_-len(t)) for t in token_ids]
         assert len(tk_) == len(token_ids)
         assert all(len(t) == max_seq_len_ for t in tk_)
 
         tk_t = torch.tensor(tk_)                  # (bs, max_seq_len_)
-        lg_t = torch.tensor(lengths.astype(int))  # (bs)
+        lg_t = torch.tensor(lengths)              # (bs)
         return tk_t, lg_t
-
-    def get_batches_iterator(self, batches):
-        """
-        Return an iterator over batches.
-        """
-        for sequences_ids in batches:
-            token_ids, lengths = self.batch_sequences(self.token_ids[sequences_ids], self.lengths[sequences_ids])
-            yield (token_ids, lengths)
-
-    def get_iterator(self, seed: int = None):
-        """
-        Return a data iterator.
-        """
-        rng = np.random.RandomState(seed)
-
-        n_sequences = len(self)
-        indices = np.arange(n_sequences)
-
-        if self.group_by_size:
-            indices = indices[np.argsort(self.lengths[indices], kind='mergesort')]
-
-        if self.tokens_per_batch == -1:
-            batches = np.array_split(indices, math.ceil(len(indices) * 1. / self.batch_size))
-        else:
-            assert self.tokens_per_batch > 0
-            batch_ids = np.cumsum(self.lengths[indices]) // self.tokens_per_batch
-            _, bounds = np.unique(batch_ids, return_index=True)
-            batches = [indices[bounds[i]:bounds[i + 1]] for i in range(len(bounds) - 1)]
-            if bounds[-1] < len(indices):
-                batches.append(indices[bounds[-1]:])
-
-        if self.shuffle:
-            rng.shuffle(batches)
-
-        assert n_sequences == sum([len(x) for x in batches])
-        assert self.lengths[indices].sum() == sum([self.lengths[x].sum() for x in batches])
-
-        return self.get_batches_iterator(batches=batches)
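batch_sequences now takes the raw list of (token_ids, length) samples that a DataLoader hands to its collate_fn, pads every sequence to the longest in the batch, and returns two tensors; the removed select_data/split/get_iterator machinery is superseded by standard samplers. A small worked example of the padding arithmetic with hypothetical ids (pad_idx stands in for the tokenizer's pad-token id):

import numpy as np
import torch

# A collated batch of two (token_ids, length) samples.
batch = [(np.array([101, 11, 12, 102]), 4),
         (np.array([101, 21, 102]), 3)]

token_ids = [t[0] for t in batch]
lengths = [t[1] for t in batch]
pad_idx = 0                   # e.g. BERT's [PAD]
max_seq_len_ = max(lengths)   # 4

tk_ = [list(t.astype(int)) + [pad_idx] * (max_seq_len_ - len(t)) for t in token_ids]
tk_t = torch.tensor(tk_)      # tensor([[101, 11, 12, 102], [101, 21, 102, 0]])
lg_t = torch.tensor(lengths)  # tensor([4, 3])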