dcuai / dlexamples · Commits

Commit c0f05c10, authored Nov 29, 2022 by hepj
Update Transformer code

parent c056df78

The commit changes 321 files in total; this page lists 20 of them, with 1712 additions and 0 deletions (+1712, -0).
Changed files on this page:

PyTorch/NLP/new-Transformer/fairseq/data/noising.py                    +334  -0
PyTorch/NLP/new-Transformer/fairseq/data/num_samples_dataset.py        +17   -0
PyTorch/NLP/new-Transformer/fairseq/data/numel_dataset.py              +31   -0
PyTorch/NLP/new-Transformer/fairseq/data/offset_tokens_dataset.py      +15   -0
PyTorch/NLP/new-Transformer/fairseq/data/pad_dataset.py                +31   -0
PyTorch/NLP/new-Transformer/fairseq/data/plasma_utils.py               +197  -0
PyTorch/NLP/new-Transformer/fairseq/data/prepend_dataset.py            +28   -0
PyTorch/NLP/new-Transformer/fairseq/data/prepend_token_dataset.py      +41   -0
PyTorch/NLP/new-Transformer/fairseq/data/raw_label_dataset.py          +23   -0
PyTorch/NLP/new-Transformer/fairseq/data/replace_dataset.py            +36   -0
PyTorch/NLP/new-Transformer/fairseq/data/resampling_dataset.py         +139  -0
PyTorch/NLP/new-Transformer/fairseq/data/roll_dataset.py               +18   -0
PyTorch/NLP/new-Transformer/fairseq/data/round_robin_zip_datasets.py   +160  -0
PyTorch/NLP/new-Transformer/fairseq/data/shorten_dataset.py            +78   -0
PyTorch/NLP/new-Transformer/fairseq/data/sort_dataset.py               +21   -0
PyTorch/NLP/new-Transformer/fairseq/data/strip_token_dataset.py        +20   -0
PyTorch/NLP/new-Transformer/fairseq/data/subsample_dataset.py          +72   -0
PyTorch/NLP/new-Transformer/fairseq/data/text_compressor.py            +58   -0
PyTorch/NLP/new-Transformer/fairseq/data/token_block_dataset.py        +206  -0
PyTorch/NLP/new-Transformer/fairseq/data/token_block_utils_fast.pyx    +187  -0
PyTorch/NLP/new-Transformer/fairseq/data/noising.py (new file, 0 → 100644)

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import numpy as np
import torch

from fairseq.data import data_utils


class WordNoising(object):
    """Generate a noisy version of a sentence, without changing words themselves."""

    def __init__(self, dictionary, bpe_cont_marker="@@", bpe_end_marker=None):
        self.dictionary = dictionary
        self.bpe_end = None
        if bpe_cont_marker:
            self.bpe_end = np.array(
                [
                    not self.dictionary[i].endswith(bpe_cont_marker)
                    for i in range(len(self.dictionary))
                ]
            )
        elif bpe_end_marker:
            self.bpe_end = np.array(
                [
                    self.dictionary[i].endswith(bpe_end_marker)
                    for i in range(len(self.dictionary))
                ]
            )

        self.get_word_idx = (
            self._get_bpe_word_idx if self.bpe_end is not None else self._get_token_idx
        )

    def noising(self, x, lengths, noising_prob=0.0):
        raise NotImplementedError()

    def _get_bpe_word_idx(self, x):
        """
        Given a list of BPE tokens, for every index in the tokens list,
        return the index of the word grouping that it belongs to.
        For example, for input x corresponding to ["how", "are", "y@@", "ou"],
        return [[0], [1], [2], [2]].
        """
        # x: (T x B)
        bpe_end = self.bpe_end[x]

        if x.size(0) == 1 and x.size(1) == 1:
            # Special case when we only have one word in x. If x = [[N]],
            # bpe_end is a scalar (bool) instead of a 2-dim array of bools,
            # which makes the sum operation below fail.
            return np.array([[0]])

        # do a reduce front sum to generate word ids
        word_idx = bpe_end[::-1].cumsum(0)[::-1]
        word_idx = word_idx.max(0)[None, :] - word_idx
        return word_idx

    def _get_token_idx(self, x):
        """
        This is to extend noising functions to be able to apply to non-bpe
        tokens, e.g. word or characters.
        """
        x = torch.t(x)
        word_idx = np.array([range(len(x_i)) for x_i in x])
        return np.transpose(word_idx)


class WordDropout(WordNoising):
    """Randomly drop input words. If not passing blank_idx (default is None),
    then dropped words will be removed. Otherwise, it will be replaced by the
    blank_idx."""

    def __init__(
        self,
        dictionary,
        default_dropout_prob=0.1,
        bpe_cont_marker="@@",
        bpe_end_marker=None,
    ):
        super().__init__(dictionary, bpe_cont_marker, bpe_end_marker)
        self.default_dropout_prob = default_dropout_prob

    def noising(self, x, lengths, dropout_prob=None, blank_idx=None):
        if dropout_prob is None:
            dropout_prob = self.default_dropout_prob
        # x: (T x B), lengths: B
        if dropout_prob == 0:
            return x, lengths

        assert 0 < dropout_prob < 1

        # be sure to drop entire words
        word_idx = self.get_word_idx(x)
        sentences = []
        modified_lengths = []
        for i in range(lengths.size(0)):
            # Since dropout probabilities need to apply over non-pad tokens,
            # it is not trivial to generate the keep mask without consider
            # input lengths; otherwise, this could be done outside the loop

            # We want to drop whole words based on word_idx grouping
            num_words = max(word_idx[:, i]) + 1

            # ith example: [x0, x1, ..., eos, pad, ..., pad]
            # We should only generate keep probs for non-EOS tokens. Thus if the
            # input sentence ends in EOS, the last word idx is not included in
            # the dropout mask generation and we append True to always keep EOS.
            # Otherwise, just generate the dropout mask for all word idx
            # positions.
            has_eos = x[lengths[i] - 1, i] == self.dictionary.eos()
            if has_eos:  # has eos?
                keep = np.random.rand(num_words - 1) >= dropout_prob
                keep = np.append(keep, [True])  # keep EOS symbol
            else:
                keep = np.random.rand(num_words) >= dropout_prob

            words = x[: lengths[i], i].tolist()

            # TODO: speed up the following loop
            # drop words from the input according to keep
            new_s = [
                w if keep[word_idx[j, i]] else blank_idx for j, w in enumerate(words)
            ]
            new_s = [w for w in new_s if w is not None]
            # we need to have at least one word in the sentence (more than the
            # start / end sentence symbols)
            if len(new_s) <= 1:
                # insert at beginning in case the only token left is EOS
                # EOS should be at end of list.
                new_s.insert(0, words[np.random.randint(0, len(words))])
            assert len(new_s) >= 1 and (
                not has_eos  # Either don't have EOS at end or last token is EOS
                or (len(new_s) >= 2 and new_s[-1] == self.dictionary.eos())
            ), "New sentence is invalid."
            sentences.append(new_s)
            modified_lengths.append(len(new_s))
        # re-construct input
        modified_lengths = torch.LongTensor(modified_lengths)
        modified_x = torch.LongTensor(
            modified_lengths.max(), modified_lengths.size(0)
        ).fill_(self.dictionary.pad())
        for i in range(modified_lengths.size(0)):
            modified_x[: modified_lengths[i], i].copy_(torch.LongTensor(sentences[i]))

        return modified_x, modified_lengths


class WordShuffle(WordNoising):
    """Shuffle words by no more than k positions."""

    def __init__(
        self,
        dictionary,
        default_max_shuffle_distance=3,
        bpe_cont_marker="@@",
        bpe_end_marker=None,
    ):
        super().__init__(dictionary, bpe_cont_marker, bpe_end_marker)
        self.default_max_shuffle_distance = 3

    def noising(self, x, lengths, max_shuffle_distance=None):
        if max_shuffle_distance is None:
            max_shuffle_distance = self.default_max_shuffle_distance
        # x: (T x B), lengths: B
        if max_shuffle_distance == 0:
            return x, lengths

        # max_shuffle_distance < 1 will return the same sequence
        assert max_shuffle_distance > 1

        # define noise word scores
        noise = np.random.uniform(
            0,
            max_shuffle_distance,
            size=(x.size(0), x.size(1)),
        )
        noise[0] = -1  # do not move start sentence symbol
        # be sure to shuffle entire words
        word_idx = self.get_word_idx(x)
        x2 = x.clone()
        for i in range(lengths.size(0)):
            length_no_eos = lengths[i]
            if x[lengths[i] - 1, i] == self.dictionary.eos():
                length_no_eos = lengths[i] - 1
            # generate a random permutation
            scores = word_idx[:length_no_eos, i] + noise[word_idx[:length_no_eos, i], i]
            # ensure no reordering inside a word
            scores += 1e-6 * np.arange(length_no_eos.item())
            permutation = scores.argsort()
            # shuffle words
            x2[:length_no_eos, i].copy_(
                x2[:length_no_eos, i][torch.from_numpy(permutation)]
            )
        return x2, lengths


class UnsupervisedMTNoising(WordNoising):
    """
    Implements the default configuration for noising in UnsupervisedMT
    (github.com/facebookresearch/UnsupervisedMT)
    """

    def __init__(
        self,
        dictionary,
        max_word_shuffle_distance,
        word_dropout_prob,
        word_blanking_prob,
        bpe_cont_marker="@@",
        bpe_end_marker=None,
    ):
        super().__init__(dictionary)
        self.max_word_shuffle_distance = max_word_shuffle_distance
        self.word_dropout_prob = word_dropout_prob
        self.word_blanking_prob = word_blanking_prob

        self.word_dropout = WordDropout(
            dictionary=dictionary,
            bpe_cont_marker=bpe_cont_marker,
            bpe_end_marker=bpe_end_marker,
        )
        self.word_shuffle = WordShuffle(
            dictionary=dictionary,
            bpe_cont_marker=bpe_cont_marker,
            bpe_end_marker=bpe_end_marker,
        )

    def noising(self, x, lengths):
        # 1. Word Shuffle
        noisy_src_tokens, noisy_src_lengths = self.word_shuffle.noising(
            x=x,
            lengths=lengths,
            max_shuffle_distance=self.max_word_shuffle_distance,
        )
        # 2. Word Dropout
        noisy_src_tokens, noisy_src_lengths = self.word_dropout.noising(
            x=noisy_src_tokens,
            lengths=noisy_src_lengths,
            dropout_prob=self.word_dropout_prob,
        )
        # 3. Word Blanking
        noisy_src_tokens, noisy_src_lengths = self.word_dropout.noising(
            x=noisy_src_tokens,
            lengths=noisy_src_lengths,
            dropout_prob=self.word_blanking_prob,
            blank_idx=self.dictionary.unk(),
        )

        return noisy_src_tokens


class NoisingDataset(torch.utils.data.Dataset):
    def __init__(
        self,
        src_dataset,
        src_dict,
        seed,
        noiser=None,
        noising_class=UnsupervisedMTNoising,
        **kwargs
    ):
        """
        Wrap a :class:`~torch.utils.data.Dataset` and apply noise to the
        samples based on the supplied noising configuration.

        Args:
            src_dataset (~torch.utils.data.Dataset): dataset to wrap.
                to build self.src_dataset --
                a LanguagePairDataset with src dataset as the source dataset and
                None as the target dataset. Should NOT have padding so that
                src_lengths are accurately calculated by language_pair_dataset
                collate function.
                We use language_pair_dataset here to encapsulate the tgt_dataset
                so we can re-use the LanguagePairDataset collater to format the
                batches in the structure that SequenceGenerator expects.
            src_dict (~fairseq.data.Dictionary): source dictionary
            seed (int): seed to use when generating random noise
            noiser (WordNoising): a pre-initialized :class:`WordNoising`
                instance. If this is None, a new instance will be created using
                *noising_class* and *kwargs*.
            noising_class (class, optional): class to use to initialize a
                default :class:`WordNoising` instance.
            kwargs (dict, optional): arguments to initialize the default
                :class:`WordNoising` instance given by *noiser*.
        """
        self.src_dataset = src_dataset
        self.src_dict = src_dict
        self.seed = seed
        self.noiser = (
            noiser
            if noiser is not None
            else noising_class(
                dictionary=src_dict,
                **kwargs,
            )
        )
        self.sizes = src_dataset.sizes

    def __getitem__(self, index):
        """
        Returns a single noisy sample. Multiple samples are fed to the collater
        create a noising dataset batch.
        """
        src_tokens = self.src_dataset[index]
        src_lengths = torch.LongTensor([len(src_tokens)])
        src_tokens = src_tokens.unsqueeze(0)

        # Transpose src tokens to fit expected shape of x in noising function
        # (batch size, sequence length) -> (sequence length, batch size)
        src_tokens_t = torch.t(src_tokens)

        with data_utils.numpy_seed(self.seed + index):
            noisy_src_tokens = self.noiser.noising(src_tokens_t, src_lengths)

        # Transpose back to expected src_tokens format
        # (sequence length, 1) -> (1, sequence length)
        noisy_src_tokens = torch.t(noisy_src_tokens)
        return noisy_src_tokens[0]

    def __len__(self):
        """
        The length of the noising dataset is the length of src.
        """
        return len(self.src_dataset)

    @property
    def supports_prefetch(self):
        return self.src_dataset.supports_prefetch

    def prefetch(self, indices):
        if self.src_dataset.supports_prefetch:
            self.src_dataset.prefetch(indices)
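
A minimal usage sketch of the noising pipeline above (an editor's illustration, not part of the commit). It assumes a fairseq Dictionary built inline; the sentence, probabilities and token layout are arbitrary example values.

# Usage sketch (assumes fairseq is installed; values are illustrative).
import torch
from fairseq.data import Dictionary

d = Dictionary()
for w in "the quick brown fox".split():
    d.add_symbol(w)

noiser = UnsupervisedMTNoising(
    dictionary=d,
    max_word_shuffle_distance=3,
    word_dropout_prob=0.1,
    word_blanking_prob=0.1,
)

# noising() expects tokens of shape (T x B) plus per-sentence lengths.
tokens = torch.LongTensor(
    [[d.index(w) for w in "the quick brown fox".split()] + [d.eos()]]
).t()
lengths = torch.LongTensor([tokens.size(0)])
noisy = noiser.noising(tokens, lengths)  # shuffled, dropped and blanked copy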
PyTorch/NLP/new-Transformer/fairseq/data/num_samples_dataset.py (new file, 0 → 100644)

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from . import FairseqDataset


class NumSamplesDataset(FairseqDataset):
    def __getitem__(self, index):
        return 1

    def __len__(self):
        return 0

    def collater(self, samples):
        return sum(samples)
PyTorch/NLP/new-Transformer/fairseq/data/numel_dataset.py (new file, 0 → 100644)

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import numpy as np
import torch

from . import BaseWrapperDataset


class NumelDataset(BaseWrapperDataset):
    def __init__(self, dataset, reduce=False):
        super().__init__(dataset)
        self.reduce = reduce

    def __getitem__(self, index):
        item = self.dataset[index]
        if torch.is_tensor(item):
            return torch.numel(item)
        else:
            return np.size(item)

    def __len__(self):
        return len(self.dataset)

    def collater(self, samples):
        if self.reduce:
            return sum(samples)
        else:
            return torch.tensor(samples)
PyTorch/NLP/new-Transformer/fairseq/data/offset_tokens_dataset.py (new file, 0 → 100644)

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from . import BaseWrapperDataset


class OffsetTokensDataset(BaseWrapperDataset):
    def __init__(self, dataset, offset):
        super().__init__(dataset)
        self.offset = offset

    def __getitem__(self, idx):
        return self.dataset[idx] + self.offset
PyTorch/NLP/new-Transformer/fairseq/data/pad_dataset.py (new file, 0 → 100644)

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from fairseq.data import data_utils

from . import BaseWrapperDataset


class PadDataset(BaseWrapperDataset):
    def __init__(self, dataset, pad_idx, left_pad, pad_length=None):
        super().__init__(dataset)
        self.pad_idx = pad_idx
        self.left_pad = left_pad
        self.pad_length = pad_length

    def collater(self, samples):
        return data_utils.collate_tokens(
            samples, self.pad_idx, left_pad=self.left_pad, pad_to_length=self.pad_length
        )


class LeftPadDataset(PadDataset):
    def __init__(self, dataset, pad_idx):
        super().__init__(dataset, pad_idx, left_pad=True)


class RightPadDataset(PadDataset):
    def __init__(self, dataset, pad_idx):
        super().__init__(dataset, pad_idx, left_pad=False)
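
A short sketch of how the padding collaters are meant to be used (editor's illustration, not part of the commit). The toy tensors and pad index are placeholders, and a plain Python list stands in for a real FairseqDataset since collater only touches the samples it is given.

import torch

samples = [torch.LongTensor([4, 5, 6, 2]), torch.LongTensor([7, 2])]
pad_ds = RightPadDataset(samples, pad_idx=1)
batch = pad_ds.collater(samples)
# batch -> tensor([[4, 5, 6, 2],
#                  [7, 2, 1, 1]])   right-padded with pad_idx=1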
PyTorch/NLP/new-Transformer/fairseq/data/plasma_utils.py (new file, 0 → 100644)

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import hashlib
import json
import subprocess
import tempfile
from typing import Hashable

try:
    import pyarrow.plasma as plasma

    PYARROW_AVAILABLE = True
except ImportError:
    plasma = None
    PYARROW_AVAILABLE = False


class PlasmaArray:
    """
    Wrapper around numpy arrays that automatically moves the data to shared
    memory upon serialization. This is particularly helpful when passing numpy
    arrays through multiprocessing, so that data is not unnecessarily
    duplicated or pickled.
    """

    def __init__(self, array):
        super().__init__()
        self.array = array
        self.disable = array.nbytes < 134217728  # disable for arrays <128MB
        self.object_id = None
        self.path = None

        # variables with underscores shouldn't be pickled
        self._client = None
        self._server = None
        self._server_tmp = None
        self._plasma = None

    @property
    def plasma(self):
        if self._plasma is None and not self.disable:
            self._plasma = plasma
        return self._plasma

    def start_server(self):
        if self.plasma is None or self._server is not None:
            return
        assert self.object_id is None
        assert self.path is None
        self._server_tmp = tempfile.NamedTemporaryFile()
        self.path = self._server_tmp.name
        self._server = subprocess.Popen(
            ["plasma_store", "-m", str(int(1.05 * self.array.nbytes)), "-s", self.path]
        )

    @property
    def client(self):
        if self._client is None:
            assert self.path is not None
            self._client = self.plasma.connect(self.path, num_retries=200)
        return self._client

    def __getstate__(self):
        """Called on pickle load"""
        if self.plasma is None:
            return self.__dict__
        if self.object_id is None:
            self.start_server()
            self.object_id = self.client.put(self.array)
        state = self.__dict__.copy()
        del state["array"]
        state["_client"] = None
        state["_server"] = None
        state["_server_tmp"] = None
        state["_plasma"] = None
        return state

    def __setstate__(self, state):
        """Called on pickle save"""
        self.__dict__.update(state)
        if self.plasma is None:
            return
        self.array = self.client.get(self.object_id)

    def __del__(self):
        if self._server is not None:
            self._server.kill()
            self._server = None
            self._server_tmp.close()
            self._server_tmp = None


DEFAULT_PLASMA_PATH = "/tmp/plasma"


class PlasmaView:
    """Interface to write and read from shared memory. Whereas PlasmaArray writes to plasma on serialization,
    PlasmaView writes to shared memory on instantiation."""

    def __init__(self, array, split_path: str, hash_data: Hashable, plasma_path=None):
        """
        Args:
            array: numpy array to store. This can be read with ``PlasmaView().array``
            split_path: the path whence the data was read, used for hashing
            hash_data: other metadata about the array that can be used to create a unique key.
                as of writing, the 3 callers in ``TokenBlockDataset`` use::

                    hash_data = ((block_size, document_sep_len, str(break_mode), len(dataset)), 0|1|2)

        """
        assert PYARROW_AVAILABLE
        assert split_path is not None
        if plasma_path is None:
            plasma_path = DEFAULT_PLASMA_PATH

        self.path = plasma_path
        self.split_path = split_path
        self._client = None  # Initialize lazily for pickle. plasma clients should not be deep copied or serialized.
        self._n = None

        self.object_id = self.get_object_id(self.split_path, hash_data)
        try:
            self.client.put(array, object_id=self.object_id)
        except plasma.PlasmaObjectExists:
            pass

    @property
    def client(self):
        if self._client is None:
            self._client = plasma.connect(self.path, num_retries=200)
        return self._client

    @property
    def array(self):
        """Fetch a read only view of an np.array, stored in plasma."""
        ret = self.client.get(self.object_id)
        return ret

    @staticmethod
    def get_object_id(split_path: str, hash_data: Hashable):
        """Returns plasma.ObjectID from hashing split_path and object_num."""
        hash = hashlib.blake2b(bytes(split_path, "utf-8"), digest_size=20)
        harg = json.dumps(hash_data).encode("utf-8")
        hash.update(harg)
        return plasma.ObjectID(hash.digest())

    def __getstate__(self):
        """Called on pickle save"""
        self.disconnect()
        state = self.__dict__.copy()
        assert state["_client"] is None
        assert "object_id" in state
        return state

    def __setstate__(self, state):
        """Called on pickle load"""
        self.__dict__.update(state)

    def __del__(self):
        self.disconnect()

    def disconnect(self):
        if self._client is not None:
            self._client.disconnect()
            self._client = None

    def __len__(self):
        """Save reads by caching len"""
        if self._n is None:
            self._n = len(self.array)
        return self._n


GB100 = (1024**3) * 100


class PlasmaStore:
    def __init__(self, path=DEFAULT_PLASMA_PATH, nbytes: int = GB100):
        self.server = self.start(path, nbytes)

    def __del__(self):
        self.server.kill()

    @staticmethod
    def start(path=DEFAULT_PLASMA_PATH, nbytes: int = GB100) -> subprocess.Popen:
        if not PYARROW_AVAILABLE:
            raise ImportError("please run pip install pyarrow to use --use_plasma_view")
        # best practice is to allocate more space than we need. The limitation seems to be the size of /dev/shm
        _server = subprocess.Popen(["plasma_store", "-m", str(nbytes), "-s", path])
        plasma.connect(path, num_retries=200)  # If we can't connect we fail immediately
        return _server
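
A sketch of the PlasmaArray fallback path (editor's illustration, not part of the commit): for arrays under 128MB the wrapper is disabled, so pickling behaves like pickling a plain numpy array and no plasma_store process is needed; only larger arrays are moved to shared memory on serialization, and that path additionally requires pyarrow.

import pickle
import numpy as np

small = PlasmaArray(np.arange(10))            # < 128MB, so plasma is bypassed
restored = pickle.loads(pickle.dumps(small))  # round-trips through the normal pickle path
assert (restored.array == np.arange(10)).all()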
PyTorch/NLP/new-Transformer/fairseq/data/prepend_dataset.py (new file, 0 → 100644)

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import numpy as np
import torch

from . import BaseWrapperDataset


class PrependDataset(BaseWrapperDataset):
    def __init__(self, dataset, prepend_getter, ensure_first_token_is=None):
        super().__init__(dataset)
        self.prepend_getter = prepend_getter
        self.ensure_first_token = ensure_first_token_is

    def __getitem__(self, idx):
        item = self.dataset[idx]
        is_tuple = isinstance(item, tuple)
        src = item[0] if is_tuple else item

        assert self.ensure_first_token is None or src[0] == self.ensure_first_token
        prepend_idx = self.prepend_getter(self.dataset, idx)
        assert isinstance(prepend_idx, int)
        src[0] = prepend_idx
        item = tuple((src,) + item[1:]) if is_tuple else src
        return item
PyTorch/NLP/new-Transformer/fairseq/data/prepend_token_dataset.py (new file, 0 → 100644)

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import numpy as np
import torch

from . import BaseWrapperDataset


class PrependTokenDataset(BaseWrapperDataset):
    def __init__(self, dataset, token=None):
        super().__init__(dataset)
        self.token = token
        if token is not None:
            self._sizes = np.array(dataset.sizes) + 1
        else:
            self._sizes = dataset.sizes

    def __getitem__(self, idx):
        item = self.dataset[idx]
        if self.token is not None:
            item = torch.cat([item.new([self.token]), item])
        return item

    @property
    def sizes(self):
        return self._sizes

    def num_tokens(self, index):
        n = self.dataset.num_tokens(index)
        if self.token is not None:
            n += 1
        return n

    def size(self, index):
        n = self.dataset.size(index)
        if self.token is not None:
            n += 1
        return n
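
A small sketch of PrependTokenDataset (editor's illustration, not part of the commit). The wrapper reads dataset.sizes, so a tiny hypothetical list-backed dataset is defined here just to satisfy that interface; the token index 0 stands in for a BOS symbol.

import numpy as np
import torch


class _ToyDataset(torch.utils.data.Dataset):
    """Hypothetical stand-in exposing the .sizes attribute the wrapper expects."""

    def __init__(self, tensors):
        self.tensors = tensors
        self.sizes = np.array([len(t) for t in tensors])

    def __getitem__(self, index):
        return self.tensors[index]

    def __len__(self):
        return len(self.tensors)


base = _ToyDataset([torch.LongTensor([4, 5, 2]), torch.LongTensor([6, 2])])
with_bos = PrependTokenDataset(base, token=0)
print(with_bos[0])      # tensor([0, 4, 5, 2])
print(with_bos.sizes)   # [4 3] -- every size grows by one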
PyTorch/NLP/new-Transformer/fairseq/data/raw_label_dataset.py (new file, 0 → 100644)

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch

from . import FairseqDataset


class RawLabelDataset(FairseqDataset):
    def __init__(self, labels):
        super().__init__()
        self.labels = labels

    def __getitem__(self, index):
        return self.labels[index]

    def __len__(self):
        return len(self.labels)

    def collater(self, samples):
        return torch.tensor(samples)
PyTorch/NLP/new-Transformer/fairseq/data/replace_dataset.py (new file, 0 → 100644)

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from . import BaseWrapperDataset


class ReplaceDataset(BaseWrapperDataset):
    """Replaces tokens found in the dataset by a specified replacement token

    Args:
        dataset (~torch.utils.data.Dataset): dataset to replace tokens in
        replace_map(Dictionary[int,int]): map of token to replace -> replacement token
        offsets (List[int]): do not replace tokens before (from left if pos, right if neg) this offset. should be
            as many as the number of objects returned by the underlying dataset __getitem__ method.
    """

    def __init__(self, dataset, replace_map, offsets):
        super().__init__(dataset)
        assert len(replace_map) > 0
        self.replace_map = replace_map
        self.offsets = offsets

    def __getitem__(self, index):
        item = self.dataset[index]
        is_tuple = isinstance(item, tuple)
        srcs = item if is_tuple else [item]

        for offset, src in zip(self.offsets, srcs):
            for k, v in self.replace_map.items():
                src_off = src[offset:] if offset >= 0 else src[:offset]
                src_off.masked_fill_(src_off == k, v)

        item = srcs if is_tuple else srcs[0]
        return item
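
A sketch of the ReplaceDataset semantics (editor's illustration, not part of the commit): tokens before the per-field offset are left untouched, everything after it is rewritten through replace_map. The token ids below are arbitrary.

import torch

data = [torch.LongTensor([7, 7, 7, 2])]
replaced = ReplaceDataset(data, replace_map={7: 3}, offsets=[1])
print(replaced[0])  # tensor([7, 3, 3, 2]) -- position 0 is protected by offset=1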
PyTorch/NLP/new-Transformer/fairseq/data/resampling_dataset.py (new file, 0 → 100644)

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import logging

import numpy as np

from fairseq.data import BaseWrapperDataset, plasma_utils


logger = logging.getLogger(__name__)


class ResamplingDataset(BaseWrapperDataset):
    """Randomly samples from a given dataset at each epoch.

    Sampling is done with or without replacement, depending on the "replace"
    parameter.

    Optionally, the epoch size can be rescaled. This is potentially desirable
    to increase per-epoch coverage of the base dataset (since sampling with
    replacement means that many items in the dataset will be left out). In the
    case of sampling without replacement, size_ratio should be strictly less
    than 1.

    Args:
        dataset (~torch.utils.data.Dataset): dataset on which to sample.
        weights (List[float]): list of probability weights
            (default: None, which corresponds to uniform sampling).
        replace (bool): sampling mode; True for "with replacement", or False
            for "without replacement" (default: True)
        size_ratio (float): the ratio to subsample to; must be positive
            (default: 1.0).
        batch_by_size (bool): whether or not to batch by sequence length
            (default: True).
        seed (int): RNG seed to use (default: 0).
        epoch (int): starting epoch number (default: 1).
    """

    def __init__(
        self,
        dataset,
        weights=None,
        replace=True,
        size_ratio=1.0,
        batch_by_size=True,
        seed=0,
        epoch=1,
    ):
        super().__init__(dataset)

        if weights is None:
            self.weights = None

        else:
            assert len(weights) == len(dataset)
            weights_arr = np.array(weights, dtype=np.float64)
            weights_arr /= weights_arr.sum()
            self.weights = plasma_utils.PlasmaArray(weights_arr)

        self.replace = replace

        assert size_ratio > 0.0
        if not self.replace:
            assert size_ratio < 1.0
        self.size_ratio = float(size_ratio)
        self.actual_size = np.ceil(len(dataset) * self.size_ratio).astype(int)

        self.batch_by_size = batch_by_size
        self.seed = seed

        self._cur_epoch = None
        self._cur_indices = None

        self.set_epoch(epoch)

    def __getitem__(self, index):
        return self.dataset[self._cur_indices.array[index]]

    def __len__(self):
        return self.actual_size

    @property
    def sizes(self):
        if isinstance(self.dataset.sizes, list):
            return [s[self._cur_indices.array] for s in self.dataset.sizes]
        return self.dataset.sizes[self._cur_indices.array]

    def num_tokens(self, index):
        return self.dataset.num_tokens(self._cur_indices.array[index])

    def size(self, index):
        return self.dataset.size(self._cur_indices.array[index])

    def ordered_indices(self):
        if self.batch_by_size:
            order = [
                np.arange(len(self)),
                self.sizes,
            ]  # No need to handle `self.shuffle == True`
            return np.lexsort(order)
        else:
            return np.arange(len(self))

    def prefetch(self, indices):
        self.dataset.prefetch(self._cur_indices.array[indices])

    @property
    def can_reuse_epoch_itr_across_epochs(self):
        return False

    def set_epoch(self, epoch):
        logger.debug("ResamplingDataset.set_epoch: {}".format(epoch))
        super().set_epoch(epoch)

        if epoch == self._cur_epoch:
            return

        self._cur_epoch = epoch

        # Generate a weighted sample of indices as a function of the
        # random seed and the current epoch.

        rng = np.random.RandomState(
            [
                42,  # magic number
                self.seed % (2**32),  # global seed
                self._cur_epoch,  # epoch index
            ]
        )
        self._cur_indices = plasma_utils.PlasmaArray(
            rng.choice(
                len(self.dataset),
                self.actual_size,
                replace=self.replace,
                p=(None if self.weights is None else self.weights.array),
            )
        )
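
A sketch of epoch-level resampling (editor's illustration, not part of the commit). A plain Python list stands in for the wrapped dataset, which is enough for indexing; batch_by_size is disabled so the example does not need a sizes attribute, and the weights are arbitrary.

corpus = ["a", "b", "c", "d"]
resampled = ResamplingDataset(
    corpus,
    weights=[0.7, 0.1, 0.1, 0.1],
    replace=True,
    size_ratio=1.0,
    batch_by_size=False,
    seed=0,
    epoch=1,
)
epoch1 = [resampled[i] for i in range(len(resampled))]
resampled.set_epoch(2)   # re-draws the indices deterministically from (seed, epoch)
epoch2 = [resampled[i] for i in range(len(resampled))]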
PyTorch/NLP/new-Transformer/fairseq/data/roll_dataset.py (new file, 0 → 100644)

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch

from . import BaseWrapperDataset


class RollDataset(BaseWrapperDataset):
    def __init__(self, dataset, shifts):
        super().__init__(dataset)
        self.shifts = shifts

    def __getitem__(self, index):
        item = self.dataset[index]
        return torch.roll(item, self.shifts)
PyTorch/NLP/new-Transformer/fairseq/data/round_robin_zip_datasets.py (new file, 0 → 100644)

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import logging
from collections import OrderedDict
from typing import Dict, Sequence

import numpy as np

from . import FairseqDataset, LanguagePairDataset

logger = logging.getLogger(__name__)


class RoundRobinZipDatasets(FairseqDataset):
    """Zip multiple :class:`~fairseq.data.FairseqDataset` instances together.

    Shorter datasets are repeated in a round-robin fashion to match the length
    of the longest one.

    Args:
        datasets (Dict[~fairseq.data.FairseqDataset]): a dictionary of
            :class:`~fairseq.data.FairseqDataset` instances.
        eval_key (str, optional): a key used at evaluation time that causes
            this instance to pass-through batches from *datasets[eval_key]*.
    """

    def __init__(self, datasets, eval_key=None):
        super().__init__()
        if isinstance(datasets, dict):
            datasets = OrderedDict(datasets)
        assert isinstance(datasets, OrderedDict)
        assert datasets, "Can't make a RoundRobinZipDatasets out of nothing"
        for dataset in datasets.values():
            assert isinstance(dataset, FairseqDataset)

        self.datasets = datasets
        self.eval_key = eval_key

        self.longest_dataset_key = max(datasets, key=lambda k: len(datasets[k]))
        self.longest_dataset = datasets[self.longest_dataset_key]
        self._ordered_indices: Dict[str, Sequence[int]] = None

    def _map_index(self, key, index):
        assert (
            self._ordered_indices is not None
        ), "Must call RoundRobinZipDatasets.ordered_indices() first"
        o = self._ordered_indices[key]
        return o[index % len(o)]

    def __getitem__(self, index):
        if self.eval_key is None:
            return OrderedDict(
                [
                    (key, dataset[self._map_index(key, index)])
                    for key, dataset in self.datasets.items()
                ]
            )
        else:
            # at evaluation time it's useful to pass-through batches from a single key
            return self.datasets[self.eval_key][self._map_index(self.eval_key, index)]

    def __len__(self):
        if self._ordered_indices is not None:
            return len(self._ordered_indices[self.longest_dataset_key])
        return len(self.longest_dataset)

    def collater(self, samples):
        """Merge a list of samples to form a mini-batch."""
        if len(samples) == 0:
            return None
        if self.eval_key is None:
            return OrderedDict(
                [
                    (key, dataset.collater([sample[key] for sample in samples]))
                    for key, dataset in self.datasets.items()
                ]
            )
        else:
            # at evaluation time it's useful to pass-through batches from a single key
            return self.datasets[self.eval_key].collater(samples)

    def num_tokens(self, index):
        """Return an example's length (number of tokens), used for batching."""
        # TODO make it configurable whether to use max() or sum() here
        return max(
            dataset.num_tokens(self._map_index(key, index))
            for key, dataset in self.datasets.items()
        )

    def size(self, index):
        """Return an example's size as a float or tuple. This value is used when
        filtering a dataset with ``--max-positions``."""
        return {
            key: dataset.size(self._map_index(key, index))
            for key, dataset in self.datasets.items()
        }

    def ordered_indices(self):
        """Ordered indices for batching."""
        if self._ordered_indices is None:
            # Call the underlying dataset's ordered_indices() here, so that we
            # get the same random ordering as we would have from using the
            # underlying sub-datasets directly.
            self._ordered_indices = OrderedDict(
                [
                    (key, dataset.ordered_indices())
                    for key, dataset in self.datasets.items()
                ]
            )
        return np.arange(len(self))

    def filter_indices_by_size(self, indices, max_positions=None):
        """
        Filter each sub-dataset independently, then update the round robin to work
        on the filtered sub-datasets.
        """

        def _deep_until_language_pair(dataset):
            if isinstance(dataset, LanguagePairDataset):
                return dataset
            if hasattr(dataset, "tgt_dataset"):
                return _deep_until_language_pair(dataset.tgt_dataset)
            if hasattr(dataset, "dataset"):
                return _deep_until_language_pair(dataset.dataset)
            raise Exception(f"Don't know how to unwrap this dataset: {dataset}")

        if not isinstance(max_positions, dict):
            max_positions = {k: max_positions for k in self.datasets.keys()}
        ignored_some = False
        for key, dataset in self.datasets.items():
            dataset = _deep_until_language_pair(dataset)
            self._ordered_indices[key], ignored = dataset.filter_indices_by_size(
                self._ordered_indices[key], max_positions[key]
            )
            if len(ignored) > 0:
                ignored_some = True
                logger.warning(
                    f"{len(ignored)} samples from {key} have invalid sizes and will be skipped, "
                    f"max_positions={max_positions[key]}, first few sample ids={ignored[:10]}"
                )
        # Since we are modifying in place the _ordered_indices,
        # it's not possible anymore to return valid ignored indices.
        # Hopefully the extra debug information print above should be enough to debug.
        # Ideally we would receive ignore_invalid_inputs so that we could have
        # a proper error message.
        return (np.arange(len(self)), [0] if ignored_some else [])

    @property
    def supports_prefetch(self):
        return all(
            getattr(dataset, "supports_prefetch", False)
            for dataset in self.datasets.values()
        )

    def prefetch(self, indices):
        for key, dataset in self.datasets.items():
            dataset.prefetch([self._map_index(key, index) for index in indices])
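
A sketch of RoundRobinZipDatasets (editor's illustration, not part of the commit), zipping two RawLabelDataset instances of different lengths; ordered_indices() must be called before indexing, and the shorter dataset wraps around.

ds = RoundRobinZipDatasets(
    {
        "labels_a": RawLabelDataset([0, 1, 2, 3]),
        "labels_b": RawLabelDataset([10, 11]),
    }
)
ds.ordered_indices()   # required before __getitem__ / collater
print(len(ds))         # 4 -- length of the longest dataset
print(dict(ds[3]))     # {'labels_a': 3, 'labels_b': 11} -- index 3 wraps to 1 for the short dataset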
PyTorch/NLP/new-Transformer/fairseq/data/shorten_dataset.py (new file, 0 → 100644)

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import numpy as np

from fairseq.data import data_utils

from . import BaseWrapperDataset


class TruncateDataset(BaseWrapperDataset):
    """Truncate a sequence by returning the first truncation_length tokens"""

    def __init__(self, dataset, truncation_length):
        super().__init__(dataset)
        assert truncation_length is not None
        self.truncation_length = truncation_length
        self.dataset = dataset

    def __getitem__(self, index):
        item = self.dataset[index]
        item_len = item.size(0)
        if item_len > self.truncation_length:
            item = item[: self.truncation_length]
        return item

    @property
    def sizes(self):
        return np.minimum(self.dataset.sizes, self.truncation_length)

    def __len__(self):
        return len(self.dataset)


class RandomCropDataset(TruncateDataset):
    """Truncate a sequence by returning a random crop of truncation_length tokens"""

    def __init__(self, dataset, truncation_length, seed=1):
        super().__init__(dataset, truncation_length)
        self.seed = seed
        self.epoch = 0

    @property
    def can_reuse_epoch_itr_across_epochs(self):
        return True  # only the crop changes, not item sizes

    def set_epoch(self, epoch, **unused):
        super().set_epoch(epoch)
        self.epoch = epoch

    def __getitem__(self, index):
        with data_utils.numpy_seed(self.seed, self.epoch, index):
            item = self.dataset[index]
            item_len = item.size(0)
            excess = item_len - self.truncation_length
            if excess > 0:
                start_idx = np.random.randint(0, excess)
                item = item[start_idx : start_idx + self.truncation_length]
            return item


def maybe_shorten_dataset(
    dataset,
    split,
    shorten_data_split_list,
    shorten_method,
    tokens_per_sample,
    seed,
):
    truncate_split = (
        split in shorten_data_split_list.split(",") or len(shorten_data_split_list) == 0
    )
    if shorten_method == "truncate" and truncate_split:
        dataset = TruncateDataset(dataset, tokens_per_sample)
    elif shorten_method == "random_crop" and truncate_split:
        dataset = RandomCropDataset(dataset, tokens_per_sample, seed)
    return dataset
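
A sketch of maybe_shorten_dataset (editor's illustration, not part of the commit); the split name, comma-separated split list and lengths below are placeholder values.

import torch

data = [torch.LongTensor([4, 5, 6, 7, 8, 2]), torch.LongTensor([9, 2])]
shortened = maybe_shorten_dataset(
    data,
    split="train",
    shorten_data_split_list="train",   # apply shortening to the train split only
    shorten_method="truncate",
    tokens_per_sample=4,
    seed=1,
)
print(shortened[0])  # tensor([4, 5, 6, 7]) -- truncated to the first 4 tokens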
PyTorch/NLP/new-Transformer/fairseq/data/sort_dataset.py (new file, 0 → 100644)

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import numpy as np

from . import BaseWrapperDataset


class SortDataset(BaseWrapperDataset):
    def __init__(self, dataset, sort_order):
        super().__init__(dataset)
        if not isinstance(sort_order, (list, tuple)):
            sort_order = [sort_order]
        self.sort_order = sort_order

        assert all(len(so) == len(dataset) for so in sort_order)

    def ordered_indices(self):
        return np.lexsort(self.sort_order)
PyTorch/NLP/new-Transformer/fairseq/data/strip_token_dataset.py (new file, 0 → 100644)

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from . import BaseWrapperDataset


class StripTokenDataset(BaseWrapperDataset):
    def __init__(self, dataset, id_to_strip):
        super().__init__(dataset)
        self.id_to_strip = id_to_strip

    def __getitem__(self, index):
        item = self.dataset[index]
        while len(item) > 0 and item[-1] == self.id_to_strip:
            item = item[:-1]
        while len(item) > 0 and item[0] == self.id_to_strip:
            item = item[1:]
        return item
PyTorch/NLP/new-Transformer/fairseq/data/subsample_dataset.py (new file, 0 → 100644)

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import logging

import numpy as np

from . import BaseWrapperDataset


logger = logging.getLogger(__name__)


class SubsampleDataset(BaseWrapperDataset):
    """Subsamples a given dataset by a specified ratio. Subsampling is done on the number of examples

    Args:
        dataset (~torch.utils.data.Dataset): dataset to subsample
        size_ratio(float): the ratio to subsample to. must be between 0 and 1 (exclusive)
    """

    def __init__(self, dataset, size_ratio, shuffle=False):
        super().__init__(dataset)
        assert size_ratio < 1
        self.actual_size = np.ceil(len(dataset) * size_ratio).astype(int)
        self.indices = np.random.choice(
            list(range(len(self.dataset))), self.actual_size, replace=False
        )
        self.shuffle = shuffle
        logger.info(
            "subsampled dataset from {} to {} (ratio={})".format(
                len(self.dataset), self.actual_size, size_ratio
            )
        )

    def __getitem__(self, index):
        return self.dataset[self.indices[index]]

    def __len__(self):
        return self.actual_size

    def collater(self, samples):
        return self.dataset.collater(samples)

    @property
    def sizes(self):
        return self.dataset.sizes[self.indices]

    @property
    def name(self):
        return self.dataset.name

    def num_tokens(self, index):
        return self.dataset.num_tokens(self.indices[index])

    def size(self, index):
        return self.dataset.size(self.indices[index])

    def ordered_indices(self):
        """Return an ordered list of indices. Batches will be constructed based
        on this order."""
        if self.shuffle:
            order = [np.random.permutation(len(self))]
        else:
            order = [np.arange(len(self))]
        order.append(self.sizes)
        return np.lexsort(order)

    def prefetch(self, indices):
        self.dataset.prefetch(self.indices[indices])
PyTorch/NLP/new-Transformer/fairseq/data/text_compressor.py (new file, 0 → 100644)

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from enum import Enum


class TextCompressionLevel(Enum):
    none = 0
    low = 1
    high = 2


class TextCompressor(object):
    def __init__(
        self, level: TextCompressionLevel, max_input_byte_length: int = 2**16
    ):
        self.level = level
        self.max_input_length = max_input_byte_length

    def compress(self, text: str) -> bytes:
        if self.level == TextCompressionLevel.low:
            import zlib

            # zlib: built-in, fast
            return zlib.compress(text.encode(), level=0)
        elif self.level == TextCompressionLevel.high:
            try:
                import unishox2

                # unishox2: optimized for short text but slower
            except ImportError:
                raise ImportError(
                    "Please install unishox2 for the text compression feature: "
                    "pip install unishox2-py3"
                )
            assert len(text.encode()) <= self.max_input_length
            return unishox2.compress(text)[0]
        else:
            return text.encode()

    def decompress(self, compressed: bytes) -> str:
        if self.level == TextCompressionLevel.low:
            import zlib

            return zlib.decompress(compressed).decode()
        elif self.level == TextCompressionLevel.high:
            try:
                import unishox2
            except ImportError:
                raise ImportError(
                    "Please install unishox2 for the text compression feature: "
                    "pip install unishox2-py3"
                )
            return unishox2.decompress(compressed, self.max_input_length)
        else:
            return compressed.decode()
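
A round-trip sketch for TextCompressor (editor's illustration, not part of the commit). The low level only needs the standard-library zlib; the high level additionally requires the unishox2-py3 package.

compressor = TextCompressor(level=TextCompressionLevel.low)
blob = compressor.compress("hello fairseq")
assert isinstance(blob, bytes)
assert compressor.decompress(blob) == "hello fairseq"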
PyTorch/NLP/new-Transformer/fairseq/data/token_block_dataset.py (new file, 0 → 100644)

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import numpy as np
import torch
from fairseq.data import FairseqDataset, plasma_utils
from fairseq.data.indexed_dataset import best_fitting_int_dtype
from typing import Tuple


class TokenBlockDataset(FairseqDataset):
    """Break a Dataset of tokens into blocks.

    Args:
        dataset (~torch.utils.data.Dataset): dataset to break into blocks
        sizes (List[int]): sentence lengths (required for 'complete' and 'eos')
        block_size (int): maximum block size (ignored in 'eos' break mode)
        break_mode (str, optional): Mode used for breaking tokens. Values can
            be one of:
            - 'none': break tokens into equally sized blocks (up to block_size)
            - 'complete': break tokens into blocks (up to block_size) such that
                blocks contains complete sentences, although block_size may be
                exceeded if some sentences exceed block_size
            - 'complete_doc': similar to 'complete' mode, but do not
                cross document boundaries
            - 'eos': each block contains one sentence (block_size is ignored)
        include_targets (bool, optional): return next tokens as targets
            (default: False).
        document_sep_len (int, optional): document separator size (required for
            'complete_doc' break mode). Typically 1 if the sentences have eos
            and 0 otherwise.
    """

    def __init__(
        self,
        dataset,
        sizes,
        block_size,
        pad,
        eos,
        break_mode=None,
        include_targets=False,
        document_sep_len=1,
        use_plasma_view=False,
        split_path=None,
        plasma_path=None,
    ):
        super().__init__()
        self.dataset = dataset
        self.pad = pad
        self.eos = eos
        self.include_targets = include_targets

        assert len(dataset) > 0

        assert len(dataset) == len(sizes)
        _sizes, block_to_dataset_index, slice_indices = self._build_slice_indices(
            sizes, break_mode, document_sep_len, block_size
        )
        if use_plasma_view:
            plasma_id = (block_size, document_sep_len, str(break_mode), len(dataset))
            self._slice_indices = plasma_utils.PlasmaView(
                slice_indices, split_path, (plasma_id, 0), plasma_path=plasma_path
            )
            self._sizes = plasma_utils.PlasmaView(
                _sizes, split_path, (plasma_id, 1), plasma_path=plasma_path
            )
            self._block_to_dataset_index = plasma_utils.PlasmaView(
                block_to_dataset_index,
                split_path,
                (plasma_id, 2),
                plasma_path=plasma_path,
            )
        else:
            self._slice_indices = plasma_utils.PlasmaArray(slice_indices)
            self._sizes = plasma_utils.PlasmaArray(_sizes)
            self._block_to_dataset_index = plasma_utils.PlasmaArray(
                block_to_dataset_index
            )

    @staticmethod
    def _build_slice_indices(
        sizes, break_mode, document_sep_len, block_size
    ) -> Tuple[np.ndarray]:
        """Use token_block_utils_fast to build arrays for indexing into self.dataset"""
        try:
            from fairseq.data.token_block_utils_fast import (
                _get_slice_indices_fast,
                _get_block_to_dataset_index_fast,
            )
        except ImportError:
            raise ImportError(
                "Please build Cython components with: `pip install --editable .` "
                "or `python setup.py build_ext --inplace`"
            )

        if isinstance(sizes, list):
            sizes = np.array(sizes, dtype=np.int64)
        else:
            if torch.is_tensor(sizes):
                sizes = sizes.numpy()
            sizes = sizes.astype(np.int64)

        break_mode = break_mode if break_mode is not None else "none"

        # For "eos" break-mode, block_size is not required parameters.
        if break_mode == "eos" and block_size is None:
            block_size = 0

        slice_indices = _get_slice_indices_fast(
            sizes, str(break_mode), block_size, document_sep_len
        )
        _sizes = slice_indices[:, 1] - slice_indices[:, 0]

        # build index mapping block indices to the underlying dataset indices
        if break_mode == "eos":
            # much faster version for eos break mode
            block_to_dataset_index = np.stack(
                [
                    np.arange(len(sizes)),  # starting index in dataset
                    np.zeros(
                        len(sizes), dtype=np.compat.long
                    ),  # starting offset within starting index
                    np.arange(len(sizes)),  # ending index in dataset
                ],
                1,
            )
        else:
            block_to_dataset_index = _get_block_to_dataset_index_fast(
                sizes,
                slice_indices,
            )
        size_dtype = np.uint16 if block_size < 65535 else np.uint32
        num_tokens = slice_indices[-1].max()
        slice_indices_dtype = best_fitting_int_dtype(num_tokens)
        slice_indices = slice_indices.astype(slice_indices_dtype)
        _sizes = _sizes.astype(size_dtype)
        block_to_dataset_index = block_to_dataset_index.astype(slice_indices_dtype)
        return _sizes, block_to_dataset_index, slice_indices

    @property
    def slice_indices(self):
        return self._slice_indices.array

    @property
    def sizes(self):
        return self._sizes.array

    @property
    def block_to_dataset_index(self):
        return self._block_to_dataset_index.array

    def attr(self, attr: str, index: int):
        start_ds_idx, _, _ = self.block_to_dataset_index[index]
        return self.dataset.attr(attr, start_ds_idx)

    def __getitem__(self, index):
        start_ds_idx, start_offset, end_ds_idx = self.block_to_dataset_index[index]

        buffer = torch.cat(
            [self.dataset[idx] for idx in range(start_ds_idx, end_ds_idx + 1)]
        )
        slice_s, slice_e = self.slice_indices[index]
        length = slice_e - slice_s
        s, e = start_offset, start_offset + length
        item = buffer[s:e]

        if self.include_targets:
            # *target* is the original sentence (=item)
            # *source* is shifted right by 1 (maybe left-padded with eos)
            # *past_target* is shifted right by 2 (left-padded as needed)
            if s == 0:
                source = torch.cat([item.new([self.eos]), buffer[0 : e - 1]])
                past_target = torch.cat(
                    [item.new([self.pad, self.eos]), buffer[0 : e - 2]]
                )
            else:
                source = buffer[s - 1 : e - 1]
                if s == 1:
                    past_target = torch.cat([item.new([self.eos]), buffer[0 : e - 2]])
                else:
                    past_target = buffer[s - 2 : e - 2]

            return source, item, past_target

        return item

    def __len__(self):
        return len(self.slice_indices)

    @property
    def supports_prefetch(self):
        return getattr(self.dataset, "supports_prefetch", False)

    def prefetch(self, indices):
        self.dataset.prefetch(
            {
                ds_idx
                for index in indices
                for start_ds_idx, _, end_ds_idx in [self.block_to_dataset_index[index]]
                for ds_idx in range(start_ds_idx, end_ds_idx + 1)
            }
        )
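
A sketch of TokenBlockDataset in 'complete' break mode (editor's illustration, not part of the commit). It assumes the Cython extension below has been built; the sentences, pad/eos indices and block size are toy values.

import torch

sentences = [
    torch.LongTensor([5, 6, 2]),
    torch.LongTensor([7, 2]),
    torch.LongTensor([8, 9, 10, 2]),
]
blocks = TokenBlockDataset(
    sentences,
    sizes=[3, 2, 4],
    block_size=5,
    pad=1,
    eos=2,
    break_mode="complete",
)
print(len(blocks))   # 2 blocks: [5, 6, 2, 7, 2] and [8, 9, 10, 2]
print(blocks[0])     # tensor([5, 6, 2, 7, 2])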
PyTorch/NLP/new-Transformer/fairseq/data/token_block_utils_fast.pyx (new file, 0 → 100644)

# cython: language_level=3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import numpy as np
import torch
from itertools import chain
from libc.math cimport ceil

cimport cython
cimport numpy as np

from libc.stdint cimport int32_t, int64_t

DTYPE = np.int64
ctypedef int64_t DTYPE_t


@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
cdef np.ndarray[DTYPE_t, ndim=2] _get_slice_indices_none_mode(np.ndarray[DTYPE_t, ndim=1] sizes, int block_size):
    cdef DTYPE_t total_size = sizes.sum()
    cdef DTYPE_t length = <DTYPE_t> ceil(total_size / <double> block_size)
    cdef np.ndarray[DTYPE_t, ndim=2] slice_indices = np.zeros([length, 2], dtype=DTYPE)
    cdef DTYPE_t[:, :] slice_indices_view = slice_indices
    cdef DTYPE_t i
    cdef DTYPE_t start
    cdef DTYPE_t end
    for i in range(length):
        start = i * block_size
        end = min(start + block_size, total_size)
        slice_indices_view[i][0] = start
        slice_indices_view[i][1] = end
    return slice_indices


cdef np.ndarray[DTYPE_t, ndim=2] _fast_convert_to_np_array(list list_of_list):
    """
    Faster function to convert DTYPE_t list of list.
    Only fast when there are huge number of rows and low number of columns.
    """
    cdef np.ndarray[DTYPE_t, ndim=1] flat = np.fromiter(chain.from_iterable(list_of_list), DTYPE, -1)
    return flat.reshape((len(list_of_list), -1))


@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
cpdef np.ndarray[DTYPE_t, ndim=2] _get_slice_indices_fast(np.ndarray[DTYPE_t, ndim=1] sizes, str break_mode, int block_size, int document_sep_len):
    cdef DTYPE_t tok_idx = 0
    cdef DTYPE_t sz_idx = 0
    cdef DTYPE_t curr_size = 0
    cdef DTYPE_t i = 0
    cdef DTYPE_t length
    cdef DTYPE_t total_size
    cdef DTYPE_t[:] sizes_view = sizes
    cdef np.ndarray[DTYPE_t, ndim=2] slice_indices
    cdef list slice_indices_list = []

    if break_mode is None or break_mode == 'none':
        slice_indices = _get_slice_indices_none_mode(sizes, block_size)
    elif break_mode == 'complete':
        while sz_idx < len(sizes_view):
            if curr_size + sizes_view[sz_idx] <= block_size or curr_size == 0:
                curr_size += sizes_view[sz_idx]
                sz_idx += 1
            else:
                slice_indices_list.append((tok_idx, tok_idx + curr_size))
                tok_idx += curr_size
                curr_size = 0
        if curr_size > 0:
            slice_indices_list.append((tok_idx, tok_idx + curr_size))
        slice_indices = _fast_convert_to_np_array(slice_indices_list)
    elif break_mode == 'complete_doc':
        while sz_idx < len(sizes_view):
            if (
                (curr_size + sizes_view[sz_idx] <= block_size or curr_size == 0)
                # an empty sentence indicates end-of-document:
                and sizes_view[sz_idx] != document_sep_len
            ):
                curr_size += sizes_view[sz_idx]
                sz_idx += 1
            else:
                # Only keep non-empty documents.
                if curr_size > 1:
                    slice_indices_list.append((tok_idx, tok_idx + curr_size))
                tok_idx += curr_size
                curr_size = 0
                if sizes_view[sz_idx] == document_sep_len:
                    tok_idx += sizes_view[sz_idx]
                    sz_idx += 1
        if curr_size > 1:
            slice_indices_list.append((tok_idx, tok_idx + curr_size))
        slice_indices = _fast_convert_to_np_array(slice_indices_list)
    elif break_mode == 'eos':
        slice_indices = np.zeros((len(sizes), 2), dtype=DTYPE)
        cumsum = sizes.cumsum(axis=0)
        slice_indices[1:, 0] = cumsum[:cumsum.shape[0] - 1]
        slice_indices[:, 1] = cumsum
    else:
        raise ValueError('Invalid break_mode: ' + break_mode)
    return slice_indices


@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
cpdef np.ndarray[DTYPE_t, ndim=2] _get_block_to_dataset_index_fast(np.ndarray[DTYPE_t, ndim=1] sizes, np.ndarray[DTYPE_t, ndim=2] slice_indices):
    cdef DTYPE_t start_ds_idx
    cdef DTYPE_t start_offset
    cdef DTYPE_t end_ds_idx
    cdef DTYPE_t i
    cdef DTYPE_t s
    cdef DTYPE_t e
    cdef DatasetSearcher ds = DatasetSearcher(sizes)
    cdef np.ndarray[DTYPE_t, ndim=2] block_to_dataset_index = np.zeros([len(slice_indices), 3], dtype=DTYPE)
    cdef DTYPE_t[:, :] block_to_dataset_index_view = block_to_dataset_index
    cdef DTYPE_t[:, :] slice_indices_view = slice_indices
    cdef Py_ssize_t x_max = slice_indices.shape[0]

    for i in range(x_max):
        s = slice_indices_view[i][0]
        e = slice_indices_view[i][1]
        ds.seek(s)
        start_ds_idx = ds.current_index
        start_offset = ds.current_offset
        if e <= s:
            end_ds_idx = start_ds_idx
        else:
            ds.seek(e - 1)
            end_ds_idx = ds.current_index
        block_to_dataset_index_view[i][0] = start_ds_idx  # starting index in dataset
        block_to_dataset_index_view[i][1] = start_offset  # starting offset within starting index
        block_to_dataset_index_view[i][2] = end_ds_idx  # ending index in dataset
    return block_to_dataset_index


cdef class DatasetSearcher(object):
    """Helper for mapping "flat" indices to indices and offsets in an
    underlying dataset."""
    cdef DTYPE_t current_i
    cdef DTYPE_t current_offset
    cdef DTYPE_t current_index
    cdef DTYPE_t[:] sizes

    def __init__(self, DTYPE_t[:] sizes):
        self.sizes = sizes
        self.reset()

    cdef reset(self):
        self.current_offset = 0  # offset within current index in underlying dataset
        self.current_i = 0  # "flat" index
        self.current_index = 0  # index in underlying dataset

    @cython.boundscheck(False)
    @cython.wraparound(False)
    @cython.nonecheck(False)
    cdef int step(self, DTYPE_t i):
        cdef DTYPE_t to_consume
        cdef DTYPE_t remaining
        if i < self.current_i:
            self.reset()
        if i > self.current_i:
            to_consume = i - self.current_i
            remaining = self.sizes[self.current_index] - self.current_offset
            if remaining > to_consume:
                self.current_offset += to_consume
                self.current_i += to_consume
            else:
                assert remaining >= 0
                self.current_i += remaining
                self.current_index += 1
                self.current_offset = 0
            return 1
        return 0

    @cython.boundscheck(False)
    @cython.wraparound(False)
    @cython.nonecheck(False)
    cdef seek(self, DTYPE_t i):
        cdef int not_done = 1
        while not_done == 1:
            not_done = self.step(i)
        assert self.current_i == i
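
Once the extension is compiled (for example via `python setup.py build_ext --inplace`), the slicing helper can be exercised directly from Python; a small sketch of the expected behaviour (editor's illustration, not part of the commit):

import numpy as np
from fairseq.data.token_block_utils_fast import _get_slice_indices_fast

sizes = np.array([3, 2, 4], dtype=np.int64)
# 'complete' mode packs whole sentences into blocks of at most 5 tokens,
# so the expected result here is [[0, 5], [5, 9]].
print(_get_slice_indices_fast(sizes, "complete", 5, 1))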