Commit 7143f128 authored by sunxx1's avatar sunxx1
Browse files

Merge branch 'hepj-test' into 'main'

更新transformer代码

See merge request dcutoolkit/deeplearing/dlexamples_new!47
parents a30b77fe c0f05c10
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import numpy as np
import torch
from fairseq.data import data_utils
class WordNoising(object):
"""Generate a noisy version of a sentence, without changing words themselves."""
def __init__(self, dictionary, bpe_cont_marker="@@", bpe_end_marker=None):
self.dictionary = dictionary
self.bpe_end = None
if bpe_cont_marker:
self.bpe_end = np.array(
[
not self.dictionary[i].endswith(bpe_cont_marker)
for i in range(len(self.dictionary))
]
)
elif bpe_end_marker:
self.bpe_end = np.array(
[
self.dictionary[i].endswith(bpe_end_marker)
for i in range(len(self.dictionary))
]
)
self.get_word_idx = (
self._get_bpe_word_idx if self.bpe_end is not None else self._get_token_idx
)
def noising(self, x, lengths, noising_prob=0.0):
raise NotImplementedError()
def _get_bpe_word_idx(self, x):
"""
Given a list of BPE tokens, for every index in the tokens list,
return the index of the word grouping that it belongs to.
For example, for input x corresponding to ["how", "are", "y@@", "ou"],
return [[0], [1], [2], [2]].
"""
# x: (T x B)
bpe_end = self.bpe_end[x]
if x.size(0) == 1 and x.size(1) == 1:
# Special case when we only have one word in x. If x = [[N]],
# bpe_end is a scalar (bool) instead of a 2-dim array of bools,
# which makes the sum operation below fail.
return np.array([[0]])
# do a reduce front sum to generate word ids
word_idx = bpe_end[::-1].cumsum(0)[::-1]
word_idx = word_idx.max(0)[None, :] - word_idx
return word_idx
def _get_token_idx(self, x):
"""
This is to extend noising functions to be able to apply to non-bpe
tokens, e.g. word or characters.
"""
x = torch.t(x)
word_idx = np.array([range(len(x_i)) for x_i in x])
return np.transpose(word_idx)
class WordDropout(WordNoising):
"""Randomly drop input words. If not passing blank_idx (default is None),
then dropped words will be removed. Otherwise, it will be replaced by the
blank_idx."""
def __init__(
self,
dictionary,
default_dropout_prob=0.1,
bpe_cont_marker="@@",
bpe_end_marker=None,
):
super().__init__(dictionary, bpe_cont_marker, bpe_end_marker)
self.default_dropout_prob = default_dropout_prob
def noising(self, x, lengths, dropout_prob=None, blank_idx=None):
if dropout_prob is None:
dropout_prob = self.default_dropout_prob
# x: (T x B), lengths: B
if dropout_prob == 0:
return x, lengths
assert 0 < dropout_prob < 1
# be sure to drop entire words
word_idx = self.get_word_idx(x)
sentences = []
modified_lengths = []
for i in range(lengths.size(0)):
# Since dropout probabilities need to apply over non-pad tokens,
# it is not trivial to generate the keep mask without consider
# input lengths; otherwise, this could be done outside the loop
# We want to drop whole words based on word_idx grouping
num_words = max(word_idx[:, i]) + 1
# ith example: [x0, x1, ..., eos, pad, ..., pad]
# We should only generate keep probs for non-EOS tokens. Thus if the
# input sentence ends in EOS, the last word idx is not included in
# the dropout mask generation and we append True to always keep EOS.
# Otherwise, just generate the dropout mask for all word idx
# positions.
has_eos = x[lengths[i] - 1, i] == self.dictionary.eos()
if has_eos: # has eos?
keep = np.random.rand(num_words - 1) >= dropout_prob
keep = np.append(keep, [True]) # keep EOS symbol
else:
keep = np.random.rand(num_words) >= dropout_prob
words = x[: lengths[i], i].tolist()
# TODO: speed up the following loop
# drop words from the input according to keep
new_s = [
w if keep[word_idx[j, i]] else blank_idx for j, w in enumerate(words)
]
new_s = [w for w in new_s if w is not None]
# we need to have at least one word in the sentence (more than the
# start / end sentence symbols)
if len(new_s) <= 1:
# insert at beginning in case the only token left is EOS
# EOS should be at end of list.
new_s.insert(0, words[np.random.randint(0, len(words))])
assert len(new_s) >= 1 and (
not has_eos # Either don't have EOS at end or last token is EOS
or (len(new_s) >= 2 and new_s[-1] == self.dictionary.eos())
), "New sentence is invalid."
sentences.append(new_s)
modified_lengths.append(len(new_s))
# re-construct input
modified_lengths = torch.LongTensor(modified_lengths)
modified_x = torch.LongTensor(
modified_lengths.max(), modified_lengths.size(0)
).fill_(self.dictionary.pad())
for i in range(modified_lengths.size(0)):
modified_x[: modified_lengths[i], i].copy_(torch.LongTensor(sentences[i]))
return modified_x, modified_lengths
class WordShuffle(WordNoising):
"""Shuffle words by no more than k positions."""
def __init__(
self,
dictionary,
default_max_shuffle_distance=3,
bpe_cont_marker="@@",
bpe_end_marker=None,
):
super().__init__(dictionary, bpe_cont_marker, bpe_end_marker)
self.default_max_shuffle_distance = 3
def noising(self, x, lengths, max_shuffle_distance=None):
if max_shuffle_distance is None:
max_shuffle_distance = self.default_max_shuffle_distance
# x: (T x B), lengths: B
if max_shuffle_distance == 0:
return x, lengths
# max_shuffle_distance < 1 will return the same sequence
assert max_shuffle_distance > 1
# define noise word scores
noise = np.random.uniform(
0,
max_shuffle_distance,
size=(x.size(0), x.size(1)),
)
noise[0] = -1 # do not move start sentence symbol
# be sure to shuffle entire words
word_idx = self.get_word_idx(x)
x2 = x.clone()
for i in range(lengths.size(0)):
length_no_eos = lengths[i]
if x[lengths[i] - 1, i] == self.dictionary.eos():
length_no_eos = lengths[i] - 1
# generate a random permutation
scores = word_idx[:length_no_eos, i] + noise[word_idx[:length_no_eos, i], i]
# ensure no reordering inside a word
scores += 1e-6 * np.arange(length_no_eos.item())
permutation = scores.argsort()
# shuffle words
x2[:length_no_eos, i].copy_(
x2[:length_no_eos, i][torch.from_numpy(permutation)]
)
return x2, lengths
class UnsupervisedMTNoising(WordNoising):
"""
Implements the default configuration for noising in UnsupervisedMT
(github.com/facebookresearch/UnsupervisedMT)
"""
def __init__(
self,
dictionary,
max_word_shuffle_distance,
word_dropout_prob,
word_blanking_prob,
bpe_cont_marker="@@",
bpe_end_marker=None,
):
super().__init__(dictionary)
self.max_word_shuffle_distance = max_word_shuffle_distance
self.word_dropout_prob = word_dropout_prob
self.word_blanking_prob = word_blanking_prob
self.word_dropout = WordDropout(
dictionary=dictionary,
bpe_cont_marker=bpe_cont_marker,
bpe_end_marker=bpe_end_marker,
)
self.word_shuffle = WordShuffle(
dictionary=dictionary,
bpe_cont_marker=bpe_cont_marker,
bpe_end_marker=bpe_end_marker,
)
def noising(self, x, lengths):
# 1. Word Shuffle
noisy_src_tokens, noisy_src_lengths = self.word_shuffle.noising(
x=x,
lengths=lengths,
max_shuffle_distance=self.max_word_shuffle_distance,
)
# 2. Word Dropout
noisy_src_tokens, noisy_src_lengths = self.word_dropout.noising(
x=noisy_src_tokens,
lengths=noisy_src_lengths,
dropout_prob=self.word_dropout_prob,
)
# 3. Word Blanking
noisy_src_tokens, noisy_src_lengths = self.word_dropout.noising(
x=noisy_src_tokens,
lengths=noisy_src_lengths,
dropout_prob=self.word_blanking_prob,
blank_idx=self.dictionary.unk(),
)
return noisy_src_tokens
class NoisingDataset(torch.utils.data.Dataset):
def __init__(
self,
src_dataset,
src_dict,
seed,
noiser=None,
noising_class=UnsupervisedMTNoising,
**kwargs
):
"""
Wrap a :class:`~torch.utils.data.Dataset` and apply noise to the
samples based on the supplied noising configuration.
Args:
src_dataset (~torch.utils.data.Dataset): dataset to wrap.
to build self.src_dataset --
a LanguagePairDataset with src dataset as the source dataset and
None as the target dataset. Should NOT have padding so that
src_lengths are accurately calculated by language_pair_dataset
collate function.
We use language_pair_dataset here to encapsulate the tgt_dataset
so we can re-use the LanguagePairDataset collater to format the
batches in the structure that SequenceGenerator expects.
src_dict (~fairseq.data.Dictionary): source dictionary
seed (int): seed to use when generating random noise
noiser (WordNoising): a pre-initialized :class:`WordNoising`
instance. If this is None, a new instance will be created using
*noising_class* and *kwargs*.
noising_class (class, optional): class to use to initialize a
default :class:`WordNoising` instance.
kwargs (dict, optional): arguments to initialize the default
:class:`WordNoising` instance given by *noiser*.
"""
self.src_dataset = src_dataset
self.src_dict = src_dict
self.seed = seed
self.noiser = (
noiser
if noiser is not None
else noising_class(
dictionary=src_dict,
**kwargs,
)
)
self.sizes = src_dataset.sizes
def __getitem__(self, index):
"""
Returns a single noisy sample. Multiple samples are fed to the collater
create a noising dataset batch.
"""
src_tokens = self.src_dataset[index]
src_lengths = torch.LongTensor([len(src_tokens)])
src_tokens = src_tokens.unsqueeze(0)
# Transpose src tokens to fit expected shape of x in noising function
# (batch size, sequence length) -> (sequence length, batch size)
src_tokens_t = torch.t(src_tokens)
with data_utils.numpy_seed(self.seed + index):
noisy_src_tokens = self.noiser.noising(src_tokens_t, src_lengths)
# Transpose back to expected src_tokens format
# (sequence length, 1) -> (1, sequence length)
noisy_src_tokens = torch.t(noisy_src_tokens)
return noisy_src_tokens[0]
def __len__(self):
"""
The length of the noising dataset is the length of src.
"""
return len(self.src_dataset)
@property
def supports_prefetch(self):
return self.src_dataset.supports_prefetch
def prefetch(self, indices):
if self.src_dataset.supports_prefetch:
self.src_dataset.prefetch(indices)
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from . import FairseqDataset
class NumSamplesDataset(FairseqDataset):
def __getitem__(self, index):
return 1
def __len__(self):
return 0
def collater(self, samples):
return sum(samples)
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import numpy as np
import torch
from . import BaseWrapperDataset
class NumelDataset(BaseWrapperDataset):
def __init__(self, dataset, reduce=False):
super().__init__(dataset)
self.reduce = reduce
def __getitem__(self, index):
item = self.dataset[index]
if torch.is_tensor(item):
return torch.numel(item)
else:
return np.size(item)
def __len__(self):
return len(self.dataset)
def collater(self, samples):
if self.reduce:
return sum(samples)
else:
return torch.tensor(samples)
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from . import BaseWrapperDataset
class OffsetTokensDataset(BaseWrapperDataset):
def __init__(self, dataset, offset):
super().__init__(dataset)
self.offset = offset
def __getitem__(self, idx):
return self.dataset[idx] + self.offset
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from fairseq.data import data_utils
from . import BaseWrapperDataset
class PadDataset(BaseWrapperDataset):
def __init__(self, dataset, pad_idx, left_pad, pad_length=None):
super().__init__(dataset)
self.pad_idx = pad_idx
self.left_pad = left_pad
self.pad_length = pad_length
def collater(self, samples):
return data_utils.collate_tokens(
samples, self.pad_idx, left_pad=self.left_pad, pad_to_length=self.pad_length
)
class LeftPadDataset(PadDataset):
def __init__(self, dataset, pad_idx):
super().__init__(dataset, pad_idx, left_pad=True)
class RightPadDataset(PadDataset):
def __init__(self, dataset, pad_idx):
super().__init__(dataset, pad_idx, left_pad=False)
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import hashlib
import json
import subprocess
import tempfile
from typing import Hashable
try:
import pyarrow.plasma as plasma
PYARROW_AVAILABLE = True
except ImportError:
plasma = None
PYARROW_AVAILABLE = False
class PlasmaArray:
"""
Wrapper around numpy arrays that automatically moves the data to shared
memory upon serialization. This is particularly helpful when passing numpy
arrays through multiprocessing, so that data is not unnecessarily
duplicated or pickled.
"""
def __init__(self, array):
super().__init__()
self.array = array
self.disable = array.nbytes < 134217728 # disable for arrays <128MB
self.object_id = None
self.path = None
# variables with underscores shouldn't be pickled
self._client = None
self._server = None
self._server_tmp = None
self._plasma = None
@property
def plasma(self):
if self._plasma is None and not self.disable:
self._plasma = plasma
return self._plasma
def start_server(self):
if self.plasma is None or self._server is not None:
return
assert self.object_id is None
assert self.path is None
self._server_tmp = tempfile.NamedTemporaryFile()
self.path = self._server_tmp.name
self._server = subprocess.Popen(
["plasma_store", "-m", str(int(1.05 * self.array.nbytes)), "-s", self.path]
)
@property
def client(self):
if self._client is None:
assert self.path is not None
self._client = self.plasma.connect(self.path, num_retries=200)
return self._client
def __getstate__(self):
"""Called on pickle load"""
if self.plasma is None:
return self.__dict__
if self.object_id is None:
self.start_server()
self.object_id = self.client.put(self.array)
state = self.__dict__.copy()
del state["array"]
state["_client"] = None
state["_server"] = None
state["_server_tmp"] = None
state["_plasma"] = None
return state
def __setstate__(self, state):
"""Called on pickle save"""
self.__dict__.update(state)
if self.plasma is None:
return
self.array = self.client.get(self.object_id)
def __del__(self):
if self._server is not None:
self._server.kill()
self._server = None
self._server_tmp.close()
self._server_tmp = None
DEFAULT_PLASMA_PATH = "/tmp/plasma"
class PlasmaView:
"""Interface to write and read from shared memory. Whereas PlasmaArray writes to plasma on serialization,
PlasmaView writes to shared memory on instantiation."""
def __init__(self, array, split_path: str, hash_data: Hashable, plasma_path=None):
"""
Args:
array: numpy array to store. This can be read with ``PlasmaView().array``
split_path: the path whence the data was read, used for hashing
hash_data: other metadata about the array that can be used to create a unique key.
as of writing, the 3 callers in ``TokenBlockDataset`` use::
hash_data = ((block_size, document_sep_len, str(break_mode), len(dataset)), 0|1|2)
"""
assert PYARROW_AVAILABLE
assert split_path is not None
if plasma_path is None:
plasma_path = DEFAULT_PLASMA_PATH
self.path = plasma_path
self.split_path = split_path
self._client = None # Initialize lazily for pickle. plasma clients should not be deep copied or serialized.
self._n = None
self.object_id = self.get_object_id(self.split_path, hash_data)
try:
self.client.put(array, object_id=self.object_id)
except plasma.PlasmaObjectExists:
pass
@property
def client(self):
if self._client is None:
self._client = plasma.connect(self.path, num_retries=200)
return self._client
@property
def array(self):
"""Fetch a read only view of an np.array, stored in plasma."""
ret = self.client.get(self.object_id)
return ret
@staticmethod
def get_object_id(split_path: str, hash_data: Hashable):
"""Returns plasma.ObjectID from hashing split_path and object_num."""
hash = hashlib.blake2b(bytes(split_path, "utf-8"), digest_size=20)
harg = json.dumps(hash_data).encode("utf-8")
hash.update(harg)
return plasma.ObjectID(hash.digest())
def __getstate__(self):
"""Called on pickle save"""
self.disconnect()
state = self.__dict__.copy()
assert state["_client"] is None
assert "object_id" in state
return state
def __setstate__(self, state):
"""Called on pickle load"""
self.__dict__.update(state)
def __del__(self):
self.disconnect()
def disconnect(self):
if self._client is not None:
self._client.disconnect()
self._client = None
def __len__(self):
"""Save reads by caching len"""
if self._n is None:
self._n = len(self.array)
return self._n
GB100 = (1024**3) * 100
class PlasmaStore:
def __init__(self, path=DEFAULT_PLASMA_PATH, nbytes: int = GB100):
self.server = self.start(path, nbytes)
def __del__(self):
self.server.kill()
@staticmethod
def start(path=DEFAULT_PLASMA_PATH, nbytes: int = GB100) -> subprocess.Popen:
if not PYARROW_AVAILABLE:
raise ImportError("please run pip install pyarrow to use --use_plasma_view")
# best practice is to allocate more space than we need. The limitation seems to be the size of /dev/shm
_server = subprocess.Popen(["plasma_store", "-m", str(nbytes), "-s", path])
plasma.connect(path, num_retries=200) # If we can't connect we fail immediately
return _server
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import numpy as np
import torch
from . import BaseWrapperDataset
class PrependDataset(BaseWrapperDataset):
def __init__(self, dataset, prepend_getter, ensure_first_token_is=None):
super().__init__(dataset)
self.prepend_getter = prepend_getter
self.ensure_first_token = ensure_first_token_is
def __getitem__(self, idx):
item = self.dataset[idx]
is_tuple = isinstance(item, tuple)
src = item[0] if is_tuple else item
assert self.ensure_first_token is None or src[0] == self.ensure_first_token
prepend_idx = self.prepend_getter(self.dataset, idx)
assert isinstance(prepend_idx, int)
src[0] = prepend_idx
item = tuple((src,) + item[1:]) if is_tuple else src
return item
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import numpy as np
import torch
from . import BaseWrapperDataset
class PrependTokenDataset(BaseWrapperDataset):
def __init__(self, dataset, token=None):
super().__init__(dataset)
self.token = token
if token is not None:
self._sizes = np.array(dataset.sizes) + 1
else:
self._sizes = dataset.sizes
def __getitem__(self, idx):
item = self.dataset[idx]
if self.token is not None:
item = torch.cat([item.new([self.token]), item])
return item
@property
def sizes(self):
return self._sizes
def num_tokens(self, index):
n = self.dataset.num_tokens(index)
if self.token is not None:
n += 1
return n
def size(self, index):
n = self.dataset.size(index)
if self.token is not None:
n += 1
return n
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import torch
from . import FairseqDataset
class RawLabelDataset(FairseqDataset):
def __init__(self, labels):
super().__init__()
self.labels = labels
def __getitem__(self, index):
return self.labels[index]
def __len__(self):
return len(self.labels)
def collater(self, samples):
return torch.tensor(samples)
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from . import BaseWrapperDataset
class ReplaceDataset(BaseWrapperDataset):
"""Replaces tokens found in the dataset by a specified replacement token
Args:
dataset (~torch.utils.data.Dataset): dataset to replace tokens in
replace_map(Dictionary[int,int]): map of token to replace -> replacement token
offsets (List[int]): do not replace tokens before (from left if pos, right if neg) this offset. should be
as many as the number of objects returned by the underlying dataset __getitem__ method.
"""
def __init__(self, dataset, replace_map, offsets):
super().__init__(dataset)
assert len(replace_map) > 0
self.replace_map = replace_map
self.offsets = offsets
def __getitem__(self, index):
item = self.dataset[index]
is_tuple = isinstance(item, tuple)
srcs = item if is_tuple else [item]
for offset, src in zip(self.offsets, srcs):
for k, v in self.replace_map.items():
src_off = src[offset:] if offset >= 0 else src[:offset]
src_off.masked_fill_(src_off == k, v)
item = srcs if is_tuple else srcs[0]
return item
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
import numpy as np
from fairseq.data import BaseWrapperDataset, plasma_utils
logger = logging.getLogger(__name__)
class ResamplingDataset(BaseWrapperDataset):
"""Randomly samples from a given dataset at each epoch.
Sampling is done with or without replacement, depending on the "replace"
parameter.
Optionally, the epoch size can be rescaled. This is potentially desirable
to increase per-epoch coverage of the base dataset (since sampling with
replacement means that many items in the dataset will be left out). In the
case of sampling without replacement, size_ratio should be strictly less
than 1.
Args:
dataset (~torch.utils.data.Dataset): dataset on which to sample.
weights (List[float]): list of probability weights
(default: None, which corresponds to uniform sampling).
replace (bool): sampling mode; True for "with replacement", or False
for "without replacement" (default: True)
size_ratio (float): the ratio to subsample to; must be positive
(default: 1.0).
batch_by_size (bool): whether or not to batch by sequence length
(default: True).
seed (int): RNG seed to use (default: 0).
epoch (int): starting epoch number (default: 1).
"""
def __init__(
self,
dataset,
weights=None,
replace=True,
size_ratio=1.0,
batch_by_size=True,
seed=0,
epoch=1,
):
super().__init__(dataset)
if weights is None:
self.weights = None
else:
assert len(weights) == len(dataset)
weights_arr = np.array(weights, dtype=np.float64)
weights_arr /= weights_arr.sum()
self.weights = plasma_utils.PlasmaArray(weights_arr)
self.replace = replace
assert size_ratio > 0.0
if not self.replace:
assert size_ratio < 1.0
self.size_ratio = float(size_ratio)
self.actual_size = np.ceil(len(dataset) * self.size_ratio).astype(int)
self.batch_by_size = batch_by_size
self.seed = seed
self._cur_epoch = None
self._cur_indices = None
self.set_epoch(epoch)
def __getitem__(self, index):
return self.dataset[self._cur_indices.array[index]]
def __len__(self):
return self.actual_size
@property
def sizes(self):
if isinstance(self.dataset.sizes, list):
return [s[self._cur_indices.array] for s in self.dataset.sizes]
return self.dataset.sizes[self._cur_indices.array]
def num_tokens(self, index):
return self.dataset.num_tokens(self._cur_indices.array[index])
def size(self, index):
return self.dataset.size(self._cur_indices.array[index])
def ordered_indices(self):
if self.batch_by_size:
order = [
np.arange(len(self)),
self.sizes,
] # No need to handle `self.shuffle == True`
return np.lexsort(order)
else:
return np.arange(len(self))
def prefetch(self, indices):
self.dataset.prefetch(self._cur_indices.array[indices])
@property
def can_reuse_epoch_itr_across_epochs(self):
return False
def set_epoch(self, epoch):
logger.debug("ResamplingDataset.set_epoch: {}".format(epoch))
super().set_epoch(epoch)
if epoch == self._cur_epoch:
return
self._cur_epoch = epoch
# Generate a weighted sample of indices as a function of the
# random seed and the current epoch.
rng = np.random.RandomState(
[
42, # magic number
self.seed % (2**32), # global seed
self._cur_epoch, # epoch index
]
)
self._cur_indices = plasma_utils.PlasmaArray(
rng.choice(
len(self.dataset),
self.actual_size,
replace=self.replace,
p=(None if self.weights is None else self.weights.array),
)
)
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import torch
from . import BaseWrapperDataset
class RollDataset(BaseWrapperDataset):
def __init__(self, dataset, shifts):
super().__init__(dataset)
self.shifts = shifts
def __getitem__(self, index):
item = self.dataset[index]
return torch.roll(item, self.shifts)
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
from collections import OrderedDict
from typing import Dict, Sequence
import numpy as np
from . import FairseqDataset, LanguagePairDataset
logger = logging.getLogger(__name__)
class RoundRobinZipDatasets(FairseqDataset):
"""Zip multiple :class:`~fairseq.data.FairseqDataset` instances together.
Shorter datasets are repeated in a round-robin fashion to match the length
of the longest one.
Args:
datasets (Dict[~fairseq.data.FairseqDataset]): a dictionary of
:class:`~fairseq.data.FairseqDataset` instances.
eval_key (str, optional): a key used at evaluation time that causes
this instance to pass-through batches from *datasets[eval_key]*.
"""
def __init__(self, datasets, eval_key=None):
super().__init__()
if isinstance(datasets, dict):
datasets = OrderedDict(datasets)
assert isinstance(datasets, OrderedDict)
assert datasets, "Can't make a RoundRobinZipDatasets out of nothing"
for dataset in datasets.values():
assert isinstance(dataset, FairseqDataset)
self.datasets = datasets
self.eval_key = eval_key
self.longest_dataset_key = max(datasets, key=lambda k: len(datasets[k]))
self.longest_dataset = datasets[self.longest_dataset_key]
self._ordered_indices: Dict[str, Sequence[int]] = None
def _map_index(self, key, index):
assert (
self._ordered_indices is not None
), "Must call RoundRobinZipDatasets.ordered_indices() first"
o = self._ordered_indices[key]
return o[index % len(o)]
def __getitem__(self, index):
if self.eval_key is None:
return OrderedDict(
[
(key, dataset[self._map_index(key, index)])
for key, dataset in self.datasets.items()
]
)
else:
# at evaluation time it's useful to pass-through batches from a single key
return self.datasets[self.eval_key][self._map_index(self.eval_key, index)]
def __len__(self):
if self._ordered_indices is not None:
return len(self._ordered_indices[self.longest_dataset_key])
return len(self.longest_dataset)
def collater(self, samples):
"""Merge a list of samples to form a mini-batch."""
if len(samples) == 0:
return None
if self.eval_key is None:
return OrderedDict(
[
(key, dataset.collater([sample[key] for sample in samples]))
for key, dataset in self.datasets.items()
]
)
else:
# at evaluation time it's useful to pass-through batches from a single key
return self.datasets[self.eval_key].collater(samples)
def num_tokens(self, index):
"""Return an example's length (number of tokens), used for batching."""
# TODO make it configurable whether to use max() or sum() here
return max(
dataset.num_tokens(self._map_index(key, index))
for key, dataset in self.datasets.items()
)
def size(self, index):
"""Return an example's size as a float or tuple. This value is used when
filtering a dataset with ``--max-positions``."""
return {
key: dataset.size(self._map_index(key, index))
for key, dataset in self.datasets.items()
}
def ordered_indices(self):
"""Ordered indices for batching."""
if self._ordered_indices is None:
# Call the underlying dataset's ordered_indices() here, so that we
# get the same random ordering as we would have from using the
# underlying sub-datasets directly.
self._ordered_indices = OrderedDict(
[
(key, dataset.ordered_indices())
for key, dataset in self.datasets.items()
]
)
return np.arange(len(self))
def filter_indices_by_size(self, indices, max_positions=None):
"""
Filter each sub-dataset independently, then update the round robin to work
on the filtered sub-datasets.
"""
def _deep_until_language_pair(dataset):
if isinstance(dataset, LanguagePairDataset):
return dataset
if hasattr(dataset, "tgt_dataset"):
return _deep_until_language_pair(dataset.tgt_dataset)
if hasattr(dataset, "dataset"):
return _deep_until_language_pair(dataset.dataset)
raise Exception(f"Don't know how to unwrap this dataset: {dataset}")
if not isinstance(max_positions, dict):
max_positions = {k: max_positions for k in self.datasets.keys()}
ignored_some = False
for key, dataset in self.datasets.items():
dataset = _deep_until_language_pair(dataset)
self._ordered_indices[key], ignored = dataset.filter_indices_by_size(
self._ordered_indices[key], max_positions[key]
)
if len(ignored) > 0:
ignored_some = True
logger.warning(
f"{len(ignored)} samples from {key} have invalid sizes and will be skipped, "
f"max_positions={max_positions[key]}, first few sample ids={ignored[:10]}"
)
# Since we are modifying in place the _ordered_indices,
# it's not possible anymore to return valid ignored indices.
# Hopefully the extra debug information print above should be enough to debug.
# Ideally we would receive ignore_invalid_inputs so that we could have
# a proper error message.
return (np.arange(len(self)), [0] if ignored_some else [])
@property
def supports_prefetch(self):
return all(
getattr(dataset, "supports_prefetch", False)
for dataset in self.datasets.values()
)
def prefetch(self, indices):
for key, dataset in self.datasets.items():
dataset.prefetch([self._map_index(key, index) for index in indices])
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import numpy as np
from fairseq.data import data_utils
from . import BaseWrapperDataset
class TruncateDataset(BaseWrapperDataset):
"""Truncate a sequence by returning the first truncation_length tokens"""
def __init__(self, dataset, truncation_length):
super().__init__(dataset)
assert truncation_length is not None
self.truncation_length = truncation_length
self.dataset = dataset
def __getitem__(self, index):
item = self.dataset[index]
item_len = item.size(0)
if item_len > self.truncation_length:
item = item[: self.truncation_length]
return item
@property
def sizes(self):
return np.minimum(self.dataset.sizes, self.truncation_length)
def __len__(self):
return len(self.dataset)
class RandomCropDataset(TruncateDataset):
"""Truncate a sequence by returning a random crop of truncation_length tokens"""
def __init__(self, dataset, truncation_length, seed=1):
super().__init__(dataset, truncation_length)
self.seed = seed
self.epoch = 0
@property
def can_reuse_epoch_itr_across_epochs(self):
return True # only the crop changes, not item sizes
def set_epoch(self, epoch, **unused):
super().set_epoch(epoch)
self.epoch = epoch
def __getitem__(self, index):
with data_utils.numpy_seed(self.seed, self.epoch, index):
item = self.dataset[index]
item_len = item.size(0)
excess = item_len - self.truncation_length
if excess > 0:
start_idx = np.random.randint(0, excess)
item = item[start_idx : start_idx + self.truncation_length]
return item
def maybe_shorten_dataset(
dataset,
split,
shorten_data_split_list,
shorten_method,
tokens_per_sample,
seed,
):
truncate_split = (
split in shorten_data_split_list.split(",") or len(shorten_data_split_list) == 0
)
if shorten_method == "truncate" and truncate_split:
dataset = TruncateDataset(dataset, tokens_per_sample)
elif shorten_method == "random_crop" and truncate_split:
dataset = RandomCropDataset(dataset, tokens_per_sample, seed)
return dataset
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import numpy as np
from . import BaseWrapperDataset
class SortDataset(BaseWrapperDataset):
def __init__(self, dataset, sort_order):
super().__init__(dataset)
if not isinstance(sort_order, (list, tuple)):
sort_order = [sort_order]
self.sort_order = sort_order
assert all(len(so) == len(dataset) for so in sort_order)
def ordered_indices(self):
return np.lexsort(self.sort_order)
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from . import BaseWrapperDataset
class StripTokenDataset(BaseWrapperDataset):
def __init__(self, dataset, id_to_strip):
super().__init__(dataset)
self.id_to_strip = id_to_strip
def __getitem__(self, index):
item = self.dataset[index]
while len(item) > 0 and item[-1] == self.id_to_strip:
item = item[:-1]
while len(item) > 0 and item[0] == self.id_to_strip:
item = item[1:]
return item
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
import numpy as np
from . import BaseWrapperDataset
logger = logging.getLogger(__name__)
class SubsampleDataset(BaseWrapperDataset):
"""Subsamples a given dataset by a specified ratio. Subsampling is done on the number of examples
Args:
dataset (~torch.utils.data.Dataset): dataset to subsample
size_ratio(float): the ratio to subsample to. must be between 0 and 1 (exclusive)
"""
def __init__(self, dataset, size_ratio, shuffle=False):
super().__init__(dataset)
assert size_ratio < 1
self.actual_size = np.ceil(len(dataset) * size_ratio).astype(int)
self.indices = np.random.choice(
list(range(len(self.dataset))), self.actual_size, replace=False
)
self.shuffle = shuffle
logger.info(
"subsampled dataset from {} to {} (ratio={})".format(
len(self.dataset), self.actual_size, size_ratio
)
)
def __getitem__(self, index):
return self.dataset[self.indices[index]]
def __len__(self):
return self.actual_size
def collater(self, samples):
return self.dataset.collater(samples)
@property
def sizes(self):
return self.dataset.sizes[self.indices]
@property
def name(self):
return self.dataset.name
def num_tokens(self, index):
return self.dataset.num_tokens(self.indices[index])
def size(self, index):
return self.dataset.size(self.indices[index])
def ordered_indices(self):
"""Return an ordered list of indices. Batches will be constructed based
on this order."""
if self.shuffle:
order = [np.random.permutation(len(self))]
else:
order = [np.arange(len(self))]
order.append(self.sizes)
return np.lexsort(order)
def prefetch(self, indices):
self.dataset.prefetch(self.indices[indices])
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from enum import Enum
class TextCompressionLevel(Enum):
none = 0
low = 1
high = 2
class TextCompressor(object):
def __init__(
self, level: TextCompressionLevel, max_input_byte_length: int = 2**16
):
self.level = level
self.max_input_length = max_input_byte_length
def compress(self, text: str) -> bytes:
if self.level == TextCompressionLevel.low:
import zlib
# zlib: built-in, fast
return zlib.compress(text.encode(), level=0)
elif self.level == TextCompressionLevel.high:
try:
import unishox2
# unishox2: optimized for short text but slower
except ImportError:
raise ImportError(
"Please install unishox2 for the text compression feature: "
"pip install unishox2-py3"
)
assert len(text.encode()) <= self.max_input_length
return unishox2.compress(text)[0]
else:
return text.encode()
def decompress(self, compressed: bytes) -> str:
if self.level == TextCompressionLevel.low:
import zlib
return zlib.decompress(compressed).decode()
elif self.level == TextCompressionLevel.high:
try:
import unishox2
except ImportError:
raise ImportError(
"Please install unishox2 for the text compression feature: "
"pip install unishox2-py3"
)
return unishox2.decompress(compressed, self.max_input_length)
else:
return compressed.decode()
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import numpy as np
import torch
from fairseq.data import FairseqDataset, plasma_utils
from fairseq.data.indexed_dataset import best_fitting_int_dtype
from typing import Tuple
class TokenBlockDataset(FairseqDataset):
"""Break a Dataset of tokens into blocks.
Args:
dataset (~torch.utils.data.Dataset): dataset to break into blocks
sizes (List[int]): sentence lengths (required for 'complete' and 'eos')
block_size (int): maximum block size (ignored in 'eos' break mode)
break_mode (str, optional): Mode used for breaking tokens. Values can
be one of:
- 'none': break tokens into equally sized blocks (up to block_size)
- 'complete': break tokens into blocks (up to block_size) such that
blocks contains complete sentences, although block_size may be
exceeded if some sentences exceed block_size
- 'complete_doc': similar to 'complete' mode, but do not
cross document boundaries
- 'eos': each block contains one sentence (block_size is ignored)
include_targets (bool, optional): return next tokens as targets
(default: False).
document_sep_len (int, optional): document separator size (required for
'complete_doc' break mode). Typically 1 if the sentences have eos
and 0 otherwise.
"""
def __init__(
self,
dataset,
sizes,
block_size,
pad,
eos,
break_mode=None,
include_targets=False,
document_sep_len=1,
use_plasma_view=False,
split_path=None,
plasma_path=None,
):
super().__init__()
self.dataset = dataset
self.pad = pad
self.eos = eos
self.include_targets = include_targets
assert len(dataset) > 0
assert len(dataset) == len(sizes)
_sizes, block_to_dataset_index, slice_indices = self._build_slice_indices(
sizes, break_mode, document_sep_len, block_size
)
if use_plasma_view:
plasma_id = (block_size, document_sep_len, str(break_mode), len(dataset))
self._slice_indices = plasma_utils.PlasmaView(
slice_indices, split_path, (plasma_id, 0), plasma_path=plasma_path
)
self._sizes = plasma_utils.PlasmaView(
_sizes, split_path, (plasma_id, 1), plasma_path=plasma_path
)
self._block_to_dataset_index = plasma_utils.PlasmaView(
block_to_dataset_index,
split_path,
(plasma_id, 2),
plasma_path=plasma_path,
)
else:
self._slice_indices = plasma_utils.PlasmaArray(slice_indices)
self._sizes = plasma_utils.PlasmaArray(_sizes)
self._block_to_dataset_index = plasma_utils.PlasmaArray(
block_to_dataset_index
)
@staticmethod
def _build_slice_indices(
sizes, break_mode, document_sep_len, block_size
) -> Tuple[np.ndarray]:
"""Use token_block_utils_fast to build arrays for indexing into self.dataset"""
try:
from fairseq.data.token_block_utils_fast import (
_get_slice_indices_fast,
_get_block_to_dataset_index_fast,
)
except ImportError:
raise ImportError(
"Please build Cython components with: `pip install --editable .` "
"or `python setup.py build_ext --inplace`"
)
if isinstance(sizes, list):
sizes = np.array(sizes, dtype=np.int64)
else:
if torch.is_tensor(sizes):
sizes = sizes.numpy()
sizes = sizes.astype(np.int64)
break_mode = break_mode if break_mode is not None else "none"
# For "eos" break-mode, block_size is not required parameters.
if break_mode == "eos" and block_size is None:
block_size = 0
slice_indices = _get_slice_indices_fast(
sizes, str(break_mode), block_size, document_sep_len
)
_sizes = slice_indices[:, 1] - slice_indices[:, 0]
# build index mapping block indices to the underlying dataset indices
if break_mode == "eos":
# much faster version for eos break mode
block_to_dataset_index = np.stack(
[
np.arange(len(sizes)), # starting index in dataset
np.zeros(
len(sizes), dtype=np.compat.long
), # starting offset within starting index
np.arange(len(sizes)), # ending index in dataset
],
1,
)
else:
block_to_dataset_index = _get_block_to_dataset_index_fast(
sizes,
slice_indices,
)
size_dtype = np.uint16 if block_size < 65535 else np.uint32
num_tokens = slice_indices[-1].max()
slice_indices_dtype = best_fitting_int_dtype(num_tokens)
slice_indices = slice_indices.astype(slice_indices_dtype)
_sizes = _sizes.astype(size_dtype)
block_to_dataset_index = block_to_dataset_index.astype(slice_indices_dtype)
return _sizes, block_to_dataset_index, slice_indices
@property
def slice_indices(self):
return self._slice_indices.array
@property
def sizes(self):
return self._sizes.array
@property
def block_to_dataset_index(self):
return self._block_to_dataset_index.array
def attr(self, attr: str, index: int):
start_ds_idx, _, _ = self.block_to_dataset_index[index]
return self.dataset.attr(attr, start_ds_idx)
def __getitem__(self, index):
start_ds_idx, start_offset, end_ds_idx = self.block_to_dataset_index[index]
buffer = torch.cat(
[self.dataset[idx] for idx in range(start_ds_idx, end_ds_idx + 1)]
)
slice_s, slice_e = self.slice_indices[index]
length = slice_e - slice_s
s, e = start_offset, start_offset + length
item = buffer[s:e]
if self.include_targets:
# *target* is the original sentence (=item)
# *source* is shifted right by 1 (maybe left-padded with eos)
# *past_target* is shifted right by 2 (left-padded as needed)
if s == 0:
source = torch.cat([item.new([self.eos]), buffer[0 : e - 1]])
past_target = torch.cat(
[item.new([self.pad, self.eos]), buffer[0 : e - 2]]
)
else:
source = buffer[s - 1 : e - 1]
if s == 1:
past_target = torch.cat([item.new([self.eos]), buffer[0 : e - 2]])
else:
past_target = buffer[s - 2 : e - 2]
return source, item, past_target
return item
def __len__(self):
return len(self.slice_indices)
@property
def supports_prefetch(self):
return getattr(self.dataset, "supports_prefetch", False)
def prefetch(self, indices):
self.dataset.prefetch(
{
ds_idx
for index in indices
for start_ds_idx, _, end_ds_idx in [self.block_to_dataset_index[index]]
for ds_idx in range(start_ds_idx, end_ds_idx + 1)
}
)
# cython: language_level=3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import numpy as np
import torch
from itertools import chain
from libc.math cimport ceil
cimport cython
cimport numpy as np
from libc.stdint cimport int32_t, int64_t
DTYPE = np.int64
ctypedef int64_t DTYPE_t
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
cdef np.ndarray[DTYPE_t, ndim=2] _get_slice_indices_none_mode(np.ndarray[DTYPE_t, ndim=1] sizes, int block_size):
cdef DTYPE_t total_size = sizes.sum()
cdef DTYPE_t length = <DTYPE_t> ceil(total_size / <double> block_size)
cdef np.ndarray[DTYPE_t, ndim=2] slice_indices = np.zeros([length, 2], dtype=DTYPE)
cdef DTYPE_t[:, :] slice_indices_view = slice_indices
cdef DTYPE_t i
cdef DTYPE_t start
cdef DTYPE_t end
for i in range(length):
start = i * block_size
end = min(start + block_size, total_size)
slice_indices_view[i][0] = start
slice_indices_view[i][1] = end
return slice_indices
cdef np.ndarray[DTYPE_t, ndim=2] _fast_convert_to_np_array(list list_of_list):
"""
Faster function to convert DTYPE_t list of list.
Only fast when there are huge number of rows and low number of columns.
"""
cdef np.ndarray[DTYPE_t, ndim=1] flat = np.fromiter(chain.from_iterable(list_of_list), DTYPE, -1)
return flat.reshape((len(list_of_list), -1))
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
cpdef np.ndarray[DTYPE_t, ndim=2] _get_slice_indices_fast(np.ndarray[DTYPE_t, ndim=1] sizes, str break_mode, int block_size, int document_sep_len):
cdef DTYPE_t tok_idx = 0
cdef DTYPE_t sz_idx = 0
cdef DTYPE_t curr_size = 0
cdef DTYPE_t i = 0
cdef DTYPE_t length
cdef DTYPE_t total_size
cdef DTYPE_t[:] sizes_view = sizes
cdef np.ndarray[DTYPE_t, ndim=2] slice_indices
cdef list slice_indices_list = []
if break_mode is None or break_mode == 'none':
slice_indices = _get_slice_indices_none_mode(sizes, block_size)
elif break_mode == 'complete':
while sz_idx < len(sizes_view):
if curr_size + sizes_view[sz_idx] <= block_size or curr_size == 0:
curr_size += sizes_view[sz_idx]
sz_idx += 1
else:
slice_indices_list.append((tok_idx, tok_idx + curr_size))
tok_idx += curr_size
curr_size = 0
if curr_size > 0:
slice_indices_list.append((tok_idx, tok_idx + curr_size))
slice_indices = _fast_convert_to_np_array(slice_indices_list)
elif break_mode == 'complete_doc':
while sz_idx < len(sizes_view):
if (
(curr_size + sizes_view[sz_idx] <= block_size or curr_size == 0)
# an empty sentence indicates end-of-document:
and sizes_view[sz_idx] != document_sep_len
):
curr_size += sizes_view[sz_idx]
sz_idx += 1
else:
# Only keep non-empty documents.
if curr_size > 1:
slice_indices_list.append((tok_idx, tok_idx + curr_size))
tok_idx += curr_size
curr_size = 0
if sizes_view[sz_idx] == document_sep_len:
tok_idx += sizes_view[sz_idx]
sz_idx += 1
if curr_size > 1:
slice_indices_list.append((tok_idx, tok_idx + curr_size))
slice_indices = _fast_convert_to_np_array(slice_indices_list)
elif break_mode == 'eos':
slice_indices = np.zeros((len(sizes), 2), dtype=DTYPE)
cumsum = sizes.cumsum(axis=0)
slice_indices[1:, 0] = cumsum[:cumsum.shape[0] - 1]
slice_indices[:, 1] = cumsum
else:
raise ValueError('Invalid break_mode: ' + break_mode)
return slice_indices
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
cpdef np.ndarray[DTYPE_t, ndim=2] _get_block_to_dataset_index_fast(np.ndarray[DTYPE_t, ndim=1] sizes, np.ndarray[DTYPE_t, ndim=2] slice_indices):
cdef DTYPE_t start_ds_idx
cdef DTYPE_t start_offset
cdef DTYPE_t end_ds_idx
cdef DTYPE_t i
cdef DTYPE_t s
cdef DTYPE_t e
cdef DatasetSearcher ds = DatasetSearcher(sizes)
cdef np.ndarray[DTYPE_t, ndim=2] block_to_dataset_index = np.zeros([len(slice_indices), 3], dtype=DTYPE)
cdef DTYPE_t[:, :] block_to_dataset_index_view = block_to_dataset_index
cdef DTYPE_t[:, :] slice_indices_view = slice_indices
cdef Py_ssize_t x_max = slice_indices.shape[0]
for i in range(x_max):
s = slice_indices_view[i][0]
e = slice_indices_view[i][1]
ds.seek(s)
start_ds_idx = ds.current_index
start_offset = ds.current_offset
if e <= s:
end_ds_idx = start_ds_idx
else:
ds.seek(e - 1)
end_ds_idx = ds.current_index
block_to_dataset_index_view[i][0] = start_ds_idx # starting index in dataset
block_to_dataset_index_view[i][1] = start_offset # starting offset within starting index
block_to_dataset_index_view[i][2] = end_ds_idx # ending index in dataset
return block_to_dataset_index
cdef class DatasetSearcher(object):
"""Helper for mapping "flat" indices to indices and offsets in an
underlying dataset."""
cdef DTYPE_t current_i
cdef DTYPE_t current_offset
cdef DTYPE_t current_index
cdef DTYPE_t[:] sizes
def __init__(self, DTYPE_t[:] sizes):
self.sizes = sizes
self.reset()
cdef reset(self):
self.current_offset = 0 # offset within current index in underlying dataset
self.current_i = 0 # "flat" index
self.current_index = 0 # index in underlying dataset
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
cdef int step(self, DTYPE_t i):
cdef DTYPE_t to_consume
cdef DTYPE_t remaining
if i < self.current_i:
self.reset()
if i > self.current_i:
to_consume = i - self.current_i
remaining = self.sizes[self.current_index] - self.current_offset
if remaining > to_consume:
self.current_offset += to_consume
self.current_i += to_consume
else:
assert remaining >= 0
self.current_i += remaining
self.current_index += 1
self.current_offset = 0
return 1
return 0
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
cdef seek(self, DTYPE_t i):
cdef int not_done = 1
while not_done == 1:
not_done = self.step(i)
assert self.current_i == i
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment