Commit 9462a819 authored by davidecaroselli, committed by Facebook Github Bot
Browse files

Enhanced MMapIndexedDataset: less memory, higher speed (#816)

Summary:
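I have upgraded my previous implementation of MMapIndexedDataset. It now:
- Uses up to **4 times less memory and disk space** (token ids are stored as `uint16` when the vocabulary has fewer than 65,500 entries, and as `int32` otherwise)
- Slightly improves words per second, thanks to fewer memory accesses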
Pull Request resolved: https://github.com/pytorch/fairseq/pull/816

Differential Revision: D15899848

Pulled By: myleott

fbshipit-source-id: 9ddeb4809729ef69cc6b0867b33ee71184d845e6
parent 9c3bb5c6
...@@ -15,9 +15,16 @@ import torch ...@@ -15,9 +15,16 @@ import torch
from . import FairseqDataset from . import FairseqDataset
def make_builder(out_file, impl): def __best_fitting_dtype(vocab_size=None):
if vocab_size is not None and vocab_size < 65500:
return np.uint16
else:
return np.int32
def make_builder(out_file, impl, vocab_size=None):
if impl == 'mmap': if impl == 'mmap':
return MMapIndexedDatasetBuilder(out_file) return MMapIndexedDatasetBuilder(out_file, dtype=__best_fitting_dtype(vocab_size))
else: else:
return IndexedDatasetBuilder(out_file) return IndexedDatasetBuilder(out_file)
...@@ -63,6 +70,7 @@ dtypes = { ...@@ -63,6 +70,7 @@ dtypes = {
5: np.int64, 5: np.int64,
6: np.float, 6: np.float,
7: np.double, 7: np.double,
8: np.uint16
} }
...@@ -143,7 +151,7 @@ class IndexedDataset(FairseqDataset): ...@@ -143,7 +151,7 @@ class IndexedDataset(FairseqDataset):
@staticmethod @staticmethod
def exists(path): def exists(path):
return ( return (
os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path)) os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path))
) )
@property @property
...@@ -440,11 +448,11 @@ class MMapIndexedDataset(torch.utils.data.Dataset): ...@@ -440,11 +448,11 @@ class MMapIndexedDataset(torch.utils.data.Dataset):
def __getitem__(self, i): def __getitem__(self, i):
ptr, size = self._index[i] ptr, size = self._index[i]
tensor = torch.from_numpy(np.frombuffer(self._bin_buffer, dtype=self._index.dtype, count=size, offset=ptr)) np_array = np.frombuffer(self._bin_buffer, dtype=self._index.dtype, count=size, offset=ptr)
if tensor.dtype == torch.int64: if self._index.dtype != np.int64:
return tensor np_array = np_array.astype(np.int64)
else:
return tensor.long() return torch.from_numpy(np_array)
@property @property
def sizes(self): def sizes(self):
...@@ -457,7 +465,7 @@ class MMapIndexedDataset(torch.utils.data.Dataset): ...@@ -457,7 +465,7 @@ class MMapIndexedDataset(torch.utils.data.Dataset):
@staticmethod @staticmethod
def exists(path): def exists(path):
return ( return (
os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path)) os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path))
) )
......
...@@ -129,7 +129,8 @@ def main(args): ...@@ -129,7 +129,8 @@ def main(args):
) )
pool.close() pool.close()
ds = indexed_dataset.make_builder(dataset_dest_file(args, output_prefix, lang, "bin"), impl=args.dataset_impl) ds = indexed_dataset.make_builder(dataset_dest_file(args, output_prefix, lang, "bin"),
impl=args.dataset_impl, vocab_size=len(vocab))
merge_result( merge_result(
Binarizer.binarize( Binarizer.binarize(
input_file, vocab, lambda t: ds.add_item(t), input_file, vocab, lambda t: ds.add_item(t),
...@@ -231,7 +232,8 @@ def main(args): ...@@ -231,7 +232,8 @@ def main(args):
def binarize(args, filename, vocab, output_prefix, lang, offset, end, append_eos=True): def binarize(args, filename, vocab, output_prefix, lang, offset, end, append_eos=True):
ds = indexed_dataset.make_builder(dataset_dest_file(args, output_prefix, lang, "bin"), impl=args.dataset_impl) ds = indexed_dataset.make_builder(dataset_dest_file(args, output_prefix, lang, "bin"),
impl=args.dataset_impl, vocab_size=len(vocab))
def consumer(tensor): def consumer(tensor):
ds.add_item(tensor) ds.add_item(tensor)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment