Unverified Commit fae6c92e authored by Hongxin Liu's avatar Hongxin Liu Committed by GitHub
Browse files

Merge branch 'main' into feature/shardformer

parents bd186784 ac178ca5
......@@ -79,7 +79,7 @@ from colossalai.core import global_context as gpc
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn.lr_scheduler import LinearWarmupLR
from colossalai.nn.metric import Accuracy
from colossalai.trainer import Trainer, hooks
from colossalai.legacy.trainer import Trainer, hooks
```
- Other modules
......@@ -273,8 +273,8 @@ SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE) ** 2 + 1 # add 1 for cls token
### Build pipeline model (`/hybrid_parallel/model/vit.py`)
Colossal-AI provides two methods to build a pipeline model from the existing model.
- `colossalai.builder.build_pipeline_model_from_cfg`
- `colossalai.builder.build_pipeline_model`
- `colossalai.legacy.builder.build_pipeline_model_from_cfg`
- `colossalai.legacy.builder.build_pipeline_model`
Besides, you can also build a pipeline model from scratch with Colossal-AI.
```python
......@@ -284,11 +284,11 @@ from typing import Callable
import inspect
import torch
from colossalai import nn as col_nn
from colossalai.registry import LAYERS, MODELS
from colossalai.legacy.registry import LAYERS, MODELS
from colossalai.logging import get_dist_logger
from colossalai.core import global_context as gpc
from colossalai.context import ParallelMode
from colossalai.builder.pipeline import partition_uniform
from colossalai.legacy.builder.pipeline import partition_uniform
from torch import dtype, nn
from model_zoo.vit.vit import ViTBlock, ViTEmbedding, ViTHead
......@@ -415,7 +415,7 @@ def build_pipeline_vit(num_layers, num_chunks, device=torch.device('cuda'), **kw
#### Import modules
```python
from colossalai.engine.schedule import (InterleavedPipelineSchedule,
from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
PipelineSchedule)
from colossalai.utils import MultiTimer
import os
......@@ -644,3 +644,4 @@ torchrun --standalone --nproc_per_node <NUM_GPUs> train_hybrid.py --config ./co
# If your torch >= 1.9.0
# python -m torch.distributed.run --standalone --nproc_per_node=<NUM_GPUs> train_hybrid.py --config ./configs/config_hybrid_parallel.py
```
<!-- doc-test-command: echo -->
......@@ -64,7 +64,7 @@ Trainer is a more high-level wrapper for the user to execute training with fewer
```python
from colossalai.logging import get_dist_logger
from colossalai.trainer import Trainer, hooks
from colossalai.legacy.trainer import Trainer, hooks
# build components and initialize with colossalai.initialize
...
......@@ -107,7 +107,7 @@ If you want to customize your own hook class, you can inherit `hooks.BaseHook` a
```python
from colossalai.logging import get_dist_logger
from colossalai.trainer import hooks
from colossalai.legacy.trainer import hooks
class LogMessageHook(hooks.BaseHook):
......@@ -345,7 +345,7 @@ If you wish to train with a trainer object, you can follow the code snippet belo
```python
from colossalai.nn.metric import Accuracy
from colossalai.trainer import Trainer, hooks
from colossalai.legacy.trainer import Trainer, hooks
# create a trainer object
......@@ -387,3 +387,4 @@ python -m torch.distributed.launch --nproc_per_node <num_gpus> --master_addr loc
# with trainer
python -m torch.distributed.launch --nproc_per_node <num_gpus> --master_addr localhost --master_port 29500 run_resnet_cifar10_with_trainer.py
```
<!-- doc-test-command: echo -->
......@@ -41,7 +41,7 @@ for epoch in range(num_epochs):
#### Save when using trainer
```python
from colossalai.trainer import Trainer, hooks
from colossalai.legacy.trainer import Trainer, hooks
model = ...
engine, _, _, _ = colossalai.initialize(model=model, ...)
trainer = Trainer(engine, ...)
......@@ -61,3 +61,4 @@ model = ...
load_checkpoint('xxx.pt', model)
... # train or test
```
<!-- doc-test-command: echo -->
......@@ -28,8 +28,8 @@ To implement a customized gradient handler, you need to follow these steps.
3. implement `handle_gradient` method.
```python
from colossalai.registry import GRADIENT_HANDLER
from colossalai.engine.gradient_handler import BaseGradientHandler
from colossalai.legacy.registry import GRADIENT_HANDLER
from colossalai.legacy.engine.gradient_handler import BaseGradientHandler
@GRADIENT_HANDLER.register_module
......@@ -61,3 +61,4 @@ to demonstrate the use of gradient handler. In this example, we used `DataParall
```shell
python -m torch.distributed.launch --nproc_per_node 4 --master_addr localhost --master_port 29500 train_with_engine.py
```
<!-- doc-test-command: echo -->
......@@ -267,7 +267,7 @@ from pathlib import Path
from colossalai.core import global_context as gpc
from colossalai.logging import get_dist_logger
from colossalai.utils import get_dataloader
from colossalai.trainer import Trainer, hooks
from colossalai.legacy.trainer import Trainer, hooks
from colossalai.nn.lr_scheduler import LinearWarmupLR
from timm.models import vit_base_patch16_224
from torchvision import datasets, transforms
......
......@@ -79,7 +79,7 @@ import colossalai.nn as col_nn
from colossalai.core import global_context as gpc
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.trainer import Trainer, hooks
from colossalai.legacy.trainer import Trainer, hooks
from colossalai.utils import MultiTimer, get_dataloader
from colossalai.context import ParallelMode
from colossalai.pipeline.pipelinable import PipelinableContext
......@@ -157,3 +157,4 @@ trainer.fit(train_dataloader=train_dataloader,
```
We use `2` pipeline stages and the batch will be split into `4` micro batches.
<!-- doc-test-command: echo -->
......@@ -81,14 +81,14 @@ Colossal-AI 为用户提供了一个全局 context,使他们能够轻松地管
## 梯度 Handler
梯度 handler 是对参数的梯度执行 all-reduce 操作的对象。由于不同的 all-reduce 策略或许在不同的并行中被执行,用户可以继承
`colossalai.engine.gradient_handler.BaseGradientHandler` 来实现其策略。目前,Colossal-AI 使用普通的数据并行梯度 handler 在数据并行的 rank 间 all-reduce 梯度。
`colossalai.legacy.engine.gradient_handler.BaseGradientHandler` 来实现其策略。目前,Colossal-AI 使用普通的数据并行梯度 handler 在数据并行的 rank 间 all-reduce 梯度。
如果数据并行被检测到,梯度 handler 会被自动添加进 engine。
你可以添加你自己的梯度 handler,如下所示:
```python
from colossalai.registry import GRADIENT_HANDLER
from colossalai.engine import BaseGradientHandler
from colossalai.legacy.registry import GRADIENT_HANDLER
from colossalai.legacy.engine import BaseGradientHandler
@GRADIENT_HANDLER.register_module
class YourGradientHandler(BaseGradientHandler):
......@@ -109,4 +109,5 @@ gradient_handlers = [
## Schedule
Schedule 包含了如何执行前向和后向计算。目前, Colossal-AI 提供了流水和非流水的 schedule。
如果你想修改前向和后向计算的执行方式,你可以继承 `colossalai.engine.schedule.BaseSchedule` 并实现 `forward_back_step` 函数。
如果你想修改前向和后向计算的执行方式,你可以继承 `colossalai.legacy.engine.schedule.BaseSchedule` 并实现 `forward_back_step` 函数。
<!-- doc-test-command: echo -->
......@@ -36,14 +36,14 @@ import torch
import torch.nn as nn
from colossalai import nn as col_nn
from colossalai.amp import AMP_TYPE
from colossalai.builder.pipeline import partition_uniform
from colossalai.legacy.builder.pipeline import partition_uniform
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.engine.schedule import (InterleavedPipelineSchedule,
from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
PipelineSchedule)
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
from colossalai.trainer import Trainer, hooks
from colossalai.legacy.trainer import Trainer, hooks
from colossalai.utils.timer import MultiTimer
from model_zoo.gpt import GPTLMLoss
from torch.nn import functional as F
......@@ -273,3 +273,4 @@ def train():
return_output_label=False,
)
```
<!-- doc-test-command: echo -->
......@@ -32,11 +32,11 @@ import colossalai
import colossalai.nn as col_nn
import torch
import torch.nn as nn
from colossalai.builder import build_pipeline_model
from colossalai.engine.schedule import (InterleavedPipelineSchedule,
from colossalai.legacy.builder import build_pipeline_model
from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
PipelineSchedule)
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.trainer import Trainer, hooks
from colossalai.legacy.trainer import Trainer, hooks
from colossalai.utils import MultiTimer, get_dataloader
from timm.models import vision_transformer as vit
from torchvision import transforms
......@@ -48,17 +48,17 @@ from torchvision.datasets import CIFAR10
总的来说, 我们提供3种方法来建立一个流水并行的模型:
1. `colossalai.builder.build_pipeline_model_from_cfg`
2. `colossalai.builder.build_pipeline_model`
1. `colossalai.legacy.builder.build_pipeline_model_from_cfg`
2. `colossalai.legacy.builder.build_pipeline_model`
3. 自己按阶段拆分模型
当你的内存能够容纳模型时,你可以使用前两种方法来建立你的模型,否则你必须自己分割模型。前两种方法首先在 CPU 上建立整个模型,然后分割模型,最后你可以直接把模型的相应部分移到 GPU 上。
`colossalai.builder.build_pipeline_model_from_cfg()` 接收一个模型的配置文件,它可以均匀地(按层)或平衡地(按参数大小)分割模型。
`colossalai.legacy.builder.build_pipeline_model_from_cfg()` 接收一个模型的配置文件,它可以均匀地(按层)或平衡地(按参数大小)分割模型。
如果你熟悉 `PyTorch`, 你可以使用 `colossalai.builder.build_pipeline_model()` 它接收一个 `torch.nn.Sequential` 模型并按层均匀分割。
如果你熟悉 `PyTorch`, 你可以使用 `colossalai.legacy.builder.build_pipeline_model()` 它接收一个 `torch.nn.Sequential` 模型并按层均匀分割。
在本教程中,我们将修改 [TIMM/ViT](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py) to `torch.nn.Sequential`,然后使用 `colossalai.builder.build_pipeline_model()` 来建立流水线模型。
在本教程中,我们将修改 [TIMM/ViT](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py) 为 `torch.nn.Sequential`,然后使用 `colossalai.legacy.builder.build_pipeline_model()` 来建立流水线模型。
当数据是 **一个** `Tensor`, 你可以使用你的模型 `forward()` 中的位置参数来获得数据张量。对于流水线的第一阶段,`forward()` 的第一个位置参数是从数据加载器加载的数据张量。对于其他阶段,`forward()` 的第一个位置参数是上一阶段的输出张量。注意,如果该阶段不是最后一个阶段,则 `forward()` 的返回必须是一个 `Tensor`
......@@ -244,3 +244,4 @@ def train():
hooks=hook_list,
display_progress=True)
```
<!-- doc-test-command: echo -->
......@@ -74,7 +74,7 @@ from colossalai.core import global_context as gpc
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn.lr_scheduler import LinearWarmupLR
from colossalai.nn.metric import Accuracy
from colossalai.trainer import Trainer, hooks
from colossalai.legacy.trainer import Trainer, hooks
```
- 其他模块
......@@ -256,8 +256,8 @@ SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE) ** 2 + 1 # add 1 for cls token
### 构建流水线模型 (`/hybrid_parallel/model/vit.py`)
Colossal-AI 提供了两种从现有模型构建流水线模型的方法。
- `colossalai.builder.build_pipeline_model_from_cfg`
- `colossalai.builder.build_pipeline_model`
- `colossalai.legacy.builder.build_pipeline_model_from_cfg`
- `colossalai.legacy.builder.build_pipeline_model`
此外,您还可以使用 Colossal-AI 从头开始构建流水线模型。
```python
......@@ -266,11 +266,11 @@ from typing import Callable
import inspect
import torch
from colossalai import nn as col_nn
from colossalai.registry import LAYERS, MODELS
from colossalai.legacy.registry import LAYERS, MODELS
from colossalai.logging import get_dist_logger
from colossalai.core import global_context as gpc
from colossalai.context import ParallelMode
from colossalai.builder.pipeline import partition_uniform
from colossalai.legacy.builder.pipeline import partition_uniform
from torch import dtype, nn
from model_zoo.vit.vit import ViTBlock, ViTEmbedding, ViTHead
@MODELS.register_module
......@@ -380,7 +380,7 @@ def build_pipeline_vit(num_layers, num_chunks, device=torch.device('cuda'), **kw
#### 导入模块
```python
from colossalai.engine.schedule import (InterleavedPipelineSchedule,
from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
PipelineSchedule)
from colossalai.utils import MultiTimer
import os
......@@ -589,3 +589,4 @@ torchrun --standalone --nproc_per_node <NUM_GPUs> train_hybrid.py --config ./co
# If your torch >= 1.9.0
# python -m torch.distributed.run --standalone --nproc_per_node=<NUM_GPUs> train_hybrid.py --config ./configs/config_hybrid_parallel.py
```
<!-- doc-test-command: echo -->
......@@ -61,7 +61,7 @@ Trainer 的参数 `schedule` 默认值是 `None` 。在大多数情况下,除
```python
from colossalai.logging import get_dist_logger
from colossalai.trainer import Trainer, hooks
from colossalai.legacy.trainer import Trainer, hooks
# build components and initialize with colossalai.initialize
...
......@@ -104,7 +104,7 @@ trainer.fit(
```python
from colossalai.logging import get_dist_logger
from colossalai.trainer import hooks
from colossalai.legacy.trainer import hooks
class LogMessageHook(hooks.BaseHook):
......@@ -341,7 +341,7 @@ for epoch in range(gpc.config.NUM_EPOCHS):
```python
from colossalai.nn.metric import Accuracy
from colossalai.trainer import Trainer, hooks
from colossalai.legacy.trainer import Trainer, hooks
# create a trainer object
......@@ -384,3 +384,4 @@ python -m torch.distributed.launch --nproc_per_node <num_gpus> --master_addr loc
# with trainer
python -m torch.distributed.launch --nproc_per_node <num_gpus> --master_addr localhost --master_port 29500 run_resnet_cifar10_with_trainer.py
```
<!-- doc-test-command: echo -->
......@@ -41,7 +41,7 @@ for epoch in range(num_epochs):
#### 用 trainer 保存
```python
from colossalai.trainer import Trainer, hooks
from colossalai.legacy.trainer import Trainer, hooks
model = ...
engine, _, _, _ = colossalai.initialize(model=model, ...)
trainer = Trainer(engine, ...)
......@@ -61,3 +61,4 @@ model = ...
load_checkpoint('xxx.pt', model)
... # train or test
```
<!-- doc-test-command: echo -->
......@@ -25,8 +25,8 @@
3. 实现 `handle_gradient`
```python
from colossalai.registry import GRADIENT_HANDLER
from colossalai.engine.gradient_handler import BaseGradientHandler
from colossalai.legacy.registry import GRADIENT_HANDLER
from colossalai.legacy.engine.gradient_handler import BaseGradientHandler
@GRADIENT_HANDLER.register_module
......@@ -57,3 +57,4 @@ gradient_handler = [dict(type='MyGradientHandler')]
```shell
python -m torch.distributed.launch --nproc_per_node 4 --master_addr localhost --master_port 29500 train_with_engine.py
```
<!-- doc-test-command: echo -->
......@@ -245,7 +245,7 @@ from pathlib import Path
from colossalai.core import global_context as gpc
from colossalai.logging import get_dist_logger
from colossalai.utils import get_dataloader
from colossalai.trainer import Trainer, hooks
from colossalai.legacy.trainer import Trainer, hooks
from colossalai.nn.lr_scheduler import LinearWarmupLR
from timm.models import vit_base_patch16_224
from torchvision import datasets, transforms
......
......@@ -78,7 +78,7 @@ import colossalai.nn as col_nn
from colossalai.core import global_context as gpc
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.trainer import Trainer, hooks
from colossalai.legacy.trainer import Trainer, hooks
from colossalai.utils import MultiTimer, get_dataloader
from colossalai.context import ParallelMode
from colossalai.pipeline.pipelinable import PipelinableContext
......@@ -156,3 +156,4 @@ trainer.fit(train_dataloader=train_dataloader,
```
我们使用 `2` 个流水段,并且 batch 将被切分为 `4` 个 micro batches。
<!-- doc-test-command: echo -->
......@@ -6,7 +6,7 @@ import torch
from torch.utils.data import Dataset
from transformers import GPT2Tokenizer
from colossalai.registry import DATASETS
from colossalai.legacy.registry import DATASETS
@DATASETS.register_module
......
......@@ -8,11 +8,11 @@ from torch.nn.parameter import Parameter
from colossalai.context import ParallelMode, seed
from colossalai.core import global_context as gpc
from colossalai.legacy.registry import LAYERS, LOSSES, MODELS
from colossalai.nn.layer.base_layer import ParallelLayer
from colossalai.nn.layer.parallel_1d._utils import gather_forward_split_backward, reduce_grad, reduce_input
from colossalai.nn.layer.parallel_1d.layers import Linear1D_Row
from colossalai.nn.layer.utils import divide
from colossalai.registry import LAYERS, LOSSES, MODELS
from colossalai.utils import get_current_device
......
......@@ -10,9 +10,9 @@ import colossalai
import colossalai.utils as utils
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.trainer import Trainer, hooks
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn import LinearWarmupLR
from colossalai.trainer import Trainer, hooks
from colossalai.utils import colo_set_process_memory_fraction, is_using_pp
from colossalai.utils.timer import MultiTimer
from colossalai.zero.legacy.init_ctx import ZeroInitContext
......
......@@ -3,17 +3,16 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# copied from fairseq/fairseq/data/indexed_dataset.py
# Removed IndexedRawTextDataset since it relied on Fairseq dictionary
# other slight modifications to remove fairseq dependencies
# Added document index to index file and made it accessible.
# An empty sentence no longer separates documents.
from functools import lru_cache
import os
import shutil
import struct
from functools import lru_cache
from itertools import accumulate
import numpy as np
......@@ -88,16 +87,7 @@ def write_longs(f, a):
f.write(np.array(a, dtype=np.int64))
dtypes = {
1: np.uint8,
2: np.int8,
3: np.int16,
4: np.int32,
5: np.int64,
6: np.float,
7: np.double,
8: np.uint16
}
dtypes = {1: np.uint8, 2: np.int8, 3: np.int16, 4: np.int32, 5: np.int64, 6: float, 7: np.double, 8: np.uint16}
def code(dtype):
......@@ -136,10 +126,8 @@ class IndexedDataset(torch.utils.data.Dataset):
def read_index(self, path):
with open(index_file_path(path), 'rb') as f:
magic = f.read(8)
assert magic == self._HDR_MAGIC, (
'Index file doesn\'t match expected format. '
'Make sure that --dataset-impl is configured properly.'
)
assert magic == self._HDR_MAGIC, ('Index file doesn\'t match expected format. '
'Make sure that --dataset-impl is configured properly.')
version = f.read(8)
assert struct.unpack('<Q', version) == (1,)
code, self.element_size = struct.unpack('<QQ', f.read(16))
......@@ -198,9 +186,7 @@ class IndexedDataset(torch.utils.data.Dataset):
@staticmethod
def exists(path):
return (
os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path))
)
return (os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path)))
@property
def supports_prefetch(self):
......@@ -233,7 +219,7 @@ class IndexedCachedDataset(IndexedDataset):
for i in indices:
self.cache_index[i] = ptx
size = self.data_offsets[i + 1] - self.data_offsets[i]
a = self.cache[ptx: ptx + size]
a = self.cache[ptx:ptx + size]
self.data_file.seek(self.data_offsets[i] * self.element_size)
self.data_file.readinto(a)
ptx += size
......@@ -250,7 +236,7 @@ class IndexedCachedDataset(IndexedDataset):
tensor_size = self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]]
a = np.empty(tensor_size, dtype=self.dtype)
ptx = self.cache_index[i]
np.copyto(a, self.cache[ptx: ptx + a.size])
np.copyto(a, self.cache[ptx:ptx + a.size])
return a
elif isinstance(idx, slice):
# Hack just to make this work, can optimize later if necessary
......@@ -261,15 +247,7 @@ class IndexedCachedDataset(IndexedDataset):
class IndexedDatasetBuilder(object):
element_sizes = {
np.uint8: 1,
np.int8: 1,
np.int16: 2,
np.int32: 4,
np.int64: 8,
np.float: 4,
np.double: 8
}
element_sizes = {np.uint8: 1, np.int8: 1, np.int16: 2, np.int32: 4, np.int64: 8, float: 4, np.double: 8}
def __init__(self, out_file, dtype=np.int32):
self.out_file = open(out_file, 'wb')
......@@ -332,12 +310,15 @@ def _warmup_mmap_file(path):
class MMapIndexedDataset(torch.utils.data.Dataset):
class Index(object):
_HDR_MAGIC = b'MMIDIDX\x00\x00'
@classmethod
def writer(cls, path, dtype):
class _Writer(object):
def __enter__(self):
self._file = open(path, 'wb')
......@@ -384,10 +365,8 @@ class MMapIndexedDataset(torch.utils.data.Dataset):
def __init__(self, path, skip_warmup=False):
with open(path, 'rb') as stream:
magic_test = stream.read(9)
assert self._HDR_MAGIC == magic_test, (
'Index file doesn\'t match expected format. '
'Make sure that --dataset-impl is configured properly.'
)
assert self._HDR_MAGIC == magic_test, ('Index file doesn\'t match expected format. '
'Make sure that --dataset-impl is configured properly.')
version = struct.unpack('<Q', stream.read(8))
assert (1,) == version
......@@ -406,16 +385,16 @@ class MMapIndexedDataset(torch.utils.data.Dataset):
self._bin_buffer_mmap = np.memmap(path, mode='r', order='C')
self._bin_buffer = memoryview(self._bin_buffer_mmap)
print(" reading sizes...")
self._sizes = np.frombuffer(
self._bin_buffer,
dtype=np.int32,
count=self._len,
offset=offset)
self._sizes = np.frombuffer(self._bin_buffer, dtype=np.int32, count=self._len, offset=offset)
print(" reading pointers...")
self._pointers = np.frombuffer(self._bin_buffer, dtype=np.int64, count=self._len,
self._pointers = np.frombuffer(self._bin_buffer,
dtype=np.int64,
count=self._len,
offset=offset + self._sizes.nbytes)
print(" reading document index...")
self._doc_idx = np.frombuffer(self._bin_buffer, dtype=np.int64, count=self._doc_count,
self._doc_idx = np.frombuffer(self._bin_buffer,
dtype=np.int64,
count=self._doc_count,
offset=offset + self._sizes.nbytes + self._pointers.nbytes)
def __del__(self):
......@@ -480,8 +459,7 @@ class MMapIndexedDataset(torch.utils.data.Dataset):
def __getitem__(self, idx):
if isinstance(idx, int):
ptr, size = self._index[idx]
np_array = np.frombuffer(self._bin_buffer, dtype=self._index.dtype,
count=size, offset=ptr)
np_array = np.frombuffer(self._bin_buffer, dtype=self._index.dtype, count=size, offset=ptr)
return np_array
elif isinstance(idx, slice):
start, stop, step = idx.indices(len(self))
......@@ -491,8 +469,7 @@ class MMapIndexedDataset(torch.utils.data.Dataset):
sizes = self._index._sizes[idx]
offsets = list(accumulate(sizes))
total_size = sum(sizes)
np_array = np.frombuffer(self._bin_buffer, dtype=self._index.dtype,
count=total_size, offset=ptr)
np_array = np.frombuffer(self._bin_buffer, dtype=self._index.dtype, count=total_size, offset=ptr)
sents = np.split(np_array, offsets[:-1])
return sents
......@@ -506,8 +483,7 @@ class MMapIndexedDataset(torch.utils.data.Dataset):
if length is None:
length = size - offset
ptr += offset * np.dtype(self._index.dtype).itemsize
np_array = np.frombuffer(self._bin_buffer, dtype=self._index.dtype,
count=length, offset=ptr)
np_array = np.frombuffer(self._bin_buffer, dtype=self._index.dtype, count=length, offset=ptr)
return np_array
@property
......@@ -530,12 +506,11 @@ class MMapIndexedDataset(torch.utils.data.Dataset):
@staticmethod
def exists(path):
return (
os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path))
)
return (os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path)))
class MMapIndexedDatasetBuilder(object):
def __init__(self, out_file, dtype=np.int64):
self._data_file = open(out_file, 'wb')
self._dtype = dtype
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment