Unverified Commit fae6c92e authored by Hongxin Liu's avatar Hongxin Liu Committed by GitHub
Browse files

Merge branch 'main' into feature/shardformer

parents bd186784 ac178ca5
...@@ -79,7 +79,7 @@ from colossalai.core import global_context as gpc ...@@ -79,7 +79,7 @@ from colossalai.core import global_context as gpc
from colossalai.logging import disable_existing_loggers, get_dist_logger from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn.lr_scheduler import LinearWarmupLR from colossalai.nn.lr_scheduler import LinearWarmupLR
from colossalai.nn.metric import Accuracy from colossalai.nn.metric import Accuracy
from colossalai.trainer import Trainer, hooks from colossalai.legacy.trainer import Trainer, hooks
``` ```
- Other modules - Other modules
...@@ -273,8 +273,8 @@ SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE) ** 2 + 1 # add 1 for cls token ...@@ -273,8 +273,8 @@ SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE) ** 2 + 1 # add 1 for cls token
### Build pipeline model (`/hybrid_parallel/model/vit.py`) ### Build pipeline model (`/hybrid_parallel/model/vit.py`)
Colossal-AI provides two methods to build a pipeline model from the existing model. Colossal-AI provides two methods to build a pipeline model from the existing model.
- `colossalai.builder.build_pipeline_model_from_cfg` - `colossalai.legacy.builder.build_pipeline_model_from_cfg`
- `colossalai.builder.build_pipeline_model` - `colossalai.legacy.builder.build_pipeline_model`
Besides, you can also build a pipeline model from scratch with Colossal-AI. Besides, you can also build a pipeline model from scratch with Colossal-AI.
```python ```python
...@@ -284,11 +284,11 @@ from typing import Callable ...@@ -284,11 +284,11 @@ from typing import Callable
import inspect import inspect
import torch import torch
from colossalai import nn as col_nn from colossalai import nn as col_nn
from colossalai.registry import LAYERS, MODELS from colossalai.legacy.registry import LAYERS, MODELS
from colossalai.logging import get_dist_logger from colossalai.logging import get_dist_logger
from colossalai.core import global_context as gpc from colossalai.core import global_context as gpc
from colossalai.context import ParallelMode from colossalai.context import ParallelMode
from colossalai.builder.pipeline import partition_uniform from colossalai.legacy.builder.pipeline import partition_uniform
from torch import dtype, nn from torch import dtype, nn
from model_zoo.vit.vit import ViTBlock, ViTEmbedding, ViTHead from model_zoo.vit.vit import ViTBlock, ViTEmbedding, ViTHead
...@@ -415,7 +415,7 @@ def build_pipeline_vit(num_layers, num_chunks, device=torch.device('cuda'), **kw ...@@ -415,7 +415,7 @@ def build_pipeline_vit(num_layers, num_chunks, device=torch.device('cuda'), **kw
#### Import modules #### Import modules
```python ```python
from colossalai.engine.schedule import (InterleavedPipelineSchedule, from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
PipelineSchedule) PipelineSchedule)
from colossalai.utils import MultiTimer from colossalai.utils import MultiTimer
import os import os
...@@ -644,3 +644,4 @@ torchrun --standalone --nproc_per_node <NUM_GPUs> train_hybrid.py --config ./co ...@@ -644,3 +644,4 @@ torchrun --standalone --nproc_per_node <NUM_GPUs> train_hybrid.py --config ./co
# If your torch >= 1.9.0 # If your torch >= 1.9.0
# python -m torch.distributed.run --standalone --nproc_per_node=<NUM_GPUs> train_hybrid.py --config ./configs/config_hybrid_parallel.py # python -m torch.distributed.run --standalone --nproc_per_node=<NUM_GPUs> train_hybrid.py --config ./configs/config_hybrid_parallel.py
``` ```
<!-- doc-test-command: echo -->
...@@ -64,7 +64,7 @@ Trainer is a more high-level wrapper for the user to execute training with fewer ...@@ -64,7 +64,7 @@ Trainer is a more high-level wrapper for the user to execute training with fewer
```python ```python
from colossalai.logging import get_dist_logger from colossalai.logging import get_dist_logger
from colossalai.trainer import Trainer, hooks from colossalai.legacy.trainer import Trainer, hooks
# build components and initialize with colossalai.initialize # build components and initialize with colossalai.initialize
... ...
...@@ -107,7 +107,7 @@ If you want to customize your own hook class, you can inherit `hooks.BaseHook` a ...@@ -107,7 +107,7 @@ If you want to customize your own hook class, you can inherit `hooks.BaseHook` a
```python ```python
from colossalai.logging import get_dist_logger from colossalai.logging import get_dist_logger
from colossalai.trainer import hooks from colossalai.legacy.trainer import hooks
class LogMessageHook(hooks.BaseHook): class LogMessageHook(hooks.BaseHook):
...@@ -345,7 +345,7 @@ If you wish to train with a trainer object, you can follow the code snippet belo ...@@ -345,7 +345,7 @@ If you wish to train with a trainer object, you can follow the code snippet belo
```python ```python
from colossalai.nn.metric import Accuracy from colossalai.nn.metric import Accuracy
from colossalai.trainer import Trainer, hooks from colossalai.legacy.trainer import Trainer, hooks
# create a trainer object # create a trainer object
...@@ -387,3 +387,4 @@ python -m torch.distributed.launch --nproc_per_node <num_gpus> --master_addr loc ...@@ -387,3 +387,4 @@ python -m torch.distributed.launch --nproc_per_node <num_gpus> --master_addr loc
# with trainer # with trainer
python -m torch.distributed.launch --nproc_per_node <num_gpus> --master_addr localhost --master_port 29500 run_resnet_cifar10_with_trainer.py python -m torch.distributed.launch --nproc_per_node <num_gpus> --master_addr localhost --master_port 29500 run_resnet_cifar10_with_trainer.py
``` ```
<!-- doc-test-command: echo -->
...@@ -41,7 +41,7 @@ for epoch in range(num_epochs): ...@@ -41,7 +41,7 @@ for epoch in range(num_epochs):
#### Save when using trainer #### Save when using trainer
```python ```python
from colossalai.trainer import Trainer, hooks from colossalai.legacy.trainer import Trainer, hooks
model = ... model = ...
engine, _, _, _ = colossalai.initialize(model=model, ...) engine, _, _, _ = colossalai.initialize(model=model, ...)
trainer = Trainer(engine, ...) trainer = Trainer(engine, ...)
...@@ -61,3 +61,4 @@ model = ... ...@@ -61,3 +61,4 @@ model = ...
load_checkpoint('xxx.pt', model) load_checkpoint('xxx.pt', model)
... # train or test ... # train or test
``` ```
<!-- doc-test-command: echo -->
...@@ -28,8 +28,8 @@ To implement a customized gradient handler, you need to follow these steps. ...@@ -28,8 +28,8 @@ To implement a customized gradient handler, you need to follow these steps.
3. implement `handle_gradient` method. 3. implement `handle_gradient` method.
```python ```python
from colossalai.registry import GRADIENT_HANDLER from colossalai.legacy.registry import GRADIENT_HANDLER
from colossalai.engine.gradient_handler import BaseGradientHandler from colossalai.legacy.engine.gradient_handler import BaseGradientHandler
@GRADIENT_HANDLER.register_module @GRADIENT_HANDLER.register_module
...@@ -61,3 +61,4 @@ to demonstrate the use of gradient handler. In this example, we used `DataParall ...@@ -61,3 +61,4 @@ to demonstrate the use of gradient handler. In this example, we used `DataParall
```shell ```shell
python -m torch.distributed.launch --nproc_per_node 4 --master_addr localhost --master_port 29500 train_with_engine.py python -m torch.distributed.launch --nproc_per_node 4 --master_addr localhost --master_port 29500 train_with_engine.py
``` ```
<!-- doc-test-command: echo -->
...@@ -267,7 +267,7 @@ from pathlib import Path ...@@ -267,7 +267,7 @@ from pathlib import Path
from colossalai.core import global_context as gpc from colossalai.core import global_context as gpc
from colossalai.logging import get_dist_logger from colossalai.logging import get_dist_logger
from colossalai.utils import get_dataloader from colossalai.utils import get_dataloader
from colossalai.trainer import Trainer, hooks from colossalai.legacy.trainer import Trainer, hooks
from colossalai.nn.lr_scheduler import LinearWarmupLR from colossalai.nn.lr_scheduler import LinearWarmupLR
from timm.models import vit_base_patch16_224 from timm.models import vit_base_patch16_224
from torchvision import datasets, transforms from torchvision import datasets, transforms
......
...@@ -79,7 +79,7 @@ import colossalai.nn as col_nn ...@@ -79,7 +79,7 @@ import colossalai.nn as col_nn
from colossalai.core import global_context as gpc from colossalai.core import global_context as gpc
from colossalai.logging import disable_existing_loggers, get_dist_logger from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.trainer import Trainer, hooks from colossalai.legacy.trainer import Trainer, hooks
from colossalai.utils import MultiTimer, get_dataloader from colossalai.utils import MultiTimer, get_dataloader
from colossalai.context import ParallelMode from colossalai.context import ParallelMode
from colossalai.pipeline.pipelinable import PipelinableContext from colossalai.pipeline.pipelinable import PipelinableContext
...@@ -157,3 +157,4 @@ trainer.fit(train_dataloader=train_dataloader, ...@@ -157,3 +157,4 @@ trainer.fit(train_dataloader=train_dataloader,
``` ```
We use `2` pipeline stages and the batch will be split into `4` micro batches. We use `2` pipeline stages and the batch will be split into `4` micro batches.
<!-- doc-test-command: echo -->
...@@ -81,14 +81,14 @@ Colossal-AI 为用户提供了一个全局 context,使他们能够轻松地管 ...@@ -81,14 +81,14 @@ Colossal-AI 为用户提供了一个全局 context,使他们能够轻松地管
## 梯度 Handler ## 梯度 Handler
梯度 handler 是对参数的梯度执行 all-reduce 操作的对象。由于不同的 all-reduce 策略或许在不同的并行中被执行,用户可以继承 梯度 handler 是对参数的梯度执行 all-reduce 操作的对象。由于不同的 all-reduce 策略或许在不同的并行中被执行,用户可以继承
`colossalai.engine.gradient_handler.BaseGradientHandler` 来实现其策略。目前,Colossal-AI 使用普通的数据并行梯度 handler 在数据并行的 rank 间 all-reduce 梯度。 `colossalai.legacy.engine.gradient_handler.BaseGradientHandler` 来实现其策略。目前,Colossal-AI 使用普通的数据并行梯度 handler 在数据并行的 rank 间 all-reduce 梯度。
如果数据并行被检测到,梯度 handler 会被自动添加进 engine。 如果数据并行被检测到,梯度 handler 会被自动添加进 engine。
你可以添加你自己的梯度 handler,如下所示: 你可以添加你自己的梯度 handler,如下所示:
```python ```python
from colossalai.registry import GRADIENT_HANDLER from colossalai.legacy.registry import GRADIENT_HANDLER
from colossalai.engine import BaseGradientHandler from colossalai.legacy.engine import BaseGradientHandler
@GRADIENT_HANDLER.register_module @GRADIENT_HANDLER.register_module
class YourGradientHandler(BaseGradientHandler): class YourGradientHandler(BaseGradientHandler):
...@@ -109,4 +109,5 @@ gradient_handlers = [ ...@@ -109,4 +109,5 @@ gradient_handlers = [
## Schedule ## Schedule
Schedule 包含了如何执行前向和后向计算。目前, Colossal-AI 提供了流水和非流水的 schedule。 Schedule 包含了如何执行前向和后向计算。目前, Colossal-AI 提供了流水和非流水的 schedule。
如果你想修改前向和后向计算的执行方式,你可以继承 `colossalai.engine.schedule.BaseSchedule` 并实现 `forward_back_step` 函数。 如果你想修改前向和后向计算的执行方式,你可以继承 `colossalai.legacy.engine.schedule.BaseSchedule` 并实现 `forward_back_step` 函数。
<!-- doc-test-command: echo -->
...@@ -36,14 +36,14 @@ import torch ...@@ -36,14 +36,14 @@ import torch
import torch.nn as nn import torch.nn as nn
from colossalai import nn as col_nn from colossalai import nn as col_nn
from colossalai.amp import AMP_TYPE from colossalai.amp import AMP_TYPE
from colossalai.builder.pipeline import partition_uniform from colossalai.legacy.builder.pipeline import partition_uniform
from colossalai.context.parallel_mode import ParallelMode from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc from colossalai.core import global_context as gpc
from colossalai.engine.schedule import (InterleavedPipelineSchedule, from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
PipelineSchedule) PipelineSchedule)
from colossalai.logging import disable_existing_loggers, get_dist_logger from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
from colossalai.trainer import Trainer, hooks from colossalai.legacy.trainer import Trainer, hooks
from colossalai.utils.timer import MultiTimer from colossalai.utils.timer import MultiTimer
from model_zoo.gpt import GPTLMLoss from model_zoo.gpt import GPTLMLoss
from torch.nn import functional as F from torch.nn import functional as F
...@@ -273,3 +273,4 @@ def train(): ...@@ -273,3 +273,4 @@ def train():
return_output_label=False, return_output_label=False,
) )
``` ```
<!-- doc-test-command: echo -->
...@@ -32,11 +32,11 @@ import colossalai ...@@ -32,11 +32,11 @@ import colossalai
import colossalai.nn as col_nn import colossalai.nn as col_nn
import torch import torch
import torch.nn as nn import torch.nn as nn
from colossalai.builder import build_pipeline_model from colossalai.legacy.builder import build_pipeline_model
from colossalai.engine.schedule import (InterleavedPipelineSchedule, from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
PipelineSchedule) PipelineSchedule)
from colossalai.logging import disable_existing_loggers, get_dist_logger from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.trainer import Trainer, hooks from colossalai.legacy.trainer import Trainer, hooks
from colossalai.utils import MultiTimer, get_dataloader from colossalai.utils import MultiTimer, get_dataloader
from timm.models import vision_transformer as vit from timm.models import vision_transformer as vit
from torchvision import transforms from torchvision import transforms
...@@ -48,17 +48,17 @@ from torchvision.datasets import CIFAR10 ...@@ -48,17 +48,17 @@ from torchvision.datasets import CIFAR10
总的来说, 我们提供3种方法来建立一个流水并行的模型: 总的来说, 我们提供3种方法来建立一个流水并行的模型:
1. `colossalai.builder.build_pipeline_model_from_cfg` 1. `colossalai.legacy.builder.build_pipeline_model_from_cfg`
2. `colossalai.builder.build_pipeline_model` 2. `colossalai.legacy.builder.build_pipeline_model`
3. 自己按阶段拆分模型 3. 自己按阶段拆分模型
当你的内存能够容纳模型时,你可以使用前两种方法来建立你的模型,否则你必须自己分割模型。前两种方法首先在 CPU 上建立整个模型,然后分割模型,最后你可以直接把模型的相应部分移到 GPU 上。 当你的内存能够容纳模型时,你可以使用前两种方法来建立你的模型,否则你必须自己分割模型。前两种方法首先在 CPU 上建立整个模型,然后分割模型,最后你可以直接把模型的相应部分移到 GPU 上。
`colossalai.builder.build_pipeline_model_from_cfg()` 接收一个模型的配置文件,它可以均匀地(按层)或平衡地(按参数大小)分割模型。 `colossalai.legacy.builder.build_pipeline_model_from_cfg()` 接收一个模型的配置文件,它可以均匀地(按层)或平衡地(按参数大小)分割模型。
如果你熟悉 `PyTorch`, 你可以使用 `colossalai.builder.build_pipeline_model()` 它接收一个 `torch.nn.Sequential` 模型并按层均匀分割。 如果你熟悉 `PyTorch`, 你可以使用 `colossalai.legacy.builder.build_pipeline_model()` 它接收一个 `torch.nn.Sequential` 模型并按层均匀分割。
在本教程中,我们将把 [TIMM/ViT](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py) 修改为 `torch.nn.Sequential`,然后使用 `colossalai.builder.build_pipeline_model()` 来建立流水线模型。 在本教程中,我们将把 [TIMM/ViT](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py) 修改为 `torch.nn.Sequential`,然后使用 `colossalai.legacy.builder.build_pipeline_model()` 来建立流水线模型。
当数据是 **一个** `Tensor`, 你可以使用你的模型 `forward()` 中的位置参数来获得数据张量。对于流水线的第一阶段,`forward()` 的第一个位置参数是从数据加载器加载的数据张量。对于其他阶段,`forward()` 的第一个位置参数是上一阶段的输出张量。注意,如果该阶段不是最后一个阶段,则 `forward()` 的返回必须是一个 `Tensor`。 当数据是 **一个** `Tensor`, 你可以使用你的模型 `forward()` 中的位置参数来获得数据张量。对于流水线的第一阶段,`forward()` 的第一个位置参数是从数据加载器加载的数据张量。对于其他阶段,`forward()` 的第一个位置参数是上一阶段的输出张量。注意,如果该阶段不是最后一个阶段,则 `forward()` 的返回必须是一个 `Tensor`。
...@@ -244,3 +244,4 @@ def train(): ...@@ -244,3 +244,4 @@ def train():
hooks=hook_list, hooks=hook_list,
display_progress=True) display_progress=True)
``` ```
<!-- doc-test-command: echo -->
...@@ -74,7 +74,7 @@ from colossalai.core import global_context as gpc ...@@ -74,7 +74,7 @@ from colossalai.core import global_context as gpc
from colossalai.logging import disable_existing_loggers, get_dist_logger from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn.lr_scheduler import LinearWarmupLR from colossalai.nn.lr_scheduler import LinearWarmupLR
from colossalai.nn.metric import Accuracy from colossalai.nn.metric import Accuracy
from colossalai.trainer import Trainer, hooks from colossalai.legacy.trainer import Trainer, hooks
``` ```
- 其他模块 - 其他模块
...@@ -256,8 +256,8 @@ SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE) ** 2 + 1 # add 1 for cls token ...@@ -256,8 +256,8 @@ SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE) ** 2 + 1 # add 1 for cls token
### 构建流水线模型 (`/hybrid_parallel/model/vit.py`) ### 构建流水线模型 (`/hybrid_parallel/model/vit.py`)
Colossal-AI 提供了两种从现有模型构建流水线模型的方法。 Colossal-AI 提供了两种从现有模型构建流水线模型的方法。
- `colossalai.builder.build_pipeline_model_from_cfg` - `colossalai.legacy.builder.build_pipeline_model_from_cfg`
- `colossalai.builder.build_pipeline_model` - `colossalai.legacy.builder.build_pipeline_model`
此外,您还可以使用 Colossal-AI 从头开始构建流水线模型。 此外,您还可以使用 Colossal-AI 从头开始构建流水线模型。
```python ```python
...@@ -266,11 +266,11 @@ from typing import Callable ...@@ -266,11 +266,11 @@ from typing import Callable
import inspect import inspect
import torch import torch
from colossalai import nn as col_nn from colossalai import nn as col_nn
from colossalai.registry import LAYERS, MODELS from colossalai.legacy.registry import LAYERS, MODELS
from colossalai.logging import get_dist_logger from colossalai.logging import get_dist_logger
from colossalai.core import global_context as gpc from colossalai.core import global_context as gpc
from colossalai.context import ParallelMode from colossalai.context import ParallelMode
from colossalai.builder.pipeline import partition_uniform from colossalai.legacy.builder.pipeline import partition_uniform
from torch import dtype, nn from torch import dtype, nn
from model_zoo.vit.vit import ViTBlock, ViTEmbedding, ViTHead from model_zoo.vit.vit import ViTBlock, ViTEmbedding, ViTHead
@MODELS.register_module @MODELS.register_module
...@@ -380,7 +380,7 @@ def build_pipeline_vit(num_layers, num_chunks, device=torch.device('cuda'), **kw ...@@ -380,7 +380,7 @@ def build_pipeline_vit(num_layers, num_chunks, device=torch.device('cuda'), **kw
#### 导入模块 #### 导入模块
```python ```python
from colossalai.engine.schedule import (InterleavedPipelineSchedule, from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
PipelineSchedule) PipelineSchedule)
from colossalai.utils import MultiTimer from colossalai.utils import MultiTimer
import os import os
...@@ -589,3 +589,4 @@ torchrun --standalone --nproc_per_node <NUM_GPUs> train_hybrid.py --config ./co ...@@ -589,3 +589,4 @@ torchrun --standalone --nproc_per_node <NUM_GPUs> train_hybrid.py --config ./co
# If your torch >= 1.9.0 # If your torch >= 1.9.0
# python -m torch.distributed.run --standalone --nproc_per_node=<NUM_GPUs> train_hybrid.py --config ./configs/config_hybrid_parallel.py # python -m torch.distributed.run --standalone --nproc_per_node=<NUM_GPUs> train_hybrid.py --config ./configs/config_hybrid_parallel.py
``` ```
<!-- doc-test-command: echo -->
...@@ -61,7 +61,7 @@ Trainer 的参数 `schedule` 默认值是 `None` 。在大多数情况下,除 ...@@ -61,7 +61,7 @@ Trainer 的参数 `schedule` 默认值是 `None` 。在大多数情况下,除
```python ```python
from colossalai.logging import get_dist_logger from colossalai.logging import get_dist_logger
from colossalai.trainer import Trainer, hooks from colossalai.legacy.trainer import Trainer, hooks
# build components and initialize with colossalai.initialize # build components and initialize with colossalai.initialize
... ...
...@@ -104,7 +104,7 @@ trainer.fit( ...@@ -104,7 +104,7 @@ trainer.fit(
```python ```python
from colossalai.logging import get_dist_logger from colossalai.logging import get_dist_logger
from colossalai.trainer import hooks from colossalai.legacy.trainer import hooks
class LogMessageHook(hooks.BaseHook): class LogMessageHook(hooks.BaseHook):
...@@ -341,7 +341,7 @@ for epoch in range(gpc.config.NUM_EPOCHS): ...@@ -341,7 +341,7 @@ for epoch in range(gpc.config.NUM_EPOCHS):
```python ```python
from colossalai.nn.metric import Accuracy from colossalai.nn.metric import Accuracy
from colossalai.trainer import Trainer, hooks from colossalai.legacy.trainer import Trainer, hooks
# create a trainer object # create a trainer object
...@@ -384,3 +384,4 @@ python -m torch.distributed.launch --nproc_per_node <num_gpus> --master_addr loc ...@@ -384,3 +384,4 @@ python -m torch.distributed.launch --nproc_per_node <num_gpus> --master_addr loc
# with trainer # with trainer
python -m torch.distributed.launch --nproc_per_node <num_gpus> --master_addr localhost --master_port 29500 run_resnet_cifar10_with_trainer.py python -m torch.distributed.launch --nproc_per_node <num_gpus> --master_addr localhost --master_port 29500 run_resnet_cifar10_with_trainer.py
``` ```
<!-- doc-test-command: echo -->
...@@ -41,7 +41,7 @@ for epoch in range(num_epochs): ...@@ -41,7 +41,7 @@ for epoch in range(num_epochs):
#### 用 trainer 保存 #### 用 trainer 保存
```python ```python
from colossalai.trainer import Trainer, hooks from colossalai.legacy.trainer import Trainer, hooks
model = ... model = ...
engine, _, _, _ = colossalai.initialize(model=model, ...) engine, _, _, _ = colossalai.initialize(model=model, ...)
trainer = Trainer(engine, ...) trainer = Trainer(engine, ...)
...@@ -61,3 +61,4 @@ model = ... ...@@ -61,3 +61,4 @@ model = ...
load_checkpoint('xxx.pt', model) load_checkpoint('xxx.pt', model)
... # train or test ... # train or test
``` ```
<!-- doc-test-command: echo -->
...@@ -25,8 +25,8 @@ ...@@ -25,8 +25,8 @@
3. 实现 `handle_gradient` 3. 实现 `handle_gradient`
```python ```python
from colossalai.registry import GRADIENT_HANDLER from colossalai.legacy.registry import GRADIENT_HANDLER
from colossalai.engine.gradient_handler import BaseGradientHandler from colossalai.legacy.engine.gradient_handler import BaseGradientHandler
@GRADIENT_HANDLER.register_module @GRADIENT_HANDLER.register_module
...@@ -57,3 +57,4 @@ gradient_handler = [dict(type='MyGradientHandler')] ...@@ -57,3 +57,4 @@ gradient_handler = [dict(type='MyGradientHandler')]
```shell ```shell
python -m torch.distributed.launch --nproc_per_node 4 --master_addr localhost --master_port 29500 train_with_engine.py python -m torch.distributed.launch --nproc_per_node 4 --master_addr localhost --master_port 29500 train_with_engine.py
``` ```
<!-- doc-test-command: echo -->
...@@ -245,7 +245,7 @@ from pathlib import Path ...@@ -245,7 +245,7 @@ from pathlib import Path
from colossalai.core import global_context as gpc from colossalai.core import global_context as gpc
from colossalai.logging import get_dist_logger from colossalai.logging import get_dist_logger
from colossalai.utils import get_dataloader from colossalai.utils import get_dataloader
from colossalai.trainer import Trainer, hooks from colossalai.legacy.trainer import Trainer, hooks
from colossalai.nn.lr_scheduler import LinearWarmupLR from colossalai.nn.lr_scheduler import LinearWarmupLR
from timm.models import vit_base_patch16_224 from timm.models import vit_base_patch16_224
from torchvision import datasets, transforms from torchvision import datasets, transforms
......
...@@ -78,7 +78,7 @@ import colossalai.nn as col_nn ...@@ -78,7 +78,7 @@ import colossalai.nn as col_nn
from colossalai.core import global_context as gpc from colossalai.core import global_context as gpc
from colossalai.logging import disable_existing_loggers, get_dist_logger from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.trainer import Trainer, hooks from colossalai.legacy.trainer import Trainer, hooks
from colossalai.utils import MultiTimer, get_dataloader from colossalai.utils import MultiTimer, get_dataloader
from colossalai.context import ParallelMode from colossalai.context import ParallelMode
from colossalai.pipeline.pipelinable import PipelinableContext from colossalai.pipeline.pipelinable import PipelinableContext
...@@ -156,3 +156,4 @@ trainer.fit(train_dataloader=train_dataloader, ...@@ -156,3 +156,4 @@ trainer.fit(train_dataloader=train_dataloader,
``` ```
我们使用 `2` 个流水段,并且 batch 将被切分为 `4` 个 micro batches。 我们使用 `2` 个流水段,并且 batch 将被切分为 `4` 个 micro batches。
<!-- doc-test-command: echo -->
...@@ -6,7 +6,7 @@ import torch ...@@ -6,7 +6,7 @@ import torch
from torch.utils.data import Dataset from torch.utils.data import Dataset
from transformers import GPT2Tokenizer from transformers import GPT2Tokenizer
from colossalai.registry import DATASETS from colossalai.legacy.registry import DATASETS
@DATASETS.register_module @DATASETS.register_module
......
...@@ -8,11 +8,11 @@ from torch.nn.parameter import Parameter ...@@ -8,11 +8,11 @@ from torch.nn.parameter import Parameter
from colossalai.context import ParallelMode, seed from colossalai.context import ParallelMode, seed
from colossalai.core import global_context as gpc from colossalai.core import global_context as gpc
from colossalai.legacy.registry import LAYERS, LOSSES, MODELS
from colossalai.nn.layer.base_layer import ParallelLayer from colossalai.nn.layer.base_layer import ParallelLayer
from colossalai.nn.layer.parallel_1d._utils import gather_forward_split_backward, reduce_grad, reduce_input from colossalai.nn.layer.parallel_1d._utils import gather_forward_split_backward, reduce_grad, reduce_input
from colossalai.nn.layer.parallel_1d.layers import Linear1D_Row from colossalai.nn.layer.parallel_1d.layers import Linear1D_Row
from colossalai.nn.layer.utils import divide from colossalai.nn.layer.utils import divide
from colossalai.registry import LAYERS, LOSSES, MODELS
from colossalai.utils import get_current_device from colossalai.utils import get_current_device
......
...@@ -10,9 +10,9 @@ import colossalai ...@@ -10,9 +10,9 @@ import colossalai
import colossalai.utils as utils import colossalai.utils as utils
from colossalai.context.parallel_mode import ParallelMode from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc from colossalai.core import global_context as gpc
from colossalai.legacy.trainer import Trainer, hooks
from colossalai.logging import disable_existing_loggers, get_dist_logger from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn import LinearWarmupLR from colossalai.nn import LinearWarmupLR
from colossalai.trainer import Trainer, hooks
from colossalai.utils import colo_set_process_memory_fraction, is_using_pp from colossalai.utils import colo_set_process_memory_fraction, is_using_pp
from colossalai.utils.timer import MultiTimer from colossalai.utils.timer import MultiTimer
from colossalai.zero.legacy.init_ctx import ZeroInitContext from colossalai.zero.legacy.init_ctx import ZeroInitContext
......
...@@ -3,17 +3,16 @@ ...@@ -3,17 +3,16 @@
# This source code is licensed under the MIT license found in the # This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree. # LICENSE file in the root directory of this source tree.
# copied from fairseq/fairseq/data/indexed_dataset.py # copied from fairseq/fairseq/data/indexed_dataset.py
# Removed IndexedRawTextDataset since it relied on Fairseq dictionary # Removed IndexedRawTextDataset since it relied on Fairseq dictionary
# other slight modifications to remove fairseq dependencies # other slight modifications to remove fairseq dependencies
# Added document index to index file and made it accessible. # Added document index to index file and made it accessible.
# An empty sentence no longer separates documents. # An empty sentence no longer separates documents.
from functools import lru_cache
import os import os
import shutil import shutil
import struct import struct
from functools import lru_cache
from itertools import accumulate from itertools import accumulate
import numpy as np import numpy as np
...@@ -88,16 +87,7 @@ def write_longs(f, a): ...@@ -88,16 +87,7 @@ def write_longs(f, a):
f.write(np.array(a, dtype=np.int64)) f.write(np.array(a, dtype=np.int64))
dtypes = { dtypes = {1: np.uint8, 2: np.int8, 3: np.int16, 4: np.int32, 5: np.int64, 6: float, 7: np.double, 8: np.uint16}
1: np.uint8,
2: np.int8,
3: np.int16,
4: np.int32,
5: np.int64,
6: np.float,
7: np.double,
8: np.uint16
}
def code(dtype): def code(dtype):
...@@ -136,10 +126,8 @@ class IndexedDataset(torch.utils.data.Dataset): ...@@ -136,10 +126,8 @@ class IndexedDataset(torch.utils.data.Dataset):
def read_index(self, path): def read_index(self, path):
with open(index_file_path(path), 'rb') as f: with open(index_file_path(path), 'rb') as f:
magic = f.read(8) magic = f.read(8)
assert magic == self._HDR_MAGIC, ( assert magic == self._HDR_MAGIC, ('Index file doesn\'t match expected format. '
'Index file doesn\'t match expected format. ' 'Make sure that --dataset-impl is configured properly.')
'Make sure that --dataset-impl is configured properly.'
)
version = f.read(8) version = f.read(8)
assert struct.unpack('<Q', version) == (1,) assert struct.unpack('<Q', version) == (1,)
code, self.element_size = struct.unpack('<QQ', f.read(16)) code, self.element_size = struct.unpack('<QQ', f.read(16))
...@@ -198,13 +186,11 @@ class IndexedDataset(torch.utils.data.Dataset): ...@@ -198,13 +186,11 @@ class IndexedDataset(torch.utils.data.Dataset):
@staticmethod @staticmethod
def exists(path): def exists(path):
return ( return (os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path)))
os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path))
)
@property @property
def supports_prefetch(self): def supports_prefetch(self):
return False # avoid prefetching to save memory return False # avoid prefetching to save memory
class IndexedCachedDataset(IndexedDataset): class IndexedCachedDataset(IndexedDataset):
...@@ -233,7 +219,7 @@ class IndexedCachedDataset(IndexedDataset): ...@@ -233,7 +219,7 @@ class IndexedCachedDataset(IndexedDataset):
for i in indices: for i in indices:
self.cache_index[i] = ptx self.cache_index[i] = ptx
size = self.data_offsets[i + 1] - self.data_offsets[i] size = self.data_offsets[i + 1] - self.data_offsets[i]
a = self.cache[ptx: ptx + size] a = self.cache[ptx:ptx + size]
self.data_file.seek(self.data_offsets[i] * self.element_size) self.data_file.seek(self.data_offsets[i] * self.element_size)
self.data_file.readinto(a) self.data_file.readinto(a)
ptx += size ptx += size
...@@ -250,7 +236,7 @@ class IndexedCachedDataset(IndexedDataset): ...@@ -250,7 +236,7 @@ class IndexedCachedDataset(IndexedDataset):
tensor_size = self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]] tensor_size = self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]]
a = np.empty(tensor_size, dtype=self.dtype) a = np.empty(tensor_size, dtype=self.dtype)
ptx = self.cache_index[i] ptx = self.cache_index[i]
np.copyto(a, self.cache[ptx: ptx + a.size]) np.copyto(a, self.cache[ptx:ptx + a.size])
return a return a
elif isinstance(idx, slice): elif isinstance(idx, slice):
# Hack just to make this work, can optimize later if necessary # Hack just to make this work, can optimize later if necessary
...@@ -261,15 +247,7 @@ class IndexedCachedDataset(IndexedDataset): ...@@ -261,15 +247,7 @@ class IndexedCachedDataset(IndexedDataset):
class IndexedDatasetBuilder(object): class IndexedDatasetBuilder(object):
element_sizes = { element_sizes = {np.uint8: 1, np.int8: 1, np.int16: 2, np.int32: 4, np.int64: 8, float: 4, np.double: 8}
np.uint8: 1,
np.int8: 1,
np.int16: 2,
np.int32: 4,
np.int64: 8,
np.float: 4,
np.double: 8
}
def __init__(self, out_file, dtype=np.int32): def __init__(self, out_file, dtype=np.int32):
self.out_file = open(out_file, 'wb') self.out_file = open(out_file, 'wb')
...@@ -332,12 +310,15 @@ def _warmup_mmap_file(path): ...@@ -332,12 +310,15 @@ def _warmup_mmap_file(path):
class MMapIndexedDataset(torch.utils.data.Dataset): class MMapIndexedDataset(torch.utils.data.Dataset):
class Index(object): class Index(object):
_HDR_MAGIC = b'MMIDIDX\x00\x00' _HDR_MAGIC = b'MMIDIDX\x00\x00'
@classmethod @classmethod
def writer(cls, path, dtype): def writer(cls, path, dtype):
class _Writer(object): class _Writer(object):
def __enter__(self): def __enter__(self):
self._file = open(path, 'wb') self._file = open(path, 'wb')
...@@ -384,10 +365,8 @@ class MMapIndexedDataset(torch.utils.data.Dataset): ...@@ -384,10 +365,8 @@ class MMapIndexedDataset(torch.utils.data.Dataset):
def __init__(self, path, skip_warmup=False): def __init__(self, path, skip_warmup=False):
with open(path, 'rb') as stream: with open(path, 'rb') as stream:
magic_test = stream.read(9) magic_test = stream.read(9)
assert self._HDR_MAGIC == magic_test, ( assert self._HDR_MAGIC == magic_test, ('Index file doesn\'t match expected format. '
'Index file doesn\'t match expected format. ' 'Make sure that --dataset-impl is configured properly.')
'Make sure that --dataset-impl is configured properly.'
)
version = struct.unpack('<Q', stream.read(8)) version = struct.unpack('<Q', stream.read(8))
assert (1,) == version assert (1,) == version
...@@ -406,16 +385,16 @@ class MMapIndexedDataset(torch.utils.data.Dataset): ...@@ -406,16 +385,16 @@ class MMapIndexedDataset(torch.utils.data.Dataset):
self._bin_buffer_mmap = np.memmap(path, mode='r', order='C') self._bin_buffer_mmap = np.memmap(path, mode='r', order='C')
self._bin_buffer = memoryview(self._bin_buffer_mmap) self._bin_buffer = memoryview(self._bin_buffer_mmap)
print(" reading sizes...") print(" reading sizes...")
self._sizes = np.frombuffer( self._sizes = np.frombuffer(self._bin_buffer, dtype=np.int32, count=self._len, offset=offset)
self._bin_buffer,
dtype=np.int32,
count=self._len,
offset=offset)
print(" reading pointers...") print(" reading pointers...")
self._pointers = np.frombuffer(self._bin_buffer, dtype=np.int64, count=self._len, self._pointers = np.frombuffer(self._bin_buffer,
dtype=np.int64,
count=self._len,
offset=offset + self._sizes.nbytes) offset=offset + self._sizes.nbytes)
print(" reading document index...") print(" reading document index...")
self._doc_idx = np.frombuffer(self._bin_buffer, dtype=np.int64, count=self._doc_count, self._doc_idx = np.frombuffer(self._bin_buffer,
dtype=np.int64,
count=self._doc_count,
offset=offset + self._sizes.nbytes + self._pointers.nbytes) offset=offset + self._sizes.nbytes + self._pointers.nbytes)
def __del__(self): def __del__(self):
...@@ -480,8 +459,7 @@ class MMapIndexedDataset(torch.utils.data.Dataset): ...@@ -480,8 +459,7 @@ class MMapIndexedDataset(torch.utils.data.Dataset):
def __getitem__(self, idx): def __getitem__(self, idx):
if isinstance(idx, int): if isinstance(idx, int):
ptr, size = self._index[idx] ptr, size = self._index[idx]
np_array = np.frombuffer(self._bin_buffer, dtype=self._index.dtype, np_array = np.frombuffer(self._bin_buffer, dtype=self._index.dtype, count=size, offset=ptr)
count=size, offset=ptr)
return np_array return np_array
elif isinstance(idx, slice): elif isinstance(idx, slice):
start, stop, step = idx.indices(len(self)) start, stop, step = idx.indices(len(self))
...@@ -491,8 +469,7 @@ class MMapIndexedDataset(torch.utils.data.Dataset): ...@@ -491,8 +469,7 @@ class MMapIndexedDataset(torch.utils.data.Dataset):
sizes = self._index._sizes[idx] sizes = self._index._sizes[idx]
offsets = list(accumulate(sizes)) offsets = list(accumulate(sizes))
total_size = sum(sizes) total_size = sum(sizes)
np_array = np.frombuffer(self._bin_buffer, dtype=self._index.dtype, np_array = np.frombuffer(self._bin_buffer, dtype=self._index.dtype, count=total_size, offset=ptr)
count=total_size, offset=ptr)
sents = np.split(np_array, offsets[:-1]) sents = np.split(np_array, offsets[:-1])
return sents return sents
...@@ -506,8 +483,7 @@ class MMapIndexedDataset(torch.utils.data.Dataset): ...@@ -506,8 +483,7 @@ class MMapIndexedDataset(torch.utils.data.Dataset):
if length is None: if length is None:
length = size - offset length = size - offset
ptr += offset * np.dtype(self._index.dtype).itemsize ptr += offset * np.dtype(self._index.dtype).itemsize
np_array = np.frombuffer(self._bin_buffer, dtype=self._index.dtype, np_array = np.frombuffer(self._bin_buffer, dtype=self._index.dtype, count=length, offset=ptr)
count=length, offset=ptr)
return np_array return np_array
@property @property
...@@ -530,12 +506,11 @@ class MMapIndexedDataset(torch.utils.data.Dataset): ...@@ -530,12 +506,11 @@ class MMapIndexedDataset(torch.utils.data.Dataset):
@staticmethod @staticmethod
def exists(path): def exists(path):
return ( return (os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path)))
os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path))
)
class MMapIndexedDatasetBuilder(object): class MMapIndexedDatasetBuilder(object):
def __init__(self, out_file, dtype=np.int64): def __init__(self, out_file, dtype=np.int64):
self._data_file = open(out_file, 'wb') self._data_file = open(out_file, 'wb')
self._dtype = dtype self._dtype = dtype
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment