Commit 404ecbdc authored by zbian

Migrated project

parent 2ebaefc5
# Config file
Here is an example config file of training ViT on cifar:
```python
# build train_dataset and train_dataloader from this dictionary
# It is not compulsory in the config file; instead, you can pass this dictionary as an argument to colossalai.initialize()
train_data = dict(
# dictionary for building Dataset
dataset=dict(
# the type CIFAR10Dataset has to be registered
type='CIFAR10Dataset',
root='/path/to/data',
# transform pipeline
transform_pipeline=[
dict(type='Resize', size=IMG_SIZE),
dict(type='RandomCrop', size=IMG_SIZE, padding=4),
dict(type='RandomHorizontalFlip'),
dict(type='ToTensor'),
dict(type='Normalize',
mean=[0.4914, 0.4822, 0.4465],
std=[0.2023, 0.1994, 0.2010]),
]
),
# dictionary for building Dataloader
dataloader=dict(
batch_size=BATCH_SIZE,
pin_memory=True,
# num_workers=1,
shuffle=True,
)
)
# build test_dataset and test_dataloader from this dictionary
test_data = dict(
dataset=dict(
type='CIFAR10Dataset',
root='/path/to/data',
train=False,
transform_pipeline=[
dict(type='Resize', size=IMG_SIZE),
dict(type='ToTensor'),
dict(type='Normalize',
mean=[0.4914, 0.4822, 0.4465],
std=[0.2023, 0.1994, 0.2010]
),
]
),
dataloader=dict(
batch_size=BATCH_SIZE,
pin_memory=True,
# num_workers=1,
)
)
# compulsory
# build optimizer from this dictionary
optimizer = dict(
# Available types: 'ZeroRedundancyOptimizer_Level_1', 'ZeroRedundancyOptimizer_Level_2', 'ZeroRedundancyOptimizer_Level_3'
# 'Adam', 'Lamb', 'SGD', 'FusedLAMB', 'FusedAdam', 'FusedSGD', 'FP16Optimizer'
type='Adam',
lr=0.001,
weight_decay=0
)
# compulsory
# build loss function from this dictionary
loss = dict(
# Available types:
# 'CrossEntropyLoss2D', 'CrossEntropyLoss2p5D', 'CrossEntropyLoss3D'
type='CrossEntropyLoss2D',
)
# compulsory
# build model from this dictionary
model = dict(
# Available types: 'PretrainBERT', 'VanillaResNet', 'VisionTransformerFromConfig'
type='VisionTransformerFromConfig',
# each key-value pair below defines a layer of the model
# the input data passes through these layers recursively
tensor_splitting_cfg=dict(
type='ViTInputSplitter2D',
),
embedding_cfg=dict(
type='ViTPatchEmbedding2D',
img_size=IMG_SIZE,
patch_size=PATCH_SIZE,
embed_dim=DIM,
),
token_fusion_cfg=dict(
type='ViTTokenFuser2D',
img_size=IMG_SIZE,
patch_size=PATCH_SIZE,
embed_dim=DIM,
drop_rate=0.1
),
norm_cfg=dict(
type='LayerNorm2D',
normalized_shape=DIM,
eps=1e-6,
),
block_cfg=dict(
# ViTBlock is a submodule
type='ViTBlock',
attention_cfg=dict(
type='ViTSelfAttention2D',
hidden_size=DIM,
num_attention_heads=NUM_ATTENTION_HEADS,
attention_dropout_prob=0.,
hidden_dropout_prob=0.1,
checkpoint=True
),
droppath_cfg=dict(
type='VanillaViTDropPath',
),
mlp_cfg=dict(
type='ViTMLP2D',
in_features=DIM,
dropout_prob=0.1,
mlp_ratio=4,
checkpoint=True
),
norm_cfg=dict(
type='LayerNorm2D',
normalized_shape=DIM,
eps=1e-6,
),
),
head_cfg=dict(
type='ViTHead2D',
hidden_size=DIM,
num_classes=NUM_CLASSES,
),
embed_dim=DIM,
depth=DEPTH,
drop_path_rate=0.,
)
# hooks are built when initializing trainer
# possible hooks: 'BaseHook', 'MetricHook', 'LoadCheckpointHook',
# 'SaveCheckpointHook', 'LossHook', 'AccuracyHook', 'Accuracy2DHook',
# 'LogMetricByEpochHook', 'TensorboardHook', 'LogTimingByEpochHook', 'LogMemoryByEpochHook'
hooks = [
dict(type='LogMetricByEpochHook'),
dict(type='LogTimingByEpochHook'),
dict(type='LogMemoryByEpochHook'),
dict(type='Accuracy2DHook'),
dict(type='LossHook'),
# dict(type='TensorboardHook', log_dir='./tfb_logs'),
# dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'),
# dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt')
]
# three keys: pipeline, tensor, data
# if data=dict(size=1), i.e. no data parallelism, there is no need to define the data key
parallel = dict(
pipeline=dict(size=1),
tensor=dict(size=4, mode='2d'),
)
# not compulsory
# mixed precision (fp16) settings; works with both pipeline and non-pipeline schedules
fp16 = dict(
mode=AMP_TYPE.PARALLEL,
initial_scale=2 ** 8
)
# not compulsory
# build learning rate scheduler
lr_scheduler = dict(
type='LinearWarmupLR',
warmup_epochs=5
)
schedule = dict(
num_microbatches=8
)
# training stopping criterion
# you can give num_steps or num_epochs
num_epochs = 60
# config logging path
logging = dict(
root_path='./logs'
)
```
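Every top-level variable in this file becomes an attribute of the global config once training is initialized. As a rough sketch (based on the training script shown in the quick demo, which reads `gpc.config.hooks` and `gpc.config.num_epochs`), these values can be accessed as follows:
```python
from colossalai.core import global_context as gpc

# after colossalai.initialize() has parsed this config file, every top-level
# variable defined above is available as an attribute of gpc.config
print(gpc.config.num_epochs)  # 60
print(gpc.config.hooks)       # the list of hook dictionaries defined above
```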
.. ColossalAI documentation master file, created by
sphinx-quickstart on Mon Oct 11 17:05:05 2021.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
ColossalAI documentation
======================================
.. toctree::
:maxdepth: 1
:caption: GETTING STARTED
installation.md
run_demo.md
.. toctree::
:maxdepth: 1
:caption: CUSTOMIZE YOUR TRAINING
parallelization.md
model.md
trainer_engine.md
amp.md
zero.md
add_your_parallel.md
config.md
.. toctree::
:maxdepth: 2
:caption: API REFERENCE
colossalai/colossalai
Indices and tables
==================
* :ref:`genindex`
# Setup
## Install with pip
```bash
pip install colossalai
```
## Install from source
```shell
git clone git@github.com:hpcaitech/ColossalAI.git
cd ColossalAI
# install dependency
pip install -r requirements/requirements.txt
# install colossalai
pip install .
```
Install and enable CUDA kernel fusion (compulsory when using fused optimizers):
```bash
pip install -v --no-cache-dir --global-option="--cuda_ext" .
```
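After installation, a quick import check (not part of the official instructions, just a sanity test) confirms that the package and a CUDA device are visible:
```python
# sanity check: the package imports cleanly and PyTorch can see a GPU
import torch
import colossalai

print(torch.cuda.is_available())
```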
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=.build
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.https://www.sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd
# Define your own parallel model
## Write a Simple 2D Parallel Model
Let's say we have a huge MLP model whose very large hidden size makes it difficult to fit into a single GPU. We can
distribute the model weights across GPUs in a 2D mesh while still writing the model in a familiar way.
```python
from colossalai.nn import Linear2D
import torch.nn as nn
class MLP_2D(nn.Module):
def __init__(self):
super().__init__()
self.linear_1 = Linear2D(in_features=1024, out_features=16384)
self.linear_2 = Linear2D(in_features=16384, out_features=1024)
def forward(self, x):
x = self.linear_1(x)
x = self.linear_2(x)
return x
```
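As a usage sketch (assuming the 2D tensor-parallel process groups have already been initialized, e.g. via `colossalai.initialize()` with `tensor=dict(size=4, mode='2d')` in the config), the model is used like any other `nn.Module`:
```python
import torch

# hypothetical usage; requires an initialized 2D tensor-parallel environment
model = MLP_2D()
x = torch.randn(16, 1024)   # (batch size, hidden size)
y = model(x)                # forward pass through the two distributed linear layers
print(y.shape)              # expected: torch.Size([16, 1024])
```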
## Use pre-defined model
Our Model Zoo supports *BERT*, *ViT* and *MLP-Mixer* models of different sizes.
# Parallelization
## Configure the Combination of Parallelization
We support multiple parallelization strategies in our library, namely hybrid parallelism combining data parallelism,
pipeline parallelism and tensor parallelism (1D, 2D, 2.5D, 3D). You can initialize the corresponding process group by
setting `parallel` in our config. The parallel configuration can be easily specified by a dictionary in the
configuration file, which must obey the following format. The data parallel size will be inferred automatically based
on your settings for pipeline parallelism and tensor parallelism.
```python
parallel = dict(
    pipeline=dict(size=int),
    tensor=dict(size=int, mode='1d' or '2d' or '2.5d' or '3d', kwargs=Any)
)
```
The name of the dictionary variable should be **parallel**. All the arguments, even **parallel** itself, are optional, and the data,
pipeline and tensor parallel sizes default to 1. The value of data, pipeline and tensor can be an int
representing the size of the corresponding parallel dimension, or a dictionary with a key called "size". The key "mode"
specifies the tensor parallelism mode.
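For example, the following two configurations are intended to be equivalent; an int is shorthand for a size-only dictionary (this is a sketch of the format described above, not an exhaustive list of options):
```python
# int shorthand
parallel = dict(pipeline=2, tensor=dict(size=4, mode='2d'))

# explicit dictionary form
parallel = dict(pipeline=dict(size=2), tensor=dict(size=4, mode='2d'))
```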
## Data Parallel
Data parallelism is the most common way to distribute your training task: the data is split into several shards and each
device trains on a single shard. The configuration for data parallelism is detected automatically and set for you; you do
not have to set it explicitly in your configuration. When the data parallel size is larger than 1, Colossal-AI automatically
adds a distributed data sampler to the dataloader to shard the dataset.
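For instance, since the data parallel size is inferred from the total number of processes and the other parallel sizes, a configuration like the one below running on 8 GPUs gives a data parallel size of 8 without any explicit `data` entry (a worked example under that assumption):
```python
# with 8 GPUs in total, the data parallel size is inferred as
# world_size / (pipeline size * tensor size) = 8 / (1 * 1) = 8
parallel = dict(
    pipeline=dict(size=1),
    tensor=dict(size=1, mode=None),
)
```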
## Pipeline Parallel (experimental)
Pipeline parallelism splits the model into several partitions by layer. For example, let's assume we have a simple
model which consists of two linear layers. We have two GPUs, and we can allocate the first linear layer to the first GPU
and the second layer to the second GPU. This example of course wastes computing resources and is only meant to demonstrate
the idea of pipeline parallelism.
As PyTorch is based on a dynamic computation graph, the computation flow is not known until execution. To support pipeline
parallelism in PyTorch, you may need to add one more attribute to your model class which tells Colossal-AI the sequence
of execution. One example you can refer to is `colossalai.nn.VanillaResNet`.
```python
from typing import List, Optional

from colossalai.nn import BaseModel
import torch
class VanillaResNet(BaseModel):
def __init__(
self,
num_cls: int,
block_type: str,
layers: List[int],
norm_layer_type: str = 'BatchNorm2d',
in_channels: int = 3,
groups: int = 1,
width_per_group: int = 64,
zero_init_residual: bool = False,
replace_stride_with_dilation: Optional[List[bool]] = None,
dilations=(1, 1, 1, 1)
) -> None:
super().__init__()
... # some model params
self.layers_cfg = [
# conv1
dict(type='Conv2d',
in_channels=in_channels,
out_channels=self.inplanes,
kernel_size=7,
stride=2,
padding=3,
bias=False),
# bn1
dict(
type=norm_layer_type,
num_features=self.inplanes
),
# relu
dict(
type='ReLU',
inplace=True
),
# maxpool
dict(
type='MaxPool2d',
kernel_size=3,
stride=2,
padding=1
),
# layer 1
dict(
inplanes=self.inplanes,
planes=64,
blocks=self.blocks[0],
dilation=self.dilations[0],
**self.reslayer_common_cfg
),
# layer 2
dict(
inplanes=64 * self.block_expansion,
planes=128,
blocks=self.blocks[1],
stride=2,
dilate=replace_stride_with_dilation[0],
dilation=self.dilations[1],
**self.reslayer_common_cfg
),
# layer 3
dict(
inplanes=128 * self.block_expansion,
planes=256,
blocks=layers[2],
stride=2,
dilate=replace_stride_with_dilation[1],
dilation=self.dilations[2],
**self.reslayer_common_cfg
),
# layer 4
dict(
inplanes=256 * self.block_expansion,
planes=512,
blocks=layers[3], stride=2,
dilate=replace_stride_with_dilation[2],
dilation=self.dilations[3],
**self.reslayer_common_cfg
),
# avg pool
dict(
type='AdaptiveAvgPool2d',
output_size=(1, 1)
),
# flatten
dict(
type='LambdaWrapper',
func=lambda mod, x: torch.flatten(x, 1)
),
# linear
dict(
type='Linear',
in_features=512 * self.block_expansion,
out_features=num_cls
)
]
```
You can set the number of pipeline stages in your configuration file. When the pipeline size is larger than 1, Colossal-AI
will automatically create the pipeline schedule which defines the forward and backward steps. You can specify how many microbatches
to run in each step in the `schedule` configuration.
```python
parallel = dict(
pipeline=dict(size=1), # number of pipeline stages
tensor=dict(size=1, mode=None)
)
schedule = dict(
num_microbatches = 4 # set the number of microbatches per step
)
```
## 1D, 2D, 2.5D and 3D Parallel
To enable hybrid parallelism, we provide an array of tensor parallelism methods, along with the papers each method is
based on. These parallel modes need to work with the distributed layers provided by Colossal-AI.
- 1D: [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053)
- 2D: [An Efficient 2D Method for Training Super-Large Deep Learning Models](https://arxiv.org/abs/2104.05343)
2D parallel relies on the SUMMA matrix multiplication algorithm and splits the input data,
model weights and layer outputs along two different dimensions. The tensor chunks are distributed over a 2D mesh of $P = N^2$
devices where N is the number of tensor chunks in a single dimension.
- 2.5D: [2.5-dimensional distributed model training](https://arxiv.org/abs/2105.14500)
Inspired by the 2.5D matrix multiplication algorithm, 2.5D parallelism introduces a novel tensor parallelism which further
parallelizes 2D tensor parallelism. An amount of $P = N^2 \times d$ processors are arranged into $d$ layers,
where each layer performs matrix multiplication operations independently with a dimension $N$.
- 3D: [Maximizing Parallelism in Distributed Training for Huge Neural Networks](https://arxiv.org/abs/2105.14450)
We also introduce a 3D tensor parallelism that parallelizes neural networks on a 3D processor cube. This method achieves
the optimal $O(P^{1/3})$ communication overhead on $P$ processors, while both computation and memory usage are evenly distributed
through optimized load balancing of parameters as well as activations.
```python
# 1D parallel
parallel = dict(
pipeline=dict(size=1), # number of pipeline stages
tensor=dict(size=4, mode='1d')
)
# 2D parallel
parallel = dict(
pipeline=dict(size=1), # number of pipeline stages
tensor=dict(size=4, mode='2d')
)
# 2.5D parallel
parallel = dict(
pipeline=dict(size=1), # number of pipeline stages
tensor=dict(size=8, mode='2.5d', depth=2)
)
# 3D parallel
parallel = dict(
pipeline=dict(size=1), # number of pipeline stages
tensor=dict(size=8, mode='3d')
)
```
## Sequence Parallel (experimental)
Sequence parallelism supports long-sequence modelling such as document-level text understanding and medical imaging.
This method is proposed in [Sequence Parallelism: Making 4D Parallelism Possible](https://arxiv.org/abs/2105.13120).
This feature is still in development and is only experimental for now.
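If you want to experiment with it, the configuration is expected to follow the same `parallel` format as the other tensor parallel modes; the mode string below is an assumption and may differ in your version:
```python
# sequence parallelism (experimental); the mode name 'sequence' is an assumption
parallel = dict(
    pipeline=dict(size=1),  # number of pipeline stages
    tensor=dict(size=4, mode='sequence')
)
```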
# Quick demo
ColossalAI is an integrated large-scale deep learning framework with efficient parallelization techniques. The framework
can accelerate model training on distributed systems with multiple GPUs by applying these techniques, and it can also
run on systems with only one GPU. Quick demos showing how to use ColossalAI are given below.
## Single GPU
ColossalAI can be used to train deep learning models on systems with only one GPU and achieve baseline
performances. [Here](https://colab.research.google.com/drive/1fJnqqFzPuzZ_kn1lwCpG2nh3l2ths0KE?usp=sharing#scrollTo=cQ_y7lBG09LS)
is an example showing how to train a LeNet model on the CIFAR10 dataset using ColossalAI.
## Multiple GPUs
ColossalAI can be used to train deep learning models on distributed systems with multiple GPUs and accelerate the
training process drastically by applying efficient parallelization techniques, which will be elaborated in
the [Parallelization](parallelization.md) section below. Run the code below on your distributed system with 4 GPUs,
where `HOST` is the IP address of your system. Note that we use
the [Slurm](https://slurm.schedmd.com/documentation.html) job scheduling system here.
```bash
HOST=xxx.xxx.xxx.xxx srun ./scripts/slurm_dist_train.sh ./example/train_vit_2d.py ./configs/vit/vit_2d.py
```
`./configs/vit/vit_2d.py` is a config file, which is introduced in the [Config file](config.md) section below. These
config files are used by ColossalAI to define all kinds of training arguments, such as the model, dataset and training
method (optimizer, lr_scheduler, epoch, etc.). Config files are highly customizable and can be modified so as to train
different models.
`./example/run_trainer.py` contains a standard training script and is presented below; it reads the config file and
carries out the training process.
```python
import colossalai
from colossalai.engine import Engine
from colossalai.trainer import Trainer
from colossalai.core import global_context as gpc
model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = colossalai.initialize()
engine = Engine(
model=model,
criterion=criterion,
optimizer=optimizer,
lr_scheduler=lr_scheduler,
schedule=schedule
)
trainer = Trainer(engine=engine,
hooks_cfg=gpc.config.hooks,
verbose=True)
trainer.fit(
train_dataloader=train_dataloader,
test_dataloader=test_dataloader,
max_epochs=gpc.config.num_epochs,
display_progress=True,
test_interval=5
)
```
Alternatively, the `model` variable can be substituted with a self-defined model or a pre-defined model in our Model
Zoo. The detailed substitution process is elaborated [here](model.md).
## Features
ColossalAI provides a collection of parallel training components for you. We aim to support you in developing
distributed deep learning models just like how you write single-GPU deep learning models. We provide friendly tools to
kickstart distributed training in a few lines.
- [Data Parallelism](parallelization.md)
- [Pipeline Parallelism](parallelization.md)
- [1D, 2D, 2.5D, 3D and sequence parallelism](parallelization.md)
- [Friendly trainer and engine](trainer_engine.md)
- [Extensible for new parallelism](add_your_parallel.md)
- [Mixed Precision Training](amp.md)
- [Zero Redundancy Optimizer (ZeRO)](zero.md)
# Build your engine & Customize your trainer
## Build your engine
To better understand the function of the `Engine` class, you should know the concept of the process function used in common engines. The process function usually controls the behavior over a batch of a dataset, and the `Engine` class just controls the process function. For example, a common process function looks like this:
```python
def process_function(dataloader, model, criterion, optim):
optim.zero_grad()
data, label = next(dataloader)
output = model(data)
loss = criterion(output, label)
loss.backward()
    optim.step()
```
In `ignite.engine` or `keras.engine`, the process function is always provided by users. However, it is hard for users to write their own functions for pipeline parallelism. Aiming at accessible hybrid parallelism for users, we provide the powerful `Engine` class. It enables pipeline parallelism and offers the 1F1B non-interleaved strategy. Also, you can use a pre-defined learning rate scheduler in your `Engine` to adjust the learning rate during training.
In order to build your engine, just set the model, criterion, optimizer, learning rate scheduler and schedule. Consider the following code as an example.
```python
import torch
import torch.nn as nn
import torchvision.models as models
import colossalai
from colossalai.engine import Engine
model = models.resnet18()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())
lr_scheduler = colossalai.nn.lr_scheduler.CosineAnnealingLR(optimizer, 1000)
schedule = colossalai.engine.schedule.NoPipelineSchedule()
MyEngine = Engine(
model=model,
criterion=criterion,
optimizer=optimizer,
lr_scheduler=lr_scheduler,
schedule=schedule
)
```
More information can be found in the API reference.
## Customize your trainer
### Overview
Before starting to learn how to customize a trainer that meets your needs, you should have a basic understanding of the function of `Trainer`. We recommend you read the *Get Started* section and *Build your engine* first.
The `Trainer` class is meant to let researchers and engineers use our framework more conveniently. Instead of writing their own scripts, they can simply construct a `Trainer` from their own `Engine` by calling `MyTrainer = Trainer(MyEngine)`, and then use the `fit` method to train or evaluate the model. To make our `Trainer` class more powerful, we add some useful features to it, such as monitoring and recording running states and metrics which indicate the model's performance, or saving checkpoints after a training epoch.
To accomplish that, specific actions must be added to the training or evaluation process. The `BaseHook` class allows you to add the desired actions at specific time points. We have already created practical hooks for those useful features. All you need to do is pick the hooks you want.
More detailed class descriptions can be found in API reference.
### Example
```python
hooks = [
dict(type='LogMetricByEpochHook'),
dict(type='LogTimingByEpochHook'),
dict(type='LogMemoryByEpochHook'),
dict(type='AccuracyHook'),
dict(type='LossHook'),
# dict(type='TensorboardHook', log_dir='./tfb_logs'),
# dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'),
# dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt')
]
```
The hooks above record metrics, elapsed time and memory usage to the log every epoch. They also print loss and accuracy, letting users monitor the performance of the model.
### Hook
You can extend our `BaseHook` class. Hooks can be called at twelve time points. More detailed information can be found in the API reference.
Alternatively, extend `MetricHook` to write a metric collector. You should also apply the decorator `@HOOKS.register_module` to your own hook class, and import it in your main python script.
`after_train_iter()` receives the output of the engine for each iteration, which is a list containing the output, the label and the loss.
Note that you can define the priority to arrange the execution order of all hooks.
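As a minimal sketch (the import paths and the constructor signature below are assumptions; check the API reference for the exact interfaces in your version), a custom hook registered with the decorator could look like this:
```python
from colossalai.registry import HOOKS          # assumed import path for the registry
from colossalai.trainer.hooks import BaseHook  # assumed import path for the base class


@HOOKS.register_module
class PrintLossHook(BaseHook):
    """Prints the loss after every training iteration."""

    def __init__(self, trainer, priority=10):
        # priority controls the execution order among hooks
        super().__init__(trainer, priority)

    def after_train_iter(self, output, label, loss):
        # receives the per-iteration engine output: output, label and loss
        print(f'iteration loss = {loss.item():.4f}')
```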
### Metric
You can write your own metric by extending the `Metric` class. It is always used with `MetricHook`. If you write your own metric hooks, please set the priority carefully and make sure the hook is called before other hooks which may use the results of metrics.
We have already provided some metric hooks. We store metric objects in `runner.states['metrics']`. It is a dictionary, and you can use the name of a metric to access its object.
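For example, assuming a metric has been registered under the name `'Loss'` (the name is an assumption), it can be read back like this:
```python
# access a stored metric object by its name
loss_metric = runner.states['metrics']['Loss']
```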
# Zero Redundancy Optimizer and Zero Offload
The Zero Redundancy Optimizer (ZeRO) removes the memory redundancies across data-parallel processes by partitioning the three model states (optimizer states, gradients, and parameters) across data-parallel processes instead of replicating them. By doing this, it boosts memory efficiency compared to classic data-parallelism while retaining its computational granularity and communication efficiency.
1. **ZeRO Level 1**: The optimizer states (e.g., for [Adam optimizer](https://arxiv.org/abs/1412.6980), 32-bit weights, and the first, and second moment estimates) are partitioned across the processes, so that each process updates only its partition.
2. **ZeRO Level 2**: The reduced 32-bit gradients for updating the model weights are also partitioned such that each process retains only the gradients corresponding to its portion of the optimizer states.
3. **ZeRO Level 3**: The 16-bit model parameters are partitioned across the processes. ZeRO-3 will automatically collect and partition them during the forward and backward passes.
## Getting Started
Once you are training with ColossalAI, enabling ZeRO-3 offload is as simple as enabling it in your ColossalAI configuration! Below are a few examples of ZeRO-3 configurations.
### Example ZeRO-3 Configurations
Here we use ``Adam`` as the initial optimizer.
1. Use ZeRO to partition the optimizer states (level 1), gradients (level 2), and parameters (level 3).
```python
optimizer = dict(
type='Adam',
lr=0.001,
weight_decay=0
)
zero = dict(
type='ZeroRedundancyOptimizer_Level_3',
dynamic_loss_scale=True,
clip_grad=1.0
)
```
2. Additionally offload the optimizer states and computations to the CPU.
```python
zero = dict(
offload_optimizer_config=dict(
device='cpu',
pin_memory=True,
fast_init=True
),
...
)
```
3. Save even more memory by offloading parameters to the CPU memory.
```python
zero = dict(
offload_optimizer_config=dict(
device='cpu',
pin_memory=True,
fast_init=True
),
offload_param_config=dict(
device='cpu',
pin_memory=True,
        max_in_cpu=OFFLOAD_PARAM_MAX_IN_CPU
),
...
)
```
4. Save even MORE memory by offloading to NVMe (if available on your system):
```python
zero = dict(
offload_optimizer_config=dict(
device='nvme',
pin_memory=True,
fast_init=True,
nvme_path='/nvme_data'
),
offload_param_config=dict(
device='nvme',
pin_memory=True,
max_in_cpu=OFFLOAD_PARAM_MAX_IN_CPU,
nvme_path='/nvme_data'
),
...
)
```
Note that ``fp16`` is automatically enabled when using ZeRO.
### Training
Once you complete your configuration, just use `colossalai.initialize()` to initialize your training. All you need to do is write your configuration.
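In other words, after writing a config file that contains one of the `zero` dictionaries above (together with your model, data and optimizer settings), training starts the same way as in the quick demo; a rough sketch:
```python
import colossalai

# colossalai.initialize() parses the config (including the zero dict) and wraps
# the optimizer accordingly; the return values follow the quick demo script
model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = colossalai.initialize()
```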
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "colossal_cifar_demo.ipynb",
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "uhrbvVEh2iJd"
},
"source": [
"# Train an image classifier\n"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "vP7LvCpG23a2",
"outputId": "b37f7203-8a02-4736-c527-603f2bb34d7d"
},
"source": [
"!pip install ColossalAI deepspeed"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: ColossalAI in /usr/local/lib/python3.7/dist-packages (0.1)\n",
"Requirement already satisfied: deepspeed in /usr/local/lib/python3.7/dist-packages (0.5.4)\n",
"Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from deepspeed) (21.0)\n",
"Requirement already satisfied: triton in /usr/local/lib/python3.7/dist-packages (from deepspeed) (1.1.1)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from deepspeed) (4.62.3)\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from deepspeed) (1.19.5)\n",
"Requirement already satisfied: tensorboardX==1.8 in /usr/local/lib/python3.7/dist-packages (from deepspeed) (1.8)\n",
"Requirement already satisfied: ninja in /usr/local/lib/python3.7/dist-packages (from deepspeed) (1.10.2.2)\n",
"Requirement already satisfied: torch in /usr/local/lib/python3.7/dist-packages (from deepspeed) (1.9.0+cu111)\n",
"Requirement already satisfied: psutil in /usr/local/lib/python3.7/dist-packages (from deepspeed) (5.4.8)\n",
"Requirement already satisfied: protobuf>=3.2.0 in /usr/local/lib/python3.7/dist-packages (from tensorboardX==1.8->deepspeed) (3.17.3)\n",
"Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from tensorboardX==1.8->deepspeed) (1.15.0)\n",
"Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->deepspeed) (2.4.7)\n",
"Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from torch->deepspeed) (3.7.4.3)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from triton->deepspeed) (3.3.0)\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "UVKEurtS4SFS",
"outputId": "99fb6050-5da7-4f27-b4eb-9b3ccf830efb"
},
"source": [
"import colossalai\n",
"from colossalai.engine import Engine, NoPipelineSchedule\n",
"from colossalai.trainer import Trainer\n",
"from colossalai.context import Config\n",
"import torch"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Please install apex to use FP16 Optimizer\n",
"Apex should be installed to use the FP16 optimizer\n",
"apex is required for mixed precision training\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "PpFfhNBD7NSn"
},
"source": [
"First, we should initialize distributed environment. Though we just use single GPU in this example, we still need initialize distributed environment for compatibility. We just consider the simplest case here, so we just set the number of parallel processes to 1."
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "8yF7Lc-K7NAS",
"outputId": "01312349-a8b0-4de4-9103-7d1b48e6cc36"
},
"source": [
"parallel_cfg = Config(dict(parallel=dict(\n",
" data=dict(size=1),\n",
" pipeline=dict(size=1),\n",
" tensor=dict(size=1, mode=None),\n",
")))\n",
"colossalai.init_dist(config=parallel_cfg,\n",
" local_rank=0,\n",
" world_size=1,\n",
" host='127.0.0.1',\n",
" port=8888,\n",
" backend='nccl')"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,596 INFO: Added key: store_based_barrier_key:1 to store for rank: 0\n",
"colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,598 INFO: Rank 0: Completed store-based barrier for 1 nodes.\n",
"colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,602 INFO: Added key: store_based_barrier_key:2 to store for rank: 0\n",
"colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,605 INFO: Rank 0: Completed store-based barrier for 1 nodes.\n",
"colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,608 INFO: Added key: store_based_barrier_key:3 to store for rank: 0\n",
"colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,610 INFO: Rank 0: Completed store-based barrier for 1 nodes.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"process rank 0 is bound to device 0\n",
"initialized seed on rank 0, numpy: 1024, python random: 1024, ParallelMode.DATA: 1024, ParallelMode.TENSOR: 1124,the default parallel seed is ParallelMode.DATA.\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ppjmMxc_81TK"
},
"source": [
"Load and normalize the CIFAR10 training and test datasets using `colossalai.nn.data`. Note that we have wrapped `torchvision.transforms`, so that we can simply use the config dict to use them."
]
},
{
"cell_type": "code",
"metadata": {
"id": "ZyGhyD47-dUY",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "98bbf2d1-a1c4-4bb4-b6df-600777b1e8f5"
},
"source": [
"transform_cfg = [\n",
" dict(type='ToTensor'),\n",
" dict(type='Normalize',\n",
" mean=[0.4914, 0.4822, 0.4465],\n",
" std=[0.2023, 0.1994, 0.2010]),\n",
"]\n",
"\n",
"batch_size = 128\n",
"\n",
"trainset = colossalai.nn.data.CIFAR10Dataset(transform_cfg, root='./data', train=True)\n",
"trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)\n",
"\n",
"testset = colossalai.nn.data.CIFAR10Dataset(transform_cfg, root='./data', train=False)\n",
"testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=2)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Files already downloaded and verified\n",
"Files already downloaded and verified\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "NvPbfLLR9NzC"
},
"source": [
"We just define a simple Convolutional Neural Network here."
]
},
{
"cell_type": "code",
"metadata": {
"id": "cQ_y7lBG09LS"
},
"source": [
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"\n",
"\n",
"class Net(nn.Module):\n",
" def __init__(self):\n",
" super().__init__()\n",
" self.conv1 = nn.Conv2d(3, 6, 5)\n",
" self.pool = nn.MaxPool2d(2, 2)\n",
" self.conv2 = nn.Conv2d(6, 16, 5)\n",
" self.fc1 = nn.Linear(16 * 5 * 5, 120)\n",
" self.fc2 = nn.Linear(120, 84)\n",
" self.fc3 = nn.Linear(84, 10)\n",
"\n",
" def forward(self, x):\n",
" x = self.pool(F.relu(self.conv1(x)))\n",
" x = self.pool(F.relu(self.conv2(x)))\n",
" x = torch.flatten(x, 1) # flatten all dimensions except batch\n",
" x = F.relu(self.fc1(x))\n",
" x = F.relu(self.fc2(x))\n",
" x = self.fc3(x)\n",
" return x\n",
"\n",
"\n",
"model = Net().cuda()"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "tgsszAmM9dYZ"
},
"source": [
"Define a Loss function and optimizer. And then we use them to initialize `Engine` and `Trainer`. We provide various training / evaluating hooks. In this case, we just use the simplest hooks which can compute and print loss and accuracy."
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "YtaDoCax1BCf",
"outputId": "b33b1641-03d8-4597-c8c2-1a4c1d61e9b0"
},
"source": [
"import torch.optim as optim\n",
"\n",
"criterion = nn.CrossEntropyLoss()\n",
"optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)\n",
"schedule = NoPipelineSchedule()\n",
"engine = Engine(\n",
" model=model,\n",
" criterion=criterion,\n",
" optimizer=optimizer,\n",
" lr_scheduler=None,\n",
" schedule=schedule\n",
" )\n",
"trainer = Trainer(engine=engine,\n",
" hooks_cfg=[dict(type='LossHook'), dict(type='LogMetricByEpochHook'), dict(type='AccuracyHook')],\n",
" verbose=True)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"colossalai - rank_0 - 2021-10-15 03:27:56,018 WARNING: No gradient handler is set up, please make sure you do not need to all-reduce the gradients after a training step.\n",
"colossalai - rank_0 - 2021-10-15 03:27:56,024 INFO: build LogMetricByEpochHook for train, priority = 1\n",
"colossalai - rank_0 - 2021-10-15 03:27:56,026 INFO: build LossHook for train, priority = 10\n",
"colossalai - rank_0 - 2021-10-15 03:27:56,029 INFO: build AccuracyHook for train, priority = 10\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "_JR2TuvH99Ik"
},
"source": [
"Then we set training configs. We train our model for 10 epochs and it will be evaluated every 1 epoch. Set `display_progress` to `True` to display the training / evaluating progress bar."
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "w-J3IP-J1sfx",
"outputId": "bdb76939-04f1-4124-ce5e-3af44c0d902c"
},
"source": [
"num_epochs = 10\n",
"test_interval = 1\n",
"trainer.fit(\n",
" train_dataloader=trainloader,\n",
" test_dataloader=testloader,\n",
" max_epochs=num_epochs,\n",
" display_progress=True,\n",
" test_interval=test_interval\n",
" )"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"[Epoch 0 train]: 0%| | 0/391 [00:00<?, ?it/s]/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py:718: UserWarning: Named tensors and all their associated APIs are an experimental feature and subject to change. Please do not use them for anything important until they are released as stable. (Triggered internally at /pytorch/c10/core/TensorImpl.h:1156.)\n",
" return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)\n",
"[Epoch 0 train]: 100%|██████████| 391/391 [00:14<00:00, 26.82it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:28:11,088 INFO: Training - Epoch 1 - LogMetricByEpochHook: Loss = 2.29158\n",
"[Epoch 0 val]: 100%|██████████| 79/79 [00:02<00:00, 28.66it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:28:14,040 INFO: Testing - Epoch 1 - LogMetricByEpochHook: Loss = 2.26517, Accuracy = 0.14820\n",
"[Epoch 1 train]: 100%|██████████| 391/391 [00:14<00:00, 26.31it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:28:29,059 INFO: Training - Epoch 2 - LogMetricByEpochHook: Loss = 2.15763\n",
"[Epoch 1 val]: 100%|██████████| 79/79 [00:02<00:00, 28.50it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:28:32,007 INFO: Testing - Epoch 2 - LogMetricByEpochHook: Loss = 2.00450, Accuracy = 0.27850\n",
"[Epoch 2 train]: 100%|██████████| 391/391 [00:14<00:00, 26.08it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:28:47,167 INFO: Training - Epoch 3 - LogMetricByEpochHook: Loss = 1.85409\n",
"[Epoch 2 val]: 100%|██████████| 79/79 [00:02<00:00, 27.89it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:28:50,168 INFO: Testing - Epoch 3 - LogMetricByEpochHook: Loss = 1.73788, Accuracy = 0.35990\n",
"[Epoch 3 train]: 100%|██████████| 391/391 [00:14<00:00, 26.09it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:29:05,330 INFO: Training - Epoch 4 - LogMetricByEpochHook: Loss = 1.69363\n",
"[Epoch 3 val]: 100%|██████████| 79/79 [00:02<00:00, 28.43it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:29:08,290 INFO: Testing - Epoch 4 - LogMetricByEpochHook: Loss = 1.65005, Accuracy = 0.39350\n",
"[Epoch 4 train]: 100%|██████████| 391/391 [00:15<00:00, 25.97it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:29:23,530 INFO: Training - Epoch 5 - LogMetricByEpochHook: Loss = 1.61387\n",
"[Epoch 4 val]: 100%|██████████| 79/79 [00:02<00:00, 27.75it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:29:26,515 INFO: Testing - Epoch 5 - LogMetricByEpochHook: Loss = 1.57507, Accuracy = 0.42430\n",
"[Epoch 5 train]: 100%|██████████| 391/391 [00:15<00:00, 25.92it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:29:41,764 INFO: Training - Epoch 6 - LogMetricByEpochHook: Loss = 1.55712\n",
"[Epoch 5 val]: 100%|██████████| 79/79 [00:02<00:00, 27.51it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:29:44,778 INFO: Testing - Epoch 6 - LogMetricByEpochHook: Loss = 1.53242, Accuracy = 0.43700\n",
"[Epoch 6 train]: 100%|██████████| 391/391 [00:14<00:00, 26.13it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:29:59,927 INFO: Training - Epoch 7 - LogMetricByEpochHook: Loss = 1.51618\n",
"[Epoch 6 val]: 100%|██████████| 79/79 [00:02<00:00, 28.31it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:30:02,884 INFO: Testing - Epoch 7 - LogMetricByEpochHook: Loss = 1.49720, Accuracy = 0.45430\n",
"[Epoch 7 train]: 100%|██████████| 391/391 [00:14<00:00, 26.23it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:30:17,968 INFO: Training - Epoch 8 - LogMetricByEpochHook: Loss = 1.47857\n",
"[Epoch 7 val]: 100%|██████████| 79/79 [00:02<00:00, 27.97it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:30:20,967 INFO: Testing - Epoch 8 - LogMetricByEpochHook: Loss = 1.45808, Accuracy = 0.46320\n",
"[Epoch 8 train]: 100%|██████████| 391/391 [00:14<00:00, 26.11it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:30:36,129 INFO: Training - Epoch 9 - LogMetricByEpochHook: Loss = 1.44656\n",
"[Epoch 8 val]: 100%|██████████| 79/79 [00:02<00:00, 28.18it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:30:39,096 INFO: Testing - Epoch 9 - LogMetricByEpochHook: Loss = 1.44903, Accuracy = 0.46580\n",
"[Epoch 9 train]: 100%|██████████| 391/391 [00:15<00:00, 25.97it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:30:54,342 INFO: Training - Epoch 10 - LogMetricByEpochHook: Loss = 1.41120\n",
"[Epoch 9 val]: 100%|██████████| 79/79 [00:02<00:00, 28.05it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:30:57,332 INFO: Testing - Epoch 10 - LogMetricByEpochHook: Loss = 1.41242, Accuracy = 0.48500\n"
]
}
]
}
]
}
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import colossalai
from colossalai.core import global_context as gpc
from colossalai.engine import Engine
from colossalai.logging import get_global_dist_logger
from colossalai.trainer import Trainer
def run_trainer():
model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = colossalai.initialize()
logger = get_global_dist_logger()
schedule.data_sync = False
engine = Engine(
model=model,
criterion=criterion,
optimizer=optimizer,
lr_scheduler=lr_scheduler,
schedule=schedule
)
logger.info("engine is built", ranks=[0])
trainer = Trainer(engine=engine,
hooks_cfg=gpc.config.hooks,
verbose=True)
logger.info("trainer is built", ranks=[0])
logger.info("start training", ranks=[0])
trainer.fit(
train_dataloader=train_dataloader,
test_dataloader=test_dataloader,
max_epochs=gpc.config.num_epochs,
display_progress=True,
test_interval=2
)
if __name__ == '__main__':
run_trainer()
from .vit import *
from .mlp_mixer import *
from .parallel_3d import *