Commit ac178ca5 authored by Hongxin Liu's avatar Hongxin Liu
Browse files

[legacy] move builder and registry to legacy (#4603)

parent 8accecd5
from torch.optim.lr_scheduler import _LRScheduler
from colossalai.registry import LR_SCHEDULERS
from colossalai.legacy.registry import LR_SCHEDULERS
from .delayed import WarmupScheduler
......
from torch.optim.lr_scheduler import ExponentialLR as _ExponentialLR
from torch.optim.lr_scheduler import LambdaLR as _LambdaLR
from torch.optim.lr_scheduler import MultiplicativeLR as _MultiplicativeLR
from torch.optim.lr_scheduler import StepLR as _StepLR
from torch.optim.lr_scheduler import ExponentialLR as _ExponentialLR
from colossalai.registry import LR_SCHEDULERS
from colossalai.legacy.registry import LR_SCHEDULERS
@LR_SCHEDULERS.register_module
......
......@@ -4,7 +4,7 @@ from typing import Optional
import torch
from colossalai.kernel.op_builder import CPUAdamBuilder
from colossalai.registry import OPTIMIZERS
from colossalai.legacy.registry import OPTIMIZERS
from .nvme_optimizer import NVMeOptimizer
......
......@@ -8,7 +8,7 @@ Licensed under the MIT License.
'''
import torch
from colossalai.registry import OPTIMIZERS
from colossalai.legacy.registry import OPTIMIZERS
from colossalai.utils import multi_tensor_applier
......
# modified from https://github.com/NVIDIA/apex/blob/master/apex/optimizers/fused_lamb.py
import torch
from colossalai.registry import OPTIMIZERS
from colossalai.legacy.registry import OPTIMIZERS
from colossalai.utils import multi_tensor_applier
......
......@@ -2,7 +2,7 @@
import torch
from torch.optim.optimizer import Optimizer, required
from colossalai.registry import OPTIMIZERS
from colossalai.legacy.registry import OPTIMIZERS
from colossalai.utils import multi_tensor_applier
......
......@@ -4,7 +4,7 @@ import torch
from torch.optim import Adam
from colossalai.kernel.op_builder import FusedOptimBuilder
from colossalai.registry import OPTIMIZERS
from colossalai.legacy.registry import OPTIMIZERS
from colossalai.utils import multi_tensor_applier
from .cpu_adam import CPUAdam
......
......@@ -5,7 +5,7 @@ Adapted from the pytorch-lamb library at https://github.com/cybertronai/pytorch-
import torch
from torch.optim import Optimizer
from colossalai.registry import OPTIMIZERS
from colossalai.legacy.registry import OPTIMIZERS
@OPTIMIZERS.register_module
......
......@@ -5,7 +5,7 @@ from typing import Iterable
import torch
from torch.optim import Optimizer
from colossalai.registry import OPTIMIZERS
from colossalai.legacy.registry import OPTIMIZERS
@OPTIMIZERS.register_module
......@@ -22,28 +22,24 @@ class Lars(Optimizer):
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
"""
def __init__(
self,
params: Iterable[torch.nn.Parameter],
lr=1e-3,
momentum=0,
eeta=1e-3,
weight_decay=0,
epsilon=0.0
) -> None:
def __init__(self,
params: Iterable[torch.nn.Parameter],
lr=1e-3,
momentum=0,
eeta=1e-3,
weight_decay=0,
epsilon=0.0) -> None:
if not isinstance(lr, float) or lr < 0.0:
raise ValueError("Invalid learning rate: {}".format(lr))
if momentum < 0.0:
raise ValueError("Invalid momentum value: {}".format(momentum))
if weight_decay < 0.0:
raise ValueError(
"Invalid weight_decay value: {}".format(weight_decay))
raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
if eeta <= 0 or eeta > 1:
raise ValueError("Invalid eeta value: {}".format(eeta))
if epsilon < 0:
raise ValueError("Invalid epsilon value: {}".format(epsilon))
defaults = dict(lr=lr, momentum=momentum,
weight_decay=weight_decay, eeta=eeta, epsilon=epsilon, lars=True)
defaults = dict(lr=lr, momentum=momentum, weight_decay=weight_decay, eeta=eeta, epsilon=epsilon, lars=True)
super().__init__(params, defaults)
......@@ -76,11 +72,9 @@ class Lars(Optimizer):
if lars:
w_norm = torch.norm(p)
g_norm = torch.norm(p.grad)
trust_ratio = torch.where(
w_norm > 0 and g_norm > 0,
eeta * w_norm / (g_norm + weight_decay * w_norm + eps),
torch.ones_like(w_norm)
)
trust_ratio = torch.where(w_norm > 0 and g_norm > 0,
eeta * w_norm / (g_norm + weight_decay * w_norm + eps),
torch.ones_like(w_norm))
trust_ratio.clamp_(0.0, 50)
scaled_lr *= trust_ratio.item()
if weight_decay != 0:
......@@ -90,8 +84,7 @@ class Lars(Optimizer):
if momentum != 0:
param_state = self.state[p]
if 'momentum_buffer' not in param_state:
buf = param_state['momentum_buffer'] = torch.clone(
decayed_grad).detach()
buf = param_state['momentum_buffer'] = torch.clone(decayed_grad).detach()
else:
buf = param_state['momentum_buffer']
buf.mul_(momentum).add_(decayed_grad)
......
......@@ -4,15 +4,15 @@
import math
import random
import numpy as np
from typing import TypeVar, Iterator
from typing import Iterator, TypeVar
import numpy as np
import torch
from torch.utils.data import Sampler, Dataset, DataLoader
from torch.utils.data import DataLoader, Dataset, Sampler
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.registry import DATA_SAMPLERS
from colossalai.legacy.registry import DATA_SAMPLERS
T_co = TypeVar('T_co', covariant=True)
......@@ -30,11 +30,7 @@ class DataParallelSampler(Sampler):
the batch size, then the last batch will be smaller, defaults to False.
"""
def __init__(self,
dataset: Dataset,
shuffle: bool = False,
seed: int = 0,
drop_last: bool = False) -> None:
def __init__(self, dataset: Dataset, shuffle: bool = False, seed: int = 0, drop_last: bool = False) -> None:
self.dataset = dataset
self.num_replicas = gpc.get_world_size(ParallelMode.DATA)
self.rank = gpc.get_local_rank(ParallelMode.DATA)
......@@ -54,8 +50,7 @@ class DataParallelSampler(Sampler):
self.num_replicas # type: ignore[arg-type]
)
else:
self.num_samples = math.ceil(
len(self.dataset) / self.num_replicas) # type: ignore[arg-type]
self.num_samples = math.ceil(len(self.dataset) / self.num_replicas) # type: ignore[arg-type]
self.total_size = self.num_samples * self.num_replicas
self.shuffle = shuffle
self.seed = seed
......@@ -72,7 +67,7 @@ class DataParallelSampler(Sampler):
# set_epoch manually
self.epoch += 1
else:
indices = list(range(len(self.dataset))) # type: ignore[arg-type]
indices = list(range(len(self.dataset))) # type: ignore[arg-type]
if not self.drop_last:
# add extra samples to make it evenly divisible
......@@ -80,8 +75,7 @@ class DataParallelSampler(Sampler):
if padding_size <= len(indices):
indices += indices[:padding_size]
else:
indices += (indices * math.ceil(padding_size /
len(indices)))[:padding_size]
indices += (indices * math.ceil(padding_size / len(indices)))[:padding_size]
else:
# remove tail of data to make it evenly divisible.
indices = indices[:self.total_size]
......@@ -109,8 +103,8 @@ class DataParallelSampler(Sampler):
def get_dataloader(dataset,
shuffle=False,
seed=1024,
add_sampler=True,
seed=1024,
add_sampler=True,
drop_last=False,
pin_memory=False,
num_workers=0,
......
import torch
from colossalai.registry import OPHOOKS
from colossalai.legacy.registry import OPHOOKS
from . import BaseOpHook
......
import torch
from colossalai.registry import OPHOOKS
from colossalai.legacy.registry import OPHOOKS
from . import BaseOpHook
......
......@@ -3,8 +3,8 @@ from typing import Optional
import torch
import torch.distributed as dist
from colossalai.legacy.registry import OPHOOKS
from colossalai.logging import get_dist_logger
from colossalai.registry import OPHOOKS
from colossalai.utils import get_current_device
from colossalai.zero.gemini.memory_tracer import MemStatsCollector
from colossalai.zero.legacy.gemini.ophooks import BaseOpHook
......
......@@ -98,7 +98,7 @@ parallel gradient handler is added to the engine automatically if data parallel
gradient handler like below:
```python
from colossalai.registry import GRADIENT_HANDLER
from colossalai.legacy.registry import GRADIENT_HANDLER
from colossalai.legacy.engine import BaseGradientHandler
@GRADIENT_HANDLER.register_module
......
......@@ -36,7 +36,7 @@ import torch
import torch.nn as nn
from colossalai import nn as col_nn
from colossalai.amp import AMP_TYPE
from colossalai.builder.pipeline import partition_uniform
from colossalai.legacy.builder.pipeline import partition_uniform
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
......
......@@ -34,7 +34,7 @@ import colossalai
import colossalai.nn as col_nn
import torch
import torch.nn as nn
from colossalai.builder import build_pipeline_model
from colossalai.legacy.builder import build_pipeline_model
from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
PipelineSchedule)
from colossalai.logging import disable_existing_loggers, get_dist_logger
......@@ -51,17 +51,17 @@ from torchvision.datasets import CIFAR10
Generally, we provide 3 ways to build a pipelined model:
1. `colossalai.builder.build_pipeline_model_from_cfg`
2. `colossalai.builder.build_pipeline_model`
1. `colossalai.legacy.builder.build_pipeline_model_from_cfg`
2. `colossalai.legacy.builder.build_pipeline_model`
3. Split the model by stages by yourself
When the whole model fits in your memory, you can use either of the first two methods to build your model; otherwise, you must split the model yourself. The first two methods build the whole model on CPU first, then split it, and finally you can simply move the corresponding part of the model to GPU.
`colossalai.builder.build_pipeline_model_from_cfg()` receives a config file of model, and it can split the model uniformly (by layer) or balanced (by parameter size).
`colossalai.legacy.builder.build_pipeline_model_from_cfg()` receives a config file of model, and it can split the model uniformly (by layer) or balanced (by parameter size).
If you are familiar with `PyTorch`, you can use `colossalai.builder.build_pipeline_model()` which receives a `torch.nn.Sequential` model and split it by layer uniformly.
If you are familiar with `PyTorch`, you can use `colossalai.legacy.builder.build_pipeline_model()`, which receives a `torch.nn.Sequential` model and splits it uniformly by layer.
In this tutorial, we will modify [TIMM/ViT](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py) to `torch.nn.Sequential` and then use `colossalai.builder.build_pipeline_model()` to build the pipelined model.
In this tutorial, we will modify [TIMM/ViT](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py) to `torch.nn.Sequential` and then use `colossalai.legacy.builder.build_pipeline_model()` to build the pipelined model.
When the data is **one** `Tensor`, you can use the positional argument in `forward()` of your model to get the data tensor. For the first stage of the pipeline, the first positional argument of `forward()` is the data tensor loaded from the data loader. For other stages, the first positional argument of `forward()` is the output tensor from the previous stage. Note that if the stage is not the last stage, the return value of `forward()` must be a `Tensor`.
......
......@@ -273,8 +273,8 @@ SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE) ** 2 + 1 # add 1 for cls token
### Build pipeline model (`/hybrid_parallel/model/vit.py`)
Colossal-AI provides two methods to build a pipeline model from the existing model.
- `colossalai.builder.build_pipeline_model_from_cfg`
- `colossalai.builder.build_pipeline_model`
- `colossalai.legacy.builder.build_pipeline_model_from_cfg`
- `colossalai.legacy.builder.build_pipeline_model`
Besides, you can also build a pipeline model from scratch with Colossal-AI.
```python
......@@ -284,11 +284,11 @@ from typing import Callable
import inspect
import torch
from colossalai import nn as col_nn
from colossalai.registry import LAYERS, MODELS
from colossalai.legacy.registry import LAYERS, MODELS
from colossalai.logging import get_dist_logger
from colossalai.core import global_context as gpc
from colossalai.context import ParallelMode
from colossalai.builder.pipeline import partition_uniform
from colossalai.legacy.builder.pipeline import partition_uniform
from torch import dtype, nn
from model_zoo.vit.vit import ViTBlock, ViTEmbedding, ViTHead
......
......@@ -28,7 +28,7 @@ To implement a customized gradient handler, you need to follow these steps.
3. implement `handle_gradient` method.
```python
from colossalai.registry import GRADIENT_HANDLER
from colossalai.legacy.registry import GRADIENT_HANDLER
from colossalai.legacy.engine.gradient_handler import BaseGradientHandler
......
......@@ -87,7 +87,7 @@ Colossal-AI 为用户提供了一个全局 context,使他们能够轻松地管
你可以添加你自己的梯度 handler,如下所示:
```python
from colossalai.registry import GRADIENT_HANDLER
from colossalai.legacy.registry import GRADIENT_HANDLER
from colossalai.legacy.engine import BaseGradientHandler
@GRADIENT_HANDLER.register_module
......
......@@ -36,7 +36,7 @@ import torch
import torch.nn as nn
from colossalai import nn as col_nn
from colossalai.amp import AMP_TYPE
from colossalai.builder.pipeline import partition_uniform
from colossalai.legacy.builder.pipeline import partition_uniform
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment