Unverified commit fae6c92e, authored by Hongxin Liu and committed by GitHub

Merge branch 'main' into feature/shardformer

parents bd186784 ac178ca5
 from torch.optim.lr_scheduler import OneCycleLR as _OneCycleLR
-from colossalai.registry import LR_SCHEDULERS
+from colossalai.legacy.registry import LR_SCHEDULERS
 @LR_SCHEDULERS.register_module
......
 from torch.optim.lr_scheduler import _LRScheduler
-from colossalai.registry import LR_SCHEDULERS
+from colossalai.legacy.registry import LR_SCHEDULERS
 from .delayed import WarmupScheduler
......
+from torch.optim.lr_scheduler import ExponentialLR as _ExponentialLR
 from torch.optim.lr_scheduler import LambdaLR as _LambdaLR
 from torch.optim.lr_scheduler import MultiplicativeLR as _MultiplicativeLR
 from torch.optim.lr_scheduler import StepLR as _StepLR
-from torch.optim.lr_scheduler import ExponentialLR as _ExponentialLR
-from colossalai.registry import LR_SCHEDULERS
+from colossalai.legacy.registry import LR_SCHEDULERS
 @LR_SCHEDULERS.register_module
......
@@ -4,7 +4,7 @@ from typing import Optional
 import torch
 from colossalai.kernel.op_builder import CPUAdamBuilder
-from colossalai.registry import OPTIMIZERS
+from colossalai.legacy.registry import OPTIMIZERS
 from .nvme_optimizer import NVMeOptimizer
......
@@ -8,7 +8,7 @@ Licensed under the MIT License.
 '''
 import torch
-from colossalai.registry import OPTIMIZERS
+from colossalai.legacy.registry import OPTIMIZERS
 from colossalai.utils import multi_tensor_applier
......
 # modified from https://github.com/NVIDIA/apex/blob/master/apex/optimizers/fused_lamb.py
 import torch
-from colossalai.registry import OPTIMIZERS
+from colossalai.legacy.registry import OPTIMIZERS
 from colossalai.utils import multi_tensor_applier
......
@@ -2,7 +2,7 @@
 import torch
 from torch.optim.optimizer import Optimizer, required
-from colossalai.registry import OPTIMIZERS
+from colossalai.legacy.registry import OPTIMIZERS
 from colossalai.utils import multi_tensor_applier
......
@@ -4,7 +4,7 @@ import torch
 from torch.optim import Adam
 from colossalai.kernel.op_builder import FusedOptimBuilder
-from colossalai.registry import OPTIMIZERS
+from colossalai.legacy.registry import OPTIMIZERS
 from colossalai.utils import multi_tensor_applier
 from .cpu_adam import CPUAdam
......
@@ -5,7 +5,7 @@ Adapted from the pytorch-lamb library at https://github.com/cybertronai/pytorch-
 import torch
 from torch.optim import Optimizer
-from colossalai.registry import OPTIMIZERS
+from colossalai.legacy.registry import OPTIMIZERS
 @OPTIMIZERS.register_module
......
@@ -5,7 +5,7 @@ from typing import Iterable
 import torch
 from torch.optim import Optimizer
-from colossalai.registry import OPTIMIZERS
+from colossalai.legacy.registry import OPTIMIZERS
 @OPTIMIZERS.register_module
@@ -22,28 +22,24 @@ class Lars(Optimizer):
         weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
     """
-    def __init__(
-        self,
-        params: Iterable[torch.nn.Parameter],
-        lr=1e-3,
-        momentum=0,
-        eeta=1e-3,
-        weight_decay=0,
-        epsilon=0.0
-    ) -> None:
+    def __init__(self,
+                 params: Iterable[torch.nn.Parameter],
+                 lr=1e-3,
+                 momentum=0,
+                 eeta=1e-3,
+                 weight_decay=0,
+                 epsilon=0.0) -> None:
         if not isinstance(lr, float) or lr < 0.0:
             raise ValueError("Invalid learning rate: {}".format(lr))
         if momentum < 0.0:
             raise ValueError("Invalid momentum value: {}".format(momentum))
         if weight_decay < 0.0:
-            raise ValueError(
-                "Invalid weight_decay value: {}".format(weight_decay))
+            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
         if eeta <= 0 or eeta > 1:
             raise ValueError("Invalid eeta value: {}".format(eeta))
         if epsilon < 0:
             raise ValueError("Invalid epsilon value: {}".format(epsilon))
-        defaults = dict(lr=lr, momentum=momentum,
-                        weight_decay=weight_decay, eeta=eeta, epsilon=epsilon, lars=True)
+        defaults = dict(lr=lr, momentum=momentum, weight_decay=weight_decay, eeta=eeta, epsilon=epsilon, lars=True)
         super().__init__(params, defaults)
@@ -76,11 +72,9 @@ class Lars(Optimizer):
                 if lars:
                     w_norm = torch.norm(p)
                     g_norm = torch.norm(p.grad)
-                    trust_ratio = torch.where(
-                        w_norm > 0 and g_norm > 0,
-                        eeta * w_norm / (g_norm + weight_decay * w_norm + eps),
-                        torch.ones_like(w_norm)
-                    )
+                    trust_ratio = torch.where(w_norm > 0 and g_norm > 0,
+                                              eeta * w_norm / (g_norm + weight_decay * w_norm + eps),
+                                              torch.ones_like(w_norm))
                     trust_ratio.clamp_(0.0, 50)
                     scaled_lr *= trust_ratio.item()
                 if weight_decay != 0:
@@ -90,8 +84,7 @@ class Lars(Optimizer):
                 if momentum != 0:
                     param_state = self.state[p]
                     if 'momentum_buffer' not in param_state:
-                        buf = param_state['momentum_buffer'] = torch.clone(
-                            decayed_grad).detach()
+                        buf = param_state['momentum_buffer'] = torch.clone(decayed_grad).detach()
                     else:
                         buf = param_state['momentum_buffer']
                     buf.mul_(momentum).add_(decayed_grad)
......
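The hunks above are formatting-only, but they touch the heart of LARS: each layer's step is scaled by a trust ratio `eeta * ||w|| / (||g|| + weight_decay * ||w|| + eps)`, clamped to [0, 50]. A standalone sketch of that computation (illustrative values only, not part of this commit; the sketch uses `&` for the condition rather than the `and` in the source):

```python
import torch

# Self-contained illustration of the trust-ratio logic in Lars.step().
eeta, weight_decay, eps = 1e-3, 1e-4, 1e-8
w = torch.randn(128, 64)            # a weight tensor
g = torch.randn(128, 64) * 0.01     # its gradient

w_norm = torch.norm(w)
g_norm = torch.norm(g)
trust_ratio = torch.where(
    (w_norm > 0) & (g_norm > 0),
    eeta * w_norm / (g_norm + weight_decay * w_norm + eps),
    torch.ones_like(w_norm),        # fall back to 1.0 for zero norms
)
trust_ratio.clamp_(0.0, 50)
scaled_lr = 0.1 * trust_ratio.item()   # layer-wise learning-rate multiplier
print(scaled_lr)
```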
@@ -4,15 +4,15 @@
 import math
 import random
-import numpy as np
-from typing import TypeVar, Iterator
+from typing import Iterator, TypeVar
+import numpy as np
 import torch
-from torch.utils.data import Sampler, Dataset, DataLoader
+from torch.utils.data import DataLoader, Dataset, Sampler
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
-from colossalai.registry import DATA_SAMPLERS
+from colossalai.legacy.registry import DATA_SAMPLERS
 T_co = TypeVar('T_co', covariant=True)
@@ -30,11 +30,7 @@ class DataParallelSampler(Sampler):
         the batch size, then the last batch will be smaller, defaults to False.
     """
-    def __init__(self,
-                 dataset: Dataset,
-                 shuffle: bool = False,
-                 seed: int = 0,
-                 drop_last: bool = False) -> None:
+    def __init__(self, dataset: Dataset, shuffle: bool = False, seed: int = 0, drop_last: bool = False) -> None:
         self.dataset = dataset
         self.num_replicas = gpc.get_world_size(ParallelMode.DATA)
         self.rank = gpc.get_local_rank(ParallelMode.DATA)
@@ -54,8 +50,7 @@ class DataParallelSampler(Sampler):
                 self.num_replicas  # type: ignore[arg-type]
             )
         else:
-            self.num_samples = math.ceil(
-                len(self.dataset) / self.num_replicas)  # type: ignore[arg-type]
+            self.num_samples = math.ceil(len(self.dataset) / self.num_replicas)  # type: ignore[arg-type]
         self.total_size = self.num_samples * self.num_replicas
         self.shuffle = shuffle
         self.seed = seed
@@ -72,7 +67,7 @@ class DataParallelSampler(Sampler):
             # set_epoch manually
             self.epoch += 1
         else:
             indices = list(range(len(self.dataset)))  # type: ignore[arg-type]
         if not self.drop_last:
             # add extra samples to make it evenly divisible
@@ -80,8 +75,7 @@ class DataParallelSampler(Sampler):
             if padding_size <= len(indices):
                 indices += indices[:padding_size]
             else:
-                indices += (indices * math.ceil(padding_size /
-                                                len(indices)))[:padding_size]
+                indices += (indices * math.ceil(padding_size / len(indices)))[:padding_size]
         else:
             # remove tail of data to make it evenly divisible.
             indices = indices[:self.total_size]
@@ -109,8 +103,8 @@
 def get_dataloader(dataset,
                    shuffle=False,
                    seed=1024,
                    add_sampler=True,
                    drop_last=False,
                    pin_memory=False,
                    num_workers=0,
......
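For reference, the sampler's padding branch above can be checked in isolation: when the dataset size is not divisible by the number of data-parallel replicas, indices are repeated until `total_size = num_samples * num_replicas` is reached, so every rank receives the same number of samples. A quick sketch with hypothetical sizes (not from this commit):

```python
import math

# Hypothetical sizes illustrating DataParallelSampler's padding arithmetic.
dataset_len, num_replicas = 10, 4
num_samples = math.ceil(dataset_len / num_replicas)   # 3 samples per rank
total_size = num_samples * num_replicas               # 12 indices overall

indices = list(range(dataset_len))
padding_size = total_size - len(indices)              # 2 extra indices needed
if padding_size <= len(indices):
    indices += indices[:padding_size]                 # reuse the head of the list
else:
    # dataset smaller than the padding: tile it as often as needed
    indices += (indices * math.ceil(padding_size / len(indices)))[:padding_size]

assert len(indices) == total_size
print(indices)                                        # [0, 1, ..., 9, 0, 1]
```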
-import os
-from typing import List
-from colossalai.engine import Engine
-from torch.profiler import profile as torch_profile
-from torch.profiler.profiler import ProfilerAction
-from typing import Any, Callable, Iterable, Optional
-from torch.autograd import ProfilerActivity
+import gzip
 import json
 import os
 import tempfile
-import gzip
+from typing import Any, Callable, Iterable, List, Optional
+from torch.autograd import ProfilerActivity
+from torch.profiler import profile as torch_profile
+from torch.profiler.profiler import ProfilerAction
+from colossalai.legacy.engine import Engine
+from colossalai.logging import get_dist_logger
 from colossalai.utils.profiler.extention import ProfilerExtension
 from colossalai.utils.profiler.stateful_tensor_mem_extention import StatefulTensorMemoryProfilerExtention
-from colossalai.logging import get_dist_logger
 class profile(torch_profile):
......
 import os
 import threading
 import time
-import torch
 from enum import Enum
 from typing import List
-from colossalai.gemini.stateful_tensor import StatefulTensor
+import torch
 from colossalai.gemini.ophooks import BaseOpHook
-from colossalai.engine import Engine
+from colossalai.gemini.stateful_tensor import StatefulTensor
+from colossalai.legacy.engine import Engine
 from colossalai.utils.profiler.extention import ProfilerExtension
......
 import torch
-from colossalai.registry import OPHOOKS
+from colossalai.legacy.registry import OPHOOKS
 from . import BaseOpHook
......
 import torch
-from colossalai.registry import OPHOOKS
+from colossalai.legacy.registry import OPHOOKS
 from . import BaseOpHook
......
@@ -3,8 +3,8 @@ from typing import Optional
 import torch
 import torch.distributed as dist
+from colossalai.legacy.registry import OPHOOKS
 from colossalai.logging import get_dist_logger
-from colossalai.registry import OPHOOKS
 from colossalai.utils import get_current_device
 from colossalai.zero.gemini.memory_tracer import MemStatsCollector
 from colossalai.zero.legacy.gemini.ophooks import BaseOpHook
......
@@ -6,6 +6,7 @@ from typing import Dict, Iterator, Optional, Tuple
 import torch
 import torch.distributed as dist
+import torch.nn as nn
 from torch.distributed import ProcessGroup
 from torch.optim import Optimizer
@@ -617,3 +618,19 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
             ret_block_size += current_block_size
         yield ret_block, ret_block_size
+    def update_master_params(self, model: nn.Module) -> None:
+        """Update master params from working params
+
+        Args:
+            model (nn.Module): The model to update master params
+        """
+        for p in model.parameters():
+            p_id = id(p)
+            if p_id in self._param_store.working_to_master_param:
+                master_param = self._param_store.working_to_master_param[p_id]
+                padding_size = self._param_store.get_param_padding_size(p)
+                working_param = p.data.view(-1)
+                if padding_size > 0:
+                    working_param = torch.nn.functional.pad(working_param, [0, padding_size])
+                master_param.copy_(working_param.chunk(self._world_size)[self._local_rank])
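The new `update_master_params` refreshes each rank's FP32 master shard from the working parameters: it flattens the parameter, re-applies the padding ZeRO used when the flat tensor was first sharded, and copies out this rank's chunk. A toy sketch of the pad-and-chunk step (hypothetical sizes and rank; not part of the commit):

```python
import torch
import torch.nn.functional as F

world_size, rank = 4, 1                       # hypothetical: 4 ranks, we are rank 1
p = torch.arange(10, dtype=torch.float32)     # a flattened working parameter

# Pad so the flat tensor splits evenly across ranks (10 -> 12 elements here).
padding_size = (world_size - p.numel() % world_size) % world_size
working = p.view(-1)
if padding_size > 0:
    working = F.pad(working, [0, padding_size])

shard = working.chunk(world_size)[rank]       # this rank's 3-element slice
print(shard)                                  # tensor([3., 4., 5.])
```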
@@ -92,14 +92,14 @@ follow the steps below to create a new distributed initialization.
 Gradient handlers are objects which execute the all-reduce operations on parameters' gradients. As different all-reduce
 strategies may be executed for different kinds of parallelism, users can
-inherit `colossalai.engine.gradient_handler.BaseGradientHandler` to implement their strategies. Currently, the library
+inherit `colossalai.legacy.engine.gradient_handler.BaseGradientHandler` to implement their strategies. Currently, the library
 uses the normal data parallel gradient handler which all-reduces the gradients across data parallel ranks. The data
 parallel gradient handler is added to the engine automatically if data parallel is detected. You can add your own
 gradient handler like below:
 ```python
-from colossalai.registry import GRADIENT_HANDLER
-from colossalai.engine import BaseGradientHandler
+from colossalai.legacy.registry import GRADIENT_HANDLER
+from colossalai.legacy.engine import BaseGradientHandler
 @GRADIENT_HANDLER.register_module
 class YourGradientHandler(BaseGradientHandler):
@@ -121,4 +121,5 @@ gradient_handlers = [
 Schedule entails how to execute a forward and backward pass. Currently, Colossal-AI provides pipeline and non-pipeline
 schedules. If you want to modify how the forward and backward passes are executed, you can
-inherit `colossalai.engine.schedule.BaseSchedule` and implement the `forward_back_step` function.
+inherit `colossalai.legacy.engine.schedule.BaseSchedule` and implement the `forward_back_step` function.
+<!-- doc-test-command: echo -->
@@ -36,14 +36,14 @@ import torch
 import torch.nn as nn
 from colossalai import nn as col_nn
 from colossalai.amp import AMP_TYPE
-from colossalai.builder.pipeline import partition_uniform
+from colossalai.legacy.builder.pipeline import partition_uniform
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
-from colossalai.engine.schedule import (InterleavedPipelineSchedule,
-                                        PipelineSchedule)
+from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
+                                               PipelineSchedule)
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
-from colossalai.trainer import Trainer, hooks
+from colossalai.legacy.trainer import Trainer, hooks
 from colossalai.utils.timer import MultiTimer
 from model_zoo.gpt import GPTLMLoss
 from torch.nn import functional as F
@@ -268,3 +268,4 @@ def train():
         return_output_label=False,
     )
 ```
+<!-- doc-test-command: echo -->
@@ -34,11 +34,11 @@ import colossalai
 import colossalai.nn as col_nn
 import torch
 import torch.nn as nn
-from colossalai.builder import build_pipeline_model
-from colossalai.engine.schedule import (InterleavedPipelineSchedule,
-                                        PipelineSchedule)
+from colossalai.legacy.builder import build_pipeline_model
+from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
+                                               PipelineSchedule)
 from colossalai.logging import disable_existing_loggers, get_dist_logger
-from colossalai.trainer import Trainer, hooks
+from colossalai.legacy.trainer import Trainer, hooks
 from colossalai.utils import MultiTimer, get_dataloader
 from timm.models import vision_transformer as vit
 from torchvision import transforms
@@ -51,17 +51,17 @@ from torchvision.datasets import CIFAR10
 Generally, we provide 3 ways to build a pipelined model:
-1. `colossalai.builder.build_pipeline_model_from_cfg`
-2. `colossalai.builder.build_pipeline_model`
+1. `colossalai.legacy.builder.build_pipeline_model_from_cfg`
+2. `colossalai.legacy.builder.build_pipeline_model`
 3. Split the model by stages by yourself
 When your memory can fit the model, you can use the first two methods to build your model; otherwise you must split the model by yourself. The first two methods first build the whole model on CPU, then split the model, and finally you can move the corresponding part of the model to GPU.
-`colossalai.builder.build_pipeline_model_from_cfg()` receives a model config file, and it can split the model uniformly (by layer) or balanced (by parameter size).
+`colossalai.legacy.builder.build_pipeline_model_from_cfg()` receives a model config file, and it can split the model uniformly (by layer) or balanced (by parameter size).
-If you are familiar with `PyTorch`, you can use `colossalai.builder.build_pipeline_model()`, which receives a `torch.nn.Sequential` model and splits it by layer uniformly.
+If you are familiar with `PyTorch`, you can use `colossalai.legacy.builder.build_pipeline_model()`, which receives a `torch.nn.Sequential` model and splits it by layer uniformly.
-In this tutorial, we will modify [TIMM/ViT](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py) to `torch.nn.Sequential` and then use `colossalai.builder.build_pipeline_model()` to build the pipelined model.
+In this tutorial, we will modify [TIMM/ViT](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py) to `torch.nn.Sequential` and then use `colossalai.legacy.builder.build_pipeline_model()` to build the pipelined model.
 When the data is **one** `Tensor`, you can use the positional argument in `forward()` of your model to get the data tensor. For the first stage of the pipeline, the first positional argument of `forward()` is the data tensor loaded from the data loader. For other stages, the first positional argument of `forward()` is the output tensor from the previous stage. Note that if the stage is not the last stage, the return of `forward()` must be a `Tensor`.
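To make the contract in the paragraph above concrete, here is a hypothetical middle stage: its `forward()` takes the previous stage's output tensor as the first positional argument and returns a single `Tensor` for the next stage (a sketch, not code from this commit):

```python
import torch
import torch.nn as nn


class MiddleStage(nn.Module):
    """Hypothetical non-first, non-last pipeline stage."""

    def __init__(self, hidden: int = 256):
        super().__init__()
        self.block = nn.Sequential(nn.Linear(hidden, hidden), nn.GELU())

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x is the previous stage's output; return a single Tensor.
        return self.block(x)
```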
@@ -245,3 +245,4 @@ def train():
         hooks=hook_list,
         display_progress=True)
 ```
+<!-- doc-test-command: echo -->
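As a usage note for the renamed builder, a minimal sketch (assumes a launched Colossal-AI context with pipeline parallelism configured; the `num_chunks` argument is our assumption about the builder's signature, not something this commit shows):

```python
import torch.nn as nn

from colossalai.legacy.builder import build_pipeline_model

# Build the whole model on CPU first, then let the builder split it
# uniformly by layer across the configured pipeline stages.
model = nn.Sequential(
    nn.Linear(784, 256), nn.ReLU(),
    nn.Linear(256, 256), nn.ReLU(),
    nn.Linear(256, 10),
)
pipeline_model = build_pipeline_model(model, num_chunks=1)
```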