Unverified commit fae6c92e, authored by Hongxin Liu and committed by GitHub

Merge branch 'main' into feature/shardformer

parents bd186784 ac178ca5
from torch.optim.lr_scheduler import OneCycleLR as _OneCycleLR
-from colossalai.registry import LR_SCHEDULERS
+from colossalai.legacy.registry import LR_SCHEDULERS
@LR_SCHEDULERS.register_module
......
from torch.optim.lr_scheduler import _LRScheduler
-from colossalai.registry import LR_SCHEDULERS
+from colossalai.legacy.registry import LR_SCHEDULERS
from .delayed import WarmupScheduler
......
+from torch.optim.lr_scheduler import ExponentialLR as _ExponentialLR
from torch.optim.lr_scheduler import LambdaLR as _LambdaLR
from torch.optim.lr_scheduler import MultiplicativeLR as _MultiplicativeLR
from torch.optim.lr_scheduler import StepLR as _StepLR
-from torch.optim.lr_scheduler import ExponentialLR as _ExponentialLR
-from colossalai.registry import LR_SCHEDULERS
+from colossalai.legacy.registry import LR_SCHEDULERS
@LR_SCHEDULERS.register_module
......
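The `@LR_SCHEDULERS.register_module` decorator seen in these hunks is the registry pattern that this merge relocates into `colossalai.legacy.registry`. As a rough, self-contained sketch of how such a decorator-based registry behaves (this `Registry` class is illustrative only, not Colossal-AI's actual implementation):

```python
class Registry:
    """Minimal name-to-class registry, for illustration only."""

    def __init__(self, name: str):
        self.name = name
        self._registry = {}

    def register_module(self, cls):
        # Used as a decorator: stores the class under its own name.
        self._registry[cls.__name__] = cls
        return cls

    def get(self, name: str):
        return self._registry[name]


LR_SCHEDULERS = Registry('lr_schedulers')


@LR_SCHEDULERS.register_module
class OneCycleLR:
    def __init__(self, max_lr: float):
        self.max_lr = max_lr


# A config-driven builder can later look the class up by name:
scheduler = LR_SCHEDULERS.get('OneCycleLR')(max_lr=0.1)
```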
......
@@ -4,7 +4,7 @@ from typing import Optional
import torch
from colossalai.kernel.op_builder import CPUAdamBuilder
-from colossalai.registry import OPTIMIZERS
+from colossalai.legacy.registry import OPTIMIZERS
from .nvme_optimizer import NVMeOptimizer
......
......
@@ -8,7 +8,7 @@ Licensed under the MIT License.
'''
import torch
-from colossalai.registry import OPTIMIZERS
+from colossalai.legacy.registry import OPTIMIZERS
from colossalai.utils import multi_tensor_applier
......
# modified from https://github.com/NVIDIA/apex/blob/master/apex/optimizers/fused_lamb.py
import torch
-from colossalai.registry import OPTIMIZERS
+from colossalai.legacy.registry import OPTIMIZERS
from colossalai.utils import multi_tensor_applier
......
......
@@ -2,7 +2,7 @@
import torch
from torch.optim.optimizer import Optimizer, required
-from colossalai.registry import OPTIMIZERS
+from colossalai.legacy.registry import OPTIMIZERS
from colossalai.utils import multi_tensor_applier
......
......
@@ -4,7 +4,7 @@ import torch
from torch.optim import Adam
from colossalai.kernel.op_builder import FusedOptimBuilder
-from colossalai.registry import OPTIMIZERS
+from colossalai.legacy.registry import OPTIMIZERS
from colossalai.utils import multi_tensor_applier
from .cpu_adam import CPUAdam
......
......
@@ -5,7 +5,7 @@ Adapted from the pytorch-lamb library at https://github.com/cybertronai/pytorch-
import torch
from torch.optim import Optimizer
-from colossalai.registry import OPTIMIZERS
+from colossalai.legacy.registry import OPTIMIZERS
@OPTIMIZERS.register_module
......
......
@@ -5,7 +5,7 @@ from typing import Iterable
import torch
from torch.optim import Optimizer
-from colossalai.registry import OPTIMIZERS
+from colossalai.legacy.registry import OPTIMIZERS
@OPTIMIZERS.register_module
......
@@ -22,28 +22,24 @@ class Lars(Optimizer):
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
"""
-    def __init__(
-        self,
-        params: Iterable[torch.nn.Parameter],
-        lr=1e-3,
-        momentum=0,
-        eeta=1e-3,
-        weight_decay=0,
-        epsilon=0.0
-    ) -> None:
+    def __init__(self,
+                 params: Iterable[torch.nn.Parameter],
+                 lr=1e-3,
+                 momentum=0,
+                 eeta=1e-3,
+                 weight_decay=0,
+                 epsilon=0.0) -> None:
        if not isinstance(lr, float) or lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
-            raise ValueError(
-                "Invalid weight_decay value: {}".format(weight_decay))
+            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        if eeta <= 0 or eeta > 1:
            raise ValueError("Invalid eeta value: {}".format(eeta))
        if epsilon < 0:
            raise ValueError("Invalid epsilon value: {}".format(epsilon))
-        defaults = dict(lr=lr, momentum=momentum,
-                        weight_decay=weight_decay, eeta=eeta, epsilon=epsilon, lars=True)
+        defaults = dict(lr=lr, momentum=momentum, weight_decay=weight_decay, eeta=eeta, epsilon=epsilon, lars=True)
        super().__init__(params, defaults)
......
@@ -76,11 +72,9 @@ class Lars(Optimizer):
            if lars:
                w_norm = torch.norm(p)
                g_norm = torch.norm(p.grad)
-                trust_ratio = torch.where(
-                    w_norm > 0 and g_norm > 0,
-                    eeta * w_norm / (g_norm + weight_decay * w_norm + eps),
-                    torch.ones_like(w_norm)
-                )
+                trust_ratio = torch.where(w_norm > 0 and g_norm > 0,
+                                          eeta * w_norm / (g_norm + weight_decay * w_norm + eps),
+                                          torch.ones_like(w_norm))
                trust_ratio.clamp_(0.0, 50)
                scaled_lr *= trust_ratio.item()
            if weight_decay != 0:
......
@@ -90,8 +84,7 @@ class Lars(Optimizer):
            if momentum != 0:
                param_state = self.state[p]
                if 'momentum_buffer' not in param_state:
-                    buf = param_state['momentum_buffer'] = torch.clone(
-                        decayed_grad).detach()
+                    buf = param_state['momentum_buffer'] = torch.clone(decayed_grad).detach()
                else:
                    buf = param_state['momentum_buffer']
                buf.mul_(momentum).add_(decayed_grad)
......
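The reflowed `torch.where` above computes the LARS trust ratio, `eeta * ||w|| / (||g|| + weight_decay * ||w|| + eps)`, falling back to 1 when either norm is zero. A standalone sketch of the same arithmetic on toy tensors (using `torch.logical_and` for the tensor-valued condition; this is not the optimizer itself, and the base learning rate is a placeholder):

```python
import torch

# Toy weight and gradient; hyperparameters mirror the defaults above.
w = torch.randn(128)
g = torch.randn(128)
eeta, weight_decay, eps = 1e-3, 0.0, 0.0

w_norm = torch.norm(w)
g_norm = torch.norm(g)
# Layer-wise trust ratio; 1.0 when either norm is zero.
trust_ratio = torch.where(
    torch.logical_and(w_norm > 0, g_norm > 0),
    eeta * w_norm / (g_norm + weight_decay * w_norm + eps),
    torch.ones_like(w_norm),
)
trust_ratio.clamp_(0.0, 50)             # same cap as the optimizer
scaled_lr = 1e-3 * trust_ratio.item()   # per-layer learning-rate scale
```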
......
@@ -4,15 +4,15 @@
import math
import random
-import numpy as np
-from typing import TypeVar, Iterator
+from typing import Iterator, TypeVar
+import numpy as np
import torch
-from torch.utils.data import Sampler, Dataset, DataLoader
+from torch.utils.data import DataLoader, Dataset, Sampler
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
-from colossalai.registry import DATA_SAMPLERS
+from colossalai.legacy.registry import DATA_SAMPLERS
T_co = TypeVar('T_co', covariant=True)
......
@@ -30,11 +30,7 @@ class DataParallelSampler(Sampler):
            the batch size, then the last batch will be smaller, defaults to False.
    """

-    def __init__(self,
-                 dataset: Dataset,
-                 shuffle: bool = False,
-                 seed: int = 0,
-                 drop_last: bool = False) -> None:
+    def __init__(self, dataset: Dataset, shuffle: bool = False, seed: int = 0, drop_last: bool = False) -> None:
        self.dataset = dataset
        self.num_replicas = gpc.get_world_size(ParallelMode.DATA)
        self.rank = gpc.get_local_rank(ParallelMode.DATA)
......
@@ -54,8 +50,7 @@ class DataParallelSampler(Sampler):
                self.num_replicas  # type: ignore[arg-type]
            )
        else:
-            self.num_samples = math.ceil(
-                len(self.dataset) / self.num_replicas)  # type: ignore[arg-type]
+            self.num_samples = math.ceil(len(self.dataset) / self.num_replicas)  # type: ignore[arg-type]
        self.total_size = self.num_samples * self.num_replicas
        self.shuffle = shuffle
        self.seed = seed
......
@@ -72,7 +67,7 @@ class DataParallelSampler(Sampler):
            # set_epoch manually
            self.epoch += 1
        else:
-            indices = list(range(len(self.dataset))) # type: ignore[arg-type]
+            indices = list(range(len(self.dataset)))    # type: ignore[arg-type]
        if not self.drop_last:
            # add extra samples to make it evenly divisible
......
@@ -80,8 +75,7 @@ class DataParallelSampler(Sampler):
            if padding_size <= len(indices):
                indices += indices[:padding_size]
            else:
-                indices += (indices * math.ceil(padding_size /
-                                                len(indices)))[:padding_size]
+                indices += (indices * math.ceil(padding_size / len(indices)))[:padding_size]
        else:
            # remove tail of data to make it evenly divisible.
            indices = indices[:self.total_size]
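The padding branch compacted above exists to make the index list evenly divisible across data-parallel ranks. A worked example of the same arithmetic, assuming a hypothetical 10-sample dataset on 4 ranks:

```python
import math

dataset_len, num_replicas = 10, 4                    # hypothetical sizes
num_samples = math.ceil(dataset_len / num_replicas)  # 3 samples per rank
total_size = num_samples * num_replicas              # 12 indices in total

indices = list(range(dataset_len))
padding_size = total_size - len(indices)             # 2 indices short

if padding_size <= len(indices):
    # Reuse the head of the list: [0..9] -> [0..9, 0, 1]
    indices += indices[:padding_size]
else:
    # Dataset smaller than the deficit: tile it until it is long enough.
    indices += (indices * math.ceil(padding_size / len(indices)))[:padding_size]

assert len(indices) == total_size  # every rank now gets exactly 3 indices
```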
......
@@ -109,8 +103,8 @@ class DataParallelSampler(Sampler):
def get_dataloader(dataset,
                   shuffle=False,
-                  seed=1024,
-                  add_sampler=True,
+                  seed=1024,
+                  add_sampler=True,
                   drop_last=False,
                   pin_memory=False,
                   num_workers=0,
......
-import os
-from typing import List
-from colossalai.engine import Engine
-from torch.profiler import profile as torch_profile
-from torch.profiler.profiler import ProfilerAction
-from typing import Any, Callable, Iterable, Optional
-from torch.autograd import ProfilerActivity
+import gzip
import json
+import os
import tempfile
-import gzip
+from typing import Any, Callable, Iterable, List, Optional
+from torch.autograd import ProfilerActivity
+from torch.profiler import profile as torch_profile
+from torch.profiler.profiler import ProfilerAction
+from colossalai.legacy.engine import Engine
+from colossalai.logging import get_dist_logger
from colossalai.utils.profiler.extention import ProfilerExtension
from colossalai.utils.profiler.stateful_tensor_mem_extention import StatefulTensorMemoryProfilerExtention
-from colossalai.logging import get_dist_logger
class profile(torch_profile):
......
import os
import threading
import time
-import torch
from enum import Enum
from typing import List
-from colossalai.gemini.stateful_tensor import StatefulTensor
+import torch
from colossalai.gemini.ophooks import BaseOpHook
-from colossalai.engine import Engine
+from colossalai.gemini.stateful_tensor import StatefulTensor
+from colossalai.legacy.engine import Engine
from colossalai.utils.profiler.extention import ProfilerExtension
......
import torch
-from colossalai.registry import OPHOOKS
+from colossalai.legacy.registry import OPHOOKS
from . import BaseOpHook
......
import torch
-from colossalai.registry import OPHOOKS
+from colossalai.legacy.registry import OPHOOKS
from . import BaseOpHook
......
......
@@ -3,8 +3,8 @@ from typing import Optional
import torch
import torch.distributed as dist
+from colossalai.legacy.registry import OPHOOKS
from colossalai.logging import get_dist_logger
-from colossalai.registry import OPHOOKS
from colossalai.utils import get_current_device
from colossalai.zero.gemini.memory_tracer import MemStatsCollector
from colossalai.zero.legacy.gemini.ophooks import BaseOpHook
......
......
@@ -6,6 +6,7 @@ from typing import Dict, Iterator, Optional, Tuple
import torch
import torch.distributed as dist
import torch.nn as nn
+from torch.distributed import ProcessGroup
from torch.optim import Optimizer
......
@@ -617,3 +618,19 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
            ret_block_size += current_block_size
        yield ret_block, ret_block_size
+
+    def update_master_params(self, model: nn.Module) -> None:
+        """Update master params from working params
+
+        Args:
+            model (nn.Module): The model to update master params
+        """
+        for p in model.parameters():
+            p_id = id(p)
+            if p_id in self._param_store.working_to_master_param:
+                master_param = self._param_store.working_to_master_param[p_id]
+                padding_size = self._param_store.get_param_padding_size(p)
+                working_param = p.data.view(-1)
+                if padding_size > 0:
+                    working_param = torch.nn.functional.pad(working_param, [0, padding_size])
+                master_param.copy_(working_param.chunk(self._world_size)[self._local_rank])
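The added `update_master_params` flattens each working parameter, pads it so its length divides evenly by the world size, and copies this rank's chunk into the master copy. A toy illustration of that pad-then-chunk step on bare tensors (hypothetical sizes; it does not use the optimizer's parameter store):

```python
import torch
import torch.nn.functional as F

world_size, local_rank = 4, 1               # hypothetical ZeRO group
p = torch.arange(10, dtype=torch.float32)   # working param with 10 elements

flat = p.view(-1)
padding_size = (world_size - flat.numel() % world_size) % world_size  # 2
if padding_size > 0:
    flat = F.pad(flat, [0, padding_size])   # length 12, zeros appended

shard = flat.chunk(world_size)[local_rank]  # this rank's 3-element slice
# In the optimizer, master_param.copy_(shard) would refresh the master copy.
print(shard)  # tensor([3., 4., 5.])
```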
......
@@ -92,14 +92,14 @@ follow the steps below to create a new distributed initialization.
Gradient handlers are objects which execute the all-reduce operations on parameters' gradients. As different all-reduce
strategies may be executed for different kinds of parallelism, users can
-inherit `colossalai.engine.gradient_handler.BaseGradientHandler` to implement their strategies. Currently, the library
+inherit `colossalai.legacy.engine.gradient_handler.BaseGradientHandler` to implement their strategies. Currently, the library
uses the normal data parallel gradient handler which all-reduces the gradients across data parallel ranks. The data
parallel gradient handler is added to the engine automatically if data parallel is detected. You can add your own
gradient handler like below:

```python
-from colossalai.registry import GRADIENT_HANDLER
-from colossalai.engine import BaseGradientHandler
+from colossalai.legacy.registry import GRADIENT_HANDLER
+from colossalai.legacy.engine import BaseGradientHandler


@GRADIENT_HANDLER.register_module
class YourGradientHandler(BaseGradientHandler):
......
@@ -121,4 +121,5 @@ gradient_handlers = [
Schedule entails how to execute a forward and backward pass. Currently, Colossal-AI provides pipeline and non-pipeline
schedules. If you want to modify how the forward and backward passes are executed, you can
-inherit `colossalai.engine.schedule.BaseSchedule` and implement the `forward_backward_step` function.
+inherit `colossalai.legacy.engine.schedule.BaseSchedule` and implement the `forward_backward_step` function.
+<!-- doc-test-command: echo -->
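For a fuller picture than the truncated skeleton in the hunk above, here is a hedged sketch of what a complete handler might look like, assuming the base class stores the model as `self._model` and invokes `handle_gradient()` after the backward pass (the handler name and body are hypothetical, not part of this commit):

```python
import torch.distributed as dist

from colossalai.legacy.engine import BaseGradientHandler
from colossalai.legacy.registry import GRADIENT_HANDLER


@GRADIENT_HANDLER.register_module
class AllReduceGradientHandler(BaseGradientHandler):
    """Hypothetical handler: sums gradients across ranks, then averages."""

    def handle_gradient(self):
        world_size = dist.get_world_size()
        for param in self._model.parameters():
            if param.grad is not None:
                # Sum the gradient over all ranks, then divide to average.
                dist.all_reduce(param.grad)
                param.grad.div_(world_size)
```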
......
@@ -36,14 +36,14 @@ import torch
import torch.nn as nn
from colossalai import nn as col_nn
from colossalai.amp import AMP_TYPE
-from colossalai.builder.pipeline import partition_uniform
+from colossalai.legacy.builder.pipeline import partition_uniform
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
-from colossalai.engine.schedule import (InterleavedPipelineSchedule,
+from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
                                        PipelineSchedule)
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
-from colossalai.trainer import Trainer, hooks
+from colossalai.legacy.trainer import Trainer, hooks
from colossalai.utils.timer import MultiTimer
from model_zoo.gpt import GPTLMLoss
from torch.nn import functional as F
......
@@ -268,3 +268,4 @@ def train():
        return_output_label=False,
    )
```
+<!-- doc-test-command: echo -->
......
@@ -34,11 +34,11 @@ import colossalai
import colossalai.nn as col_nn
import torch
import torch.nn as nn
-from colossalai.builder import build_pipeline_model
-from colossalai.engine.schedule import (InterleavedPipelineSchedule,
+from colossalai.legacy.builder import build_pipeline_model
+from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
                                        PipelineSchedule)
from colossalai.logging import disable_existing_loggers, get_dist_logger
-from colossalai.trainer import Trainer, hooks
+from colossalai.legacy.trainer import Trainer, hooks
from colossalai.utils import MultiTimer, get_dataloader
from timm.models import vision_transformer as vit
from torchvision import transforms
......
@@ -51,17 +51,17 @@ from torchvision.datasets import CIFAR10
Generally, we provide 3 ways to build a pipelined model:

-1. `colossalai.builder.build_pipeline_model_from_cfg`
-2. `colossalai.builder.build_pipeline_model`
+1. `colossalai.legacy.builder.build_pipeline_model_from_cfg`
+2. `colossalai.legacy.builder.build_pipeline_model`
3. Split the model by stages by yourself

When your memory can fit the model, you can use the first two methods to build your model; otherwise, you must split the model by yourself. The first two methods first build the whole model on CPU, then split the model, and finally you can just move the corresponding parts of the model to GPU (see the sketch after this section).

-`colossalai.builder.build_pipeline_model_from_cfg()` receives a config file of the model, and it can split the model uniformly (by layer) or in a balanced way (by parameter size).
+`colossalai.legacy.builder.build_pipeline_model_from_cfg()` receives a config file of the model, and it can split the model uniformly (by layer) or in a balanced way (by parameter size).

-If you are familiar with `PyTorch`, you can use `colossalai.builder.build_pipeline_model()`, which receives a `torch.nn.Sequential` model and splits it by layer uniformly.
+If you are familiar with `PyTorch`, you can use `colossalai.legacy.builder.build_pipeline_model()`, which receives a `torch.nn.Sequential` model and splits it by layer uniformly.

-In this tutorial, we will modify [TIMM/ViT](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py) to `torch.nn.Sequential` and then use `colossalai.builder.build_pipeline_model()` to build the pipelined model.
+In this tutorial, we will modify [TIMM/ViT](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py) to `torch.nn.Sequential` and then use `colossalai.legacy.builder.build_pipeline_model()` to build the pipelined model.

When the data is **one** `Tensor`, you can use the positional argument in `forward()` of your model to get the data tensor. For the first stage of the pipeline, the first positional argument of `forward()` is the data tensor loaded from the data loader. For other stages, the first positional argument of `forward()` is the output tensor from the previous stage. Note that if the stage is not the last stage, the return of `forward()` must be a `Tensor`.
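To make option 3 and the `torch.nn.Sequential` requirement concrete, here is a minimal sketch of flattening a ViT-like model into one `Sequential` that `colossalai.legacy.builder.build_pipeline_model()` could then cut uniformly by layer (the modules below are simplified stand-ins, not TIMM's actual attributes):

```python
import torch.nn as nn

# Simplified stand-ins for patch embedding, transformer blocks, and head.
embed = nn.Linear(768, 768)
blocks = [nn.TransformerEncoderLayer(d_model=768, nhead=12, batch_first=True) for _ in range(12)]
head = nn.Linear(768, 1000)

# One flat Sequential with 14 layers: a uniform split over 4 pipeline
# stages would give each stage 3-4 consecutive modules.
sequential_model = nn.Sequential(embed, *blocks, head)
```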
......
@@ -245,3 +245,4 @@ def train():
                hooks=hook_list,
                display_progress=True)
```
+<!-- doc-test-command: echo -->