Commit da3f0934 authored by zhuwenwen
Browse files

delete unused files

parent c4dd1fd4
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import torch.nn as nn
from torch import Tensor
from typing import Iterable, Any
from colossalai.nn.optimizer import ColossalaiOptimizer
from torch.nn.parallel.distributed import DistributedDataParallel
from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler
from torch.utils.data import DataLoader
from colossalai.utils import conditional_context
from colossalai.engine import BaseGradientHandler
class GradAccumOptimizer(ColossalaiOptimizer):
    """Optimizer wrapper that postpones the real update until enough backward passes ran.

    Every call to :meth:`backward` / :meth:`backward_by_grad` increments an internal
    counter. :meth:`step` and :meth:`clip_grad_norm` are no-ops while that counter is
    below ``accumulate_size``; :meth:`step` resets the counter once it forwards the call.

    :param optim: Your optimizer object
    :type optim: :class:`torch.optim.Optimizer`
    :param accumulate_size: The number of steps to accumulate gradients
    :type accumulate_size: int
    :param model: Your model object, inspected to see whether it is DDP so that its
        ``no_sync()`` context can be used on the intermediate backward passes
    :type model: :class:`torch.nn.Module`
    """

    def __init__(self, optim: Optimizer, accumulate_size: int, model: nn.Module = None):
        super().__init__(optim)
        self.accumulate_step = 0
        self.accumulate_size = accumulate_size

        # remember the model so that PyTorch DDP's automatic all-reduce can be handled
        self.model = model
        self.is_torch_ddp = isinstance(self.model, DistributedDataParallel)

    def zero_grad(self, *args, **kwargs):
        """Clear gradients, but only when the accumulation counter is at zero."""
        if self.accumulate_step != 0:
            return
        self.optim.zero_grad(*args, **kwargs)

    def step(self, *args, **kwargs):
        """Forward to the wrapped optimizer once the counter reached ``accumulate_size``.

        :return: ``None`` while still accumulating, otherwise whatever the wrapped
            optimizer's ``step`` returns
        """
        if self.accumulate_step < self.accumulate_size:
            return None
        self.accumulate_step = 0
        return self.optim.step(*args, **kwargs)

    def clip_grad_norm(self, model: nn.Module, max_norm: float):
        """Clip gradients via the wrapped optimizer; skipped while still accumulating."""
        if self.accumulate_step < self.accumulate_size:
            return
        self.optim.clip_grad_norm(model, max_norm)

    def backward(self, loss: Tensor):
        """Count one accumulation step and back-propagate ``loss / accumulate_size``."""
        self.accumulate_step += 1

        if not self.is_torch_ddp:
            self.optim.backward(loss / self.accumulate_size)
            return

        # for DDP, enter no_sync() on every pass except the one that completes the cycle
        skip_sync = self.accumulate_step < self.accumulate_size
        with conditional_context(self.model.no_sync(), enable=skip_sync):
            self.optim.backward(loss / self.accumulate_size)

    def backward_by_grad(self, tensor: Tensor, grad: Tensor):
        """Count one accumulation step and back-propagate ``tensor`` with ``grad``."""
        self.accumulate_step += 1

        if self.is_torch_ddp and self.accumulate_step < self.accumulate_size:
            # intermediate pass under DDP: run inside no_sync()
            with self.model.no_sync():
                self.optim.backward_by_grad(tensor, grad)
            return

        self.optim.backward_by_grad(tensor, grad)
class GradAccumDataloader:
    """A wrapper for a dataloader to enable gradient accumulation by dropping the last incomplete steps.

    For example, if a dataloader has 10 batches of data and the accumulate size is 4, the model
    parameters will be updated only twice, at step 4 and step 8. The last two batches do not form a
    complete 4-step cycle, so this class does not yield them. If the dataloader is not a standard
    PyTorch dataloader (e.g. a Dali dataloader), the remaining 2 batches are still pulled from the
    underlying iterator and discarded, so that the iterator is fully consumed.

    :param dataloader: Your dataloader object
    :type dataloader: Iterable
    :param accumulate_size: The number of steps to accumulate gradients
    :type accumulate_size: int
    """

    def __init__(self, dataloader: Iterable, accumulate_size: int) -> None:
        self.dataloader = dataloader
        # anything that is not a torch DataLoader gets its leftover batches drained
        self.consume_remain_data = not isinstance(dataloader, DataLoader)
        # largest multiple of accumulate_size that fits into len(dataloader)
        self.steps_per_epoch = len(dataloader) - len(dataloader) % accumulate_size

    def __getattr__(self, __name: str) -> Any:
        # __getattr__ only runs when normal lookup fails. If ``dataloader`` itself is
        # missing (instance created without __init__, e.g. by copy/pickle), delegating
        # would look it up again and recurse forever, so fail with AttributeError instead.
        if __name == 'dataloader':
            raise AttributeError(__name)
        return getattr(self.dataloader, __name)

    def __len__(self):
        return self.steps_per_epoch

    def __iter__(self):
        self._cur_step = 0
        self._dataiter = iter(self.dataloader)
        return self

    def __next__(self) -> Any:
        if self._cur_step >= self.steps_per_epoch:
            raise StopIteration

        self._cur_step += 1
        # fetch the batch for this step BEFORE draining, otherwise the final batch
        # would be thrown away together with the leftovers
        data = next(self._dataiter)

        if self._cur_step == self.steps_per_epoch and self.consume_remain_data:
            # this is to handle non standard pytorch dataloader
            # such as dali dataloader: exhaust whatever is left after the last full cycle
            while True:
                try:
                    _ = next(self._dataiter)
                except StopIteration:
                    break
        return data
class GradAccumLrSchedulerByStep(_LRScheduler):
    """LR scheduler wrapper that advances the wrapped scheduler once per accumulation cycle.

    Each call to :meth:`step` bumps an internal counter; the wrapped scheduler's ``step``
    is invoked (and the counter reset) only when the counter reaches ``accumulate_size``.
    The remaining methods, and any attribute not found on the wrapper, are forwarded to
    the wrapped scheduler.

    :param lr_scheduler: Your lr scheduler object
    :type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`
    :param accumulate_size: The number of steps to accumulate gradients
    :type accumulate_size: int
    """

    def __init__(self, lr_scheduler: _LRScheduler, accumulate_size: int) -> None:
        # NOTE: the parent __init__ is not called here; state lives in the wrapped scheduler
        self.accumulate_step = 0
        self.accumulate_size = accumulate_size
        self.lr_scheduler = lr_scheduler

    @staticmethod
    def compute_effective_steps_per_epoch(dataloader: Iterable, accumulate_size: int):
        """Return how many full accumulation cycles fit into ``len(dataloader)``."""
        full_cycles = len(dataloader) // accumulate_size
        return full_cycles

    def __getattr__(self, __name: str) -> Any:
        wrapped = self.lr_scheduler
        return getattr(wrapped, __name)

    def step(self, *args, **kwargs):
        """Count one call; forward to the wrapped scheduler when a cycle completes."""
        self.accumulate_step += 1
        if self.accumulate_step < self.accumulate_size:
            return
        self.accumulate_step = 0
        self.lr_scheduler.step(*args, **kwargs)

    def get_lr(self):
        """Forward to the wrapped scheduler's ``get_lr``."""
        wrapped = self.lr_scheduler
        return wrapped.get_lr()

    def get_last_lr(self):
        """Forward to the wrapped scheduler's ``get_last_lr``."""
        wrapped = self.lr_scheduler
        return wrapped.get_last_lr()

    def print_lr(self, *args, **kwargs):
        """Forward to the wrapped scheduler's ``print_lr``; nothing is returned."""
        self.lr_scheduler.print_lr(*args, **kwargs)

    def state_dict(self) -> dict:
        """Return the wrapped scheduler's state dict."""
        wrapped = self.lr_scheduler
        return wrapped.state_dict()

    def load_state_dict(self, state_dict: dict) -> None:
        """Load ``state_dict`` into the wrapped scheduler."""
        self.lr_scheduler.load_state_dict(state_dict)
class GradAccumGradientHandler:
    """Gradient handler wrapper that only fires once per accumulation cycle.

    Each call to :meth:`handle_gradient` bumps an internal counter; the wrapped handler
    is invoked (and the counter reset) only when the counter reaches ``accumulate_size``.

    :param grad_handler: Your gradient handler object
    :type grad_handler: :class:`colossalai.engine.BaseGradientHandler`
    :param accumulate_size: The number of steps to accumulate gradients
    :type accumulate_size: int
    """

    def __init__(self, grad_handler: BaseGradientHandler, accumulate_size: int) -> None:
        assert isinstance(grad_handler, BaseGradientHandler), \
            f'expected grad_handler to be type BaseGradientHandler, but got {type(grad_handler)}'
        self.accumulate_step = 0
        self.accumulate_size = accumulate_size
        self.grad_handler = grad_handler

    def handle_gradient(self):
        """Count one call; delegate to the wrapped handler when a cycle completes."""
        self.accumulate_step += 1
        if self.accumulate_step < self.accumulate_size:
            return
        self.accumulate_step = 0
        self.grad_handler.handle_gradient()
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import gc
import psutil
import torch
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.logging import get_dist_logger
def bytes_to_GB(val, decimal=2):
    """Convert a byte count to gigabytes using binary units (1 GB = 1024 ** 3 bytes).

    :param val: Number of bytes to convert
    :param decimal: Number of decimal places to round to, defaults to 2
    :return: The value in GB, rounded
    """
    bytes_per_gb = 1024 ** 3
    gigabytes = val / bytes_per_gb
    return round(gigabytes, decimal)
def bytes_to_MB(val, decimal=2):
    """Convert a byte count to megabytes using binary units (1 MB = 1024 ** 2 bytes).

    :param val: Number of bytes to convert
    :param decimal: Number of decimal places to round to, defaults to 2
    :return: The value in MB, rounded
    """
    bytes_per_mb = 1024 ** 2
    megabytes = val / bytes_per_mb
    return round(megabytes, decimal)
def report_memory_usage(message, logger=None, report_cpu=False):
    """Log current and peak CUDA memory usage (in MB), optionally with host RAM usage.

    After logging, the CUDA peak-memory counters are reset (when the installed torch
    provides ``reset_peak_memory_stats``) so the next call reports a fresh peak.

    :param message: A prefix message to add in the log
    :type message: str
    :param logger: An instance of :class:`colossalai.logging.DistributedLogger`; when
        omitted, the logger returned by ``get_dist_logger()`` is used
    :type logger: :class:`colossalai.logging.DistributedLogger`, optional
    :param report_cpu: Whether to report CPU memory
    :type report_cpu: bool, optional
    :raises EnvironmentError: Raise error if no distributed environment has been initialized
    """
    if not gpc.is_initialized(ParallelMode.GLOBAL):
        raise EnvironmentError("No distributed environment is initialized")

    gpu_allocated = bytes_to_MB(torch.cuda.memory_allocated())
    gpu_max_allocated = bytes_to_MB(torch.cuda.max_memory_allocated())
    gpu_cached = bytes_to_MB(torch.cuda.memory_reserved())
    gpu_max_cached = bytes_to_MB(torch.cuda.max_memory_reserved())

    full_log = (f"{message}: GPU: allocated {gpu_allocated} MB, max allocated {gpu_max_allocated} MB, "
                f"cached: {gpu_cached} MB, max cached: {gpu_max_cached} MB")

    if report_cpu:
        # collect garbage explicitly first so that the RAM figures are not inflated
        # by objects that are merely waiting to be collected
        gc.collect()
        vm_stats = psutil.virtual_memory()
        vm_used = bytes_to_MB(vm_stats.total - vm_stats.available)
        full_log = full_log + f", CPU Virtual Memory: used = {vm_used} MB, percent = {vm_stats.percent}%"

    logger = get_dist_logger() if logger is None else logger
    logger.info(full_log)

    # reset the peak counters so that the next report covers only the period since this one
    reset_peak = getattr(torch.cuda, "reset_peak_memory_stats", None)    # pytorch 1.4+
    if reset_peak is not None:
        reset_peak()
from .multi_tensor_apply import MultiTensorApply
# Module-level applier instance; 2048 * 32 (= 65536) is passed as its chunk size.
multi_tensor_applier = MultiTensorApply(2048 * 32)
# modified from https://github.com/NVIDIA/apex/blob/master/apex/multi_tensor_apply/multi_tensor_apply.py
class MultiTensorApply(object):
    """
    Callable that forwards a list of tensor lists to an operation together with a chunk size.

    Construction tries to import the ``colossal_C`` extension and records on the class
    whether that succeeded; calling the object raises if it did not.

    :param chunk_size: Size of a chunk
    :type chunk_size: int
    """

    available = False
    warned = False

    def __init__(self, chunk_size):
        try:
            import colossal_C    # noqa: F401 -- imported only to probe availability
        except ImportError as err:
            # remember the failure on the class so check_avail() can report it later
            MultiTensorApply.available = False
            MultiTensorApply.import_err = err
        else:
            MultiTensorApply.available = True
            self.chunk_size = chunk_size

    def check_avail(self):
        """Raise ``RuntimeError`` if the extension import failed."""
        if MultiTensorApply.available:
            return
        raise RuntimeError(
            "Attempted to call MultiTensorApply method, but MultiTensorApply "
            "is not available, possibly because Apex was installed without "
            "--cpp_ext --cuda_ext. Original import error message:",
            MultiTensorApply.import_err)

    def __call__(self, op, noop_flag_buffer, tensor_lists, *args):
        """Invoke ``op(chunk_size, noop_flag_buffer, tensor_lists, *args)`` and return its result."""
        self.check_avail()
        return op(self.chunk_size, noop_flag_buffer, tensor_lists, *args)
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import time
from typing import Iterator, Tuple

from .cuda import synchronize
class Timer:
    """A stopwatch that measures start-stop intervals and can keep a history of them.
    """

    def __init__(self):
        self._history = []
        self._elapsed = 0
        self._start_time = time.time()
        self._started = False

    @property
    def has_history(self):
        """``True`` when at least one interval has been recorded in the history."""
        return bool(self._history)

    def start(self):
        """Zero the last elapsed value, synchronize, then record the start time.
        """
        self._elapsed = 0
        synchronize()
        self._start_time = time.time()
        self._started = True

    def stop(self, keep_in_history: bool = False):
        """Synchronize, stop the timer and return the interval since the recorded start time.

        :param keep_in_history: Whether to append this interval to the history, defaults to False
        :type keep_in_history: bool, optional
        :return: Start-stop interval in seconds
        :rtype: float
        """
        synchronize()
        interval = time.time() - self._start_time
        if keep_in_history:
            self._history.append(interval)
        self._started = False
        self._elapsed = interval
        return interval

    def get_history_mean(self):
        """Arithmetic mean of all intervals kept in the history.

        :return: Mean of time intervals
        :rtype: float
        """
        recorded = self._history
        return sum(recorded) / len(recorded)

    def get_history_sum(self):
        """Total of all intervals kept in the history.

        :return: Sum of time intervals
        :rtype: float
        """
        recorded = self._history
        return sum(recorded)

    def get_elapsed_time(self):
        """Return the most recent start-stop interval.

        .. note:: Use it only when timer is not in progress

        :return: The last time interval (0 if none has been measured since start/reset)
        :rtype: float
        """
        assert not self._started, 'Timer is still in progress'
        return self._elapsed

    def reset(self):
        """Drop the history and return the timer to its stopped, zero-elapsed state.
        """
        self._elapsed = 0
        self._started = False
        self._history = []
class MultiTimer:
    """A collection of named :class:`Timer` objects that can be switched on or off as a whole.

    :param on: Whether the timer is enabled. Default is True
    :type on: bool, optional
    """

    def __init__(self, on: bool = True):
        self._on = on
        self._timers = dict()

    def start(self, name: str):
        """Start the timer called ``name``, creating it on first use.

        Does nothing when the multi-timer is switched off.

        :param name: Timer's key
        :type name: str
        """
        if self._on:
            if name not in self._timers:
                self._timers[name] = Timer()
            return self._timers[name].start()

    def stop(self, name: str, keep_in_history: bool):
        """Stop the timer called ``name``.

        :param name: Timer's key
        :type name: str
        :param keep_in_history: Whether to record this start-stop interval into the timer's history
        :type keep_in_history: bool
        :return: The measured interval, or ``None`` when the multi-timer is switched off
        """
        if self._on:
            return self._timers[name].stop(keep_in_history)
        else:
            return None

    def get_timer(self, name):
        """Look up a timer by name.

        :param name: Timer's key
        :return: The timer registered under ``name``
        :rtype: Timer
        """
        return self._timers[name]

    def reset(self, name=None):
        """Reset timers. Does nothing when the multi-timer is switched off.

        :param name: If name is designated, the named timer will be reset and others will not,
            defaults to None (reset every timer)
        :type name: optional
        """
        if self._on:
            if name is not None:
                self._timers[name].reset()
            else:
                # iterate the Timer objects; iterating the dict itself yields only the
                # string keys, which have no reset() method
                for timer in self._timers.values():
                    timer.reset()

    def is_on(self):
        """Return whether the multi-timer is currently enabled."""
        return self._on

    def set_status(self, mode: bool):
        """Enable (``True``) or disable (``False``) the multi-timer."""
        self._on = mode

    def __iter__(self) -> Iterator[Tuple[str, "Timer"]]:
        """Yield ``(name, timer)`` pairs for every registered timer."""
        for name, timer in self._timers.items():
            yield name, timer
import torch
import torch.nn as nn
from torch.optim import Optimizer
from colossalai.amp.naive_amp import NaiveAMPModel
from colossalai.utils import is_no_pp_or_last_stage
from colossalai.core import global_context as gpc
from colossalai.context.parallel_mode import ParallelMode
from .zero_redundancy_optimizer_level_2 import ZeroRedundancyOptimizer_Level_2
from .zero_redundancy_optimizer_level_3 import ZeroRedundancyOptimizer_Level_3
def convert_to_zero(model: nn.Module,
                    optimizer: Optimizer,
                    level: int,
                    zero_config: dict):
    """
    Wrap the model in ``NaiveAMPModel`` and the optimizer in the ZeRO optimizer of the given level.

    :param model: Your model object
    :type model: :class:`torch.nn.Module`
    :param optimizer: Your optimizer object
    :type optimizer: :class:`torch.optim.Optimizer`
    :param level: Optimizer level, can be 2 or 3
    :type level: int
    :param zero_config: Configuration for zero, expanded as keyword arguments of the ZeRO optimizer
    :type zero_config: dict

    :return: (model, optimizer)
    :rtype: Tuple
    """
    # not referenced below; presumably imported so a missing deepspeed fails early -- TODO confirm
    import deepspeed
    assert level in (2, 3), 'Only ZERO Optimizer Level 2 and 3 are provided'

    amp_model = NaiveAMPModel(model, output_to_fp32=False)

    if level == 2:
        zero_optimizer = ZeroRedundancyOptimizer_Level_2(init_optimizer=optimizer, **zero_config)
        return amp_model, zero_optimizer

    # level 3 additionally receives the wrapped model
    zero_optimizer = ZeroRedundancyOptimizer_Level_3(init_optimizer=optimizer, module=amp_model, **zero_config)
    return amp_model, zero_optimizer
def zero3_model_context(dtype=torch.half):
    """A context to enable massive model construction for training with
    ZeRO-3. Models are automatically partitioned (or, sharded) across the
    system and converted to half precision. Note that the config of ZeRO-3 will be
    loaded automatically from `gpc.config`.

    Args:
        dtype (``dtype``, optional): Can be used to change the data type of the parameters.
            Supported options are ``torch.half`` and ``torch.float``. Defaults to ``torch.half``

    This context accelerates model initialization and enables models that
    are too large to allocate in their entirety in CPU memory. It has the
    following effects:

    #. allocates tensors to either GPU or CPU memory or NVMe
    #. converts floating point tensors to half precision
    #. immediately partitions tensors among the group of data-parallel devices
    #. (*optional*) replaces ``torch.nn.functional.linear`` with a more
       memory-efficient implementation

    These modifications allow for models that exceed the size of local CPU/GPU
    memory/NVMe, but fit within the total system capacity (*i.e.*, aggregate CPU
    or GPU memory or NVMe) across all nodes. Consider initializing a model with one
    trillion parameters, whose weights occupy two terabytes (TB) in half
    precision. The initial CPU allocation in full precision requires 4TB of
    memory *per process*, and so a system with 8 GPUs per node would need 32TB of
    CPU memory due to data-parallel redundancies. Instead, by immediately
    partitioning tensors we remove the redundancies. The result is that
    regardless of the number of GPUs, we still only require the original 4TB. This
    allows for a linear increase in model size with the aggregate system memory.
    For example, if a node has 1TB of memory and 8 GPUs, we could fit a trillion
    parameter model with 4 nodes and 32 GPUs.

    Important: If the fp16 weights of the model can't fit onto a single GPU memory
    this feature must be used.

    Examples
    --------

    #. Allocate a model and partition it among all processes:

        .. code-block:: python

            with zero3_model_context():
                model = MyLargeModel()

    """
    assert dtype == torch.half or dtype == torch.float, f'Invalid dtype, except torch.half or torch.float, got {dtype}'
    import deepspeed

    zero_cfg = gpc.config.zero
    # Every optional section falls back to None when it is absent from the config.
    # ``offload_optimizer_config`` previously had no default and raised AttributeError
    # when missing, unlike its sibling keys.
    offload_param = getattr(zero_cfg, 'offload_param_config', None)
    ds_config = {
        "train_micro_batch_size_per_gpu": 1,
        "gradient_accumulation_steps": 1,
        "zero_optimization": {
            "offload_param": offload_param,
            "offload_optimizer": getattr(zero_cfg, 'offload_optimizer_config', None),
        },
        "aio": getattr(zero_cfg, 'aio_config', None)
    }
    # getattr with a default also covers offload_param being None
    remote_device = getattr(offload_param, 'device', None)
    pin_memory = getattr(offload_param, 'pin_memory', False)
    return deepspeed.zero.Init(data_parallel_group=gpc.get_group(ParallelMode.DATA),
                               remote_device=remote_device,
                               config_dict_or_path=ds_config,
                               pin_memory=pin_memory,
                               dtype=dtype)
# Names exported by this module on ``from ... import *``.
__all__ = ['convert_to_zero', 'ZeroRedundancyOptimizer_Level_2',
           'ZeroRedundancyOptimizer_Level_3', 'zero3_model_context']
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment