delete unused files

da3f0934 · zhuwenwen · c4dd1fd4 · c4dd1fd4 · c4dd1fd4 · c4dd1fd4
Commit da3f0934 authored Apr 23, 2023 by zhuwenwen
20 changed files
--- a/colossalai/utils/gradient_accumulation/__pycache__/__init__.cpython-36.pyc
+++ b/colossalai/utils/gradient_accumulation/__pycache__/__init__.cpython-36.pyc
--- a/colossalai/utils/gradient_accumulation/__pycache__/__init__.cpython-37.pyc
+++ b/colossalai/utils/gradient_accumulation/__pycache__/__init__.cpython-37.pyc
--- a/colossalai/utils/gradient_accumulation/__pycache__/_gradient_accumulation.cpython-36.pyc
+++ b/colossalai/utils/gradient_accumulation/__pycache__/_gradient_accumulation.cpython-36.pyc
--- a/colossalai/utils/gradient_accumulation/__pycache__/_gradient_accumulation.cpython-37.pyc
+++ b/colossalai/utils/gradient_accumulation/__pycache__/_gradient_accumulation.cpython-37.pyc
--- a/colossalai/utils/gradient_accumulation/_gradient_accumulation.py
+++ b/colossalai/utils/gradient_accumulation/_gradient_accumulation.py
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-
-import torch.nn as nn
-from torch import Tensor
-from typing import Iterable, Any
-from colossalai.nn.optimizer import ColossalaiOptimizer
-from torch.nn.parallel.distributed import DistributedDataParallel
-from torch.optim import Optimizer
-from torch.optim.lr_scheduler import _LRScheduler
-from torch.utils.data import DataLoader
-from colossalai.utils import conditional_context
-from colossalai.engine import BaseGradientHandler
-
-
-class GradAccumOptimizer(ColossalaiOptimizer):
-    """Optimizer wrapper that postpones updates until enough gradients are accumulated.
-
-    Each call to :meth:`backward` or :meth:`backward_by_grad` counts as one
-    accumulation step. :meth:`step` and :meth:`clip_grad_norm` are no-ops until
-    ``accumulate_size`` steps have been counted; :meth:`step` then resets the counter.
-
-    :param optim: Your optimizer object
-    :type optim: :class:`torch.optim.Optimizer`
-    :param accumulate_size: The number of steps to accumulate gradients
-    :type accumulate_size: int
-    :param model: Your model object, inspected to see whether it is DDP so that
-        ``no_sync()`` can be used on the intermediate accumulation steps
-    :type model: :class:`torch.nn.Module`
-
-    """
-
-    def __init__(self, optim: Optimizer, accumulate_size: int, model: nn.Module = None):
-        super().__init__(optim)
-        self.accumulate_step = 0
-        self.accumulate_size = accumulate_size
-
-        # torch DDP all-reduces automatically during backward, so remember
-        # whether the model is DDP-wrapped
-        self.model = model
-        self.is_torch_ddp = isinstance(self.model, DistributedDataParallel)
-
-    def zero_grad(self, *args, **kwargs):
-        # gradients are only cleared at the start of an accumulation cycle
-        if self.accumulate_step != 0:
-            return
-        self.optim.zero_grad(*args, **kwargs)
-
-    def step(self, *args, **kwargs):
-        if self.accumulate_step >= self.accumulate_size:
-            self.accumulate_step = 0
-            return self.optim.step(*args, **kwargs)
-        return None
-
-    def clip_grad_norm(self, model: nn.Module, max_norm: float):
-        if self.accumulate_step >= self.accumulate_size:
-            self.optim.clip_grad_norm(model, max_norm)
-
-    def backward(self, loss: Tensor):
-        self.accumulate_step += 1
-
-        if not self.is_torch_ddp:
-            self.optim.backward(loss / self.accumulate_size)
-            return
-
-        # skip the DDP gradient sync on every step except the last of the cycle
-        skip_sync = self.accumulate_step < self.accumulate_size
-        with conditional_context(self.model.no_sync(), enable=skip_sync):
-            self.optim.backward(loss / self.accumulate_size)
-
-    def backward_by_grad(self, tensor: Tensor, grad: Tensor):
-        self.accumulate_step += 1
-
-        if self.is_torch_ddp and self.accumulate_step < self.accumulate_size:
-            with self.model.no_sync():
-                self.optim.backward_by_grad(tensor, grad)
-            return
-
-        self.optim.backward_by_grad(tensor, grad)
-
-class GradAccumDataloader:
-    """A wrapper for a dataloader to enable gradient accumulation by dropping the last incomplete steps.
-
-    For example, if a dataloader has 10 batches of data and the accumulate size is 4, the model parameters
-    are updated only twice, at step 4 and step 8. The last two batches do not form a complete 4-step cycle,
-    so this class skips them. If the dataloader is not a standard PyTorch dataloader (e.g. a Dali dataloader),
-    the remaining 2 batches are consumed (loaded and discarded) once the last valid batch has been fetched.
-
-    :param dataloader: Your dataloader object
-    :type dataloader: Iterable
-    :param accumulate_size: The number of steps to accumulate gradients
-    :type accumulate_size: int
-
-    """
-
-    def __init__(self, dataloader: Iterable, accumulate_size: int) -> None:
-        self.dataloader = dataloader
-        # anything that is not a torch DataLoader gets its leftover batches drained
-        self.consume_remain_data = not isinstance(dataloader, DataLoader)
-        # largest multiple of accumulate_size that fits into the dataloader length
-        self.steps_per_epoch = len(dataloader) - len(dataloader) % accumulate_size
-
-    def __getattr__(self, __name: str) -> Any:
-        # only reached for attributes missing on the wrapper: delegate to the wrapped loader
-        return getattr(self.dataloader, __name)
-
-    def __len__(self):
-        return self.steps_per_epoch
-
-    def __iter__(self):
-        self._cur_step = 0
-        self._dataiter = iter(self.dataloader)
-        return self
-
-    def __next__(self) -> Any:
-        if self._cur_step >= self.steps_per_epoch:
-            raise StopIteration
-
-        self._cur_step += 1
-        # fetch the batch BEFORE draining; draining first would exhaust the
-        # iterator and lose the last valid batch of the epoch
-        batch = next(self._dataiter)
-
-        if self._cur_step == self.steps_per_epoch and self.consume_remain_data:
-            # this is to handle non standard pytorch dataloader
-            # such as dali dataloader: run the iterator to exhaustion
-            for _ in self._dataiter:
-                pass
-        return batch
-
-
-class GradAccumLrSchedulerByStep(_LRScheduler):
-    """LR scheduler wrapper that only advances the wrapped scheduler once per accumulation cycle.
-
-    :meth:`step` counts calls and forwards every ``accumulate_size``-th call to the
-    wrapped scheduler. All other methods and attributes are delegated to it.
-
-    :param lr_scheduler: Your lr scheduler object
-    :type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`
-    :param accumulate_size: The number of steps to accumulate gradients
-    :type accumulate_size: int
-
-    """
-
-    def __init__(self, lr_scheduler: _LRScheduler, accumulate_size: int) -> None:
-        # the base-class initialiser is not called; state lives in the wrapped scheduler
-        self.accumulate_step = 0
-        self.accumulate_size = accumulate_size
-        self.lr_scheduler = lr_scheduler
-
-    @staticmethod
-    def compute_effective_steps_per_epoch(dataloader: Iterable, accumulate_size: int):
-        """Return how many complete accumulation cycles fit into ``dataloader``."""
-        total_batches = len(dataloader)
-        return total_batches // accumulate_size
-
-    def __getattr__(self, __name: str) -> Any:
-        # fall back to the wrapped scheduler for anything not defined here
-        return getattr(self.lr_scheduler, __name)
-
-    def step(self, *args, **kwargs):
-        self.accumulate_step += 1
-        if self.accumulate_step >= self.accumulate_size:
-            self.accumulate_step = 0
-            self.lr_scheduler.step(*args, **kwargs)
-
-    def get_lr(self):
-        wrapped = self.lr_scheduler
-        return wrapped.get_lr()
-
-    def get_last_lr(self):
-        wrapped = self.lr_scheduler
-        return wrapped.get_last_lr()
-
-    def print_lr(self, *args, **kwargs):
-        self.lr_scheduler.print_lr(*args, **kwargs)
-
-    def state_dict(self) -> dict:
-        wrapped = self.lr_scheduler
-        return wrapped.state_dict()
-
-    def load_state_dict(self, state_dict: dict) -> None:
-        self.lr_scheduler.load_state_dict(state_dict)
-
-
-class GradAccumGradientHandler:
-    """Gradient handler wrapper that only invokes the wrapped handler once per accumulation cycle.
-
-    :meth:`handle_gradient` counts calls and forwards every ``accumulate_size``-th
-    call to the wrapped handler, resetting the counter when it does.
-
-    :param grad_handler: Your gradient handler object
-    :type grad_handler: :class:`colossalai.engine.BaseGradientHandler`
-    :param accumulate_size: The number of steps to accumulate gradients
-    :type accumulate_size: int
-
-    """
-
-    def __init__(self, grad_handler: BaseGradientHandler, accumulate_size: int) -> None:
-        assert isinstance(grad_handler, BaseGradientHandler), \
-            f'expected grad_handler to be type BaseGradientHandler, but got {type(grad_handler)}'
-        self.accumulate_step = 0
-        self.accumulate_size = accumulate_size
-        self.grad_handler = grad_handler
-
-    def handle_gradient(self):
-        self.accumulate_step += 1
-        if self.accumulate_step >= self.accumulate_size:
-            # cycle complete: restart counting and let the real handler run
-            self.accumulate_step = 0
-            self.grad_handler.handle_gradient()
--- a/colossalai/utils/memory.py
+++ b/colossalai/utils/memory.py
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-
-import gc
-
-import psutil
-import torch
-
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.logging import get_dist_logger
-
-
-def bytes_to_GB(val, decimal=2):
-    """Convert a byte count to gigabytes, using binary notation (1 GB = 1024**3 bytes).
-
-    :param val: Number of bytes to convert
-    :param decimal: Number of decimal places to round the result to
-    :return: The size in GB, rounded to ``decimal`` places
-    """
-    bytes_per_gb = 1024 ** 3
-    return round(val / bytes_per_gb, decimal)
-
-
-def bytes_to_MB(val, decimal=2):
-    """Convert a byte count to megabytes, using binary notation (1 MB = 1024**2 bytes).
-
-    :param val: Number of bytes to convert
-    :param decimal: Number of decimal places to round the result to
-    :return: The size in MB, rounded to ``decimal`` places
-    """
-    bytes_per_mb = 1024 ** 2
-    return round(val / bytes_per_mb, decimal)
-
-
-def report_memory_usage(message, logger=None, report_cpu=False):
-    """Log current and peak GPU memory usage (in MB), optionally followed by CPU memory usage.
-
-    After logging, the CUDA peak-memory counters are reset when
-    ``torch.cuda.reset_peak_memory_stats`` is available.
-
-    :param message: A prefix message to add in the log
-    :type message: str
-    :param logger: An instance of :class:`colossalai.logging.DistributedLogger`;
-        when omitted, the logger returned by ``get_dist_logger()`` is used
-    :type logger: :class:`colossalai.logging.DistributedLogger`, optional
-    :param report_cpu: Whether to report CPU memory
-    :type report_cpu: bool, optional
-    :raises EnvironmentError: Raise error if no distributed environment has been initialized
-    """
-    if not gpc.is_initialized(ParallelMode.GLOBAL):
-        raise EnvironmentError("No distributed environment is initialized")
-
-    allocated_mb = bytes_to_MB(torch.cuda.memory_allocated())
-    peak_allocated_mb = bytes_to_MB(torch.cuda.max_memory_allocated())
-    reserved_mb = bytes_to_MB(torch.cuda.memory_reserved())
-    peak_reserved_mb = bytes_to_MB(torch.cuda.max_memory_reserved())
-
-    segments = [
-        f"{message}: GPU: allocated {allocated_mb} MB, max allocated {peak_allocated_mb} MB, ",
-        f"cached: {reserved_mb} MB, max cached: {peak_reserved_mb} MB",
-    ]
-
-    if report_cpu:
-        # python doesn't do real-time garbage collection so do it explicitly to get the correct RAM reports
-        gc.collect()
-        vm_stats = psutil.virtual_memory()
-        used_mb = bytes_to_MB(vm_stats.total - vm_stats.available)
-        segments.append(f", CPU Virtual Memory: used = {used_mb} MB, percent = {vm_stats.percent}%")
-
-    if logger is None:
-        logger = get_dist_logger()
-    logger.info("".join(segments))
-
-    # reset the peak counters so the next call reports peaks for its own interval
-    if hasattr(torch.cuda, "reset_peak_memory_stats"):  # pytorch 1.4+
-        torch.cuda.reset_peak_memory_stats()
--- a/colossalai/utils/multi_tensor_apply/__init__.py
+++ b/colossalai/utils/multi_tensor_apply/__init__.py
-from .multi_tensor_apply import MultiTensorApply
-
-multi_tensor_applier = MultiTensorApply(2048 * 32)
--- a/colossalai/utils/multi_tensor_apply/__pycache__/__init__.cpython-36.pyc
+++ b/colossalai/utils/multi_tensor_apply/__pycache__/__init__.cpython-36.pyc
--- a/colossalai/utils/multi_tensor_apply/__pycache__/__init__.cpython-37.pyc
+++ b/colossalai/utils/multi_tensor_apply/__pycache__/__init__.cpython-37.pyc
--- a/colossalai/utils/multi_tensor_apply/__pycache__/multi_tensor_apply.cpython-36.pyc
+++ b/colossalai/utils/multi_tensor_apply/__pycache__/multi_tensor_apply.cpython-36.pyc
--- a/colossalai/utils/multi_tensor_apply/__pycache__/multi_tensor_apply.cpython-37.pyc
+++ b/colossalai/utils/multi_tensor_apply/__pycache__/multi_tensor_apply.cpython-37.pyc
--- a/colossalai/utils/multi_tensor_apply/multi_tensor_apply.py
+++ b/colossalai/utils/multi_tensor_apply/multi_tensor_apply.py
-# modified from https://github.com/NVIDIA/apex/blob/master/apex/multi_tensor_apply/multi_tensor_apply.py
-
-
-class MultiTensorApply(object):
-    """
-    Apply an operation to a list of tensors efficiently
-
-    Availability depends on whether the ``colossal_C`` extension can be imported
-    when an instance is created; the result is recorded on the class.
-
-    :param chunk_size: Size of a chunk
-    :type chunk_size: int
-    """
-
-    available = False
-    warned = False
-    # ImportError captured when ``colossal_C`` could not be imported, else None
-    import_err = None
-
-    def __init__(self, chunk_size):
-        # always record the chunk size so the attribute exists even if the import fails
-        self.chunk_size = chunk_size
-        try:
-            import colossal_C
-        except ImportError as err:
-            MultiTensorApply.available = False
-            MultiTensorApply.import_err = err
-        else:
-            MultiTensorApply.available = True
-
-    def check_avail(self):
-        """Raise ``RuntimeError`` if the ``colossal_C`` extension is not available."""
-        if not MultiTensorApply.available:
-            raise RuntimeError(
-                "Attempted to call MultiTensorApply method, but MultiTensorApply "
-                "is not available, possibly because the colossal_C extension was "
-                "not built or installed.  Original import error message: "
-                f"{MultiTensorApply.import_err}") from MultiTensorApply.import_err
-
-    def __call__(self, op, noop_flag_buffer, tensor_lists, *args):
-        """Call ``op(chunk_size, noop_flag_buffer, tensor_lists, *args)`` and return its result.
-
-        :raises RuntimeError: If the ``colossal_C`` extension is not available
-        """
-        self.check_avail()
-
-        return op(self.chunk_size,
-                  noop_flag_buffer,
-                  tensor_lists,
-                  *args)
--- a/colossalai/utils/timer.py
+++ b/colossalai/utils/timer.py
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-import time
-from typing import Tuple
-from .cuda import synchronize
-
-
-class Timer:
-    """A timer that measures start-stop intervals and can keep a history of them.
-    """
-
-    def __init__(self):
-        self._history = []
-        self._elapsed = 0
-        self._start_time = time.time()
-        self._started = False
-
-    @property
-    def has_history(self):
-        """Whether at least one interval has been recorded."""
-        return bool(self._history)
-
-    def start(self):
-        """First synchronize cuda, reset the clock and then start the timer.
-        """
-        self._elapsed = 0
-        synchronize()
-        self._start_time = time.time()
-        self._started = True
-
-    def stop(self, keep_in_history: bool = False):
-        """Stop the timer and record the start-stop time interval.
-
-        :param keep_in_history: Whether to append this start-stop interval to the history, defaults to False
-        :type keep_in_history: bool, optional
-        :return: Start-stop interval in seconds
-        :rtype: float
-        """
-        synchronize()
-        interval = time.time() - self._start_time
-        if keep_in_history:
-            self._history.append(interval)
-        self._started = False
-        self._elapsed = interval
-        return interval
-
-    def get_history_mean(self):
-        """Mean of all recorded start-stop time intervals.
-
-        :return: Mean of time intervals
-        :rtype: float
-        """
-        total = sum(self._history)
-        return total / len(self._history)
-
-    def get_history_sum(self):
-        """Add up all the recorded start-stop time intervals.
-
-        :return: Sum of time intervals
-        :rtype: float
-        """
-        total = sum(self._history)
-        return total
-
-    def get_elapsed_time(self):
-        """Return the last start-stop time interval.
-
-        .. note:: Use it only when timer is not in progress
-
-        :return: The last time interval
-        :rtype: float
-        """
-        assert not self._started, 'Timer is still in progress'
-        return self._elapsed
-
-    def reset(self):
-        """Clear the timer and its history
-        """
-        self._elapsed = 0
-        self._started = False
-        self._history = []
-
-
-class MultiTimer:
-    """An object that manages multiple named timers
-
-    :param on: Whether the timer is enabled. Default is True
-    :type on: bool, optional
-    """
-
-    def __init__(self, on: bool = True):
-        self._on = on
-        self._timers = dict()
-
-    def start(self, name: str):
-        """Start the timer with the given name, creating it on first use.
-
-        Does nothing when the multi-timer is disabled.
-
-        :param name: Timer's key
-        :type name: str
-        """
-        if self._on:
-            if name not in self._timers:
-                self._timers[name] = Timer()
-            return self._timers[name].start()
-
-    def stop(self, name: str, keep_in_history: bool = False):
-        """Stop the timer with the given name.
-
-        :param name: Timer's key
-        :type name: str
-        :param keep_in_history: Whether to record this start-stop interval into the
-            timer's history, defaults to False
-        :type keep_in_history: bool, optional
-        :return: The start-stop interval, or None when the multi-timer is disabled
-        """
-        if self._on:
-            return self._timers[name].stop(keep_in_history)
-        else:
-            return None
-
-    def get_timer(self, name):
-        """Get timer by its name (from multitimer)
-
-        :param name: Timer's key
-        :return: The timer registered under ``name``
-        :rtype: Timer
-        :raises KeyError: If no timer with that name has been started
-        """
-        return self._timers[name]
-
-    def reset(self, name=None):
-        """Reset timers.
-
-        :param name: If name is designated, the named timer will be reset and others will not, defaults to None
-        :type name: optional
-        """
-        if self._on:
-            if name is not None:
-                self._timers[name].reset()
-            else:
-                # iterate the Timer objects; iterating the dict itself yields the string keys
-                for timer in self._timers.values():
-                    timer.reset()
-
-    def is_on(self):
-        return self._on
-
-    def set_status(self, mode: bool):
-        self._on = mode
-
-    def __iter__(self) -> Tuple[str, Timer]:
-        # yields (name, timer) pairs for every registered timer
-        for name, timer in self._timers.items():
-            yield name, timer
--- a/colossalai/zero/__init__.py
+++ b/colossalai/zero/__init__.py
-import torch
-import torch.nn as nn
-from torch.optim import Optimizer
-from colossalai.amp.naive_amp import NaiveAMPModel
-from colossalai.utils import is_no_pp_or_last_stage
-from colossalai.core import global_context as gpc
-from colossalai.context.parallel_mode import ParallelMode
-
-from .zero_redundancy_optimizer_level_2 import ZeroRedundancyOptimizer_Level_2
-from .zero_redundancy_optimizer_level_3 import ZeroRedundancyOptimizer_Level_3
-
-
-def convert_to_zero(model: nn.Module,
-                    optimizer: Optimizer,
-                    level: int,
-                    zero_config: dict):
-    """
-    Wrap the model in ``NaiveAMPModel`` and the optimizer in the ZeRO optimizer of the requested level
-
-    :param model: Your model object
-    :type model: :class:`torch.nn.Module`
-    :param optimizer: Your optimizer object
-    :type optimizer: :class:`torch.optim.Optimizer`
-    :param level: Optimizer level, can be 2 or 3
-    :type level: int
-    :param zero_config: Configuration for zero, forwarded as keyword arguments to the ZeRO optimizer
-    :type zero_config: dict
-
-    :return: (model, optimizer)
-    :rtype: Tuple
-    """
-    import deepspeed
-    assert level in (2, 3), 'Only ZERO Optimizer Level 2 and 3 are provided'
-    amp_model = NaiveAMPModel(model, output_to_fp32=False)
-
-    if level == 2:
-        zero_optimizer = ZeroRedundancyOptimizer_Level_2(init_optimizer=optimizer, **zero_config)
-    else:
-        # level 3 additionally receives the wrapped model
-        zero_optimizer = ZeroRedundancyOptimizer_Level_3(init_optimizer=optimizer, module=amp_model, **zero_config)
-    return amp_model, zero_optimizer
-
-
-def zero3_model_context(dtype=torch.half):
-    """A context to enable massive model construction for training with
-        ZeRO-3. Models are automatically partitioned (or, sharded) across the
-        system and converted to half precision. Note that the config of ZeRO-3 will be loaded automatically from `gpc.config`.
-
-        The ``offload_param_config``, ``offload_optimizer_config`` and ``aio_config``
-        entries of ``gpc.config.zero`` are all optional; a missing entry is passed on as ``None``.
-
-        Args:
-            dtype (``dtype``, optional): Can be used to change the data type of the parameters.
-                Supported options are ``torch.half`` and ``torch.float``. Defaults to ``torch.half``
-
-        Returns:
-            The ``deepspeed.zero.Init`` context object built from the configuration above.
-
-        This context accelerates model initialization and enables models that
-        are too large to allocate in their entirety in CPU memory. It has the
-        following effects:
-
-        #. allocates tensors to either GPU or CPU memory or NVMe
-        #. converts floating point tensors to half precision
-        #. immediately partitions tensors among the group of data-parallel devices
-        #. (*optional*) replaces ``torch.nn.functional.linear`` with a more
-           memory-efficient implementation
-
-        These modifications allow for models that exceed the size of local CPU/GPU
-        memory/NVMe, but fit within the total capacity (*i.e.*, aggregate CPU
-        or GPU memory or NVMe) across all nodes. Consider initializing a model with one
-        trillion parameters, whose weights occupy two terabytes (TB) in half
-        precision. The initial CPU allocation in full precision requires 4TB of
-        memory *per process*, and so a system with 8 GPUs per node would need 32TB of
-        CPU memory due to data-parallel redundancies. Instead, by immediately
-        partitioning tensors we remove the redundancies. The result is that
-        regardless of the number of GPUs, we still only require the original 4TB. This
-        allows for a linear increase in model size with the aggregate system memory.
-        For example, if a node has 1TB of memory and 8 GPUs, we could fit a trillion
-        parameter model with 4 nodes and 32 GPUs.
-
-        Important: If the fp16 weights of the model can't fit onto a single GPU memory
-        this feature must be used.
-
-        Examples
-        --------
-
-        #. Allocate a model and partition it among all processes:
-
-            .. code-block:: python
-
-                with zero3_model_context():
-                    model = MyLargeModel()
-
-    """
-    assert dtype == torch.half or dtype == torch.float, f'Invalid dtype, expected torch.half or torch.float, got {dtype}'
-    import deepspeed
-    zero_cfg = gpc.config.zero
-    # every section is optional, so each lookup falls back to None
-    offload_param = getattr(zero_cfg, 'offload_param_config', None)
-    ds_config = {
-        "train_micro_batch_size_per_gpu": 1,
-        "gradient_accumulation_steps": 1,
-        "zero_optimization": {
-            "offload_param": offload_param,
-            "offload_optimizer": getattr(zero_cfg, 'offload_optimizer_config', None),
-        },
-        "aio": getattr(zero_cfg, 'aio_config', None)
-    }
-    # getattr with a default also covers the case where offload_param is None
-    remote_device = getattr(offload_param, 'device', None)
-    pin_memory = getattr(offload_param, 'pin_memory', False)
-    return deepspeed.zero.Init(data_parallel_group=gpc.get_group(ParallelMode.DATA),
-                               remote_device=remote_device,
-                               config_dict_or_path=ds_config,
-                               pin_memory=pin_memory,
-                               dtype=dtype)
-
-
-__all__ = ['convert_to_zero', 'ZeroRedundancyOptimizer_Level_2',
-           'ZeroRedundancyOptimizer_Level_3', 'zero3_model_context']
--- a/colossalai/zero/__pycache__/__init__.cpython-36.pyc
+++ b/colossalai/zero/__pycache__/__init__.cpython-36.pyc
--- a/colossalai/zero/__pycache__/__init__.cpython-37.pyc
+++ b/colossalai/zero/__pycache__/__init__.cpython-37.pyc
--- a/colossalai/zero/__pycache__/loss_scaler.cpython-36.pyc
+++ b/colossalai/zero/__pycache__/loss_scaler.cpython-36.pyc
--- a/colossalai/zero/__pycache__/loss_scaler.cpython-37.pyc
+++ b/colossalai/zero/__pycache__/loss_scaler.cpython-37.pyc
--- a/colossalai/zero/__pycache__/zero_redundancy_optimizer_level_2.cpython-36.pyc
+++ b/colossalai/zero/__pycache__/zero_redundancy_optimizer_level_2.cpython-36.pyc
--- a/colossalai/zero/__pycache__/zero_redundancy_optimizer_level_2.cpython-37.pyc
+++ b/colossalai/zero/__pycache__/zero_redundancy_optimizer_level_2.cpython-37.pyc