"tests/test_legacy/test_comm/test_comm.py" did not exist on "0fedef4f3c30634cf9ad929eecf4baf5f0f415ca"
Commit da3f0934 authored by zhuwenwen's avatar zhuwenwen
Browse files

delete unused files

parent c4dd1fd4
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import functools
from contextlib import contextmanager
import torch.cuda
from torch import Tensor
from .seed_manager import SeedManager
from ..parallel_mode import ParallelMode
_SEED_MANAGER = SeedManager()
def get_seeds():
"""Returns the seeds of the seed manager.
:return: The seeds of the seed manager
:rtype: dict
"""
return _SEED_MANAGER.seeds
def get_states(copy=False):
"""Returns the seed states of the seed manager.
:return: The seed states of the seed manager
:rtype: dict
"""
states = _SEED_MANAGER.seed_states
if copy:
new_states = dict()
for parallel_mode, state in states.items():
new_states[parallel_mode] = state.clone()
return new_states
else:
return _SEED_MANAGER.seed_states
def get_current_mode():
"""Returns the current mode of the seed manager.
:return: The current mode of the seed manager.
:rtype: :class:`torch.ByteTensor`
"""
return _SEED_MANAGER.current_mode
def add_seed(parallel_mode: ParallelMode, seed: int, overwrite: bool = False):
"""Adds a seed to the seed manager for `parallel_mode`.
:param parallel_mode: The chosen parallel mode
:type parallel_mode: :class:`colossalai.context.ParallelMode`
:param seed: The seed to be added
:type seed: int
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of
:class:`colossalai.context.ParallelMode` or the seed for `parallel_mode` has been added
"""
_SEED_MANAGER.add_seed(parallel_mode, seed, overwrite)
def set_mode(parallel_mode: ParallelMode):
"""Sets the current mode of the seed manager.
:param parallel_mode: The chosen parallel mode
:type parallel_mode: :class:`colossalai.context.ParallelMode`
"""
_SEED_MANAGER.set_mode(parallel_mode)
def set_seed_states(parallel_mode: ParallelMode, state: Tensor):
"""Sets the state of the seed manager for `parallel_mode`.
:param parallel_mode: The chosen parallel mode
:type parallel_mode: :class:`colossalai.context.ParallelMode`
:param state: the state to be set
:type state: :class:`torch.Tensor`
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not found in the seed manager
"""
_SEED_MANAGER.set_state(parallel_mode, state)
def sync_states():
current_mode = get_current_mode()
current_states = torch.cuda.get_rng_state()
set_seed_states(current_mode, current_states)
@contextmanager
def seed(parallel_mode: ParallelMode):
""" A context for seed switch
Examples::
with seed(ParallelMode.DATA):
output = F.dropout(input)
"""
try:
# set to new mode
current_mode = _SEED_MANAGER.current_mode
yield _SEED_MANAGER.set_mode(parallel_mode)
finally:
# recover
_SEED_MANAGER.set_mode(current_mode)
def with_seed(func, parallel_mode: ParallelMode):
"""
A function wrapper which executes the function with a specified seed.
Examples::
# use with decorator
@with_seed(ParallelMode.DATA)
def forward(input):
return F.dropout(input)
out = forward(input)
# OR use it inline
def forward(input):
return F.dropout(input)
wrapper_forward = with_seed(forward, ParallelMode.DATA)
out = wrapped_forward(input)
"""
@functools.wraps(func)
def wrapper(*args, **kwargs):
# switch mode
current_mode = _SEED_MANAGER.current_mode
_SEED_MANAGER.set_mode(parallel_mode)
# exec func
out = func(*args, **kwargs)
# recover state
_SEED_MANAGER.set_mode(current_mode)
return out
return wrapper
def moe_set_seed(seed):
if torch.cuda.is_available():
from colossalai.core import global_context as gpc
moe_mp_rank = gpc.get_local_rank(ParallelMode.MOE_MODEL)
moe_mp_seed = seed + moe_mp_rank
add_seed(ParallelMode.MOE_MODEL, moe_mp_seed)
global_rank = gpc.get_global_rank()
add_seed(ParallelMode.TENSOR, global_rank, True)
print(f"moe seed condition: {global_rank} with moe seed {moe_mp_seed}, ",
f"tensor seed {global_rank}", flush=True)
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import torch
from torch import Tensor
from colossalai.context.parallel_mode import ParallelMode
class SeedManager:
"""This class is a manager of all random seeds involved in the system.
"""
def __init__(self):
self._current_mode = None
self._seeds = dict()
self._seed_states = dict()
@property
def current_mode(self):
return self._current_mode
@property
def seeds(self):
return self._seeds
@property
def seed_states(self):
return self._seed_states
def set_state(self, parallel_mode: ParallelMode, state: Tensor):
"""Sets the state of the seed manager for `parallel_mode`.
:param parallel_mode: The chosen parallel mode
:type parallel_mode: :class:`colossalai.context.ParallelMode`
:param state: the state to be set
:type state: :class:`torch.Tensor`
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not found in the seed manager
"""
assert parallel_mode in self._seed_states, f'Parallel mode {parallel_mode} is not found in the seed manager'
self._seed_states[parallel_mode] = state
def set_mode(self, parallel_mode: ParallelMode):
"""Sets the current mode of the seed manager.
:param parallel_mode: The chosen parallel mode
:type parallel_mode: :class:`colossalai.context.ParallelMode`
"""
if self.current_mode:
# save the current state for current mode
self._seed_states[self._current_mode] = torch.cuda.get_rng_state()
# set the new state for new mode
self._current_mode = parallel_mode
torch.cuda.set_rng_state(self._seed_states[parallel_mode])
def add_seed(self, parallel_mode: ParallelMode, seed: int, overwrtie: bool = False):
"""Adds a seed to the seed manager for `parallel_mode`.
:param parallel_mode: The chosen parallel mode
:type parallel_mode: :class:`colossalai.context.ParallelMode`
:param seed: The seed to be added
:type seed: int
:param overwrtie: Whether allows to overwrite the seed that has been set already
:type overwrtie: bool, optional
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of
:class:`colossalai.context.ParallelMode` or the seed for `parallel_mode` has been added
"""
assert isinstance(
parallel_mode, ParallelMode), 'A valid ParallelMode must be provided'
if overwrtie is False:
assert parallel_mode not in self._seed_states, f'The seed for {parallel_mode} has been added'
elif parallel_mode in self._seed_states:
print(f"Warnning: {parallel_mode} seed has been overwritten.", flush=True)
current_state = torch.cuda.get_rng_state()
torch.cuda.manual_seed(seed)
self._seed_states[parallel_mode] = torch.cuda.get_rng_state()
self._seeds[parallel_mode] = seed
torch.cuda.set_rng_state(current_state)
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from colossalai.context import ParallelContext
global_context = ParallelContext.get_instance()
from ._base_engine import Engine
from .gradient_handler import *
__all__ = ['Engine']
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from typing import List
from torch.nn import Module
from torch.nn.modules.loss import _Loss
from torch.optim import Optimizer
from colossalai.logging import get_dist_logger
from torch import Tensor
from colossalai.engine.ophooks import register_ophooks_recursively, BaseOpHook
class Engine:
"""Basic engine class for training and evaluation. It runs a specific process method
:meth:`step` which is based on the given :attr:`schedule` over each batch of a dataset.
It controls a iteration in training.
:param model: The neural network model
:type model: ``torch.nn.Module``
:param optimizer: Optimizer for updating the parameters
:type optimizer: ``torch.optim.Optimizer``
:param criterion: Loss function for calculating loss
:type criterion: ``torch.nn.modules.loss._Loss``
:param gradient_handlers: A list of gradient handler used in backward
:type gradient_handlers: list
:param clip_grad_norm: The norm of gradient clipping
:type clip_grad_norm: float, optional
:param verbose: whether to display log info
:type verbose: bool
"""
def __init__(self,
model: Module,
optimizer: Optimizer,
criterion: _Loss,
gradient_handlers: List = None,
clip_grad_norm: float = 0.0,
ophook_list: List[BaseOpHook] = [],
verbose: bool = True):
self._model = model
self._optimizer = optimizer
self._criterion = criterion
self._clip_grad_norm = clip_grad_norm
self._verbose = verbose
self._logger = get_dist_logger()
# state
self.training = True # default
# build gradient handler
if gradient_handlers:
self._gradient_handlers = gradient_handlers
else:
self._gradient_handlers = []
self._ophook_list = ophook_list
register_ophooks_recursively(self._model, self._ophook_list)
@property
def model(self):
"""Model attached to the engine"""
return self._model
@property
def optimizer(self):
"""Optimizer attached to the engine"""
return self._optimizer
@property
def criterion(self):
"""Criterion attached to the engine"""
return self._criterion
def zero_grad(self):
"""Set the gradient of parameters to zero
"""
self.optimizer.zero_grad()
def step(self):
"""Execute parameter update
"""
self._all_reduce_gradients()
self.optimizer.clip_grad_norm(self.model, self._clip_grad_norm)
return self.optimizer.step()
def backward(self, loss: Tensor):
"""Start backward propagation given the loss value computed by a loss function
:param loss: Loss value computed by a loss function
:type loss: :class:`torch.Tensor`
"""
ret = self.optimizer.backward(loss)
for ophook in self._ophook_list:
ophook.post_iter()
return ret
def backward_by_grad(self, tensor, grad):
"""Start backward propagation given the gradient of the output tensor
:param tensor: Output tensor
:type tensor: :class:`torch.Tensor`
:param grad: Gradient passed back to the output
:type grad: :class:`torch.Tensor`
"""
ret = self.optimizer.backward_by_grad(tensor, grad)
for ophook in self._ophook_list:
ophook.post_iter()
return ret
def calc_loss(self, *args, **kwargs):
"""Compute the loss value
:param args: Args used in criterion function
:param kwargs: Kwargs used in criterion function
:return: The loss value
:rtype: :class:`torch.Tensor`
"""
return self.criterion(*args, **kwargs)
def __call__(self, *args, **kwargs):
"""Run the forward step for the model
:return: Output the model
:rtype: Tuple[:class:`torch.Tensor`] or :class:`torch.Tensor`
"""
return self.model(*args, **kwargs)
def _all_reduce_gradients(self):
"""Handles all-reduce operations of gradients across different parallel groups.
"""
for handler in self._gradient_handlers:
handler.handle_gradient()
def train(self):
"""Sets the model to training mode.
"""
self.training = True
self._model.train()
def eval(self):
"""Sets the model to evaluation mode.
"""
self.training = False
self._model.eval()
from ._base_gradient_handler import BaseGradientHandler
from ._data_parallel_gradient_handler import DataParallelGradientHandler
from ._zero_gradient_handler import ZeROGradientHandler
from ._sequence_parallel_gradient_handler import SequenceParallelGradientHandler
from ._pipeline_parallel_gradient_handler import PipelineSharedModuleGradientHandler
from ._moe_gradient_handler import MoeGradientHandler
from ._sequence_parallel_gradient_handler import SequenceParallelGradientHandler
__all__ = ['BaseGradientHandler', 'DataParallelGradientHandler',
'ZeROGradientHandler', 'PipelineSharedModuleGradientHandler',
'MoeGradientHandler', 'SequenceParallelGradientHandler']
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment