Unverified Commit e956d93a authored by Jiarui Fang's avatar Jiarui Fang Committed by GitHub
Browse files

[refactor] memory utils (#577)

parent 104cbbb3
...@@ -29,6 +29,7 @@ class MoeGradientHandler(BaseGradientHandler): ...@@ -29,6 +29,7 @@ class MoeGradientHandler(BaseGradientHandler):
if global_data > 1: if global_data > 1:
epsize_param_dict = get_moe_epsize_param_dict(self._model) epsize_param_dict = get_moe_epsize_param_dict(self._model)
# epsize is 1, indicating the params are replicated among processes in data parallelism # epsize is 1, indicating the params are replicated among processes in data parallelism
# use the ParallelMode.DATA to get data parallel group # use the ParallelMode.DATA to get data parallel group
# reduce gradients for all parameters in data parallelism # reduce gradients for all parameters in data parallelism
......
...@@ -10,8 +10,7 @@ from colossalai.zero.sharded_param.tensorful_state import TensorState ...@@ -10,8 +10,7 @@ from colossalai.zero.sharded_param.tensorful_state import TensorState
from ._base_ophook import BaseOpHook from ._base_ophook import BaseOpHook
from colossalai.utils.memory_utils.utils import \ from colossalai.zero.shard_utils.tensor_utils import colo_model_data_tensor_move_inline
colo_model_data_tensor_move_inline
@OPHOOKS.register_module @OPHOOKS.register_module
......
...@@ -4,8 +4,8 @@ import pickle ...@@ -4,8 +4,8 @@ import pickle
import torch import torch
from colossalai.utils.memory_utils.utils import colo_device_memory_used
from colossalai.utils import get_current_device from colossalai.utils import get_current_device
from colossalai.utils.memory_utils.memory_monitor import colo_cuda_memory_used
class AsyncMemoryMonitor: class AsyncMemoryMonitor:
...@@ -82,7 +82,7 @@ class AsyncMemoryMonitor: ...@@ -82,7 +82,7 @@ class AsyncMemoryMonitor:
while self.keep_measuring: while self.keep_measuring:
max_usage = max( max_usage = max(
max_usage, max_usage,
colo_cuda_memory_used(), colo_device_memory_used(get_current_device()),
) )
sleep(self.interval) sleep(self.interval)
return max_usage return max_usage
......
from colossalai.utils.memory_tracer.model_data_memtracer import GLOBAL_MODEL_DATA_TRACER from colossalai.utils.memory_tracer.model_data_memtracer import GLOBAL_MODEL_DATA_TRACER
from colossalai.utils.memory_utils.memory_monitor import colo_cuda_memory_used from colossalai.utils.memory_utils.utils import colo_device_memory_used
from colossalai.utils import get_current_device from colossalai.utils import get_current_device
import torch import torch
from typing import Tuple from typing import List
class SamplingCounter: class SamplingCounter:
...@@ -23,45 +23,71 @@ class SamplingCounter: ...@@ -23,45 +23,71 @@ class SamplingCounter:
class MemStatsCollector: class MemStatsCollector:
def __init__(self) -> None:
""" """
Collecting Memory Statistics. A Memory statistic collector.
It has two phases. It works in two phases.
1. Collection Phase: collect memory usage statistics Phase 1. Collection Phase: collect memory usage statistics of CPU and GPU.
2. Runtime Phase: do not collect statistics. The first iteration of DNN training.
Phase 2. Runtime Phase: use the read-only collected stats
The rest iterations of DNN training.
It has a Sampling counter which is reset after DNN training iteration.
""" """
def __init__(self) -> None:
self._sampling_cnter = SamplingCounter() self._sampling_cnter = SamplingCounter()
self._model_data_cuda = [] self._model_data_cuda_list = []
self._overall_cuda = [] self._overall_cuda_list = []
# TODO(jiaruifang) Now no cpu mem stats collecting self._model_data_cpu_list = []
self._model_data_cpu = [] self._overall_cpu_list = []
self._overall_cpu = []
self._start_flag = False self._start_flag = False
@property def overall_mem_stats(self, device_type: str):
def overall_cuda(self): if device_type == 'cuda':
return self._overall_cuda return self._overall_cuda_list
elif device_type == 'cpu':
@property return self._overall_cpu_list
def model_data_cuda_GB(self): else:
return [elem / 1e9 for elem in self._model_data_cuda] raise TypeError
@property
def model_data_cuda(self):
return self._model_data_cuda
@property @property
def non_model_data_cuda_GB(self): def model_data_cuda_list(self, device_type: str, unit: str = 'B') -> List[int]:
return [elem / 1e9 for elem in self.non_model_data_cuda] scale = 1
if unit == 'GB':
@property scale = 1e9
def non_model_data_cuda(self): elif unit == 'MB':
scale = 1e6
elif unit == 'KB':
scale = 1e3
else:
raise TypeError
if device_type == 'cuda':
return [elem / scale for elem in self._model_data_cuda_list]
elif device_type == 'cpu':
return [elem / scale for elem in self._model_data_cpu_list]
else:
raise TypeError
def non_model_data_cuda_list(self, device_type: str, unit: str = 'B') -> List[int]:
"""Non model data stats """Non model data stats
""" """
return [(v1 - v2) for v1, v2 in zip(self._overall_cuda, self._model_data_cuda)] scale = 1
if unit == 'GB':
scale = 1e9
elif unit == 'MB':
scale = 1e6
elif unit == 'KB':
scale = 1e3
if device_type == 'cuda':
return [(v1 - v2) / scale for v1, v2 in zip(self._overall_cuda_list, self._model_data_cuda_list)]
elif device_type == 'cpu':
return [(v1 - v2) / scale for v1, v2 in zip(self._overall_cpu_list, self._model_data_cpu_list)]
else:
raise TypeError
def start_collection(self): def start_collection(self):
self._start_flag = True self._start_flag = True
...@@ -73,32 +99,28 @@ class MemStatsCollector: ...@@ -73,32 +99,28 @@ class MemStatsCollector:
""" """
Sampling memory statistics. Sampling memory statistics.
Record the current model data CUDA memory usage as well as system CUDA memory usage. Record the current model data CUDA memory usage as well as system CUDA memory usage.
Advance the sampling cnter.
""" """
if self._start_flag: if self._start_flag:
sampling_cnt = self._sampling_cnter.sampling_cnt sampling_cnt = self._sampling_cnter.sampling_cnt
assert sampling_cnt == len(self._overall_cuda) assert sampling_cnt == len(self._overall_cuda_list)
self._model_data_cuda.append(GLOBAL_MODEL_DATA_TRACER.cuda_usage) self._model_data_cuda_list.append(GLOBAL_MODEL_DATA_TRACER.cuda_usage)
self._overall_cuda.append(colo_cuda_memory_used(torch.device(f'cuda:{get_current_device()}'))) self._overall_cuda_list.append(colo_device_memory_used(get_current_device()))
self._sampling_cnter.advance()
def fetch_memstats(self) -> Tuple[int, int]: self._model_data_cpu_list.append(GLOBAL_MODEL_DATA_TRACER.cpu_usage)
""" self._overall_cpu_list.append(colo_device_memory_used(torch.device(f'cpu')))
returns cuda usage of model data and overall cuda usage.
""" self._sampling_cnter.advance()
sampling_cnt = self._sampling_cnter.sampling_cnt
if len(self._model_data_cuda) < sampling_cnt:
raise RuntimeError
return (self._model_data_cuda[sampling_cnt], self._overall_cuda[sampling_cnt])
def reset_sampling_cnter(self) -> None: def reset_sampling_cnter(self) -> None:
self._sampling_cnter.reset() self._sampling_cnter.reset()
def clear(self) -> None: def clear(self) -> None:
self._model_data_cuda = [] self._model_data_cuda_list = []
self._overall_cuda = [] self._overall_cuda_list = []
self._model_data_cpu = [] self._model_data_cpu_list = []
self._overall_cpu = [] self._overall_cpu_list = []
self._start_flag = False self._start_flag = False
self._sampling_cnter.reset() self._sampling_cnter.reset()
...@@ -30,10 +30,7 @@ def test_mem_collector(): ...@@ -30,10 +30,7 @@ def test_mem_collector():
collector.sample_memstats() collector.sample_memstats()
collector.sample_memstats() collector.sample_memstats()
cuda_use, overall_use = collector.fetch_memstats() print(collector.overall_mem_stats('cuda'))
print(cuda_use, overall_use)
print(collector.overall_cuda)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -9,29 +9,6 @@ import torch ...@@ -9,29 +9,6 @@ import torch
from colossalai.context.parallel_mode import ParallelMode from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc from colossalai.core import global_context as gpc
from colossalai.logging import get_dist_logger from colossalai.logging import get_dist_logger
from colossalai.utils.cuda import get_current_device
from typing import Optional
def colo_cuda_memory_used(device: Optional[torch.device] = None) -> int:
    """Return the CUDA memory currently allocated on ``device``, in bytes.

    Note: this reads ``torch.cuda.memory_allocated`` — memory in use, not
    free memory.

    Args:
        device (Optional[``torch.device``]): a CUDA device instance, or None
            to use the current device. Defaults to None.

    Returns:
        int: current memory usage, sized by Byte.
    """
    if device:
        assert device.type == 'cuda'
    else:
        device = torch.device(f'cuda:{get_current_device()}')
    ret: int = torch.cuda.memory_allocated(device)
    # NOTE(review): the peak-memory counter is reset below, but the value
    # returned above is the *current* allocation, not the peak — confirm
    # whether torch.cuda.max_memory_allocated was intended here.
    if hasattr(torch.cuda, "reset_peak_memory_stats"):    # pytorch 1.4+
        torch.cuda.reset_peak_memory_stats(device)
    return ret
def bytes_to_GB(val, decimal=2): def bytes_to_GB(val, decimal=2):
......
import torch import torch
from colossalai.context.parallel_mode import ParallelMode
from colossalai.utils import get_current_device from colossalai.utils import get_current_device
from colossalai.zero.sharded_param.tensorful_state import StatefulTensor
from typing import Tuple, Union from collections import namedtuple
import psutil
from colossalai.core import global_context as gpc
_GLOBAL_CUDA_MEM_FRACTION = 1.0 _GLOBAL_CUDA_MEM_FRACTION = 1.0
def colo_tensor_mem_usage(tensor: Union[torch.Tensor, StatefulTensor]) -> Tuple[int, int]: # copy from PatrickStar
if issubclass(type(tensor), StatefulTensor): def _get_cpu_memory_info():
t = tensor.payload ps_mem_info = namedtuple("ps_mem_info", ["total", "free", "cached", "buffers", "used"])
elif isinstance(tensor, torch.Tensor): try:
t = tensor # psutil reads the memory info from /proc/memory_info,
else: # which results in returning the host memory instead of
return 0, 0 # that of container.
# Here we try to read the container memory with method in:
cuda_use, cpu_use = 0, 0 # https://stackoverflow.com/a/46213331/5163915
mems = {}
mem_use = t.numel() * t.element_size() with open("/sys/fs/cgroup/memory/memory.meminfo", "rb") as f:
if t.device.type == 'cuda': for line in f:
cuda_use += mem_use fields = line.split()
elif t.device.type == 'cpu': mems[fields[0]] = int(fields[1]) * 1024
cpu_use += mem_use total = mems[b"MemTotal:"]
free = mems[b"MemFree:"]
return cuda_use, cpu_use cached = mems[b"Cached:"]
buffers = mems[b"Buffers:"]
used = total - free - cached - buffers
if used < 0:
used = total - free
mem_info = ps_mem_info(total=total, free=free, cached=cached, buffers=buffers, used=used)
except FileNotFoundError:
mems = psutil.virtual_memory()
mem_info = ps_mem_info(
total=mems.total,
free=mems.free,
cached=mems.cached,
buffers=mems.buffers,
used=mems.used,
)
return mem_info
def colo_device_memory_used(device) -> int:
    """Return the memory currently used on ``device``, in bytes.

    Args:
        device: a ``torch.device`` instance, or anything else (e.g. an int),
            which is interpreted as a CUDA device index.

    Returns:
        int: memory used on the device, in bytes.

    Raises:
        TypeError: if the device type is neither ``cpu`` nor ``cuda``
            (previously this case fell through and returned ``None``).
    """
    if not isinstance(device, torch.device):
        # a bare int (or other value) is treated as a CUDA device index
        device = torch.device(f"cuda:{device}")
    if device.type == 'cpu':
        mem_info = _get_cpu_memory_info()
        # FIXME(jiaruifang) only work for 1-CPU multi-GPU
        # CPU memory is sharded with all processes
        # Not support multi-GPU multi-CPU
        # We need a local_world_size here
        # int(): true division yields a float; keep the declared int return type
        return int(mem_info.used / gpc.get_world_size(ParallelMode.DATA))
    elif device.type == 'cuda':
        ret: int = torch.cuda.memory_allocated(device)
        # get the peak memory to report correct data, so reset the counter for the next call
        if hasattr(torch.cuda, "reset_peak_memory_stats"):    # pytorch 1.4+
            torch.cuda.reset_peak_memory_stats(device)
        return ret
    raise TypeError(f'colo_device_memory_used does not support device type {device.type}')
def colo_set_process_memory_fraction(ratio: float) -> None: def colo_set_process_memory_fraction(ratio: float) -> None:
...@@ -44,97 +80,3 @@ def colo_cuda_memory_capacity() -> float: ...@@ -44,97 +80,3 @@ def colo_cuda_memory_capacity() -> float:
Get cuda memory capacity of the current cuda. Get cuda memory capacity of the current cuda.
""" """
return torch.cuda.get_device_properties(get_current_device()).total_memory * _GLOBAL_CUDA_MEM_FRACTION return torch.cuda.get_device_properties(get_current_device()).total_memory * _GLOBAL_CUDA_MEM_FRACTION
def colo_model_data_tensor_move(src_t: Union[StatefulTensor, torch.Tensor], tgt_t: Union[StatefulTensor,
                                                                                         torch.Tensor]) -> None:
    """
    Copy the payload of ``src_t`` into ``tgt_t`` and release the source payload.

    The source and target tensors may reside on either CPU or GPU.
    NOTE: after this call the source tensor's payload is replaced by an empty
    tensor on the source's original device, so its old storage can be freed.

    Args:
        src_t (Union[StatefulTensor, torch.Tensor]): source tensor
        tgt_t (Union[StatefulTensor, torch.Tensor]): target tensor; assumed
            compatible in shape/dtype with the source (not checked here)
    """
    # unwrap the source payload: a StatefulTensor wraps its data in .payload
    if issubclass(type(src_t), StatefulTensor):
        src_t_payload = src_t.payload
    else:
        src_t_payload = src_t.data
    src_dev = src_t_payload.device
    if issubclass(type(tgt_t), StatefulTensor):
        tgt_t_payload = tgt_t.payload
    else:
        tgt_t_payload = tgt_t.data
    # device-to-device copy (handles CPU<->GPU transfer)
    tgt_t_payload.copy_(src_t_payload)
    # remove payload of src_t: replace it with an empty tensor on the
    # original device so the moved storage is released
    if issubclass(type(src_t), StatefulTensor):
        src_t.reset_payload(torch.tensor([], device=src_dev, dtype=src_t_payload.dtype))
    else:
        src_t.data = torch.tensor([], device=src_dev, dtype=src_t_payload.dtype)
def colo_model_data_tensor_move_inline(t: Union[StatefulTensor, torch.Tensor], target_device: Union[torch.device,
                                                                                                    int]) -> None:
    """
    Move a tensor to ``target_device`` in place.

    Args:
        t (Union[StatefulTensor, torch.Tensor]): the tensor to be moved
        target_device (Union[torch.device, int]): the target device; an int
            is interpreted as a CUDA device index.

    Raises:
        TypeError: if ``t`` is neither a ``torch.Tensor`` nor a ``StatefulTensor``.
    """
    if isinstance(t, torch.Tensor):
        t_payload = t
    elif issubclass(type(t), StatefulTensor):
        t_payload = t.payload
    else:
        # fixed: message previously named the wrong function and lacked the
        # f-prefix, so {type(t)} was never interpolated
        raise TypeError(f'colo_model_data_tensor_move_inline does not accept type {type(t)}')

    # accept any non-torch.device value as a CUDA index (the old `isinstance
    # (target_device, int)` check is consistent with the refactored version)
    if not isinstance(target_device, torch.device):
        target_device = torch.device(f'cuda:{target_device}')

    # deal with torch.device('cpu') and torch.device('cpu:0')
    if t_payload.device.type == target_device.type:
        return
    t_payload.data = t_payload.data.to(target_device)
def colo_model_data_move_to_cpu(t: Union[StatefulTensor, torch.Tensor]) -> None:
    """Move a model data tensor from GPU to CPU, in place.

    A no-op if the tensor already resides on the CPU.

    Args:
        t (Union[StatefulTensor, torch.Tensor]): the tensor to move

    Raises:
        TypeError: if ``t`` is neither a ``torch.Tensor`` nor a ``StatefulTensor``.
    """
    # plain-tensor case checked first; the original elif chain already treats
    # the two accepted types as mutually exclusive
    if isinstance(t, torch.Tensor):
        t_payload = t
    elif issubclass(type(t), StatefulTensor):
        t_payload = t.payload
    else:
        # fixed: message was a non-f-string with a typo ("dose"), so
        # {type(t)} was never interpolated
        raise TypeError(f'colo_model_data_move_to_cpu does not accept type {type(t)}')

    if t_payload.device.type == 'cpu':
        return

    # TODO() optimize the tensor moving with non-blocking
    t_payload.data = t_payload.data.cpu()
def colo_model_tensor_clone(t: Union[StatefulTensor, torch.Tensor], target_device: torch.device) -> torch.Tensor:
    """Clone a model data tensor onto ``target_device``.

    Args:
        t (Union[StatefulTensor, torch.Tensor]): a model data tensor
        target_device (torch.device): the target device

    Returns:
        torch.Tensor: a copy of the payload located on ``target_device``
    """
    if issubclass(type(t), StatefulTensor):
        source = t.payload
    else:
        source = t
    return source.to(target_device)
...@@ -3,7 +3,7 @@ from typing import List, Optional ...@@ -3,7 +3,7 @@ from typing import List, Optional
import torch import torch
import torch.distributed as dist import torch.distributed as dist
from colossalai.utils import get_current_device from colossalai.utils import get_current_device
from colossalai.utils.memory_utils.utils import colo_model_data_tensor_move_inline from colossalai.zero.shard_utils.tensor_utils import colo_model_data_tensor_move_inline
from colossalai.zero.shard_utils import BaseShardStrategy from colossalai.zero.shard_utils import BaseShardStrategy
from colossalai.zero.shard_utils.commons import get_shard from colossalai.zero.shard_utils.commons import get_shard
from colossalai.zero.sharded_param.sharded_tensor import ShardedTensor from colossalai.zero.sharded_param.sharded_tensor import ShardedTensor
......
import torch
from colossalai.zero.sharded_param.tensorful_state import StatefulTensor
from typing import Union, Tuple
def colo_tensor_mem_usage(tensor: Union[torch.Tensor, StatefulTensor]) -> Tuple[int, int]:
    """Return the memory footprint of ``tensor`` as ``(cuda_bytes, cpu_bytes)``.

    The entry matching the device the payload lives on holds
    ``numel * element_size``; the other entry is zero. Unsupported inputs
    (neither ``torch.Tensor`` nor ``StatefulTensor``) yield ``(0, 0)``.
    """
    if issubclass(type(tensor), StatefulTensor):
        payload = tensor.payload
    elif isinstance(tensor, torch.Tensor):
        payload = tensor
    else:
        return 0, 0

    nbytes = payload.numel() * payload.element_size()
    if payload.device.type == 'cuda':
        return nbytes, 0
    if payload.device.type == 'cpu':
        return 0, nbytes
    return 0, 0
def colo_model_data_tensor_move(src_t: Union[StatefulTensor, torch.Tensor], tgt_t: Union[StatefulTensor,
                                                                                         torch.Tensor]) -> None:
    """
    Copy the payload of ``src_t`` into ``tgt_t`` and release the source payload.

    The source and target tensors may reside on either CPU or GPU.
    NOTE: after this call the source tensor's payload is replaced by an empty
    tensor on the source's original device.

    Args:
        src_t (Union[StatefulTensor, torch.Tensor]): source tensor
        tgt_t (Union[StatefulTensor, torch.Tensor]): target tensor
    """
    src_is_stateful = issubclass(type(src_t), StatefulTensor)
    src_payload = src_t.payload if src_is_stateful else src_t.data
    src_device = src_payload.device

    tgt_payload = tgt_t.payload if issubclass(type(tgt_t), StatefulTensor) else tgt_t.data
    tgt_payload.copy_(src_payload)

    # free the source payload after the copy, keeping device and dtype
    empty = torch.tensor([], device=src_device, dtype=src_payload.dtype)
    if src_is_stateful:
        src_t.reset_payload(empty)
    else:
        src_t.data = empty
def colo_model_data_tensor_move_inline(t: Union[StatefulTensor, torch.Tensor], target_device: Union[torch.device,
                                                                                                    int]) -> None:
    """
    Move a tensor to ``target_device`` in place.

    Args:
        t (Union[StatefulTensor, torch.Tensor]): the tensor to be moved
        target_device (Union[torch.device, int]): the target device; an int
            is interpreted as a CUDA device index.

    Raises:
        TypeError: if ``t`` is neither a ``torch.Tensor`` nor a ``StatefulTensor``.
    """
    if isinstance(t, torch.Tensor):
        t_payload = t
    elif issubclass(type(t), StatefulTensor):
        t_payload = t.payload
    else:
        # fixed: message previously named the wrong function and lacked the
        # f-prefix, so {type(t)} was never interpolated
        raise TypeError(f'colo_model_data_tensor_move_inline does not accept type {type(t)}')

    if not isinstance(target_device, torch.device):
        target_device = torch.device(f'cuda:{target_device}')

    # deal with torch.device('cpu') and torch.device('cpu:0')
    if t_payload.device.type == target_device.type:
        return
    t_payload.data = t_payload.data.to(target_device)
def colo_model_data_move_to_cpu(t: Union[StatefulTensor, torch.Tensor]) -> None:
    """Move a model data tensor from GPU to CPU, in place.

    A no-op if the tensor already resides on the CPU.

    Args:
        t (Union[StatefulTensor, torch.Tensor]): the tensor to move

    Raises:
        TypeError: if ``t`` is neither a ``torch.Tensor`` nor a ``StatefulTensor``.
    """
    # plain-tensor case checked first; the original elif chain already treats
    # the two accepted types as mutually exclusive
    if isinstance(t, torch.Tensor):
        t_payload = t
    elif issubclass(type(t), StatefulTensor):
        t_payload = t.payload
    else:
        # fixed: message was a non-f-string with a typo ("dose"), so
        # {type(t)} was never interpolated
        raise TypeError(f'colo_model_data_move_to_cpu does not accept type {type(t)}')

    if t_payload.device.type == 'cpu':
        return

    # TODO() optimize the tensor moving with non-blocking
    t_payload.data = t_payload.data.cpu()
def colo_model_tensor_clone(t: Union[StatefulTensor, torch.Tensor], target_device: torch.device) -> torch.Tensor:
    """Clone a model data tensor onto ``target_device``.

    Args:
        t (Union[StatefulTensor, torch.Tensor]): a model data tensor
        target_device (torch.device): the target device

    Returns:
        torch.Tensor: a copy of the payload located on ``target_device``
    """
    payload = t.payload if issubclass(type(t), StatefulTensor) else t
    return payload.to(target_device)
...@@ -16,8 +16,9 @@ from colossalai.utils import get_current_device ...@@ -16,8 +16,9 @@ from colossalai.utils import get_current_device
from colossalai.utils.memory_tracer.memstats_collector import MemStatsCollector from colossalai.utils.memory_tracer.memstats_collector import MemStatsCollector
from colossalai.utils.memory_tracer.model_data_memtracer import \ from colossalai.utils.memory_tracer.model_data_memtracer import \
GLOBAL_MODEL_DATA_TRACER GLOBAL_MODEL_DATA_TRACER
from colossalai.utils.memory_utils.utils import (colo_cuda_memory_capacity, colo_model_data_move_to_cpu) from colossalai.utils.memory_utils.utils import colo_cuda_memory_capacity
from colossalai.zero.shard_utils import BaseShardStrategy from colossalai.zero.shard_utils import BaseShardStrategy
from colossalai.zero.shard_utils.tensor_utils import colo_model_data_move_to_cpu
from colossalai.zero.sharded_model.reduce_scatter import ReduceScatterBucketer from colossalai.zero.sharded_model.reduce_scatter import ReduceScatterBucketer
from colossalai.zero.sharded_param.tensorful_state import TensorState from colossalai.zero.sharded_param.tensorful_state import TensorState
from torch.distributed import ProcessGroup from torch.distributed import ProcessGroup
...@@ -160,11 +161,13 @@ class ShardedModelV2(nn.Module): ...@@ -160,11 +161,13 @@ class ShardedModelV2(nn.Module):
with open(filename, 'w+') as f: with open(filename, 'w+') as f:
f.write(f'cuda reserved {torch.cuda.memory_reserved(get_current_device())/1e9} GB\n') f.write(f'cuda reserved {torch.cuda.memory_reserved(get_current_device())/1e9} GB\n')
f.write(f'cuda max allocated {torch.cuda.max_memory_allocated(get_current_device())/1e9} GB\n') f.write(f'cuda max allocated {torch.cuda.max_memory_allocated(get_current_device())/1e9} GB\n')
f.write('model data\n') f.write('CUDA model data (GB)\n')
f.write(str(self._memstats_collector.model_data_cuda_GB)) f.write(str(self._memstats_collector.model_data_cuda_list('cuda', 'GB')))
f.write('\n') f.write('\n')
f.write('non model data\n') f.write('CUDA non model data (GB)\n')
f.write(str(self._memstats_collector.non_model_data_cuda_GB)) f.write(str(self._memstats_collector.non_model_data_cuda_list('cuda', 'GB')))
f.write('CPU non model data (GB)\n')
f.write(str(self._memstats_collector.non_model_data_cuda_list('cpu', 'GB')))
f.write('\n') f.write('\n')
def _pre_forward_operations(self): def _pre_forward_operations(self):
...@@ -209,7 +212,8 @@ class ShardedModelV2(nn.Module): ...@@ -209,7 +212,8 @@ class ShardedModelV2(nn.Module):
# the way to calculate margin space is based on the assumption that # the way to calculate margin space is based on the assumption that
# model data is fixed in cuda during training. # model data is fixed in cuda during training.
# cuda margin space can be used to store OS. # cuda margin space can be used to store OS.
self._cuda_margin_space = colo_cuda_memory_capacity() - max(self._memstats_collector.overall_cuda) self._cuda_margin_space = colo_cuda_memory_capacity() - max(
self._memstats_collector.overall_mem_stats('cuda'))
self._iter_cnter += 1 self._iter_cnter += 1
@torch.no_grad() @torch.no_grad()
......
...@@ -12,12 +12,13 @@ from colossalai.logging import get_dist_logger ...@@ -12,12 +12,13 @@ from colossalai.logging import get_dist_logger
from colossalai.nn.optimizer import ColossalaiOptimizer from colossalai.nn.optimizer import ColossalaiOptimizer
from colossalai.utils.memory_tracer.model_data_memtracer import \ from colossalai.utils.memory_tracer.model_data_memtracer import \
GLOBAL_MODEL_DATA_TRACER GLOBAL_MODEL_DATA_TRACER
from colossalai.utils.memory_utils.utils import (colo_model_data_tensor_move_inline, colo_model_tensor_clone, from colossalai.zero.shard_utils.tensor_utils import (colo_model_tensor_clone, colo_tensor_mem_usage)
colo_tensor_mem_usage)
from colossalai.zero.sharded_model import ShardedModelV2 from colossalai.zero.sharded_model import ShardedModelV2
from colossalai.zero.sharded_model._utils import cast_tensor_to_fp32 from colossalai.zero.sharded_model._utils import cast_tensor_to_fp32
from colossalai.zero.sharded_optim._utils import has_inf_or_nan from colossalai.zero.sharded_optim._utils import has_inf_or_nan
from colossalai.zero.sharded_param.tensorful_state import (StatefulTensor, TensorState) from colossalai.zero.sharded_param.tensorful_state import (StatefulTensor, TensorState)
from colossalai.zero.shard_utils.tensor_utils import colo_model_data_tensor_move_inline
from torch import Tensor from torch import Tensor
from torch.distributed import ProcessGroup from torch.distributed import ProcessGroup
from torch.nn.parameter import Parameter from torch.nn.parameter import Parameter
......
import torch import torch
import torch.distributed as dist
from colossalai.zero.sharded_param import ShardedTensor from colossalai.zero.sharded_param import ShardedTensor
from typing import Optional, Tuple from typing import Optional, Tuple
from colossalai.utils.memory_utils.utils import colo_tensor_mem_usage from colossalai.zero.shard_utils.tensor_utils import colo_tensor_mem_usage
from .tensorful_state import StatefulTensor, TensorState from .tensorful_state import StatefulTensor, TensorState
......
from colossalai.utils.memory_utils.utils import colo_model_data_tensor_move, colo_model_data_tensor_move_inline from colossalai.zero.shard_utils.tensor_utils import colo_model_data_tensor_move, colo_model_data_tensor_move_inline
from colossalai.utils import free_port from colossalai.utils import free_port
from colossalai.testing import rerun_on_exception from colossalai.testing import rerun_on_exception
from colossalai.zero.sharded_param import ShardedTensor from colossalai.zero.sharded_param import ShardedTensor
......
import pytest import pytest
from colossalai.utils.cuda import get_current_device from colossalai.utils.cuda import get_current_device
from colossalai.utils.memory_utils.utils import colo_model_data_tensor_move, colo_model_data_tensor_move_inline from colossalai.zero.shard_utils.tensor_utils import colo_model_data_tensor_move, colo_model_data_tensor_move_inline
from colossalai.utils import free_port from colossalai.utils import free_port
from colossalai.zero.sharded_param import ShardedTensor from colossalai.zero.sharded_param import ShardedTensor
import colossalai import colossalai
......
...@@ -13,7 +13,7 @@ from colossalai.utils import free_port ...@@ -13,7 +13,7 @@ from colossalai.utils import free_port
from colossalai.utils.cuda import get_current_device from colossalai.utils.cuda import get_current_device
from colossalai.utils.memory_tracer.model_data_memtracer import \ from colossalai.utils.memory_tracer.model_data_memtracer import \
colo_model_mem_usage colo_model_mem_usage
from colossalai.utils.memory_utils.memory_monitor import colo_cuda_memory_used from colossalai.utils.memory_utils.utils import colo_device_memory_used
from colossalai.zero.init_ctx import ZeroInitContext from colossalai.zero.init_ctx import ZeroInitContext
from colossalai.zero.shard_utils import (BucketTensorShardStrategy, TensorShardStrategy) from colossalai.zero.shard_utils import (BucketTensorShardStrategy, TensorShardStrategy)
from tests.components_to_test.registry import non_distributed_component_funcs from tests.components_to_test.registry import non_distributed_component_funcs
...@@ -51,10 +51,10 @@ def run_model_test(init_device_type, shard_strategy_class): ...@@ -51,10 +51,10 @@ def run_model_test(init_device_type, shard_strategy_class):
assert param.colo_attr.sharded_data_tensor.payload.device.type == init_device.type, \ assert param.colo_attr.sharded_data_tensor.payload.device.type == init_device.type, \
f'{param.colo_attr.sharded_data_tensor.payload.device.type} vs. {init_device.type}' f'{param.colo_attr.sharded_data_tensor.payload.device.type} vs. {init_device.type}'
cuda_mem_use, cpu_mem_use = colo_model_mem_usage(model) cuda_mem_use, _ = colo_model_mem_usage(model)
model_data_cuda_mem_MB = cuda_mem_use / 1e6 model_data_cuda_mem_MB = cuda_mem_use / 1e6
logger.info(f"Existing ZeRO Context.\nModel Data CUDA Memory {model_data_cuda_mem_MB} MB", ranks=[0]) logger.info(f"Existing ZeRO Context.\nModel Data CUDA Memory {model_data_cuda_mem_MB} MB", ranks=[0])
sys_cuda_mem_MB = colo_cuda_memory_used() / 1e6 sys_cuda_mem_MB = colo_device_memory_used(get_current_device()) / 1e6
logger.info(f"System CUDA Memory Usage {sys_cuda_mem_MB} MB", ranks=[0]) logger.info(f"System CUDA Memory Usage {sys_cuda_mem_MB} MB", ranks=[0])
logger.info(f"Model Number Parameter {model_numel_tensor.numpy()[0]/1e6} M", ranks=[0]) logger.info(f"Model Number Parameter {model_numel_tensor.numpy()[0]/1e6} M", ranks=[0])
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment