Unverified Commit 8823cc48 authored by Frank Lee, committed by GitHub

Merge pull request #5310 from hpcaitech/feature/npu

Feature/npu
parents bce9499e 73f4dc57
......@@ -3,7 +3,7 @@
import time
from typing import Tuple
from .device import synchronize
from colossalai.accelerator import get_accelerator
class Timer:
......@@ -21,13 +21,13 @@ class Timer:
@property
def current_time(self) -> float:
synchronize()
get_accelerator().synchronize()
return time.time()
def start(self):
"""Firstly synchronize cuda, reset the clock and then start the timer."""
self._elapsed = 0
synchronize()
get_accelerator().synchronize()
self._start_time = time.time()
self._started = True
......@@ -44,7 +44,7 @@ class Timer:
Returns:
int: Start-stop interval.
"""
synchronize()
get_accelerator().synchronize()
end_time = time.time()
elapsed = end_time - self._start_time
if keep_in_history:
......
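The hunks above swap the CUDA-only `synchronize()` helper for the backend-agnostic `get_accelerator().synchronize()`. Since kernels run asynchronously with respect to the host, the device must be drained before reading the wall clock, or the measured interval reflects little more than kernel launch time. A minimal sketch of the same discipline; the `timed` helper is illustrative, not part of the diff:

```python
import time

from colossalai.accelerator import get_accelerator


def timed(fn, *args, **kwargs):
    # Hypothetical helper showing the synchronize-before-timestamp pattern.
    acc = get_accelerator()
    acc.synchronize()               # drain already-queued kernels (CUDA or NPU)
    start = time.time()
    result = fn(*args, **kwargs)
    acc.synchronize()               # make sure fn's kernels actually finished
    return result, time.time() - start
```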
......@@ -6,8 +6,7 @@ import torch
import torch.distributed as dist
from torch.distributed import ProcessGroup
from colossalai.utils import get_current_device
from colossalai.utils.device import IS_NPU_AVAILABLE
from colossalai.accelerator import get_accelerator
class TensorState(Enum):
......@@ -107,7 +106,7 @@ class Chunk:
self.valid_end = self.shard_size
self.dtype = dtype
device = init_device or get_current_device()
device = init_device or get_accelerator().get_current_device()
# chunk_temp is a global chunk, which only exists during building the chunks.
self.chunk_temp = torch.zeros(chunk_size, dtype=dtype, device=device) # keep all zero
......@@ -125,7 +124,7 @@ class Chunk:
# configure the init device of the shard
# no-offload default: fp16, fp32 -> CUDA
# offload default: fp16, fp32 -> CPU
self.shard_device = torch.device("cpu") if cpu_shard_init else get_current_device()
self.shard_device = torch.device("cpu") if cpu_shard_init else get_accelerator().get_current_device()
self.chunk_mem = self.chunk_size * self.chunk_temp.element_size()
self.shard_mem = self.chunk_mem // self.pg_size
......@@ -191,9 +190,8 @@ class Chunk:
def device_type(self) -> str:
if self.chunk_temp is not None:
return self.chunk_temp.device.type
else:
if self.is_gathered or self.cuda_shard is not None:
return "npu" if IS_NPU_AVAILABLE else "cuda"
elif self.is_gathered or self.cuda_shard is not None:
return get_accelerator().name
else:
return "cpu"
......@@ -297,7 +295,7 @@ class Chunk:
self.valid_end = self.utilized_size - self.shard_begin
if self.chunk_temp.device.type == "cpu":
self.cuda_global_chunk = self.chunk_temp.to(get_current_device())
self.cuda_global_chunk = self.chunk_temp.to(get_accelerator().get_current_device())
self.__update_tensors_ptr()
else:
self.cuda_global_chunk = self.chunk_temp
......@@ -334,12 +332,12 @@ class Chunk:
return
if device.type == "cuda" or device.type == "npu":
assert device == get_current_device(), "can't move chunk to another device"
assert device == get_accelerator().get_current_device(), "can't move chunk to another device"
if self.cuda_shard:
return
self.cuda_shard = self.cpu_shard.to(get_current_device())
self.cuda_shard = self.cpu_shard.to(get_accelerator().get_current_device())
if not self.pin_memory:
self.cpu_shard = None
......@@ -394,7 +392,9 @@ class Chunk:
if self.extra_dp_group is not None:
dist.all_reduce(self.cuda_global_chunk, group=self.extra_dp_group)
else:
self.cuda_shard = torch.empty(self.shard_size, dtype=self.dtype, device=get_current_device())
self.cuda_shard = torch.empty(
self.shard_size, dtype=self.dtype, device=get_accelerator().get_current_device()
)
input_list = list(torch.chunk(self.cuda_global_chunk, chunks=self.pg_size, dim=0))
dist.reduce_scatter(self.cuda_shard, input_list, group=self.torch_pg)
......@@ -533,7 +533,7 @@ class Chunk:
# only be called when optimizer state is in CPU memory
# the grad and param should be in the same device
assert self.cuda_shard is None
temp = optim_chunk.cpu_shard.to(get_current_device())
temp = optim_chunk.cpu_shard.to(get_accelerator().get_current_device())
# avoid to transform FP32 in CPU
self.cuda_shard = temp.to(self.dtype)
......@@ -631,7 +631,7 @@ class Chunk:
grad_chunk.valid_end = self.valid_end
if grad_chunk.chunk_temp.device.type == "cpu":
grad_chunk.cuda_global_chunk = grad_chunk.chunk_temp.to(get_current_device())
grad_chunk.cuda_global_chunk = grad_chunk.chunk_temp.to(get_accelerator().get_current_device())
else:
grad_chunk.cuda_global_chunk = grad_chunk.chunk_temp
grad_chunk.chunk_temp = None
......
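Every `get_current_device()` in `Chunk` now routes through `get_accelerator().get_current_device()`, which returns the `torch.device` of whichever backend the process is bound to, so the CPU-to-device shard moves need no per-backend branching. A hedged sketch of the move idiom, assuming a CUDA or NPU build of PyTorch:

```python
import torch

from colossalai.accelerator import get_accelerator

# Mirrors Chunk.shard_move: a pinned CPU shard is copied to the active device.
cpu_shard = torch.zeros(1024, dtype=torch.float16).pin_memory()
device = get_accelerator().get_current_device()   # e.g. cuda:0 or npu:0
device_shard = cpu_shard.to(device)               # identical call on both backends
```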
......@@ -5,7 +5,8 @@ import torch
import torch.distributed as dist
from torch.distributed import ProcessGroup
from colossalai.utils import free_storage, get_current_device
from colossalai.accelerator import get_accelerator
from colossalai.utils import free_storage
from .chunk import Chunk, ChunkFullError, TensorState
......@@ -20,7 +21,7 @@ class ChunkManager:
"""
def __init__(self, chunk_configuration, init_device: Optional[torch.device] = None) -> None:
self.device = init_device or get_current_device()
self.device = init_device or get_accelerator().get_current_device()
self.dp_degree_chunk_size_dict: Dict[int, int] = dict()
self.kwargs_config = chunk_configuration
for k, v in self.kwargs_config.items():
......@@ -107,7 +108,7 @@ class ChunkManager:
return
self.__sub_memory_usage(chunk.memory_usage)
if chunk.device_type == "cpu":
chunk.shard_move(get_current_device())
chunk.shard_move(get_accelerator().get_current_device())
self.__add_accessed_chunk(chunk)
self.__add_memory_usage(chunk.memory_usage)
......@@ -276,7 +277,10 @@ class ChunkManager:
accumulated_grad = chunk.grad_chunk.cuda_shard.clone().detach().mul_(chunk.pg_size)
else:
accumulated_grad = (
chunk.grad_chunk.cpu_shard.to(get_current_device()).clone().detach().mul_(chunk.pg_size)
chunk.grad_chunk.cpu_shard.to(get_accelerator().get_current_device())
.clone()
.detach()
.mul_(chunk.pg_size)
)
accumulated_grad_gathered = False
......
......@@ -10,6 +10,7 @@ import torch.nn as nn
from torch.distributed import ProcessGroup
from torch.distributed.distributed_c10d import _get_default_group
from colossalai.accelerator import get_accelerator
from colossalai.checkpoint_io.utils import StateDictSharder, gather_distributed_param
from colossalai.interface import ModelWrapper
from colossalai.lazy import LazyTensor
......@@ -27,7 +28,7 @@ from colossalai.tensor.d_tensor import (
is_distributed_tensor,
)
from colossalai.tensor.param_op_hook import ColoParamOpHookManager
from colossalai.utils import _cast_float, free_storage, get_current_device, is_ddp_ignored
from colossalai.utils import _cast_float, free_storage, is_ddp_ignored
from .chunk import Chunk, ChunkManager, TensorState, init_chunk_manager
from .gemini_hook import GeminiZeROHook
......@@ -766,7 +767,7 @@ class GeminiDDP(ModelWrapper):
# move ignored parameters to CUDA
if is_ddp_ignored(p):
p.data = p.data.to(device=get_current_device(), dtype=self.mixed_precision)
p.data = p.data.to(device=get_accelerator().get_current_device(), dtype=self.mixed_precision)
continue
# create a fp16 parameter
......@@ -815,7 +816,7 @@ class GeminiDDP(ModelWrapper):
for buffer in self.module.buffers():
if isinstance(buffer, LazyTensor):
buffer.materialize()
buffer.data = buffer.to(get_current_device())
buffer.data = buffer.to(get_accelerator().get_current_device())
if torch.is_floating_point(buffer):
buffer.data = buffer.to(self.mixed_precision)
......
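The `GeminiDDP` hunks apply the same substitution where ignored parameters and materialized lazy buffers are staged onto the accelerator and cast to the configured mixed precision. A simplified sketch of the buffer-staging loop, not the full `GeminiDDP` logic:

```python
import torch

from colossalai.accelerator import get_accelerator


def stage_buffers(module: torch.nn.Module, mixed_precision: torch.dtype = torch.float16):
    # Simplified: move every buffer to the accelerator, downcast float buffers only.
    device = get_accelerator().get_current_device()
    for buffer in module.buffers():
        buffer.data = buffer.to(device)
        if torch.is_floating_point(buffer):
            buffer.data = buffer.to(mixed_precision)
```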
......@@ -11,6 +11,7 @@ from torch.distributed import ProcessGroup
from torch.nn import Parameter
from torch.optim import Optimizer
from colossalai.accelerator import get_accelerator
from colossalai.amp.naive_amp.mixed_precision_mixin import BF16MixedPrecisionMixin, FP16MixedPrecisionMixin
from colossalai.checkpoint_io.utils import StateDictSharder, gather_distributed_param
from colossalai.interface import OptimizerWrapper
......@@ -26,7 +27,7 @@ from colossalai.tensor.d_tensor import (
is_customized_distributed_tensor,
is_distributed_tensor,
)
from colossalai.utils import disposable, get_current_device, is_ddp_ignored
from colossalai.utils import disposable, is_ddp_ignored
from .chunk import Chunk, ChunkManager
from .gemini_ddp import GeminiDDP
......@@ -233,7 +234,7 @@ class GeminiOptimizer(OptimizerWrapper):
grad_chunk.l2_norm = None # clear l2 norm
comm_buffer = torch.zeros(1, dtype=torch.float, device=get_current_device())
comm_buffer = torch.zeros(1, dtype=torch.float, device=get_accelerator().get_current_device())
for group, part_norm in group_to_norm.items():
comm_buffer.fill_(part_norm)
dist.all_reduce(comm_buffer, group=group)
......@@ -314,10 +315,10 @@ class GeminiOptimizer(OptimizerWrapper):
continue
if fp32_params_used_cuda_margin_mem + chunk32.payload_mem < fp32_params_available_cuda_margin_mem:
self.chunk_manager.move_chunk(chunk32, get_current_device())
self.chunk_manager.move_chunk(chunk32, get_accelerator().get_current_device())
# stores grad now
self.chunk_manager.move_chunk(chunk16, get_current_device())
self.module.set_chunk_grad_device(chunk16, get_current_device())
self.chunk_manager.move_chunk(chunk16, get_accelerator().get_current_device())
self.module.set_chunk_grad_device(chunk16, get_accelerator().get_current_device())
fp32_params_used_cuda_margin_mem += chunk32.payload_mem
for group in self.param_groups:
......@@ -328,7 +329,7 @@ class GeminiOptimizer(OptimizerWrapper):
state = self.optim.state[fake_param]
for k, v in state.items():
if isinstance(v, torch.Tensor):
state[k] = v.to(get_current_device())
state[k] = v.to(get_accelerator().get_current_device())
def _register_states_(self):
for group in self.optim.param_groups:
......@@ -551,7 +552,7 @@ class GeminiOptimizer(OptimizerWrapper):
self,
param_id: int,
state_names: list,
device: torch.device = get_current_device(),
device: torch.device = get_accelerator().get_current_device(),
dtype: torch.dtype = torch.float32,
) -> torch.Tensor:
"""
......
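In `GeminiOptimizer`, the one-element `comm_buffer` used for gradient-norm aggregation is now allocated on the accelerator's current device. The underlying pattern is a scalar all-reduce over a process group; a sketch assuming the distributed backend is already initialized:

```python
import torch
import torch.distributed as dist

from colossalai.accelerator import get_accelerator


def reduce_scalar(value: float, group=None) -> float:
    # Stage a Python float on the device, all-reduce it, read it back.
    buf = torch.zeros(1, dtype=torch.float, device=get_accelerator().get_current_device())
    buf.fill_(value)
    dist.all_reduce(buf, group=group)
    return buf.item()
```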
from typing import Optional
from colossalai.utils import get_current_device
from colossalai.accelerator import get_accelerator
from colossalai.zero.gemini.chunk import ChunkManager
from .memory_stats import MemStats
......@@ -33,4 +33,4 @@ class ChunkMemStatsCollector(MemStatsCollector):
def cuda_margin_mem(self) -> float:
from colossalai.legacy.utils.memory import colo_device_memory_capacity
return colo_device_memory_capacity(get_current_device()) - self._memstats.max_overall_cuda
return colo_device_memory_capacity(get_accelerator().get_current_device()) - self._memstats.max_overall_cuda
......@@ -5,7 +5,7 @@ from time import sleep, time
import torch
from colossalai.utils import get_current_device
from colossalai.accelerator import get_accelerator
class MemoryMonitor:
......@@ -77,7 +77,7 @@ class AsyncMemoryMonitor(MemoryMonitor):
super().__init__()
self.keep_measuring = False
current_device = get_current_device()
current_device = get_accelerator().get_current_device()
def _set_cuda_device():
torch.cuda.set_device(current_device)
......@@ -116,7 +116,7 @@ class AsyncMemoryMonitor(MemoryMonitor):
while self.keep_measuring:
max_usage = max(
max_usage,
colo_device_memory_used(get_current_device()),
colo_device_memory_used(get_accelerator().get_current_device()),
)
sleep(self.interval)
return max_usage
......
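`AsyncMemoryMonitor` polls device memory from a worker thread; only the device lookup changes in this diff. A hedged sketch of the sampling loop; the import path for `colo_device_memory_used` is assumed to match the legacy memory utilities used elsewhere in this commit:

```python
from time import sleep

from colossalai.accelerator import get_accelerator
from colossalai.legacy.utils.memory import colo_device_memory_used  # assumed path


def sample_peak_usage(should_stop, interval: float = 0.01) -> int:
    # Sketch of the worker loop: poll used bytes until should_stop() flips.
    device = get_accelerator().get_current_device()
    peak = 0
    while not should_stop():
        peak = max(peak, colo_device_memory_used(device))
        sleep(interval)
    return peak
```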
......@@ -6,8 +6,8 @@ from typing import Dict, List, Optional, Tuple, Type
import torch
from colossalai.utils import get_current_device
from colossalai.utils.memory import colo_device_memory_capacity
from colossalai.accelerator import get_accelerator
from colossalai.legacy.utils.memory import colo_device_memory_capacity
from colossalai.zero.gemini.chunk import Chunk
from .chunk import Chunk, ChunkManager
......@@ -85,7 +85,7 @@ class StaticPlacementPolicy(PlacementPolicy):
# init offload optim settings
# keep gathered chunks are in CUDA
if chunk.keep_gathered or offloaded_optim_chunk_mem >= offload_optim_chunk_mem:
device = get_current_device()
device = get_accelerator().get_current_device()
else:
device = torch.device("cpu")
# real offloaded mem is chunk.shard_mem, for simplicity we use chunk mem here
......@@ -140,7 +140,7 @@ class AutoPlacementPolicy(PlacementPolicy):
int: the volume of memory that is evicted
"""
start = time()
cuda_capacity = colo_device_memory_capacity(get_current_device())
cuda_capacity = colo_device_memory_capacity(get_accelerator().get_current_device())
used_cuda_model_data = self.chunk_manager.total_mem["cuda"]
if warmup:
# We designate a part of CUDA memory for model data in warmup iterations.
......@@ -194,7 +194,7 @@ class AutoPlacementPolicy(PlacementPolicy):
# init offload optim settings
# keep gathered chunks are in CUDA
if chunk.keep_gathered:
grads_device_map[p] = get_current_device()
grads_device_map[p] = get_accelerator().get_current_device()
else:
grads_device_map[p] = torch.device("cpu")
......
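Both placement policies use the device handle to decide where a chunk's optimizer state lives: keep-gathered chunks stay on the accelerator, and chunks past the offload budget go to CPU. The decision in `StaticPlacementPolicy` reduces to roughly this (field names per the diff; the helper itself is illustrative):

```python
import torch

from colossalai.accelerator import get_accelerator


def pick_offload_device(keep_gathered: bool, offloaded_mem: int, budget: int) -> torch.device:
    # Stay on the accelerator while keep_gathered is set or the CPU budget is spent.
    if keep_gathered or offloaded_mem >= budget:
        return get_accelerator().get_current_device()
    return torch.device("cpu")
```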
......@@ -6,7 +6,7 @@ import torch
import torch.distributed as dist
import torch.nn as nn
from colossalai.utils import get_current_device
from colossalai.accelerator import get_accelerator
from .chunk import Chunk
......@@ -18,11 +18,11 @@ def get_temp_total_chunk_on_cuda(chunk: Chunk, dtype: torch.dtype):
if chunk.cuda_shard is not None:
shard_temp = chunk.cuda_shard
else:
shard_temp = chunk.cpu_shard.to(get_current_device())
shard_temp = chunk.cpu_shard.to(get_accelerator().get_current_device())
shard_temp = shard_temp.to(dtype)
total_temp = torch.zeros(chunk.chunk_size, dtype=dtype, device=get_current_device())
total_temp = torch.zeros(chunk.chunk_size, dtype=dtype, device=get_accelerator().get_current_device())
gather_list = list(torch.chunk(input=total_temp, chunks=chunk.pg_size, dim=0))
dist.all_gather(tensor_list=gather_list, tensor=shard_temp, group=chunk.torch_pg)
......
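`get_temp_total_chunk_on_cuda` uses a classic collective trick: preallocate the full gathered buffer once, view it as `pg_size` equal chunks, and let `all_gather` write each rank's shard directly into its slice, so no separate concatenation is needed. A standalone sketch of that pattern:

```python
import torch
import torch.distributed as dist

from colossalai.accelerator import get_accelerator


def gather_shard(shard: torch.Tensor, pg_size: int, group=None) -> torch.Tensor:
    # Allocate the destination once, then gather in place into views of it.
    total = torch.zeros(
        shard.numel() * pg_size, dtype=shard.dtype, device=get_accelerator().get_current_device()
    )
    gather_list = list(torch.chunk(total, chunks=pg_size, dim=0))
    dist.all_gather(tensor_list=gather_list, tensor=shard, group=group)
    return total
```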
......@@ -12,7 +12,7 @@ from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from torch.distributed import ProcessGroup
from torch.optim import Optimizer
import colossalai.utils.device as device_utils
from colossalai.accelerator import get_accelerator
from colossalai.amp.naive_amp.mixed_precision_mixin import (
BF16MixedPrecisionMixin,
FP16MixedPrecisionMixin,
......@@ -22,9 +22,6 @@ from colossalai.interface import OptimizerWrapper
from colossalai.logging import get_dist_logger
from colossalai.tensor.moe_tensor.api import is_moe_tensor
# from colossalai.tensor import ColoParameter, ProcessGroup
from colossalai.utils.device import IS_NPU_AVAILABLE, get_current_device
from ._utils import calculate_global_norm_from_list, flatten, has_inf_or_nan, release_param_grad, sync_tensor
from .bookkeeping import BucketStore, GradientStore, ParameterStore
......@@ -183,7 +180,7 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
# initialize communication stream for
# communication-computation overlapping
if self._overlap_communication:
self._comm_stream = device_utils.Stream()
self._comm_stream = get_accelerator().Stream()
# reduction hook is only used if overlapping communication
# or stage 2 is used
......@@ -217,7 +214,7 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
return len(self._working_param_groups)
def _sanity_checks(self):
assert torch.cuda.is_available() or IS_NPU_AVAILABLE, "device is required"
assert get_accelerator().name in ["cuda", "npu"], "device is required"
for param_group in self.optim.param_groups:
group_params = param_group["params"]
for param in group_params:
......@@ -228,7 +225,7 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
def _create_master_param_current_rank(self, param_list):
# split each param evenly by world size
params_current_rank = []
device = "cpu" if self._cpu_offload else get_current_device()
device = "cpu" if self._cpu_offload else get_accelerator().get_current_device()
for param in param_list:
padding_size = (self._world_size - param.numel() % self._world_size) % self._world_size
......@@ -340,11 +337,11 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
if len(moe_grad_list) > 0:
moe_flat_grads.record_stream(stream)
# waiting for ops in the default stream finishing
stream.wait_stream(device_utils.current_stream())
stream.wait_stream(get_accelerator().current_stream())
else:
stream = device_utils.current_stream()
stream = get_accelerator().current_stream()
with device_utils.stream(stream):
with get_accelerator().stream(stream):
group_id = self._bucket_store.current_group_id
if self.moe_extra_dp_pg is None:
......@@ -486,7 +483,7 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
# clear reduced grads
if self._overlap_communication:
device_utils.synchronize()
get_accelerator().synchronize()
self.zero_grad()
......@@ -505,7 +502,7 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
# clear reduced grads
if self._overlap_communication:
device_utils.synchronize()
get_accelerator().synchronize()
self.zero_grad()
......@@ -621,7 +618,7 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
release_param_grad(self._master_param_groups_of_current_rank[group_id])
# update working partition updated by the current rank
device = get_current_device()
device = get_accelerator().get_current_device()
for group_id in range(self.num_param_groups):
master_working_param = self.optim.param_groups[group_id]["params"]
for idx, splited_param in enumerate(master_working_param):
......@@ -661,7 +658,9 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
norm_type = float(norm_type)
if norm_type == inf:
total_norm = max(grad.data.abs().max() for grad in gradients)
total_norm_cuda = torch.tensor([float(total_norm)], device=get_current_device(), dtype=torch.float)
total_norm_cuda = torch.tensor(
[float(total_norm)], device=get_accelerator().get_current_device(), dtype=torch.float
)
dist.all_reduce(total_norm_cuda, op=torch.distributed.ReduceOp.MAX, group=self.dp_pg)
total_norm = total_norm_cuda.item()
......@@ -673,7 +672,7 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
# Sum across all model parallel GPUs.
total_norm_exponentiated_cuda = torch.tensor(
[float(total_norm_exponentiated)], device=get_current_device(), dtype=torch.float
[float(total_norm_exponentiated)], device=get_accelerator().get_current_device(), dtype=torch.float
)
torch.distributed.all_reduce(
total_norm_exponentiated_cuda, op=torch.distributed.ReduceOp.SUM, group=self.dp_pg
......@@ -765,7 +764,7 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
Dict: the pytorch form state_dict
"""
zero_state = dict()
device = get_current_device()
device = get_accelerator().get_current_device()
for param, state in self.optim.state.items():
zero_state[param] = copy.deepcopy(state)
for k, v in state.items():
......@@ -827,7 +826,7 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
ret_block = dict()
ret_block_size = 0
device = get_current_device()
device = get_accelerator().get_current_device()
local_states = self.optim.state_dict()["state"]
for param_idx, states in local_states.items():
current_block_size = 0
......
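The ZeRO optimizer's communication-computation overlap is ported wholesale: `device_utils.Stream()` becomes `get_accelerator().Stream()`, with matching `current_stream`/`stream` accessors, so the overlap machinery no longer references a specific backend. The shape of that pattern, as a sketch with a pre-flattened gradient buffer:

```python
import torch
import torch.distributed as dist

from colossalai.accelerator import get_accelerator

acc = get_accelerator()
comm_stream = acc.Stream()                  # side stream dedicated to collectives


def reduce_on_side_stream(flat_grads: torch.Tensor):
    flat_grads.record_stream(comm_stream)   # keep the buffer alive for the side stream
    comm_stream.wait_stream(acc.current_stream())  # order after default-stream writes
    with acc.stream(comm_stream):
        dist.all_reduce(flat_grads)         # overlaps with compute on the default stream
```

Note that the step path still calls `get_accelerator().synchronize()` before `zero_grad()`, which is what makes discarding the reduced gradients safe.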
......@@ -45,7 +45,6 @@ from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin, TorchDDPPlugin
from colossalai.cluster import DistCoordinator
from colossalai.nn.optimizer import HybridAdam
from colossalai.utils import get_current_device
```
## Define Plugin
Create a `HybridParallelPlugin` object and specify the desired parallelism strategies to be used. In this example, both pipeline parallelism and ZeRO-1 are used simultaneously.
......
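For reference, the plugin this tutorial describes is constructed like any other booster plugin. A minimal sketch; the hyperparameter values are illustrative, not prescriptive:

```python
import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import HybridParallelPlugin

colossalai.launch_from_torch(config={})

# Pipeline parallelism (two stages) combined with ZeRO-1, per the tutorial text.
plugin = HybridParallelPlugin(tp_size=1, pp_size=2, zero_stage=1, precision="fp16")
booster = Booster(plugin=plugin)
```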
......@@ -43,7 +43,6 @@ from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin, TorchDDPPlugin
from colossalai.cluster import DistCoordinator
from colossalai.nn.optimizer import HybridAdam
from colossalai.utils import get_current_device
```
### Define Plugin
Define a [`HybridParallelPlugin`](../basics/booster_plugins.md) object and specify the parallelism strategies to use. In this example, pipeline parallelism and ZeRO-1 are used simultaneously.
......
......@@ -16,10 +16,10 @@ from utils.global_vars import get_tensorboard_writer, get_timers, set_global_var
from utils.logger import Logger
import colossalai
from colossalai.accelerator import get_accelerator
from colossalai.context import ParallelMode
from colossalai.nn.parallel import zero_model_wrapper, zero_optim_wrapper
from colossalai.tensor import ProcessGroup, ShardSpec
from colossalai.utils import get_current_device
from colossalai.utils.model.colo_init_context import ColoInitContext
......@@ -53,7 +53,7 @@ def main():
set_global_variables(launch_time, args.tensorboard_path)
world_size = torch.distributed.get_world_size()
get_current_device()
get_accelerator().get_current_device()
# build model, optimizer and criterion
if args.distplan.startswith("CAI"):
......@@ -67,7 +67,10 @@ def main():
# build GPT model
with ColoInitContext(
device=get_current_device(), dtype=torch.half, default_dist_spec=default_dist_spec, default_pg=shard_pg
device=get_accelerator().get_current_device(),
dtype=torch.half,
default_dist_spec=default_dist_spec,
default_pg=shard_pg,
):
config, model, numel = get_model(args, logger)
......@@ -78,7 +81,7 @@ def main():
elif args.distplan == "CAI_Gemini":
gemini_config = dict(
strict_ddp_mode=args.tp_degree == 1,
device=get_current_device(),
device=get_accelerator().get_current_device(),
placement_policy=args.placement,
pin_memory=True,
hidden_dim=model.config.hidden_size,
......
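The GPT example feeds the accelerator device straight into its Gemini configuration. A trimmed sketch of that dictionary; the fields are copied from the diff, and the values other than `device` stand in for the script's command-line arguments:

```python
from colossalai.accelerator import get_accelerator

# Trimmed version of the CAI_Gemini branch's config in the example script.
gemini_config = dict(
    strict_ddp_mode=True,                            # args.tp_degree == 1 in the diff
    device=get_accelerator().get_current_device(),   # cuda:N or npu:N
    placement_policy="auto",                         # args.placement in the script
    pin_memory=True,
)
```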
......@@ -20,11 +20,11 @@ from tqdm.auto import tqdm
from transformers import AutoTokenizer, PretrainedConfig
import colossalai
from colossalai.accelerator import get_accelerator
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn.optimizer import HybridAdam
from colossalai.utils import get_current_device
disable_existing_loggers()
logger = get_dist_logger()
......@@ -386,7 +386,7 @@ def main(args):
cur_class_images = len(list(class_images_dir.iterdir()))
if cur_class_images < args.num_class_images:
torch_dtype = torch.float16 if get_current_device() == "cuda" else torch.float32
torch_dtype = torch.float16 if get_accelerator().get_current_device() == "cuda" else torch.float32
pipeline = DiffusionPipeline.from_pretrained(
args.pretrained_model_name_or_path,
torch_dtype=torch_dtype,
......@@ -401,7 +401,7 @@ def main(args):
sample_dataset = PromptDataset(args.class_prompt, num_new_images)
sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
pipeline.to(get_current_device())
pipeline.to(get_accelerator().get_current_device())
for example in tqdm(
sample_dataloader,
......@@ -578,8 +578,8 @@ def main(args):
# Move text_encode and vae to gpu.
# For mixed precision training we cast the text_encoder and vae weights to half-precision
# as these models are only used for inference, keeping weights in full precision is not required.
vae.to(get_current_device(), dtype=weight_dtype)
text_encoder.to(get_current_device(), dtype=weight_dtype)
vae.to(get_accelerator().get_current_device(), dtype=weight_dtype)
text_encoder.to(get_accelerator().get_current_device(), dtype=weight_dtype)
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
num_update_steps_per_epoch = math.ceil(len(train_dataloader))
......@@ -613,7 +613,7 @@ def main(args):
torch.cuda.reset_peak_memory_stats()
# Move batch to gpu
for key, value in batch.items():
batch[key] = value.to(get_current_device(), non_blocking=True)
batch[key] = value.to(get_accelerator().get_current_device(), non_blocking=True)
# Convert images to latent space
optimizer.zero_grad()
......
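Moving a host batch with `non_blocking=True`, as the DreamBooth loop does, only overlaps the copy with compute when the source tensors live in pinned memory; otherwise the call silently degrades to a synchronous copy. A small sketch, with an illustrative batch:

```python
import torch

from colossalai.accelerator import get_accelerator

device = get_accelerator().get_current_device()
batch = {"pixel_values": torch.randn(4, 3, 512, 512).pin_memory()}  # illustrative

# Async host-to-device copy; pinned source memory is what makes it overlap.
batch = {k: v.to(device, non_blocking=True) for k, v in batch.items()}
```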
......@@ -21,13 +21,13 @@ from tqdm.auto import tqdm
from transformers import AutoTokenizer, PretrainedConfig
import colossalai
from colossalai.accelerator import get_accelerator
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
from colossalai.legacy.context.parallel_mode import ParallelMode
from colossalai.legacy.core import global_context as gpc
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn.optimizer import HybridAdam
from colossalai.utils import get_current_device
disable_existing_loggers()
logger = get_dist_logger()
......@@ -385,7 +385,7 @@ def main(args):
cur_class_images = len(list(class_images_dir.iterdir()))
if cur_class_images < args.num_class_images:
torch_dtype = torch.float16 if get_current_device() == "cuda" else torch.float32
torch_dtype = torch.float16 if get_accelerator().get_current_device() == "cuda" else torch.float32
pipeline = DiffusionPipeline.from_pretrained(
args.pretrained_model_name_or_path,
torch_dtype=torch_dtype,
......@@ -400,7 +400,7 @@ def main(args):
sample_dataset = PromptDataset(args.class_prompt, num_new_images)
sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
pipeline.to(get_current_device())
pipeline.to(get_accelerator().get_current_device())
for example in tqdm(
sample_dataloader,
......@@ -598,8 +598,8 @@ def main(args):
# Move text_encode and vae to gpu.
# For mixed precision training we cast the text_encoder and vae weights to half-precision
# as these models are only used for inference, keeping weights in full precision is not required.
vae.to(get_current_device(), dtype=weight_dtype)
text_encoder.to(get_current_device(), dtype=weight_dtype)
vae.to(get_accelerator().get_current_device(), dtype=weight_dtype)
text_encoder.to(get_accelerator().get_current_device(), dtype=weight_dtype)
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
num_update_steps_per_epoch = math.ceil(len(train_dataloader))
......@@ -633,7 +633,7 @@ def main(args):
torch.cuda.reset_peak_memory_stats()
# Move batch to gpu
for key, value in batch.items():
batch[key] = value.to(get_current_device(), non_blocking=True)
batch[key] = value.to(get_accelerator().get_current_device(), non_blocking=True)
# Convert images to latent space
optimizer.zero_grad()
......
......@@ -13,12 +13,12 @@ from torch.utils.data import DataLoader
from tqdm import tqdm
import colossalai
from colossalai.accelerator import get_accelerator
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
from colossalai.booster.plugin.dp_plugin_base import DPPluginBase
from colossalai.cluster import DistCoordinator
from colossalai.nn.optimizer import HybridAdam
from colossalai.utils import get_current_device
# ==============================
# Prepare Hyperparameters
......@@ -53,8 +53,8 @@ def build_dataloader(batch_size: int, coordinator: DistCoordinator, plugin: DPPl
@torch.no_grad()
def evaluate(model: nn.Module, test_dataloader: DataLoader, coordinator: DistCoordinator) -> float:
model.eval()
correct = torch.zeros(1, dtype=torch.int64, device=get_current_device())
total = torch.zeros(1, dtype=torch.int64, device=get_current_device())
correct = torch.zeros(1, dtype=torch.int64, device=get_accelerator().get_current_device())
total = torch.zeros(1, dtype=torch.int64, device=get_accelerator().get_current_device())
for images, labels in test_dataloader:
images = images.cuda()
labels = labels.cuda()
......
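In the CIFAR evaluation, `correct` and `total` are kept on the accelerator so the counts can be reduced across data-parallel ranks before accuracy is computed. A sketch of the usual completion of that pattern; the per-batch accumulation line is paraphrased:

```python
import torch
import torch.distributed as dist

from colossalai.accelerator import get_accelerator

device = get_accelerator().get_current_device()
correct = torch.zeros(1, dtype=torch.int64, device=device)
total = torch.zeros(1, dtype=torch.int64, device=device)

# per batch: correct += (preds == labels).sum(); total += labels.numel()

dist.all_reduce(correct)   # sum hits over all ranks
dist.all_reduce(total)     # sum sample counts over all ranks
accuracy = correct.item() / total.item()
```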
......@@ -33,9 +33,10 @@ def get_data_batch(batch_size, num_labels, num_channels=3, height=224, width=224
def colo_memory_cap(size_in_GB):
from colossalai.utils import colo_device_memory_capacity, colo_set_process_memory_fraction, get_current_device
from colossalai.accelerator import get_accelerator
from colossalai.utils import colo_device_memory_capacity, colo_set_process_memory_fraction
cuda_capacity = colo_device_memory_capacity(get_current_device())
cuda_capacity = colo_device_memory_capacity(get_accelerator().get_current_device())
if size_in_GB * (1024**3) < cuda_capacity:
colo_set_process_memory_fraction(size_in_GB * (1024**3) / cuda_capacity)
print(f"Limiting GPU memory usage to {size_in_GB} GB")
......
......@@ -6,10 +6,9 @@ import torch.distributed as dist
import transformers
import colossalai
import colossalai.utils.device as device_utils
from colossalai.accelerator import get_accelerator
from colossalai.inference import InferenceEngine
from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
from colossalai.utils.device import get_current_device
GIGABYTE = 1024**3
MEGABYTE = 1024 * 1024
......@@ -52,7 +51,7 @@ CONFIG_MAP = {
def data_gen(batch_size: int = 4, seq_len: int = 512):
input_ids = torch.randint(10, 30000, (batch_size, seq_len), device=get_current_device())
input_ids = torch.randint(10, 30000, (batch_size, seq_len), device=get_accelerator().get_current_device())
attention_mask = torch.ones_like(input_ids)
data = dict(input_ids=input_ids, attention_mask=attention_mask)
return data
......@@ -97,9 +96,9 @@ def print_details_info(outputs, model_config, args, whole_end2end):
msg += f"Flops: {num_parameters * num_bytes / whole_avg_latency / 1e12:.2f} TFLOPS\n"
if torch.cuda.is_available():
msg += f"-------Memory Summary Device:{device_utils.current_device()}-------\n"
msg += f"Max memory allocated: {device_utils.max_memory_allocated() / GIGABYTE:.2f} GB\n"
msg += f"Max memory reserved: {device_utils.max_memory_reserved() / GIGABYTE:.2f} GB\n"
msg += f"-------Memory Summary Device:{get_accelerator().current_device()}-------\n"
msg += f"Max memory allocated: {get_accelerator().max_memory_allocated() / GIGABYTE:.2f} GB\n"
msg += f"Max memory reserved: {get_accelerator().max_memory_reserved() / GIGABYTE:.2f} GB\n"
print(msg)
......
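The benchmark's memory summary now reads every counter off the accelerator handle; the three calls below are exactly the ones the diff switches over. Stated as a standalone helper for clarity:

```python
from colossalai.accelerator import get_accelerator

GIGABYTE = 1024**3  # as defined in the benchmark script


def memory_summary() -> str:
    # Same counters the benchmark prints, via the accelerator abstraction.
    acc = get_accelerator()
    return (
        f"-------Memory Summary Device:{acc.current_device()}-------\n"
        f"Max memory allocated: {acc.max_memory_allocated() / GIGABYTE:.2f} GB\n"
        f"Max memory reserved: {acc.max_memory_reserved() / GIGABYTE:.2f} GB\n"
    )
```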
......@@ -5,9 +5,9 @@ import torch.distributed as dist
from transformers import LlamaForCausalLM, LlamaTokenizer
import colossalai
from colossalai.accelerator import get_accelerator
from colossalai.inference import InferenceEngine
from colossalai.testing import spawn
from colossalai.utils.device import get_current_device
INPUT_TEXTS = [
"What is the longest river in the world?",
......@@ -57,7 +57,7 @@ def run_inference(args):
)
inputs = tokenizer(INPUT_TEXTS, return_tensors="pt", padding="longest", max_length=max_input_len, truncation=True)
inputs = {k: v.to(get_current_device()) for k, v in inputs.items()}
inputs = {k: v.to(get_accelerator().get_current_device()) for k, v in inputs.items()}
outputs = engine.generate(inputs)
if rank == 0:
......
......@@ -18,11 +18,11 @@ from transformers import (
)
import colossalai
from colossalai.accelerator import get_accelerator
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin, TorchDDPPlugin
from colossalai.cluster import DistCoordinator
from colossalai.nn.optimizer import HybridAdam
from colossalai.utils import get_current_device
# ==============================
# Prepare Hyperparameters
......@@ -59,7 +59,7 @@ def evaluate_model(
use_pipeline = isinstance(booster.plugin, HybridParallelPlugin) and booster.plugin.pp_size > 1
is_pp_last_device = use_pipeline and booster.plugin.stage_manager.is_last_stage(ignore_chunk=True)
accum_loss = torch.zeros(1, device=get_current_device())
accum_loss = torch.zeros(1, device=get_accelerator().get_current_device())
for batch in dataloader:
batch = move_to_cuda(batch)
labels = batch["labels"]
......@@ -89,8 +89,10 @@ def evaluate_model(
object_list = [None, None]
dist.broadcast_object_list(object_list, src=current_pp_group_ranks[-1], group=pp_group)
metric.add_batch(predictions=object_list[0].to(get_current_device()), references=labels)
accum_loss.add_(object_list[1].to(get_current_device()))
metric.add_batch(
predictions=object_list[0].to(get_accelerator().get_current_device()), references=labels
)
accum_loss.add_(object_list[1].to(get_accelerator().get_current_device()))
else:
batch = move_to_cuda(batch)
......
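The GLUE evaluation handles pipeline parallelism by computing predictions and loss on the last stage and broadcasting both objects to the rest of the pipeline group, after which they are moved onto the accelerator for metric accumulation. In outline, on the receiving side (a sketch of the control flow around the lines shown):

```python
import torch.distributed as dist

from colossalai.accelerator import get_accelerator


def receive_last_stage_outputs(pp_group, last_stage_rank: int):
    # Non-last stages receive [predictions, loss] from the last pipeline stage.
    object_list = [None, None]
    dist.broadcast_object_list(object_list, src=last_stage_rank, group=pp_group)
    device = get_accelerator().get_current_device()
    return object_list[0].to(device), object_list[1].to(device)
```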