Unverified commit d202cc28, authored by Hongxin Liu and committed by GitHub

[npu] change device to accelerator api (#5239)



* update accelerator

* fix timer

* fix amp

* update

* fix

* update bug

* add error raise

* fix autocast

* fix set device

* remove doc accelerator

* update doc

* update doc

* update doc

* use nullcontext

* update cpu

* update null context

* change time limit for example

* update

* update

* update

* update

* [npu] polish accelerator code

---------
Co-authored-by: Xuanlei Zhao <xuanlei.zhao@gmail.com>
Co-authored-by: zxl <43881818+oahzxl@users.noreply.github.com>
parent dd2c28a3
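
Note: a minimal before/after sketch of the migration this commit applies across the examples. It only uses accelerator calls that actually appear in the diff below (get_current_device, current_device, synchronize, max_memory_allocated, max_memory_reserved); the exact device object returned depends on the build (CUDA or NPU), so treat the comments as illustrative rather than a full description of the accelerator API.

    # Before: CUDA-specific helpers scattered across colossalai.utils / colossalai.utils.device
    #   from colossalai.utils import get_current_device
    #   import colossalai.utils.device as device_utils
    #   device = get_current_device()
    #   device_utils.synchronize()

    # After: one device-agnostic entry point
    from colossalai.accelerator import get_accelerator

    accelerator = get_accelerator()
    device = accelerator.get_current_device()         # device of the current CUDA or NPU rank
    accelerator.synchronize()                          # wait for queued kernels on that device
    peak_alloc_gb = accelerator.max_memory_allocated() / 1024**3
    peak_reserved_gb = accelerator.max_memory_reserved() / 1024**3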
@@ -33,9 +33,10 @@ def get_data_batch(batch_size, num_labels, num_channels=3, height=224, width=224
 def colo_memory_cap(size_in_GB):
-    from colossalai.utils import colo_device_memory_capacity, colo_set_process_memory_fraction, get_current_device
+    from colossalai.accelerator import get_accelerator
+    from colossalai.utils import colo_device_memory_capacity, colo_set_process_memory_fraction
-    cuda_capacity = colo_device_memory_capacity(get_current_device())
+    cuda_capacity = colo_device_memory_capacity(get_accelerator().get_current_device())
     if size_in_GB * (1024**3) < cuda_capacity:
         colo_set_process_memory_fraction(size_in_GB * (1024**3) / cuda_capacity)
         print(f"Limiting GPU memory usage to {size_in_GB} GB")
...
@@ -6,10 +6,9 @@ import torch.distributed as dist
 import transformers
 import colossalai
-import colossalai.utils.device as device_utils
+from colossalai.accelerator import get_accelerator
 from colossalai.inference import InferenceEngine
 from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
-from colossalai.utils.device import get_current_device
 GIGABYTE = 1024**3
 MEGABYTE = 1024 * 1024
@@ -52,7 +51,7 @@ CONFIG_MAP = {
 def data_gen(batch_size: int = 4, seq_len: int = 512):
-    input_ids = torch.randint(10, 30000, (batch_size, seq_len), device=get_current_device())
+    input_ids = torch.randint(10, 30000, (batch_size, seq_len), device=get_accelerator().get_current_device())
     attention_mask = torch.ones_like(input_ids)
     data = dict(input_ids=input_ids, attention_mask=attention_mask)
     return data
@@ -97,9 +96,9 @@ def print_details_info(outputs, model_config, args, whole_end2end):
     msg += f"Flops: {num_parameters * num_bytes / whole_avg_latency / 1e12:.2f} TFLOPS\n"
     if torch.cuda.is_available():
-        msg += f"-------Memory Summary Device:{device_utils.current_device()}-------\n"
-        msg += f"Max memory allocated: {device_utils.max_memory_allocated() / GIGABYTE:.2f} GB\n"
-        msg += f"Max memory reserved: {device_utils.max_memory_reserved() / GIGABYTE:.2f} GB\n"
+        msg += f"-------Memory Summary Device:{get_accelerator().current_device()}-------\n"
+        msg += f"Max memory allocated: {get_accelerator().max_memory_allocated() / GIGABYTE:.2f} GB\n"
+        msg += f"Max memory reserved: {get_accelerator().max_memory_reserved() / GIGABYTE:.2f} GB\n"
     print(msg)
...
@@ -5,9 +5,9 @@ import torch.distributed as dist
 from transformers import LlamaForCausalLM, LlamaTokenizer
 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.inference import InferenceEngine
 from colossalai.testing import spawn
-from colossalai.utils.device import get_current_device
 INPUT_TEXTS = [
     "What is the longest river in the world?",
@@ -57,7 +57,7 @@ def run_inference(args):
     )
     inputs = tokenizer(INPUT_TEXTS, return_tensors="pt", padding="longest", max_length=max_input_len, truncation=True)
-    inputs = {k: v.to(get_current_device()) for k, v in inputs.items()}
+    inputs = {k: v.to(get_accelerator().get_current_device()) for k, v in inputs.items()}
     outputs = engine.generate(inputs)
     if rank == 0:
...
@@ -18,11 +18,11 @@ from transformers import (
 )
 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device
 # ==============================
 # Prepare Hyperparameters
@@ -59,7 +59,7 @@ def evaluate_model(
     use_pipeline = isinstance(booster.plugin, HybridParallelPlugin) and booster.plugin.pp_size > 1
     is_pp_last_stage = use_pipeline and booster.plugin.stage_manager.is_last_stage()
-    accum_loss = torch.zeros(1, device=get_current_device())
+    accum_loss = torch.zeros(1, device=get_accelerator().get_current_device())
     for batch in dataloader:
         batch = move_to_cuda(batch)
         labels = batch["labels"]
@@ -88,8 +88,10 @@ def evaluate_model(
             object_list = [None, None]
             dist.broadcast_object_list(object_list, src=current_pp_group_ranks[-1], group=pp_group)
-            metric.add_batch(predictions=object_list[0].to(get_current_device()), references=labels)
-            accum_loss.add_(object_list[1].to(get_current_device()))
+            metric.add_batch(
+                predictions=object_list[0].to(get_accelerator().get_current_device()), references=labels
+            )
+            accum_loss.add_(object_list[1].to(get_accelerator().get_current_device()))
         else:
             batch = move_to_cuda(batch)
...
@@ -7,13 +7,13 @@ from model_zoo import GPTLMLoss, get_gpt2_components
 from torch.utils._pytree import tree_map
 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.auto_parallel.offload.amp_optimizer import AMPOptimizer
 from colossalai.auto_parallel.offload.mem_optimize import memory_optimize
 from colossalai.auto_parallel.offload.solver import NOT_NVML
 from colossalai.fx.profiler import parameter_size
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.testing import spawn
-from colossalai.utils import get_current_device
 def parse_args():
@@ -41,7 +41,7 @@ def train_gpt(args):
             64,
             8,
         ),
-        device=get_current_device(),
+        device=get_accelerator().get_current_device(),
     )
     criterion = GPTLMLoss()
...
@@ -12,12 +12,12 @@ from commons.utils import get_data, get_profile_context, get_tflops, get_time_st
 from packaging import version
 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.lazy import LazyInitContext
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device
 CAI_VERSION = colossalai.__version__
@@ -141,7 +141,11 @@ def main():
     criterion = GPTLMLoss()
     torch.manual_seed(123)
     if args.distplan.startswith("CAI"):
-        ctx = LazyInitContext(default_device=get_current_device()) if args.distplan == "CAI_Gemini" else nullcontext()
+        ctx = (
+            LazyInitContext(default_device=get_accelerator().get_current_device())
+            if args.distplan == "CAI_Gemini"
+            else nullcontext()
+        )
         # build GPT model
         with ctx:
             model = model_builder(args.model_type)(checkpoint=True)
...
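
The reformatted ctx expression above shows a pattern this commit repeats in several examples: lazy (deferred) initialization is only used when the Gemini plan is selected, and everything else falls back to contextlib.nullcontext. Below is a hedged sketch of that selection logic; init_context and build_model are placeholders, not names from the examples.

    from contextlib import nullcontext

    from colossalai.accelerator import get_accelerator
    from colossalai.lazy import LazyInitContext

    def init_context(distplan: str):
        # Gemini benefits from materializing parameters lazily on the accelerator device;
        # other plans build the model eagerly, so an empty context manager is enough.
        if distplan == "CAI_Gemini":
            return LazyInitContext(default_device=get_accelerator().get_current_device())
        return nullcontext()

    # usage:
    # with init_context(args.distplan):
    #     model = build_model()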
@@ -13,11 +13,11 @@ from tqdm import tqdm
 from transformers import AutoConfig, GPT2ForSequenceClassification, get_linear_schedule_with_warmup
 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device
 # ==============================
 # Prepare Hyperparameters
@@ -54,7 +54,7 @@ def evaluate_model(
     use_pipeline = isinstance(booster.plugin, HybridParallelPlugin) and booster.plugin.pp_size > 1
     is_pp_last_stage = use_pipeline and booster.plugin.stage_manager.is_last_stage()
-    accum_loss = torch.zeros(1, device=get_current_device())
+    accum_loss = torch.zeros(1, device=get_accelerator().get_current_device())
     for batch in dataloader:
         batch = move_to_cuda(batch)
         labels = batch["labels"]
@@ -83,8 +83,10 @@ def evaluate_model(
             object_list = [None, None]
             dist.broadcast_object_list(object_list, src=current_pp_group_ranks[-1], group=pp_group)
-            metric.add_batch(predictions=object_list[0].to(get_current_device()), references=labels)
-            accum_loss.add_(object_list[1].to(get_current_device()))
+            metric.add_batch(
+                predictions=object_list[0].to(get_accelerator().get_current_device()), references=labels
+            )
+            accum_loss.add_(object_list[1].to(get_accelerator().get_current_device()))
         else:
             batch = move_to_cuda(batch)
...
@@ -5,6 +5,7 @@ from torch import nn as nn
 from torch.nn import functional as F
 from torch.nn.parameter import Parameter
+from colossalai.accelerator import get_accelerator
 from colossalai.legacy.context import ParallelMode, seed
 from colossalai.legacy.core import global_context as gpc
 from colossalai.legacy.nn.layer.base_layer import ParallelLayer
@@ -12,7 +13,6 @@ from colossalai.legacy.nn.layer.parallel_1d._utils import gather_forward_split_b
 from colossalai.legacy.nn.layer.parallel_1d.layers import Linear1D_Row
 from colossalai.legacy.nn.layer.utils import divide
 from colossalai.legacy.registry import LAYERS, LOSSES
-from colossalai.utils import get_current_device
 class VocabParallelEmbedding(torch.nn.Module):
@@ -96,7 +96,9 @@ class VocabParallelEmbedding(torch.nn.Module):
         if position_ids is not None:
             position_ids = position_ids.view(-1, input_shape[-1])
         if position_ids is None:
-            position_ids = torch.arange(0, input_shape[-1] + 0, dtype=torch.long, device=get_current_device())
+            position_ids = torch.arange(
+                0, input_shape[-1] + 0, dtype=torch.long, device=get_accelerator().get_current_device()
+            )
             position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
         position_embeddings = self.position_embeddings(position_ids)
@@ -194,7 +196,7 @@ class VocabParallelEmbedding1D(torch.nn.Module):
         self.num_embeddings_per_partition = self.vocab_end_index - self.vocab_start_index
         # Allocate weights and initialize.
-        factory_kwargs = {"device": get_current_device(), "dtype": dtype}
+        factory_kwargs = {"device": get_accelerator().get_current_device(), "dtype": dtype}
         self.weight = Parameter(torch.empty(self.num_embeddings_per_partition, self.embedding_dim, **factory_kwargs))
         init.uniform_(self.weight, -1, 1)
@@ -439,7 +441,9 @@ class HiddenParallelEmbedding(torch.nn.Module):
         if position_ids is not None:
             position_ids = position_ids.view(-1, input_shape[-1])
         if position_ids is None:
-            position_ids = torch.arange(0, input_shape[-1] + 0, dtype=torch.long, device=get_current_device())
+            position_ids = torch.arange(
+                0, input_shape[-1] + 0, dtype=torch.long, device=get_accelerator().get_current_device()
+            )
             position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
         position_embeddings = self.position_embeddings(position_ids)
@@ -532,7 +536,7 @@ class HiddenParallelEmbedding1D(torch.nn.Module):
         self._weight = None
         # Allocate weights and initialize.
-        factory_kwargs = {"device": get_current_device(), "dtype": dtype}
+        factory_kwargs = {"device": get_accelerator().get_current_device(), "dtype": dtype}
         self.weight = Parameter(torch.empty(num_embeddings, embed_dim_per_partition, **factory_kwargs))
         init.uniform_(self.weight, -1, 1)
...
@@ -13,13 +13,12 @@ from transformers.models.llama.configuration_llama import LlamaConfig
 from transformers.models.llama.modeling_llama import LlamaForCausalLM
 import colossalai
-import colossalai.utils.device as device_utils
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, TorchFSDPPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.lazy import LazyInitContext
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device
 # ==============================
 # Constants
@@ -166,7 +165,7 @@ def main():
     # Initialize Model and Optimizer
     # ==============================
     init_ctx = (
-        LazyInitContext(default_device=get_current_device())
+        LazyInitContext(default_device=get_accelerator().get_current_device())
         if isinstance(plugin, (GeminiPlugin, HybridParallelPlugin))
         else nullcontext()
     )
@@ -197,7 +196,9 @@ def main():
     torch.set_default_dtype(torch.bfloat16)
     model, optimizer, _, dataloader, _ = booster.boost(model, optimizer, dataloader=dataloader)
     torch.set_default_dtype(torch.float)
-    coordinator.print_on_master(f"Booster init max CUDA memory: {device_utils.max_memory_allocated()/1024**2:.2f} MB")
+    coordinator.print_on_master(
+        f"Booster init max CUDA memory: {get_accelerator().max_memory_allocated()/1024**2:.2f} MB"
+    )
     coordinator.print_on_master(
         f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1024:.2f} MB"
     )
@@ -223,7 +224,7 @@ def main():
         performance_evaluator.on_step_end(**batch)
     performance_evaluator.on_fit_end()
-    coordinator.print_on_master(f"Max CUDA memory usage: {device_utils.max_memory_allocated()/1024**2:.2f} MB")
+    coordinator.print_on_master(f"Max CUDA memory usage: {get_accelerator().max_memory_allocated()/1024**2:.2f} MB")
 if __name__ == "__main__":
...
@@ -8,7 +8,7 @@ from torch.distributed import ProcessGroup
 from torch.distributed.distributed_c10d import _get_default_group
 from torch.utils.data import DataLoader, Dataset, DistributedSampler
-from colossalai.utils import get_current_device
+from colossalai.accelerator import get_accelerator
 class StatefulDistributedSampler(DistributedSampler):
@@ -108,7 +108,9 @@ class RandomDataset(Dataset):
     def __init__(self, num_samples: int = 1000, max_length: int = 2048, vocab_size: int = 32000):
         self.num_samples = num_samples
         self.max_length = max_length
-        self.input_ids = torch.randint(0, vocab_size, (num_samples, max_length), device=get_current_device())
+        self.input_ids = torch.randint(
+            0, vocab_size, (num_samples, max_length), device=get_accelerator().get_current_device()
+        )
         self.attention_mask = torch.ones_like(self.input_ids)
     def __len__(self):
...
@@ -21,13 +21,13 @@ from transformers.models.llama.modeling_llama import LlamaForCausalLM
 from transformers.models.llama.tokenization_llama import LlamaTokenizer
 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.lazy import LazyInitContext
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device
 def get_model_numel(model: nn.Module) -> int:
@@ -191,7 +191,9 @@ def main():
     config = LlamaConfig.from_pretrained(args.model_path)
     # use lazy init when using GeminiPlugin
     init_ctx = (
-        LazyInitContext(default_device=get_current_device()) if isinstance(plugin, GeminiPlugin) else nullcontext()
+        LazyInitContext(default_device=get_accelerator().get_current_device())
+        if isinstance(plugin, GeminiPlugin)
+        else nullcontext()
     )
     with init_ctx:
...
@@ -5,9 +5,8 @@ import torch
 import torch.distributed as dist
 from torch import Tensor
-import colossalai.utils.device as device_utils
+from colossalai.accelerator import get_accelerator
 from colossalai.cluster import DistCoordinator
-from colossalai.utils.device import get_current_device
 def divide(x: float, y: float) -> float:
@@ -22,7 +21,7 @@ def divide(x: float, y: float) -> float:
 def all_reduce_mean(x: float, world_size: int) -> float:
     if world_size == 1:
         return x
-    tensor = torch.tensor([x], device=get_current_device())
+    tensor = torch.tensor([x], device=get_accelerator().get_current_device())
     dist.all_reduce(tensor)
     tensor = tensor / world_size
     return tensor.item()
@@ -86,13 +85,13 @@ class PerformanceEvaluator:
         self.disable = self.ignore_steps > 0 and step < self.ignore_steps
         if self.disable:
             return
-        device_utils.synchronize()
+        get_accelerator().synchronize()
         self.timer.start()
     def on_step_end(self, input_ids: Tensor, **kwargs) -> None:
         if self.disable:
             return
-        device_utils.synchronize()
+        get_accelerator().synchronize()
         self.timer.end()
         batch_size, seq_len = input_ids.shape
...
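
The two synchronize() calls changed above are what make PerformanceEvaluator's timings meaningful on an asynchronous device: the host-side timer must not start or stop while kernels are still queued. A small sketch of the same pattern outside the evaluator; timed_step and train_step are placeholders, not names from this repository.

    import time

    from colossalai.accelerator import get_accelerator

    def timed_step(train_step, *args, **kwargs):
        # Drain pending device work so the timer brackets only this step.
        get_accelerator().synchronize()
        start = time.time()
        out = train_step(*args, **kwargs)
        get_accelerator().synchronize()
        return out, time.time() - start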
@@ -20,13 +20,13 @@ from transformers.models.llama.modeling_llama import LlamaForCausalLM
 from transformers.models.llama.tokenization_llama import LlamaTokenizer
 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.lazy import LazyInitContext
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device
 MODEL_CONFIGS = {
     "7b": LlamaConfig(max_position_embeddings=4096),
@@ -227,7 +227,9 @@ def main():
     config = MODEL_CONFIGS[args.config]
     # use lazy init when using GeminiPlugin
     init_ctx = (
-        LazyInitContext(default_device=get_current_device()) if isinstance(plugin, GeminiPlugin) else nullcontext()
+        LazyInitContext(default_device=get_accelerator().get_current_device())
+        if isinstance(plugin, GeminiPlugin)
+        else nullcontext()
     )
     with init_ctx:
...
@@ -14,6 +14,7 @@ from transformers.models.llama import LlamaConfig
 from utils import PerformanceEvaluator, get_model_numel
 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
 from colossalai.cluster import DistCoordinator
@@ -21,7 +22,6 @@ from colossalai.moe.layers import apply_load_balance
 from colossalai.moe.manager import MOE_MANAGER
 from colossalai.moe.utils import skip_init
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device
 def move_to_cuda(batch, device):
@@ -64,13 +64,15 @@ class RandomDataset(Dataset):
                 )
                 self.input_ids.append(encode["input_ids"])
                 self.attention_mask.append(encode["attention_mask"])
-            self.input_ids = torch.cat(self.input_ids, dim=0).to(get_current_device())
-            self.attention_mask = torch.cat(self.attention_mask, dim=0).to(get_current_device())
+            self.input_ids = torch.cat(self.input_ids, dim=0).to(get_accelerator().get_current_device())
+            self.attention_mask = torch.cat(self.attention_mask, dim=0).to(get_accelerator().get_current_device())
             repeat_times = num_samples // self.input_ids.shape[0] + 1
             self.input_ids = self.input_ids.repeat(repeat_times, 1)[:num_samples]
             self.attention_mask = self.attention_mask.repeat(repeat_times, 1)[:num_samples]
         else:
-            self.input_ids = torch.randint(0, vocab_size, (num_samples, max_length), device=get_current_device())
+            self.input_ids = torch.randint(
+                0, vocab_size, (num_samples, max_length), device=get_accelerator().get_current_device()
+            )
             self.attention_mask = torch.ones_like(self.input_ids)
     def __len__(self):
...
@@ -15,6 +15,7 @@ from transformers import T5Tokenizer
 from transformers.models.llama import LlamaConfig
 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
 from colossalai.cluster import DistCoordinator
@@ -22,7 +23,6 @@ from colossalai.moe.layers import apply_load_balance
 from colossalai.moe.manager import MOE_MANAGER
 from colossalai.moe.utils import skip_init
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device
 def move_to_cuda(batch, device):
@@ -61,7 +61,9 @@ class RandomDataset(Dataset):
     def __init__(self, num_samples: int = 1000, max_length: int = 2048, vocab_size: int = 32000, tokenizer=None):
         self.num_samples = num_samples
         self.max_length = max_length
-        self.input_ids = torch.randint(0, vocab_size, (num_samples, max_length), device=get_current_device())
+        self.input_ids = torch.randint(
+            0, vocab_size, (num_samples, max_length), device=get_accelerator().get_current_device()
+        )
         self.attention_mask = torch.ones_like(self.input_ids)
     def __len__(self):
...
@@ -14,12 +14,12 @@ from palm_pytorch.autoregressive_wrapper import AutoregressiveWrapper
 from torch.utils.data import DataLoader, Dataset
 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.lazy import LazyInitContext
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.nn import HybridAdam
-from colossalai.utils import get_current_device
 # constants
@@ -159,7 +159,11 @@ if args.distplan == "colossalai":
     logger.info(f"plugin: {plugin}")
     booster = Booster(plugin=plugin, **booster_kwargs)
-    ctx = LazyInitContext(default_device=get_current_device()) if args.plugin == "gemini" else nullcontext()
+    ctx = (
+        LazyInitContext(default_device=get_accelerator().get_current_device())
+        if args.plugin == "gemini"
+        else nullcontext()
+    )
     with ctx:
         model = PaLM(num_tokens=50304, dim=4096, depth=64)
...
@@ -13,12 +13,12 @@ from torch.utils.data import DataLoader
 from tqdm import tqdm
 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.booster.plugin.dp_plugin_base import DPPluginBase
 from colossalai.cluster import DistCoordinator
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device
 # ==============================
 # Prepare Hyperparameters
@@ -53,8 +53,8 @@ def build_dataloader(batch_size: int, coordinator: DistCoordinator, plugin: DPPl
 @torch.no_grad()
 def evaluate(model: nn.Module, test_dataloader: DataLoader, coordinator: DistCoordinator) -> float:
     model.eval()
-    correct = torch.zeros(1, dtype=torch.int64, device=get_current_device())
-    total = torch.zeros(1, dtype=torch.int64, device=get_current_device())
+    correct = torch.zeros(1, dtype=torch.int64, device=get_accelerator().get_current_device())
+    total = torch.zeros(1, dtype=torch.int64, device=get_accelerator().get_current_device())
     for images, labels in test_dataloader:
         images = images.cuda()
         labels = labels.cuda()
...
@@ -13,13 +13,13 @@ from torch.utils.data import DataLoader
 from tqdm import tqdm
 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.booster.plugin.dp_plugin_base import DPPluginBase
 from colossalai.cluster import DistCoordinator
 from colossalai.nn.lr_scheduler import LinearWarmupLR
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device
 # ==============================
 # Prepare Hyperparameters
@@ -73,8 +73,8 @@ def build_dataloader(batch_size: int, coordinator: DistCoordinator, plugin: DPPl
 @torch.no_grad()
 def evaluate(model: nn.Module, test_dataloader: DataLoader, coordinator: DistCoordinator) -> float:
     model.eval()
-    correct = torch.zeros(1, dtype=torch.int64, device=get_current_device())
-    total = torch.zeros(1, dtype=torch.int64, device=get_current_device())
+    correct = torch.zeros(1, dtype=torch.int64, device=get_accelerator().get_current_device())
+    total = torch.zeros(1, dtype=torch.int64, device=get_accelerator().get_current_device())
     for images, labels in test_dataloader:
         images = images.cuda()
         labels = labels.cuda()
...
@@ -12,11 +12,11 @@ from tqdm import tqdm
 from transformers import AutoConfig, BertForSequenceClassification, get_linear_schedule_with_warmup
 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device
 # ==============================
 # Prepare Hyperparameters
@@ -45,7 +45,7 @@ def evaluate(
     model.eval()
     def evaluate_subset(dataloader: DataLoader):
-        accum_loss = torch.zeros(1, device=get_current_device())
+        accum_loss = torch.zeros(1, device=get_accelerator().get_current_device())
         for batch in dataloader:
             batch = move_to_cuda(batch)
             outputs = model(**batch)
...
@@ -51,13 +51,13 @@ from transformers import (
 from transformers.utils.versions import require_version
 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.legacy.context import ParallelMode
 from colossalai.legacy.core import global_context as gpc
 from colossalai.legacy.tensor import ProcessGroup
 from colossalai.legacy.utils import get_dataloader
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device
 from colossalai.zero import GeminiOptimizer
 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
@@ -249,9 +249,9 @@ def parse_args():
 def colo_memory_cap(size_in_GB):
-    from colossalai.utils import colo_device_memory_capacity, colo_set_process_memory_fraction, get_current_device
+    from colossalai.utils import colo_device_memory_capacity, colo_set_process_memory_fraction
-    cuda_capacity = colo_device_memory_capacity(get_current_device())
+    cuda_capacity = colo_device_memory_capacity(get_accelerator().get_current_device())
     if size_in_GB * (1024**3) < cuda_capacity:
         colo_set_process_memory_fraction(size_in_GB * (1024**3) / cuda_capacity)
         print("Using {} GB of GPU memory".format(size_in_GB))
@@ -265,7 +265,9 @@ class DummyDataloader:
         self.vocab_size = vocab_size
     def generate(self):
-        input_ids = torch.randint(0, self.vocab_size, (self.batch_size, self.seq_len), device=get_current_device())
+        input_ids = torch.randint(
+            0, self.vocab_size, (self.batch_size, self.seq_len), device=get_accelerator().get_current_device()
+        )
         attention_mask = torch.ones_like(input_ids)
         return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": input_ids}
@@ -390,7 +392,7 @@ def main():
     if args.init_in_cpu:
         init_dev = torch.device("cpu")
     else:
-        init_dev = get_current_device()
+        init_dev = get_accelerator().get_current_device()
     cai_version = colossalai.__version__
     logger.info(f"using Colossal-AI version {cai_version}")
@@ -439,7 +441,9 @@ def main():
         except ImportError:
            # this works for unreleased main branch, and this may be released on 0.2.9
            from colossalai.zero import GeminiDDP
-        model = GeminiDDP(model, device=get_current_device(), placement_policy=PLACEMENT_POLICY, pin_memory=True)
+        model = GeminiDDP(
+            model, device=get_accelerator().get_current_device(), placement_policy=PLACEMENT_POLICY, pin_memory=True
+        )
     elif version.parse(cai_version) <= version.parse("0.1.10") and version.parse(cai_version) >= version.parse("0.1.9"):
         from colossalai.gemini import ChunkManager, GeminiManager
...