Unverified commit 8823cc48 authored by Frank Lee, committed by GitHub

Merge pull request #5310 from hpcaitech/feature/npu

Feature/npu
parents bce9499e 73f4dc57
......@@ -140,7 +140,7 @@ jobs:
- name: Install Colossal-AI
run: |
CUDA_EXT=1 pip install -v -e .
BUILD_EXT=1 pip install -v -e .
pip install -r requirements/requirements-test.txt
- name: Store Colossal-AI Cache
......
......@@ -12,7 +12,7 @@ jobs:
if: github.repository == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
options: --gpus all --rm -v /dev/shm -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
timeout-minutes: 90
steps:
......@@ -55,7 +55,7 @@ jobs:
if: steps.check-avai.outputs.avai == 'true'
run: |
[ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ] && cp -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
CUDA_EXT=1 pip install -v -e .
BUILD_EXT=1 pip install -v -e .
cp -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
pip install -r requirements/requirements-test.txt
......
......@@ -45,9 +45,9 @@ jobs:
fail-fast: false
matrix: ${{fromJson(needs.manual_check_matrix_preparation.outputs.matrix)}}
container:
image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
options: --gpus all --rm -v /data/scratch/examples-data:/data/
timeout-minutes: 10
timeout-minutes: 15
steps:
- name: 📚 Checkout
uses: actions/checkout@v3
......
......@@ -77,7 +77,7 @@ jobs:
fail-fast: false
matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}}
container:
image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
options: --gpus all --rm -v /data/scratch/examples-data:/data/
timeout-minutes: 20
concurrency:
......
......@@ -34,7 +34,7 @@ jobs:
fail-fast: false
matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
container:
image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
timeout-minutes: 10
steps:
- name: 📚 Checkout
......
......@@ -18,7 +18,7 @@ jobs:
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
options: --gpus all --rm -v /data/scratch/github_actions/chat:/data/scratch/github_actions/chat --shm-size=10.24gb
timeout-minutes: 30
defaults:
......
......@@ -20,7 +20,7 @@ jobs:
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
options: --gpus all --rm -v /data/scratch/chatgpt:/data/scratch/chatgpt
timeout-minutes: 30
defaults:
......
......@@ -19,7 +19,7 @@ jobs:
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
volumes:
- /data/scratch/test_data_colossalqa:/data/scratch/test_data_colossalqa
- /data/scratch/llama-tiny:/data/scratch/llama-tiny
......
include *.txt README.md
recursive-include requirements *.txt
recursive-include colossalai *.cpp *.h *.cu *.tr *.cuh *.cc *.pyi
recursive-include op_builder *.py
recursive-include extensions *.py *.cpp *.h *.cu *.tr *.cuh *.cc *.pyi
......@@ -10,7 +10,7 @@ from torch.utils.data import DataLoader, DistributedSampler
from tqdm import tqdm
from transformers import PreTrainedTokenizerBase
from colossalai.utils import get_current_device
from colossalai.accelerator import get_accelerator
from .base import OnPolicyTrainer
from .callbacks import Callback
......@@ -105,7 +105,7 @@ class PPOTrainer(OnPolicyTrainer):
self.critic_optim = critic_optim
self.offload_inference_models = offload_inference_models
self.device = get_current_device()
self.device = get_accelerator().get_current_device()
def _before_fit(
self,
......
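A minimal sketch of the device lookup this hunk migrates to, assuming colossalai 0.3.4 or newer where `colossalai.accelerator` exists:

```python
import torch

from colossalai.accelerator import get_accelerator

# get_accelerator() lazily auto-detects the backend (cuda -> npu -> cpu)
device = get_accelerator().get_current_device()  # e.g. cuda:0, npu:0, or cpu
x = torch.ones(2).to(device)
```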
......@@ -6,7 +6,6 @@ import torch.nn as nn
import colossalai
from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin
from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroModel
from colossalai.utils import get_current_device
from colossalai.zero.gemini.gemini_ddp import GeminiDDP
from .ddp import DDPStrategy
......@@ -158,9 +157,19 @@ class GeminiStrategy(DDPStrategy):
warnings.warn("Stage 3 only supports fp16. Precision is set to fp16.")
# colossalai changed the get_current_device API in version 0.3.4 and newer
try:
from colossalai.accelerator import get_accelerator
chunk_init_device = get_accelerator().get_current_device()
except ImportError:
from colossalai.utils import get_current_device
chunk_init_device = get_current_device()
# NOTE: dist should be initialized before calling get_current_device()
plugin_initializer = lambda: GeminiPlugin(
chunk_init_device=get_current_device(),
chunk_init_device=chunk_init_device,
placement_policy=placement_policy,
shard_param_frac=shard_param_frac,
offload_optim_frac=offload_optim_frac,
......
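The version probe above could also be factored into a small helper; a sketch under the assumption that only the import location changed between releases (`_get_current_device_compat` is a hypothetical name, not part of this PR):

```python
import torch


def _get_current_device_compat() -> torch.device:
    """Hypothetical helper: resolve the current device across colossalai versions."""
    try:
        from colossalai.accelerator import get_accelerator  # colossalai >= 0.3.4

        return get_accelerator().get_current_device()
    except ImportError:
        from colossalai.utils import get_current_device  # older releases

        return get_current_device()
```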
......@@ -4,41 +4,34 @@
Continual Pre-training of LLaMA-2 developed by Colossal-AI Team
"""
import json
import argparse
import json
import os
import resource
from contextlib import nullcontext
from tqdm import tqdm
import torch
import torch.distributed as dist
from colossal_llama2.dataset.loader import (
DataCollatorForSupervisedDataset,
StatefulDistributedSampler,
load_tokenized_dataset,
setup_distributed_dataloader,
)
from colossal_llama2.utils.ckpt_io import load_checkpoint, save_checkpoint
from colossal_llama2.utils.flash_attention_patch import replace_with_flash_attention
from colossal_llama2.utils.froze import freeze_non_embeds_parameters
from torch.utils.tensorboard import SummaryWriter
from transformers import LlamaTokenizer, LlamaForCausalLM, LlamaConfig
from tqdm import tqdm
from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer
import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import (
GeminiPlugin,
LowLevelZeroPlugin,
HybridParallelPlugin,
)
from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin
from colossalai.cluster import DistCoordinator
from colossalai.lazy import LazyInitContext
from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
from colossalai.nn.optimizer import HybridAdam
from colossalai.utils import get_current_device
from colossal_llama2.dataset.loader import (
load_tokenized_dataset,
setup_distributed_dataloader,
DataCollatorForSupervisedDataset,
StatefulDistributedSampler,
)
from colossal_llama2.utils.flash_attention_patch import replace_with_flash_attention
from colossal_llama2.utils.ckpt_io import load_checkpoint, save_checkpoint
from colossal_llama2.utils.froze import freeze_non_embeds_parameters
def get_model_numel(model: torch.nn.Module) -> int:
......@@ -215,9 +208,18 @@ def main() -> None:
# ======================================================
# Initialize Model, Objective, Optimizer and LR Scheduler
# ======================================================
init_ctx = (
LazyInitContext(default_device=get_current_device()) if isinstance(plugin, (GeminiPlugin,)) else nullcontext()
)
# colossalai changed the get_current_device API in version 0.3.4 and newer
try:
from colossalai.accelerator import get_accelerator
current_device = get_accelerator().get_current_device()
except ImportError:
from colossalai.utils import get_current_device
current_device = get_current_device()
init_ctx = LazyInitContext(default_device=current_device) if isinstance(plugin, (GeminiPlugin,)) else nullcontext()
with init_ctx:
model = LlamaForCausalLM(LlamaConfig.from_pretrained(args.pretrained))
# Freeze part of parameters.
......@@ -320,7 +322,7 @@ def main() -> None:
initial=start_step,
) as pbar:
for step, batch in pbar:
batch = {k: v.to(get_current_device()) for k, v in batch.items() if isinstance(v, torch.Tensor)}
batch = {k: v.to(current_device) for k, v in batch.items() if isinstance(v, torch.Tensor)}
batch_output = model(**batch)
......@@ -372,9 +374,7 @@ def main() -> None:
# Final save.
coordinator.print_on_master("Start saving final model checkpoint")
booster.save_model(model, os.path.join(args.save_dir, "modeling"), shard=True)
coordinator.print_on_master(
f"Saved final model checkpoint at epoch {epoch} at folder {args.save_dir}"
)
coordinator.print_on_master(f"Saved final model checkpoint at epoch {epoch} at folder {args.save_dir}")
coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
......
from .initialize import launch, launch_from_openmpi, launch_from_slurm, launch_from_torch
from . import accelerator
try:
# .version will be created by setup.py
......
# 🚀 Accelerator
## 🔗 Table of Contents
- [🚀 Accelerator](#-accelerator)
- [🔗 Table of Contents](#-table-of-contents)
- [📚 Introduction](#-introduction)
- [📌 Design and Acknowledgement](#-design-and-acknowledgement)
## 📚 Introduction
This module offers a layer of abstraction for ColossalAI. With this module, users can easily switch between different accelerator backends, such as Nvidia GPUs and Huawei NPUs. It is an attempt to make user code portable across different hardware platforms with a simple `auto_set_accelerator()` API.
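For example, a minimal sketch of the intended usage (the backend falls back to CPU when no accelerator is found):

```python
import torch

from colossalai.accelerator import auto_set_accelerator, get_accelerator

auto_set_accelerator()  # probes cuda -> npu -> cpu and keeps the first available backend
acc = get_accelerator()
print(acc.name, acc.communication_backend)  # e.g. "cuda nccl", "npu hccl", or "cpu gloo"
x = torch.ones(4, device=acc.get_current_device())
```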
## 📌 Design and Acknowledgement
Our `accelerator` module is heavily inspired by [`deepspeed/accelerator`](https://www.deepspeed.ai/tutorials/accelerator-abstraction-interface/). We found that it is a very well-designed and well-structured module that can be easily integrated into our project. We would like to thank the DeepSpeed team for their great work.
We implemented this accelerator module from scratch, while introducing our own modifications:
1. We aligned the accelerator API names with PyTorch's native API names (see the sketch below).
2. We did not include the `op builder` in the `accelerator` module. Instead, we restructured our `kernel` module to automatically match an accelerator with its corresponding kernel implementations, keeping the modules less entangled.
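As an illustration of point 1, the device APIs mirror their `torch.cuda` counterparts one-for-one; a sketch assuming a CUDA machine:

```python
from colossalai.accelerator import get_accelerator, set_accelerator

set_accelerator("cuda")  # or "npu" / "cpu", or a BaseAccelerator instance
acc = get_accelerator()
acc.manual_seed(42)  # mirrors torch.cuda.manual_seed(42)
acc.synchronize()  # mirrors torch.cuda.synchronize()
print(acc.memory_allocated())  # mirrors torch.cuda.memory_allocated()
```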
from .api import auto_set_accelerator, get_accelerator, set_accelerator
from .base_accelerator import BaseAccelerator
from .cpu_accelerator import CpuAccelerator
from .cuda_accelerator import CudaAccelerator
from .npu_accelerator import NpuAccelerator
__all__ = [
"get_accelerator",
"set_accelerator",
"auto_set_accelerator",
"BaseAccelerator",
"CudaAccelerator",
"NpuAccelerator",
"CpuAccelerator",
]
#!/usr/bin/env python
from collections import OrderedDict
from typing import Union
from .base_accelerator import BaseAccelerator
from .cpu_accelerator import CpuAccelerator
from .cuda_accelerator import CudaAccelerator
from .npu_accelerator import NpuAccelerator
__all__ = ["set_accelerator", "auto_set_accelerator", "get_accelerator"]
_ACCELERATOR = None
# we use an ordered dictionary so that insertion order encodes the
# device-check priority, i.e. auto_set_accelerator will check cuda first
_ACCELERATOR_MAPPING = OrderedDict(cuda=CudaAccelerator, npu=NpuAccelerator, cpu=CpuAccelerator)
def set_accelerator(accelerator: Union[str, BaseAccelerator]) -> None:
"""
Set the global accelerator for the current process.
Args:
accelerator (Union[str, BaseAccelerator]): the type of accelerator to which the current device belongs.
"""
global _ACCELERATOR
if isinstance(accelerator, str):
_ACCELERATOR = _ACCELERATOR_MAPPING[accelerator]()
elif isinstance(accelerator, BaseAccelerator):
_ACCELERATOR = accelerator
else:
raise TypeError("accelerator must be either a string or an instance of BaseAccelerator")
def auto_set_accelerator() -> None:
"""
Automatically check if any accelerator is available.
If an accelerator is available, set it as the global accelerator.
"""
global _ACCELERATOR
for accelerator_name, accelerator_cls in _ACCELERATOR_MAPPING.items():
try:
accelerator = accelerator_cls()
if accelerator_name == "cpu" or accelerator.is_available():
_ACCELERATOR = accelerator
break
except Exception:
# this backend is unavailable on the current machine; try the next one
pass
if _ACCELERATOR is None:
raise RuntimeError("No accelerator is available.")
def get_accelerator() -> BaseAccelerator:
"""
Return the accelerator for the current process. If the accelerator is not initialized, it will be initialized
to the default accelerator type.
Returns: the accelerator for the current process.
"""
global _ACCELERATOR
if _ACCELERATOR is None:
auto_set_accelerator()
return _ACCELERATOR
#!/usr/bin/env python
from abc import ABC, abstractmethod
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import torch
__all__ = ["BaseAccelerator"]
class BaseAccelerator(ABC):
support_set_device: bool = True
def __init__(self, name: str, communication_backend: str, is_synchronous: bool) -> None:
self._name = name
self._communication_backend = communication_backend
self._is_synchronous = is_synchronous
# =======================
# immutable attributes
# =======================
@property
def name(self) -> str:
"""
Return the name of the accelerator.
"""
return self._name
@property
def communication_backend(self) -> str:
"""
Return the name of the backend communication library.
"""
return self._communication_backend
@property
def is_synchronous(self) -> bool:
"""
Return whether the accelerator is a synchronous device.
"""
return self._is_synchronous
def __repr__(self) -> str:
cls_name = self.__class__.__name__
return f"{cls_name}(name={self._name}, communication_backend={self._communication_backend}, is_synchronous={self._is_synchronous})"
# =======================
# device APIs
# =======================
@abstractmethod
def get_version(self) -> str:
"""
Return the version of the accelerator which torch is built against.
"""
@abstractmethod
def get_current_device(self) -> torch.device:
"""
Return the current device.
"""
@abstractmethod
def current_device(self) -> int:
"""
Return the current device index.
"""
@abstractmethod
def set_device(self, device: Optional[Union[torch.device, int]] = None) -> None:
"""
Bind the current process to a device.
"""
@abstractmethod
def get_device_name(self, device: Union[torch.device, int]) -> str:
"""
Return the name of the device.
"""
@abstractmethod
def synchronize(self, device: Union[torch.device, int] = None):
"""
Synchronize the current process.
"""
@abstractmethod
def is_available(self):
"""
Check if the accelerator is available.
"""
@abstractmethod
def device_count(self):
"""
Return the number of devices on the machine.
"""
def set_to_device(self, models: Any) -> Any:
"""
Send model(s) to the current device.

Args:
models: an nn.Module or a list of nn.Module.
"""
if isinstance(models, list) and len(models) > 1:
ret = []
for model in models:
ret.append(model.to(self.get_current_device()))
return ret
elif isinstance(models, list):
return models[0].to(self.get_current_device())
else:
return models.to(self.get_current_device())
@abstractmethod
def get_device_capability(self, device=None) -> Tuple[int, int]:
"""
Gets the capability of a device.
"""
@abstractmethod
def get_device_name(self, device=None) -> str:
"""
Gets the name of a device.
"""
@abstractmethod
def get_device_properties(self, device):
"""
Gets the properties of a device.
"""
@abstractmethod
def utilization(self, device=None) -> int:
"""
Returns the percent of time over the past sample period during which one or more kernels were executing on the device, as given by nvidia-smi, npu-smi, etc.
"""
# =======================
# random number generator APIs
# =======================
@abstractmethod
def get_rng_state(self, device="cuda") -> torch.Tensor:
"""
Returns the random number generator state of the specified device as a ByteTensor.
"""
@abstractmethod
def get_rng_state_all(self) -> List[torch.Tensor]:
"""
Returns a list of ByteTensor representing the random number states of all devices.
"""
@abstractmethod
def set_rng_state(self, new_state: torch.ByteTensor, device: str = "cuda") -> None:
"""
Sets the random number generator state of the specified device.
"""
@abstractmethod
def set_rng_state_all(self, new_states: List[torch.ByteTensor]) -> None:
"""
Sets the random number generator state of all devices.
"""
@abstractmethod
def manual_seed(self, seed: int) -> None:
"""
Sets the seed for generating random numbers for the current device.
"""
@abstractmethod
def manual_seed_all(self, seed: int) -> None:
"""
Sets the seed for generating random numbers on all devices.
"""
@abstractmethod
def seed(self) -> None:
"""
Sets the seed for generating random numbers to a random number for the current device.
"""
@abstractmethod
def seed_all(self) -> None:
"""
Sets the seed for generating random numbers to a random number on all devices.
"""
@abstractmethod
def initial_seed(self) -> int:
"""
Returns the current random seed of the current device.
"""
# =======================
# memory management APIs
# =======================
@abstractmethod
def empty_cache(self) -> None:
"""
Releases all unoccupied cached memory currently held by the caching allocator so that those can be used in other device application and visible in nvidia-smi.
"""
@abstractmethod
def memory_stats(self, device=None) -> Dict[str, Any]:
"""
Returns a dictionary of CUDA memory allocator statistics for a given device.
"""
@abstractmethod
def memory_summary(self, device=None, abbreviated=False) -> str:
"""
Returns a human-readable printout of the current memory allocator statistics for a given device.
"""
@abstractmethod
def memory_snapshot(self):
"""
Returns a snapshot of the CUDA memory allocator state across all devices.
"""
@abstractmethod
def memory_allocated(self, device=None) -> int:
"""
Returns the current device memory occupied by tensors in bytes for a given device.
"""
@abstractmethod
def max_memory_allocated(self, device=None) -> int:
"""
Returns the maximum device memory occupied by tensors in bytes for a given device.
"""
@abstractmethod
def reset_max_memory_allocated(self, device=None) -> None:
"""
Resets the starting point in tracking maximum device memory occupied by tensors for a given device.
"""
@abstractmethod
def reset_max_memory_cached(self, device=None) -> None:
"""
Resets the starting point in tracking maximum device memory managed by the caching allocator for a given device.
"""
@abstractmethod
def memory_reserved(self, device=None) -> int:
"""
Returns the current device memory managed by the caching allocator in bytes for a given device.
"""
@abstractmethod
def max_memory_reserved(self, device=None) -> int:
"""
Returns the maximum device memory managed by the caching allocator in bytes for a given device.
"""
@abstractmethod
def set_per_process_memory_fraction(self, fraction: float, device=None) -> None:
"""
Set memory fraction for a process.
"""
@abstractmethod
def reset_peak_memory_stats(self, device=None) -> None:
"""
Resets the "peak" stats tracked by the device memory allocator.
"""
# =======================
# streams and events APIs
# =======================
@abstractmethod
def Stream(self, device=None, priority=0, **kwargs):
"""
A device stream is a linear sequence of execution that belongs to a specific device, independent from other streams. See cuda-semantics for details.
"""
@abstractmethod
def Event(self, enable_timing: bool = False, blocking: bool = False, interprocess: bool = False):
"""
device events are synchronization markers that can be used to monitor the device's progress, to accurately measure timing, and to synchronize CUDA streams.
"""
@abstractmethod
def current_stream(self, device=None):
"""
Returns the currently selected Stream for a given device.
"""
@abstractmethod
def default_stream(self, device=None):
"""
Returns the default Stream for a given device.
"""
@abstractmethod
def set_stream(self, stream_):
"""
Sets the current stream. This is a wrapper API to set the stream.
"""
@abstractmethod
def stream(self, stream_):
"""
Wrapper around the Context-manager StreamContext that selects a given stream.
"""
# =======================
# amp APIs
# =======================
@abstractmethod
def autocast(
self, enabled: bool = True, dtype: torch.dtype = torch.float16, cache_enabled: bool = True
) -> Callable:
"""
Return autocast function
"""
#!/usr/bin/env python
import resource
from contextlib import nullcontext
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import psutil
import torch
from .base_accelerator import BaseAccelerator
__all__ = ["CpuAccelerator"]
class CpuAccelerator(BaseAccelerator):
"""
Accelerator class for CPU.
"""
support_set_device: bool = False
def __init__(self):
super().__init__(name="cpu", communication_backend="gloo", is_synchronous=False)
# =======================
# device APIs
# =======================
def get_version(self) -> str:
"""
Return the version of the accelerator which torch is built against.
"""
return ""
def get_current_device(self) -> torch.device:
"""
Return the current device.
"""
return torch.device("cpu")
def current_device(self) -> int:
"""
Return the current device index.
"""
raise RuntimeError("this method is not supported for cpu accelerator")
def set_device(self, device: Optional[Union[torch.device, int]] = None) -> None:
"""
Bind the current process to a device.
"""
raise RuntimeError("this method is not supported for cpu accelerator")
def get_device_name(self, device: Union[torch.device, int]) -> str:
"""
Return the name of the device.
"""
raise RuntimeError("this method is not supported for cpu accelerator")
def synchronize(self, device: Union[torch.device, int] = None):
"""
Synchronize the current process.
"""
raise RuntimeError("this method is not supported for cpu accelerator")
def is_available(self):
"""
Check if the accelerator is available.
"""
return True
def device_count(self):
"""
Return the number of devices on the machine.
"""
raise RuntimeError("this method is not supported for cpu accelerator")
def get_device_capability(self, device=None) -> Tuple[int, int]:
"""
Gets the cuda capability of a device.
"""
raise RuntimeError("this method is not supported for cpu accelerator")
def get_device_name(self, device=None) -> str:
"""
Gets the name of a device.
"""
raise RuntimeError("this method is not supported for cpu accelerator")
def get_device_properties(self, device):
"""
Gets the properties of a device.
"""
raise RuntimeError("this method is not supported for cpu accelerator")
def utilization(self, device=None) -> int:
"""
Returns the percent of time over the past sample period during which one or more kernels were executing on the GPU, as given by nvidia-smi.
"""
raise RuntimeError("this method is not supported for cpu accelerator")
# =======================
# random number generator APIs
# =======================
def get_rng_state(self, device=None) -> torch.Tensor:
"""
Returns the random number generator state of the CPU as a ByteTensor.
"""
return torch.get_rng_state()
def get_rng_state_all(self) -> List[torch.Tensor]:
"""
Returns a list of ByteTensor representing the random number states of all devices.
"""
raise RuntimeError("this method is not supported for cpu accelerator")
def set_rng_state(self, new_state: torch.ByteTensor, device: str = None) -> None:
"""
Sets the random number generator state of the CPU.
"""
torch.set_rng_state(new_state)
def set_rng_state_all(self, new_states: List[torch.ByteTensor]) -> None:
"""
Sets the random number generator state of all devices.
"""
raise RuntimeError("this method is not supported for cpu accelerator")
def manual_seed(self, seed: int) -> None:
"""
Sets the seed for generating random numbers for the current GPU.
"""
raise RuntimeError("this method is not supported for cpu accelerator")
def manual_seed_all(self, seed: int) -> None:
"""
Set the random seed for all processes.
"""
raise RuntimeError("this method is not supported for cpu accelerator")
def seed(self) -> None:
"""
Sets the seed for generating random numbers to a random number for the current GPU.
"""
raise RuntimeError("this method is not supported for cpu accelerator")
def seed_all(self) -> None:
"""
Sets the seed for generating random numbers to a random number on all GPUs.
"""
raise RuntimeError("this method is not supported for cpu accelerator")
def initial_seed(self) -> int:
"""
Returns the current random seed of the current GPU.
"""
raise RuntimeError("this method is not supported for cpu accelerator")
# =======================
# memory management APIs
# =======================
def empty_cache(self) -> None:
"""
Releases all unoccupied cached memory currently held by the caching allocator so that those can be used in other GPU application and visible in nvidia-smi.
"""
raise RuntimeError("this method is not supported for cpu accelerator")
def memory_stats(self, device=None) -> Dict[str, Any]:
"""
Returns a dictionary of CUDA memory allocator statistics for a given device.
"""
raise RuntimeError("this method is not supported for cpu accelerator")
def memory_summary(self, device=None, abbreviated=False) -> str:
"""
Returns a human-readable printout of the current memory allocator statistics for a given device.
"""
raise RuntimeError("this method is not supported for cpu accelerator")
def memory_snapshot(self):
"""
Returns a snapshot of the CUDA memory allocator state across all devices.
"""
raise RuntimeError("this method is not supported for cpu accelerator")
def memory_allocated(self, device=None) -> int:
"""
Returns the current GPU memory occupied by tensors in bytes for a given device.
"""
raise RuntimeError("this method is not supported for cpu accelerator")
def max_memory_allocated(self, device=None) -> int:
"""
Returns the maximum GPU memory occupied by tensors in bytes for a given device.
"""
raise RuntimeError("this method is not supported for cpu accelerator")
def reset_max_memory_allocated(self, device=None) -> None:
"""
Resets the starting point in tracking maximum GPU memory occupied by tensors for a given device.
"""
raise RuntimeError("this method is not supported for cpu accelerator")
def reset_max_memory_cached(self, device=None) -> None:
"""
Resets the starting point in tracking maximum GPU memory managed by the caching allocator for a given device.
"""
raise RuntimeError("this method is not supported for cpu accelerator")
def memory_reserved(self, device=None) -> int:
"""
Returns the resident set size (RSS) of the current process in bytes.
"""
return psutil.Process().memory_info().rss
def max_memory_reserved(self, device=None) -> int:
"""
Returns the peak resident set size of the current process as reported by getrusage (kilobytes on Linux).
"""
return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
def set_per_process_memory_fraction(self, fraction: float, device=None) -> None:
"""
Set memory fraction for a process.
"""
max_memory = int(psutil.virtual_memory().total * fraction)
_, hard = resource.getrlimit(resource.RLIMIT_AS)
resource.setrlimit(resource.RLIMIT_AS, (max_memory, hard))
def reset_peak_memory_stats(self, device=None) -> None:
"""
Resets the "peak" stats tracked by the CUDA memory allocator.
"""
raise RuntimeError("this method is not supported for cpu accelerator")
# =======================
# streams and events APIs
# =======================
def Stream(self, device=None, priority=0, **kwargs):
"""
A CUDA stream is a linear sequence of execution that belongs to a specific device, independent from other streams. See cuda-semantics for details.
"""
raise RuntimeError("this method is not supported for cpu accelerator")
def Event(self, enable_timing: bool = False, blocking: bool = False, interprocess: bool = False):
"""
CUDA events are synchronization markers that can be used to monitor the device's progress, to accurately measure timing, and to synchronize CUDA streams.
"""
raise RuntimeError("this method is not supported for cpu accelerator")
def current_stream(self, device=None):
"""
Returns the currently selected Stream for a given device.
"""
raise RuntimeError("this method is not supported for cpu accelerator")
def default_stream(self, device=None):
"""
Returns the default Stream for a given device.
"""
raise RuntimeError("this method is not supported for cpu accelerator")
def set_stream(self, stream_):
"""
Sets the current stream. This is a wrapper API to set the stream.
"""
raise RuntimeError("this method is not supported for cpu accelerator")
def stream(self, stream_):
"""
Wrapper around the Context-manager StreamContext that selects a given stream.
"""
raise RuntimeError("this method is not supported for cpu accelerator")
# =======================
# amp APIs
# =======================
def autocast(
self, enabled: bool = True, dtype: torch.dtype = torch.float16, cache_enabled: bool = True
) -> Callable:
"""
Return autocast function
"""
return nullcontext()
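On the CPU backend, the few memory APIs that are implemented report host process statistics rather than a device allocator; a small sketch:

```python
from colossalai.accelerator import CpuAccelerator

acc = CpuAccelerator()
print(acc.memory_reserved())  # resident set size (RSS) of this process, in bytes
print(acc.max_memory_reserved())  # peak RSS from getrusage (kilobytes on Linux)
```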
#!/usr/bin/env python
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import torch
import torch.distributed as dist
from .base_accelerator import BaseAccelerator
__all__ = ["CudaAccelerator"]
class CudaAccelerator(BaseAccelerator):
"""
Accelerator class for Nvidia CUDA devices.
"""
def __init__(self):
super().__init__(name="cuda", communication_backend="nccl", is_synchronous=False)
# =======================
# device APIs
# =======================
def get_version(self) -> str:
"""
Return the version of the accelerator which torch is built against.
"""
return torch.version.cuda
def get_current_device(self) -> torch.device:
"""
Return the current device.
"""
return torch.device(f"cuda:{torch.cuda.current_device()}")
def current_device(self) -> int:
"""
Return the current device index.
"""
return torch.cuda.current_device()
def set_device(self, device: Optional[Union[torch.device, int]] = None) -> None:
"""
Bind the current process to a device.
"""
if device is None:
if not dist.is_initialized():
raise RuntimeError("Cannot get current device when distributed is not initialized.")
device = dist.get_rank() % self.device_count()
torch.cuda.set_device(device)
def get_device_name(self, device: Union[torch.device, int]) -> str:
"""
Return the name of the device.
"""
return torch.cuda.get_device_name(device)
def synchronize(self, device: Union[torch.device, int] = None):
"""
Synchronize the current process.
"""
torch.cuda.synchronize(device)
def is_available(self):
"""
Check if the accelerator is available.
"""
return torch.cuda.is_available()
def device_count(self):
"""
Return the number of devices on the machine.
"""
return torch.cuda.device_count()
def get_device_capability(self, device=None) -> Tuple[int, int]:
"""
Gets the cuda capability of a device.
"""
return torch.cuda.get_device_capability(device)
def get_device_name(self, device=None) -> str:
"""
Gets the name of a device.
"""
return torch.cuda.get_device_name(device)
def get_device_properties(self, device):
"""
Gets the properties of a device.
"""
return torch.cuda.get_device_properties(device)
def utilization(self, device=None) -> int:
"""
Returns the percent of time over the past sample period during which one or more kernels were executing on the GPU, as given by nvidia-smi.
"""
return torch.cuda.utilization(device)
# =======================
# random number generator APIs
# =======================
def get_rng_state(self, device="cuda") -> torch.Tensor:
"""
Returns the random number generator state of the specified GPU as a ByteTensor.
"""
return torch.cuda.get_rng_state(device)
def get_rng_state_all(self) -> List[torch.Tensor]:
"""
Returns a list of ByteTensor representing the random number states of all devices.
"""
return torch.cuda.get_rng_state_all()
def set_rng_state(self, new_state: torch.ByteTensor, device: str = "cuda") -> None:
"""
Sets the random number generator state of the specified GPU.
"""
torch.cuda.set_rng_state(new_state, device)
def set_rng_state_all(self, new_states: List[torch.ByteTensor]) -> None:
"""
Sets the random number generator state of all devices.
"""
torch.cuda.set_rng_state_all(new_states)
def manual_seed(self, seed: int) -> None:
"""
Sets the seed for generating random numbers for the current GPU.
"""
torch.cuda.manual_seed(seed)
def manual_seed_all(self, seed: int) -> None:
"""
Set the random seed for all processes.
"""
torch.cuda.manual_seed_all(seed)
def seed(self) -> None:
"""
Sets the seed for generating random numbers to a random number for the current GPU.
"""
torch.cuda.seed()
def seed_all(self) -> None:
"""
Sets the seed for generating random numbers to a random number on all GPUs.
"""
torch.cuda.seed_all()
def initial_seed(self) -> int:
"""
Returns the current random seed of the current GPU.
"""
return torch.cuda.initial_seed()
# =======================
# memory management APIs
# =======================
def empty_cache(self) -> None:
"""
Releases all unoccupied cached memory currently held by the caching allocator so that those can be used in other GPU application and visible in nvidia-smi.
"""
torch.cuda.empty_cache()
def memory_stats(self, device=None) -> Dict[str, Any]:
"""
Returns a dictionary of CUDA memory allocator statistics for a given device.
"""
return torch.cuda.memory_stats(device=device)
def memory_summary(self, device=None, abbreviated=False) -> str:
"""
Returns a human-readable printout of the current memory allocator statistics for a given device.
"""
return torch.cuda.memory_summary(device=device, abbreviated=abbreviated)
def memory_snapshot(self):
"""
Returns a snapshot of the CUDA memory allocator state across all devices.
"""
return torch.cuda.memory_snapshot()
def memory_allocated(self, device=None) -> int:
"""
Returns the current GPU memory occupied by tensors in bytes for a given device.
"""
return torch.cuda.memory_allocated(device=device)
def max_memory_allocated(self, device=None) -> int:
"""
Returns the maximum GPU memory occupied by tensors in bytes for a given device.
"""
return torch.cuda.max_memory_allocated(device=device)
def reset_max_memory_allocated(self, device=None) -> None:
"""
Resets the starting point in tracking maximum GPU memory occupied by tensors for a given device.
"""
torch.cuda.reset_max_memory_allocated(device=device)
def reset_max_memory_cached(self, device=None) -> None:
"""
Resets the starting point in tracking maximum GPU memory managed by the caching allocator for a given device.
"""
torch.cuda.reset_max_memory_cached(device=device)
def memory_reserved(self, device=None) -> int:
"""
Returns the current GPU memory managed by the caching allocator in bytes for a given device.
"""
return torch.cuda.memory_reserved(device=device)
def max_memory_reserved(self, device=None) -> int:
"""
Returns the maximum GPU memory managed by the caching allocator in bytes for a given device.
"""
return torch.cuda.max_memory_reserved(device=device)
def set_per_process_memory_fraction(self, fraction: float, device=None) -> None:
"""
Set memory fraction for a process.
"""
torch.cuda.set_per_process_memory_fraction(fraction, device=device)
def reset_peak_memory_stats(self, device=None) -> None:
"""
Resets the "peak" stats tracked by the CUDA memory allocator.
"""
torch.cuda.reset_peak_memory_stats(device=device)
# =======================
# streams and events APIs
# =======================
def Stream(self, device=None, priority=0, **kwargs):
"""
A CUDA stream is a linear sequence of execution that belongs to a specific device, independent from other streams. See cuda-semantics for details.
"""
return torch.cuda.Stream(device, priority, **kwargs)
def Event(self, enable_timing: bool = False, blocking: bool = False, interprocess: bool = False):
"""
CUDA events are synchronization markers that can be used to monitor the device's progress, to accurately measure timing, and to synchronize CUDA streams.
"""
return torch.cuda.Event(enable_timing, blocking, interprocess)
def current_stream(self, device=None):
"""
Returns the currently selected Stream for a given device.
"""
return torch.cuda.current_stream(device)
def default_stream(self, device=None):
"""
Returns the default Stream for a given device.
"""
return torch.cuda.default_stream(device)
def set_stream(self, stream_):
"""
Sets the current stream. This is a wrapper API to set the stream.
"""
torch.cuda.set_stream(stream_)
def stream(self, stream_):
"""
Wrapper around the Context-manager StreamContext that selects a given stream.
"""
return torch.cuda.stream(stream_)
# =======================
# amp APIs
# =======================
def autocast(
self, enabled: bool = True, dtype: torch.dtype = torch.float16, cache_enabled: bool = True
) -> Callable:
"""
Return autocast function
"""
return torch.cuda.amp.autocast(enabled=enabled, dtype=dtype, cache_enabled=cache_enabled)
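Each method above is a one-line delegation to `torch.cuda`, so the wrapper and the native API stay interchangeable; a quick sanity sketch (guarded so it is a no-op without CUDA):

```python
import torch

from colossalai.accelerator import CudaAccelerator

if torch.cuda.is_available():
    acc = CudaAccelerator()
    assert acc.current_device() == torch.cuda.current_device()
    assert acc.device_count() == torch.cuda.device_count()
    assert acc.get_version() == torch.version.cuda
```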
#!/usr/bin/env python
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import torch
import torch.distributed as dist
from .base_accelerator import BaseAccelerator
try:
import torch_npu # noqa
except ImportError:
pass
__all__ = ["NpuAccelerator"]
class NpuAccelerator(BaseAccelerator):
"""
Accelerator class for Huawei NPU devices.
"""
def __init__(self):
super().__init__(name="npu", communication_backend="hccl", is_synchronous=False)
# =======================
# device APIs
# =======================
def get_version(self) -> str:
"""
Return the version of the accelerator which torch is built against.
"""
return torch.version.npu
def get_current_device(self) -> torch.device:
"""
Return the current device.
"""
return torch.device(f"npu:{torch.npu.current_device()}")
def current_device(self) -> int:
"""
Return the current device index.
"""
return torch.npu.current_device()
def set_device(self, device: Optional[Union[torch.device, int]] = None) -> None:
"""
Bind the current process to a device.
"""
if device is None:
if not dist.is_initialized():
raise RuntimeError("Cannot get current device when distributed is not initialized.")
device = dist.get_rank() % self.device_count()
torch.npu.set_device(device)
def get_device_name(self, device: Union[torch.device, int]) -> str:
"""
Return the name of the device.
"""
return torch.npu.get_device_name(device)
def synchronize(self, device: Union[torch.device, int] = None):
"""
Synchronize the current process.
"""
torch.npu.synchronize(device)
def is_available(self):
"""
Check if the accelerator is available.
"""
return torch.npu.is_available()
def device_count(self):
"""
Return the number of devices on the machine.
"""
return torch.npu.device_count()
def get_device_capability(self, device=None) -> Tuple[int, int]:
"""
Gets the NPU capability of a device.
"""
return torch.npu.get_device_capability(device)
def get_device_name(self, device=None) -> str:
"""
Gets the name of a device.
"""
return torch.npu.get_device_name(device)
def get_device_properties(self, device):
"""
Gets the properties of a device.
"""
return torch.npu.get_device_properties(device)
def utilization(self, device=None) -> int:
"""
Returns the percent of time over the past sample period during which one or more kernels were executing on the NPU, as given by npu-smi.
"""
return torch.npu.utilization(device)
# =======================
# random number generator APIs
# =======================
def get_rng_state(self, device="npu") -> torch.Tensor:
"""
Returns the random number generator state of the specified NPU as a ByteTensor.
"""
return torch.npu.get_rng_state(device)
def get_rng_state_all(self) -> List[torch.Tensor]:
"""
Returns a list of ByteTensor representing the random number states of all devices.
"""
return torch.npu.get_rng_state_all()
def set_rng_state(self, new_state: torch.ByteTensor, device: str = "npu") -> None:
"""
Sets the random number generator state of the specified NPU.
"""
torch.npu.set_rng_state(new_state, device)
def set_rng_state_all(self, new_states: List[torch.ByteTensor]) -> None:
"""
Sets the random number generator state of all devices.
"""
torch.npu.set_rng_state_all(new_states)
def manual_seed(self, seed: int) -> None:
"""
Sets the seed for generating random numbers for the current NPU.
"""
torch.npu.manual_seed(seed)
def manual_seed_all(self, seed: int) -> None:
"""
Set the random seed for all processes.
"""
torch.npu.manual_seed_all(seed)
def seed(self) -> None:
"""
Sets the seed for generating random numbers to a random number for the current NPU.
"""
torch.npu.seed()
def seed_all(self) -> None:
"""
Sets the seed for generating random numbers to a random number on all NPUs.
"""
torch.npu.seed_all()
def initial_seed(self) -> int:
"""
Returns the current random seed of the current NPU.
"""
return torch.npu.initial_seed()
# =======================
# memory management APIs
# =======================
def empty_cache(self) -> None:
"""
Releases all unoccupied cached memory currently held by the caching allocator so that it can be used by other NPU applications and is visible in npu-smi.
"""
torch.npu.empty_cache()
def memory_stats(self, device=None) -> Dict[str, Any]:
"""
Returns a dictionary of NPU memory allocator statistics for a given device.
"""
return torch.npu.memory_stats(device=device)
def memory_summary(self, device=None, abbreviated=False) -> str:
"""
Returns a human-readable printout of the current memory allocator statistics for a given device.
"""
return torch.npu.memory_summary(device=device, abbreviated=abbreviated)
def memory_snapshot(self):
"""
Returns a snapshot of the NPU memory allocator state across all devices.
"""
return torch.npu.memory_snapshot()
def memory_allocated(self, device=None) -> int:
"""
Returns the current NPU memory occupied by tensors in bytes for a given device.
"""
return torch.npu.memory_allocated(device=device)
def max_memory_allocated(self, device=None) -> int:
"""
Returns the maximum NPU memory occupied by tensors in bytes for a given device.
"""
return torch.npu.max_memory_allocated(device=device)
def reset_max_memory_allocated(self, device=None) -> None:
"""
Resets the starting point in tracking maximum NPU memory occupied by tensors for a given device.
"""
torch.npu.reset_max_memory_allocated(device=device)
def reset_max_memory_cached(self, device=None) -> None:
"""
Resets the starting point in tracking maximum NPU memory managed by the caching allocator for a given device.
"""
torch.npu.reset_max_memory_cached(device=device)
def memory_reserved(self, device=None) -> int:
"""
Returns the current NPU memory managed by the caching allocator in bytes for a given device.
"""
return torch.npu.memory_reserved(device=device)
def max_memory_reserved(self, device=None) -> int:
"""
Returns the maximum NPU memory managed by the caching allocator in bytes for a given device.
"""
return torch.npu.max_memory_reserved(device=device)
def set_per_process_memory_fraction(self, fraction: float, device=None) -> None:
"""
Set memory fraction for a process.
"""
torch.npu.set_per_process_memory_fraction(fraction, device=device)
def reset_peak_memory_stats(self, device=None) -> None:
"""
Resets the "peak" stats tracked by the npu memory allocator.
"""
torch.npu.reset_peak_memory_stats(device=device)
# =======================
# streams and events APIs
# =======================
def Stream(self, device=None, priority=0, **kwargs):
"""
An NPU stream is a linear sequence of execution that belongs to a specific device, independent from other streams. See npu-semantics for details.
"""
return torch.npu.Stream(device, priority, **kwargs)
def Event(self, enable_timing: bool = False, blocking: bool = False, interprocess: bool = False):
"""
NPU events are synchronization markers that can be used to monitor the device's progress, to accurately measure timing, and to synchronize NPU streams.
"""
return torch.npu.Event(enable_timing, blocking, interprocess)
def current_stream(self, device=None):
"""
Returns the currently selected Stream for a given device.
"""
return torch.npu.current_stream(device)
def default_stream(self, device=None):
"""
Returns the default Stream for a given device.
"""
return torch.npu.default_stream(device)
def set_stream(self, stream_):
"""
Sets the current stream. This is a wrapper API to set the stream.
"""
torch.npu.set_stream(stream_)
def stream(self, stream_):
"""
Wrapper around the Context-manager StreamContext that selects a given stream.
"""
return torch.npu.stream(stream_)
# =======================
# amp APIs
# =======================
def autocast(
self, enabled: bool = True, dtype: torch.dtype = torch.float16, cache_enabled: bool = True
) -> Callable:
"""
Return autocast function
"""
return torch.npu.amp.autocast(enabled=enabled, dtype=dtype, cache_enabled=cache_enabled)
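The NPU backend only functions when the `torch_npu` extension is importable; a guarded sketch of selecting it explicitly (whether the package is installed is a property of the runtime, not something this PR controls):

```python
import torch

try:
    import torch_npu  # noqa: F401  registers the "npu" device type with torch

    from colossalai.accelerator import NpuAccelerator

    acc = NpuAccelerator()
    if acc.is_available():
        x = torch.ones(2, device=acc.get_current_device())
except ImportError:
    pass  # torch_npu not installed; fall back to the CUDA/CPU backends
```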