Unverified Commit 8823cc48 authored by Frank Lee's avatar Frank Lee Committed by GitHub
Browse files

Merge pull request #5310 from hpcaitech/feature/npu

Feature/npu
parents bce9499e 73f4dc57
from .cpu_adam import CpuAdamArmExtension, CpuAdamX86Extension
from .flash_attention import (
FlashAttentionDaoCudaExtension,
FlashAttentionNpuExtension,
FlashAttentionXformersCudaExtension,
)
from .layernorm import LayerNormCudaExtension
from .moe import MoeCudaExtension
from .optimizer import FusedOptimizerCudaExtension
from .softmax import ScaledMaskedSoftmaxCudaExtension, ScaledUpperTriangleMaskedSoftmaxCudaExtension
# Registry of every concrete extension implementation shipped with the package.
# NOTE(review): ordering may matter to downstream selection logic (extensions
# carry a `priority` attribute) — do not reorder without confirming the loader.
ALL_EXTENSIONS = [
CpuAdamArmExtension,
CpuAdamX86Extension,
LayerNormCudaExtension,
MoeCudaExtension,
FusedOptimizerCudaExtension,
ScaledMaskedSoftmaxCudaExtension,
ScaledUpperTriangleMaskedSoftmaxCudaExtension,
FlashAttentionDaoCudaExtension,
FlashAttentionXformersCudaExtension,
FlashAttentionNpuExtension,
]
# Public API of this package: one name per extension class re-exported above.
__all__ = [
"CpuAdamArmExtension",
"CpuAdamX86Extension",
"LayerNormCudaExtension",
"MoeCudaExtension",
"FusedOptimizerCudaExtension",
"ScaledMaskedSoftmaxCudaExtension",
"ScaledUpperTriangleMaskedSoftmaxCudaExtension",
"FlashAttentionDaoCudaExtension",
"FlashAttentionXformersCudaExtension",
"FlashAttentionNpuExtension",
]
import hashlib
import os
from abc import ABC, abstractmethod
from typing import Union
__all__ = ["_Extension"]
class _Extension(ABC):
def __init__(self, name: str, support_aot: bool, support_jit: bool, priority: int = 1):
self._name = name
self._support_aot = support_aot
self._support_jit = support_jit
self.priority = priority
@property
def name(self):
return self._name
@property
def support_aot(self):
return self._support_aot
@property
def support_jit(self):
return self._support_jit
@staticmethod
def get_jit_extension_folder_path():
"""
Kernels which are compiled during runtime will be stored in the same cache folder for reuse.
The folder is in the path ~/.cache/colossalai/torch_extensions/<cache-folder>.
The name of the <cache-folder> follows a common format:
torch<torch_version_major>.<torch_version_minor>_<device_name><device_version>-<hash>
The <hash> suffix is the hash value of the path of the `colossalai` file.
"""
import torch
import colossalai
from colossalai.accelerator import get_accelerator
# get torch version
torch_version_major = torch.__version__.split(".")[0]
torch_version_minor = torch.__version__.split(".")[1]
# get device version
device_name = get_accelerator().name
device_version = get_accelerator().get_version()
# use colossalai's file path as hash
hash_suffix = hashlib.sha256(colossalai.__file__.encode()).hexdigest()
# concat
home_directory = os.path.expanduser("~")
extension_directory = f".cache/colossalai/torch_extensions/torch{torch_version_major}.{torch_version_minor}_{device_name}-{device_version}-{hash_suffix}"
cache_directory = os.path.join(home_directory, extension_directory)
return cache_directory
@abstractmethod
def is_hardware_available(self) -> bool:
"""
Check if the hardware required by the kernel is available.
"""
@abstractmethod
def assert_hardware_compatible(self) -> bool:
"""
Check if the hardware required by the kernel is compatible.
"""
@abstractmethod
def build_aot(self) -> Union["CppExtension", "CUDAExtension"]:
pass
@abstractmethod
def build_jit(self) -> None:
pass
@abstractmethod
def load(self):
pass
import importlib
import os
import time
from abc import abstractmethod
from pathlib import Path
from typing import List
from .base_extension import _Extension
__all__ = ["_CppExtension"]
class _CppExtension(_Extension):
    """Base class for extensions backed by a plain C++ (CPU) kernel.

    Provides the common machinery: locating `csrc` sources, importing a
    pre-built kernel from ``colossalai._C``, and falling back to JIT
    compilation via ``torch.utils.cpp_extension`` when no pre-built kernel
    is found.
    """

    def __init__(self, name: str, priority: int = 1):
        super().__init__(name, support_aot=True, support_jit=True, priority=priority)

        # we store the op as an attribute to avoid repeated building and loading
        self.cached_op = None

        # build-related variables
        self.prebuilt_module_path = "colossalai._C"
        self.prebuilt_import_path = f"{self.prebuilt_module_path}.{self.name}"
        self.version_dependent_macros = ["-DVERSION_GE_1_1", "-DVERSION_GE_1_3", "-DVERSION_GE_1_5"]

    def csrc_abs_path(self, path: str) -> str:
        """Return the absolute path of *path* relative to the ``csrc`` directory."""
        return os.path.join(self.relative_to_abs_path("csrc"), path)

    def relative_to_abs_path(self, code_path: str) -> str:
        """
        Take a path relative to the ``extensions`` root directory and return
        its absolute form.

        Raises:
            RuntimeError: if no ancestor directory of this file is named
                ``extensions``.
        """
        # Walk up from this file until we hit the "extensions" directory.
        # FIX: at the filesystem root, Path.parent returns the path itself,
        # so the original unguarded loop would spin forever if the file is
        # ever placed outside an "extensions" directory; fail loudly instead.
        current_file_path = Path(__file__)
        while current_file_path.name != "extensions":
            parent = current_file_path.parent
            if parent == current_file_path:
                raise RuntimeError(
                    f"Could not locate an 'extensions' directory above {Path(__file__)}"
                )
            current_file_path = parent
        extension_module_path = current_file_path
        code_abs_path = extension_module_path.joinpath(code_path)
        return str(code_abs_path)

    def strip_empty_entries(self, args):
        """
        Drop any empty strings from the list of compile and link flags
        """
        return [x for x in args if len(x) > 0]

    def import_op(self):
        """
        Import the pre-built op module by its dotted-path name.

        Raises:
            ImportError: if the kernel was not built ahead of time.
        """
        return importlib.import_module(self.prebuilt_import_path)

    def build_aot(self) -> "CppExtension":
        """Return a ``CppExtension`` describing this kernel for AOT builds."""
        from torch.utils.cpp_extension import CppExtension

        return CppExtension(
            name=self.prebuilt_import_path,
            sources=self.strip_empty_entries(self.sources_files()),
            include_dirs=self.strip_empty_entries(self.include_dirs()),
            extra_compile_args=self.strip_empty_entries(self.cxx_flags()),
        )

    # FIX: was annotated `-> None` although it returns the loaded kernel module.
    def build_jit(self):
        """Compile (or reload a previously compiled) kernel at runtime and return it."""
        from torch.utils.cpp_extension import load

        build_directory = _Extension.get_jit_extension_folder_path()
        build_directory = Path(build_directory)
        build_directory.mkdir(parents=True, exist_ok=True)

        # check if the kernel has been built before (object file already cached)
        compiled_before = False
        kernel_file_path = build_directory.joinpath(f"{self.name}.o")
        if kernel_file_path.exists():
            compiled_before = True

        # load the kernel
        if compiled_before:
            print(f"[extension] Loading the JIT-built {self.name} kernel during runtime now")
        else:
            print(f"[extension] Compiling the JIT {self.name} kernel during runtime now")

        build_start = time.time()
        op_kernel = load(
            name=self.name,
            sources=self.strip_empty_entries(self.sources_files()),
            extra_include_paths=self.strip_empty_entries(self.include_dirs()),
            extra_cflags=self.cxx_flags(),
            extra_ldflags=[],
            build_directory=str(build_directory),
        )
        build_duration = time.time() - build_start

        if compiled_before:
            print(f"[extension] Time taken to load {self.name} op: {build_duration} seconds")
        else:
            print(f"[extension] Time taken to compile {self.name} op: {build_duration} seconds")

        return op_kernel

    # ---- functions that subclasses must override ----
    @abstractmethod
    def sources_files(self) -> List[str]:
        """
        This function should return a list of source files for extensions.
        """

    @abstractmethod
    def include_dirs(self) -> List[str]:
        """
        This function should return a list of include files for extensions.
        """

    @abstractmethod
    def cxx_flags(self) -> List[str]:
        """
        This function should return a list of cxx compilation flags for extensions.
        """

    def load(self):
        """Return the kernel module, preferring the pre-built op over a JIT build."""
        try:
            op_kernel = self.import_op()
        except ImportError:
            # if import error occurs, it means that the kernel is not pre-built
            # so we build it jit
            op_kernel = self.build_jit()
        return op_kernel
from .cpu_adam_arm import CpuAdamArmExtension
from .cpu_adam_x86 import CpuAdamX86Extension
__all__ = ['CpuAdamArmExtension', 'CpuAdamX86Extension']
from .builder import Builder
import platform
from ..cpp_extension import _CppExtension
class ArmCPUAdamBuilder(Builder):
NAME = "arm_cpu_adam"
PREBUILT_IMPORT_PATH = "colossalai._C.arm_cpu_adam"
ext_type = "cpu"
class CpuAdamArmExtension(_CppExtension):
def __init__(self):
super().__init__(name=ArmCPUAdamBuilder.NAME, prebuilt_import_path=ArmCPUAdamBuilder.PREBUILT_IMPORT_PATH)
self.version_dependent_macros = ["-DVERSION_GE_1_1", "-DVERSION_GE_1_3", "-DVERSION_GE_1_5"]
super().__init__(name="cpu_adam_arm")
def is_hardware_available(self) -> bool:
# only arm allowed
return platform.machine() == "aarch64"
def assert_hardware_compatible(self) -> None:
arch = platform.machine()
assert (
arch == "aarch64"
), f"[extension] The {self.name} kernel requires the CPU architecture to be aarch64 but got {arch}"
# necessary 4 functions
def sources_files(self):
ret = [
self.csrc_abs_path("cpu_adam_arm.cpp"),
self.csrc_abs_path("arm/cpu_adam_arm.cpp"),
]
return ret
def include_dirs(self):
return [self.csrc_abs_path("includes")]
return []
def cxx_flags(self):
extra_cxx_flags = [
......
from .builder import Builder
from .utils import append_nvcc_threads
import platform
from ..cuda_extension import _CudaExtension
from ..utils import append_nvcc_threads
class CPUAdamBuilder(Builder):
NAME = "cpu_adam"
PREBUILT_IMPORT_PATH = "colossalai._C.cpu_adam"
class CpuAdamX86Extension(_CudaExtension):
def __init__(self):
super().__init__(name=CPUAdamBuilder.NAME, prebuilt_import_path=CPUAdamBuilder.PREBUILT_IMPORT_PATH)
self.version_dependent_macros = ["-DVERSION_GE_1_1", "-DVERSION_GE_1_3", "-DVERSION_GE_1_5"]
super().__init__(name="cpu_adam_x86")
def is_hardware_available(self) -> bool:
return platform.machine() == "x86_64" and super().is_hardware_available()
def assert_hardware_compatible(self) -> None:
arch = platform.machine()
assert (
arch == "x86_64"
), f"[extension] The {self.name} kernel requires the CPU architecture to be x86_64 but got {arch}"
super().assert_hardware_compatible()
# necessary 4 functions
def sources_files(self):
ret = [
self.csrc_abs_path("cpu_adam.cpp"),
self.csrc_abs_path("cuda/cpu_adam.cpp"),
]
return ret
......
from .layer_norm import MixedFusedLayerNorm as LayerNorm
from .mha.mha import ColoAttention
from .multihead_attention import MultiHeadAttention
from .scaled_softmax import AttnMaskType, FusedScaleMaskSoftmax, ScaledUpperTriangMaskedSoftmax
......@@ -8,6 +7,5 @@ __all__ = [
"MultiHeadAttention",
"FusedScaleMaskSoftmax",
"ScaledUpperTriangMaskedSoftmax",
"ColoAttention",
"AttnMaskType",
]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment