Unverified Commit 8823cc48 authored by Frank Lee's avatar Frank Lee Committed by GitHub
Browse files

Merge pull request #5310 from hpcaitech/feature/npu

Feature/npu
parents bce9499e 73f4dc57
from .builder import Builder
from .utils import append_nvcc_threads, get_cuda_cc_flag
from ..cuda_extension import _CudaExtension
from ..utils import append_nvcc_threads, get_cuda_cc_flag
class MOEBuilder(Builder):
NAME = "moe"
PREBUILT_IMPORT_PATH = "colossalai._C.moe"
class MoeCudaExtension(_CudaExtension):
def __init__(self):
super().__init__(name=MOEBuilder.NAME, prebuilt_import_path=MOEBuilder.PREBUILT_IMPORT_PATH)
super().__init__(name="moe_cuda")
def include_dirs(self):
ret = [self.csrc_abs_path("kernels/include"), self.get_cuda_home_include()]
ret = [self.csrc_abs_path("cuda/include"), self.get_cuda_home_include()]
return ret
def sources_files(self):
ret = [self.csrc_abs_path(fname) for fname in ["moe_cuda.cpp", "moe_cuda_kernel.cu"]]
ret = [self.csrc_abs_path(fname) for fname in ["cuda/moe_cuda.cpp", "cuda/moe_cuda_kernel.cu"]]
return ret
def cxx_flags(self):
......
from .fused_optimizer_cuda import FusedOptimizerCudaExtension
__all__ = ['FusedOptimizerCudaExtension']
\ No newline at end of file
from .builder import Builder
from .utils import get_cuda_cc_flag
from ..cuda_extension import _CudaExtension
from ..utils import get_cuda_cc_flag
class FusedOptimBuilder(Builder):
NAME = "fused_optim"
PREBUILT_IMPORT_PATH = "colossalai._C.fused_optim"
class FusedOptimizerCudaExtension(_CudaExtension):
def __init__(self):
super().__init__(name=FusedOptimBuilder.NAME, prebuilt_import_path=FusedOptimBuilder.PREBUILT_IMPORT_PATH)
super().__init__(name="fused_optim_cuda")
def sources_files(self):
ret = [
self.csrc_abs_path(fname)
for fname in [
"colossal_C_frontend.cpp",
"multi_tensor_sgd_kernel.cu",
"multi_tensor_scale_kernel.cu",
"multi_tensor_adam.cu",
"multi_tensor_l2norm_kernel.cu",
"multi_tensor_lamb.cu",
"cuda/colossal_C_frontend.cpp",
"cuda/multi_tensor_sgd_kernel.cu",
"cuda/multi_tensor_scale_kernel.cu",
"cuda/multi_tensor_adam.cu",
"cuda/multi_tensor_l2norm_kernel.cu",
"cuda/multi_tensor_lamb.cu",
]
]
return ret
def include_dirs(self):
ret = [self.csrc_abs_path("kernels/include"), self.get_cuda_home_include()]
ret = [self.get_cuda_home_include()]
return ret
def cxx_flags(self):
......
from .scaled_masked_softmax_cuda import ScaledMaskedSoftmaxCudaExtension
from .scaled_upper_triangle_masked_softmax_cuda import ScaledUpperTriangleMaskedSoftmaxCudaExtension
__all__ = ['ScaledMaskedSoftmaxCudaExtension', 'ScaledUpperTriangleMaskedSoftmaxCudaExtension']
\ No newline at end of file
from .builder import Builder
from .utils import append_nvcc_threads
from ..cuda_extension import _CudaExtension
from ..utils import append_nvcc_threads
class ScaledMaskedSoftmaxBuilder(Builder):
NAME = "scaled_masked_softmax"
PREBUILT_IMPORT_PATH = "colossalai._C.scaled_masked_softmax"
class ScaledMaskedSoftmaxCudaExtension(_CudaExtension):
def __init__(self):
super().__init__(
name=ScaledMaskedSoftmaxBuilder.NAME, prebuilt_import_path=ScaledMaskedSoftmaxBuilder.PREBUILT_IMPORT_PATH
)
super().__init__(name="scaled_masked_softmax_cuda")
# necessary 4 functions
def sources_files(self):
ret = [self.csrc_abs_path(fname) for fname in ["scaled_masked_softmax.cpp", "scaled_masked_softmax_cuda.cu"]]
ret = [
self.csrc_abs_path(fname)
for fname in ["cuda/scaled_masked_softmax.cpp", "cuda/scaled_masked_softmax_cuda.cu"]
]
return ret
def include_dirs(self):
return [self.csrc_abs_path("kernels/include"), self.get_cuda_home_include()]
return [self.get_cuda_home_include()]
def cxx_flags(self):
return ["-O3"] + self.version_dependent_macros
......
from .builder import Builder
from .utils import append_nvcc_threads, get_cuda_cc_flag
from ..cuda_extension import _CudaExtension
from ..utils import append_nvcc_threads, get_cuda_cc_flag
class ScaledUpperTrainglemaskedSoftmaxBuilder(Builder):
NAME = "scaled_upper_triangle_masked_softmax"
PREBUILT_IMPORT_PATH = "colossalai._C.scaled_upper_triangle_masked_softmax"
class ScaledUpperTriangleMaskedSoftmaxCudaExtension(_CudaExtension):
def __init__(self):
super().__init__(
name=ScaledUpperTrainglemaskedSoftmaxBuilder.NAME,
prebuilt_import_path=ScaledUpperTrainglemaskedSoftmaxBuilder.PREBUILT_IMPORT_PATH,
)
super().__init__(name="scaled_upper_triangle_masked_softmax_cuda")
def include_dirs(self):
return [self.csrc_abs_path("kernels/include"), self.get_cuda_home_include()]
return [self.get_cuda_home_include()]
def sources_files(self):
ret = [
self.csrc_abs_path(fname)
for fname in ["scaled_upper_triang_masked_softmax.cpp", "scaled_upper_triang_masked_softmax_cuda.cu"]
for fname in [
"cuda/scaled_upper_triang_masked_softmax.cpp",
"cuda/scaled_upper_triang_masked_softmax_cuda.cu",
]
]
return ret
......
from .base_extension import _Extension
__all__ = ["_TritonExtension"]
class _TritonExtension(_Extension):
    """Base class for JIT-only Triton extensions.

    Triton kernels are compiled at runtime (``support_jit=True``) and are
    never built ahead of time (``support_aot=False``).
    """

    def __init__(self, name: str, priority: int = 1):
        super().__init__(name, support_aot=False, support_jit=True, priority=priority)

    def is_hardware_compatible(self) -> bool:
        """Return True if a CUDA device is usable (Triton targets CUDA GPUs)."""
        # the triton extension can only be built if cuda is available
        try:
            import torch

            cuda_available = torch.cuda.is_available()
        except Exception:
            # Previously a bare `except:` — narrowed so that e.g.
            # KeyboardInterrupt/SystemExit are no longer swallowed, while the
            # best-effort "treat any probe failure as unavailable" behavior
            # is preserved.
            cuda_available = False
        return cuda_available

    def load(self):
        """Build (if needed) and return the JIT-compiled kernel module."""
        return self.build_jit()
# Build PyTorch Extensions
## Overview
Building PyTorch extensions can be a difficult task for users without a systems background. It is definitely frustrating if users encounter a lot of unfamiliar technical jargon when installing Colossal-AI. Therefore, we provide two methods of building the PyTorch extensions for the users.
1. Build CUDA extensions when running `pip install` if `CUDA_EXT=1`
2. Build the extension during runtime
The first method is more suitable for users who are familiar with CUDA environment configurations. The second method is for those who are not as they only need to build the kernel which is required by their program.
These two methods have different advantages and disadvantages.
Method 1 is good because it allows the user to build all kernels during installation and directly import the kernel. They don't need to care about kernel building when running their program. However, installation may fail if they don't know how to configure their environments and this leads to much frustration.
Method 2 is good because it allows the user to build only the kernels they actually need, so there is a lower probability that they encounter environment issues. However, it may slow down their program due to the first build and subsequent loads.
## PyTorch Extensions in Colossal-AI
The project [DeepSpeed](https://github.com/microsoft/DeepSpeed) has proposed a [solution](https://github.com/microsoft/DeepSpeed/tree/master/op_builder) to support kernel-build during either installation or runtime.
We have adapted from DeepSpeed's solution to build extensions. The extension build requires two main functions from PyTorch:
1. `torch.utils.cpp_extension.CUDAExtension`: used to build extensions in `setup.py` during `pip install`.
2. `torch.utils.cpp_extension.load`: used to build and load extension during runtime
Please note that the extension built by `CUDAExtension` cannot be loaded by the `load` function, and `load` will run its own build again (correct me if I am wrong).
Based on DeepSpeed's work, we have made several modifications and improvements:
1. All pre-built kernels (those installed with `setup.py`) will be found in `colossalai._C`
2. All runtime-built kernels will be found in the default torch extension path, i.e. ~/.cache/colossalai/torch_extensions. (If we put the built kernels in the installed site-package directory, this will make pip uninstall incomplete)
3. Once a kernel is loaded, we will cache it in the builder to avoid repeated kernel loading.
When loading the built kernel, we will first check if the pre-built one exists. If not, the runtime build will be triggered.
from .arm_cpu_adam import ArmCPUAdamBuilder
from .cpu_adam import CPUAdamBuilder
from .fused_optim import FusedOptimBuilder
from .layernorm import LayerNormBuilder
from .moe import MOEBuilder
from .multi_head_attn import MultiHeadAttnBuilder
from .scaled_masked_softmax import ScaledMaskedSoftmaxBuilder
from .scaled_upper_triangle_masked_softmax import ScaledUpperTrainglemaskedSoftmaxBuilder
# Registry mapping op name -> builder class; consumed by setup.py to build
# every kernel when CUDA_EXT=1, and usable for runtime lookup by name.
ALL_OPS = {
    "cpu_adam": CPUAdamBuilder,
    "fused_optim": FusedOptimBuilder,
    "moe": MOEBuilder,
    "multi_head_attn": MultiHeadAttnBuilder,
    "scaled_masked_softmax": ScaledMaskedSoftmaxBuilder,
    "scaled_upper_triangle_masked_softmax": ScaledUpperTrainglemaskedSoftmaxBuilder,
    "layernorm": LayerNormBuilder,
}

# NOTE: __all__ previously listed MultiTensorSGDBuilder / MultiTensorAdamBuilder /
# MultiTensorLambBuilder / MultiTensorScaleBuilder / MultiTensorL2NormBuilder,
# which are never imported in this module, so `from op_builder import *`
# raised AttributeError; it also omitted LayerNormBuilder. Keep this list in
# sync with the imports above.
__all__ = [
    "ALL_OPS",
    "ArmCPUAdamBuilder",
    "CPUAdamBuilder",
    "FusedOptimBuilder",
    "LayerNormBuilder",
    "MOEBuilder",
    "MultiHeadAttnBuilder",
    "ScaledMaskedSoftmaxBuilder",
    "ScaledUpperTrainglemaskedSoftmaxBuilder",
]
# This code has been adapted from the DeepSpeed library.
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import importlib
import os
import time
from abc import ABC, abstractmethod
from pathlib import Path
from typing import List, Optional, Union
from .utils import check_cuda_availability, check_system_pytorch_cuda_match, print_rank_0
class Builder(ABC):
    """
    Builder is the base class to build extensions for PyTorch.

    A concrete builder describes one kernel: where its C++/CUDA sources live,
    which include directories and compiler flags it needs, and where the
    pre-built module lands when installed via ``pip install`` (``colossalai._C.<name>``).
    It supports two build paths: ahead-of-time via :meth:`builder` (used by
    setup.py) and just-in-time via :meth:`load` (used at runtime).

    Args:
        name (str): the name of the kernel to be built
        prebuilt_import_path (str): the path where the extension is installed during pip install
    """

    # "cuda" or "cpp"; selects CUDAExtension vs CppExtension in builder()
    # and whether load() checks the CUDA toolchain first.
    ext_type: str = "cuda"

    def __init__(self, name: str, prebuilt_import_path: str):
        self.name = name
        self.prebuilt_import_path = prebuilt_import_path
        # macros passed to both cxx and nvcc; kept for compatibility with
        # kernels that gate code on these PyTorch version defines
        self.version_dependent_macros = ["-DVERSION_GE_1_1", "-DVERSION_GE_1_3", "-DVERSION_GE_1_5"]

        # we store the op as an attribute to avoid repeated building and loading
        self.cached_op_module = None

        assert prebuilt_import_path.startswith(
            "colossalai._C"
        ), f"The prebuilt_import_path should start with colossalai._C, but got {self.prebuilt_import_path}"

    def relative_to_abs_path(self, code_path: str) -> str:
        """
        This function takes in a path relative to the colossalai root directory and returns the absolute path.
        """
        op_builder_module_path = Path(__file__).parent

        # if we install from source
        # the current file path will be op_builder/builder.py
        # if we install via pip install colossalai
        # the current file path will be colossalai/kernel/op_builder/builder.py
        # this is because the op_builder inside colossalai is a symlink
        # this symlink will be replaced with actual files if we install via pypi
        # thus we cannot tell the colossalai root directory by checking whether the op_builder
        # is a symlink, we can only tell whether it is inside or outside colossalai
        if str(op_builder_module_path).endswith("colossalai/kernel/op_builder"):
            root_path = op_builder_module_path.parent.parent
        else:
            root_path = op_builder_module_path.parent.joinpath("colossalai")

        code_abs_path = root_path.joinpath(code_path)
        return str(code_abs_path)

    def get_cuda_home_include(self) -> str:
        """
        Return the include path inside the CUDA home.

        Raises:
            RuntimeError: if CUDA_HOME cannot be located by PyTorch.
        """
        from torch.utils.cpp_extension import CUDA_HOME

        if CUDA_HOME is None:
            raise RuntimeError("CUDA_HOME is None, please set CUDA_HOME to compile C++/CUDA kernels in ColossalAI.")
        cuda_include = os.path.join(CUDA_HOME, "include")
        return cuda_include

    def csrc_abs_path(self, path: str) -> str:
        # all kernel sources live under kernel/cuda_native/csrc in the package
        return os.path.join(self.relative_to_abs_path("kernel/cuda_native/csrc"), path)

    # functions that must be overridden -- begin
    @abstractmethod
    def sources_files(self) -> List[str]:
        """
        This function should return a list of source files for extensions.
        """
        raise NotImplementedError

    @abstractmethod
    def include_dirs(self) -> List[str]:
        """
        This function should return a list of include files for extensions.
        """

    @abstractmethod
    def cxx_flags(self) -> List[str]:
        """
        This function should return a list of cxx compilation flags for extensions.
        """

    @abstractmethod
    def nvcc_flags(self) -> List[str]:
        """
        This function should return a list of nvcc compilation flags for extensions.
        """

    # functions that must be overridden -- end

    def strip_empty_entries(self, args: List[str]) -> List[str]:
        """
        Drop any empty strings from the list of compile and link flags.
        """
        return [x for x in args if len(x) > 0]

    def import_op(self):
        """
        This function will import the op module by its string name.

        Raises ImportError when the pre-built module is absent, which is how
        load() detects that a JIT build is required.
        """
        return importlib.import_module(self.prebuilt_import_path)

    def check_runtime_build_environment(self) -> None:
        """
        Check whether the system environment is ready for extension compilation.

        Raises:
            ModuleNotFoundError: if PyTorch is not installed.
            RuntimeError: if CUDA_HOME is unset or CUDA is unavailable.
        """
        try:
            from torch.utils.cpp_extension import CUDA_HOME

            TORCH_AVAILABLE = True
        except ImportError:
            TORCH_AVAILABLE = False
            CUDA_HOME = None

        if not TORCH_AVAILABLE:
            raise ModuleNotFoundError(
                "PyTorch is not found. You need to install PyTorch first in order to build CUDA extensions"
            )

        if CUDA_HOME is None:
            raise RuntimeError(
                "CUDA_HOME is not found. You need to export CUDA_HOME environment variable or install CUDA Toolkit first in order to build CUDA extensions"
            )

        # make sure CUDA is available for compilation during runtime build
        cuda_available = check_cuda_availability()
        if not cuda_available:
            raise RuntimeError("CUDA is not available on your system as torch.cuda.is_available() returns False.")

        # make sure system CUDA and pytorch CUDA match, an error will be raised inside the function if not
        check_system_pytorch_cuda_match(CUDA_HOME)

    def load(self, verbose: Optional[bool] = None):
        """
        load the kernel during runtime. If the kernel is not built during pip install, it will build the kernel.
        If the kernel is built during runtime, it will be stored in `~/.cache/colossalai/torch_extensions/`. If the
        kernel is built during pip install, it can be accessed through `colossalai._C`.

        Warning: do not load this kernel repeatedly during model execution as it could slow down the training process.

        Args:
            verbose (bool, optional): show detailed info. Defaults to the CAI_KERNEL_VERBOSE environment variable.
        """
        if verbose is None:
            verbose = os.environ.get("CAI_KERNEL_VERBOSE", "0") == "1"
        # if the kernel has been compiled and cached, we directly use it
        if self.cached_op_module is not None:
            return self.cached_op_module

        try:
            # if the kernel has been pre-built during installation
            # we just directly import it
            op_module = self.import_op()
            if verbose:
                print_rank_0(
                    f"[extension] OP {self.prebuilt_import_path} has been compiled ahead of time, skip building."
                )

        except ImportError:
            # check environment
            if self.ext_type == "cuda":
                self.check_runtime_build_environment()

            # time the kernel compilation
            start_build = time.time()

            # construct the build directory; one directory per
            # torch-version/cuda-version pair so ABI-incompatible builds
            # never collide
            import torch
            from torch.utils.cpp_extension import load

            torch_version_major = torch.__version__.split(".")[0]
            torch_version_minor = torch.__version__.split(".")[1]
            torch_cuda_version = torch.version.cuda
            home_directory = os.path.expanduser("~")
            extension_directory = f".cache/colossalai/torch_extensions/torch{torch_version_major}.{torch_version_minor}_cu{torch_cuda_version}"
            build_directory = os.path.join(home_directory, extension_directory)
            Path(build_directory).mkdir(parents=True, exist_ok=True)

            if verbose:
                print_rank_0(f"[extension] Compiling or loading the JIT-built {self.name} kernel during runtime now")

            # load the kernel
            op_module = load(
                name=self.name,
                sources=self.strip_empty_entries(self.sources_files()),
                extra_include_paths=self.strip_empty_entries(self.include_dirs()),
                extra_cflags=self.cxx_flags(),
                extra_cuda_cflags=self.nvcc_flags(),
                extra_ldflags=[],
                build_directory=build_directory,
                verbose=verbose,
            )

            build_duration = time.time() - start_build

            # log jit compilation time
            if verbose:
                print_rank_0(f"[extension] Time to compile or load {self.name} op: {build_duration} seconds")

        # cache the built/loaded kernel
        self.cached_op_module = op_module

        return op_module

    def builder(self) -> Union["CUDAExtension", "CppExtension"]:
        """
        get a CUDAExtension instance used for setup.py
        """
        from torch.utils.cpp_extension import CppExtension, CUDAExtension

        if self.ext_type == "cpp":
            return CppExtension(
                name=self.prebuilt_import_path,
                sources=self.strip_empty_entries(self.sources_files()),
                include_dirs=self.strip_empty_entries(self.include_dirs()),
                extra_compile_args=self.strip_empty_entries(self.cxx_flags()),
            )

        return CUDAExtension(
            name=self.prebuilt_import_path,
            sources=self.strip_empty_entries(self.sources_files()),
            include_dirs=self.strip_empty_entries(self.include_dirs()),
            extra_compile_args={
                "cxx": self.strip_empty_entries(self.cxx_flags()),
                "nvcc": self.strip_empty_entries(self.nvcc_flags()),
            },
        )
import re
import torch
from .builder import Builder
from .utils import append_nvcc_threads
class GPTQBuilder(Builder):
    """Builder for the GPTQ quantized-linear CUDA kernels."""

    NAME = "cu_gptq"
    PREBUILT_IMPORT_PATH = "colossalai._C.cu_gptq"

    def __init__(self):
        super().__init__(name=GPTQBuilder.NAME, prebuilt_import_path=GPTQBuilder.PREBUILT_IMPORT_PATH)

    def include_dirs(self):
        """Include the gptq csrc folder and the CUDA home include directory."""
        ret = [self.csrc_abs_path("gptq"), self.get_cuda_home_include()]
        return ret

    def sources_files(self):
        """Absolute paths of the C++ binding and CUDA kernel sources."""
        ret = [
            self.csrc_abs_path(fname)
            for fname in [
                "gptq/linear_gptq.cpp",
                "gptq/column_remap.cu",
                "gptq/cuda_buffers.cu",
                "gptq/q4_matmul.cu",
                "gptq/q4_matrix.cu",
            ]
        ]
        return ret

    def cxx_flags(self):
        return ["-O3"] + self.version_dependent_macros

    def nvcc_flags(self):
        """nvcc flags, including -gencode entries for all sm_80+ arches PyTorch was built for."""
        extra_cuda_flags = [
            "-v",
            # fix: the flag list contained both -std=c++14 and -std=c++17,
            # passing two conflicting language standards to nvcc; keep c++17 only
            "-std=c++17",
            "-U__CUDA_NO_HALF_OPERATORS__",
            "-U__CUDA_NO_HALF_CONVERSIONS__",
            "-U__CUDA_NO_HALF2_OPERATORS__",
            "-DTHRUST_IGNORE_CUB_VERSION_CHECK",
            "-lcublas",
        ]

        # emit code only for Ampere (sm_80) and newer architectures that the
        # current PyTorch build supports
        for arch in torch.cuda.get_arch_list():
            res = re.search(r"sm_(\d+)", arch)
            if res:
                arch_cap = res[1]
                if int(arch_cap) >= 80:
                    extra_cuda_flags.extend(["-gencode", f"arch=compute_{arch_cap},code={arch}"])

        ret = ["-O3", "--use_fast_math"] + self.version_dependent_macros + extra_cuda_flags
        return append_nvcc_threads(ret)
from .builder import Builder
from .utils import append_nvcc_threads, get_cuda_cc_flag
class MultiHeadAttnBuilder(Builder):
    """Builder for the fused multi-head attention CUDA kernels."""

    NAME = "multihead_attention"
    PREBUILT_IMPORT_PATH = "colossalai._C.multihead_attention"

    def __init__(self):
        super().__init__(name=MultiHeadAttnBuilder.NAME, prebuilt_import_path=MultiHeadAttnBuilder.PREBUILT_IMPORT_PATH)

    def include_dirs(self):
        """Include the shared kernels/include headers and the CUDA home include directory."""
        ret = [self.csrc_abs_path("kernels/include"), self.get_cuda_home_include()]
        return ret

    def sources_files(self):
        """Absolute paths of the C++ binding and the per-op CUDA kernel sources."""
        ret = [
            self.csrc_abs_path(fname)
            for fname in [
                "multihead_attention_1d.cpp",
                "kernels/cublas_wrappers.cu",
                "kernels/transform_kernels.cu",
                "kernels/dropout_kernels.cu",
                "kernels/normalize_kernels.cu",
                "kernels/softmax_kernels.cu",
                "kernels/general_kernels.cu",
                "kernels/cuda_util.cu",
            ]
        ]
        return ret

    def cxx_flags(self):
        return ["-O3"] + self.version_dependent_macros

    def nvcc_flags(self):
        """nvcc flags plus the compute-capability gencode flags of the current device."""
        extra_cuda_flags = [
            # fix: the flag list contained both -std=c++14 and -std=c++17,
            # passing two conflicting language standards to nvcc; keep c++17 only
            "-std=c++17",
            "-U__CUDA_NO_HALF_OPERATORS__",
            "-U__CUDA_NO_HALF_CONVERSIONS__",
            "-U__CUDA_NO_HALF2_OPERATORS__",
            "-DTHRUST_IGNORE_CUB_VERSION_CHECK",
        ]
        extra_cuda_flags.extend(get_cuda_cc_flag())
        ret = ["-O3", "--use_fast_math"] + self.version_dependent_macros + extra_cuda_flags
        return append_nvcc_threads(ret)
import torch
from .builder import Builder
from .utils import append_nvcc_threads
class SmoothquantBuilder(Builder):
    """Builder for the SmoothQuant int8 linear CUDA kernels."""

    NAME = "cu_smoothquant"
    PREBUILT_IMPORT_PATH = "colossalai._C.cu_smoothquant"

    def __init__(self):
        super().__init__(name=SmoothquantBuilder.NAME, prebuilt_import_path=SmoothquantBuilder.PREBUILT_IMPORT_PATH)

    def include_dirs(self):
        """Include the smoothquant csrc folder and the CUDA home include directory."""
        ret = [self.csrc_abs_path("smoothquant"), self.get_cuda_home_include()]
        return ret

    def sources_files(self):
        """Absolute paths of the C++ binding and CUDA kernel sources."""
        ret = [
            self.csrc_abs_path(fname)
            for fname in [
                "smoothquant/binding.cpp",
                "smoothquant/linear.cu",
            ]
        ]
        return ret

    def cxx_flags(self):
        return ["-O3"] + self.version_dependent_macros

    def nvcc_flags(self):
        """nvcc flags with CUDA_ARCH derived from the current device's compute capability."""
        compute_capability = torch.cuda.get_device_capability()
        # e.g. capability (8, 0) -> 800, matching the kernels' CUDA_ARCH macro convention
        cuda_arch = compute_capability[0] * 100 + compute_capability[1] * 10

        extra_cuda_flags = [
            "-v",
            f"-DCUDA_ARCH={cuda_arch}",
            "-std=c++17",
            "-U__CUDA_NO_HALF_OPERATORS__",
            "-U__CUDA_NO_HALF_CONVERSIONS__",
            "-U__CUDA_NO_HALF2_OPERATORS__",
            "-DTHRUST_IGNORE_CUB_VERSION_CHECK",
        ]

        ret = ["-O3", "--use_fast_math"] + self.version_dependent_macros + extra_cuda_flags
        return append_nvcc_threads(ret)

    def builder(self):
        """Best-effort wrapper around Builder.builder().

        Returns the extension object on success, or None with a warning if the
        build fails (smoothquant is optional and must not break installation).

        Fixes:
        - `warnings` was used without being imported, so the except path
          itself raised NameError.
        - the result of super().builder() was discarded, so this method always
          returned None even on success.
        - bare `except:` narrowed to `except Exception:` so SystemExit and
          KeyboardInterrupt are no longer swallowed.
        """
        import warnings

        try:
            return super().builder()
        except Exception:
            warnings.warn("build smoothquant lib not successful")
            return None
......@@ -5,55 +5,23 @@ from typing import List
from setuptools import find_packages, setup
from op_builder.utils import (
check_cuda_availability,
check_pytorch_version,
check_system_pytorch_cuda_match,
get_cuda_bare_metal_version,
get_pytorch_version,
set_cuda_arch_list,
)
try:
from torch.utils.cpp_extension import CUDA_HOME, BuildExtension
import torch # noqa
from torch.utils.cpp_extension import BuildExtension
TORCH_AVAILABLE = True
except ImportError:
TORCH_AVAILABLE = False
CUDA_HOME = None
# Some constants for installation checks
MIN_PYTORCH_VERSION_MAJOR = 1
MIN_PYTORCH_VERSION_MINOR = 10
THIS_DIR = os.path.dirname(os.path.abspath(__file__))
BUILD_CUDA_EXT = int(os.environ.get("CUDA_EXT", "0")) == 1
BUILD_EXT = int(os.environ.get("BUILD_EXT", "0")) == 1
IS_NIGHTLY = int(os.environ.get("NIGHTLY", "0")) == 1
# a variable to store the op builder
ext_modules = []
# we do not support windows currently
if sys.platform == "win32":
raise RuntimeError("Windows is not supported yet. Please try again within the Windows Subsystem for Linux (WSL).")
# check for CUDA extension dependencies
def environment_check_for_cuda_extension_build():
if not TORCH_AVAILABLE:
raise ModuleNotFoundError(
"[extension] PyTorch is not found while CUDA_EXT=1. You need to install PyTorch first in order to build CUDA extensions"
)
if not CUDA_HOME:
raise RuntimeError(
"[extension] CUDA_HOME is not found while CUDA_EXT=1. You need to export CUDA_HOME environment variable or install CUDA Toolkit first in order to build CUDA extensions"
)
check_system_pytorch_cuda_match(CUDA_HOME)
check_pytorch_version(MIN_PYTORCH_VERSION_MAJOR, MIN_PYTORCH_VERSION_MINOR)
check_cuda_availability()
def fetch_requirements(path) -> List[str]:
"""
This function reads the requirements file.
......@@ -98,46 +66,35 @@ def get_version() -> str:
# write version into version.py
with open(version_py_path, "w") as f:
f.write(f"__version__ = '{version}'\n")
# look for pytorch and cuda version
if BUILD_CUDA_EXT:
torch_major, torch_minor, _ = get_pytorch_version()
torch_version = f"{torch_major}.{torch_minor}"
cuda_version = ".".join(get_cuda_bare_metal_version(CUDA_HOME))
else:
torch_version = None
cuda_version = None
# write the version into the python file
if torch_version:
f.write(f'torch = "{torch_version}"\n')
else:
f.write("torch = None\n")
if cuda_version:
f.write(f'cuda = "{cuda_version}"\n')
else:
f.write("cuda = None\n")
return version
if BUILD_CUDA_EXT:
environment_check_for_cuda_extension_build()
set_cuda_arch_list(CUDA_HOME)
if BUILD_EXT:
if not TORCH_AVAILABLE:
raise ModuleNotFoundError(
"[extension] PyTorch is not found while CUDA_EXT=1. You need to install PyTorch first in order to build CUDA extensions"
)
from op_builder import ALL_OPS
from extensions import ALL_EXTENSIONS
op_names = []
ext_modules = []
# load all builders
for name, builder_cls in ALL_OPS.items():
op_names.append(name)
ext_modules.append(builder_cls().builder())
for ext_cls in ALL_EXTENSIONS:
ext = ext_cls()
if ext.support_aot and ext.is_hardware_available():
ext.assert_hardware_compatible()
op_names.append(ext.name)
ext_modules.append(ext.build_aot())
# show log
op_name_list = ", ".join(op_names)
print(f"[extension] loaded builders for {op_name_list}")
if len(ext_modules) == 0:
raise RuntimeError("[extension] Could not find any kernel compatible with the current environment.")
else:
op_name_list = ", ".join(op_names)
print(f"[extension] Building extensions{op_name_list}")
else:
ext_modules = []
# always put not nightly branch as the if branch
# otherwise github will treat colossalai-nightly as the project name
......
......@@ -5,13 +5,13 @@ import torch
from torch.utils._pytree import tree_map
import colossalai
from colossalai.accelerator import get_accelerator
from colossalai.auto_parallel.offload.amp_optimizer import AMPOptimizer
from colossalai.auto_parallel.offload.mem_optimize import memory_optimize
from colossalai.auto_parallel.offload.solver import NOT_NVML
from colossalai.fx.profiler import parameter_size
from colossalai.nn.optimizer import HybridAdam
from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
from colossalai.utils import get_current_device
from colossalai.zero import ColoInitContext, zero_model_wrapper, zero_optim_wrapper
from tests.test_auto_parallel.test_offload.model_utils import *
from tests.test_tensor.common_utils import set_seed
......@@ -31,7 +31,7 @@ def exam_fwd_bwd(model_name: str, memory_budget: float, solver_name: str):
64,
8,
),
device=get_current_device(),
device=get_accelerator().get_current_device(),
)
criterion = LMLoss()
......
......@@ -10,12 +10,12 @@ try:
except:
NO_CODEGEN = True
from colossalai.accelerator import get_accelerator
from colossalai.device.device_mesh import DeviceMesh
from colossalai.initialize import launch
from colossalai.logging import disable_existing_loggers
from colossalai.nn.optimizer import HybridAdam
from colossalai.testing import assert_close, rerun_if_address_is_in_use, run_on_environment_flag, spawn
from colossalai.utils import get_current_device
from colossalai.zero import zero_model_wrapper, zero_optim_wrapper
......@@ -72,7 +72,11 @@ def check_auto_parallel_with_gemini(rank, world_size, port):
print("=" * msg_length)
gemini_config = dict(
strict_ddp_mode=False, device=get_current_device(), placement_policy="cpu", pin_memory=True, search_range_m=128
strict_ddp_mode=False,
device=get_accelerator().get_current_device(),
placement_policy="cpu",
pin_memory=True,
search_range_m=128,
)
gm = zero_model_wrapper(gm, zero_stage=3, gemini_config=gemini_config)
......
......@@ -8,13 +8,14 @@ from torch.testing import assert_close
from torch.utils.data import Dataset
import colossalai
from colossalai.accelerator import get_accelerator
from colossalai.booster import Booster
from colossalai.booster.plugin import HybridParallelPlugin
from colossalai.fx import is_compatible_with_meta
from colossalai.lazy.lazy_init import LazyInitContext
from colossalai.nn.optimizer import HybridAdam
from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
from colossalai.utils import get_current_device, set_seed
from colossalai.utils import set_seed
from tests.kit.model_zoo import model_zoo
......@@ -23,7 +24,9 @@ class RandomDataset(Dataset):
self.num_samples = num_samples
self.max_length = max_length
set_seed(42)
self.input_ids = torch.randint(0, vocab_size, (num_samples, max_length), device=get_current_device())
self.input_ids = torch.randint(
0, vocab_size, (num_samples, max_length), device=get_accelerator().get_current_device()
)
self.attention_mask = torch.ones_like(self.input_ids)
def __len__(self):
......
......@@ -5,7 +5,7 @@ import torch.distributed as dist
from torch.optim import Adam
import colossalai
import colossalai.utils.device as device_utils
from colossalai.accelerator import get_accelerator
from colossalai.booster import Booster
from colossalai.booster.plugin import LowLevelZeroPlugin
......@@ -23,7 +23,7 @@ _STUCK_MODELS = ["transformers_albert_for_multiple_choice"]
@clear_cache_before_run()
def run_fn(stage, model_fn, data_gen_fn, output_transform_fn) -> Optional[str]:
device = device_utils.get_current_device()
device = get_accelerator().get_current_device()
try:
plugin = LowLevelZeroPlugin(stage=stage, max_norm=1.0, initial_scale=2**5)
booster = Booster(plugin=plugin)
......@@ -75,7 +75,7 @@ def check_low_level_zero_plugin(stage: int, early_stop: bool = True):
continue
err = run_fn(stage, model_fn, data_gen_fn, output_transform_fn)
device_utils.empty_cache()
get_accelerator().empty_cache()
if err is None:
passed_models.append(name)
......
......@@ -7,7 +7,6 @@ from transformers import LlamaForCausalLM
from utils import shared_tempdir
import colossalai
from colossalai.testing import skip_if_not_enough_gpus
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin
from colossalai.lazy import LazyInitContext
......@@ -17,6 +16,7 @@ from colossalai.testing import (
clear_cache_before_run,
parameterize,
rerun_if_address_is_in_use,
skip_if_not_enough_gpus,
spawn,
)
from tests.kit.model_zoo import model_zoo
......@@ -52,7 +52,12 @@ def exam_state_dict_with_origin(placement_config, model_name, use_safetensors: b
bert_model.config.save_pretrained(save_directory=pretrained_path)
extra_dp_size = dist.get_world_size() // (zero_size * tp_size)
plugin = GeminiPlugin(**placement_config, tp_size=tp_size, enable_all_optimization=enable_all_optimization, extra_dp_size=extra_dp_size)
plugin = GeminiPlugin(
**placement_config,
tp_size=tp_size,
enable_all_optimization=enable_all_optimization,
extra_dp_size=extra_dp_size,
)
booster = Booster(plugin=plugin)
bert_model, _, _, _, _ = booster.boost(bert_model)
model_size = sum(p.numel() * p.element_size() for p in bert_model.parameters()) / 1024**2
......@@ -78,7 +83,14 @@ def exam_state_dict(placement_config, shard: bool, model_name: str, size_per_sha
criterion = lambda x: x.mean()
enable_all_optimization = True if tp_size > 1 else False
extra_dp_size = dist.get_world_size() // (zero_size * tp_size)
plugin = GeminiPlugin(**placement_config, precision="fp16", initial_scale=(2**14), tp_size=tp_size, extra_dp_size=extra_dp_size, enable_all_optimization=enable_all_optimization)
plugin = GeminiPlugin(
**placement_config,
precision="fp16",
initial_scale=(2**14),
tp_size=tp_size,
extra_dp_size=extra_dp_size,
enable_all_optimization=enable_all_optimization,
)
booster = Booster(plugin=plugin)
model = model_fn()
......@@ -161,8 +173,13 @@ def run_dist(rank, world_size, port):
def test_gemini_ckpIO():
spawn(run_dist, 4)
@pytest.mark.largedist
@skip_if_not_enough_gpus(min_gpus=8)
@rerun_if_address_is_in_use()
def test_gemini_ckpIO_3d():
spawn(run_dist, 8)
\ No newline at end of file
spawn(run_dist, 8)
if __name__ == "__main__":
test_gemini_ckpIO()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment