Unverified Commit 8823cc48 authored by Frank Lee's avatar Frank Lee Committed by GitHub
Browse files

Merge pull request #5310 from hpcaitech/feature/npu

Feature/npu
parents bce9499e 73f4dc57
from .builder import Builder
from .utils import append_nvcc_threads, get_cuda_cc_flag
from ..cuda_extension import _CudaExtension
from ..utils import append_nvcc_threads, get_cuda_cc_flag
class MOEBuilder(Builder):
NAME = "moe"
PREBUILT_IMPORT_PATH = "colossalai._C.moe"
class MoeCudaExtension(_CudaExtension):
def __init__(self):
super().__init__(name=MOEBuilder.NAME, prebuilt_import_path=MOEBuilder.PREBUILT_IMPORT_PATH)
super().__init__(name="moe_cuda")
def include_dirs(self):
ret = [self.csrc_abs_path("kernels/include"), self.get_cuda_home_include()]
ret = [self.csrc_abs_path("cuda/include"), self.get_cuda_home_include()]
return ret
def sources_files(self):
ret = [self.csrc_abs_path(fname) for fname in ["moe_cuda.cpp", "moe_cuda_kernel.cu"]]
ret = [self.csrc_abs_path(fname) for fname in ["cuda/moe_cuda.cpp", "cuda/moe_cuda_kernel.cu"]]
return ret
def cxx_flags(self):
......
from .fused_optimizer_cuda import FusedOptimizerCudaExtension
__all__ = ['FusedOptimizerCudaExtension']
\ No newline at end of file
from .builder import Builder
from .utils import get_cuda_cc_flag
from ..cuda_extension import _CudaExtension
from ..utils import get_cuda_cc_flag
class FusedOptimBuilder(Builder):
NAME = "fused_optim"
PREBUILT_IMPORT_PATH = "colossalai._C.fused_optim"
class FusedOptimizerCudaExtension(_CudaExtension):
def __init__(self):
super().__init__(name=FusedOptimBuilder.NAME, prebuilt_import_path=FusedOptimBuilder.PREBUILT_IMPORT_PATH)
super().__init__(name="fused_optim_cuda")
def sources_files(self):
ret = [
self.csrc_abs_path(fname)
for fname in [
"colossal_C_frontend.cpp",
"multi_tensor_sgd_kernel.cu",
"multi_tensor_scale_kernel.cu",
"multi_tensor_adam.cu",
"multi_tensor_l2norm_kernel.cu",
"multi_tensor_lamb.cu",
"cuda/colossal_C_frontend.cpp",
"cuda/multi_tensor_sgd_kernel.cu",
"cuda/multi_tensor_scale_kernel.cu",
"cuda/multi_tensor_adam.cu",
"cuda/multi_tensor_l2norm_kernel.cu",
"cuda/multi_tensor_lamb.cu",
]
]
return ret
def include_dirs(self):
ret = [self.csrc_abs_path("kernels/include"), self.get_cuda_home_include()]
ret = [self.get_cuda_home_include()]
return ret
def cxx_flags(self):
......
from .scaled_masked_softmax_cuda import ScaledMaskedSoftmaxCudaExtension
from .scaled_upper_triangle_masked_softmax_cuda import ScaledUpperTriangleMaskedSoftmaxCudaExtension
__all__ = ['ScaledMaskedSoftmaxCudaExtension', 'ScaledUpperTriangleMaskedSoftmaxCudaExtension']
\ No newline at end of file
from .builder import Builder
from .utils import append_nvcc_threads
from ..cuda_extension import _CudaExtension
from ..utils import append_nvcc_threads
class ScaledMaskedSoftmaxBuilder(Builder):
NAME = "scaled_masked_softmax"
PREBUILT_IMPORT_PATH = "colossalai._C.scaled_masked_softmax"
class ScaledMaskedSoftmaxCudaExtension(_CudaExtension):
def __init__(self):
super().__init__(
name=ScaledMaskedSoftmaxBuilder.NAME, prebuilt_import_path=ScaledMaskedSoftmaxBuilder.PREBUILT_IMPORT_PATH
)
super().__init__(name="scaled_masked_softmax_cuda")
# necessary 4 functions
def sources_files(self):
ret = [self.csrc_abs_path(fname) for fname in ["scaled_masked_softmax.cpp", "scaled_masked_softmax_cuda.cu"]]
ret = [
self.csrc_abs_path(fname)
for fname in ["cuda/scaled_masked_softmax.cpp", "cuda/scaled_masked_softmax_cuda.cu"]
]
return ret
def include_dirs(self):
return [self.csrc_abs_path("kernels/include"), self.get_cuda_home_include()]
return [self.get_cuda_home_include()]
def cxx_flags(self):
return ["-O3"] + self.version_dependent_macros
......
from .builder import Builder
from .utils import append_nvcc_threads, get_cuda_cc_flag
from ..cuda_extension import _CudaExtension
from ..utils import append_nvcc_threads, get_cuda_cc_flag
class ScaledUpperTrainglemaskedSoftmaxBuilder(Builder):
NAME = "scaled_upper_triangle_masked_softmax"
PREBUILT_IMPORT_PATH = "colossalai._C.scaled_upper_triangle_masked_softmax"
class ScaledUpperTriangleMaskedSoftmaxCudaExtension(_CudaExtension):
def __init__(self):
super().__init__(
name=ScaledUpperTrainglemaskedSoftmaxBuilder.NAME,
prebuilt_import_path=ScaledUpperTrainglemaskedSoftmaxBuilder.PREBUILT_IMPORT_PATH,
)
super().__init__(name="scaled_upper_triangle_masked_softmax_cuda")
def include_dirs(self):
return [self.csrc_abs_path("kernels/include"), self.get_cuda_home_include()]
return [self.get_cuda_home_include()]
def sources_files(self):
ret = [
self.csrc_abs_path(fname)
for fname in ["scaled_upper_triang_masked_softmax.cpp", "scaled_upper_triang_masked_softmax_cuda.cu"]
for fname in [
"cuda/scaled_upper_triang_masked_softmax.cpp",
"cuda/scaled_upper_triang_masked_softmax_cuda.cu",
]
]
return ret
......
from .base_extension import _Extension
__all__ = ["_TritonExtension"]
class _TritonExtension(_Extension):
    """Base class for JIT-only Triton extensions.

    Triton kernels are compiled at runtime (``support_jit=True``) and are
    never built ahead of time (``support_aot=False``).
    """

    def __init__(self, name: str, priority: int = 1):
        super().__init__(name, support_aot=False, support_jit=True, priority=priority)

    def is_hardware_compatible(self) -> bool:
        """Return True if a CUDA device is usable (Triton targets CUDA GPUs)."""
        # the triton extension can only be built if cuda is available
        try:
            import torch

            cuda_available = torch.cuda.is_available()
        except Exception:
            # Previously a bare `except:` — narrowed so that e.g.
            # KeyboardInterrupt/SystemExit are no longer swallowed, while the
            # best-effort "treat any probe failure as unavailable" behavior
            # is preserved.
            cuda_available = False
        return cuda_available

    def load(self):
        """Build (if needed) and return the JIT-compiled kernel module."""
        return self.build_jit()
# Build PyTorch Extensions
## Overview
Building PyTorch extensions can be a difficult task for users without a systems background. It is definitely frustrating if users encounter a lot of unfamiliar technical jargon when installing Colossal-AI. Therefore, we provide two methods of building the PyTorch extensions for the users.
1. Build CUDA extensions when running `pip install` if `CUDA_EXT=1`
2. Build the extension during runtime
The first method is more suitable for users who are familiar with CUDA environment configurations. The second method is for those who are not as they only need to build the kernel which is required by their program.
These two methods have different advantages and disadvantages.
Method 1 is good because it allows the user to build all kernels during installation and directly import the kernel. They don't need to care about kernel building when running their program. However, installation may fail if they don't know how to configure their environments and this leads to much frustration.
Method 2 is good because it allows the user to build only the kernels they actually need, so there is a lower probability that they encounter environment issues. However, it may slow down their program due to the first build and subsequent loads.
## PyTorch Extensions in Colossal-AI
The project [DeepSpeed](https://github.com/microsoft/DeepSpeed) has proposed a [solution](https://github.com/microsoft/DeepSpeed/tree/master/op_builder) to support kernel-build during either installation or runtime.
We have adapted from DeepSpeed's solution to build extensions. The extension build requires two main functions from PyTorch:
1. `torch.utils.cpp_extension.CUDAExtension`: used to build extensions in `setup.py` during `pip install`.
2. `torch.utils.cpp_extension.load`: used to build and load extension during runtime
Please note that the extension built by `CUDAExtension` cannot be loaded by the `load` function, and `load` will run its own build again (correct me if I am wrong).
Based on DeepSpeed's work, we have made several modifications and improvements:
1. All pre-built kernels (those installed with `setup.py`) will be found in `colossalai._C`
2. All runtime-built kernels will be found in the default torch extension path, i.e. ~/.cache/colossalai/torch_extensions. (If we put the built kernels in the installed site-package directory, this will make pip uninstall incomplete)
3. Once a kernel is loaded, we will cache it in the builder to avoid repeated kernel loading.
When loading the built kernel, we will first check if the pre-built one exists. If not, the runtime build will be triggered.
from .arm_cpu_adam import ArmCPUAdamBuilder
from .cpu_adam import CPUAdamBuilder
from .fused_optim import FusedOptimBuilder
from .layernorm import LayerNormBuilder
from .moe import MOEBuilder
from .multi_head_attn import MultiHeadAttnBuilder
from .scaled_masked_softmax import ScaledMaskedSoftmaxBuilder
from .scaled_upper_triangle_masked_softmax import ScaledUpperTrainglemaskedSoftmaxBuilder
# Registry mapping op name -> builder class; consumed by setup.py to build
# every kernel when CUDA_EXT=1, and usable for runtime lookup by name.
ALL_OPS = {
    "cpu_adam": CPUAdamBuilder,
    "fused_optim": FusedOptimBuilder,
    "moe": MOEBuilder,
    "multi_head_attn": MultiHeadAttnBuilder,
    "scaled_masked_softmax": ScaledMaskedSoftmaxBuilder,
    "scaled_upper_triangle_masked_softmax": ScaledUpperTrainglemaskedSoftmaxBuilder,
    "layernorm": LayerNormBuilder,
}

# NOTE: __all__ previously listed MultiTensorSGDBuilder / MultiTensorAdamBuilder /
# MultiTensorLambBuilder / MultiTensorScaleBuilder / MultiTensorL2NormBuilder,
# which are never imported in this module, so `from op_builder import *`
# raised AttributeError; it also omitted LayerNormBuilder. Keep this list in
# sync with the imports above.
__all__ = [
    "ALL_OPS",
    "ArmCPUAdamBuilder",
    "CPUAdamBuilder",
    "FusedOptimBuilder",
    "LayerNormBuilder",
    "MOEBuilder",
    "MultiHeadAttnBuilder",
    "ScaledMaskedSoftmaxBuilder",
    "ScaledUpperTrainglemaskedSoftmaxBuilder",
]
# This code has been adapted from the DeepSpeed library.
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import importlib
import os
import time
from abc import ABC, abstractmethod
from pathlib import Path
from typing import List, Optional, Union
from .utils import check_cuda_availability, check_system_pytorch_cuda_match, print_rank_0
class Builder(ABC):
    """
    Builder is the base class to build extensions for PyTorch.

    A concrete builder describes one kernel: where its C++/CUDA sources live,
    which include directories and compiler flags it needs, and where the
    pre-built module lands when installed via ``pip install`` (``colossalai._C.<name>``).
    It supports two build paths: ahead-of-time via :meth:`builder` (used by
    setup.py) and just-in-time via :meth:`load` (used at runtime).

    Args:
        name (str): the name of the kernel to be built
        prebuilt_import_path (str): the path where the extension is installed during pip install
    """

    # "cuda" or "cpp"; selects CUDAExtension vs CppExtension in builder()
    # and whether load() checks the CUDA toolchain first.
    ext_type: str = "cuda"

    def __init__(self, name: str, prebuilt_import_path: str):
        self.name = name
        self.prebuilt_import_path = prebuilt_import_path
        # macros passed to both cxx and nvcc; kept for compatibility with
        # kernels that gate code on these PyTorch version defines
        self.version_dependent_macros = ["-DVERSION_GE_1_1", "-DVERSION_GE_1_3", "-DVERSION_GE_1_5"]

        # we store the op as an attribute to avoid repeated building and loading
        self.cached_op_module = None

        assert prebuilt_import_path.startswith(
            "colossalai._C"
        ), f"The prebuilt_import_path should start with colossalai._C, but got {self.prebuilt_import_path}"

    def relative_to_abs_path(self, code_path: str) -> str:
        """
        This function takes in a path relative to the colossalai root directory and returns the absolute path.
        """
        op_builder_module_path = Path(__file__).parent

        # if we install from source
        # the current file path will be op_builder/builder.py
        # if we install via pip install colossalai
        # the current file path will be colossalai/kernel/op_builder/builder.py
        # this is because the op_builder inside colossalai is a symlink
        # this symlink will be replaced with actual files if we install via pypi
        # thus we cannot tell the colossalai root directory by checking whether the op_builder
        # is a symlink, we can only tell whether it is inside or outside colossalai
        if str(op_builder_module_path).endswith("colossalai/kernel/op_builder"):
            root_path = op_builder_module_path.parent.parent
        else:
            root_path = op_builder_module_path.parent.joinpath("colossalai")

        code_abs_path = root_path.joinpath(code_path)
        return str(code_abs_path)

    def get_cuda_home_include(self) -> str:
        """
        Return the include path inside the CUDA home.

        Raises:
            RuntimeError: if CUDA_HOME cannot be located by PyTorch.
        """
        from torch.utils.cpp_extension import CUDA_HOME

        if CUDA_HOME is None:
            raise RuntimeError("CUDA_HOME is None, please set CUDA_HOME to compile C++/CUDA kernels in ColossalAI.")
        cuda_include = os.path.join(CUDA_HOME, "include")
        return cuda_include

    def csrc_abs_path(self, path: str) -> str:
        # all kernel sources live under kernel/cuda_native/csrc in the package
        return os.path.join(self.relative_to_abs_path("kernel/cuda_native/csrc"), path)

    # functions that must be overridden -- begin
    @abstractmethod
    def sources_files(self) -> List[str]:
        """
        This function should return a list of source files for extensions.
        """
        raise NotImplementedError

    @abstractmethod
    def include_dirs(self) -> List[str]:
        """
        This function should return a list of include files for extensions.
        """

    @abstractmethod
    def cxx_flags(self) -> List[str]:
        """
        This function should return a list of cxx compilation flags for extensions.
        """

    @abstractmethod
    def nvcc_flags(self) -> List[str]:
        """
        This function should return a list of nvcc compilation flags for extensions.
        """

    # functions that must be overridden -- end

    def strip_empty_entries(self, args: List[str]) -> List[str]:
        """
        Drop any empty strings from the list of compile and link flags.
        """
        return [x for x in args if len(x) > 0]

    def import_op(self):
        """
        This function will import the op module by its string name.

        Raises ImportError when the pre-built module is absent, which is how
        load() detects that a JIT build is required.
        """
        return importlib.import_module(self.prebuilt_import_path)

    def check_runtime_build_environment(self) -> None:
        """
        Check whether the system environment is ready for extension compilation.

        Raises:
            ModuleNotFoundError: if PyTorch is not installed.
            RuntimeError: if CUDA_HOME is unset or CUDA is unavailable.
        """
        try:
            from torch.utils.cpp_extension import CUDA_HOME

            TORCH_AVAILABLE = True
        except ImportError:
            TORCH_AVAILABLE = False
            CUDA_HOME = None

        if not TORCH_AVAILABLE:
            raise ModuleNotFoundError(
                "PyTorch is not found. You need to install PyTorch first in order to build CUDA extensions"
            )

        if CUDA_HOME is None:
            raise RuntimeError(
                "CUDA_HOME is not found. You need to export CUDA_HOME environment variable or install CUDA Toolkit first in order to build CUDA extensions"
            )

        # make sure CUDA is available for compilation during runtime build
        cuda_available = check_cuda_availability()
        if not cuda_available:
            raise RuntimeError("CUDA is not available on your system as torch.cuda.is_available() returns False.")

        # make sure system CUDA and pytorch CUDA match, an error will be raised inside the function if not
        check_system_pytorch_cuda_match(CUDA_HOME)

    def load(self, verbose: Optional[bool] = None):
        """
        load the kernel during runtime. If the kernel is not built during pip install, it will build the kernel.
        If the kernel is built during runtime, it will be stored in `~/.cache/colossalai/torch_extensions/`. If the
        kernel is built during pip install, it can be accessed through `colossalai._C`.

        Warning: do not load this kernel repeatedly during model execution as it could slow down the training process.

        Args:
            verbose (bool, optional): show detailed info. Defaults to the CAI_KERNEL_VERBOSE environment variable.
        """
        if verbose is None:
            verbose = os.environ.get("CAI_KERNEL_VERBOSE", "0") == "1"
        # if the kernel has been compiled and cached, we directly use it
        if self.cached_op_module is not None:
            return self.cached_op_module

        try:
            # if the kernel has been pre-built during installation
            # we just directly import it
            op_module = self.import_op()
            if verbose:
                print_rank_0(
                    f"[extension] OP {self.prebuilt_import_path} has been compiled ahead of time, skip building."
                )

        except ImportError:
            # check environment
            if self.ext_type == "cuda":
                self.check_runtime_build_environment()

            # time the kernel compilation
            start_build = time.time()

            # construct the build directory; one directory per
            # torch-version/cuda-version pair so ABI-incompatible builds
            # never collide
            import torch
            from torch.utils.cpp_extension import load

            torch_version_major = torch.__version__.split(".")[0]
            torch_version_minor = torch.__version__.split(".")[1]
            torch_cuda_version = torch.version.cuda
            home_directory = os.path.expanduser("~")
            extension_directory = f".cache/colossalai/torch_extensions/torch{torch_version_major}.{torch_version_minor}_cu{torch_cuda_version}"
            build_directory = os.path.join(home_directory, extension_directory)
            Path(build_directory).mkdir(parents=True, exist_ok=True)

            if verbose:
                print_rank_0(f"[extension] Compiling or loading the JIT-built {self.name} kernel during runtime now")

            # load the kernel
            op_module = load(
                name=self.name,
                sources=self.strip_empty_entries(self.sources_files()),
                extra_include_paths=self.strip_empty_entries(self.include_dirs()),
                extra_cflags=self.cxx_flags(),
                extra_cuda_cflags=self.nvcc_flags(),
                extra_ldflags=[],
                build_directory=build_directory,
                verbose=verbose,
            )

            build_duration = time.time() - start_build

            # log jit compilation time
            if verbose:
                print_rank_0(f"[extension] Time to compile or load {self.name} op: {build_duration} seconds")

        # cache the built/loaded kernel
        self.cached_op_module = op_module

        return op_module

    def builder(self) -> Union["CUDAExtension", "CppExtension"]:
        """
        get a CUDAExtension instance used for setup.py
        """
        from torch.utils.cpp_extension import CppExtension, CUDAExtension

        if self.ext_type == "cpp":
            return CppExtension(
                name=self.prebuilt_import_path,
                sources=self.strip_empty_entries(self.sources_files()),
                include_dirs=self.strip_empty_entries(self.include_dirs()),
                extra_compile_args=self.strip_empty_entries(self.cxx_flags()),
            )

        return CUDAExtension(
            name=self.prebuilt_import_path,
            sources=self.strip_empty_entries(self.sources_files()),
            include_dirs=self.strip_empty_entries(self.include_dirs()),
            extra_compile_args={
                "cxx": self.strip_empty_entries(self.cxx_flags()),
                "nvcc": self.strip_empty_entries(self.nvcc_flags()),
            },
        )
import re
import torch
from .builder import Builder
from .utils import append_nvcc_threads
class GPTQBuilder(Builder):
    """Builder for the GPTQ quantized-linear CUDA kernels."""

    NAME = "cu_gptq"
    PREBUILT_IMPORT_PATH = "colossalai._C.cu_gptq"

    def __init__(self):
        super().__init__(name=GPTQBuilder.NAME, prebuilt_import_path=GPTQBuilder.PREBUILT_IMPORT_PATH)

    def include_dirs(self):
        """Include the gptq csrc folder and the CUDA home include directory."""
        ret = [self.csrc_abs_path("gptq"), self.get_cuda_home_include()]
        return ret

    def sources_files(self):
        """Absolute paths of the C++ binding and CUDA kernel sources."""
        ret = [
            self.csrc_abs_path(fname)
            for fname in [
                "gptq/linear_gptq.cpp",
                "gptq/column_remap.cu",
                "gptq/cuda_buffers.cu",
                "gptq/q4_matmul.cu",
                "gptq/q4_matrix.cu",
            ]
        ]
        return ret

    def cxx_flags(self):
        return ["-O3"] + self.version_dependent_macros

    def nvcc_flags(self):
        """nvcc flags, including -gencode entries for all sm_80+ arches PyTorch was built for."""
        extra_cuda_flags = [
            "-v",
            # fix: the flag list contained both -std=c++14 and -std=c++17,
            # passing two conflicting language standards to nvcc; keep c++17 only
            "-std=c++17",
            "-U__CUDA_NO_HALF_OPERATORS__",
            "-U__CUDA_NO_HALF_CONVERSIONS__",
            "-U__CUDA_NO_HALF2_OPERATORS__",
            "-DTHRUST_IGNORE_CUB_VERSION_CHECK",
            "-lcublas",
        ]

        # emit code only for Ampere (sm_80) and newer architectures that the
        # current PyTorch build supports
        for arch in torch.cuda.get_arch_list():
            res = re.search(r"sm_(\d+)", arch)
            if res:
                arch_cap = res[1]
                if int(arch_cap) >= 80:
                    extra_cuda_flags.extend(["-gencode", f"arch=compute_{arch_cap},code={arch}"])

        ret = ["-O3", "--use_fast_math"] + self.version_dependent_macros + extra_cuda_flags
        return append_nvcc_threads(ret)
from .builder import Builder
from .utils import append_nvcc_threads, get_cuda_cc_flag
class MultiHeadAttnBuilder(Builder):
    """Builder for the fused multi-head attention CUDA kernels."""

    NAME = "multihead_attention"
    PREBUILT_IMPORT_PATH = "colossalai._C.multihead_attention"

    def __init__(self):
        super().__init__(name=MultiHeadAttnBuilder.NAME, prebuilt_import_path=MultiHeadAttnBuilder.PREBUILT_IMPORT_PATH)

    def include_dirs(self):
        """Include the shared kernels/include headers and the CUDA home include directory."""
        ret = [self.csrc_abs_path("kernels/include"), self.get_cuda_home_include()]
        return ret

    def sources_files(self):
        """Absolute paths of the C++ binding and the per-op CUDA kernel sources."""
        ret = [
            self.csrc_abs_path(fname)
            for fname in [
                "multihead_attention_1d.cpp",
                "kernels/cublas_wrappers.cu",
                "kernels/transform_kernels.cu",
                "kernels/dropout_kernels.cu",
                "kernels/normalize_kernels.cu",
                "kernels/softmax_kernels.cu",
                "kernels/general_kernels.cu",
                "kernels/cuda_util.cu",
            ]
        ]
        return ret

    def cxx_flags(self):
        return ["-O3"] + self.version_dependent_macros

    def nvcc_flags(self):
        """nvcc flags plus the compute-capability gencode flags of the current device."""
        extra_cuda_flags = [
            # fix: the flag list contained both -std=c++14 and -std=c++17,
            # passing two conflicting language standards to nvcc; keep c++17 only
            "-std=c++17",
            "-U__CUDA_NO_HALF_OPERATORS__",
            "-U__CUDA_NO_HALF_CONVERSIONS__",
            "-U__CUDA_NO_HALF2_OPERATORS__",
            "-DTHRUST_IGNORE_CUB_VERSION_CHECK",
        ]
        extra_cuda_flags.extend(get_cuda_cc_flag())
        ret = ["-O3", "--use_fast_math"] + self.version_dependent_macros + extra_cuda_flags
        return append_nvcc_threads(ret)
import torch
from .builder import Builder
from .utils import append_nvcc_threads
class SmoothquantBuilder(Builder):
    """Builder for the SmoothQuant int8 linear CUDA kernels."""

    NAME = "cu_smoothquant"
    PREBUILT_IMPORT_PATH = "colossalai._C.cu_smoothquant"

    def __init__(self):
        super().__init__(name=SmoothquantBuilder.NAME, prebuilt_import_path=SmoothquantBuilder.PREBUILT_IMPORT_PATH)

    def include_dirs(self):
        """Include the smoothquant csrc folder and the CUDA home include directory."""
        ret = [self.csrc_abs_path("smoothquant"), self.get_cuda_home_include()]
        return ret

    def sources_files(self):
        """Absolute paths of the C++ binding and CUDA kernel sources."""
        ret = [
            self.csrc_abs_path(fname)
            for fname in [
                "smoothquant/binding.cpp",
                "smoothquant/linear.cu",
            ]
        ]
        return ret

    def cxx_flags(self):
        return ["-O3"] + self.version_dependent_macros

    def nvcc_flags(self):
        """nvcc flags with CUDA_ARCH derived from the current device's compute capability."""
        compute_capability = torch.cuda.get_device_capability()
        # e.g. capability (8, 0) -> 800, matching the kernels' CUDA_ARCH macro convention
        cuda_arch = compute_capability[0] * 100 + compute_capability[1] * 10

        extra_cuda_flags = [
            "-v",
            f"-DCUDA_ARCH={cuda_arch}",
            "-std=c++17",
            "-U__CUDA_NO_HALF_OPERATORS__",
            "-U__CUDA_NO_HALF_CONVERSIONS__",
            "-U__CUDA_NO_HALF2_OPERATORS__",
            "-DTHRUST_IGNORE_CUB_VERSION_CHECK",
        ]

        ret = ["-O3", "--use_fast_math"] + self.version_dependent_macros + extra_cuda_flags
        return append_nvcc_threads(ret)

    def builder(self):
        """Best-effort wrapper around Builder.builder().

        Returns the extension object on success, or None with a warning if the
        build fails (smoothquant is optional and must not break installation).

        Fixes:
        - `warnings` was used without being imported, so the except path
          itself raised NameError.
        - the result of super().builder() was discarded, so this method always
          returned None even on success.
        - bare `except:` narrowed to `except Exception:` so SystemExit and
          KeyboardInterrupt are no longer swallowed.
        """
        import warnings

        try:
            return super().builder()
        except Exception:
            warnings.warn("build smoothquant lib not successful")
            return None
......@@ -5,55 +5,23 @@ from typing import List
from setuptools import find_packages, setup
from op_builder.utils import (
check_cuda_availability,
check_pytorch_version,
check_system_pytorch_cuda_match,
get_cuda_bare_metal_version,
get_pytorch_version,
set_cuda_arch_list,
)
try:
from torch.utils.cpp_extension import CUDA_HOME, BuildExtension
import torch # noqa
from torch.utils.cpp_extension import BuildExtension
TORCH_AVAILABLE = True
except ImportError:
TORCH_AVAILABLE = False
CUDA_HOME = None
# Some constants for installation checks
MIN_PYTORCH_VERSION_MAJOR = 1
MIN_PYTORCH_VERSION_MINOR = 10
THIS_DIR = os.path.dirname(os.path.abspath(__file__))
BUILD_CUDA_EXT = int(os.environ.get("CUDA_EXT", "0")) == 1
BUILD_EXT = int(os.environ.get("BUILD_EXT", "0")) == 1
IS_NIGHTLY = int(os.environ.get("NIGHTLY", "0")) == 1
# a variable to store the op builder
ext_modules = []
# we do not support windows currently
if sys.platform == "win32":
raise RuntimeError("Windows is not supported yet. Please try again within the Windows Subsystem for Linux (WSL).")
# check for CUDA extension dependencies
def environment_check_for_cuda_extension_build():
if not TORCH_AVAILABLE:
raise ModuleNotFoundError(
"[extension] PyTorch is not found while CUDA_EXT=1. You need to install PyTorch first in order to build CUDA extensions"
)
if not CUDA_HOME:
raise RuntimeError(
"[extension] CUDA_HOME is not found while CUDA_EXT=1. You need to export CUDA_HOME environment variable or install CUDA Toolkit first in order to build CUDA extensions"
)
check_system_pytorch_cuda_match(CUDA_HOME)
check_pytorch_version(MIN_PYTORCH_VERSION_MAJOR, MIN_PYTORCH_VERSION_MINOR)
check_cuda_availability()
def fetch_requirements(path) -> List[str]:
"""
This function reads the requirements file.
......@@ -98,46 +66,35 @@ def get_version() -> str:
# write version into version.py
with open(version_py_path, "w") as f:
f.write(f"__version__ = '{version}'\n")
# look for pytorch and cuda version
if BUILD_CUDA_EXT:
torch_major, torch_minor, _ = get_pytorch_version()
torch_version = f"{torch_major}.{torch_minor}"
cuda_version = ".".join(get_cuda_bare_metal_version(CUDA_HOME))
else:
torch_version = None
cuda_version = None
# write the version into the python file
if torch_version:
f.write(f'torch = "{torch_version}"\n')
else:
f.write("torch = None\n")
if cuda_version:
f.write(f'cuda = "{cuda_version}"\n')
else:
f.write("cuda = None\n")
return version
if BUILD_CUDA_EXT:
environment_check_for_cuda_extension_build()
set_cuda_arch_list(CUDA_HOME)
if BUILD_EXT:
if not TORCH_AVAILABLE:
raise ModuleNotFoundError(
"[extension] PyTorch is not found while CUDA_EXT=1. You need to install PyTorch first in order to build CUDA extensions"
)
from op_builder import ALL_OPS
from extensions import ALL_EXTENSIONS
op_names = []
ext_modules = []
# load all builders
for name, builder_cls in ALL_OPS.items():
op_names.append(name)
ext_modules.append(builder_cls().builder())
for ext_cls in ALL_EXTENSIONS:
ext = ext_cls()
if ext.support_aot and ext.is_hardware_available():
ext.assert_hardware_compatible()
op_names.append(ext.name)
ext_modules.append(ext.build_aot())
# show log
op_name_list = ", ".join(op_names)
print(f"[extension] loaded builders for {op_name_list}")
if len(ext_modules) == 0:
raise RuntimeError("[extension] Could not find any kernel compatible with the current environment.")
else:
op_name_list = ", ".join(op_names)
print(f"[extension] Building extensions{op_name_list}")
else:
ext_modules = []
# always put not nightly branch as the if branch
# otherwise github will treat colossalai-nightly as the project name
......
......@@ -5,13 +5,13 @@ import torch
from torch.utils._pytree import tree_map
import colossalai
from colossalai.accelerator import get_accelerator
from colossalai.auto_parallel.offload.amp_optimizer import AMPOptimizer
from colossalai.auto_parallel.offload.mem_optimize import memory_optimize
from colossalai.auto_parallel.offload.solver import NOT_NVML
from colossalai.fx.profiler import parameter_size
from colossalai.nn.optimizer import HybridAdam
from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
from colossalai.utils import get_current_device
from colossalai.zero import ColoInitContext, zero_model_wrapper, zero_optim_wrapper
from tests.test_auto_parallel.test_offload.model_utils import *
from tests.test_tensor.common_utils import set_seed
......@@ -31,7 +31,7 @@ def exam_fwd_bwd(model_name: str, memory_budget: float, solver_name: str):
64,
8,
),
device=get_current_device(),
device=get_accelerator().get_current_device(),
)
criterion = LMLoss()
......
......@@ -10,12 +10,12 @@ try:
except:
NO_CODEGEN = True
from colossalai.accelerator import get_accelerator
from colossalai.device.device_mesh import DeviceMesh
from colossalai.initialize import launch
from colossalai.logging import disable_existing_loggers
from colossalai.nn.optimizer import HybridAdam
from colossalai.testing import assert_close, rerun_if_address_is_in_use, run_on_environment_flag, spawn
from colossalai.utils import get_current_device
from colossalai.zero import zero_model_wrapper, zero_optim_wrapper
......@@ -72,7 +72,11 @@ def check_auto_parallel_with_gemini(rank, world_size, port):
print("=" * msg_length)
gemini_config = dict(
strict_ddp_mode=False, device=get_current_device(), placement_policy="cpu", pin_memory=True, search_range_m=128
strict_ddp_mode=False,
device=get_accelerator().get_current_device(),
placement_policy="cpu",
pin_memory=True,
search_range_m=128,
)
gm = zero_model_wrapper(gm, zero_stage=3, gemini_config=gemini_config)
......
......@@ -8,13 +8,14 @@ from torch.testing import assert_close
from torch.utils.data import Dataset
import colossalai
from colossalai.accelerator import get_accelerator
from colossalai.booster import Booster
from colossalai.booster.plugin import HybridParallelPlugin
from colossalai.fx import is_compatible_with_meta
from colossalai.lazy.lazy_init import LazyInitContext
from colossalai.nn.optimizer import HybridAdam
from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
from colossalai.utils import get_current_device, set_seed
from colossalai.utils import set_seed
from tests.kit.model_zoo import model_zoo
......@@ -23,7 +24,9 @@ class RandomDataset(Dataset):
self.num_samples = num_samples
self.max_length = max_length
set_seed(42)
self.input_ids = torch.randint(0, vocab_size, (num_samples, max_length), device=get_current_device())
self.input_ids = torch.randint(
0, vocab_size, (num_samples, max_length), device=get_accelerator().get_current_device()
)
self.attention_mask = torch.ones_like(self.input_ids)
def __len__(self):
......
......@@ -5,7 +5,7 @@ import torch.distributed as dist
from torch.optim import Adam
import colossalai
import colossalai.utils.device as device_utils
from colossalai.accelerator import get_accelerator
from colossalai.booster import Booster
from colossalai.booster.plugin import LowLevelZeroPlugin
......@@ -23,7 +23,7 @@ _STUCK_MODELS = ["transformers_albert_for_multiple_choice"]
@clear_cache_before_run()
def run_fn(stage, model_fn, data_gen_fn, output_transform_fn) -> Optional[str]:
device = device_utils.get_current_device()
device = get_accelerator().get_current_device()
try:
plugin = LowLevelZeroPlugin(stage=stage, max_norm=1.0, initial_scale=2**5)
booster = Booster(plugin=plugin)
......@@ -75,7 +75,7 @@ def check_low_level_zero_plugin(stage: int, early_stop: bool = True):
continue
err = run_fn(stage, model_fn, data_gen_fn, output_transform_fn)
device_utils.empty_cache()
get_accelerator().empty_cache()
if err is None:
passed_models.append(name)
......
......@@ -7,7 +7,6 @@ from transformers import LlamaForCausalLM
from utils import shared_tempdir
import colossalai
from colossalai.testing import skip_if_not_enough_gpus
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin
from colossalai.lazy import LazyInitContext
......@@ -17,6 +16,7 @@ from colossalai.testing import (
clear_cache_before_run,
parameterize,
rerun_if_address_is_in_use,
skip_if_not_enough_gpus,
spawn,
)
from tests.kit.model_zoo import model_zoo
......@@ -52,7 +52,12 @@ def exam_state_dict_with_origin(placement_config, model_name, use_safetensors: b
bert_model.config.save_pretrained(save_directory=pretrained_path)
extra_dp_size = dist.get_world_size() // (zero_size * tp_size)
plugin = GeminiPlugin(**placement_config, tp_size=tp_size, enable_all_optimization=enable_all_optimization, extra_dp_size=extra_dp_size)
plugin = GeminiPlugin(
**placement_config,
tp_size=tp_size,
enable_all_optimization=enable_all_optimization,
extra_dp_size=extra_dp_size,
)
booster = Booster(plugin=plugin)
bert_model, _, _, _, _ = booster.boost(bert_model)
model_size = sum(p.numel() * p.element_size() for p in bert_model.parameters()) / 1024**2
......@@ -78,7 +83,14 @@ def exam_state_dict(placement_config, shard: bool, model_name: str, size_per_sha
criterion = lambda x: x.mean()
enable_all_optimization = True if tp_size > 1 else False
extra_dp_size = dist.get_world_size() // (zero_size * tp_size)
plugin = GeminiPlugin(**placement_config, precision="fp16", initial_scale=(2**14), tp_size=tp_size, extra_dp_size=extra_dp_size, enable_all_optimization=enable_all_optimization)
plugin = GeminiPlugin(
**placement_config,
precision="fp16",
initial_scale=(2**14),
tp_size=tp_size,
extra_dp_size=extra_dp_size,
enable_all_optimization=enable_all_optimization,
)
booster = Booster(plugin=plugin)
model = model_fn()
......@@ -161,8 +173,13 @@ def run_dist(rank, world_size, port):
def test_gemini_ckpIO():
spawn(run_dist, 4)
@pytest.mark.largedist
@skip_if_not_enough_gpus(min_gpus=8)
@rerun_if_address_is_in_use()
def test_gemini_ckpIO_3d():
spawn(run_dist, 8)
\ No newline at end of file
spawn(run_dist, 8)
if __name__ == "__main__":
test_gemini_ckpIO()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment