Unverified commit f8a7148d authored by ver217, committed by GitHub

[kernel] move all symlinks of kernel to `colossalai._C` (#1971)

parent 7e24b9b9
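In short, the CUDA/C++ extensions that were previously installed as top-level modules are now namespaced under the `colossalai` package. A minimal before/after sketch of the import change (it assumes the extensions have been built from source, as the hunks below require):

```python
# Old layout: extensions were importable as top-level modules.
# import colossal_C
# import cpu_adam

# New layout after this commit: everything lives under colossalai._C.
import colossalai._C.fused_optim    # multi_tensor_sgd, multi_tensor_adam, multi_tensor_scale, ...
import colossalai._C.cpu_optim      # CPUAdamOptimizer
```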
@@ -20,7 +20,7 @@ class FusedSGD(Optimizer):
     :class:`colossalai.nn.optimizer.FusedSGD` may be used as a drop-in replacement for ``torch.optim.SGD``
     :class:`colossalai.nn.optimizer.FusedSGD` may be used with or without Amp.

     Nesterov momentum is based on the formula from
     `On the importance of initialization and momentum in deep learning`__.
@@ -80,12 +80,13 @@ class FusedSGD(Optimizer):
         self.wd_after_momentum = wd_after_momentum

         if multi_tensor_applier.available:
-            import colossal_C
+            import colossalai._C.fused_optim
             # Skip buffer
             self._dummy_overflow_buf = torch.tensor([0],
                                                     dtype=torch.int,
                                                     device=self.param_groups[0]["params"][0].device)
-            self.multi_tensor_sgd = colossal_C.multi_tensor_sgd
+            self.multi_tensor_sgd = colossalai._C.fused_optim.multi_tensor_sgd
         else:
             raise RuntimeError('FusedSGD requires cuda extensions')
...
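Since the docstring above bills FusedSGD as a drop-in replacement for ``torch.optim.SGD``, a usage sketch looks like plain SGD. This is illustrative only and assumes the usual SGD keyword arguments and a built CUDA extension (otherwise the constructor raises the RuntimeError shown above):

```python
import torch
from colossalai.nn.optimizer import FusedSGD

model = torch.nn.Linear(16, 4).cuda()  # toy model, for illustration only
optimizer = FusedSGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)

loss = model(torch.randn(8, 16, device='cuda')).sum()
loss.backward()
optimizer.step()       # dispatches to colossalai._C.fused_optim.multi_tensor_sgd
optimizer.zero_grad()
```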
@@ -77,14 +77,15 @@ class HybridAdam(NVMeOptimizer):
         super(HybridAdam, self).__init__(model_params, default_args, nvme_offload_fraction, nvme_offload_dir)
         self.adamw_mode = adamw_mode
         try:
-            import colossal_C
-            import cpu_adam
+            import colossalai._C.cpu_optim
+            import colossalai._C.fused_optim
         except ImportError:
             raise ImportError('Please install colossalai from source code to use HybridAdam')

-        self.cpu_adam_op = cpu_adam.CPUAdamOptimizer(lr, betas[0], betas[1], eps, weight_decay, adamw_mode)
+        self.cpu_adam_op = colossalai._C.cpu_optim.CPUAdamOptimizer(lr, betas[0], betas[1], eps, weight_decay,
+                                                                    adamw_mode)
-        self.gpu_adam_op = colossal_C.multi_tensor_adam
+        self.gpu_adam_op = colossalai._C.fused_optim.multi_tensor_adam
         self._dummy_overflow_buf = torch.cuda.IntTensor([0])

     @torch.no_grad()
...
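HybridAdam now pulls its CPU Adam kernel from `colossalai._C.cpu_optim` and its fused GPU kernel from `colossalai._C.fused_optim`. A hypothetical usage sketch follows; the export path `colossalai.nn.optimizer.HybridAdam` and the keyword arguments are assumptions, not shown in this diff:

```python
import torch
from colossalai.nn.optimizer import HybridAdam  # assumed export path

model = torch.nn.Linear(16, 4).cuda()
# Both colossalai._C.cpu_optim and colossalai._C.fused_optim must be importable,
# otherwise the constructor raises the ImportError shown in the hunk above.
optimizer = HybridAdam(model.parameters(), lr=1e-3)

loss = model(torch.randn(8, 16, device='cuda')).sum()
loss.backward()
optimizer.step()
```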
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
+import functools
 import os
 import random
 import socket
 from pathlib import Path
-from typing import Callable, List, Union, Dict, Optional
+from typing import Callable, Dict, List, Optional, Union
-import functools

 import torch
 from torch._six import inf
 from torch.nn.parameter import Parameter

 try:
-    import colossal_C
+    import colossalai._C.fused_optim
 except:
     pass

+from collections import defaultdict
 from contextlib import contextmanager

 import torch.distributed as dist

-from colossalai.constants import (IS_TENSOR_PARALLEL, NUM_PARTITIONS, TENSOR_PARALLEL_ATTRIBUTES)
+from colossalai.constants import IS_TENSOR_PARALLEL, NUM_PARTITIONS, TENSOR_PARALLEL_ATTRIBUTES
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.global_variables import tensor_parallel_env as env
-from .multi_tensor_apply import multi_tensor_applier
 from colossalai.tensor import ColoParameter, ProcessGroup
-from collections import defaultdict
+from .multi_tensor_apply import multi_tensor_applier

 def print_rank_0(msg: str, logger=None):
@@ -132,7 +133,7 @@ def _calc_l2_norm(grads):
     if len(grads) > 0:
         dummy_overflow_buf = torch.cuda.IntTensor([0])
         norm, _ = multi_tensor_applier(
-            colossal_C.multi_tensor_l2norm,
+            colossalai._C.fused_optim.multi_tensor_l2norm,
             dummy_overflow_buf,
             [grads],
             False  # no per-parameter norm
@@ -269,7 +270,8 @@ def _clip_grad_norm(parameters, max_norm: float, total_norm: float) -> None:
                 cpu_grads.append(p.grad.detach())
         if len(cuda_grads) > 0:
             dummy_overflow_buf = torch.cuda.IntTensor([0])
-            multi_tensor_applier(colossal_C.multi_tensor_scale, dummy_overflow_buf, [cuda_grads, cuda_grads], clip_coef)
+            multi_tensor_applier(colossalai._C.fused_optim.multi_tensor_scale, dummy_overflow_buf,
+                                 [cuda_grads, cuda_grads], clip_coef)
         for g in cpu_grads:
             g.mul_(clip_coef)
@@ -395,7 +397,8 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
         if enable_cuda_kernels:
             grads = [p.grad.detach() for p in params]
             dummy_overflow_buf = torch.cuda.IntTensor([0])
-            multi_tensor_applier(colossal_C.multi_tensor_scale, dummy_overflow_buf, [grads, grads], clip_coeff)
+            multi_tensor_applier(colossalai._C.fused_optim.multi_tensor_scale, dummy_overflow_buf, [grads, grads],
+                                 clip_coeff)
         else:
             for p in params:
                 p.grad.detach().mul_(clip_coeff)
...
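For readers unfamiliar with the call shape, here is a small self-contained sketch of the fused-kernel pattern these hunks use (tensor sizes are arbitrary; it assumes a CUDA device and a built `colossalai._C.fused_optim` extension):

```python
import torch
import colossalai._C.fused_optim
from colossalai.utils import multi_tensor_applier  # same helper the hunks above use

grads = [torch.randn(1024, device='cuda') for _ in range(4)]
dummy_overflow_buf = torch.cuda.IntTensor([0])

# Fused L2 norm over all tensors at once, as in _calc_l2_norm above.
norm, _ = multi_tensor_applier(
    colossalai._C.fused_optim.multi_tensor_l2norm,
    dummy_overflow_buf,
    [grads],
    False,  # no per-parameter norm
)

# In-place scaling of every gradient, as _clip_grad_norm does with clip_coef.
clip_coef = 0.5
multi_tensor_applier(colossalai._C.fused_optim.multi_tensor_scale, dummy_overflow_buf,
                     [grads, grads], clip_coef)
```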
@@ -14,7 +14,7 @@ class MultiTensorApply(object):
     def __init__(self, chunk_size):
         try:
-            import colossal_C
+            import colossalai._C.fused_optim
             MultiTensorApply.available = True
             self.chunk_size = chunk_size
         except ImportError as err:
...
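MultiTensorApply sets its `available` flag at construction time depending on whether the extension can be imported, and callers such as FusedSGD (above) branch on it before wiring up the fused kernels. A short sketch of that guard, assuming the same fallback message as FusedSGD:

```python
from colossalai.utils import multi_tensor_applier

if multi_tensor_applier.available:
    import colossalai._C.fused_optim
    fused_sgd = colossalai._C.fused_optim.multi_tensor_sgd
else:
    # Mirrors the FusedSGD error path: the extension was not built from source.
    raise RuntimeError('fused kernels are unavailable; build colossalai from source')
```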
 import os
-import subprocess
 import re
-from setuptools import find_packages, setup, Extension
+import subprocess
+from setuptools import Extension, find_packages, setup

 # ninja build does not work unless include_dirs are abs path
 this_dir = os.path.dirname(os.path.abspath(__file__))
@@ -104,7 +105,7 @@ def get_version():
 if build_cuda_ext:
     try:
         import torch
-        from torch.utils.cpp_extension import (CUDA_HOME, BuildExtension, CUDAExtension)
+        from torch.utils.cpp_extension import CUDA_HOME, BuildExtension, CUDAExtension
         print("\n\ntorch.__version__ = {}\n\n".format(torch.__version__))
         TORCH_MAJOR = int(torch.__version__.split('.')[0])
         TORCH_MINOR = int(torch.__version__.split('.')[1])
@@ -148,7 +149,7 @@ if build_cuda_ext:
     extra_cuda_flags = ['-lineinfo']

     ext_modules.append(
-        cuda_ext_helper('colossal_C', [
+        cuda_ext_helper('colossalai._C.fused_optim', [
            'colossal_C_frontend.cpp', 'multi_tensor_sgd_kernel.cu', 'multi_tensor_scale_kernel.cu',
            'multi_tensor_adam.cu', 'multi_tensor_l2norm_kernel.cu', 'multi_tensor_lamb.cu'
        ], extra_cuda_flags + cc_flag))
@@ -159,21 +160,21 @@ if build_cuda_ext:
     ]

     ext_modules.append(
-        cuda_ext_helper('colossal_scaled_upper_triang_masked_softmax',
+        cuda_ext_helper('colossalai._C.scaled_upper_triang_masked_softmax',
                         ['scaled_upper_triang_masked_softmax.cpp', 'scaled_upper_triang_masked_softmax_cuda.cu'],
                         extra_cuda_flags + cc_flag))

     ext_modules.append(
-        cuda_ext_helper('colossal_scaled_masked_softmax',
+        cuda_ext_helper('colossalai._C.scaled_masked_softmax',
                         ['scaled_masked_softmax.cpp', 'scaled_masked_softmax_cuda.cu'], extra_cuda_flags + cc_flag))

     ext_modules.append(
-        cuda_ext_helper('colossal_moe_cuda', ['moe_cuda.cpp', 'moe_cuda_kernel.cu'], extra_cuda_flags + cc_flag))
+        cuda_ext_helper('colossalai._C.moe', ['moe_cuda.cpp', 'moe_cuda_kernel.cu'], extra_cuda_flags + cc_flag))

     extra_cuda_flags = ['-maxrregcount=50']

     ext_modules.append(
-        cuda_ext_helper('colossal_layer_norm_cuda', ['layer_norm_cuda.cpp', 'layer_norm_cuda_kernel.cu'],
+        cuda_ext_helper('colossalai._C.layer_norm', ['layer_norm_cuda.cpp', 'layer_norm_cuda_kernel.cu'],
                         extra_cuda_flags + cc_flag))

     extra_cuda_flags = [
@@ -182,54 +183,53 @@ if build_cuda_ext:
     ]

     ext_modules.append(
-        cuda_ext_helper('colossal_multihead_attention', [
+        cuda_ext_helper('colossalai._C.multihead_attention', [
            'multihead_attention_1d.cpp', 'kernels/cublas_wrappers.cu', 'kernels/transform_kernels.cu',
            'kernels/dropout_kernels.cu', 'kernels/normalize_kernels.cu', 'kernels/softmax_kernels.cu',
            'kernels/general_kernels.cu', 'kernels/cuda_util.cu'
        ], extra_cuda_flags + cc_flag))

     extra_cxx_flags = ['-std=c++14', '-lcudart', '-lcublas', '-g', '-Wno-reorder', '-fopenmp', '-march=native']
-    ext_modules.append(cuda_ext_helper('cpu_adam', ['cpu_adam.cpp'], extra_cuda_flags, extra_cxx_flags))
+    ext_modules.append(cuda_ext_helper('colossalai._C.cpu_optim', ['cpu_adam.cpp'], extra_cuda_flags, extra_cxx_flags))
-setup(
-    name='colossalai',
+setup(name='colossalai',
      version=get_version(),
      packages=find_packages(exclude=(
          'benchmark',
          'docker',
          'tests',
          'docs',
          'examples',
          'tests',
          'scripts',
          'requirements',
          '*.egg-info',
      )),
      description='An integrated large-scale model training system with efficient parallelization techniques',
      long_description=fetch_readme(),
      long_description_content_type='text/markdown',
      license='Apache Software License 2.0',
      url='https://www.colossalai.org',
      project_urls={
          'Forum': 'https://github.com/hpcaitech/ColossalAI/discussions',
          'Bug Tracker': 'https://github.com/hpcaitech/ColossalAI/issues',
          'Examples': 'https://github.com/hpcaitech/ColossalAI-Examples',
          'Documentation': 'http://colossalai.readthedocs.io',
          'Github': 'https://github.com/hpcaitech/ColossalAI',
      },
      ext_modules=ext_modules,
      cmdclass={'build_ext': BuildExtension} if ext_modules else {},
      install_requires=fetch_requirements('requirements/requirements.txt'),
      entry_points='''
      [console_scripts]
      colossalai=colossalai.cli:cli
      ''',
      python_requires='>=3.6',
      classifiers=[
          'Programming Language :: Python :: 3',
          'License :: OSI Approved :: Apache Software License',
          'Environment :: GPU :: NVIDIA CUDA',
          'Topic :: Scientific/Engineering :: Artificial Intelligence',
          'Topic :: System :: Distributed Computing',
      ],
-)
+      package_data={'colossalai': ['_C/*.pyi']})
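The crux of the setup.py change is that each extension now gets a dotted name, so the compiled artifact lands inside the `colossalai` package instead of at the top level, and `package_data` ships the `_C/*.pyi` stubs alongside it. A minimal illustration using torch's `CUDAExtension` directly (`cuda_ext_helper` is a wrapper defined elsewhere in setup.py; the source list here is a subset taken from the hunks above, not a full build recipe):

```python
from torch.utils.cpp_extension import CUDAExtension

# The dotted name decides where the built artifact is placed:
#   'colossal_C'                -> colossal_C.*.so at the top level (old behaviour)
#   'colossalai._C.fused_optim' -> colossalai/_C/fused_optim.*.so inside the package
ext = CUDAExtension(
    name='colossalai._C.fused_optim',
    sources=['colossal_C_frontend.cpp', 'multi_tensor_sgd_kernel.cu'],  # illustrative subset
)
```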
 import math
 import torch
 from colossalai.testing import parameterize
@@ -66,8 +67,8 @@ def test_cpu_adam(adamw, step, p_dtype, g_dtype):
     exp_avg_sq_copy = exp_avg_sq.clone()

     try:
-        import cpu_adam
-        cpu_adam_op = cpu_adam.CPUAdamOptimizer(lr, beta1, beta2, eps, weight_decay, adamw)
+        import colossalai._C.cpu_optim
+        cpu_adam_op = colossalai._C.cpu_optim.CPUAdamOptimizer(lr, beta1, beta2, eps, weight_decay, adamw)
     except:
         raise ImportError("Import cpu adam error, please install colossal from source code")
...
-from numpy import dtype
+import math
 import torch
 import torch.nn as nn
-import math
+from numpy import dtype
 from colossalai.testing import parameterize
 from colossalai.utils import multi_tensor_applier
@@ -47,11 +47,11 @@ def torch_adam_update(
 @parameterize('g_dtype', [torch.float, torch.half])
 def test_adam(adamw, step, p_dtype, g_dtype):
     try:
-        import colossal_C
-        fused_adam = colossal_C.multi_tensor_adam
+        import colossalai._C.fused_optim
+        fused_adam = colossalai._C.fused_optim.multi_tensor_adam
         dummy_overflow_buf = torch.cuda.IntTensor([0])
     except:
-        raise ImportError("No colossal_C kernel installed.")
+        raise ImportError("No colossalai._C.fused_optim kernel installed.")

     count = 0
...