Unverified Commit 37bbfc76 authored by Tim Moon's avatar Tim Moon Committed by GitHub
Browse files

Refactor build system (#235)



* Refactor Setuptools build system

Successfully launches CMake install, but installs CMake extensions in temp dir.
Signed-off-by: default avatarTim Moon <tmoon@nvidia.com>

* Debug JAX build

Fix pybind11 import. Distinguish between build-time and run-time dependencies.
Signed-off-by: default avatarTim Moon <tmoon@nvidia.com>

* Add helper function to determine dependencies
Signed-off-by: default avatarTim Moon <tmoon@nvidia.com>

* Add missing license
Signed-off-by: default avatarTim Moon <tmoon@nvidia.com>

* Debug case where system CMake is too old
Signed-off-by: default avatarTim Moon <tmoon@nvidia.com>

* Add missing license
Signed-off-by: default avatarTim Moon <tmoon@nvidia.com>

* Simplify sanity import tests

Just importing modules provides richer error messages.
Signed-off-by: default avatarTim Moon <tmoon@nvidia.com>

* Properly install submodules
Signed-off-by: default avatarTim Moon <tmoon@nvidia.com>

* Install helper library for TensorFlow
Signed-off-by: default avatarTim Moon <tmoon@nvidia.com>

* Update documentation
Signed-off-by: default avatarTim Moon <tmoon@nvidia.com>

* Do not install Ninja by default
Signed-off-by: default avatarTim Moon <tmoon@nvidia.com>

* Include Git commit hash in version string
Signed-off-by: default avatarTim Moon <tmoon@nvidia.com>

* Override build_ext.build_extensions instead of build_ext.run
Signed-off-by: default avatarTim Moon <tmoon@nvidia.com>

* Fix incorrect include path

Restore Ninja dependency. Restore overriding build_ext.run func.
Signed-off-by: default avatarTim Moon <tmoon@nvidia.com>

* Review suggestions from @nouiz
Signed-off-by: default avatarTim Moon <tmoon@nvidia.com>

* Disable parallel Ninja jobs in GitHub actions
Signed-off-by: default avatarTim Moon <tmoon@nvidia.com>

* Properly install userbuffers lib
Signed-off-by: default avatarTim Moon <tmoon@nvidia.com>

* Tweak install docs

Review suggestion from @ksivaman
Signed-off-by: default avatarTim Moon <tmoon@nvidia.com>

* Add examples for specifying framework in docs
Signed-off-by: default avatarTim Moon <tmoon@nvidia.com>

---------
Signed-off-by: default avatarTim Moon <tmoon@nvidia.com>
parent 215dfe7e
......@@ -22,7 +22,7 @@ jobs:
- name: 'Build'
run: |
mkdir -p wheelhouse && \
NVTE_FRAMEWORK=pytorch pip wheel -w wheelhouse . -v
NVTE_FRAMEWORK=pytorch MAX_JOBS=1 pip wheel -w wheelhouse . -v
- name: 'Upload wheel'
uses: actions/upload-artifact@v3
with:
......@@ -47,7 +47,6 @@ jobs:
submodules: recursive
- name: 'Build'
run: |
pip install ninja pybind11 && \
pip install --upgrade "jax[cuda12_local]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html && \
mkdir -p wheelhouse && \
NVTE_FRAMEWORK=jax pip wheel -w wheelhouse . -v
......@@ -74,7 +73,6 @@ jobs:
submodules: recursive
- name: 'Build'
run: |
pip install ninja pybind11 && \
mkdir -p wheelhouse && \
NVTE_FRAMEWORK=tensorflow pip wheel -w wheelhouse . -v
- name: 'Upload wheel'
......
......@@ -34,12 +34,9 @@ pip - from GitHub
Additional Prerequisites
^^^^^^^^^^^^^^^^^^^^^^^^
1. `CMake <https://cmake.org/>`__ version 3.18 or later: `pip install cmake`.
2. [For pyTorch support] `pyTorch <https://pytorch.org/>`__ with GPU support.
3. [For JAX support] `JAX <https://github.com/google/jax/>`__ with GPU support, version >= 0.4.7.
4. [For TensorFlow support] `TensorFlow <https://www.tensorflow.org/>`__ with GPU support.
5. `pybind11`: `pip install pybind11`.
6. [Optional] `Ninja <https://ninja-build.org/>`__: `pip install ninja`.
1. [For PyTorch support] `PyTorch <https://pytorch.org/>`__ with GPU support.
2. [For JAX support] `JAX <https://github.com/google/jax/>`__ with GPU support, version >= 0.4.7.
3. [For TensorFlow support] `TensorFlow <https://www.tensorflow.org/>`__ with GPU support.
Installation (stable release)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
......@@ -48,11 +45,9 @@ Execute the following command to install the latest stable version of Transforme
.. code-block:: bash
# Execute one of the following commands
NVTE_FRAMEWORK=pytorch pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable # Build TE for PyTorch only. The default.
NVTE_FRAMEWORK=jax pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable # Build TE for JAX only.
NVTE_FRAMEWORK=tensorflow pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable # Build TE for TensorFlow only.
NVTE_FRAMEWORK=all pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable # Build TE for all supported frameworks.
pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable
This will automatically detect if any supported deep learning frameworks are installed and build Transformer Engine support for them. To explicitly specify frameworks, set the environment variable `NVTE_FRAMEWORK` to a comma-separated list (e.g. `NVTE_FRAMEWORK=jax,tensorflow`).
Installation (development build)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
......@@ -67,11 +62,9 @@ Execute the following command to install the latest development build of Transfo
.. code-block:: bash
# Execute one of the following commands
NVTE_FRAMEWORK=pytorch pip install git+https://github.com/NVIDIA/TransformerEngine.git@main # Build TE for PyTorch only. The default.
NVTE_FRAMEWORK=jax pip install git+https://github.com/NVIDIA/TransformerEngine.git@main # Build TE for JAX only.
NVTE_FRAMEWORK=tensorflow pip install git+https://github.com/NVIDIA/TransformerEngine.git@main # Build TE for TensorFlow only.
NVTE_FRAMEWORK=all pip install git+https://github.com/NVIDIA/TransformerEngine.git@main # Build TE for all supported frameworks.
pip install git+https://github.com/NVIDIA/TransformerEngine.git@main
This will automatically detect if any supported deep learning frameworks are installed and build Transformer Engine support for them. To explicitly specify frameworks, set the environment variable `NVTE_FRAMEWORK` to a comma-separated list (e.g. `NVTE_FRAMEWORK=jax,tensorflow`).
Installation (from source)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
......@@ -80,14 +73,27 @@ Execute the following commands to install Transformer Engine from source:
.. code-block:: bash
git clone --recursive https://github.com/NVIDIA/TransformerEngine.git # Clone the repository/fork and checkout all submodules recursively.
cd TransformerEngine # Enter TE directory.
git checkout stable # Checkout the correct branch.
export NVTE_FRAMEWORK=pytorch # Optionally set the framework.
# Clone repository, checkout stable branch, clone submodules
git clone --branch stable --recursive https://github.com/NVIDIA/TransformerEngine.git
cd TransformerEngine
export NVTE_FRAMEWORK=pytorch # Optionally set framework
pip install . # Build and install
For already cloned repos, run the following command in TE directory:
If the Git repository has already been cloned, make sure to also clone the submodules:
.. code-block:: bash
git submodule update --init --recursive
Extra dependencies for testing can be installed by setting the "test" option:
.. code-block:: bash
pip install .[test]
To build the C++ extensions with debug symbols, e.g. with the `-g` flag:
.. code-block:: bash
git submodule update --init --recursive # Checkout all submodules recursively.
pip install . --global-option=--debug
......@@ -2,433 +2,517 @@
#
# See LICENSE for license information.
import atexit
from functools import lru_cache
import os
import sys
import subprocess
import io
from pathlib import Path
import re
import copy
import shutil
import subprocess
from subprocess import CalledProcessError
import sys
import tempfile
from pkg_resources import packaging
from setuptools import setup, find_packages, Extension
from typing import List, Optional, Tuple, Union
import setuptools
from setuptools.command.build_ext import build_ext
from shutil import copyfile
# Project directory root
root_path: Path = Path(__file__).resolve().parent
path = os.path.dirname(os.path.realpath(__file__))
with open(path + "/VERSION", "r") as f:
te_version = f.readline()
@lru_cache(maxsize=1)
def te_version() -> str:
"""Transformer Engine version string
CUDA_HOME = os.environ.get("CUDA_HOME", "/usr/local/cuda")
NVTE_WITH_USERBUFFERS = int(os.environ.get("NVTE_WITH_USERBUFFERS", "0"))
if NVTE_WITH_USERBUFFERS:
MPI_HOME = os.environ.get("MPI_HOME", "")
assert MPI_HOME, "MPI_HOME must be set if NVTE_WITH_USERBUFFERS=1"
Includes Git commit as local version, unless suppressed with
NVTE_NO_LOCAL_VERSION environment variable.
def get_cuda_bare_metal_version(cuda_dir):
raw_output = subprocess.check_output(
[cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True
"""
with open(root_path / "VERSION", "r") as f:
version = f.readline().strip()
if not int(os.getenv("NVTE_NO_LOCAL_VERSION", "0")):
try:
output = subprocess.run(
["git", "rev-parse" , "--short", "HEAD"],
capture_output=True,
cwd=root_path,
check=True,
universal_newlines=True,
)
output = raw_output.split()
release_idx = output.index("release") + 1
release = output[release_idx].split(".")
bare_metal_major = release[0]
bare_metal_minor = release[1][0]
return (int(bare_metal_major), int(bare_metal_minor))
def append_nvcc_threads(nvcc_extra_args):
cuda_major, cuda_minor = get_cuda_bare_metal_version(CUDA_HOME)
if cuda_major >= 11 and cuda_minor >= 2:
return nvcc_extra_args + ["--threads", "4"]
return nvcc_extra_args
def extra_gencodes(cc_flag):
cuda_bare_metal_version = get_cuda_bare_metal_version(CUDA_HOME)
if cuda_bare_metal_version >= (11, 0):
cc_flag.append("-gencode")
cc_flag.append("arch=compute_80,code=sm_80")
if cuda_bare_metal_version >= (11, 8):
cc_flag.append("-gencode")
cc_flag.append("arch=compute_90,code=sm_90")
def extra_compiler_flags():
extra_flags = [
"-O3",
"-gencode",
"arch=compute_70,code=sm_70",
"-U__CUDA_NO_HALF_OPERATORS__",
"-U__CUDA_NO_HALF_CONVERSIONS__",
"-U__CUDA_NO_BFLOAT16_OPERATORS__",
"-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
"-U__CUDA_NO_BFLOAT162_OPERATORS__",
"-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
"-I./transformer_engine/common/layer_norm/",
"--expt-relaxed-constexpr",
"--expt-extended-lambda",
"--use_fast_math",
]
if NVTE_WITH_USERBUFFERS:
extra_flags.append("-DNVTE_WITH_USERBUFFERS")
return extra_flags
cc_flag = []
extra_gencodes(cc_flag)
def make_abs_path(l):
return [os.path.join(path, p) for p in l]
pytorch_sources = [
"transformer_engine/pytorch/csrc/extensions.cu",
"transformer_engine/pytorch/csrc/common.cu",
"transformer_engine/pytorch/csrc/ts_fp8_op.cpp",
]
pytorch_sources = make_abs_path(pytorch_sources)
all_sources = pytorch_sources
supported_frameworks = {
"all": all_sources,
"pytorch": pytorch_sources,
"jax": None, # JAX use transformer_engine/CMakeLists.txt
"tensorflow": None, # tensorflow use transformer_engine/CMakeLists.txt
}
framework = os.environ.get("NVTE_FRAMEWORK", "pytorch")
include_dirs = [
"transformer_engine/common/include",
"transformer_engine/pytorch/csrc",
"3rdparty/cudnn-frontend/include",
]
if NVTE_WITH_USERBUFFERS:
if MPI_HOME:
include_dirs.append(os.path.join(MPI_HOME, "include"))
include_dirs = make_abs_path(include_dirs)
args = sys.argv.copy()
for s in args:
if s.startswith("--framework="):
framework = s.replace("--framework=", "")
sys.argv.remove(s)
if framework not in supported_frameworks.keys():
raise ValueError("Unsupported framework " + framework)
class CMakeExtension(Extension):
def __init__(self, name, cmake_path, sources, **kwargs):
super(CMakeExtension, self).__init__(name, sources=sources, **kwargs)
self.cmake_path = cmake_path
class FrameworkBuilderBase:
def __init__(self, *args, **kwargs) -> None:
pass
def cmake_flags(self):
return []
def initialize_options(self):
pass
def finalize_options(self):
pass
def run(self, extensions):
except (CalledProcessError, OSError):
pass
else:
commit = output.stdout.strip()
version += f"+{commit}"
return version
@staticmethod
def install_requires():
return []
class PyTorchBuilder(FrameworkBuilderBase):
def __init__(self, *args, **kwargs) -> None:
pytorch_args = copy.deepcopy(args)
pytorch_kwargs = copy.deepcopy(kwargs)
from torch.utils.cpp_extension import BuildExtension
self.pytorch_build_extensions = BuildExtension(*pytorch_args, **pytorch_kwargs)
def initialize_options(self):
self.pytorch_build_extensions.initialize_options()
def finalize_options(self):
self.pytorch_build_extensions.finalize_options()
def run(self, extensions):
other_ext = [
ext for ext in extensions if not isinstance(ext, CMakeExtension)
]
self.pytorch_build_extensions.extensions = other_ext
print("Building pyTorch extensions!")
self.pytorch_build_extensions.run()
def cmake_flags(self):
return []
@staticmethod
def install_requires():
return ["flash-attn>=1.0.2"]
class TensorFlowBuilder(FrameworkBuilderBase):
def cmake_flags(self):
p = [d for d in sys.path if 'dist-packages' in d][0]
return ["-DENABLE_TENSORFLOW=ON", "-DCMAKE_PREFIX_PATH="+p]
def run(self, extensions):
print("Building TensorFlow extensions!")
class JaxBuilder(FrameworkBuilderBase):
def cmake_flags(self):
p = [d for d in sys.path if 'dist-packages' in d][0]
return ["-DENABLE_JAX=ON", "-DCMAKE_PREFIX_PATH="+p]
@lru_cache(maxsize=1)
def with_debug_build() -> bool:
"""Whether to build with a debug configuration"""
for arg in sys.argv:
if arg == "--debug":
sys.argv.remove(arg)
return True
if int(os.getenv("NVTE_BUILD_DEBUG", "0")):
return True
return False
def run(self, extensions):
print("Building jax extensions!")
# Call once in global scope since this function manipulates the
# command-line arguments. Future calls will use a cached value.
with_debug_build()
def install_requires():
# TODO: find a way to install pybind11 and ninja directly.
return ['cmake', 'flax']
def found_cmake() -> bool:
""""Check if valid CMake is available
ext_modules = []
dlfw_builder_funcs = []
CMake 3.18 or newer is required.
ext_modules.append(
CMakeExtension(
name="transformer_engine",
cmake_path=os.path.join(path, "transformer_engine"),
sources=[],
include_dirs=include_dirs,
)
)
"""
if framework in ("all", "pytorch"):
from torch.utils.cpp_extension import CUDAExtension
ext_modules.append(
CUDAExtension(
name="transformer_engine_extensions",
sources=supported_frameworks[framework],
extra_compile_args={
"cxx": ["-O3"],
"nvcc": append_nvcc_threads(extra_compiler_flags() + cc_flag),
},
include_dirs=include_dirs,
)
# Check if CMake is available
try:
_cmake_bin = cmake_bin()
except FileNotFoundError:
return False
# Query CMake for version info
output = subprocess.run(
[_cmake_bin, "--version"],
capture_output=True,
check=True,
universal_newlines=True,
)
dlfw_builder_funcs.append(PyTorchBuilder)
if framework in ("all", "jax"):
dlfw_builder_funcs.append(JaxBuilder)
# Trigger a better error when pybind11 isn't present.
# Sadly, if pybind11 was installed with `apt -y install pybind11-dev`
# This doesn't install a python packages. So the line bellow is too strict.
# When it fail, we need to detect if cmake will find pybind11.
# import pybind11
match = re.search(r"version\s*([\d.]+)", output.stdout)
version = match.group(1).split('.')
version = tuple(int(v) for v in version)
return version >= (3, 18)
if framework in ("all", "tensorflow"):
dlfw_builder_funcs.append(TensorFlowBuilder)
def cmake_bin() -> Path:
"""Get CMake executable
dlfw_install_requires = ['pydantic']
for builder in dlfw_builder_funcs:
dlfw_install_requires = dlfw_install_requires + builder.install_requires()
Throws FileNotFoundError if not found.
"""
def get_cmake_bin():
cmake_bin = "cmake"
# Search in CMake Python package
_cmake_bin: Optional[Path] = None
try:
out = subprocess.check_output([cmake_bin, "--version"])
except OSError:
cmake_installed_version = packaging.version.Version("0.0")
import cmake
except ImportError:
pass
else:
cmake_installed_version = packaging.version.Version(
re.search(r"version\s*([\d.]+)", out.decode()).group(1)
)
cmake_dir = Path(cmake.__file__).resolve().parent
_cmake_bin = cmake_dir / "data" / "bin" / "cmake"
if not _cmake_bin.is_file():
_cmake_bin = None
# Search in path
if _cmake_bin is None:
_cmake_bin = shutil.which("cmake")
if _cmake_bin is not None:
_cmake_bin = Path(_cmake_bin).resolve()
# Return executable if found
if _cmake_bin is None:
raise FileNotFoundError("Could not find CMake executable")
return _cmake_bin
def found_ninja() -> bool:
""""Check if Ninja is available"""
return shutil.which("ninja") is not None
def found_pybind11() -> bool:
""""Check if pybind11 is available"""
# Check if Python package is installed
try:
import pybind11
except ImportError:
pass
else:
return True
if cmake_installed_version < packaging.version.Version("3.18.0"):
print(
"Could not find a recent CMake to build Transformer Engine. "
"Attempting to install CMake 3.18 to a temporary location via pip.",
flush=True,
)
cmake_temp_dir = tempfile.TemporaryDirectory(prefix="nvte-cmake-tmp")
atexit.register(cmake_temp_dir.cleanup)
# Check if CMake can find pybind11
if not found_cmake():
return False
try:
_ = subprocess.check_output(
["pip", "install", "--target", cmake_temp_dir.name, "cmake~=3.18.0"]
)
except Exception:
raise RuntimeError(
"Failed to install temporary CMake. "
"Please update your CMake to 3.18+."
subprocess.run(
[
"cmake",
"--find-package",
"-DMODE=EXIST",
"-DNAME=pybind11",
"-DCOMPILER_ID=CXX",
"-DLANGUAGE=CXX",
],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
check=True,
)
cmake_bin = os.path.join(cmake_temp_dir.name, "bin", "run_cmake")
with io.open(cmake_bin, "w") as f_run_cmake:
f_run_cmake.write(
f"#!/bin/sh\nPYTHONPATH={cmake_temp_dir.name} {os.path.join(cmake_temp_dir.name, 'bin', 'cmake')} \"$@\""
except (CalledProcessError, OSError):
pass
else:
return True
return False
def cuda_version() -> Tuple[int, ...]:
"""CUDA Toolkit version as a (major, minor) tuple
Throws FileNotFoundError if NVCC is not found.
"""
# Try finding NVCC
nvcc_bin: Optional[Path] = None
if nvcc_bin is None and os.getenv("CUDA_HOME"):
# Check in CUDA_HOME
cuda_home = Path(os.getenv("CUDA_HOME"))
nvcc_bin = cuda_home / "bin" / "nvcc"
if nvcc_bin is None:
# Check if nvcc is in path
nvcc_bin = shutil.which("nvcc")
if nvcc_bin is not None:
nvcc_bin = Path(nvcc_bin)
if nvcc_bin is None:
# Last-ditch guess in /usr/local/cuda
cuda_home = Path("/usr/local/cuda")
nvcc_bin = cuda_home / "bin" / "nvcc"
if not nvcc_bin.is_file():
raise FileNotFoundError(f"Could not find NVCC at {nvcc_bin}")
# Query NVCC for version info
output = subprocess.run(
[nvcc_bin, "-V"],
capture_output=True,
check=True,
universal_newlines=True,
)
os.chmod(cmake_bin, 0o755)
return cmake_bin
class CMakeBuildExtension(build_ext, object):
def __init__(self, *args, **kwargs) -> None:
self.dlfw_flags = kwargs["dlfw_flags"]
super(CMakeBuildExtension, self).__init__(*args, **kwargs)
def build_extensions(self) -> None:
print("Building CMake extensions!")
cmake_bin = get_cmake_bin()
config = "Debug" if self.debug else "Release"
ext_name = self.extensions[0].name
build_dir = self.get_ext_fullpath(ext_name).replace(
self.get_ext_filename(ext_name), ""
match = re.search(r"release\s*([\d.]+)", output.stdout)
version = match.group(1).split('.')
return tuple(int(v) for v in version)
@lru_cache(maxsize=1)
def with_userbuffers() -> bool:
"""Check if userbuffers support is enabled"""
if int(os.getenv("NVTE_WITH_USERBUFFERS", "0")):
assert os.getenv("MPI_HOME"), \
"MPI_HOME must be set if NVTE_WITH_USERBUFFERS=1"
return True
return False
@lru_cache(maxsize=1)
def frameworks() -> List[str]:
"""DL frameworks to build support for"""
_frameworks: List[str] = []
supported_frameworks = ["pytorch", "jax", "tensorflow"]
# Check environment variable
if os.getenv("NVTE_FRAMEWORK"):
_frameworks.extend(os.getenv("NVTE_FRAMEWORK").split(","))
# Check command-line arguments
for arg in sys.argv.copy():
if arg.startswith("--framework="):
_frameworks.extend(arg.replace("--framework=", "").split(","))
sys.argv.remove(arg)
# Detect installed frameworks if not explicitly specified
if not _frameworks:
try:
import torch
except ImportError:
pass
else:
_frameworks.append("pytorch")
try:
import jax
except ImportError:
pass
else:
_frameworks.append("jax")
try:
import tensorflow
except ImportError:
pass
else:
_frameworks.append("tensorflow")
# Special framework names
if "all" in _frameworks:
_frameworks = supported_frameworks.copy()
if "none" in _frameworks:
_frameworks = []
# Check that frameworks are valid
_frameworks = [framework.lower() for framework in _frameworks]
for framework in _frameworks:
if framework not in supported_frameworks:
raise ValueError(
f"Transformer Engine does not support framework={framework}"
)
build_dir = os.path.abspath(build_dir)
cmake_args = [
"-DCMAKE_BUILD_TYPE=" + config,
"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}".format(config.upper(), build_dir),
return _frameworks
# Call once in global scope since this function manipulates the
# command-line arguments. Future calls will use a cached value.
frameworks()
def setup_requirements() -> Tuple[List[str], List[str], List[str]]:
"""Setup Python dependencies
Returns dependencies for build, runtime, and testing.
"""
# Common requirements
setup_reqs: List[str] = []
install_reqs: List[str] = ["pydantic"]
test_reqs: List[str] = ["pytest"]
def add_unique(l: List[str], vals: Union[str, List[str]]) -> None:
"""Add entry to list if not already included"""
if isinstance(vals, str):
vals = [vals]
for val in vals:
if val not in l:
l.append(val)
# Requirements that may be installed outside of Python
if not found_cmake():
add_unique(setup_reqs, "cmake>=3.18")
if not found_ninja():
add_unique(setup_reqs, "ninja")
# Framework-specific requirements
if "pytorch" in frameworks():
add_unique(install_reqs, ["torch", "flash-attn>=1.0.2"])
add_unique(test_reqs, ["numpy", "onnxruntime", "torchvision"])
if "jax" in frameworks():
if not found_pybind11():
add_unique(setup_reqs, "pybind11")
add_unique(install_reqs, ["jax", "flax"])
add_unique(test_reqs, ["numpy", "praxis"])
if "tensorflow" in frameworks():
if not found_pybind11():
add_unique(setup_reqs, "pybind11")
add_unique(install_reqs, "tensorflow")
add_unique(test_reqs, ["keras", "tensorflow_datasets"])
return setup_reqs, install_reqs, test_reqs
class CMakeExtension(setuptools.Extension):
"""CMake extension module"""
def __init__(
self,
name: str,
cmake_path: Path,
cmake_flags: Optional[List[str]] = None,
) -> None:
super().__init__(name, sources=[]) # No work for base class
self.cmake_path: Path = cmake_path
self.cmake_flags: List[str] = [] if cmake_flags is None else cmake_flags
def _build_cmake(self, build_dir: Path, install_dir: Path) -> None:
# Make sure paths are str
_cmake_bin = str(cmake_bin())
cmake_path = str(self.cmake_path)
build_dir = str(build_dir)
install_dir = str(install_dir)
# CMake configure command
build_type = "Debug" if with_debug_build() else "Release"
configure_command = [
_cmake_bin,
"-S",
cmake_path,
"-B",
build_dir,
f"-DCMAKE_BUILD_TYPE={build_type}",
f"-DCMAKE_INSTALL_PREFIX={install_dir}",
]
configure_command += self.cmake_flags
if found_ninja():
configure_command.append("-GNinja")
try:
import ninja
import pybind11
except ImportError:
pass
else:
cmake_args.append("-GNinja")
pybind11_dir = Path(pybind11.__file__).resolve().parent
pybind11_dir = pybind11_dir / "share" / "cmake" / "pybind11"
configure_command.append(f"-Dpybind11_DIR={pybind11_dir}")
cmake_args = cmake_args + self.dlfw_flags
# CMake build and install commands
build_command = [_cmake_bin, "--build", build_dir]
install_command = [_cmake_bin, "--install", build_dir]
cmake_build_args = ["--config", config]
# Run CMake commands
for command in [configure_command, build_command, install_command]:
print(f"Running command {' '.join(command)}")
try:
subprocess.run(command, cwd=build_dir, check=True)
except (CalledProcessError, OSError) as e:
raise RuntimeError(f"Error when running CMake: {e}")
cmake_build_dir = os.path.join(self.build_temp, config)
if not os.path.exists(cmake_build_dir):
os.makedirs(cmake_build_dir)
config_and_build_commands = [
[cmake_bin, self.extensions[0].cmake_path] + cmake_args,
[cmake_bin, "--build", "."] + cmake_build_args,
]
# PyTorch extension modules require special handling
if "pytorch" in frameworks():
from torch.utils.cpp_extension import BuildExtension
else:
from setuptools.command.build_ext import build_ext as BuildExtension
if True:
print(f"Running CMake in {cmake_build_dir}:")
for command in config_and_build_commands:
print(" ".join(command))
sys.stdout.flush()
# Config and build the extension
try:
for command in config_and_build_commands:
subprocess.check_call(command, cwd=cmake_build_dir)
except OSError as e:
raise RuntimeError("CMake failed: {}".format(str(e)))
class CMakeBuildExtension(BuildExtension):
"""Setuptools command with support for CMake extension modules"""
class TEBuildExtension(build_ext, object):
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
def run(self) -> None:
self.dlfw_builder = []
for functor in dlfw_builder_funcs:
self.dlfw_builder.append(functor(*args, **kwargs))
# Build CMake extensions
for ext in self.extensions:
if isinstance(ext, CMakeExtension):
print(f"Building CMake extension {ext.name}")
with tempfile.TemporaryDirectory() as build_dir:
build_dir = Path(build_dir)
package_path = Path(self.get_ext_fullpath(ext.name))
install_dir = package_path.resolve().parent
ext._build_cmake(
build_dir=build_dir,
install_dir=install_dir,
)
# Build non-CMake extensions as usual
all_extensions = self.extensions
self.extensions = [
ext for ext in self.extensions
if not isinstance(ext, CMakeExtension)
]
super().run()
self.extensions = all_extensions
def setup_common_extension() -> CMakeExtension:
"""Setup CMake extension for common library
Also builds JAX, TensorFlow, and userbuffers support if needed.
"""
cmake_flags = []
if "jax" in frameworks():
cmake_flags.append("-DENABLE_JAX=ON")
if "tensorflow" in frameworks():
cmake_flags.append("-DENABLE_TENSORFLOW=ON")
if with_userbuffers():
cmake_flags.append("-DNVTE_WITH_USERBUFFERS=ON")
return CMakeExtension(
name="transformer_engine",
cmake_path=root_path / "transformer_engine",
cmake_flags=cmake_flags,
)
flags = []
if NVTE_WITH_USERBUFFERS:
flags.append('-DNVTE_WITH_USERBUFFERS=ON')
for builder in self.dlfw_builder:
flags = flags + builder.cmake_flags()
def setup_pytorch_extension() -> setuptools.Extension:
"""Setup CUDA extension for PyTorch support"""
# Source files
src_dir = root_path / "transformer_engine" / "pytorch" / "csrc"
sources = [
src_dir / "extensions.cu",
src_dir / "common.cu",
src_dir / "ts_fp8_op.cpp",
]
# Header files
include_dirs = [
root_path / "transformer_engine" / "common" / "include",
root_path / "transformer_engine" / "pytorch" / "csrc",
root_path / "3rdparty" / "cudnn-frontend" / "include",
]
cmake_args = copy.deepcopy(args)
cmake_kwargs = copy.deepcopy(kwargs)
cmake_kwargs["dlfw_flags"] = flags
self.cmake_build_extensions = CMakeBuildExtension(*cmake_args, **cmake_kwargs)
# Compiler flags
cxx_flags = ["-O3"]
nvcc_flags = [
"-O3",
"-gencode",
"arch=compute_70,code=sm_70",
"-U__CUDA_NO_HALF_OPERATORS__",
"-U__CUDA_NO_HALF_CONVERSIONS__",
"-U__CUDA_NO_BFLOAT16_OPERATORS__",
"-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
"-U__CUDA_NO_BFLOAT162_OPERATORS__",
"-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
"--expt-relaxed-constexpr",
"--expt-extended-lambda",
"--use_fast_math",
]
self.all_outputs = None
super(TEBuildExtension, self).__init__(*args, **kwargs)
# Version-dependent CUDA options
try:
version = cuda_version()
except FileNotFoundError:
print("Could not determine CUDA Toolkit version")
else:
if version >= (11, 2):
nvcc_flags.extend(["--threads", "4"])
if version >= (11, 0):
nvcc_flags.extend(["-gencode", "arch=compute_80,code=sm_80"])
if version >= (11, 8):
nvcc_flags.extend(["-gencode", "arch=compute_90,code=sm_90"])
# userbuffers support
if with_userbuffers():
if os.getenv("MPI_HOME"):
mpi_home = Path(os.getenv("MPI_HOME"))
include_dirs.append(mpi_home / "include")
cxx_flags.append("-DNVTE_WITH_USERBUFFERS")
nvcc_flags.append("-DNVTE_WITH_USERBUFFERS")
# Construct PyTorch CUDA extension
sources = [str(path) for path in sources]
include_dirs = [str(path) for path in include_dirs]
from torch.utils.cpp_extension import CUDAExtension
return CUDAExtension(
name="transformer_engine_extensions",
sources=sources,
include_dirs=include_dirs,
# libraries=["transformer_engine"], ### TODO (tmoon) Debug linker errors
extra_compile_args={
"cxx": cxx_flags,
"nvcc": nvcc_flags,
},
)
def initialize_options(self):
self.cmake_build_extensions.initialize_options()
for builder in self.dlfw_builder:
builder.initialize_options()
super(TEBuildExtension, self).initialize_options()
def finalize_options(self):
self.cmake_build_extensions.finalize_options()
for builder in self.dlfw_builder:
builder.finalize_options()
super(TEBuildExtension, self).finalize_options()
def main():
def run(self) -> None:
old_inplace, self.inplace = self.inplace, 0
cmake_ext = [ext for ext in self.extensions if isinstance(ext, CMakeExtension)]
self.cmake_build_extensions.extensions = cmake_ext
self.cmake_build_extensions.run()
for builder in self.dlfw_builder:
builder.run(self.extensions)
self.all_outputs = []
for f in os.scandir(self.build_lib):
if f.is_file():
self.all_outputs.append(f.path)
self.inplace = old_inplace
if old_inplace:
self.copy_extensions_to_source()
def copy_extensions_to_source(self):
ext = self.extensions[0]
build_py = self.get_finalized_command("build_py")
fullname = self.get_ext_fullname(ext.name)
modpath = fullname.split(".")
package = ".".join(modpath[:-1])
package_dir = build_py.get_package_dir(package)
for f in os.scandir(self.build_lib):
if f.is_file():
src_filename = f.path
dest_filename = os.path.join(
package_dir, os.path.basename(src_filename)
# Submodules to install
packages = setuptools.find_packages(
include=["transformer_engine", "transformer_engine.*"],
)
# Always copy, even if source is older than destination, to ensure
# that the right extensions for the current Python/platform are
# used.
copyfile(src_filename, dest_filename)
def get_outputs(self):
return self.all_outputs
# Dependencies
setup_requires, install_requires, test_requires = setup_requirements()
# Extensions
ext_modules = [setup_common_extension()]
if "pytorch" in frameworks():
ext_modules.append(setup_pytorch_extension())
setup(
# Configure package
setuptools.setup(
name="transformer_engine",
version=te_version,
packages=find_packages(
exclude=(
"build",
"csrc",
"include",
"tests",
"dist",
"docs",
"tests",
"examples",
"transformer_engine.egg-info",
)
),
version=te_version(),
packages=packages,
description="Transformer acceleration library",
ext_modules=ext_modules,
cmdclass={"build_ext": TEBuildExtension},
install_requires=dlfw_install_requires,
extras_require={
'test': ['pytest',
'tensorflow_datasets'],
'test_pytest': ['onnxruntime',],
},
cmdclass={"build_ext": CMakeBuildExtension},
setup_requires=setup_requires,
install_requires=install_requires,
extras_require={"test": test_requires},
license_files=("LICENSE",),
)
)
if __name__ == "__main__":
main()
......@@ -2,11 +2,5 @@
#
# See LICENSE for license information.
try:
import transformer_engine.jax
te_imported = True
except:
te_imported = False
assert te_imported, 'transformer_engine import failed'
import transformer_engine.jax
print("OK")
......@@ -2,11 +2,5 @@
#
# See LICENSE for license information.
try:
import transformer_engine.pytorch
te_imported = True
except:
te_imported = False
assert te_imported, 'transformer_engine import failed'
import transformer_engine.pytorch
print("OK")
......@@ -2,11 +2,5 @@
#
# See LICENSE for license information.
try:
import transformer_engine.tensorflow
te_imported = True
except:
te_imported = False
assert te_imported, 'transformer_engine import failed'
import transformer_engine.tensorflow
print("OK")
......@@ -28,16 +28,20 @@ include_directories(${PROJECT_SOURCE_DIR})
add_subdirectory(common)
if(NVTE_WITH_USERBUFFERS)
message(STATUS "userbuffers support enabled")
add_subdirectory(pytorch/csrc/userbuffers)
endif()
option(ENABLE_JAX "Enable JAX in the building workflow." OFF)
message(STATUS "JAX support: ${ENABLE_JAX}")
if(ENABLE_JAX)
find_package(pybind11 CONFIG REQUIRED)
add_subdirectory(jax)
endif()
option(ENABLE_TENSORFLOW "Enable TensorFlow in the building workflow." OFF)
message(STATUS "TensorFlow support: ${ENABLE_TENSORFLOW}")
if(ENABLE_TENSORFLOW)
find_package(pybind11 CONFIG REQUIRED)
add_subdirectory(tensorflow)
......
# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
add_library(CUDNN::cudnn_all INTERFACE IMPORTED)
find_path(
......@@ -75,4 +79,3 @@ target_link_libraries(
CUDNN::cudnn_ops_infer
CUDNN::cudnn
)
......@@ -77,3 +77,6 @@ set_source_files_properties(fused_softmax/scaled_masked_softmax.cu
COMPILE_OPTIONS "--use_fast_math")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3")
# Install library
install(TARGETS transformer_engine DESTINATION .)
......@@ -10,3 +10,4 @@ pybind11_add_module(
)
target_link_libraries(transformer_engine_jax PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt transformer_engine)
install(TARGETS transformer_engine_jax DESTINATION .)
......@@ -31,3 +31,6 @@ set_source_files_properties(userbuffers.cu
COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CUDA>:-maxrregcount=64>")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3")
# Install library
install(TARGETS transformer_engine_userbuffers DESTINATION .)
......@@ -40,3 +40,7 @@ target_link_libraries(_get_stream PRIVATE ${TF_LINKER_LIBS})
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3")
# Install library
install(TARGETS transformer_engine_tensorflow DESTINATION .)
install(TARGETS _get_stream DESTINATION .)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment