Unverified commit 37bbfc76, authored by Tim Moon, committed by GitHub
Browse files

Refactor build system (#235)



* Refactor Setuptools build system

Successfully launches CMake install, but installs CMake extensions in temp dir.
Signed-off-by: Tim Moon <tmoon@nvidia.com>

* Debug JAX build

Fix pybind11 import. Distinguish between build-time and run-time dependencies.
Signed-off-by: Tim Moon <tmoon@nvidia.com>

* Add helper function to determine dependencies
Signed-off-by: Tim Moon <tmoon@nvidia.com>

* Add missing license
Signed-off-by: Tim Moon <tmoon@nvidia.com>

* Debug case where system CMake is too old
Signed-off-by: Tim Moon <tmoon@nvidia.com>

* Add missing license
Signed-off-by: Tim Moon <tmoon@nvidia.com>

* Simplify sanity import tests

Just importing modules provides richer error messages.
Signed-off-by: Tim Moon <tmoon@nvidia.com>

* Properly install submodules
Signed-off-by: Tim Moon <tmoon@nvidia.com>

* Install helper library for TensorFlow
Signed-off-by: Tim Moon <tmoon@nvidia.com>

* Update documentation
Signed-off-by: Tim Moon <tmoon@nvidia.com>

* Do not install Ninja by default
Signed-off-by: Tim Moon <tmoon@nvidia.com>

* Include Git commit hash in version string
Signed-off-by: Tim Moon <tmoon@nvidia.com>

* Override build_ext.build_extensions instead of build_ext.run
Signed-off-by: Tim Moon <tmoon@nvidia.com>

* Fix incorrect include path

Restore Ninja dependency. Restore overriding build_ext.run func.
Signed-off-by: Tim Moon <tmoon@nvidia.com>

* Review suggestions from @nouiz
Signed-off-by: Tim Moon <tmoon@nvidia.com>

* Disable parallel Ninja jobs in GitHub actions
Signed-off-by: Tim Moon <tmoon@nvidia.com>

* Properly install userbuffers lib
Signed-off-by: Tim Moon <tmoon@nvidia.com>

* Tweak install docs

Review suggestion from @ksivaman
Signed-off-by: Tim Moon <tmoon@nvidia.com>

* Add examples for specifying framework in docs
Signed-off-by: Tim Moon <tmoon@nvidia.com>

---------
Signed-off-by: Tim Moon <tmoon@nvidia.com>
parent 215dfe7e
...@@ -22,7 +22,7 @@ jobs: ...@@ -22,7 +22,7 @@ jobs:
- name: 'Build' - name: 'Build'
run: | run: |
mkdir -p wheelhouse && \ mkdir -p wheelhouse && \
NVTE_FRAMEWORK=pytorch pip wheel -w wheelhouse . -v NVTE_FRAMEWORK=pytorch MAX_JOBS=1 pip wheel -w wheelhouse . -v
- name: 'Upload wheel' - name: 'Upload wheel'
uses: actions/upload-artifact@v3 uses: actions/upload-artifact@v3
with: with:
...@@ -47,7 +47,6 @@ jobs: ...@@ -47,7 +47,6 @@ jobs:
submodules: recursive submodules: recursive
- name: 'Build' - name: 'Build'
run: | run: |
pip install ninja pybind11 && \
pip install --upgrade "jax[cuda12_local]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html && \ pip install --upgrade "jax[cuda12_local]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html && \
mkdir -p wheelhouse && \ mkdir -p wheelhouse && \
NVTE_FRAMEWORK=jax pip wheel -w wheelhouse . -v NVTE_FRAMEWORK=jax pip wheel -w wheelhouse . -v
...@@ -74,7 +73,6 @@ jobs: ...@@ -74,7 +73,6 @@ jobs:
submodules: recursive submodules: recursive
- name: 'Build' - name: 'Build'
run: | run: |
pip install ninja pybind11 && \
mkdir -p wheelhouse && \ mkdir -p wheelhouse && \
NVTE_FRAMEWORK=tensorflow pip wheel -w wheelhouse . -v NVTE_FRAMEWORK=tensorflow pip wheel -w wheelhouse . -v
- name: 'Upload wheel' - name: 'Upload wheel'
......
...@@ -34,12 +34,9 @@ pip - from GitHub ...@@ -34,12 +34,9 @@ pip - from GitHub
Additional Prerequisites Additional Prerequisites
^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^
1. `CMake <https://cmake.org/>`__ version 3.18 or later: `pip install cmake`. 1. [For PyTorch support] `PyTorch <https://pytorch.org/>`__ with GPU support.
2. [For pyTorch support] `pyTorch <https://pytorch.org/>`__ with GPU support. 2. [For JAX support] `JAX <https://github.com/google/jax/>`__ with GPU support, version >= 0.4.7.
3. [For JAX support] `JAX <https://github.com/google/jax/>`__ with GPU support, version >= 0.4.7. 3. [For TensorFlow support] `TensorFlow <https://www.tensorflow.org/>`__ with GPU support.
4. [For TensorFlow support] `TensorFlow <https://www.tensorflow.org/>`__ with GPU support.
5. `pybind11`: `pip install pybind11`.
6. [Optional] `Ninja <https://ninja-build.org/>`__: `pip install ninja`.
Installation (stable release) Installation (stable release)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
...@@ -48,11 +45,9 @@ Execute the following command to install the latest stable version of Transforme ...@@ -48,11 +45,9 @@ Execute the following command to install the latest stable version of Transforme
.. code-block:: bash .. code-block:: bash
# Execute one of the following commands pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable
NVTE_FRAMEWORK=pytorch pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable # Build TE for PyTorch only. The default.
NVTE_FRAMEWORK=jax pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable # Build TE for JAX only. This will automatically detect if any supported deep learning frameworks are installed and build Transformer Engine support for them. To explicitly specify frameworks, set the environment variable `NVTE_FRAMEWORK` to a comma-separated list (e.g. `NVTE_FRAMEWORK=jax,tensorflow`).
NVTE_FRAMEWORK=tensorflow pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable # Build TE for TensorFlow only.
NVTE_FRAMEWORK=all pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable # Build TE for all supported frameworks.
Installation (development build) Installation (development build)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
...@@ -67,11 +62,9 @@ Execute the following command to install the latest development build of Transfo ...@@ -67,11 +62,9 @@ Execute the following command to install the latest development build of Transfo
.. code-block:: bash .. code-block:: bash
# Execute one of the following commands pip install git+https://github.com/NVIDIA/TransformerEngine.git@main
NVTE_FRAMEWORK=pytorch pip install git+https://github.com/NVIDIA/TransformerEngine.git@main # Build TE for PyTorch only. The default.
NVTE_FRAMEWORK=jax pip install git+https://github.com/NVIDIA/TransformerEngine.git@main # Build TE for JAX only. This will automatically detect if any supported deep learning frameworks are installed and build Transformer Engine support for them. To explicitly specify frameworks, set the environment variable `NVTE_FRAMEWORK` to a comma-separated list (e.g. `NVTE_FRAMEWORK=jax,tensorflow`).
NVTE_FRAMEWORK=tensorflow pip install git+https://github.com/NVIDIA/TransformerEngine.git@main # Build TE for TensorFlow only.
NVTE_FRAMEWORK=all pip install git+https://github.com/NVIDIA/TransformerEngine.git@main # Build TE for all supported frameworks.
Installation (from source) Installation (from source)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
...@@ -80,14 +73,27 @@ Execute the following commands to install Transformer Engine from source: ...@@ -80,14 +73,27 @@ Execute the following commands to install Transformer Engine from source:
.. code-block:: bash .. code-block:: bash
git clone --recursive https://github.com/NVIDIA/TransformerEngine.git # Clone the repository/fork and checkout all submodules recursively. # Clone repository, checkout stable branch, clone submodules
cd TransformerEngine # Enter TE directory. git clone --branch stable --recursive https://github.com/NVIDIA/TransformerEngine.git
git checkout stable # Checkout the correct branch.
export NVTE_FRAMEWORK=pytorch # Optionally set the framework. cd TransformerEngine
export NVTE_FRAMEWORK=pytorch # Optionally set framework
pip install . # Build and install pip install . # Build and install
For already cloned repos, run the following command in TE directory: If the Git repository has already been cloned, make sure to also clone the submodules:
.. code-block:: bash
git submodule update --init --recursive
Extra dependencies for testing can be installed by setting the "test" option:
.. code-block:: bash
pip install .[test]
To build the C++ extensions with debug symbols, e.g. with the `-g` flag:
.. code-block:: bash .. code-block:: bash
git submodule update --init --recursive # Checkout all submodules recursively. pip install . --global-option=--debug
...@@ -2,433 +2,517 @@ ...@@ -2,433 +2,517 @@
# #
# See LICENSE for license information. # See LICENSE for license information.
import atexit from functools import lru_cache
import os import os
import sys from pathlib import Path
import subprocess
import io
import re import re
import copy import shutil
import subprocess
from subprocess import CalledProcessError
import sys
import tempfile import tempfile
from pkg_resources import packaging from typing import List, Optional, Tuple, Union
from setuptools import setup, find_packages, Extension
import setuptools
from setuptools.command.build_ext import build_ext from setuptools.command.build_ext import build_ext
from shutil import copyfile
# Project directory root
root_path: Path = Path(__file__).resolve().parent
path = os.path.dirname(os.path.realpath(__file__)) @lru_cache(maxsize=1)
with open(path + "/VERSION", "r") as f: def te_version() -> str:
te_version = f.readline() """Transformer Engine version string
CUDA_HOME = os.environ.get("CUDA_HOME", "/usr/local/cuda") Includes Git commit as local version, unless suppressed with
NVTE_WITH_USERBUFFERS = int(os.environ.get("NVTE_WITH_USERBUFFERS", "0")) NVTE_NO_LOCAL_VERSION environment variable.
if NVTE_WITH_USERBUFFERS:
MPI_HOME = os.environ.get("MPI_HOME", "")
assert MPI_HOME, "MPI_HOME must be set if NVTE_WITH_USERBUFFERS=1"
def get_cuda_bare_metal_version(cuda_dir): """
raw_output = subprocess.check_output( with open(root_path / "VERSION", "r") as f:
[cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True version = f.readline().strip()
if not int(os.getenv("NVTE_NO_LOCAL_VERSION", "0")):
try:
output = subprocess.run(
["git", "rev-parse" , "--short", "HEAD"],
capture_output=True,
cwd=root_path,
check=True,
universal_newlines=True,
) )
output = raw_output.split() except (CalledProcessError, OSError):
release_idx = output.index("release") + 1
release = output[release_idx].split(".")
bare_metal_major = release[0]
bare_metal_minor = release[1][0]
return (int(bare_metal_major), int(bare_metal_minor))
def append_nvcc_threads(nvcc_extra_args):
cuda_major, cuda_minor = get_cuda_bare_metal_version(CUDA_HOME)
if cuda_major >= 11 and cuda_minor >= 2:
return nvcc_extra_args + ["--threads", "4"]
return nvcc_extra_args
def extra_gencodes(cc_flag):
cuda_bare_metal_version = get_cuda_bare_metal_version(CUDA_HOME)
if cuda_bare_metal_version >= (11, 0):
cc_flag.append("-gencode")
cc_flag.append("arch=compute_80,code=sm_80")
if cuda_bare_metal_version >= (11, 8):
cc_flag.append("-gencode")
cc_flag.append("arch=compute_90,code=sm_90")
def extra_compiler_flags():
extra_flags = [
"-O3",
"-gencode",
"arch=compute_70,code=sm_70",
"-U__CUDA_NO_HALF_OPERATORS__",
"-U__CUDA_NO_HALF_CONVERSIONS__",
"-U__CUDA_NO_BFLOAT16_OPERATORS__",
"-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
"-U__CUDA_NO_BFLOAT162_OPERATORS__",
"-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
"-I./transformer_engine/common/layer_norm/",
"--expt-relaxed-constexpr",
"--expt-extended-lambda",
"--use_fast_math",
]
if NVTE_WITH_USERBUFFERS:
extra_flags.append("-DNVTE_WITH_USERBUFFERS")
return extra_flags
cc_flag = []
extra_gencodes(cc_flag)
def make_abs_path(l):
return [os.path.join(path, p) for p in l]
pytorch_sources = [
"transformer_engine/pytorch/csrc/extensions.cu",
"transformer_engine/pytorch/csrc/common.cu",
"transformer_engine/pytorch/csrc/ts_fp8_op.cpp",
]
pytorch_sources = make_abs_path(pytorch_sources)
all_sources = pytorch_sources
supported_frameworks = {
"all": all_sources,
"pytorch": pytorch_sources,
"jax": None, # JAX use transformer_engine/CMakeLists.txt
"tensorflow": None, # tensorflow use transformer_engine/CMakeLists.txt
}
framework = os.environ.get("NVTE_FRAMEWORK", "pytorch")
include_dirs = [
"transformer_engine/common/include",
"transformer_engine/pytorch/csrc",
"3rdparty/cudnn-frontend/include",
]
if NVTE_WITH_USERBUFFERS:
if MPI_HOME:
include_dirs.append(os.path.join(MPI_HOME, "include"))
include_dirs = make_abs_path(include_dirs)
args = sys.argv.copy()
for s in args:
if s.startswith("--framework="):
framework = s.replace("--framework=", "")
sys.argv.remove(s)
if framework not in supported_frameworks.keys():
raise ValueError("Unsupported framework " + framework)
class CMakeExtension(Extension):
def __init__(self, name, cmake_path, sources, **kwargs):
super(CMakeExtension, self).__init__(name, sources=sources, **kwargs)
self.cmake_path = cmake_path
class FrameworkBuilderBase:
def __init__(self, *args, **kwargs) -> None:
pass
def cmake_flags(self):
return []
def initialize_options(self):
pass
def finalize_options(self):
pass
def run(self, extensions):
pass pass
else:
commit = output.stdout.strip()
version += f"+{commit}"
return version
@staticmethod @lru_cache(maxsize=1)
def install_requires(): def with_debug_build() -> bool:
return [] """Whether to build with a debug configuration"""
for arg in sys.argv:
class PyTorchBuilder(FrameworkBuilderBase): if arg == "--debug":
def __init__(self, *args, **kwargs) -> None: sys.argv.remove(arg)
pytorch_args = copy.deepcopy(args) return True
pytorch_kwargs = copy.deepcopy(kwargs) if int(os.getenv("NVTE_BUILD_DEBUG", "0")):
from torch.utils.cpp_extension import BuildExtension return True
self.pytorch_build_extensions = BuildExtension(*pytorch_args, **pytorch_kwargs) return False
def initialize_options(self):
self.pytorch_build_extensions.initialize_options()
def finalize_options(self):
self.pytorch_build_extensions.finalize_options()
def run(self, extensions):
other_ext = [
ext for ext in extensions if not isinstance(ext, CMakeExtension)
]
self.pytorch_build_extensions.extensions = other_ext
print("Building pyTorch extensions!")
self.pytorch_build_extensions.run()
def cmake_flags(self):
return []
@staticmethod
def install_requires():
return ["flash-attn>=1.0.2"]
class TensorFlowBuilder(FrameworkBuilderBase):
def cmake_flags(self):
p = [d for d in sys.path if 'dist-packages' in d][0]
return ["-DENABLE_TENSORFLOW=ON", "-DCMAKE_PREFIX_PATH="+p]
def run(self, extensions):
print("Building TensorFlow extensions!")
class JaxBuilder(FrameworkBuilderBase):
def cmake_flags(self):
p = [d for d in sys.path if 'dist-packages' in d][0]
return ["-DENABLE_JAX=ON", "-DCMAKE_PREFIX_PATH="+p]
def run(self, extensions): # Call once in global scope since this function manipulates the
print("Building jax extensions!") # command-line arguments. Future calls will use a cached value.
with_debug_build()
def install_requires(): def found_cmake() -> bool:
# TODO: find a way to install pybind11 and ninja directly. """"Check if valid CMake is available
return ['cmake', 'flax']
ext_modules = [] CMake 3.18 or newer is required.
dlfw_builder_funcs = []
ext_modules.append( """
CMakeExtension(
name="transformer_engine",
cmake_path=os.path.join(path, "transformer_engine"),
sources=[],
include_dirs=include_dirs,
)
)
if framework in ("all", "pytorch"): # Check if CMake is available
from torch.utils.cpp_extension import CUDAExtension try:
ext_modules.append( _cmake_bin = cmake_bin()
CUDAExtension( except FileNotFoundError:
name="transformer_engine_extensions", return False
sources=supported_frameworks[framework],
extra_compile_args={ # Query CMake for version info
"cxx": ["-O3"], output = subprocess.run(
"nvcc": append_nvcc_threads(extra_compiler_flags() + cc_flag), [_cmake_bin, "--version"],
}, capture_output=True,
include_dirs=include_dirs, check=True,
) universal_newlines=True,
) )
dlfw_builder_funcs.append(PyTorchBuilder) match = re.search(r"version\s*([\d.]+)", output.stdout)
version = match.group(1).split('.')
if framework in ("all", "jax"): version = tuple(int(v) for v in version)
dlfw_builder_funcs.append(JaxBuilder) return version >= (3, 18)
# Trigger a better error when pybind11 isn't present.
# Sadly, if pybind11 was installed with `apt -y install pybind11-dev`
# This doesn't install a python packages. So the line bellow is too strict.
# When it fail, we need to detect if cmake will find pybind11.
# import pybind11
if framework in ("all", "tensorflow"): def cmake_bin() -> Path:
dlfw_builder_funcs.append(TensorFlowBuilder) """Get CMake executable
dlfw_install_requires = ['pydantic'] Throws FileNotFoundError if not found.
for builder in dlfw_builder_funcs:
dlfw_install_requires = dlfw_install_requires + builder.install_requires()
"""
def get_cmake_bin(): # Search in CMake Python package
cmake_bin = "cmake" _cmake_bin: Optional[Path] = None
try: try:
out = subprocess.check_output([cmake_bin, "--version"]) import cmake
except OSError: except ImportError:
cmake_installed_version = packaging.version.Version("0.0") pass
else: else:
cmake_installed_version = packaging.version.Version( cmake_dir = Path(cmake.__file__).resolve().parent
re.search(r"version\s*([\d.]+)", out.decode()).group(1) _cmake_bin = cmake_dir / "data" / "bin" / "cmake"
) if not _cmake_bin.is_file():
_cmake_bin = None
# Search in path
if _cmake_bin is None:
_cmake_bin = shutil.which("cmake")
if _cmake_bin is not None:
_cmake_bin = Path(_cmake_bin).resolve()
# Return executable if found
if _cmake_bin is None:
raise FileNotFoundError("Could not find CMake executable")
return _cmake_bin
def found_ninja() -> bool:
""""Check if Ninja is available"""
return shutil.which("ninja") is not None
def found_pybind11() -> bool:
""""Check if pybind11 is available"""
# Check if Python package is installed
try:
import pybind11
except ImportError:
pass
else:
return True
if cmake_installed_version < packaging.version.Version("3.18.0"): # Check if CMake can find pybind11
print( if not found_cmake():
"Could not find a recent CMake to build Transformer Engine. " return False
"Attempting to install CMake 3.18 to a temporary location via pip.",
flush=True,
)
cmake_temp_dir = tempfile.TemporaryDirectory(prefix="nvte-cmake-tmp")
atexit.register(cmake_temp_dir.cleanup)
try: try:
_ = subprocess.check_output( subprocess.run(
["pip", "install", "--target", cmake_temp_dir.name, "cmake~=3.18.0"] [
) "cmake",
except Exception: "--find-package",
raise RuntimeError( "-DMODE=EXIST",
"Failed to install temporary CMake. " "-DNAME=pybind11",
"Please update your CMake to 3.18+." "-DCOMPILER_ID=CXX",
"-DLANGUAGE=CXX",
],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
check=True,
) )
cmake_bin = os.path.join(cmake_temp_dir.name, "bin", "run_cmake") except (CalledProcessError, OSError):
with io.open(cmake_bin, "w") as f_run_cmake: pass
f_run_cmake.write( else:
f"#!/bin/sh\nPYTHONPATH={cmake_temp_dir.name} {os.path.join(cmake_temp_dir.name, 'bin', 'cmake')} \"$@\"" return True
return False
def cuda_version() -> Tuple[int, ...]:
"""CUDA Toolkit version as a (major, minor) tuple
Throws FileNotFoundError if NVCC is not found.
"""
# Try finding NVCC
nvcc_bin: Optional[Path] = None
if nvcc_bin is None and os.getenv("CUDA_HOME"):
# Check in CUDA_HOME
cuda_home = Path(os.getenv("CUDA_HOME"))
nvcc_bin = cuda_home / "bin" / "nvcc"
if nvcc_bin is None:
# Check if nvcc is in path
nvcc_bin = shutil.which("nvcc")
if nvcc_bin is not None:
nvcc_bin = Path(nvcc_bin)
if nvcc_bin is None:
# Last-ditch guess in /usr/local/cuda
cuda_home = Path("/usr/local/cuda")
nvcc_bin = cuda_home / "bin" / "nvcc"
if not nvcc_bin.is_file():
raise FileNotFoundError(f"Could not find NVCC at {nvcc_bin}")
# Query NVCC for version info
output = subprocess.run(
[nvcc_bin, "-V"],
capture_output=True,
check=True,
universal_newlines=True,
) )
os.chmod(cmake_bin, 0o755) match = re.search(r"release\s*([\d.]+)", output.stdout)
version = match.group(1).split('.')
return cmake_bin return tuple(int(v) for v in version)
@lru_cache(maxsize=1)
class CMakeBuildExtension(build_ext, object): def with_userbuffers() -> bool:
def __init__(self, *args, **kwargs) -> None: """Check if userbuffers support is enabled"""
self.dlfw_flags = kwargs["dlfw_flags"] if int(os.getenv("NVTE_WITH_USERBUFFERS", "0")):
super(CMakeBuildExtension, self).__init__(*args, **kwargs) assert os.getenv("MPI_HOME"), \
"MPI_HOME must be set if NVTE_WITH_USERBUFFERS=1"
def build_extensions(self) -> None: return True
print("Building CMake extensions!") return False
cmake_bin = get_cmake_bin() @lru_cache(maxsize=1)
config = "Debug" if self.debug else "Release" def frameworks() -> List[str]:
"""DL frameworks to build support for"""
ext_name = self.extensions[0].name _frameworks: List[str] = []
build_dir = self.get_ext_fullpath(ext_name).replace( supported_frameworks = ["pytorch", "jax", "tensorflow"]
self.get_ext_filename(ext_name), ""
# Check environment variable
if os.getenv("NVTE_FRAMEWORK"):
_frameworks.extend(os.getenv("NVTE_FRAMEWORK").split(","))
# Check command-line arguments
for arg in sys.argv.copy():
if arg.startswith("--framework="):
_frameworks.extend(arg.replace("--framework=", "").split(","))
sys.argv.remove(arg)
# Detect installed frameworks if not explicitly specified
if not _frameworks:
try:
import torch
except ImportError:
pass
else:
_frameworks.append("pytorch")
try:
import jax
except ImportError:
pass
else:
_frameworks.append("jax")
try:
import tensorflow
except ImportError:
pass
else:
_frameworks.append("tensorflow")
# Special framework names
if "all" in _frameworks:
_frameworks = supported_frameworks.copy()
if "none" in _frameworks:
_frameworks = []
# Check that frameworks are valid
_frameworks = [framework.lower() for framework in _frameworks]
for framework in _frameworks:
if framework not in supported_frameworks:
raise ValueError(
f"Transformer Engine does not support framework={framework}"
) )
build_dir = os.path.abspath(build_dir)
cmake_args = [ return _frameworks
"-DCMAKE_BUILD_TYPE=" + config,
"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}".format(config.upper(), build_dir), # Call once in global scope since this function manipulates the
# command-line arguments. Future calls will use a cached value.
frameworks()
def setup_requirements() -> Tuple[List[str], List[str], List[str]]:
"""Setup Python dependencies
Returns dependencies for build, runtime, and testing.
"""
# Common requirements
setup_reqs: List[str] = []
install_reqs: List[str] = ["pydantic"]
test_reqs: List[str] = ["pytest"]
def add_unique(l: List[str], vals: Union[str, List[str]]) -> None:
"""Add entry to list if not already included"""
if isinstance(vals, str):
vals = [vals]
for val in vals:
if val not in l:
l.append(val)
# Requirements that may be installed outside of Python
if not found_cmake():
add_unique(setup_reqs, "cmake>=3.18")
if not found_ninja():
add_unique(setup_reqs, "ninja")
# Framework-specific requirements
if "pytorch" in frameworks():
add_unique(install_reqs, ["torch", "flash-attn>=1.0.2"])
add_unique(test_reqs, ["numpy", "onnxruntime", "torchvision"])
if "jax" in frameworks():
if not found_pybind11():
add_unique(setup_reqs, "pybind11")
add_unique(install_reqs, ["jax", "flax"])
add_unique(test_reqs, ["numpy", "praxis"])
if "tensorflow" in frameworks():
if not found_pybind11():
add_unique(setup_reqs, "pybind11")
add_unique(install_reqs, "tensorflow")
add_unique(test_reqs, ["keras", "tensorflow_datasets"])
return setup_reqs, install_reqs, test_reqs
class CMakeExtension(setuptools.Extension):
"""CMake extension module"""
def __init__(
self,
name: str,
cmake_path: Path,
cmake_flags: Optional[List[str]] = None,
) -> None:
super().__init__(name, sources=[]) # No work for base class
self.cmake_path: Path = cmake_path
self.cmake_flags: List[str] = [] if cmake_flags is None else cmake_flags
def _build_cmake(self, build_dir: Path, install_dir: Path) -> None:
# Make sure paths are str
_cmake_bin = str(cmake_bin())
cmake_path = str(self.cmake_path)
build_dir = str(build_dir)
install_dir = str(install_dir)
# CMake configure command
build_type = "Debug" if with_debug_build() else "Release"
configure_command = [
_cmake_bin,
"-S",
cmake_path,
"-B",
build_dir,
f"-DCMAKE_BUILD_TYPE={build_type}",
f"-DCMAKE_INSTALL_PREFIX={install_dir}",
] ]
configure_command += self.cmake_flags
if found_ninja():
configure_command.append("-GNinja")
try: try:
import ninja import pybind11
except ImportError: except ImportError:
pass pass
else: else:
cmake_args.append("-GNinja") pybind11_dir = Path(pybind11.__file__).resolve().parent
pybind11_dir = pybind11_dir / "share" / "cmake" / "pybind11"
configure_command.append(f"-Dpybind11_DIR={pybind11_dir}")
cmake_args = cmake_args + self.dlfw_flags # CMake build and install commands
build_command = [_cmake_bin, "--build", build_dir]
install_command = [_cmake_bin, "--install", build_dir]
cmake_build_args = ["--config", config] # Run CMake commands
for command in [configure_command, build_command, install_command]:
print(f"Running command {' '.join(command)}")
try:
subprocess.run(command, cwd=build_dir, check=True)
except (CalledProcessError, OSError) as e:
raise RuntimeError(f"Error when running CMake: {e}")
cmake_build_dir = os.path.join(self.build_temp, config)
if not os.path.exists(cmake_build_dir):
os.makedirs(cmake_build_dir)
config_and_build_commands = [ # PyTorch extension modules require special handling
[cmake_bin, self.extensions[0].cmake_path] + cmake_args, if "pytorch" in frameworks():
[cmake_bin, "--build", "."] + cmake_build_args, from torch.utils.cpp_extension import BuildExtension
] else:
from setuptools.command.build_ext import build_ext as BuildExtension
if True:
print(f"Running CMake in {cmake_build_dir}:")
for command in config_and_build_commands:
print(" ".join(command))
sys.stdout.flush()
# Config and build the extension class CMakeBuildExtension(BuildExtension):
try: """Setuptools command with support for CMake extension modules"""
for command in config_and_build_commands:
subprocess.check_call(command, cwd=cmake_build_dir)
except OSError as e:
raise RuntimeError("CMake failed: {}".format(str(e)))
class TEBuildExtension(build_ext, object):
def __init__(self, *args, **kwargs) -> None: def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
def run(self) -> None:
self.dlfw_builder = [] # Build CMake extensions
for functor in dlfw_builder_funcs: for ext in self.extensions:
self.dlfw_builder.append(functor(*args, **kwargs)) if isinstance(ext, CMakeExtension):
print(f"Building CMake extension {ext.name}")
with tempfile.TemporaryDirectory() as build_dir:
build_dir = Path(build_dir)
package_path = Path(self.get_ext_fullpath(ext.name))
install_dir = package_path.resolve().parent
ext._build_cmake(
build_dir=build_dir,
install_dir=install_dir,
)
# Build non-CMake extensions as usual
all_extensions = self.extensions
self.extensions = [
ext for ext in self.extensions
if not isinstance(ext, CMakeExtension)
]
super().run()
self.extensions = all_extensions
def setup_common_extension() -> CMakeExtension:
"""Setup CMake extension for common library
Also builds JAX, TensorFlow, and userbuffers support if needed.
"""
cmake_flags = []
if "jax" in frameworks():
cmake_flags.append("-DENABLE_JAX=ON")
if "tensorflow" in frameworks():
cmake_flags.append("-DENABLE_TENSORFLOW=ON")
if with_userbuffers():
cmake_flags.append("-DNVTE_WITH_USERBUFFERS=ON")
return CMakeExtension(
name="transformer_engine",
cmake_path=root_path / "transformer_engine",
cmake_flags=cmake_flags,
)
flags = [] def setup_pytorch_extension() -> setuptools.Extension:
if NVTE_WITH_USERBUFFERS: """Setup CUDA extension for PyTorch support"""
flags.append('-DNVTE_WITH_USERBUFFERS=ON')
for builder in self.dlfw_builder: # Source files
flags = flags + builder.cmake_flags() src_dir = root_path / "transformer_engine" / "pytorch" / "csrc"
sources = [
src_dir / "extensions.cu",
src_dir / "common.cu",
src_dir / "ts_fp8_op.cpp",
]
# Header files
include_dirs = [
root_path / "transformer_engine" / "common" / "include",
root_path / "transformer_engine" / "pytorch" / "csrc",
root_path / "3rdparty" / "cudnn-frontend" / "include",
]
cmake_args = copy.deepcopy(args) # Compiler flags
cmake_kwargs = copy.deepcopy(kwargs) cxx_flags = ["-O3"]
cmake_kwargs["dlfw_flags"] = flags nvcc_flags = [
self.cmake_build_extensions = CMakeBuildExtension(*cmake_args, **cmake_kwargs) "-O3",
"-gencode",
"arch=compute_70,code=sm_70",
"-U__CUDA_NO_HALF_OPERATORS__",
"-U__CUDA_NO_HALF_CONVERSIONS__",
"-U__CUDA_NO_BFLOAT16_OPERATORS__",
"-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
"-U__CUDA_NO_BFLOAT162_OPERATORS__",
"-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
"--expt-relaxed-constexpr",
"--expt-extended-lambda",
"--use_fast_math",
]
self.all_outputs = None # Version-dependent CUDA options
super(TEBuildExtension, self).__init__(*args, **kwargs) try:
version = cuda_version()
except FileNotFoundError:
print("Could not determine CUDA Toolkit version")
else:
if version >= (11, 2):
nvcc_flags.extend(["--threads", "4"])
if version >= (11, 0):
nvcc_flags.extend(["-gencode", "arch=compute_80,code=sm_80"])
if version >= (11, 8):
nvcc_flags.extend(["-gencode", "arch=compute_90,code=sm_90"])
# userbuffers support
if with_userbuffers():
if os.getenv("MPI_HOME"):
mpi_home = Path(os.getenv("MPI_HOME"))
include_dirs.append(mpi_home / "include")
cxx_flags.append("-DNVTE_WITH_USERBUFFERS")
nvcc_flags.append("-DNVTE_WITH_USERBUFFERS")
# Construct PyTorch CUDA extension
sources = [str(path) for path in sources]
include_dirs = [str(path) for path in include_dirs]
from torch.utils.cpp_extension import CUDAExtension
return CUDAExtension(
name="transformer_engine_extensions",
sources=sources,
include_dirs=include_dirs,
# libraries=["transformer_engine"], ### TODO (tmoon) Debug linker errors
extra_compile_args={
"cxx": cxx_flags,
"nvcc": nvcc_flags,
},
)
def initialize_options(self):
self.cmake_build_extensions.initialize_options()
for builder in self.dlfw_builder:
builder.initialize_options()
super(TEBuildExtension, self).initialize_options()
def finalize_options(self): def main():
self.cmake_build_extensions.finalize_options()
for builder in self.dlfw_builder:
builder.finalize_options()
super(TEBuildExtension, self).finalize_options()
def run(self) -> None: # Submodules to install
old_inplace, self.inplace = self.inplace, 0 packages = setuptools.find_packages(
cmake_ext = [ext for ext in self.extensions if isinstance(ext, CMakeExtension)] include=["transformer_engine", "transformer_engine.*"],
self.cmake_build_extensions.extensions = cmake_ext
self.cmake_build_extensions.run()
for builder in self.dlfw_builder:
builder.run(self.extensions)
self.all_outputs = []
for f in os.scandir(self.build_lib):
if f.is_file():
self.all_outputs.append(f.path)
self.inplace = old_inplace
if old_inplace:
self.copy_extensions_to_source()
def copy_extensions_to_source(self):
ext = self.extensions[0]
build_py = self.get_finalized_command("build_py")
fullname = self.get_ext_fullname(ext.name)
modpath = fullname.split(".")
package = ".".join(modpath[:-1])
package_dir = build_py.get_package_dir(package)
for f in os.scandir(self.build_lib):
if f.is_file():
src_filename = f.path
dest_filename = os.path.join(
package_dir, os.path.basename(src_filename)
) )
# Always copy, even if source is older than destination, to ensure
# that the right extensions for the current Python/platform are
# used.
copyfile(src_filename, dest_filename)
def get_outputs(self): # Dependencies
return self.all_outputs setup_requires, install_requires, test_requires = setup_requirements()
# Extensions
ext_modules = [setup_common_extension()]
if "pytorch" in frameworks():
ext_modules.append(setup_pytorch_extension())
setup( # Configure package
setuptools.setup(
name="transformer_engine", name="transformer_engine",
version=te_version, version=te_version(),
packages=find_packages( packages=packages,
exclude=(
"build",
"csrc",
"include",
"tests",
"dist",
"docs",
"tests",
"examples",
"transformer_engine.egg-info",
)
),
description="Transformer acceleration library", description="Transformer acceleration library",
ext_modules=ext_modules, ext_modules=ext_modules,
cmdclass={"build_ext": TEBuildExtension}, cmdclass={"build_ext": CMakeBuildExtension},
install_requires=dlfw_install_requires, setup_requires=setup_requires,
extras_require={ install_requires=install_requires,
'test': ['pytest', extras_require={"test": test_requires},
'tensorflow_datasets'],
'test_pytest': ['onnxruntime',],
},
license_files=("LICENSE",), license_files=("LICENSE",),
) )
if __name__ == "__main__":
main()
...@@ -2,11 +2,5 @@ ...@@ -2,11 +2,5 @@
# #
# See LICENSE for license information. # See LICENSE for license information.
try: import transformer_engine.jax
import transformer_engine.jax
te_imported = True
except:
te_imported = False
assert te_imported, 'transformer_engine import failed'
print("OK") print("OK")
...@@ -2,11 +2,5 @@ ...@@ -2,11 +2,5 @@
# #
# See LICENSE for license information. # See LICENSE for license information.
try: import transformer_engine.pytorch
import transformer_engine.pytorch
te_imported = True
except:
te_imported = False
assert te_imported, 'transformer_engine import failed'
print("OK") print("OK")
...@@ -2,11 +2,5 @@ ...@@ -2,11 +2,5 @@
# #
# See LICENSE for license information. # See LICENSE for license information.
try: import transformer_engine.tensorflow
import transformer_engine.tensorflow
te_imported = True
except:
te_imported = False
assert te_imported, 'transformer_engine import failed'
print("OK") print("OK")
...@@ -28,16 +28,20 @@ include_directories(${PROJECT_SOURCE_DIR}) ...@@ -28,16 +28,20 @@ include_directories(${PROJECT_SOURCE_DIR})
add_subdirectory(common) add_subdirectory(common)
if(NVTE_WITH_USERBUFFERS) if(NVTE_WITH_USERBUFFERS)
message(STATUS "userbuffers support enabled")
add_subdirectory(pytorch/csrc/userbuffers) add_subdirectory(pytorch/csrc/userbuffers)
endif() endif()
option(ENABLE_JAX "Enable JAX in the building workflow." OFF) option(ENABLE_JAX "Enable JAX in the building workflow." OFF)
message(STATUS "JAX support: ${ENABLE_JAX}")
if(ENABLE_JAX) if(ENABLE_JAX)
find_package(pybind11 CONFIG REQUIRED) find_package(pybind11 CONFIG REQUIRED)
add_subdirectory(jax) add_subdirectory(jax)
endif() endif()
option(ENABLE_TENSORFLOW "Enable TensorFlow in the building workflow." OFF) option(ENABLE_TENSORFLOW "Enable TensorFlow in the building workflow." OFF)
message(STATUS "TensorFlow support: ${ENABLE_TENSORFLOW}")
if(ENABLE_TENSORFLOW) if(ENABLE_TENSORFLOW)
find_package(pybind11 CONFIG REQUIRED) find_package(pybind11 CONFIG REQUIRED)
add_subdirectory(tensorflow) add_subdirectory(tensorflow)
......
# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
add_library(CUDNN::cudnn_all INTERFACE IMPORTED) add_library(CUDNN::cudnn_all INTERFACE IMPORTED)
find_path( find_path(
...@@ -75,4 +79,3 @@ target_link_libraries( ...@@ -75,4 +79,3 @@ target_link_libraries(
CUDNN::cudnn_ops_infer CUDNN::cudnn_ops_infer
CUDNN::cudnn CUDNN::cudnn
) )
...@@ -77,3 +77,6 @@ set_source_files_properties(fused_softmax/scaled_masked_softmax.cu ...@@ -77,3 +77,6 @@ set_source_files_properties(fused_softmax/scaled_masked_softmax.cu
COMPILE_OPTIONS "--use_fast_math") COMPILE_OPTIONS "--use_fast_math")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3")
# Install library
install(TARGETS transformer_engine DESTINATION .)
...@@ -10,3 +10,4 @@ pybind11_add_module( ...@@ -10,3 +10,4 @@ pybind11_add_module(
) )
target_link_libraries(transformer_engine_jax PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt transformer_engine) target_link_libraries(transformer_engine_jax PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt transformer_engine)
install(TARGETS transformer_engine_jax DESTINATION .)
...@@ -31,3 +31,6 @@ set_source_files_properties(userbuffers.cu ...@@ -31,3 +31,6 @@ set_source_files_properties(userbuffers.cu
COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CUDA>:-maxrregcount=64>") COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CUDA>:-maxrregcount=64>")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3")
# Install library
install(TARGETS transformer_engine_userbuffers DESTINATION .)
...@@ -40,3 +40,7 @@ target_link_libraries(_get_stream PRIVATE ${TF_LINKER_LIBS}) ...@@ -40,3 +40,7 @@ target_link_libraries(_get_stream PRIVATE ${TF_LINKER_LIBS})
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3")
# Install library
install(TARGETS transformer_engine_tensorflow DESTINATION .)
install(TARGETS _get_stream DESTINATION .)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment