Unverified commit 00328ac7, authored by Kirthi Shankar Sivamani, committed by GitHub
Browse files

Build support for cuda 13 (#1809)



* Build support for cuda 13
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* Fix build for cudnn 8.9*; cuda 12.1
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* Re-add include
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

---------
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
parent 6262280e
...@@ -242,9 +242,12 @@ def get_cuda_include_dirs() -> Tuple[str, str]: ...@@ -242,9 +242,12 @@ def get_cuda_include_dirs() -> Tuple[str, str]:
def cuda_archs() -> str: def cuda_archs() -> str:
version = cuda_version() version = cuda_version()
if os.getenv("NVTE_CUDA_ARCHS") is None: if os.getenv("NVTE_CUDA_ARCHS") is None:
os.environ["NVTE_CUDA_ARCHS"] = ( if version >= (13, 0):
"70;80;89;90;100;120" if version >= (12, 8) else "70;80;89;90" os.environ["NVTE_CUDA_ARCHS"] = "75;80;89;90;100;120"
) elif version >= (12, 8):
os.environ["NVTE_CUDA_ARCHS"] = "70;80;89;90;100;120"
else:
os.environ["NVTE_CUDA_ARCHS"] = "70;80;89;90"
return os.getenv("NVTE_CUDA_ARCHS") return os.getenv("NVTE_CUDA_ARCHS")
......
...@@ -22,6 +22,7 @@ from build_tools.utils import ( ...@@ -22,6 +22,7 @@ from build_tools.utils import (
get_frameworks, get_frameworks,
install_and_import, install_and_import,
remove_dups, remove_dups,
cuda_toolkit_include_path,
) )
frameworks = get_frameworks() frameworks = get_frameworks()
...@@ -88,7 +89,10 @@ def setup_requirements() -> Tuple[List[str], List[str], List[str]]: ...@@ -88,7 +89,10 @@ def setup_requirements() -> Tuple[List[str], List[str], List[str]]:
""" """
# Common requirements # Common requirements
setup_reqs: List[str] = [ setup_reqs: List[str] = []
if cuda_toolkit_include_path() is None:
setup_reqs.extend(
[
"nvidia-cuda-runtime-cu12", "nvidia-cuda-runtime-cu12",
"nvidia-cublas-cu12", "nvidia-cublas-cu12",
"nvidia-cudnn-cu12", "nvidia-cudnn-cu12",
...@@ -97,6 +101,7 @@ def setup_requirements() -> Tuple[List[str], List[str], List[str]]: ...@@ -97,6 +101,7 @@ def setup_requirements() -> Tuple[List[str], List[str], List[str]]:
"nvidia-nvtx-cu12", "nvidia-nvtx-cu12",
"nvidia-cuda-nvrtc-cu12", "nvidia-cuda-nvrtc-cu12",
] ]
)
install_reqs: List[str] = [ install_reqs: List[str] = [
"pydantic", "pydantic",
"importlib-metadata>=1.0", "importlib-metadata>=1.0",
......
...@@ -6,7 +6,9 @@ cmake_minimum_required(VERSION 3.21) ...@@ -6,7 +6,9 @@ cmake_minimum_required(VERSION 3.21)
# Language options # Language options
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.8) if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL 13.0)
set(CMAKE_CUDA_ARCHITECTURES 75 80 89 90 100 120)
elseif (CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.8)
set(CMAKE_CUDA_ARCHITECTURES 70 80 89 90 100 120) set(CMAKE_CUDA_ARCHITECTURES 70 80 89 90 100 120)
else () else ()
set(CMAKE_CUDA_ARCHITECTURES 70 80 89 90) set(CMAKE_CUDA_ARCHITECTURES 70 80 89 90)
......
...@@ -132,9 +132,10 @@ void create_2D_tensor_map(CUtensorMap &tensorMap, const SimpleTensor &tensor, ...@@ -132,9 +132,10 @@ void create_2D_tensor_map(CUtensorMap &tensorMap, const SimpleTensor &tensor,
const uint32_t shmemX, const uint32_t stride_elems, const uint32_t shmemX, const uint32_t stride_elems,
const uint32_t offset_elems, const size_t type_size) { const uint32_t offset_elems, const size_t type_size) {
// Get a function pointer to the cuTensorMapEncodeTiled driver API // Get a function pointer to the cuTensorMapEncodeTiled driver API
static PFN_cuTensorMapEncodeTiled cuDriverTensorMapEncodeTiled = []() { // Note: PFN_cuTensorMapEncodeTiled is not defined in cuda13
static PFN_cuTensorMapEncodeTiled_v12000 cuDriverTensorMapEncodeTiled = []() {
void *driver_ptr = cuda_driver::get_symbol("cuTensorMapEncodeTiled"); void *driver_ptr = cuda_driver::get_symbol("cuTensorMapEncodeTiled");
return reinterpret_cast<PFN_cuTensorMapEncodeTiled>(driver_ptr); return reinterpret_cast<PFN_cuTensorMapEncodeTiled_v12000>(driver_ptr);
}(); }();
// rank is the number of dimensions of the array // rank is the number of dimensions of the array
constexpr uint32_t rank = 2; constexpr uint32_t rank = 2;
......
...@@ -493,7 +493,8 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD, ...@@ -493,7 +493,8 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD,
NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE,
&epilogue, sizeof(epilogue))); &epilogue, sizeof(epilogue)));
#if CUDA_VERSION >= 12020 && CUBLAS_VERSION >= 120205 #if CUDA_VERSION >= 12020 && CUBLAS_VERSION >= 120205 && CUDA_VERSION < 13000 && \
CUBLAS_VERSION < 130000
if (counter != nullptr) { if (counter != nullptr) {
if (m_split == 0) m_split = 1; if (m_split == 0) m_split = 1;
if (n_split == 0) n_split = 1; if (n_split == 0) n_split = 1;
...@@ -609,8 +610,10 @@ void nvte_cublas_atomic_gemm(const NVTETensor A, const NVTETensor B, NVTETensor ...@@ -609,8 +610,10 @@ void nvte_cublas_atomic_gemm(const NVTETensor A, const NVTETensor B, NVTETensor
int cudart_version; int cudart_version;
NVTE_CHECK_CUDA(cudaRuntimeGetVersion(&cudart_version)); NVTE_CHECK_CUDA(cudaRuntimeGetVersion(&cudart_version));
NVTE_CHECK(cudart_version >= 12020, "Cuda version 12.2 is required for atomic gemm."); NVTE_CHECK(cudart_version >= 12020 && cudart_version < 13000,
NVTE_CHECK(cublasLtGetVersion() >= 120205, "Cublas version 12.2.5 is required for atomic gemm."); "Cuda version >=12.2 and <13.0 is required for atomic gemm.");
NVTE_CHECK(cublasLtGetVersion() >= 120205 && cublasLtGetVersion() < 130000,
"Cublas version >=12.2.5 and <13.0 is required for atomic gemm.");
using namespace transformer_engine; using namespace transformer_engine;
const Tensor *inputA = reinterpret_cast<const Tensor *>(A); const Tensor *inputA = reinterpret_cast<const Tensor *>(A);
......
...@@ -44,7 +44,7 @@ if bool(int(os.getenv("NVTE_RELEASE_BUILD", "0"))) or os.path.isdir(build_tools_ ...@@ -44,7 +44,7 @@ if bool(int(os.getenv("NVTE_RELEASE_BUILD", "0"))) or os.path.isdir(build_tools_
from build_tools.build_ext import get_build_ext from build_tools.build_ext import get_build_ext
from build_tools.utils import copy_common_headers, install_and_import from build_tools.utils import copy_common_headers, install_and_import, cuda_toolkit_include_path
from build_tools.te_version import te_version from build_tools.te_version import te_version
from build_tools.jax import setup_jax_extension from build_tools.jax import setup_jax_extension
...@@ -94,16 +94,10 @@ if __name__ == "__main__": ...@@ -94,16 +94,10 @@ if __name__ == "__main__":
) )
] ]
# Configure package setup_requires = ["jax[cuda12]", "flax>=0.7.1"]
setuptools.setup( if cuda_toolkit_include_path() is None:
name="transformer_engine_jax", setup_requires.extend(
version=te_version(), [
description="Transformer acceleration library - Jax Lib",
ext_modules=ext_modules,
cmdclass={"build_ext": CMakeBuildExtension},
setup_requires=[
"jax[cuda12]",
"flax>=0.7.1",
"nvidia-cuda-runtime-cu12", "nvidia-cuda-runtime-cu12",
"nvidia-cublas-cu12", "nvidia-cublas-cu12",
"nvidia-cudnn-cu12", "nvidia-cudnn-cu12",
...@@ -111,7 +105,17 @@ if __name__ == "__main__": ...@@ -111,7 +105,17 @@ if __name__ == "__main__":
"nvidia-cuda-nvcc-cu12", "nvidia-cuda-nvcc-cu12",
"nvidia-nvtx-cu12", "nvidia-nvtx-cu12",
"nvidia-cuda-nvrtc-cu12", "nvidia-cuda-nvrtc-cu12",
], ]
)
# Configure package
setuptools.setup(
name="transformer_engine_jax",
version=te_version(),
description="Transformer acceleration library - Jax Lib",
ext_modules=ext_modules,
cmdclass={"build_ext": CMakeBuildExtension},
setup_requires=setup_requires,
install_requires=["jax", "flax>=0.7.1"], install_requires=["jax", "flax>=0.7.1"],
tests_require=["numpy"], tests_require=["numpy"],
) )
......
...@@ -29,7 +29,7 @@ if bool(int(os.getenv("NVTE_RELEASE_BUILD", "0"))) or os.path.isdir(build_tools_ ...@@ -29,7 +29,7 @@ if bool(int(os.getenv("NVTE_RELEASE_BUILD", "0"))) or os.path.isdir(build_tools_
from build_tools.build_ext import get_build_ext from build_tools.build_ext import get_build_ext
from build_tools.utils import copy_common_headers from build_tools.utils import copy_common_headers, cuda_toolkit_include_path
from build_tools.te_version import te_version from build_tools.te_version import te_version
from build_tools.pytorch import setup_pytorch_extension from build_tools.pytorch import setup_pytorch_extension
...@@ -48,15 +48,10 @@ if __name__ == "__main__": ...@@ -48,15 +48,10 @@ if __name__ == "__main__":
) )
] ]
# Configure package setup_requires = ["torch>=2.1"]
setuptools.setup( if cuda_toolkit_include_path() is None:
name="transformer_engine_torch", setup_requires.extend(
version=te_version(), [
description="Transformer acceleration library - Torch Lib",
ext_modules=ext_modules,
cmdclass={"build_ext": CMakeBuildExtension},
setup_requires=[
"torch>=2.1",
"nvidia-cuda-runtime-cu12", "nvidia-cuda-runtime-cu12",
"nvidia-cublas-cu12", "nvidia-cublas-cu12",
"nvidia-cudnn-cu12", "nvidia-cudnn-cu12",
...@@ -64,7 +59,17 @@ if __name__ == "__main__": ...@@ -64,7 +59,17 @@ if __name__ == "__main__":
"nvidia-cuda-nvcc-cu12", "nvidia-cuda-nvcc-cu12",
"nvidia-nvtx-cu12", "nvidia-nvtx-cu12",
"nvidia-cuda-nvrtc-cu12", "nvidia-cuda-nvrtc-cu12",
], ]
)
# Configure package
setuptools.setup(
name="transformer_engine_torch",
version=te_version(),
description="Transformer acceleration library - Torch Lib",
ext_modules=ext_modules,
cmdclass={"build_ext": CMakeBuildExtension},
setup_requires=setup_requires,
install_requires=["torch>=2.1"], install_requires=["torch>=2.1"],
tests_require=["numpy", "torchvision"], tests_require=["numpy", "torchvision"],
) )
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment