"...git@developer.sourcefind.cn:OpenDAS/TransformerEngine.git" did not exist on "77a83c1047ca09d535e2e1fe7581c45d1064fd2c"
Commit 33062330 authored by wenjh's avatar wenjh
Browse files

Merge branch 'develop_v2.9' into release_v2.9

parents adb1e9c5 b3dcfc28
...@@ -101,9 +101,9 @@ struct BitsNumber { ...@@ -101,9 +101,9 @@ struct BitsNumber {
template <typename T> template <typename T>
struct TypeInfo { struct TypeInfo {
#if FP4_TYPE_SUPPORTED #if FP4_TYPE_SUPPORTED
using types = std::tuple<byte, int16, int32, int64, fp32, fp16, bf16, fp8e4m3, fp8e5m2, fp8e8m0, fp4e2m1, int8>; using types = std::tuple<byte, int16, int32, int64, fp32, fp16, bf16, fp8e4m3, fp8e5m2, int8, fp8e8m0, fp4e2m1>;
#else #else
using types = std::tuple<byte, int16, int32, int64, fp32, fp16, bf16, fp8e4m3, fp8e5m2, fp8e8m0, int8>; using types = std::tuple<byte, int16, int32, int64, fp32, fp16, bf16, fp8e4m3, fp8e5m2, int8, fp8e8m0>;
#endif #endif
template <typename U, DType current> template <typename U, DType current>
......
...@@ -155,26 +155,28 @@ set(CUTLASS_TOOLS_INCLUDE_DIR ...@@ -155,26 +155,28 @@ set(CUTLASS_TOOLS_INCLUDE_DIR
# Python # Python
find_package(Python COMPONENTS Interpreter Development.Module REQUIRED) find_package(Python COMPONENTS Interpreter Development.Module REQUIRED)
# NVIDIA MathDX include directory (from Python package install location) if(USE_CUDA)
if(NOT DEFINED MATHDX_INCLUDE_DIR) # NVIDIA MathDX include directory (from Python package install location)
execute_process( if(NOT DEFINED MATHDX_INCLUDE_DIR)
COMMAND ${Python_EXECUTABLE} -m pip show nvidia-mathdx execute_process(
OUTPUT_VARIABLE _PIP_SHOW_MATHDX COMMAND ${Python_EXECUTABLE} -m pip show nvidia-mathdx
ERROR_VARIABLE _PIP_SHOW_MATHDX_ERR OUTPUT_VARIABLE _PIP_SHOW_MATHDX
RESULT_VARIABLE _PIP_SHOW_MATHDX_RES ERROR_VARIABLE _PIP_SHOW_MATHDX_ERR
OUTPUT_STRIP_TRAILING_WHITESPACE) RESULT_VARIABLE _PIP_SHOW_MATHDX_RES
if(NOT _PIP_SHOW_MATHDX_RES EQUAL 0) OUTPUT_STRIP_TRAILING_WHITESPACE)
message(FATAL_ERROR "Failed to query 'nvidia-mathdx' with pip (using ${Python_EXECUTABLE}): ${_PIP_SHOW_MATHDX_ERR}") if(NOT _PIP_SHOW_MATHDX_RES EQUAL 0)
message(FATAL_ERROR "Failed to query 'nvidia-mathdx' with pip (using ${Python_EXECUTABLE}): ${_PIP_SHOW_MATHDX_ERR}")
endif()
string(REGEX MATCH "Location: ([^\n\r]+)" _MATHDX_LOC_MATCH "${_PIP_SHOW_MATHDX}")
if(NOT _MATHDX_LOC_MATCH)
message(FATAL_ERROR "Could not parse installation location for 'nvidia-mathdx'. Output was:\n${_PIP_SHOW_MATHDX}")
endif()
set(MATHDX_LOCATION "${CMAKE_MATCH_1}")
set(MATHDX_INCLUDE_DIR "${MATHDX_LOCATION}/nvidia/mathdx/include")
endif() endif()
string(REGEX MATCH "Location: ([^\n\r]+)" _MATHDX_LOC_MATCH "${_PIP_SHOW_MATHDX}") if(NOT EXISTS "${MATHDX_INCLUDE_DIR}")
if(NOT _MATHDX_LOC_MATCH) message(FATAL_ERROR "MATHDX include directory not found at ${MATHDX_INCLUDE_DIR}. Set MATHDX_INCLUDE_DIR or ensure 'nvidia-mathdx' is installed for ${Python_EXECUTABLE}.")
message(FATAL_ERROR "Could not parse installation location for 'nvidia-mathdx'. Output was:\n${_PIP_SHOW_MATHDX}")
endif() endif()
set(MATHDX_LOCATION "${CMAKE_MATCH_1}")
set(MATHDX_INCLUDE_DIR "${MATHDX_LOCATION}/nvidia/mathdx/include")
endif()
if(NOT EXISTS "${MATHDX_INCLUDE_DIR}")
message(FATAL_ERROR "MATHDX include directory not found at ${MATHDX_INCLUDE_DIR}. Set MATHDX_INCLUDE_DIR or ensure 'nvidia-mathdx' is installed for ${Python_EXECUTABLE}.")
endif() endif()
# Configure Transformer Engine library # Configure Transformer Engine library
......
...@@ -417,11 +417,13 @@ struct BitsNumber { ...@@ -417,11 +417,13 @@ struct BitsNumber {
template <typename T> template <typename T>
struct TypeInfo { struct TypeInfo {
#if FP4_TYPE_SUPPORTED #if FP4_TYPE_SUPPORTED
using types = std::tuple<byte, int16, int32, int64, fp32, fp16, bf16, fp8e4m3, fp8e5m2, int8, fp4e2m1 using types = std::tuple<byte, int16, int32, int64, fp32, fp16, bf16, fp8e4m3, fp8e5m2, int8
#if CUDA_VERSION >= 12080 #if CUDA_VERSION >= 12080
, ,
fp8e8m0 fp8e8m0
#endif #endif
,
fp4e2m1
>; >;
#else #else
using types = std::tuple<byte, int16, int32, int64, fp32, fp16, bf16, fp8e4m3, fp8e5m2, int8 using types = std::tuple<byte, int16, int32, int64, fp32, fp16, bf16, fp8e4m3, fp8e5m2, int8
......
...@@ -1175,9 +1175,6 @@ void nvte_cublas_atomic_gemm(const NVTETensor A, const NVTETensor B, NVTETensor ...@@ -1175,9 +1175,6 @@ void nvte_cublas_atomic_gemm(const NVTETensor A, const NVTETensor B, NVTETensor
const Tensor *inputCounter = convertNVTETensor(counter); const Tensor *inputCounter = convertNVTETensor(counter);
Tensor *wspace = convertNVTETensor(workspace); Tensor *wspace = convertNVTETensor(workspace);
const void *alpha_ptr = GetScalarOne();
const void *beta_ptr = accumulate ? GetScalarOne() : GetScalarZero();
NVTE_CHECK(is_delayed_tensor_scaling(inputA->scaling_mode) && NVTE_CHECK(is_delayed_tensor_scaling(inputA->scaling_mode) &&
is_delayed_tensor_scaling(inputB->scaling_mode), is_delayed_tensor_scaling(inputB->scaling_mode),
"Atomic GEMM only supports delayed scaling."); "Atomic GEMM only supports delayed scaling.");
...@@ -1230,6 +1227,8 @@ void nvte_cublas_atomic_gemm(const NVTETensor A, const NVTETensor B, NVTETensor ...@@ -1230,6 +1227,8 @@ void nvte_cublas_atomic_gemm(const NVTETensor A, const NVTETensor B, NVTETensor
stream); stream);
} }
#else #else
const void *alpha_ptr = GetScalarOne();
const void *beta_ptr = accumulate ? GetScalarOne() : GetScalarZero();
cublas_gemm(inputA, inputB, outputD, biasTensor, outputGelu, (transa) ? CUBLAS_OP_T : CUBLAS_OP_N, cublas_gemm(inputA, inputB, outputD, biasTensor, outputGelu, (transa) ? CUBLAS_OP_T : CUBLAS_OP_N,
(transb) ? CUBLAS_OP_T : CUBLAS_OP_N, grad, wspace->data.dptr, wspace->data.shape[0], (transb) ? CUBLAS_OP_T : CUBLAS_OP_N, grad, wspace->data.dptr, wspace->data.shape[0],
alpha_ptr, beta_ptr, use_split_accumulator, math_sm_count, m_split, n_split, alpha_ptr, beta_ptr, use_split_accumulator, math_sm_count, m_split, n_split,
......
...@@ -14,8 +14,6 @@ ...@@ -14,8 +14,6 @@
#include <cuda_runtime_api.h> #include <cuda_runtime_api.h>
#include <stddef.h> #include <stddef.h>
#define TE_FP4_TYPE_SUPPORTED (CUDA_VERSION >= 12080)
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
...@@ -33,13 +31,9 @@ enum NVTEDType { ...@@ -33,13 +31,9 @@ enum NVTEDType {
kNVTEBFloat16 = 6, /*!< 16-bit bfloat (E8M7) */ kNVTEBFloat16 = 6, /*!< 16-bit bfloat (E8M7) */
kNVTEFloat8E4M3 = 7, /*!< 8-bit float (E4M3) */ kNVTEFloat8E4M3 = 7, /*!< 8-bit float (E4M3) */
kNVTEFloat8E5M2 = 8, /*!< 8-bit float (E5M2) */ kNVTEFloat8E5M2 = 8, /*!< 8-bit float (E5M2) */
kNVTEFloat8E8M0 = 9, /*!< 8-bit float (E8M0) */ kNVTEInt8 = 9, /*!< 8-bit integer */
#if TE_FP4_TYPE_SUPPORTED kNVTEFloat8E8M0 = 10, /*!< 8-bit float (E8M0) */
kNVTEFloat4E2M1 = 10, /*!< 4-bit float (E2M1) */ kNVTEFloat4E2M1 = 11, /*!< 4-bit float (E2M1) */
kNVTEInt8 = 11, /*!< 8-bit integer */
#else
kNVTEInt8 = 10, /*!< 8-bit integer */
#endif
kNVTENumTypes /*!< Number of supported types */ kNVTENumTypes /*!< Number of supported types */
}; };
...@@ -423,13 +417,9 @@ enum class DType { ...@@ -423,13 +417,9 @@ enum class DType {
kBFloat16 = 6, kBFloat16 = 6,
kFloat8E4M3 = 7, kFloat8E4M3 = 7,
kFloat8E5M2 = 8, kFloat8E5M2 = 8,
kFloat8E8M0 = 9, kInt8 = 9,
#if TE_FP4_TYPE_SUPPORTED kFloat8E8M0 = 10,
kFloat4E2M1 = 10, kFloat4E2M1 = 11,
kInt8 = 11,
#else
kInt8 = 10,
#endif
kNumTypes kNumTypes
}; };
...@@ -457,11 +447,7 @@ inline bool is_fp8_dtype(const DType t) { ...@@ -457,11 +447,7 @@ inline bool is_fp8_dtype(const DType t) {
* \param[in] DType TE Datatype of interest * \param[in] DType TE Datatype of interest
*/ */
inline bool is_fp4_dtype(const DType t) { inline bool is_fp4_dtype(const DType t) {
#if TE_FP4_TYPE_SUPPORTED
return t == DType::kFloat4E2M1; return t == DType::kFloat4E2M1;
#else
return false;
#endif
} }
/*! \brief Check if TE datatype is high precision (FP32, FP16, BF16) /*! \brief Check if TE datatype is high precision (FP32, FP16, BF16)
......
...@@ -5,13 +5,21 @@ ...@@ -5,13 +5,21 @@
************************************************************************/ ************************************************************************/
#include <cuda.h> #include <cuda.h>
#ifndef __HIP_PLATFORM_AMD__
#include <cudaTypedefs.h> #include <cudaTypedefs.h>
#else
#define CUDA_VERSION 0
#endif
#include <cuda_bf16.h> #include <cuda_bf16.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <algorithm> #include <algorithm>
#include <cfloat> #include <cfloat>
#ifndef __HIP_PLATFORM_AMD__
#include <cuda/barrier> #include <cuda/barrier>
#endif
#include <utility> #include <utility>
#include "common/common.h" #include "common/common.h"
...@@ -19,7 +27,10 @@ ...@@ -19,7 +27,10 @@
#include "common/transpose/cast_transpose.h" #include "common/transpose/cast_transpose.h"
#include "common/util/ptx.cuh" #include "common/util/ptx.cuh"
#include "common/utils.cuh" #include "common/utils.cuh"
#ifndef __HIP_PLATFORM_AMD__
#include "curanddx.hpp" #include "curanddx.hpp"
#endif
namespace transformer_engine { namespace transformer_engine {
......
...@@ -576,6 +576,7 @@ __device__ __forceinline__ fp8e4m3 compute_decoding_scaling_factor(const float b ...@@ -576,6 +576,7 @@ __device__ __forceinline__ fp8e4m3 compute_decoding_scaling_factor(const float b
#define DIRECT_SCALING_FACTORS_STORE 1 #define DIRECT_SCALING_FACTORS_STORE 1
#ifndef __HIP_PLATFORM_AMD__
template <bool COMPUTE_ACTIVATIONS, typename ParamOP, float (*OP)(float, const ParamOP &), template <bool COMPUTE_ACTIVATIONS, typename ParamOP, float (*OP)(float, const ParamOP &),
typename IType, typename OType, bool COLWISE_SCALING, size_t CHUNK_DIM_Y, typename IType, typename OType, bool COLWISE_SCALING, size_t CHUNK_DIM_Y,
size_t CHUNK_DIM_X, size_t THREADS_PER_CHUNK> size_t CHUNK_DIM_X, size_t THREADS_PER_CHUNK>
...@@ -1065,6 +1066,7 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK) ...@@ -1065,6 +1066,7 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK)
destroy_barriers<STAGES>(mbar, is_master_thread); destroy_barriers<STAGES>(mbar, is_master_thread);
#endif // #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) #endif // #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
} }
#endif
} // namespace nvfp4_kernel } // namespace nvfp4_kernel
constexpr size_t FP8_CHUNK_DIM_Y = 128; constexpr size_t FP8_CHUNK_DIM_Y = 128;
...@@ -1725,6 +1727,11 @@ void mxfp8_quantize(const Tensor &input, const Tensor *act_input, ...@@ -1725,6 +1727,11 @@ void mxfp8_quantize(const Tensor &input, const Tensor *act_input,
// 2. r16c32 - Rowwise NVFP4 AND Colwise MXFP8 // 2. r16c32 - Rowwise NVFP4 AND Colwise MXFP8
template <bool COMPUTE_ACTIVATIONS, typename ParamOP, float (*OP)(float, const ParamOP &)> template <bool COMPUTE_ACTIVATIONS, typename ParamOP, float (*OP)(float, const ParamOP &)>
void nvfp4_quantize(const Tensor &input, const Tensor *noop, Tensor *output, cudaStream_t stream) { void nvfp4_quantize(const Tensor &input, const Tensor *noop, Tensor *output, cudaStream_t stream) {
#ifdef __HIP_PLATFORM_AMD__
assert(false);
#else
using namespace nvfp4_kernel; using namespace nvfp4_kernel;
using namespace ptx; using namespace ptx;
checkCuDriverContext(stream); checkCuDriverContext(stream);
...@@ -1853,6 +1860,7 @@ void nvfp4_quantize(const Tensor &input, const Tensor *noop, Tensor *output, cud ...@@ -1853,6 +1860,7 @@ void nvfp4_quantize(const Tensor &input, const Tensor *noop, Tensor *output, cud
break; break;
}); // NOLINT(*) }); // NOLINT(*)
); // NOLINT(*) ); // NOLINT(*)
#endif
} }
namespace detail { namespace detail {
......
...@@ -23,7 +23,9 @@ ...@@ -23,7 +23,9 @@
#endif // __HIP_PLATFORM_AMD__ #endif // __HIP_PLATFORM_AMD__
#include <nvrtc.h> #include <nvrtc.h>
#ifndef __HIP_PLATFORM_AMD__
#include "nccl.h" #include "nccl.h"
#endif
#ifdef NVTE_WITH_CUBLASMP #ifdef NVTE_WITH_CUBLASMP
#include <cublasmp.h> #include <cublasmp.h>
......
...@@ -12,7 +12,13 @@ ...@@ -12,7 +12,13 @@
#define TRANSFORMER_ENGINE_NVFP4_TRANSPOSE_CUH_ #define TRANSFORMER_ENGINE_NVFP4_TRANSPOSE_CUH_
#include <cuda.h> #include <cuda.h>
#ifndef __HIP_PLATFORM_AMD__
#include <cudaTypedefs.h> #include <cudaTypedefs.h>
#else
#define CUDA_VERSION 0
#endif
#include <cuda_runtime.h> #include <cuda_runtime.h>
#if FP4_TYPE_SUPPORTED #if FP4_TYPE_SUPPORTED
...@@ -22,7 +28,11 @@ ...@@ -22,7 +28,11 @@
#include "../common.h" #include "../common.h"
#include "../utils.cuh" #include "../utils.cuh"
#ifndef __HIP_PLATFORM_AMD__
#include "curanddx.hpp" #include "curanddx.hpp"
#endif
#include "math.h" #include "math.h"
#include "ptx.cuh" #include "ptx.cuh"
#include "transformer_engine/transformer_engine.h" #include "transformer_engine/transformer_engine.h"
......
...@@ -1486,10 +1486,14 @@ void NVFP4Quantizer::quantize_impl(const TensorWrapper& input, TensorWrapper& ou ...@@ -1486,10 +1486,14 @@ void NVFP4Quantizer::quantize_impl(const TensorWrapper& input, TensorWrapper& ou
// We need: // We need:
// 1. Rowwise amax = amax for input // 1. Rowwise amax = amax for input
// 2. Columnwise amax = amax for RHT(input.t) // 2. Columnwise amax = amax for RHT(input.t)
#ifdef __HIP_PLATFORM_AMD__
NVTE_CHECK(false, "Not only supported for nvte_hadamard_transform_amax");
#else
NVTE_SCOPED_GIL_RELEASE({ NVTE_SCOPED_GIL_RELEASE({
nvte_hadamard_transform_amax(input.data(), out.data(), 0, nvte_hadamard_transform_amax(input.data(), out.data(), 0,
this->rht_matrix_random_sign_mask_t, stream); this->rht_matrix_random_sign_mask_t, stream);
}); });
#endif
} else { } else {
// raise error since it's not supported yet // raise error since it's not supported yet
NVTE_CHECK(false, "Pre-RHT amax is not supported yet"); NVTE_CHECK(false, "Pre-RHT amax is not supported yet");
...@@ -1612,11 +1616,15 @@ void NVFP4Quantizer::quantize_impl(const TensorWrapper& input, TensorWrapper& ou ...@@ -1612,11 +1616,15 @@ void NVFP4Quantizer::quantize_impl(const TensorWrapper& input, TensorWrapper& ou
rht_output_t_cpp.set_rowwise_data(rht_output_t.data_ptr(), input.dtype(), rht_output_t_cpp.set_rowwise_data(rht_output_t.data_ptr(), input.dtype(),
std::vector<size_t>{cols, rows}); std::vector<size_t>{cols, rows});
#ifdef __HIP_PLATFORM_AMD__
NVTE_CHECK(false, "Not only supported for nvte_hadamard_transform");
#else
NVTE_SCOPED_GIL_RELEASE({ NVTE_SCOPED_GIL_RELEASE({
// Perform the RHT(input.t), and write to rht_output_cpp.columnwise. // Perform the RHT(input.t), and write to rht_output_cpp.columnwise.
nvte_hadamard_transform(input.data(), rht_output_t_cpp.data(), 0, nvte_hadamard_transform(input.data(), rht_output_t_cpp.data(), 0,
this->rht_matrix_random_sign_mask_t, stream); this->rht_matrix_random_sign_mask_t, stream);
}); });
#endif
// Quantize kernel will treat everything as rowwise input/output, which is // Quantize kernel will treat everything as rowwise input/output, which is
// intended. // intended.
...@@ -1628,10 +1636,14 @@ void NVFP4Quantizer::quantize_impl(const TensorWrapper& input, TensorWrapper& ou ...@@ -1628,10 +1636,14 @@ void NVFP4Quantizer::quantize_impl(const TensorWrapper& input, TensorWrapper& ou
NVTE_CHECK(this->rht_matrix.defined() && this->rht_matrix.numel() > 0, NVTE_CHECK(this->rht_matrix.defined() && this->rht_matrix.numel() > 0,
"RHT matrix is not set"); "RHT matrix is not set");
auto rht_matrix_nvte = makeTransformerEngineTensor(this->rht_matrix); auto rht_matrix_nvte = makeTransformerEngineTensor(this->rht_matrix);
#ifdef __HIP_PLATFORM_AMD__
NVTE_CHECK(false, "Not only supported for nvte_hadamard_transform_cast_fusion_columnwise");
#else
NVTE_SCOPED_GIL_RELEASE({ NVTE_SCOPED_GIL_RELEASE({
nvte_hadamard_transform_cast_fusion_columnwise( nvte_hadamard_transform_cast_fusion_columnwise(
input.data(), out_transpose.data(), rht_matrix_nvte.data(), quant_config, stream); input.data(), out_transpose.data(), rht_matrix_nvte.data(), quant_config, stream);
}); });
#endif
} }
} }
} else { } else {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment