Unverified Commit 5d85857a authored by Oleg Goncharov's avatar Oleg Goncharov Committed by GitHub
Browse files

Added memory alignment check to cast_fp8_1D (#1507)



* Added TMA alignment check to cast_fp8_1D
Signed-off-by: Oleg Goncharov <ogoncharov@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* Use tensor const-ref instead of tensor const-ptr
Signed-off-by: Tim Moon <tmoon@nvidia.com>

---------
Signed-off-by: Oleg Goncharov <ogoncharov@nvidia.com>
Signed-off-by: Tim Moon <tmoon@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Tim Moon <tmoon@nvidia.com>
Co-authored-by: Tim Moon <4406448+timmoon10@users.noreply.github.com>
parent 8ca2caf8
...@@ -67,11 +67,6 @@ CUtensorMapDataType get_CUtensorMapDataType(DType dtype) { ...@@ -67,11 +67,6 @@ CUtensorMapDataType get_CUtensorMapDataType(DType dtype) {
return dtypeMapping.at(dtype); return dtypeMapping.at(dtype);
} }
// Returns true when `ptr` is aligned to an `alignment`-byte boundary.
// NOTE: use uintptr_t (not uint64_t) for the pointer-to-integer cast — it is
// the type guaranteed by the standard to hold a pointer value on any platform.
inline bool isPointerAligned(const void *const ptr, const int alignment) {
  const uintptr_t ptr_as_uint = reinterpret_cast<uintptr_t>(ptr);
  return ptr_as_uint % alignment == 0;
}
// Set up parameters to create TMA descriptor. // Set up parameters to create TMA descriptor.
void create_2D_tensor_map(CUtensorMap &tensorMap, const SimpleTensor &tensor, void create_2D_tensor_map(CUtensorMap &tensorMap, const SimpleTensor &tensor,
const uint64_t globalY, const uint64_t globalX, const uint32_t shmemY, const uint64_t globalY, const uint64_t globalX, const uint32_t shmemY,
...@@ -100,8 +95,7 @@ void create_2D_tensor_map(CUtensorMap &tensorMap, const SimpleTensor &tensor, ...@@ -100,8 +95,7 @@ void create_2D_tensor_map(CUtensorMap &tensorMap, const SimpleTensor &tensor,
void *dataPtr = void *dataPtr =
reinterpret_cast<void *>(reinterpret_cast<uint8_t *>(tensor.dptr) + offset_elems * type_size); reinterpret_cast<void *>(reinterpret_cast<uint8_t *>(tensor.dptr) + offset_elems * type_size);
constexpr int TMA_gmem_alignment = 16; // Alignment of the global memory address NVTE_CHECK(is_aligned_ptr(dataPtr, TMA_gmem_alignment),
NVTE_CHECK(isPointerAligned(dataPtr, TMA_gmem_alignment),
"Tensor data pointer must be 16B aligned"); "Tensor data pointer must be 16B aligned");
const int TMA_needed_size = TMA_gmem_alignment / type_size; const int TMA_needed_size = TMA_gmem_alignment / type_size;
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include <cuda_runtime_api.h> #include <cuda_runtime_api.h>
#include <transformer_engine/transformer_engine.h> #include <transformer_engine/transformer_engine.h>
#include <cstdint>
#include <functional> #include <functional>
#include <stdexcept> #include <stdexcept>
#include <string> #include <string>
...@@ -426,6 +427,17 @@ constexpr size_t scale_tensor_alignment_Y_rowwise = 128; ...@@ -426,6 +427,17 @@ constexpr size_t scale_tensor_alignment_Y_rowwise = 128;
constexpr size_t scale_tensor_alignment_X_colwise = 128; constexpr size_t scale_tensor_alignment_X_colwise = 128;
constexpr size_t scale_tensor_alignment_Y_colwise = 4; constexpr size_t scale_tensor_alignment_Y_colwise = 4;
// Alignment requirements for the Tensor Memory Accelerator (TMA)
constexpr int TMA_gmem_alignment = 16;  // global memory address alignment

// Checks whether a pointer lies on an `alignment`-byte boundary.
inline bool is_aligned_ptr(const void *ptr, size_t alignment) {
  const uintptr_t address = reinterpret_cast<uintptr_t>(ptr);
  return (address % alignment) == 0;
}
// Checks whether a tensor's data pointer meets an `alignment`-byte requirement.
inline bool is_aligned_tensor_data(const Tensor &t, size_t alignment) {
  const void *data_ptr = static_cast<const void *>(t.data.dptr);
  return is_aligned_ptr(data_ptr, alignment);
}
size_t typeToSize(const DType type); size_t typeToSize(const DType type);
void CheckNoopTensor(const Tensor &t, const std::string &name); void CheckNoopTensor(const Tensor &t, const std::string &name);
...@@ -465,8 +477,6 @@ void checkCuDriverContext(CUstream stream); ...@@ -465,8 +477,6 @@ void checkCuDriverContext(CUstream stream);
CUtensorMapDataType get_CUtensorMapDataType(DType dtype); CUtensorMapDataType get_CUtensorMapDataType(DType dtype);
inline bool isPointerAligned(const void *const ptr, const int alignment);
// Set up parameters to create TMA descriptor. // Set up parameters to create TMA descriptor.
void create_2D_tensor_map(CUtensorMap &tensorMap, const SimpleTensor &tensor, void create_2D_tensor_map(CUtensorMap &tensorMap, const SimpleTensor &tensor,
const uint64_t globalY, const uint64_t globalX, const uint32_t shmemY, const uint64_t globalY, const uint64_t globalX, const uint32_t shmemY,
......
...@@ -1110,7 +1110,9 @@ void fp8_quantize_arch_ge_100(const Tensor &input, const Tensor *act_input, cons ...@@ -1110,7 +1110,9 @@ void fp8_quantize_arch_ge_100(const Tensor &input, const Tensor *act_input, cons
switch (output->scaling_mode) { switch (output->scaling_mode) {
case NVTE_DELAYED_TENSOR_SCALING: { case NVTE_DELAYED_TENSOR_SCALING: {
if (!IS_DBIAS && !IS_DACT) { if (!IS_DBIAS && !IS_DACT) {
if (is_full_tile_1D_tensor(output) && is_fp8_dtype(output->dtype())) { if (is_full_tile_1D_tensor(output) && is_fp8_dtype(output->dtype()) &&
is_aligned_tensor_data(input, TMA_gmem_alignment) &&
is_aligned_tensor_data(*output, TMA_gmem_alignment)) {
// Aligned AND FP8 // Aligned AND FP8
cast_fp8_1D<IS_ACT, ParamOP, OP>(input, output, stream); cast_fp8_1D<IS_ACT, ParamOP, OP>(input, output, stream);
} else { } else {
...@@ -1118,7 +1120,10 @@ void fp8_quantize_arch_ge_100(const Tensor &input, const Tensor *act_input, cons ...@@ -1118,7 +1120,10 @@ void fp8_quantize_arch_ge_100(const Tensor &input, const Tensor *act_input, cons
CastVectorizedUnaryKernelLauncher<ParamOP, OP>(input, noop, output, stream); CastVectorizedUnaryKernelLauncher<ParamOP, OP>(input, noop, output, stream);
} }
} else if (!IS_DBIAS && IS_DACT) { } else if (!IS_DBIAS && IS_DACT) {
if (dimensions_supported_by_TMA(output) && is_fp8_dtype(output->dtype())) { if (dimensions_supported_by_TMA(output) && is_fp8_dtype(output->dtype()) &&
is_aligned_tensor_data(input, TMA_gmem_alignment) &&
is_aligned_tensor_data(*output, TMA_gmem_alignment) &&
is_aligned_tensor_data(*act_input, TMA_gmem_alignment)) {
// Aligned AND FP8 (+dAct) // Aligned AND FP8 (+dAct)
cast_fp8_2D<IS_DBIAS, IS_DACT, ParamOP, OP>(input, act_input, output, dbias, workspace, cast_fp8_2D<IS_DBIAS, IS_DACT, ParamOP, OP>(input, act_input, output, dbias, workspace,
stream); stream);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment