Unverified Commit 5d85857a authored by Oleg Goncharov's avatar Oleg Goncharov Committed by GitHub
Browse files

Added memory alignment check to cast_fp8_1D (#1507)



* Added TMA alignment check to cast_fp8_1D
Signed-off-by: Oleg Goncharov <ogoncharov@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* Use tensor const-ref instead of tensor const-ptr
Signed-off-by: Tim Moon <tmoon@nvidia.com>

---------
Signed-off-by: Oleg Goncharov <ogoncharov@nvidia.com>
Signed-off-by: Tim Moon <tmoon@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Tim Moon <tmoon@nvidia.com>
Co-authored-by: Tim Moon <4406448+timmoon10@users.noreply.github.com>
parent 8ca2caf8
......@@ -67,11 +67,6 @@ CUtensorMapDataType get_CUtensorMapDataType(DType dtype) {
return dtypeMapping.at(dtype);
}
// Returns true when `ptr` is aligned to `alignment` bytes.
// Uses std::uintptr_t rather than uint64_t: uintptr_t is the integer type
// guaranteed to round-trip a pointer value, regardless of pointer width.
inline bool isPointerAligned(const void *const ptr, const int alignment) {
  const uintptr_t ptr_as_uint = reinterpret_cast<uintptr_t>(ptr);
  return ptr_as_uint % alignment == 0;
}
// Set up parameters to create TMA descriptor.
void create_2D_tensor_map(CUtensorMap &tensorMap, const SimpleTensor &tensor,
const uint64_t globalY, const uint64_t globalX, const uint32_t shmemY,
......@@ -100,8 +95,7 @@ void create_2D_tensor_map(CUtensorMap &tensorMap, const SimpleTensor &tensor,
void *dataPtr =
reinterpret_cast<void *>(reinterpret_cast<uint8_t *>(tensor.dptr) + offset_elems * type_size);
constexpr int TMA_gmem_alignment = 16; // Alignment of the global memory address
NVTE_CHECK(isPointerAligned(dataPtr, TMA_gmem_alignment),
NVTE_CHECK(is_aligned_ptr(dataPtr, TMA_gmem_alignment),
"Tensor data pointer must be 16B aligned");
const int TMA_needed_size = TMA_gmem_alignment / type_size;
......
......@@ -14,6 +14,7 @@
#include <cuda_runtime_api.h>
#include <transformer_engine/transformer_engine.h>
#include <cstdint>
#include <functional>
#include <stdexcept>
#include <string>
......@@ -426,6 +427,17 @@ constexpr size_t scale_tensor_alignment_Y_rowwise = 128;
constexpr size_t scale_tensor_alignment_X_colwise = 128;
constexpr size_t scale_tensor_alignment_Y_colwise = 4;
// Alignment requirements for the Tensor Memory Accelerator (TMA)
constexpr int TMA_gmem_alignment = 16; // global memory address alignment
// True when the address held in `ptr` is an exact multiple of `alignment` bytes.
inline bool is_aligned_ptr(const void *ptr, size_t alignment) {
  const auto address = reinterpret_cast<uintptr_t>(ptr);
  return (address % alignment) == 0;
}
// True when the tensor's data pointer meets the requested byte alignment.
// Convenience wrapper over is_aligned_ptr for Tensor arguments.
inline bool is_aligned_tensor_data(const Tensor &t, size_t alignment) {
  const auto *data_ptr = static_cast<const void *>(t.data.dptr);
  return is_aligned_ptr(data_ptr, alignment);
}
size_t typeToSize(const DType type);
void CheckNoopTensor(const Tensor &t, const std::string &name);
......@@ -465,8 +477,6 @@ void checkCuDriverContext(CUstream stream);
CUtensorMapDataType get_CUtensorMapDataType(DType dtype);
inline bool isPointerAligned(const void *const ptr, const int alignment);
// Set up parameters to create TMA descriptor.
void create_2D_tensor_map(CUtensorMap &tensorMap, const SimpleTensor &tensor,
const uint64_t globalY, const uint64_t globalX, const uint32_t shmemY,
......
......@@ -1110,7 +1110,9 @@ void fp8_quantize_arch_ge_100(const Tensor &input, const Tensor *act_input, cons
switch (output->scaling_mode) {
case NVTE_DELAYED_TENSOR_SCALING: {
if (!IS_DBIAS && !IS_DACT) {
if (is_full_tile_1D_tensor(output) && is_fp8_dtype(output->dtype())) {
if (is_full_tile_1D_tensor(output) && is_fp8_dtype(output->dtype()) &&
is_aligned_tensor_data(input, TMA_gmem_alignment) &&
is_aligned_tensor_data(*output, TMA_gmem_alignment)) {
// Aligned AND FP8
cast_fp8_1D<IS_ACT, ParamOP, OP>(input, output, stream);
} else {
......@@ -1118,7 +1120,10 @@ void fp8_quantize_arch_ge_100(const Tensor &input, const Tensor *act_input, cons
CastVectorizedUnaryKernelLauncher<ParamOP, OP>(input, noop, output, stream);
}
} else if (!IS_DBIAS && IS_DACT) {
if (dimensions_supported_by_TMA(output) && is_fp8_dtype(output->dtype())) {
if (dimensions_supported_by_TMA(output) && is_fp8_dtype(output->dtype()) &&
is_aligned_tensor_data(input, TMA_gmem_alignment) &&
is_aligned_tensor_data(*output, TMA_gmem_alignment) &&
is_aligned_tensor_data(*act_input, TMA_gmem_alignment)) {
// Aligned AND FP8 (+dAct)
cast_fp8_2D<IS_DBIAS, IS_DACT, ParamOP, OP>(input, act_input, output, dbias, workspace,
stream);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment