Commit 544dd14b authored by Przemek Tredak

Update main branch with TE 2.0 code, update version to 2.1.0.dev0


Signed-off-by: Przemek Tredak <ptredak@nvidia.com>
parent e5369541
@@ -15,7 +15,7 @@
#include <cuda_runtime.h>
#include <gtest/gtest.h>
#include <transformer_engine/transpose.h>
#include <transformer_engine/cast.h>
#include "../test_common.h"
using namespace transformer_engine;
@@ -64,26 +64,23 @@ void performTest(const size_t N, const size_t H) {
DType itype = TypeInfo<IType>::dtype;
DType otype = TypeInfo<OType>::dtype;
DType ctype = TypeInfo<CType>::dtype;
Tensor input({N, H}, itype);
Tensor input("input", {N, H}, itype);
Tensor output_c({N, H}, otype);
Tensor output_t({ H, N}, otype);
Tensor output("output", {N, H}, otype, true, true);
// dbias has the same data type as the output gradient
Tensor dbias({H}, itype);
Tensor dbias("dbias", {H}, itype);
fillUniform(&input);
setRandomScale(&output_c);
output_t.shareFP8Meta(output_c);
setRandomScale(&output);
std::unique_ptr<OType[]> ref_output_c = std::make_unique<OType[]>(N*H);
std::unique_ptr<OType[]> ref_output_t = std::make_unique<OType[]>(N*H);
std::unique_ptr<IType[]> ref_output_dbias = std::make_unique<IType[]>(H);
CType ref_amax;
compute_ref_cast_transpose_dbias(input.cpu_dptr<IType>(),
output_c.scale(),
compute_ref_cast_transpose_dbias(input.rowwise_cpu_dptr<IType>(),
output.scale(),
ref_output_c.get(),
ref_output_t.get(),
&ref_amax,
@@ -92,22 +89,20 @@ void performTest(const size_t N, const size_t H) {
Tensor workspace;
nvte_cast_transpose_dbias(input.data(),
output_c.data(),
output_t.data(),
dbias.data(),
workspace.data(),
0);
nvte_quantize_dbias(input.data(),
output.data(),
dbias.data(),
workspace.data(),
0);
workspace = Tensor(workspace.shape(), workspace.dtype());
workspace = Tensor("workspace", workspace.rowwise_shape(), workspace.dtype());
nvte_cast_transpose_dbias(input.data(),
output_c.data(),
output_t.data(),
dbias.data(),
workspace.data(),
0);
nvte_quantize_dbias(input.data(),
output.data(),
dbias.data(),
workspace.data(),
0);
cudaDeviceSynchronize();
auto err = cudaGetLastError();
@@ -115,17 +110,17 @@ void performTest(const size_t N, const size_t H) {
if (isFp8Type(otype)) {
auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
compareResults("amax", output_c.amax(), ref_amax, atol_amax, rtol_amax);
float ref_scale_inv = 1.f / output_c.scale();
compareResults("scale_inv", output_c.scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
compareResults("amax", output.amax(), ref_amax, atol_amax, rtol_amax);
float ref_scale_inv = 1.f / output.scale();
compareResults("scale_inv", output.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
}
auto [atol, rtol] = getTolerances(otype);
compareResults("output_c", output_c, ref_output_c.get(), atol, rtol);
compareResults("output_t", output_t, ref_output_t.get(), atol, rtol);
compareResults("output_c", output, ref_output_c.get(), true, atol, rtol);
compareResults("output_t", output, ref_output_t.get(), false, atol, rtol);
auto [atol_dbias, rtol_dbias] = getTolerances(itype);
rtol_dbias *= 4;
compareResults("output_dbias", dbias, ref_output_dbias.get(), atol_dbias, rtol_dbias);
compareResults("output_dbias", dbias, ref_output_dbias.get(), true, atol_dbias, rtol_dbias);
}
std::vector<std::pair<size_t, size_t>> test_cases = {{64, 400},
......
@@ -75,29 +75,26 @@ void performTest(const size_t N, const size_t H) {
DType itype = TypeInfo<IType>::dtype;
DType otype = TypeInfo<OType>::dtype;
DType ctype = TypeInfo<CType>::dtype;
Tensor input({N, H}, itype);
Tensor gelu_input({N, H}, itype);
Tensor input("input", {N, H}, itype);
Tensor gelu_input("gelu_input", {N, H}, itype);
Tensor output_c({N, H}, otype);
Tensor output_t({ H, N}, otype);
Tensor output("output", {N, H}, otype, true, true);
// dbias has the same data type as the output gradient
Tensor dbias({H}, itype);
Tensor dbias("dbias", {H}, itype);
fillUniform(&input);
fillUniform(&gelu_input);
setRandomScale(&output_c);
output_t.shareFP8Meta(output_c);
setRandomScale(&output);
std::unique_ptr<OType[]> ref_output_c = std::make_unique<OType[]>(N*H);
std::unique_ptr<OType[]> ref_output_t = std::make_unique<OType[]>(N*H);
std::unique_ptr<IType[]> ref_output_dbias = std::make_unique<IType[]>(H);
CType ref_amax;
compute_ref_cast_transpose_dbias_dgelu(input.cpu_dptr<IType>(),
gelu_input.cpu_dptr<IType>(),
output_c.scale(),
compute_ref_cast_transpose_dbias_dgelu(input.rowwise_cpu_dptr<IType>(),
gelu_input.rowwise_cpu_dptr<IType>(),
output.scale(),
ref_output_c.get(),
ref_output_t.get(),
&ref_amax,
@@ -108,19 +105,17 @@ void performTest(const size_t N, const size_t H) {
nvte_cast_transpose_dbias_dgelu(input.data(),
gelu_input.data(),
output_c.data(),
output_t.data(),
output.data(),
dbias.data(),
workspace.data(),
0);
workspace = Tensor(workspace.shape(), workspace.dtype());
workspace = Tensor("workspace", workspace.rowwise_shape(), workspace.dtype());
nvte_cast_transpose_dbias_dgelu(input.data(),
gelu_input.data(),
output_c.data(),
output_t.data(),
output.data(),
dbias.data(),
workspace.data(),
0);
@@ -131,18 +126,18 @@ void performTest(const size_t N, const size_t H) {
if (isFp8Type(otype)) {
auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
compareResults("amax", output_c.amax(), ref_amax, atol_amax, rtol_amax);
float ref_scale_inv = 1.f / output_c.scale();
compareResults("scale_inv", output_c.scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
compareResults("amax", output.amax(), ref_amax, atol_amax, rtol_amax);
float ref_scale_inv = 1.f / output.scale();
compareResults("scale_inv", output.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
}
auto [atol, rtol] = getTolerances(otype);
compareResults("output_c", output_c, ref_output_c.get(), atol, rtol);
compareResults("output_t", output_t, ref_output_t.get(), atol, rtol);
compareResults("output_c", output, ref_output_c.get(), true, atol, rtol);
compareResults("output_t", output, ref_output_t.get(), false, atol, rtol);
auto [atol_dbias, rtol_dbias] = getTolerances(itype);
rtol_dbias *= 4;
compareResults("output_dbias", dbias, ref_output_dbias.get(), atol_dbias, rtol_dbias);
compareResults("output_dbias", dbias, ref_output_dbias.get(), true, atol_dbias, rtol_dbias);
}
std::vector<std::pair<size_t, size_t>> test_cases = {{64, 400},
......
@@ -74,24 +74,22 @@ void performTest(const size_t N, const size_t H) {
DType itype = TypeInfo<IType>::dtype;
DType otype = TypeInfo<OType>::dtype;
Tensor grad({N, H}, itype);
Tensor input({N, H * 2}, itype);
Tensor output_c({N, H * 2}, otype);
Tensor output_t({H * 2, N}, otype);
Tensor grad("grad", {N, H}, itype);
Tensor input("input", {N, H * 2}, itype);
Tensor output("output", {N, H * 2}, otype, true, true);
fillUniform(&grad);
fillUniform(&input);
setRandomScale(&output_c);
output_t.shareFP8Meta(output_c);
setRandomScale(&output);
std::unique_ptr<OType[]> ref_output_c = std::make_unique<OType[]>(N * H * 2);
std::unique_ptr<OType[]> ref_output_t = std::make_unique<OType[]>(N * H * 2);
nvte_dgeglu_cast_transpose(grad.data(), input.data(), output_c.data(), output_t.data(), 0);
nvte_dgeglu_cast_transpose(grad.data(), input.data(), output.data(), 0);
CType ref_amax;
compute_ref_cast_transpose_dgated_gelu(grad.cpu_dptr<IType>(), input.cpu_dptr<IType>(),
output_c.scale(), ref_output_c.get(), ref_output_t.get(),
compute_ref_cast_transpose_dgated_gelu(grad.rowwise_cpu_dptr<IType>(), input.rowwise_cpu_dptr<IType>(),
output.scale(), ref_output_c.get(), ref_output_t.get(),
&ref_amax, N, H);
cudaDeviceSynchronize();
@@ -100,14 +98,14 @@ void performTest(const size_t N, const size_t H) {
if (isFp8Type(otype)) {
auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
compareResults("amax", output_c.amax(), ref_amax, atol_amax, rtol_amax);
float ref_scale_inv = 1.f / output_c.scale();
compareResults("scale_inv", output_c.scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
compareResults("amax", output.amax(), ref_amax, atol_amax, rtol_amax);
float ref_scale_inv = 1.f / output.scale();
compareResults("scale_inv", output.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
}
auto [atol, rtol] = getTolerances(otype);
compareResults("output_c", output_c, ref_output_c.get(), atol, rtol);
compareResults("output_t", output_t, ref_output_t.get(), atol, rtol);
compareResults("output_c", output, ref_output_c.get(), true, atol, rtol);
compareResults("output_t", output, ref_output_t.get(), false, atol, rtol);
}
std::vector<std::pair<size_t, size_t>> test_cases = {{64, 400}, {4096, 2048}, {768, 2816},
......
@@ -153,11 +153,11 @@ void performTest(
DType itype = TypeInfo<Type>::dtype;
Tensor data_in({ batches, heads, rows, cols }, itype);
Tensor softmax_out({ batches, heads, rows, cols }, itype);
Tensor softmax_in({ batches, heads, rows, cols }, itype);
Tensor grads_in({ batches, heads, rows, cols }, itype);
Tensor grads_out({ batches, heads, rows, cols }, itype);
Tensor data_in("data_in", { batches, heads, rows, cols }, itype);
Tensor softmax_out("softmax_out", { batches, heads, rows, cols }, itype);
Tensor softmax_in("softmax_in", { batches, heads, rows, cols }, itype);
Tensor grads_in("grads_in", { batches, heads, rows, cols }, itype);
Tensor grads_out("grads_out", { batches, heads, rows, cols }, itype);
const size_t elements_total = batches * heads * rows * cols;
std::unique_ptr<Type[]> softmax_out_ref = std::make_unique<Type[]>(elements_total);
@@ -175,9 +175,9 @@ void performTest(
// Reference implementations
compute_fwd_ref(softmax_out_ref.get(), data_in.cpu_dptr<Type>(),
compute_fwd_ref(softmax_out_ref.get(), data_in.rowwise_cpu_dptr<Type>(),
compute_buffer.get(), scaling_factor, batches, heads, rows, cols);
compute_bwd_ref(grads_out_ref.get(), grads_in.cpu_dptr<Type>(), softmax_in.cpu_dptr<Type>(),
compute_bwd_ref(grads_out_ref.get(), grads_in.rowwise_cpu_dptr<Type>(), softmax_in.rowwise_cpu_dptr<Type>(),
compute_buffer.get(), scaling_factor, batches, heads, rows, cols);
cudaDeviceSynchronize();
@@ -187,8 +187,8 @@ void performTest(
if(itype == DType::kBFloat16) {
atol = 1e-3;
}
compareResults("softmax_fwd", softmax_out, softmax_out_ref.get(), atol, rtol);
compareResults("softmax_bwd", grads_out, grads_out_ref.get(), atol, rtol);
compareResults("softmax_fwd", softmax_out, softmax_out_ref.get(), true, atol, rtol);
compareResults("softmax_bwd", grads_out, grads_out_ref.get(), true, atol, rtol);
}
// [Batches, Attention Heads, Query Sequence Length, Key Sequence Length, Scaling Factor]
......
/*************************************************************************
* Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See LICENSE for license information.
************************************************************************/
#include <cstring>
#include <iomanip>
#include <iostream>
#include <memory>
#include <random>
#include <limits>
#include <cuda_bf16.h>
#include <cuda_fp8.h>
#include <cuda_runtime.h>
#include <gtest/gtest.h>
#include <transformer_engine/cast.h>
#include <transformer_engine/activation.h>
#include "../test_common.h"
#include "transformer_engine/transformer_engine.h"
using namespace transformer_engine;
using namespace test;
namespace {
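// Dequantizes a single 2D block [i_min, i_max) x [j_min, j_max) of the input using the
// E8M0 block scale stored at scales[scale_idx], decoded as 2^(biased_exponent - FP32_EXPONENT_BIAS).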
template <typename InputType, typename OutputType>
void dequantize_block(const InputType* input,
OutputType* output,
fp8e8m0* scales,
const size_t scale_idx,
const size_t i_min,
const size_t i_max,
const size_t j_min,
const size_t j_max,
const size_t cols)
{
const fp8e8m0 biased_exponent = scales[scale_idx];
const float block_scale = exp2f(static_cast<float>(biased_exponent) - FP32_EXPONENT_BIAS);
const float elem_scale = block_scale;
// Dequantize elements in the block
for (size_t i = i_min; i < i_max; ++i) {
for (size_t j = j_min; j < j_max; ++j) {
const size_t idx = i * cols + j;
const float elt = static_cast<float>(input[idx]);
output[idx] = static_cast<OutputType>(elt * elem_scale);
}
}
}
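// CPU reference: dequantizes the whole tensor block by block, where each block is
// block_size_Y x block_size_X and the scale of block (ii, jj) is scales[ii * scales_stride + jj].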
template <typename InputType, typename OutputType>
void compute_ref_x1(const InputType* input,
OutputType* output,
fp8e8m0* scales,
const size_t rows,
const size_t cols,
const size_t block_size_Y,
const size_t block_size_X,
const size_t scales_stride)
{
const size_t blocks_Y = (rows + block_size_Y - 1) / block_size_Y;
const size_t blocks_X = (cols + block_size_X - 1) / block_size_X;
for (size_t ii = 0; ii < blocks_Y; ++ii) {
const size_t i_min = ii * block_size_Y;
const size_t i_max = std::min((ii + 1) * block_size_Y, rows);
for (size_t jj = 0; jj < blocks_X; ++jj) {
const size_t j_min = jj * block_size_X;
const size_t j_max = std::min((jj + 1) * block_size_X, cols);
const size_t scale_idx = ii * scales_stride + jj;
dequantize_block<InputType, OutputType>(
input, output, scales, scale_idx, i_min, i_max, j_min, j_max, cols);
}
}
}
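// CPU reference for 2x dequantization: 1 x block_size_X blocks for the rowwise output and
// block_size_Y x 1 blocks for the columnwise output.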
template <typename InputType, typename OutputType>
void compute_ref_x2(const InputType* input,
OutputType* output_rowwise,
OutputType* output_colwise,
fp8e8m0* scales_rowwise,
fp8e8m0* scales_colwise,
const size_t rows,
const size_t cols,
const size_t block_size_Y,
const size_t block_size_X,
const size_t scales_stride_rowwise,
const size_t scales_stride_colwise)
{
compute_ref_x1<InputType, OutputType>(input, output_rowwise, scales_rowwise, rows, cols, 1, block_size_X, scales_stride_rowwise);
compute_ref_x1<InputType, OutputType>(input, output_colwise, scales_colwise, rows, cols, block_size_Y, 1, scales_stride_colwise);
}
void generate_scales(fp8e8m0 * const scales_ref,
fp8e8m0 * const scales,
const size_t blocks_num,
std::mt19937& gen,
std::uniform_int_distribution<fp8e8m0> dis)
{
for (size_t i = 0; i < blocks_num; ++i) {
const fp8e8m0 val = dis(gen);
scales_ref[i] = val;
scales[i] = val;
}
}
template<typename InputType>
void generate_data(InputType * const data,
const size_t rows,
const size_t cols,
std::mt19937& gen,
std::uniform_real_distribution<>& dis,
std::uniform_real_distribution<>& dis_sign)
{
for (size_t i = 0; i < rows; ++i) {
for (size_t j = 0; j < cols; ++j) {
const size_t idx = i * cols + j;
const bool is_negative = (dis_sign(gen) < 0.0);
double val = dis(gen);
if (is_negative) {
val = -val;
}
data[idx] = static_cast<InputType>(val);
}
}
}
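// Fills the quantized tensor directly on the CPU side: random E8M0 scales (mirrored into the
// caller's reference buffers) and random FP8 data, then copies everything to the GPU via from_cpu().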
template<typename InputType>
void fill_tensor_data(Tensor& input,
fp8e8m0 * const scales_rowwise,
fp8e8m0 * const scales_colwise,
const bool is_rowwise_scaling,
const bool is_colwise_scaling,
const size_t rows,
const size_t cols,
const size_t blocks_num_rowwise,
const size_t blocks_num_colwise)
{
const double minAbs = Numeric_Traits<InputType>::minNorm;
const double maxAbs = Numeric_Traits<InputType>::maxNorm;
static std::mt19937 gen(12345);
std::uniform_real_distribution<> dis(minAbs, maxAbs);
std::uniform_real_distribution<> dis_sign(-1.0, 1.0);
std::uniform_int_distribution<fp8e8m0> int_dis(0, 255);
if (is_rowwise_scaling) {
generate_scales(scales_rowwise, input.rowwise_cpu_scale_inv_ptr<fp8e8m0>(), blocks_num_rowwise, gen, int_dis);
generate_data(input.rowwise_cpu_dptr<InputType>(), rows, cols, gen, dis, dis_sign);
}
if (is_colwise_scaling) {
generate_scales(scales_colwise, input.columnwise_cpu_scale_inv_ptr<fp8e8m0>(), blocks_num_colwise, gen, int_dis);
generate_data(input.columnwise_cpu_dptr<InputType>(), rows, cols, gen, dis, dis_sign);
}
input.from_cpu();
}
// Dequantize along single dimension (either row- or columnwise)
template <typename InputType, typename OutputType>
void performTest_x1(const size_t rows,
const size_t cols,
const bool rowwise,
const bool colwise)
{
using namespace test;
using EncodingType = fp32;
DType itype = TypeInfo<InputType>::dtype;
DType otype = TypeInfo<OutputType>::dtype;
const size_t block_size_rows = rowwise ? 1 : 32;
const size_t block_size_cols = colwise ? 1 : 32;
const size_t unpadded_blocks_Y_rowwise = rows;
const size_t unpadded_blocks_X_rowwise = divide_round_up(cols, block_size_cols);
const size_t unpadded_blocks_Y_colwise = divide_round_up(rows, block_size_rows);
const size_t unpadded_blocks_X_colwise = cols;
const size_t blocks_Y_rowwise = round_up_to_nearest_multiple(unpadded_blocks_Y_rowwise,
scale_tensor_alignment_Y_rowwise);
const size_t blocks_X_rowwise = round_up_to_nearest_multiple(unpadded_blocks_X_rowwise,
scale_tensor_alignment_X_rowwise);
const size_t blocks_Y_colwise = round_up_to_nearest_multiple(unpadded_blocks_Y_colwise,
scale_tensor_alignment_Y_colwise);
const size_t blocks_X_colwise = round_up_to_nearest_multiple(unpadded_blocks_X_colwise,
scale_tensor_alignment_X_colwise);
const size_t blocks_num_rowwise = blocks_Y_rowwise * blocks_X_rowwise;
const size_t blocks_num_colwise = blocks_Y_colwise * blocks_X_colwise;
const size_t blocks_num = rowwise ? blocks_num_rowwise : blocks_num_colwise;
const size_t scales_stride = rowwise ? blocks_X_rowwise : blocks_X_colwise;
Tensor input("input", { rows, cols }, itype, rowwise, colwise, NVTE_MXFP8_1D_SCALING);
// Output data are written to the rowwise ptr regardless of the scaling direction
Tensor output("output", { rows, cols }, otype, true, false);
std::unique_ptr<OutputType[]> ref_output = std::make_unique<OutputType[]>(rows * cols);
std::unique_ptr<fp8e8m0[]> scales = std::make_unique<fp8e8m0[]>(blocks_num);
fill_tensor_data<InputType>(input, scales.get(), scales.get(), rowwise, colwise, rows, cols,
blocks_num_rowwise, blocks_num_colwise);
nvte_dequantize(input.data(), output.data(), 0);
cudaDeviceSynchronize();
auto err = cudaGetLastError();
ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err);
InputType * data_ptr = rowwise
? input.rowwise_cpu_dptr<InputType>()
: input.columnwise_cpu_dptr<InputType>();
compute_ref_x1<InputType, OutputType>(data_ptr,
ref_output.get(),
scales.get(),
rows,
cols,
block_size_rows,
block_size_cols,
scales_stride);
auto [atol, rtol] = getTolerances(otype);
compareResults("output", output, ref_output.get(), true, atol, rtol);
}
// Dequantize along single dimension (either row- or columnwise)
template <typename InputType, typename IntermediateType>
void performTest_quantize_then_dequantize(const size_t rows,
const size_t cols,
const bool rowwise,
const bool colwise)
{
using namespace test;
using EncodingType = fp32;
DType in_type = TypeInfo<InputType>::dtype;
DType intermed_type = TypeInfo<IntermediateType>::dtype;
DType out_type = TypeInfo<InputType>::dtype;
std::unique_ptr<InputType[]> input_cpu = std::make_unique<InputType[]>(rows * cols);
std::unique_ptr<IntermediateType[]> quantized_cpu = std::make_unique<IntermediateType[]>(rows * cols);
std::unique_ptr<InputType[]> output_cpu = std::make_unique<InputType[]>(rows * cols);
// input --> quantized --> output (dequantized)
// input == output
Tensor input("input", { rows, cols }, in_type);
Tensor quantized("quantized", { rows, cols }, intermed_type, rowwise, colwise, NVTE_MXFP8_1D_SCALING);
// Output data are written to the rowwise ptr regardless of the scaling direction
Tensor output("output", { rows, cols }, out_type, true, false);
// fillCase<EncodingType>(&input, InputsFillCase::minNorm_to_maxNorm);
fillCase<EncodingType>(&input, InputsFillCase::uniform);
const size_t copy_size = sizeof(InputType) * rows * cols;
cudaMemcpy(input_cpu.get(), input.rowwise_dptr(), copy_size, cudaMemcpyDeviceToHost);
nvte_quantize(input.data(), quantized.data(), 0);
cudaDeviceSynchronize();
const size_t copy_size_quantized = sizeof(IntermediateType) * rows * cols;
if (rowwise) {
cudaMemcpy(quantized_cpu.get(), quantized.rowwise_dptr(), copy_size_quantized, cudaMemcpyDeviceToHost);
}
if (colwise) {
cudaMemcpy(quantized_cpu.get(), quantized.columnwise_dptr(), copy_size_quantized, cudaMemcpyDeviceToHost);
}
nvte_dequantize(quantized.data(), output.data(), 0);
cudaDeviceSynchronize();
cudaMemcpy(output_cpu.get(), output.rowwise_dptr(), copy_size, cudaMemcpyDeviceToHost);
auto err = cudaGetLastError();
ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err);
auto [atol, rtol] = getTolerances(intermed_type);
compareResults("Quantize-Dequantize", input, output_cpu.get(), true, atol, rtol);
}
// Dequantize along both dimensions (row- and columnwise)
template <typename InputType, typename OutputType>
void performTest_x2(const size_t rows,
const size_t cols,
const size_t block_size_rows,
const size_t block_size_cols)
{
using namespace test;
using EncodingType = fp32;
DType itype = TypeInfo<InputType>::dtype;
DType otype = TypeInfo<OutputType>::dtype;
const size_t unpadded_blocks_Y_rowwise = rows;
const size_t unpadded_blocks_X_rowwise = divide_round_up(cols, block_size_cols);
const size_t unpadded_blocks_Y_colwise = divide_round_up(rows, block_size_rows);
const size_t unpadded_blocks_X_colwise = cols;
const size_t blocks_Y_rowwise = round_up_to_nearest_multiple(unpadded_blocks_Y_rowwise,
scale_tensor_alignment_Y_rowwise);
const size_t blocks_X_rowwise = round_up_to_nearest_multiple(unpadded_blocks_X_rowwise,
scale_tensor_alignment_X_rowwise);
const size_t blocks_Y_colwise = round_up_to_nearest_multiple(unpadded_blocks_Y_colwise,
scale_tensor_alignment_Y_colwise);
const size_t blocks_X_colwise = round_up_to_nearest_multiple(unpadded_blocks_X_colwise,
scale_tensor_alignment_X_colwise);
const size_t scales_stride_rowwise = blocks_X_rowwise;
const size_t scales_stride_colwise = blocks_X_colwise;
const size_t blocks_num_rowwise = blocks_Y_rowwise * blocks_X_rowwise;
const size_t blocks_num_colwise = blocks_Y_colwise * blocks_X_colwise;
Tensor input("input", { rows, cols }, itype, true, true, NVTE_MXFP8_1D_SCALING);
Tensor output("output", { rows, cols }, otype);
std::unique_ptr<OutputType[]> ref_output_rowwise = std::make_unique<OutputType[]>(rows * cols);
std::unique_ptr<OutputType[]> ref_output_colwise = std::make_unique<OutputType[]>(rows * cols);
std::unique_ptr<fp8e8m0[]> ref_scales_rowwise = std::make_unique<fp8e8m0[]>(blocks_num_rowwise);
std::unique_ptr<fp8e8m0[]> ref_scales_colwise = std::make_unique<fp8e8m0[]>(blocks_num_colwise);
constexpr bool rowwise = true;
constexpr bool colwise = true;
fill_tensor_data<InputType>(input, ref_scales_rowwise.get(), ref_scales_colwise.get(),
rowwise, colwise, rows, cols, blocks_num_rowwise, blocks_num_colwise);
nvte_dequantize(input.data(), output.data(), 0);
cudaDeviceSynchronize();
auto err = cudaGetLastError();
ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err);
compute_ref_x2<InputType, OutputType>(input.rowwise_cpu_dptr<InputType>(),
ref_output_rowwise.get(),
ref_output_colwise.get(),
ref_scales_rowwise.get(),
ref_scales_colwise.get(),
rows,
cols,
block_size_rows,
block_size_cols,
scales_stride_rowwise,
scales_stride_colwise);
auto [atol, rtol] = getTolerances(otype);
compareResults("output_rowwise", output, ref_output_rowwise.get(), true, atol, rtol);
compareResults("output_colwise", output, ref_output_colwise.get(), false, atol, rtol);
}
std::vector<std::pair<size_t, size_t>> tensor_dims = {
{1, 16},
{16, 48},
{65, 96},
{128, 128},
{256, 256},
{993, 512},
{768, 1024},
// {2048, 12288},
// {65536, 128},
// {16384, 1632},
// {16384, 6144},
};
std::vector<std::pair<size_t, size_t>> block_sizes = {
{1, 32},
{32, 1},
// {32, 32},
};
} // namespace
class DequantizeMXFP8TestSuite : public ::testing::TestWithParam
<std::tuple<std::pair<size_t, size_t>,
std::pair<size_t, size_t>,
transformer_engine::DType,
transformer_engine::DType,
bool>> {};
TEST_P(DequantizeMXFP8TestSuite, TestDequantizeMXFP8)
{
// Skip tests for pre-Blackwell architectures
if (getDeviceComputeCapability() < blackwellComputeCapability) {
GTEST_SKIP();
}
using namespace transformer_engine;
using namespace test;
const auto tensor_size = std::get<0>(GetParam());
const auto block_size = std::get<1>(GetParam());
const DType input_type = std::get<2>(GetParam());
const DType output_type = std::get<3>(GetParam());
const bool quantize_then_dequantize = std::get<4>(GetParam());
const bool rowwise = block_size.second != 1;
const bool colwise = block_size.first != 1;
// Skip tests for dequantization along both dimensions
if (rowwise && colwise) {
GTEST_SKIP();
}
// Skip cases with invalid alignment
if (rowwise && tensor_size.second % 32 != 0) {
GTEST_SKIP();
}
if (colwise && tensor_size.first % 32 != 0) {
GTEST_SKIP();
}
TRANSFORMER_ENGINE_TYPE_SWITCH_FP8_ONLY(input_type, InputType,
TRANSFORMER_ENGINE_TYPE_SWITCH_FP16_FP32_ONLY(output_type, OutputType,
if (quantize_then_dequantize) {
// Mind the order of the Output/Input template parameters
performTest_quantize_then_dequantize<OutputType, InputType>(
tensor_size.first, tensor_size.second, rowwise, colwise);
} else {
if (block_size.first == 1 || block_size.second == 1) {
performTest_x1<InputType, OutputType>(tensor_size.first, tensor_size.second,
rowwise, colwise);
} else {
performTest_x2<InputType, OutputType>(tensor_size.first, tensor_size.second,
block_size.first, block_size.second);
}
}
);
);
}
INSTANTIATE_TEST_SUITE_P(
OperatorTest,
DequantizeMXFP8TestSuite,
::testing::Combine(
::testing::ValuesIn(tensor_dims),
::testing::ValuesIn(block_sizes),
::testing::Values(DType::kFloat8E4M3, DType::kFloat8E5M2),
::testing::Values(DType::kFloat32, DType::kBFloat16, DType::kFloat16),
::testing::Values(false)),
[](const testing::TestParamInfo<DequantizeMXFP8TestSuite::ParamType>& info)
{
std::string name = std::to_string(std::get<0>(info.param).first) + "X" +
std::to_string(std::get<0>(info.param).second) + "X" +
std::to_string(std::get<1>(info.param).first) + "X" +
std::to_string(std::get<1>(info.param).second) + "X" +
test::typeName(std::get<2>(info.param)) + "X" +
test::typeName(std::get<3>(info.param)) + "X" +
(std::get<4>(info.param) ? "QD" : "D");
return name;
}
);
@@ -69,7 +69,7 @@ void performTest() {
const size_t num_tensors = tensor_dims.size();
// Buffers for Transformer Engine implementation
std::vector<Tensor> input_list, output_c_list, output_t_list;
std::vector<Tensor> input_list, output_list;
// Buffers for reference implementation
std::vector<std::vector<InputType>> ref_input_list;
@@ -81,25 +81,23 @@ void performTest() {
for (size_t tensor_id = 0; tensor_id < num_tensors; ++tensor_id) {
const size_t height = tensor_dims[tensor_id].first;
const size_t width = tensor_dims[tensor_id].second;
input_list.emplace_back(Tensor({ height, width }, itype));
output_c_list.emplace_back(Tensor({ height, width }, otype));
output_t_list.emplace_back(Tensor({ width, height }, otype));
input_list.emplace_back(Tensor("input_" + std::to_string(tensor_id), { height, width }, itype));
output_list.emplace_back(Tensor("output_" + std::to_string(tensor_id),
{ height, width }, otype, true, true));
auto& input = input_list.back();
auto& output_c = output_c_list.back();
auto& output_t = output_t_list.back();
auto& output = output_list.back();
fillUniform(&input);
setRandomScale(&output_c);
output_t.shareFP8Meta(output_c);
setRandomScale(&output);
ref_input_list.emplace_back(height*width);
ref_output_c_list.emplace_back(height*width);
ref_output_t_list.emplace_back(width*height);
std::copy(input.cpu_dptr<InputType>(),
input.cpu_dptr<InputType>() + height * width,
std::copy(input.rowwise_cpu_dptr<InputType>(),
input.rowwise_cpu_dptr<InputType>() + height * width,
ref_input_list.back().begin());
ref_scale_list[tensor_id] = output_c.scale();
ref_scale_list[tensor_id] = output.scale();
ref_height_list[tensor_id] = height;
ref_width_list[tensor_id] = width;
}
@@ -115,8 +113,7 @@ void performTest() {
};
nvte_multi_cast_transpose(num_tensors,
make_nvte_vector(input_list).data(),
make_nvte_vector(output_c_list).data(),
make_nvte_vector(output_t_list).data(),
make_nvte_vector(output_list).data(),
0);
// Reference implementation
@@ -136,23 +133,23 @@ void performTest() {
if (isFp8Type(otype)) {
auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
compareResults("amax",
output_c_list[tensor_id].amax(),
output_list[tensor_id].amax(),
ref_amax_list[tensor_id],
atol_amax, rtol_amax);
compareResults("scale_inv",
output_c_list[tensor_id].scale_inv(),
1.f / output_c_list[tensor_id].scale(),
output_list[tensor_id].rowwise_scale_inv(),
1.f / output_list[tensor_id].scale(),
atol_amax, rtol_amax);
}
auto [atol, rtol] = getTolerances(otype);
compareResults("output_c",
output_c_list[tensor_id],
output_list[tensor_id],
ref_output_c_list[tensor_id].data(),
atol, rtol);
true, atol, rtol);
compareResults("output_t",
output_t_list[tensor_id],
output_list[tensor_id],
ref_output_t_list[tensor_id].data(),
atol, rtol);
false, atol, rtol);
}
}
......
@@ -9,6 +9,7 @@
#include <iostream>
#include <memory>
#include <random>
#include <string>
#include <vector>
#include <cstdio>
@@ -84,8 +85,8 @@ void performTest() {
const size_t height = tensor_dims[tensor_id].first;
const size_t width = tensor_dims[tensor_id].second;
const size_t padded_height = (height + align - 1) / align * align;
input_list.emplace_back(Tensor({ height, width }, itype));
output_list.emplace_back(Tensor({ padded_height, width }, otype));
input_list.emplace_back(Tensor("input_" + std::to_string(tensor_id), { height, width }, itype));
output_list.emplace_back(Tensor("output_" + std::to_string(tensor_id), { padded_height, width }, otype));
auto& input = input_list.back();
auto& output = output_list.back();
@@ -95,8 +96,8 @@ void performTest() {
ref_input_list.emplace_back(height*width);
ref_output_list.emplace_back(padded_height*width);
std::copy(input.cpu_dptr<InputType>(),
input.cpu_dptr<InputType>() + height * width,
std::copy(input.rowwise_cpu_dptr<InputType>(),
input.rowwise_cpu_dptr<InputType>() + height * width,
ref_input_list.back().begin());
ref_height_list[tensor_id] = height;
ref_width_list[tensor_id] = width;
@@ -134,6 +135,7 @@ void performTest() {
compareResults("output",
output_list[tensor_id],
ref_output_list[tensor_id].data(),
true,
atol, rtol);
}
}
......
@@ -10,7 +10,6 @@
#include <iomanip>
#include <iostream>
#include <random>
#include <stdlib.h>
#include <cuda_bf16.h>
#include <cuda_runtime.h>
@@ -176,6 +175,11 @@ void performTest(const size_t N, const size_t H, const bool zero_centered_gamma,
GTEST_SKIP() << "LN kernel does not support OutputType > InputType";
return;
}
if (getDeviceComputeCapability() < blackwellComputeCapability && use_cudnn) {
GTEST_SKIP() << "cuDNN normalizations not supported on pre-Blackwell GPUs yet!";
}
using WeightType = InputType;
DType itype = TypeInfo<InputType>::dtype;
DType wtype = TypeInfo<WeightType>::dtype;
@@ -187,16 +191,16 @@ void performTest(const size_t N, const size_t H, const bool zero_centered_gamma,
return;
}
Tensor input({ N, H }, itype);
Tensor z({ N, H }, otype);
Tensor gamma({ H }, wtype);
Tensor beta({ H }, wtype);
Tensor mu({ N }, DType::kFloat32);
Tensor rsigma({ N }, DType::kFloat32);
Tensor dz({ N, H }, wtype);
Tensor dx({ N, H }, itype);
Tensor dgamma({ H }, wtype);
Tensor dbeta({ H }, wtype);
Tensor input("input", { N, H }, itype);
Tensor z("z", { N, H }, otype);
Tensor gamma("gamma", { H }, wtype);
Tensor beta("beta", { H }, wtype);
Tensor mu("mu", { N }, DType::kFloat32);
Tensor rsigma("rsigma", { N }, DType::kFloat32);
Tensor dz("dz", { N, H }, wtype);
Tensor dx("dx", { N, H }, itype);
Tensor dgamma("dgamma", { H }, wtype);
Tensor dbeta("dbeta", { H }, wtype);
Tensor workspace_fwd, workspace_bwd;
fillUniform(&input);
@@ -226,7 +230,7 @@ void performTest(const size_t N, const size_t H, const bool zero_centered_gamma,
nvte_layernorm_fwd(input.data(), gamma.data(), beta.data(), epsilon,
z.data(), mu.data(), rsigma.data(), workspace_fwd.data(),
prop.multiProcessorCount, zero_centered_gamma, 0);
workspace_fwd = Tensor(workspace_fwd.shape(), workspace_fwd.dtype());
workspace_fwd = Tensor("workspace", workspace_fwd.rowwise_shape(), workspace_fwd.dtype());
nvte_layernorm_fwd(input.data(), gamma.data(), beta.data(), epsilon,
z.data(), mu.data(), rsigma.data(), workspace_fwd.data(),
prop.multiProcessorCount, zero_centered_gamma, 0);
@@ -236,7 +240,7 @@ void performTest(const size_t N, const size_t H, const bool zero_centered_gamma,
dx.data(), dgamma.data(), dbeta.data(),
workspace_bwd.data(),
prop.multiProcessorCount, zero_centered_gamma, 0);
workspace_bwd = Tensor(workspace_bwd.shape(), workspace_bwd.dtype());
workspace_bwd = Tensor("workspace", workspace_bwd.rowwise_shape(), workspace_bwd.dtype());
nvte_layernorm_bwd(dz.data(), input.data(),
mu.data(), rsigma.data(), gamma.data(),
dx.data(), dgamma.data(), dbeta.data(),
@@ -246,7 +250,7 @@ void performTest(const size_t N, const size_t H, const bool zero_centered_gamma,
nvte_rmsnorm_fwd(input.data(), gamma.data(), epsilon,
z.data(), rsigma.data(), workspace_fwd.data(),
prop.multiProcessorCount, zero_centered_gamma, 0);
workspace_fwd = Tensor(workspace_fwd.shape(), workspace_fwd.dtype());
workspace_fwd = Tensor("workspace", workspace_fwd.rowwise_shape(), workspace_fwd.dtype());
nvte_rmsnorm_fwd(input.data(), gamma.data(), epsilon,
z.data(), rsigma.data(), workspace_fwd.data(),
prop.multiProcessorCount, zero_centered_gamma, 0);
@@ -255,7 +259,7 @@ void performTest(const size_t N, const size_t H, const bool zero_centered_gamma,
dx.data(), dgamma.data(),
workspace_bwd.data(),
prop.multiProcessorCount, zero_centered_gamma, 0);
workspace_bwd = Tensor(workspace_bwd.shape(), workspace_bwd.dtype());
workspace_bwd = Tensor("workspace", workspace_bwd.rowwise_shape(), workspace_bwd.dtype());
nvte_rmsnorm_bwd(dz.data(), input.data(), rsigma.data(), gamma.data(),
dx.data(), dgamma.data(),
workspace_bwd.data(),
@@ -272,23 +276,24 @@ void performTest(const size_t N, const size_t H, const bool zero_centered_gamma,
mu.to_cpu();
rsigma.to_cpu();
float ref_amax;
compute_ref_stats(norm_type, input.cpu_dptr<InputType>(), ref_mu.get(),
compute_ref_stats(norm_type, input.rowwise_cpu_dptr<InputType>(), ref_mu.get(),
ref_rsigma.get(), N, H, epsilon);
float ref_scale = isFp8Type(otype) ? z.scale() : 1.f;
compute_ref_output(norm_type, input.cpu_dptr<InputType>(),
gamma.cpu_dptr<WeightType>(),
beta.cpu_dptr<WeightType>(),
compute_ref_output(norm_type, input.rowwise_cpu_dptr<InputType>(),
gamma.rowwise_cpu_dptr<WeightType>(),
beta.rowwise_cpu_dptr<WeightType>(),
ref_output.get(),
mu.cpu_dptr<float>(),
rsigma.cpu_dptr<float>(),
mu.rowwise_cpu_dptr<float>(),
rsigma.rowwise_cpu_dptr<float>(),
N, H,
&ref_amax,
ref_scale,
zero_centered_gamma,
use_cudnn);
compute_ref_backward(norm_type, dz.cpu_dptr<WeightType>(), input.cpu_dptr<InputType>(),
mu.cpu_dptr<float>(), rsigma.cpu_dptr<float>(),
gamma.cpu_dptr<WeightType>(),
compute_ref_backward(norm_type, dz.rowwise_cpu_dptr<WeightType>(),
input.rowwise_cpu_dptr<InputType>(),
mu.rowwise_cpu_dptr<float>(), rsigma.rowwise_cpu_dptr<float>(),
gamma.rowwise_cpu_dptr<WeightType>(),
ref_dx.get(), ref_dgamma.get(), ref_dbeta.get(),
N, H, zero_centered_gamma,
use_cudnn);
@@ -301,25 +306,25 @@ void performTest(const size_t N, const size_t H, const bool zero_centered_gamma,
if (isFp8Type(otype)) {
compareResults("amax", z.amax(), ref_amax, atol_amax, rtol_amax);
float ref_scale_inv = 1.f / z.scale();
compareResults("scale_inv", z.scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
compareResults("scale_inv", z.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
}
auto [atol_stats, rtol_stats] = getTolerances(DType::kFloat32);
rtol_stats = 5e-5;
compareResults("mu", mu, ref_mu.get(), atol_stats, rtol_stats);
compareResults("rsigma", rsigma, ref_rsigma.get(), atol_stats, rtol_stats);
compareResults("mu", mu, ref_mu.get(), true, atol_stats, rtol_stats);
compareResults("rsigma", rsigma, ref_rsigma.get(), true, atol_stats, rtol_stats);
auto [atol, rtol] = getTolerances(otype);
if (otype == DType::kFloat32) {
atol = 5e-7;
}
compareResults("output", z, ref_output.get(), atol, rtol);
compareResults("output", z, ref_output.get(), true, atol, rtol);
double atol_bwd = 5e-4;
double rtol_bwd = 5e-4;
compareResults("dx", dx, ref_dx.get(), atol_bwd, rtol_bwd);
compareResults("dgamma", dgamma, ref_dgamma.get(), atol_bwd, rtol_bwd);
compareResults("dbeta", dbeta, ref_dbeta.get(), atol_bwd, rtol_bwd);
compareResults("dx", dx, ref_dx.get(), true, atol_bwd, rtol_bwd);
compareResults("dgamma", dgamma, ref_dgamma.get(), true, atol_bwd, rtol_bwd);
compareResults("dbeta", dbeta, ref_dbeta.get(), true, atol_bwd, rtol_bwd);
}
std::vector<std::pair<size_t, size_t>> test_cases = {
@@ -357,24 +362,24 @@ TEST_P(NormTestSuite, TestNorm) {
}
INSTANTIATE_TEST_SUITE_P(
OperatorTest,
NormTestSuite,
::testing::Combine(
::testing::Values(false), //TODO: enabling tests for cudnn backend
::testing::Values(NormType::LayerNorm, NormType::RMSNorm),
::testing::Values(DType::kFloat32, DType::kBFloat16, DType::kFloat16),
::testing::Values(DType::kFloat32, DType::kBFloat16, DType::kFloat16, DType::kFloat8E4M3),
::testing::ValuesIn(test_cases),
::testing::Values(false, true)),
[](const testing::TestParamInfo<NormTestSuite::ParamType>& info) {
OperatorTest,
NormTestSuite,
::testing::Combine(
::testing::Values(true, false),
::testing::Values(NormType::LayerNorm, NormType::RMSNorm),
::testing::Values(DType::kFloat32, DType::kBFloat16, DType::kFloat16),
::testing::Values(DType::kFloat32, DType::kBFloat16, DType::kFloat16, DType::kFloat8E4M3),
::testing::ValuesIn(test_cases),
::testing::Values(false, true)),
[](const testing::TestParamInfo<NormTestSuite::ParamType>& info) {
auto backend = std::get<0>(info.param) == false ? "Te" : "Cudnn";
std::string name =
backend +
normToString.at(std::get<1>(info.param)) + "_" +
test::typeName(std::get<2>(info.param)) + "X" +
test::typeName(std::get<3>(info.param)) + "X" +
std::to_string(std::get<4>(info.param).first) + "X" +
std::to_string(std::get<4>(info.param).second) + "X" +
std::to_string(std::get<5>(info.param));
return name;
});
std::string name =
backend +
normToString.at(std::get<1>(info.param)) + "_" +
test::typeName(std::get<2>(info.param)) + "X" +
test::typeName(std::get<3>(info.param)) + "X" +
std::to_string(std::get<4>(info.param).first) + "X" +
std::to_string(std::get<4>(info.param).second) + "X" +
std::to_string(std::get<5>(info.param));
return name;
});
/*************************************************************************
* Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See LICENSE for license information.
************************************************************************/
#include <cmath>
#include <cstring>
#include <memory>
#include <map>
#include <iomanip>
#include <iostream>
#include <random>
#include <cuda_bf16.h>
#include <cuda_runtime.h>
#include <gtest/gtest.h>
#include <transformer_engine/normalization.h>
#include <transformer_engine/transformer_engine.h>
#include "../test_common.h"
using namespace transformer_engine;
using namespace test;
namespace {
using fp8e8m0 = byte;
enum NormType {
LayerNorm,
RMSNorm
};
std::map<NormType, std::string> normToString = {
{NormType::LayerNorm, "LayerNorm"},
{NormType::RMSNorm, "RMSNorm"}
};
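// CPU reference dequantization of MXFP8 data: walks the tensor tile by tile and applies the
// per-block E8M0 scale. Note that scaling_mode_x maps to the Y block size and scaling_mode_y
// to the X block size.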
template <typename InputType, typename ScaleType, typename OutputType>
void dequantize_1x_kernel(InputType* input_ptr, ScaleType* scale_ptr, OutputType* output_ptr,
size_t rows, size_t cols, size_t scaling_mode_x, size_t scaling_mode_y){
const size_t block_size_Y = scaling_mode_x; // mind the mapping Y <-- x
const size_t block_size_X = scaling_mode_y; // and X <-- y
const size_t tile_size_Y = std::max(32lu, block_size_Y);
const size_t tile_size_X = std::max(64lu, block_size_X);
const size_t tiles_num_Y = (rows + tile_size_Y - 1) / tile_size_Y;
const size_t tiles_num_X = (cols + tile_size_X - 1) / tile_size_X;
const size_t blocks_per_tile_Y = tile_size_Y / block_size_Y;
const size_t blocks_per_tile_X = tile_size_X / block_size_X;
const size_t blocks_per_row = (cols + block_size_X - 1) / block_size_X;
#pragma omp parallel for proc_bind(spread) schedule(static)
for (size_t t = 0; t < tiles_num_Y * tiles_num_X; ++t) {
const size_t tile_Y = t / tiles_num_X;
const size_t tile_X = t % tiles_num_X;
const size_t tile_offset_Y = tile_Y * tile_size_Y;
const size_t tile_offset_X = tile_X * tile_size_X;
for (size_t ii = 0; ii < blocks_per_tile_Y; ++ii) {
const size_t block_idx_Y = tile_Y * blocks_per_tile_Y + ii;
const size_t block_offset_Y = ii * block_size_Y;
const size_t i_min = tile_offset_Y + block_offset_Y;
const size_t i_max = std::min(i_min + block_size_Y, rows);
for (size_t jj = 0; jj < blocks_per_tile_X; ++jj) {
const size_t block_idx_X = tile_X * blocks_per_tile_X + jj;
const size_t block_offset_X = jj * block_size_X;
const size_t j_min = tile_offset_X + block_offset_X;
const size_t j_max = std::min(j_min + block_size_X, cols);
const size_t mx_scale_idx = block_idx_Y * blocks_per_row + block_idx_X;
// TODO: padded SFs i.e. (4,128)
const float scale_inv = exp2f(static_cast<float>(scale_ptr[mx_scale_idx]) - FP32_EXPONENT_BIAS);
for (size_t i = i_min; i < i_max; ++i) {
for (size_t j = j_min; j < j_max; ++j) {
const size_t idx = i * cols + j;
const float elem = static_cast<float>(input_ptr[idx]);
output_ptr[idx] = static_cast<float>(elem * scale_inv);
}
}
}
}
}
}
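// Dequantizes both usages of an MXFP8 tensor: always the rowwise (1x32) data, and additionally
// the columnwise (32x1) data when is_training is set.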
template <typename InputType, typename ScaleType>
void dequantize_2x(Tensor& input, Tensor& output, bool is_training)
{
input.to_cpu();
auto scaling_mode = input.scaling_mode();
assert(input.rowwise_shape().ndim == 2);
assert(input.columnwise_shape().ndim == 2);
dequantize_1x_kernel(input.rowwise_cpu_dptr<InputType>(),
input.rowwise_cpu_scale_inv_ptr<ScaleType>(),
output.rowwise_cpu_dptr<float>(),
input.rowwise_shape().data[0], input.rowwise_shape().data[1],
1, 32);
if (is_training)
dequantize_1x_kernel(input.columnwise_cpu_dptr<InputType>(),
input.columnwise_cpu_scale_inv_ptr<ScaleType>(),
output.columnwise_cpu_dptr<float>(),
input.columnwise_shape().data[0], input.columnwise_shape().data[1],
32, 1);
}
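// Reference statistics: per-row mean (LayerNorm only) and reciprocal standard deviation.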
template <typename InputType>
void compute_ref_stats(NormType norm_type,
const InputType *data, float *mu, float *rsigma,
const size_t N, const size_t H, const double epsilon){
using compute_t = float;
#pragma omp parallel for proc_bind(spread)
for (size_t i = 0; i < N; ++i) {
compute_t sum = 0;
for (size_t j = 0; j < H; ++j) {
sum += static_cast<compute_t>(data[i * H + j]);
}
compute_t m;
if (norm_type == LayerNorm){
mu[i] = sum / H;
m = mu[i];
} else { m = 0;}
compute_t sum_sq = 0;
for (size_t j = 0; j < H; ++j) {
compute_t current = static_cast<compute_t>(data[i * H + j]);
sum_sq += (current - m) * (current - m);
}
rsigma[i] = rsqrtf((sum_sq / H) + epsilon);
}
}
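// Reference normalization output: (x - mu) * rsigma * gamma + beta for LayerNorm and
// x * rsigma * gamma for RMSNorm, with gamma shifted by 1 when zero_centered_gamma is set.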
template <typename InputType, typename OutputType>
void compute_ref_output(NormType norm_type,
const InputType *data, const InputType *gamma, const InputType *beta,
const float *mu, const float *rsigma,
const size_t N, const size_t H,
OutputType* output,
const bool zero_centered_gamma){
using compute_t = float;
#pragma omp parallel for proc_bind(spread)
for (size_t i = 0; i < N; ++i) {
for (size_t j = 0; j < H; ++j) {
compute_t current = static_cast<compute_t>(data[i * H + j]);
compute_t g = static_cast<compute_t>(gamma[j]);
if (zero_centered_gamma) {
g += 1.0;
}
compute_t tmp;
if (norm_type == LayerNorm) {
tmp = (current - mu[i]) * rsigma[i] * g + static_cast<compute_t>(beta[j]);
} else { // RMSNorm
tmp = current * rsigma[i] * g;
}
output[i * H + j] = tmp;
}
}
}
template <typename InputType, typename OutputType>
void performTest(const size_t N, const size_t H, const bool zero_centered_gamma, NormType norm_type, bool is_training) {
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
if (getDeviceComputeCapability() < blackwellComputeCapability) {
GTEST_SKIP();
}
using WeightType = InputType;
DType itype = TypeInfo<InputType>::dtype;
DType wtype = TypeInfo<WeightType>::dtype;
DType otype = TypeInfo<OutputType>::dtype;
Tensor input("input", { N, H }, itype);
Tensor z("z", { N, H }, otype, true, is_training, NVTE_MXFP8_1D_SCALING);
Tensor gamma("gamma", { H }, wtype);
Tensor beta("beta", { H }, wtype);
Tensor mu("mu", { N }, DType::kFloat32);
Tensor rsigma("rsigma", { N }, DType::kFloat32);
Tensor workspace;
fillUniform(&input);
fillUniform(&gamma);
fillUniform(&beta);
// Forward kernel
float epsilon = 1e-5;
if (norm_type == NormType::LayerNorm){
nvte_layernorm_fwd(input.data(), gamma.data(), beta.data(), epsilon,
z.data(), mu.data(), rsigma.data(), workspace.data(),
prop.multiProcessorCount, zero_centered_gamma,
0);
workspace = Tensor("workspace", workspace.rowwise_shape(), workspace.dtype());
nvte_layernorm_fwd(input.data(), gamma.data(), beta.data(), epsilon,
z.data(), mu.data(), rsigma.data(), workspace.data(),
prop.multiProcessorCount, zero_centered_gamma,
0);
} else {
nvte_rmsnorm_fwd(input.data(), gamma.data(), epsilon,
z.data(), rsigma.data(), workspace.data(),
prop.multiProcessorCount, zero_centered_gamma,
0);
workspace = Tensor("workspace", workspace.rowwise_shape(), workspace.dtype());
nvte_rmsnorm_fwd(input.data(), gamma.data(), epsilon,
z.data(), rsigma.data(), workspace.data(),
prop.multiProcessorCount, zero_centered_gamma,
0);
}
Tensor dequantized_output("dequantized_output", { N, H }, DType::kFloat32, true, true);
dequantize_2x<OutputType, fp8e8m0>(z, dequantized_output, is_training);
// Reference implementations
std::unique_ptr<float[]> ref_mu = std::make_unique<float[]>(N);
std::unique_ptr<float[]> ref_rsigma = std::make_unique<float[]>(N);
std::unique_ptr<float[]> ref_output = std::make_unique<float[]>(N * H);
compute_ref_stats(norm_type, input.rowwise_cpu_dptr<InputType>(), ref_mu.get(),
ref_rsigma.get(), N, H, epsilon);
// use the GPU stats to tighten the tolerances
float *ref_mu_ptr, *ref_rsigma_ptr;
if (is_training){
mu.to_cpu();
rsigma.to_cpu();
ref_mu_ptr = mu.rowwise_cpu_dptr<float>();
ref_rsigma_ptr = rsigma.rowwise_cpu_dptr<float>();
} else {
ref_mu_ptr = ref_mu.get();
ref_rsigma_ptr = ref_rsigma.get();
}
compute_ref_output(norm_type, input.rowwise_cpu_dptr<InputType>(),
gamma.rowwise_cpu_dptr<WeightType>(),
beta.rowwise_cpu_dptr<WeightType>(),
ref_mu_ptr,
ref_rsigma_ptr,
N, H,
ref_output.get(),
zero_centered_gamma);
cudaDeviceSynchronize();
auto err = cudaGetLastError();
ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err);
auto [atol_stats, rtol_stats] = getTolerances(DType::kFloat32);
rtol_stats = 5e-5;
if (is_training){
compareResults("mu", mu, ref_mu.get(), true, atol_stats, rtol_stats);
compareResults("rsigma", rsigma, ref_rsigma.get(), true, atol_stats, rtol_stats);
}
float atol, rtol;
if (otype == DType::kFloat8E5M2){
atol = 1.25e-1;
rtol = 1.25e-1;
} else if (otype == DType::kFloat8E4M3){
if (itype == DType::kBFloat16){
atol = 7e-2;
rtol = 7e-2;
} else {
atol = 6.25e-2;
rtol = 6.25e-2;
}
}
compareResults("output_rowwise", dequantized_output, ref_output.get(), true, atol, rtol, false);
if (is_training)
compareResults("output_colwise", dequantized_output, ref_output.get(), false, atol, rtol, false);
}
std::vector<std::pair<size_t, size_t>> test_cases = {
{32, 32},
{768, 2304},
{2048, 12288},
};
std::vector<NormType> norms = {
NormType::LayerNorm,
NormType::RMSNorm
};
} // namespace
class MxNormTestSuite : public ::testing::TestWithParam< std::tuple<NormType,
transformer_engine::DType,
transformer_engine::DType,
std::pair<size_t, size_t>,
bool, bool>> {};
TEST_P(MxNormTestSuite, TestMxNorm) {
using namespace transformer_engine;
using namespace test;
const NormType norm_type = std::get<0>(GetParam());
const DType input_type = std::get<1>(GetParam());
const DType output_type = std::get<2>(GetParam());
const auto size = std::get<3>(GetParam());
const bool zero_centered_gamma = std::get<4>(GetParam());
const bool is_training = std::get<5>(GetParam());
TRANSFORMER_ENGINE_TYPE_SWITCH_FP16_FP32_ONLY(input_type, InputType,
TRANSFORMER_ENGINE_TYPE_SWITCH_FP8_ONLY(output_type, OutputType,
performTest<InputType, OutputType>(size.first, size.second, zero_centered_gamma, norm_type, is_training);
);
);
}
INSTANTIATE_TEST_SUITE_P(
OperatorTest,
MxNormTestSuite,
::testing::Combine(
::testing::Values(NormType::LayerNorm, NormType::RMSNorm),
::testing::Values(DType::kFloat32, DType::kBFloat16, DType::kFloat16),
::testing::Values(DType::kFloat8E5M2, DType::kFloat8E4M3),
::testing::ValuesIn(test_cases),
::testing::Values(true, false),
::testing::Values(true, false)),
[](const testing::TestParamInfo<MxNormTestSuite::ParamType>& info) {
std::string name = normToString.at(std::get<0>(info.param)) + "_" +
test::typeName(std::get<1>(info.param)) + "X" +
test::typeName(std::get<2>(info.param)) + "X" +
std::to_string(std::get<3>(info.param).first) + "X" +
std::to_string(std::get<3>(info.param).second) + "X" +
std::to_string(std::get<4>(info.param)) + "out" +
std::to_string(int(std::get<5>(info.param)) + 1) + "x";
return name;
});
@@ -58,18 +58,18 @@ void performTestQ(const size_t N) {
DType itype = TypeInfo<InputType>::dtype;
DType otype = TypeInfo<OutputType>::dtype;
Tensor input({ N }, itype);
Tensor output({ N }, otype);
Tensor input("input", { N }, itype);
Tensor output("output", { N }, otype);
std::unique_ptr<OutputType[]> ref_output = std::make_unique<OutputType[]>(N);
fillUniform(&input);
setRandomScale(&output);
nvte_fp8_quantize(input.data(), output.data(), 0);
nvte_quantize(input.data(), output.data(), 0);
float ref_amax;
compute_ref_q<InputType, OutputType>(input.cpu_dptr<InputType>(), ref_output.get(),
compute_ref_q<InputType, OutputType>(input.rowwise_cpu_dptr<InputType>(), ref_output.get(),
N, &ref_amax, output.scale());
cudaDeviceSynchronize();
@@ -79,7 +79,7 @@ void performTestQ(const size_t N) {
auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
compareResults("amax", output.amax(), ref_amax, atol_amax, rtol_amax);
auto [atol, rtol] = getTolerances(otype);
compareResults("output_q", output, ref_output.get(), atol, rtol);
compareResults("output_q", output, ref_output.get(), true, atol, rtol);
}
template <typename InputType, typename OutputType>
@@ -89,24 +89,24 @@ void performTestDQ(const size_t N) {
DType itype = TypeInfo<InputType>::dtype;
DType otype = TypeInfo<OutputType>::dtype;
Tensor input({ N }, itype);
Tensor output({ N }, otype);
Tensor input("input", { N }, itype);
Tensor output("output", { N }, otype);
std::unique_ptr<OutputType[]> ref_output = std::make_unique<OutputType[]>(N);
fillUniform(&input);
nvte_fp8_dequantize(input.data(), output.data(), 0);
nvte_dequantize(input.data(), output.data(), 0);
compute_ref_dq<InputType, OutputType>(input.cpu_dptr<InputType>(), ref_output.get(),
N, input.scale_inv());
compute_ref_dq<InputType, OutputType>(input.rowwise_cpu_dptr<InputType>(), ref_output.get(),
N, input.rowwise_scale_inv());
cudaDeviceSynchronize();
auto err = cudaGetLastError();
ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err);
auto [atol, rtol] = getTolerances(otype);
compareResults("output_dq", output, ref_output.get(), atol, rtol);
compareResults("output_dq", output, ref_output.get(), true, atol, rtol);
}
std::vector<size_t> qdq_test_cases = {2048* 12288,
......
/*************************************************************************
* Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See LICENSE for license information.
************************************************************************/
#include <cmath>
#include <cstdint>
#include <cstring>
#include <memory>
#include <iomanip>
#include <iostream>
#include <random>
#include <type_traits>
#include <cuda_bf16.h>
#include <cuda_runtime.h>
#include <gtest/gtest.h>
#include <transformer_engine/swizzle.h>
#include "../test_common.h"
#include "transformer_engine/transformer_engine.h"
using namespace transformer_engine;
constexpr int MAT_TILE_DIM_M = 128;
constexpr int MAT_TILE_DIM_K = 128;
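// CPU reference for the scaling-factor swizzle: each SF_TILE_DIM_M x SF_TILE_DIM_K tile of
// scale factors is rearranged into an (SF_TILE_DIM_M / 4) x (SF_TILE_DIM_K * 4) layout;
// when row_scaling is false the input is read column-major.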
template <int SF_TILE_DIM_M, int SF_TILE_DIM_K, bool row_scaling>
void compute_ref_swizzle(const uint8_t *h_input, uint8_t *h_output,
const size_t M, const size_t K) {
constexpr int NEW_SF_TILE_DIM_M = SF_TILE_DIM_M / 4;
constexpr int NEW_SF_TILE_DIM_K = SF_TILE_DIM_K * 4;
constexpr int SF_TILE_SIZE = SF_TILE_DIM_M * SF_TILE_DIM_K;
for (int m = 0; m < M; m++) {
for (int k = 0; k < K; k++) {
int tile_id_m = m / SF_TILE_DIM_M;
int tile_id_k = k / SF_TILE_DIM_K;
int m_in_tile = m % SF_TILE_DIM_M;
int k_in_tile = k % SF_TILE_DIM_K;
int row_in_new_tile = m_in_tile % NEW_SF_TILE_DIM_M;
int col_in_new_tile = m_in_tile / NEW_SF_TILE_DIM_M * SF_TILE_DIM_K + k_in_tile;
int tile_output_ptr = tile_id_m * SF_TILE_DIM_M * K + tile_id_k * SF_TILE_SIZE;
int out_index = tile_output_ptr + row_in_new_tile * NEW_SF_TILE_DIM_K + col_in_new_tile;
if constexpr(row_scaling)
h_output[out_index] = h_input[k + m * K];
else
h_output[out_index] = h_input[k * M + m];
}
}
}
void performTestSwizzle1D(const int num_tiles_M, const int num_tiles_K, bool rowwise, bool columnwise, const bool transa) {
using namespace test;
int SF_MODE_X, SF_MODE_Y;
if (rowwise) {
SF_MODE_X = 1;
SF_MODE_Y = 32;
}
if (columnwise) {
SF_MODE_X = 32;
SF_MODE_Y = 1;
}
if ((rowwise && columnwise) || !(rowwise || columnwise)){
GTEST_SKIP() << "TEST SKIPPED, The scaling mode " + std::to_string(SF_MODE_X) + "x" +
std::to_string(SF_MODE_Y) + "is not implemented.";
}
DType dtype = DType::kFloat8E4M3;
const size_t M = num_tiles_M * MAT_TILE_DIM_M;
const size_t K = num_tiles_K * MAT_TILE_DIM_K;
const auto data_shape = transa ? std::vector<size_t>{M, K} : std::vector<size_t>{K, M};
const auto scale_shape = std::vector<size_t>{data_shape[0] / SF_MODE_X, data_shape[1] /SF_MODE_Y};
std::vector<int> scaling_mode = {SF_MODE_X, SF_MODE_Y, 0};
Tensor input("input", data_shape, dtype, rowwise, columnwise, NVTE_MXFP8_1D_SCALING);
Tensor output("output", data_shape, dtype, rowwise, columnwise, NVTE_MXFP8_1D_SCALING);
fillUniform(&input);
std::unique_ptr<uint8_t[]> ref_output = std::make_unique<uint8_t[]>(scale_shape[0] * scale_shape[1]);
nvte_swizzle_scaling_factors(input.data(), output.data(), 0);
if (rowwise)
compute_ref_swizzle<128, 4, true>(input.rowwise_cpu_scale_inv_ptr<uint8_t>(), ref_output.get(), scale_shape[0], scale_shape[1]);
else
compute_ref_swizzle<128, 4, false>(input.columnwise_cpu_scale_inv_ptr<uint8_t>(), ref_output.get(), scale_shape[1], scale_shape[0]);
cudaDeviceSynchronize();
auto err = cudaGetLastError();
ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err);
output.to_cpu();
if (rowwise) {
compareResults("output_swizzle", output.rowwise_cpu_scale_inv_ptr<uint8_t>(), ref_output.get(), scale_shape[0] * scale_shape[1]);
} else {
compareResults("output_swizzle", output.columnwise_cpu_scale_inv_ptr<uint8_t>(), ref_output.get(), scale_shape[0] * scale_shape[1]);
}
}
class SwizzleTestSuite : public ::testing::TestWithParam<std::tuple<std::pair<int, int>, std::pair<bool, bool>, bool>> {};
TEST_P(SwizzleTestSuite, TestSwizzle) {
using namespace transformer_engine;
using namespace test;
const auto num_tiles = std::get<0>(GetParam());
const auto scaling_mode = std::get<1>(GetParam());
const auto transa = std::get<2>(GetParam());
performTestSwizzle1D(num_tiles.first, num_tiles.second,
scaling_mode.first, scaling_mode.second,
transa);
}
namespace {
std::vector<std::pair<int, int>> num_tiles = {
{1, 1},
{1, 132},
{132, 1},
{65, 256},
{65, 257},
{65, 258},
{65, 259},
};
std::vector<std::pair<bool, bool>> scaling_mode = {
{true, false},
{false, true}
};
std::vector<bool> transa = {true, false};
} // namespace
INSTANTIATE_TEST_SUITE_P(
OperatorTest,
SwizzleTestSuite,
::testing::Combine(
::testing::ValuesIn(num_tiles),
::testing::ValuesIn(scaling_mode),
::testing::ValuesIn(transa)
),
[](const testing::TestParamInfo<SwizzleTestSuite::ParamType>& info) {
std::string name = "ntiles" +
std::to_string(std::get<0>(info.param).first) + "X" +
std::to_string(std::get<0>(info.param).second) + "smode" +
std::to_string(std::get<1>(info.param).first) + "X"+
std::to_string(std::get<1>(info.param).second) + "trans" +
std::to_string(std::get<2>(info.param));
return name;
});
@@ -37,8 +37,8 @@ void performTest(const size_t N, const size_t H) {
DType dtype = TypeInfo<Type>::dtype;
Tensor input({ N, H }, dtype);
Tensor output({ H, N }, dtype);
Tensor input("input", { N, H }, dtype);
Tensor output("output", { H, N }, dtype);
std::unique_ptr<Type[]> ref_output = std::make_unique<Type[]>(N * H);
@@ -46,13 +46,13 @@ void performTest(const size_t N, const size_t H) {
nvte_transpose(input.data(), output.data(), 0);
compute_ref<Type>(input.cpu_dptr<Type>(), ref_output.get(), N, H);
compute_ref<Type>(input.rowwise_cpu_dptr<Type>(), ref_output.get(), N, H);
cudaDeviceSynchronize();
auto err = cudaGetLastError();
ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err);
auto [atol, rtol] = getTolerances(dtype);
compareResults("output", output, ref_output.get(), atol, rtol);
compareResults("output", output, ref_output.get(), true, atol, rtol);
}
std::vector<std::pair<size_t, size_t>> test_cases = {{2048, 12288},
......
@@ -10,14 +10,24 @@
#include <algorithm>
#include <memory>
#include <random>
#include <cassert>
#include <cmath>
#include <string>
#include <gtest/gtest.h>
#include <omp.h>
#include <transformer_engine/transformer_engine.h>
#include "util/logging.h"
namespace test {
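// Hashes the current GTest test name together with the tensor name to derive a deterministic
// per-tensor seed for random data generation.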
size_t create_seed_from_tensor_name(const std::string& tensor_name) {
auto full_name = std::string(testing::UnitTest::GetInstance()->current_test_info()->name()) +
"/" + tensor_name;
return std::hash<std::string>{}(full_name);
}
std::vector<DType> all_fp_types = {DType::kFloat32,
DType::kFloat16,
DType::kBFloat16,
......@@ -50,102 +60,379 @@ const std::string &typeName(DType type) {
{DType::kFloat16, "float16"},
{DType::kBFloat16, "bfloat16"},
{DType::kFloat8E4M3, "float8e4m3"},
{DType::kFloat8E5M2, "float8e5m2"}};
{DType::kFloat8E5M2, "float8e5m2"},
{DType::kFloat8E8M0, "float8e8m0"}};
return name_map.at(type);
}
size_t product(const NVTEShape &shape) {
const std::string& caseName(InputsFillCase type) {
static const std::unordered_map<InputsFillCase, std::string> name_map = {
{InputsFillCase::uniform, "uniform"},
{InputsFillCase::zeros, "zeros"},
{InputsFillCase::zero_to_minNorm, "zero_to_minNorm"},
{InputsFillCase::minNorm_to_maxNorm, "minNorm_to_maxNorm"},
{InputsFillCase::maxNorm_to_inf, "maxNorm_to_inf"}};
return name_map.at(type);
}
size_t product(const NVTEShape &shape, size_t begin, size_t end) {
size_t ret = 1;
for (size_t i = 0; i < shape.ndim; ++i) {
NVTE_CHECK(end <= shape.ndim);
for (size_t i = begin; i < end; ++i) {
ret *= shape.data[i];
}
return ret;
}
size_t product(const NVTEShape &shape) {
return product(shape, 0, shape.ndim);
}
size_t product(const std::vector<size_t> shape, size_t begin, size_t end) {
size_t ret = 1;
NVTE_CHECK(end <= shape.size());
for (size_t i = begin; i < end; ++i) {
ret *= shape[i];
}
return ret;
}
Tensor::Tensor(const NVTEShape &shape, const DType type) {
size_t s = typeToSize(type);
size_t total_size = product(shape) * s;
void *dptr = nullptr;
cpu_data_ = nullptr;
amax_cpu_data_ = nullptr;
scale_cpu_data_ = nullptr;
scale_inv_cpu_data_ = nullptr;
float *amax = nullptr, *scale = nullptr, *scale_inv = nullptr;
if (total_size != 0) {
cudaMalloc((void**)&dptr, total_size); // NOLINT(*)
cudaMemset(dptr, 0, total_size);
cpu_data_ = std::make_unique<unsigned char[]>(total_size);
for (size_t i = 0; i < total_size; ++i) {
cpu_data_[i] = 0;
}
size_t product(const std::vector<size_t>& shape) {
return product(shape, 0, shape.size());
}
size_t DIVUP(const size_t &x, const size_t &y){
return (((x) + ((y)-1)) / (y));
}
inline bool is_tensor_scaling(const NVTEScalingMode &mode) {
return mode == NVTE_DELAYED_TENSOR_SCALING;
}
struct scale_inv_meta {
std::vector<size_t> shape;
DType type;
size_t type_size;
};
NVTEShape convertShape(const std::vector<size_t>& shape) {
return {shape.data(), shape.size()};
}
std::pair<scale_inv_meta, scale_inv_meta> get_scales(const NVTEShape& shape,
const NVTEScalingMode scaling_mode) {
if (scaling_mode == NVTE_DELAYED_TENSOR_SCALING) {
scale_inv_meta ret;
ret.shape = {1};
ret.type = DType::kFloat32;
ret.type_size = sizeof(float);
return {ret, ret};
}
if (scaling_mode == NVTE_MXFP8_1D_SCALING) {
std::vector<size_t> shape_vec;
for (size_t i = 0; i < shape.ndim; ++i) {
shape_vec.push_back(shape.data[i]);
}
size_t first_dim = first_dimension(shape_vec);
size_t last_dim = last_dimension(shape_vec);
scale_inv_meta ret_rowwise, ret_colwise;
auto block_alignment = std::vector<size_t>{128ul,4ul};
{
auto alignment = block_alignment[0];
auto scale_dim_0 = DIVUP(DIVUP(first_dim,
static_cast<size_t>(1)),
alignment) * alignment;
alignment = block_alignment[1];
auto scale_dim_1 = DIVUP(DIVUP(last_dim,
static_cast<size_t>(32)),
alignment) * alignment;
ret_rowwise.shape = {scale_dim_0, scale_dim_1};
}
{
auto alignment = block_alignment[1];
auto scale_dim_0 = DIVUP(DIVUP(first_dim,
static_cast<size_t>(32)),
alignment) * alignment;
alignment = block_alignment[0];
auto scale_dim_1 = DIVUP(DIVUP(last_dim,
static_cast<size_t>(1)),
alignment) * alignment;
ret_colwise.shape = {scale_dim_0, scale_dim_1};
}
    ret_rowwise.type = DType::kFloat8E8M0;
    ret_colwise.type = DType::kFloat8E8M0;
    ret_rowwise.type_size = sizeof(uint8_t);
    ret_colwise.type_size = sizeof(uint8_t);
    return {ret_rowwise, ret_colwise};
  }
NVTE_ERROR("Invalid scaling mode!");
}
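A worked example of the MXFP8 padding above, using an arbitrary 257 x 100 input:
  // Rowwise scales: one per 32 columns, padded to the [128, 4] grid:
  //   DIVUP(257, 1) = 257 -> padded to 384 rows; DIVUP(100, 32) = 4 -> already a multiple of 4, so 4 columns.
  // Columnwise scales: one per 32 rows, padded to the [4, 128] grid:
  //   DIVUP(257, 32) = 9 -> padded to 12 rows; DIVUP(100, 1) = 100 -> padded to 128 columns.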
Tensor::Tensor(const std::string& name,
const NVTEShape &shape, const DType type,
const bool rowwise, const bool columnwise,
const NVTEScalingMode &scaling_mode) {
name_ = name;
const size_t seed = create_seed_from_tensor_name(name);
gen_.seed(seed);
rowwise_ = rowwise;
columnwise_ = columnwise;
size_t s = typeToSize(type);
size_t total_size = product(shape) * s;
void *dptr_rowwise = nullptr;
void *dptr_columnwise = nullptr;
cpu_data_rowwise_ = nullptr;
cpu_data_columnwise_ = nullptr;
amax_cpu_data_ = nullptr;
scale_cpu_data_ = nullptr;
rowwise_scale_inv_cpu_data_ = nullptr;
columnwise_scale_inv_cpu_data_ = nullptr;
float *amax = nullptr, *scale = nullptr;
float *rowwise_scale_inv = nullptr, *columnwise_scale_inv = nullptr;
if (columnwise) {
NVTE_CHECK(shape.ndim >= 2);
}
std::vector<size_t> normalized_shape_v = {product(shape, 0, shape.ndim - 1),
shape.data[shape.ndim - 1]};
NVTEShape normalized_shape = convertShape(normalized_shape_v);
std::vector<size_t> columnwise_shape_vec;
if (scaling_mode == NVTE_DELAYED_TENSOR_SCALING) {
// Transpose when tensor scaling
columnwise_shape_vec.emplace_back(shape.data[shape.ndim - 1]);
for (size_t i = 0; i < shape.ndim - 1; ++i) {
columnwise_shape_vec.emplace_back(shape.data[i]);
}
} else {
// Same shape for MX
for (size_t i = 0; i < shape.ndim; ++i) {
columnwise_shape_vec.emplace_back(shape.data[i]);
}
}
const NVTEShape columnwise_shape{columnwise_shape_vec.data(), columnwise_shape_vec.size()};
tensor_ = TensorWrapper(scaling_mode);
if (total_size != 0) {
if (rowwise) {
cudaMalloc((void**)&dptr_rowwise, total_size); // NOLINT(*)
cudaMemset(dptr_rowwise, 0, total_size);
cpu_data_rowwise_ = std::make_unique<unsigned char[]>(total_size);
std::fill_n(cpu_data_rowwise_.get(), total_size, 0);
}
if (columnwise) {
cudaMalloc((void**)&dptr_columnwise, total_size); // NOLINT(*)
cudaMemset(dptr_columnwise, 0, total_size);
cpu_data_columnwise_ = std::make_unique<unsigned char[]>(total_size);
std::fill_n(cpu_data_columnwise_.get(), total_size, 0);
}
}
tensor_.set_rowwise_data(dptr_rowwise, type, shape);
tensor_.set_columnwise_data(dptr_columnwise, type, columnwise_shape);
if (isFp8Type(type)) {
if (is_tensor_scaling(scaling_mode)) {
cudaMalloc((void**)&amax, sizeof(float)); // NOLINT(*)
cudaMemset(amax, 0, sizeof(float));
cudaMalloc((void**)&scale, sizeof(float)); // NOLINT(*)
cudaMemset(scale, 0, sizeof(float));
cudaMalloc((void**)&scale_inv, sizeof(float)); // NOLINT(*)
cudaMemset(scale_inv, 0, sizeof(float));
amax_cpu_data_ = std::make_shared<float>();
*amax_cpu_data_ = 0;
scale_cpu_data_ = std::make_shared<float>();
*scale_cpu_data_ = 0;
scale_inv_cpu_data_ = std::make_shared<float>();
*scale_inv_cpu_data_ = 0;
amax_cpu_data_ = std::make_shared<float>(0);
scale_cpu_data_ = std::make_shared<float>(0);
tensor_.set_amax(amax, DType::kFloat32, std::vector<size_t>{1});
tensor_.set_scale(scale, DType::kFloat32, std::vector<size_t>{1});
cudaMalloc((void**)&rowwise_scale_inv, sizeof(float)); // NOLINT(*)
if (rowwise) {
tensor_.set_rowwise_scale_inv(rowwise_scale_inv, DType::kFloat32,
std::vector<size_t>{1});
rowwise_scale_inv_cpu_data_ = std::make_unique<unsigned char[]>(sizeof(float));
std::fill_n(rowwise_scale_inv_cpu_data_.get(), sizeof(float), 0);
}
if (columnwise) {
tensor_.set_columnwise_scale_inv(rowwise_scale_inv, DType::kFloat32,
std::vector<size_t>{1});
columnwise_scale_inv_cpu_data_ = std::make_unique<unsigned char[]>(sizeof(float));
std::fill_n(columnwise_scale_inv_cpu_data_.get(), sizeof(float), 0);
}
} else {
auto [rowwise_scale_meta, colwise_scale_meta] = get_scales(normalized_shape,
tensor_.scaling_mode());
auto rowwise_scale_size = product(rowwise_scale_meta.shape) * rowwise_scale_meta.type_size;
auto columnwise_scale_size = product(colwise_scale_meta.shape) * colwise_scale_meta.type_size;
auto scale_shape = rowwise_scale_meta.shape;
auto columnwise_scale_shape = colwise_scale_meta.shape;
if (rowwise) {
cudaMalloc((void**)&rowwise_scale_inv, rowwise_scale_size); // NOLINT(*)
cudaMemset(rowwise_scale_inv, 0, rowwise_scale_size);
rowwise_scale_inv_cpu_data_ = std::make_unique<unsigned char[]>(rowwise_scale_size);
std::fill_n(rowwise_scale_inv_cpu_data_.get(), rowwise_scale_size, 0);
tensor_.set_rowwise_scale_inv(rowwise_scale_inv, DType::kFloat8E8M0, scale_shape);
}
if (columnwise) {
cudaMalloc((void**)&columnwise_scale_inv, columnwise_scale_size); // NOLINT(*)
cudaMemset(columnwise_scale_inv, 0, columnwise_scale_size);
columnwise_scale_inv_cpu_data_ = std::make_unique<unsigned char[]>(columnwise_scale_size);
std::fill_n(columnwise_scale_inv_cpu_data_.get(), columnwise_scale_size, 0);
tensor_.set_columnwise_scale_inv(columnwise_scale_inv, DType::kFloat8E8M0, columnwise_scale_shape);
}
}
tensor_ = TensorWrapper(dptr, shape, type, amax, scale, scale_inv);
}
}
void Tensor::to_cpu() const {
const NVTEShape s = tensor_.shape();
const size_t size = product(s) * typeToSize(tensor_.dtype());
cudaMemcpy(cpu_data_.get(), tensor_.dptr(), size, cudaMemcpyDeviceToHost);
if (rowwise_) {
cudaMemcpy(cpu_data_rowwise_.get(),
tensor_.get_rowwise_data().data_ptr,
size,
cudaMemcpyDeviceToHost);
}
if (columnwise_) {
cudaMemcpy(cpu_data_columnwise_.get(),
tensor_.get_columnwise_data().data_ptr,
size,
cudaMemcpyDeviceToHost);
}
if (isFp8Type(dtype())) {
cudaMemcpy(amax_cpu_data_.get(), tensor_.amax(), sizeof(float),
cudaMemcpyDeviceToHost);
cudaMemcpy(scale_cpu_data_.get(), tensor_.scale(), sizeof(float),
cudaMemcpyDeviceToHost);
cudaMemcpy(scale_inv_cpu_data_.get(), tensor_.scale_inv(), sizeof(float),
cudaMemcpyDeviceToHost);
if (is_tensor_scaling(tensor_.scaling_mode())) {
cudaMemcpy(amax_cpu_data_.get(),
tensor_.amax(),
sizeof(float),
cudaMemcpyDeviceToHost);
cudaMemcpy(scale_cpu_data_.get(),
tensor_.scale(),
sizeof(float),
cudaMemcpyDeviceToHost);
}
auto [rowwise_scale_meta, colwise_scale_meta] = get_scales(s, tensor_.scaling_mode());
if (rowwise_) {
auto scale_size = product(rowwise_scale_meta.shape) * rowwise_scale_meta.type_size;
cudaMemcpy(rowwise_scale_inv_cpu_data_.get(),
tensor_.get_rowwise_scale_inv().data_ptr,
scale_size,
cudaMemcpyDeviceToHost);
}
if (columnwise_) {
auto scale_size = product(colwise_scale_meta.shape) * colwise_scale_meta.type_size;
cudaMemcpy(columnwise_scale_inv_cpu_data_.get(),
tensor_.get_columnwise_scale_inv().data_ptr,
scale_size,
cudaMemcpyDeviceToHost);
}
}
}
void Tensor::from_cpu() const {
const NVTEShape s = tensor_.shape();
const size_t size = product(s) * typeToSize(tensor_.dtype());
cudaMemcpy(tensor_.dptr(), cpu_data_.get(), size, cudaMemcpyHostToDevice);
if (rowwise_) {
cudaMemcpy(tensor_.get_rowwise_data().data_ptr,
cpu_data_rowwise_.get(), size, cudaMemcpyHostToDevice);
}
if (columnwise_) {
cudaMemcpy(tensor_.get_columnwise_data().data_ptr,
cpu_data_columnwise_.get(), size, cudaMemcpyHostToDevice);
}
if (isFp8Type(dtype())) {
cudaMemcpy(tensor_.amax(), amax_cpu_data_.get(), sizeof(float),
cudaMemcpyHostToDevice);
cudaMemcpy(tensor_.scale(), scale_cpu_data_.get(), sizeof(float),
cudaMemcpyHostToDevice);
cudaMemcpy(tensor_.scale_inv(), scale_inv_cpu_data_.get(), sizeof(float),
cudaMemcpyHostToDevice);
if (is_tensor_scaling(tensor_.scaling_mode())) {
cudaMemcpy(tensor_.amax(), amax_cpu_data_.get(), sizeof(float),
cudaMemcpyHostToDevice);
cudaMemcpy(tensor_.scale(), scale_cpu_data_.get(), sizeof(float),
cudaMemcpyHostToDevice);
}
auto [rowwise_scale_meta, colwise_scale_meta] = get_scales(s, tensor_.scaling_mode());
if (rowwise_) {
auto scale_size = product(rowwise_scale_meta.shape) * rowwise_scale_meta.type_size;
cudaMemcpy(tensor_.get_rowwise_scale_inv().data_ptr,
rowwise_scale_inv_cpu_data_.get(), scale_size,
cudaMemcpyHostToDevice);
}
if (columnwise_) {
auto scale_size = product(colwise_scale_meta.shape) * colwise_scale_meta.type_size;
cudaMemcpy(tensor_.get_columnwise_scale_inv().data_ptr,
columnwise_scale_inv_cpu_data_.get(), scale_size,
cudaMemcpyHostToDevice);
}
}
}
void Tensor::set_scale(float scale) {
if (isFp8Type(dtype())) {
NVTE_CHECK(scale_cpu_data_);
*scale_cpu_data_ = scale;
from_cpu();
if (is_tensor_scaling(tensor_.scaling_mode())) {
*scale_cpu_data_ = scale;
from_cpu();
}
}
}
void Tensor::set_scale_inv(float scale_inv) {
if (isFp8Type(dtype())) {
NVTE_CHECK(scale_inv_cpu_data_);
*scale_inv_cpu_data_ = scale_inv;
if (rowwise_) {
NVTE_CHECK(rowwise_scale_inv_cpu_data_);
}
if (columnwise_) {
NVTE_CHECK(columnwise_scale_inv_cpu_data_);
}
auto [rowwise_scale_meta, colwise_scale_meta] = get_scales(tensor_.shape(), tensor_.scaling_mode());
if (rowwise_) {
auto num_scales = product(rowwise_scale_meta.shape);
if (num_scales == 1){
rowwise_cpu_scale_inv_ptr<float>()[0] = scale_inv;
} else{
std::uniform_int_distribution<uint8_t> dis(0, 127);
auto* scale_inv_ptr = rowwise_cpu_scale_inv_ptr<uint8_t>();
for (size_t i = 0; i < num_scales; i++){
scale_inv_ptr[i] = dis(gen_);
}
}
}
if (columnwise_) {
auto num_scales = product(colwise_scale_meta.shape);
if (num_scales == 1){
columnwise_cpu_scale_inv_ptr<float>()[0] = scale_inv;
} else{
std::uniform_int_distribution<uint8_t> dis(0, 127);
auto* scale_inv_ptr = columnwise_cpu_scale_inv_ptr<uint8_t>();
for (size_t i = 0; i < num_scales; i++){
scale_inv_ptr[i] = dis(gen_);
}
}
}
from_cpu();
}
}
void Tensor::shareFP8Meta(const Tensor &other) {
if(isFp8Type(dtype()) && isFp8Type(other.dtype())) {
tensor_ = TensorWrapper(dptr(), shape(), dtype(),
other.tensor_.amax(),
other.tensor_.scale(),
other.tensor_.scale_inv());
auto new_tensor = TensorWrapper(other.tensor_.scaling_mode());
auto my_rowwise_data = tensor_.get_rowwise_data();
new_tensor.set_rowwise_data(my_rowwise_data.data_ptr,
static_cast<DType>(my_rowwise_data.dtype),
my_rowwise_data.shape);
auto my_columnwise_data = tensor_.get_columnwise_data();
new_tensor.set_columnwise_data(my_columnwise_data.data_ptr,
static_cast<DType>(my_columnwise_data.dtype),
my_columnwise_data.shape);
auto other_amax = other.tensor_.get_amax();
new_tensor.set_amax(other_amax.data_ptr,
static_cast<DType>(other_amax.dtype),
other_amax.shape);
auto other_scale = other.tensor_.get_scale();
new_tensor.set_scale(other_scale.data_ptr,
static_cast<DType>(other_scale.dtype),
other_scale.shape);
auto other_row_scale_inv = other.tensor_.get_rowwise_scale_inv();
new_tensor.set_rowwise_scale_inv(other_row_scale_inv.data_ptr,
static_cast<DType>(other_row_scale_inv.dtype),
other_row_scale_inv.shape);
auto other_col_scale_inv = other.tensor_.get_columnwise_scale_inv();
new_tensor.set_columnwise_scale_inv(other_col_scale_inv.data_ptr,
static_cast<DType>(other_col_scale_inv.dtype),
other_col_scale_inv.shape);
tensor_ = std::move(new_tensor);
to_cpu();
}
}
......@@ -177,12 +464,14 @@ std::vector<size_t> unravel(const size_t i, const NVTEShape &shape) {
return ret;
}
void compareResults(const std::string &name, const Tensor &test, const void *ref,
double atol, double rtol) {
test.to_cpu();
const size_t N = product(test.shape());
void compareResults_sequential(const std::string &name, const Tensor &test,
const void *ref, const bool rowwise,
double atol, double rtol, bool if_on_gpus) {
if (if_on_gpus) test.to_cpu();
const auto& shape = rowwise ? test.rowwise_shape() : test.columnwise_shape();
const size_t N = product(shape);
TRANSFORMER_ENGINE_TYPE_SWITCH_ALL(test.dtype(), T,
const T *test_data = test.cpu_dptr<T>();
const T *test_data = rowwise ? test.rowwise_cpu_dptr<T>() : test.columnwise_cpu_dptr<T>();
const T *ref_data = reinterpret_cast<const T*>(ref);
for (size_t i = 0; i < N; ++i) {
double t = static_cast<double>(test_data[i]);
......@@ -200,14 +489,84 @@ void compareResults(const std::string &name, const Tensor &test, const void *ref
const double cast_mean_m = static_cast<double>(static_cast<T>(mean_m));
assertion = !(cast_mean_m == std::min(t,r) && cast_mean_p == std::max(t,r));
}
ASSERT_FALSE(assertion) << "Error in tensor " << name << std::endl
<< "Mismatch at place " << to_string(unravel(i, test.shape()))
std::string direction = rowwise ? "rowwise" : "columnwise";
ASSERT_FALSE(assertion) << "Error in tensor " << name << " in "
<< direction << " direction." << std::endl
<< "Mismatch at place " << to_string(unravel(i, shape))
<< " (" << std::to_string(i) << "): " << t << " vs " << r;
}
);
}
template <typename T>
static size_t getFirstMismatchIdx(const DType data_type, const T* test_data, const T* ref_data,
const size_t N, const double atol, const double rtol) {
  size_t first_mismatch_idx = N;
bool is_mismatch_found = false;
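  // Each thread records the smallest mismatching index it encounters; the OpenMP
  // min reduction then yields the first mismatch across all threads (or N if none).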
#pragma omp parallel for schedule(static) firstprivate(is_mismatch_found) \
reduction(min: first_mismatch_idx) proc_bind(spread)
for (size_t i = 0; i < N; ++i) {
if (is_mismatch_found) { // early escape of the omp thread
continue;
}
double t = static_cast<double>(test_data[i]);
double r = static_cast<double>(ref_data[i]);
bool mismatch = fabs(t - r) > atol && (r == 0 || fabs((t - r) / r) > rtol);
/* For Float32 the floating point comparison is enough to error out */
bool assertion = mismatch && (data_type == DType::kFloat32);
if (mismatch && !assertion) {
/* Check if it is just a failure of round to nearest choosing different
side of the real value */
const double mean = (t + r) / 2;
const double mean_p = mean >= 0 ? mean * (1 + 1e-6) : mean * (1 - 1e-6);
const double mean_m = mean >= 0 ? mean * (1 - 1e-6) : mean * (1 + 1e-6);
const double cast_mean_p = static_cast<double>(static_cast<T>(mean_p));
const double cast_mean_m = static_cast<double>(static_cast<T>(mean_m));
assertion = !(cast_mean_m == std::min(t,r) && cast_mean_p == std::max(t,r));
}
if (assertion && i < first_mismatch_idx) {
first_mismatch_idx = i;
is_mismatch_found = true;
}
}
return first_mismatch_idx;
}
void compareResults_parallel(const std::string &name, const Tensor &test, const void *ref,
const bool rowwise, double atol, double rtol, bool if_on_gpus) {
if (if_on_gpus) test.to_cpu();
const auto& shape = rowwise ? test.rowwise_shape() : test.columnwise_shape();
const size_t N = product(shape);
TRANSFORMER_ENGINE_TYPE_SWITCH_ALL(test.dtype(), T,
const T *test_data = rowwise ? test.rowwise_cpu_dptr<T>() : test.columnwise_cpu_dptr<T>();
const T *ref_data = reinterpret_cast<const T*>(ref);
const size_t i = getFirstMismatchIdx<T>(test.dtype(), test_data, ref_data, N, atol, rtol);
if (i != N) {
const double t = static_cast<double>(test_data[i]);
const double r = static_cast<double>(ref_data[i]);
std::string direction = rowwise ? "rowwise" : "columnwise";
ASSERT_FALSE(true) << "Error in tensor " << name << " in "
<< direction << " direction." << std::endl
<< "Mismatch at place " << to_string(unravel(i, shape))
<< " (" << std::to_string(i) << "): " << t << " vs " << r;
}
);
}
void compareResults(const std::string &name, const Tensor &test, const void *ref,
const bool rowwise, double atol, double rtol, bool if_on_gpus) {
constexpr bool sequential = false;
if constexpr (sequential) {
compareResults_sequential(name, test, ref, rowwise, atol, rtol, if_on_gpus);
} else {
compareResults_parallel(name, test, ref, rowwise, atol, rtol, if_on_gpus);
}
}
void compareResults(const std::string &name, const float test, const float ref,
double atol, double rtol) {
double t = static_cast<double>(test);
......@@ -218,6 +577,51 @@ void compareResults(const std::string &name, const float test, const float ref,
}
void compareResults(const std::string &name, const uint8_t *test, const uint8_t *ref,
size_t N, float mismatch_rate_tol) {
size_t max_mismatches = std::ceil(N * mismatch_rate_tol);
size_t n_mismatches = 0;
std::vector<size_t> mismatch_indices;
  for (size_t i = 0; i < N; i++){
    bool mismatch = test[i] != ref[i];
    if (mismatch){
      n_mismatches++;
      mismatch_indices.push_back(i);
    }
    if (n_mismatches > max_mismatches){
      std::cout << "Error in " << name << std::endl;
      for (auto &index : mismatch_indices)
        std::cout << "Mismatch at (" << index << "):" << static_cast<int>(test[index]) << " vs "
                  << static_cast<int>(ref[index]) << std::endl;
      GTEST_FAIL() << n_mismatches << " mismatch(es), which is more than the mismatch tolerance.";
}
}
}
void compare_e8m0_scaling_factors(const std::string &name, const uint8_t *test, const uint8_t *ref,
const size_t row_blocks, const size_t col_blocks, const size_t stride)
{
  for (size_t i = 0; i < row_blocks; ++i) {
    for (size_t j = 0; j < col_blocks; ++j) {
      const size_t idx = i * stride + j;
ASSERT_FALSE(test[idx] != ref[idx]) << "Error in " << name << std::endl
<< "Mismatch: " << static_cast<int>(test[idx]) << " vs "
<< static_cast<int>(ref[idx]) << " at index " << idx;
}
}
}
void compare_e8m0_scaling_factors(const std::string &name, const uint8_t *test, const uint8_t *ref,
const size_t N)
{
  for (size_t i = 0; i < N; i++) {
ASSERT_FALSE(test[i] != ref[i]) << "Error in " << name << std::endl
<< "Mismatch: " << static_cast<int>(test[i]) << " vs "
<< static_cast<int>(ref[i]) << " at index " << i;
}
}
std::pair<double, double> getTolerances(const DType type) {
switch(type) {
case DType::kFloat32:
......@@ -228,6 +632,7 @@ std::pair<double, double> getTolerances(const DType type) {
return {1e-5, 1e-2};
case DType::kFloat8E4M3:
case DType::kFloat8E5M2:
case DType::kFloat8E8M0:
return {1e-2, 1e-2};
default:
NVTE_CHECK("Invalid type!");
......@@ -235,29 +640,158 @@ std::pair<double, double> getTolerances(const DType type) {
return {0, 0};
}
template <typename T>
void generate_data_uniformly(T* data, const size_t size, std::mt19937* gen) {
#pragma omp parallel proc_bind(spread)
{
std::mt19937 gen_local = *gen;
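    // Give each thread's private copy of the generator a thread-dependent offset
    // so the per-thread streams start at different points of the sequence.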
gen_local.discard(omp_get_thread_num() * 599);
std::uniform_real_distribution<> dis(-2.0, 1.0);
#pragma omp for schedule(static)
for (size_t i = 0; i < size; ++i) {
data[i] = static_cast<T>(dis(gen_local));
}
}
gen->discard(size);
}
void fillUniform(Tensor *t) {
const size_t size = product(t->shape());
static std::mt19937 gen(12345);
if (t->rowwise()) {
const size_t size = product(t->rowwise_shape());
TRANSFORMER_ENGINE_TYPE_SWITCH_ALL(t->dtype(), T,
{
T *data = t->rowwise_cpu_dptr<T>();
generate_data_uniformly(data, size, &(t->gen()));
}
);
} else {
const size_t size = product(t->columnwise_shape());
TRANSFORMER_ENGINE_TYPE_SWITCH_ALL(t->dtype(), T,
{
T *data = t->columnwise_cpu_dptr<T>();
generate_data_uniformly(data, size, &(t->gen()));
}
);
}
std::uniform_real_distribution<> dis(-2.0, 1.0);
TRANSFORMER_ENGINE_TYPE_SWITCH_ALL(t->dtype(), T, {
T *data = t->cpu_dptr<T>();
t->set_scale_inv(dis(t->gen()));
t->from_cpu();
}
template<typename InputEncoding, InputsFillCase Case>
void fillCase_special(Tensor *t) {
const size_t size = product(t->rowwise_shape());
const size_t rows = t->rowwise_shape().data[0];
const size_t cols = t->rowwise_shape().data[1];
if constexpr (Case == InputsFillCase::zeros) {
TRANSFORMER_ENGINE_TYPE_SWITCH_FP16_FP32_ONLY(t->dtype(), InputType, {
InputType *data = t->rowwise_cpu_dptr<InputType>();
for (size_t i = 0; i < size; ++i) {
data[i] = T(dis(gen));
data[i] = static_cast<InputType>(0);
}
});
t->set_scale_inv(dis(gen));
});
} else {
double minAbs = -2.0;
double maxAbs = 1.0;
if constexpr (Case != InputsFillCase::uniform) {
minAbs = Quantized_Limits<InputEncoding>::ranges[Case];
maxAbs = Quantized_Limits<InputEncoding>::ranges[Case + 1];
}
std::uniform_real_distribution<> dis(minAbs, maxAbs);
std::uniform_real_distribution<> dis_sign(-1.0, 1.0);
TRANSFORMER_ENGINE_TYPE_SWITCH_FP16_FP32_ONLY(t->dtype(), InputType, {
InputType *data = t->rowwise_cpu_dptr<InputType>();
for (size_t i = 0; i < rows; ++i) {
for (size_t j = 0; j < cols; ++j) {
const size_t idx = i * cols + j;
const bool is_negative = (dis_sign(t->gen()) < 0.0);
double val = dis(t->gen());
if (is_negative) {
val = -val;
}
data[idx] = static_cast<InputType>(val);
}
}
});
}
t->set_scale_inv(1.0);
t->from_cpu();
}
template <typename InputEncoding>
void fillCase(Tensor *t, const InputsFillCase fill_case) {
switch (fill_case) {
case InputsFillCase::uniform:
fillCase_special<InputEncoding, InputsFillCase::uniform>(t); break;
case InputsFillCase::zeros:
fillCase_special<InputEncoding, InputsFillCase::zeros>(t); break;
case InputsFillCase::zero_to_minNorm:
fillCase_special<InputEncoding, InputsFillCase::zero_to_minNorm>(t); break;
case InputsFillCase::minNorm_to_maxNorm:
fillCase_special<InputEncoding, InputsFillCase::minNorm_to_maxNorm>(t); break;
case InputsFillCase::maxNorm_to_inf:
fillCase_special<InputEncoding, InputsFillCase::maxNorm_to_inf>(t); break;
}
}
template void fillCase<fp8e4m3>(Tensor *t, const InputsFillCase fill_case);
template void fillCase<fp8e5m2>(Tensor *t, const InputsFillCase fill_case);
template void fillCase<fp32>(Tensor *t, const InputsFillCase fill_case);
void setRandomScale(Tensor *t) {
static std::mt19937 gen(12345);
std::uniform_real_distribution<> dis(-2.0, 1.0);
const float scale = dis(gen);
const float scale = dis(t->gen());
t->set_scale(scale);
}
void setRandomScaleInv(Tensor *t) {
std::uniform_real_distribution<> dis(-2.0, 1.0);
const float scale_inv = dis(t->gen());
t->set_scale_inv(scale_inv);
}
bool isFp8Type(DType type) {
return type == DType::kFloat8E4M3 || type == DType::kFloat8E5M2;
return type == DType::kFloat8E4M3 || type == DType::kFloat8E5M2 || type == DType::kFloat8E8M0;
}
int32_t getDeviceComputeCapability()
{
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0);
return 10 * deviceProp.major + deviceProp.minor;
}
size_t first_dimension(const std::vector<size_t> &shape) {
if (shape.size() == 0) return 1;
if (shape.size() == 1) return 1;
return product(shape, 0, shape.size() - 1);
}
size_t last_dimension(const std::vector<size_t> &shape) {
if (shape.size() == 0) return 1;
return shape[shape.size() - 1];
}
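For intuition, these two helpers flatten an N-dimensional shape to the 2-D view used by the scaling logic; with an arbitrary shape:
  // first_dimension({8, 16, 32}) == 8 * 16 == 128 and last_dimension({8, 16, 32}) == 32,
  // so the tensor is treated as a 128 x 32 matrix when computing its scale tensors.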
std::array<size_t, 4> get_scale_tensor_dims(const size_t rows,
const size_t cols,
const size_t block_size_rows,
const size_t block_size_cols) {
const bool is_rowwise = (block_size_rows == 1) && (block_size_cols == 32);
const size_t alignment_Y = is_rowwise
? scale_tensor_alignment_Y_rowwise
: scale_tensor_alignment_Y_colwise;
const size_t alignment_X = is_rowwise
? scale_tensor_alignment_X_rowwise
: scale_tensor_alignment_X_colwise;
const size_t unpadded_blocks_Y = divide_round_up(rows, block_size_rows);
const size_t unpadded_blocks_X = divide_round_up(cols, block_size_cols);
const size_t blocks_Y = round_up_to_nearest_multiple(unpadded_blocks_Y, alignment_Y);
const size_t blocks_X = round_up_to_nearest_multiple(unpadded_blocks_X, alignment_X);
return {unpadded_blocks_Y, unpadded_blocks_X, blocks_Y, blocks_X};
}
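A small usage sketch (sizes chosen only for illustration):
  // Rowwise (1x32) scaling blocks for a 300 x 70 tensor:
  const auto [unpadded_y, unpadded_x, padded_y, padded_x] = get_scale_tensor_dims(300, 70, 1, 32);
  // unpadded_y == 300, unpadded_x == 3; padded to the [128, 4] alignment: padded_y == 384, padded_x == 4.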
} // namespace test
......@@ -6,9 +6,10 @@
#pragma once
#include <iostream>
#include <memory>
#include <vector>
#include <array>
#include <random>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
......@@ -52,6 +53,7 @@ using fp16 = half;
using bf16 = nv_bfloat16;
using fp8e4m3 = __nv_fp8_e4m3;
using fp8e5m2 = __nv_fp8_e5m2;
using fp8e8m0 = uint8_t;
template <typename T>
struct TypeInfo{
......@@ -62,7 +64,8 @@ struct TypeInfo{
fp16,
bf16,
fp8e4m3,
fp8e5m2>;
fp8e5m2,
fp8e8m0>;
template <typename U, DType current>
struct Helper {
......@@ -94,10 +97,19 @@ struct TypeInfo{
class Tensor {
public:
Tensor(const NVTEShape &shape, const DType type);
Tensor(const std::vector<size_t> &shape, const DType type) :
Tensor(NVTEShape{shape.data(), shape.size()}, type) {}
Tensor(const std::string& name,
const NVTEShape &shape, const DType type,
const bool rowwise = true,
const bool columnwise = false,
const NVTEScalingMode &mode = NVTE_DELAYED_TENSOR_SCALING);
Tensor(const std::string& name,
const std::vector<size_t> &shape,
const DType type,
const bool rowwise = true,
const bool columnwise = false,
const NVTEScalingMode &mode = NVTE_DELAYED_TENSOR_SCALING) :
Tensor(name, NVTEShape{shape.data(), shape.size()}, type, rowwise, columnwise, mode) {}
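  // Usage sketch (shape, dtype, and scaling mode here are arbitrary illustrations):
  // a tensor holding both row- and column-wise data under MXFP8 1D scaling, whose
  // name deterministically seeds its private RNG.
  //
  //   Tensor t("input", std::vector<size_t>{128, 64}, DType::kFloat8E4M3,
  //            /*rowwise=*/true, /*columnwise=*/true, NVTE_MXFP8_1D_SCALING);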
Tensor() {}
......@@ -108,30 +120,82 @@ class Tensor {
Tensor& operator=(Tensor &&other) = default;
~Tensor() {
if (tensor_.dptr() != nullptr) {
cudaFree(tensor_.dptr());
void *data_ptr = tensor_.dptr();
void *scale_inv = tensor_.scale_inv();
void *columnwise_data_ptr = tensor_.get_columnwise_data().data_ptr;
void *columnwise_scale_inv = tensor_.get_columnwise_scale_inv().data_ptr;
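      // If the column-wise views alias the row-wise buffers, null the duplicates
      // so each device allocation is freed exactly once below.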
if (columnwise_data_ptr == data_ptr) {
columnwise_data_ptr = nullptr;
}
if (columnwise_scale_inv == scale_inv) {
columnwise_scale_inv = nullptr;
}
if (data_ptr != nullptr) {
cudaFree(data_ptr);
}
if (scale_inv != nullptr) {
cudaFree(scale_inv);
}
if (columnwise_data_ptr != nullptr){
cudaFree(columnwise_data_ptr);
}
if (columnwise_scale_inv != nullptr){
cudaFree(columnwise_scale_inv);
}
}
NVTETensor data() const noexcept {
return tensor_.data();
}
const NVTEShape shape() const noexcept {
return tensor_.shape();
NVTEShape rowwise_shape() const noexcept {
return tensor_.get_rowwise_data().shape;
}
NVTEShape columnwise_shape() const noexcept {
return tensor_.get_columnwise_data().shape;
}
NVTEShape rowwise_scale_inv_shape() const {
NVTE_CHECK(rowwise_, "Tensor does not have rowwise data!");
return tensor_.get_rowwise_scale_inv().shape;
}
NVTEShape columnwise_scale_inv_shape() const {
NVTE_CHECK(columnwise_, "Tensor does not have columnwise data!");
return tensor_.get_columnwise_scale_inv().shape;
}
NVTEScalingMode scaling_mode() const noexcept {
return tensor_.scaling_mode();
}
DType dtype() const noexcept {
return tensor_.dtype();
}
void *dptr() const noexcept {
return tensor_.dptr();
void *rowwise_dptr() const {
NVTE_CHECK(rowwise_, "Tensor does not have rowwise data!");
return tensor_.get_rowwise_data().data_ptr;
}
void *columnwise_dptr() const {
NVTE_CHECK(columnwise_, "Tensor does not have columnwise data!");
return tensor_.get_columnwise_data().data_ptr;
}
template <typename T>
T *rowwise_cpu_dptr() const {
NVTE_CHECK(TypeInfo<T>::dtype == tensor_.dtype(), "Invalid type!");
NVTE_CHECK(rowwise_, "Tensor does not have rowwise data!");
return reinterpret_cast<T *>(cpu_data_rowwise_.get());
}
template <typename T>
T *cpu_dptr() const {
T *columnwise_cpu_dptr() const {
NVTE_CHECK(TypeInfo<T>::dtype == tensor_.dtype(), "Invalid type!");
return reinterpret_cast<T *>(cpu_data_.get());
NVTE_CHECK(columnwise_, "Tensor does not have columnwise data!");
return reinterpret_cast<T *>(cpu_data_columnwise_.get());
}
float amax() const {
......@@ -145,6 +209,7 @@ class Tensor {
float scale() const {
if(scale_cpu_data_) {
NVTE_CHECK(tensor_.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING, "Invalid scaling_mode!");
to_cpu();
return *scale_cpu_data_;
} else {
......@@ -152,52 +217,246 @@ class Tensor {
}
}
float scale_inv() const {
if(scale_inv_cpu_data_) {
to_cpu();
return *scale_inv_cpu_data_;
template <typename T>
T *rowwise_cpu_scale_inv_ptr(){
if (tensor_.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING){
NVTE_CHECK(TypeInfo<T>::dtype == DType::kFloat32, "Invalid type!");
} else {
NVTE_CHECK(TypeInfo<T>::dtype == DType::kByte, "Invalid type!");
}
to_cpu();
return reinterpret_cast<T*>(rowwise_scale_inv_cpu_data_.get());
}
template <typename T>
T *columnwise_cpu_scale_inv_ptr(){
if (tensor_.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING){
NVTE_CHECK(TypeInfo<T>::dtype == DType::kFloat32, "Invalid type!");
} else {
NVTE_CHECK(TypeInfo<T>::dtype == DType::kByte, "Invalid type!");
}
to_cpu();
return reinterpret_cast<T*>(columnwise_scale_inv_cpu_data_.get());
}
float rowwise_scale_inv(){
if(rowwise_scale_inv_cpu_data_) {
float scale_inv = rowwise_cpu_scale_inv_ptr<float>()[0];
return scale_inv;
} else {
return 1;
}
}
bool rowwise() const {
return rowwise_;
}
bool columnwise() const {
return columnwise_;
}
void to_cpu() const;
void from_cpu() const;
void set_scale(float scale);
void set_scale_inv(float scale_inv);
void shareFP8Meta(const Tensor &other);
std::mt19937& gen() { return gen_; }
private:
TensorWrapper tensor_;
std::unique_ptr<unsigned char[]> cpu_data_;
std::unique_ptr<unsigned char[]> cpu_data_rowwise_;
std::unique_ptr<unsigned char[]> cpu_data_columnwise_;
std::shared_ptr<float> amax_cpu_data_;
std::shared_ptr<float> scale_cpu_data_;
std::shared_ptr<float> scale_inv_cpu_data_;
std::unique_ptr<unsigned char[]> rowwise_scale_inv_cpu_data_;
std::unique_ptr<unsigned char[]> columnwise_scale_inv_cpu_data_;
bool rowwise_;
bool columnwise_;
std::string name_;
std::mt19937 gen_;
};
constexpr uint32_t FP32_EXPONENT_BIAS = 127;
constexpr uint32_t FP32_MANTISSA_BITS = 23;
// [128,4] rowwise and [4,128] colwise alignment requirement
constexpr size_t scale_tensor_alignment_X_rowwise = 4;
constexpr size_t scale_tensor_alignment_Y_rowwise = 128;
constexpr size_t scale_tensor_alignment_X_colwise = 128;
constexpr size_t scale_tensor_alignment_Y_colwise = 4;
inline size_t divide_round_up(const size_t N, const size_t M) {
return (N - 1 + M) / M;
}
inline size_t round_up_to_nearest_multiple(const size_t N, const size_t M) {
return divide_round_up(N, M) * M;
}
template <typename T>
struct Numeric_Traits {
static constexpr double minSubnorm = 1.0;
static constexpr double maxSubnorm = 1.0;
static constexpr double minNorm = 1.0;
static constexpr double maxNorm = 1.0;
static constexpr double artifInf = 1.0;
static constexpr int maxBiasedExponent = 1;
};
template <>
struct Numeric_Traits<fp8e4m3> {
static constexpr double minSubnorm = 1.0 / static_cast<double>(1 << 9); // std::pow(2.0, -9.0);
static constexpr double maxSubnorm = 0.875 / static_cast<double>(1 << 6); // std::pow(2.0, -6.0);
static constexpr double minNorm = 1.0 / static_cast<double>(1 << 6); // std::pow(2.0, -6.0);
static constexpr double maxNorm = 448.0;
static constexpr double artifInf = 10.0 * maxNorm; // artificial Infinity
static constexpr int maxBiasedExponentAsFP32 = 8 + FP32_EXPONENT_BIAS;
static constexpr int maxUnbiasedExponentAsFP32 = 8;
static constexpr int maxExpNorm = 1 << maxUnbiasedExponentAsFP32;
};
template <>
struct Numeric_Traits<fp8e5m2> {
static constexpr double minSubnorm = 1.0 / static_cast<double>(1 << 16); // std::pow(2.0, -16.0);
static constexpr double maxSubnorm = 0.75 / static_cast<double>(1 << 14); // std::pow(2.0, -14.0);
static constexpr double minNorm = 1.0 / static_cast<double>(1 << 14); // std::pow(2.0, -14.0);
static constexpr double maxNorm = 57344.0;
static constexpr double artifInf = 10.0 * maxNorm; // artificial Infinity
static constexpr int maxBiasedExponentAsFP32 = 15 + FP32_EXPONENT_BIAS;
static constexpr int maxUnbiasedExponentAsFP32 = 15;
static constexpr int maxExpNorm = 1 << maxUnbiasedExponentAsFP32;
};
template <>
struct Numeric_Traits<fp32> {
static constexpr double minSubnorm = std::numeric_limits<fp32>::denorm_min(); // std::pow(2.0, -149.0);
static constexpr double maxSubnorm = std::numeric_limits<fp32>::min()
- std::numeric_limits<fp32>::denorm_min(); // minNormalized - minDenormalized
static constexpr double minNorm = std::numeric_limits<fp32>::min(); // std::pow(2.0, -126.0);
static constexpr double maxNorm = std::numeric_limits<fp32>::max(); // (1 - pow(2, -24)) * pow(2, 128)
static constexpr double artifInf = std::numeric_limits<fp32>::infinity();
static constexpr int maxBiasedExponentAsFP32 = 255;
static constexpr int maxUnbiasedExponentAsFP32 = 128;
};
template <typename T>
struct Quantized_Limits {
static constexpr double ranges[] = {
0.0,
Numeric_Traits<T>::minNorm,
Numeric_Traits<T>::maxNorm,
Numeric_Traits<T>::artifInf
};
static constexpr inline fp32 max() { return static_cast<fp32>(Numeric_Traits<T>::maxNorm); }
static constexpr inline fp32 max_reciprocal() { return static_cast<fp32>(1.0 / max()); }
static constexpr inline fp32 emax() { return static_cast<fp32>(Numeric_Traits<T>::maxExpNorm); }
static constexpr inline fp32 emax_reciprocal() { return static_cast<fp32>(1.0 / emax()); }
static constexpr inline int max_norm_biased_exponent() { return Numeric_Traits<T>::maxBiasedExponentAsFP32; }
static constexpr inline int max_norm_unbiased_exponent() { return Numeric_Traits<T>::maxUnbiasedExponentAsFP32; }
};
// Input data filling cases
// Considering normal and subnormal magnitudes of E4M3 and E5M2 formats
// with nearest to even rounding per OFP8 specification
enum InputsFillCase {
zero_to_minNorm = 0, // [0, min_normal)
minNorm_to_maxNorm = 1, // [min_normal, max_normal)
maxNorm_to_inf = 2, // [max_normal, inf)
zeros = 3, // {0}
uniform = 4, // std::uniform_real_distribution<> dis(-2.0, 1.0)
};
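For example, the fill cases map directly onto Quantized_Limits<T>::ranges; a sketch for an E4M3 target:
  // fillCase<fp8e4m3>(&t, InputsFillCase::minNorm_to_maxNorm) draws magnitudes from
  //   [Quantized_Limits<fp8e4m3>::ranges[1], ranges[2]) == [2^-6, 448.0),
  // applies a random sign per element, and leaves scale_inv at 1.0.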
inline fp8e8m0 float_to_e8m0(float val) {
  // TODO: NaN/Inf should be propagated for any NaN/Inf value in the input,
  // not just for the amax.
if (std::isnan(val)) {
return 0xFF;
}
if (std::isinf(val)) {
return 0xFE;
}
if (val == 0.0f) {
return 0x00;
}
uint32_t val_u32 = *reinterpret_cast<uint32_t*>(&val);
fp8e8m0 exponent = (val_u32 >> FP32_MANTISSA_BITS);
uint32_t mantissa = val_u32 & 0x7FFFFF;
// Round up exponent and deal with satfinite.
if ((mantissa > 0 && exponent != 0xFE) && !(exponent == 0 && mantissa <= 0x400000)) {
++exponent;
}
return exponent;
}
inline float exp2f_rcp(fp8e8m0 biased_exp) {
return (biased_exp == 0) ? 1 : exp2f(FP32_EXPONENT_BIAS - static_cast<float>(biased_exp));
}
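A few hand-computed values may help here (standard FP32 bit layout assumed):
  // float_to_e8m0(1.0f) == 127  (exact power of two)             exp2f_rcp(127) == 1.0f
  // float_to_e8m0(2.0f) == 128                                    exp2f_rcp(128) == 0.5f
  // float_to_e8m0(3.0f) == 129  (mantissa != 0 rounds up to 4.0)  exp2f_rcp(129) == 0.25f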
inline float identity(const float x) { return x; }
inline float gelu(const float x) { return x * (0.5f + 0.5f * tanhf(x * (0.79788456f + 0.03567741f * x * x))); }
inline float dgelu(const float x) {
const float tanh_out = tanhf(0.79788456f * x * (1 + 0.044715f * x * x));
return 0.5f * x * ((1 - tanh_out * tanh_out) * (0.79788456f + 0.1070322243f * x * x))
+ 0.5f * (1 + tanh_out);
}
inline float sigmoid(const float x) { return 1 / (1 + expf(-x)); }
inline float dsigmoid(const float x) { return sigmoid(x) * (1 - sigmoid(x)); }
inline float qgelu(const float x) { return x * sigmoid(1.702f * x); }
inline float dqgelu(const float x) { return 1.702f * x * dsigmoid(1.702f * x) + sigmoid(1.702f * x); }
inline float relu(const float x) { return fmaxf(0, x); }
inline float drelu(const float x) { return x > 0 ? 1 : 0; }
inline float silu(const float x) { return x * sigmoid(x); }
inline float dsilu(const float x) { return x * dsigmoid(x) + sigmoid(x); }
inline float srelu(const float x) { return x > 0 ? x * x : 0; }
inline float dsrelu(const float x) { return fmaxf(0, 2 * x); }
size_t typeToSize(DType type);
size_t product(const NVTEShape &shape);
size_t product(const std::vector<size_t> &shape);
size_t first_dimension(const std::vector<size_t> &shape);
size_t last_dimension(const std::vector<size_t> &shape);
bool areShapesEqual(const NVTEShape &s1, const NVTEShape &s2);
void compareResults(const std::string &name, const Tensor &test, const void *ref,
double atol = 1e-5, double rtol = 1e-8);
bool rowwise, double atol = 1e-5, double rtol = 1e-8, bool if_on_gpus = true);
void compareResults(const std::string &name, const float test, const float ref,
double atol = 1e-5, double rtol = 1e-8);
void compareResults(const std::string &name, const uint8_t *test, const uint8_t *ref,
size_t N, float mismatch_rate_tol = 0.);
void compare_e8m0_scaling_factors(const std::string &name, const uint8_t *test, const uint8_t *ref,
const size_t row_blocks, const size_t col_blocks, const size_t stride);
void compare_e8m0_scaling_factors(const std::string &name, const uint8_t *test, const uint8_t *ref,
const size_t N);
std::array<size_t, 4> get_scale_tensor_dims(const size_t rows, const size_t cols,
const size_t block_size_rows, const size_t block_size_cols);
std::pair<double, double> getTolerances(const DType type);
void fillUniform(Tensor *t);
template <typename InputEncoding>
void fillCase(Tensor *t, const InputsFillCase fill_case);
void setRandomScale(Tensor *t);
void setRandomScaleInv(Tensor *t);
constexpr int THREADS_PER_WARP = 32;
const std::string &typeName(DType type);
const std::string& caseName(InputsFillCase type);
extern std::vector<DType> all_fp_types;
bool isFp8Type(DType type);
int32_t getDeviceComputeCapability();
constexpr int32_t blackwellComputeCapability = 100;
} // namespace test
#define TRANSFORMER_ENGINE_TYPE_SWITCH_ALL(dtype, type, ...) \
......@@ -254,3 +513,47 @@ bool isFp8Type(DType type);
default: \
NVTE_ERROR("Invalid type."); \
}
#define TRANSFORMER_ENGINE_TYPE_SWITCH_FP8_ONLY(dtype, type, ...) \
switch (dtype) { \
using namespace transformer_engine; \
case DType::kFloat8E4M3: \
{ \
using type = fp8e4m3; \
{__VA_ARGS__} \
} \
break; \
case DType::kFloat8E5M2: \
{ \
using type = fp8e5m2; \
{__VA_ARGS__} \
} \
break; \
default: \
NVTE_ERROR("Invalid type."); \
}
#define TRANSFORMER_ENGINE_TYPE_SWITCH_FP16_FP32_ONLY(dtype, type, ...) \
switch (dtype) { \
using namespace transformer_engine; \
case DType::kFloat32: \
{ \
using type = float; \
{__VA_ARGS__} \
} \
break; \
case DType::kFloat16: \
{ \
using type = fp16; \
{__VA_ARGS__} \
} \
break; \
case DType::kBFloat16: \
{ \
using type = bf16; \
{__VA_ARGS__} \
} \
break; \
default: \
NVTE_ERROR("Invalid type."); \
}
......@@ -8,8 +8,9 @@ add_executable(test_util
../test_common.cu)
target_link_libraries(test_util PUBLIC CUDA::cudart GTest::gtest_main ${TE_LIB} CUDA::nvrtc CUDNN::cudnn)
target_compile_options(test_util PRIVATE -O2)
find_package(OpenMP REQUIRED)
target_link_libraries(test_util PUBLIC CUDA::cudart GTest::gtest_main ${TE_LIB} CUDA::nvrtc CUDNN::cudnn OpenMP::OpenMP_CXX)
target_compile_options(test_util PRIVATE -O2 -fopenmp)
include(GoogleTest)
gtest_discover_tests(test_util)
gtest_discover_tests(test_util DISCOVERY_TIMEOUT 600)
......@@ -27,9 +27,6 @@ def enable_fused_attn_after_hopper():
"""
if get_device_compute_capability(0) >= 90:
os.environ["NVTE_FUSED_ATTN"] = "1"
os.environ["NVTE_ALLOW_NONDETERMINISTIC_ALGO"] = "0"
yield
if "NVTE_FUSED_ATTN" in os.environ:
del os.environ["NVTE_FUSED_ATTN"]
if "NVTE_ALLOW_NONDETERMINISTIC_ALGO" in os.environ:
del os.environ["NVTE_ALLOW_NONDETERMINISTIC_ALGO"]
......@@ -4,14 +4,19 @@
"""Test transformer_engine.jax.flax.TransformerLayer"""
import os
from functools import partial
from typing import Dict, Tuple
from typing import Dict, Tuple, Optional
import flax
import jax
import jax.numpy as jnp
import pytest
from utils import assert_allclose, assert_tree_like_allclose, sync_params_values
from utils import (
assert_allclose,
assert_tree_like_allclose,
dtype_tols,
sync_params_values,
)
from utils import DecoderLayer as RefDecoderLayer
from utils import EncoderLayer as RefEncoderLayer
......@@ -250,7 +255,13 @@ class BaseRunner:
target = sync_params_values(target, ref, self.transformations)
return ref, target
def test_forward(self, data_shape, dtype, rtol=1e-05, atol=1e-08):
def test_forward(
self,
data_shape: Tuple[int],
dtype: jnp.dtype,
rtol: Optional[float] = None,
atol: Optional[float] = None,
) -> None:
"""Test only the forward"""
inputs, (ref_masks, test_masks) = self.generate_inputs(data_shape, dtype)
......@@ -264,9 +275,16 @@ class BaseRunner:
ref_out = self._loss_fn(inputs, ref_masks, ref_params, ref_others, ref_layer)
test_out = self._loss_fn(inputs, test_masks, test_params, test_others, test_layer)
assert_allclose(ref_out, test_out, rtol=rtol, atol=atol)
tols = dtype_tols(dtype, rtol=rtol, atol=atol)
assert_allclose(ref_out, test_out, **tols)
def test_backward(self, data_shape, dtype, rtol=1e-05, atol=1e-08):
def test_backward(
self,
data_shape: Tuple[int],
dtype: jnp.dtype,
rtol: Optional[float] = None,
atol: Optional[float] = None,
) -> None:
"""Test forward and backward through value_and_grad()"""
inputs, (ref_masks, test_masks) = self.generate_inputs(data_shape, dtype)
......@@ -302,11 +320,12 @@ class BaseRunner:
inputs, test_masks, test_params, test_others, test_layer
)
assert_allclose(ref_out, test_out, rtol=rtol, atol=atol)
assert_tree_like_allclose(ref_dgrads, test_dgrads, rtol=rtol, atol=atol)
tols = dtype_tols(dtype, rtol=rtol, atol=atol)
assert_allclose(ref_out, test_out, **tols)
assert_tree_like_allclose(ref_dgrads, test_dgrads, **tols)
_, restructed_ref_wgrads = self._sync_params(ref_wgrads, test_wgrads)
assert_tree_like_allclose(restructed_ref_wgrads, test_wgrads, rtol=rtol, atol=atol)
assert_tree_like_allclose(restructed_ref_wgrads, test_wgrads, **tols)
class EncoderRunner(BaseRunner):
......@@ -418,12 +437,12 @@ class BaseTester:
def test_forward(self, data_shape, dtype, attrs):
"""Test normal datatype forward"""
FP8Helper.finalize() # Ensure FP8 disabled.
self.runner(attrs).test_forward(data_shape, dtype, rtol=1e-5, atol=7e-5)
self.runner(attrs).test_forward(data_shape, dtype)
def test_backward(self, data_shape, dtype, attrs):
"""Test normal datatype backward"""
FP8Helper.finalize() # Ensure FP8 disabled.
self.runner(attrs).test_backward(data_shape, dtype, rtol=1e-5, atol=7e-5)
self.runner(attrs).test_backward(data_shape, dtype)
@pytest.mark.skipif(not is_fp8_supported, reason=reason)
@pytest.mark.parametrize("fp8_format", FP8_FORMATS)
......
......@@ -1387,18 +1387,26 @@ def assert_tree_like_allclose(expected, actual, rtol=1e-05, atol=1e-08):
def dtype_tols(
dtype: Union[DType, TEDType, np.dtype],
reference_value: float = 1.0,
rtol: Optional[float] = None,
atol: Optional[float] = None,
) -> Dict[str, float]:
"""Expected numerical tolerance for a data type.
Args:
dtype: data type.
reference_value: reference value (default: 1).
        rtol: override for the relative tolerance estimate.
        atol: override for the absolute tolerance estimate.
Returns:
Dictionary with "rtol" and "atol" as keys
"""
# Return immediately if tolerances are fully specified
if rtol is not None and atol is not None:
return {"rtol": rtol, "atol": atol}
# Convert to JAX dtype if needed
if isinstance(dtype, TEDType):
dtype = {
......@@ -1416,7 +1424,11 @@ def dtype_tols(
# Expect bit-wise accuracy for integer dtypes
if not jnp.issubdtype(dtype, jnp.floating):
return dict(rtol=0, atol=0)
if rtol is None:
rtol = 0.0
if atol is None:
atol = 0.0
return {"rtol": rtol, "atol": atol}
# Estimate floating-point error
finfo = jnp.finfo(dtype)
......@@ -1429,10 +1441,11 @@ def dtype_tols(
spacing_high = jnp.nextafter(reference_value, finfo.max) - reference_value
spacing_low = reference_value - jnp.nextafter(reference_value, finfo.min)
ulp = max(spacing_high.item(), spacing_low.item())
return dict(
rtol=eps_relaxed,
atol=max(ulp, eps_relaxed),
)
if rtol is None:
rtol = eps_relaxed
if atol is None:
atol = max(ulp, eps_relaxed)
return {"rtol": rtol, "atol": atol}
def sync_params_values(dst, src, transformations, sep="/"):
......
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
"""Helper functions to launch distributed tests"""
import copy
import os
from pathlib import Path
import subprocess
import time
import unittest
try:
from paddle.base import core
except ImportError:
from paddle.fluid import core
from paddle.distributed.utils.launch_utils import (
TrainerProc,
find_free_ports,
get_cluster,
watch_local_trainers,
)
__all__ = ["TestDistributed"]
def get_cluster_from_args(selected_gpus):
"""Get node information from selected GPUs"""
cluster_node_ips = "127.0.0.1"
node_ip = "127.0.0.1"
node_ips = [x.strip() for x in cluster_node_ips.split(",")]
node_ips.index(node_ip)
free_ports = None
free_ports = find_free_ports(len(selected_gpus))
if free_ports is not None:
free_ports = list(free_ports)
trainer_endpoints = []
for ip in node_ips:
trainer_endpoints.append([f"{ip}:{port}" for port in free_ports])
return get_cluster(node_ips, node_ip, trainer_endpoints, selected_gpus)
def get_gpus(selected_gpus):
"""Get selected GPU string"""
selected_gpus = [x.strip() for x in selected_gpus.split(",")]
return selected_gpus
def start_local_trainers(
cluster,
pod,
training_script,
training_script_args,
allocator_strategy="auto_growth",
):
"""Launch trainers"""
current_env = copy.copy(os.environ.copy())
    # Paddle broadcasts ncclUniqueId over sockets, and a proxy may make the
    # trainers unreachable, so drop the proxy variables from the environment.
    # Setting them to "" would make gRPC log a "bad uri" error, so they are
    # deleted instead.
current_env.pop("http_proxy", None)
current_env.pop("https_proxy", None)
procs = []
for t in pod.trainers:
proc_env = {
"FLAGS_selected_gpus": ",".join([str(g) for g in t.gpus]),
"PADDLE_TRAINER_ID": f"{t.rank}",
"PADDLE_CURRENT_ENDPOINT": f"{t.endpoint}",
"PADDLE_TRAINERS_NUM": f"{cluster.trainers_nranks()}",
"PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
"PYTHONPATH": str(Path(__file__).resolve().parent),
}
proc_env["FLAGS_allocator_strategy"] = allocator_strategy
if allocator_strategy == "auto_growth":
proc_env["FLAGS_fraction_of_gpu_memory_to_use"] = "0.1"
current_env.update(proc_env)
print(f"trainer proc env:{current_env}")
if os.getenv("WITH_COVERAGE", "OFF") == "ON":
cmd = "python -m coverage run --branch -p " + training_script
else:
cmd = "python -u " + training_script
print(f"start trainer proc:{cmd} env:{proc_env}")
fn = None
proc = subprocess.Popen(
cmd.split(" ") + training_script_args, env=current_env
) # pylint: disable=consider-using-with
tp = TrainerProc()
tp.proc = proc
tp.rank = t.rank
tp.log_fn = fn
tp.cmd = cmd
procs.append(tp)
return procs
class TestDistributed(unittest.TestCase):
"""Base class for distributed test"""
@staticmethod
def run_2gpu(
target_file_name,
allocator_strategy="auto_growth",
):
"""Run target file in subprocesses"""
if not core.is_compiled_with_cuda() or core.get_cuda_device_count() == 0:
return
selected_gpus = get_gpus("0,1")
cluster = None
pod = None
cluster, pod = get_cluster_from_args(selected_gpus)
procs = start_local_trainers(
cluster,
pod,
allocator_strategy=allocator_strategy,
training_script=target_file_name,
training_script_args=[],
)
while True:
alive = watch_local_trainers(procs, cluster.trainers_endpoints())
if not alive:
print(f"Local procs complete, POD info:{pod}")
break
time.sleep(3)
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
"""Unittest for Linear layer in tensor parallel"""
import unittest
import paddle
from paddle.distributed import fleet
from utils import assert_allclose, set_random_seed
import transformer_engine.paddle as te
def assert_allclose_across_ranks(tensor, group=None):
"""Assert tensor is identical in all ranks"""
gathered_list = []
paddle.distributed.all_gather(gathered_list, tensor, group=group)
assert len(gathered_list) > 1
for gathered_tensor in gathered_list:
assert_allclose(tensor, gathered_tensor)
class TestAmaxReduction(unittest.TestCase):
"""Tests Amax reduction"""
def setUp(self):
self.data_parallel_size = 2
self.init_dist_env()
self.global_dtype = "bfloat16"
paddle.set_default_dtype(self.global_dtype)
def init_dist_env(self):
"""Init Paddle Fleet environment"""
strategy = fleet.DistributedStrategy()
strategy.hybrid_configs = {
"dp_degree": self.data_parallel_size,
"mp_degree": 1,
"pp_degree": 1,
}
fleet.init(is_collective=True, strategy=strategy)
def test_amax_reduction(self):
"""Tests column parallel linear"""
set_random_seed(1024)
layer1 = te.Linear(16, 16)
layer2 = te.Linear(16, 16)
model = paddle.nn.Sequential(layer1, layer2)
model = fleet.distributed_model(model)
rank_id = paddle.distributed.get_rank()
set_random_seed(rank_id)
optimizer = paddle.optimizer.SGD(learning_rate=10.0, parameters=model.parameters())
optimizer = fleet.distributed_optimizer(optimizer)
def train_one_step(layer, inp, optimizer):
inp = paddle.to_tensor(inp)
inp.stop_gradient = False
out = layer(inp)
loss = out.mean()
loss.backward()
optimizer.step()
optimizer.clear_grad()
return loss
for _ in range(5):
inp = paddle.uniform([16, 16], self.global_dtype)
with te.fp8_autocast(enabled=True):
train_one_step(model, inp, optimizer)
assert_allclose_across_ranks(layer1.fp8_meta["scaling_fwd"].amax_history[-1])
assert_allclose_across_ranks(layer1.fp8_meta["scaling_fwd"].scale)
assert_allclose_across_ranks(layer1.fp8_meta["scaling_fwd"].scale_inv)
assert_allclose_across_ranks(layer2.fp8_meta["scaling_fwd"].amax_history[-1])
assert_allclose_across_ranks(layer2.fp8_meta["scaling_fwd"].scale)
assert_allclose_across_ranks(layer2.fp8_meta["scaling_fwd"].scale_inv)
assert_allclose_across_ranks(layer1.fp8_meta["scaling_bwd"].amax_history[-1])
assert_allclose_across_ranks(layer1.fp8_meta["scaling_bwd"].scale)
assert_allclose_across_ranks(layer1.fp8_meta["scaling_bwd"].scale_inv)
assert_allclose_across_ranks(layer2.fp8_meta["scaling_bwd"].amax_history[-1])
assert_allclose_across_ranks(layer2.fp8_meta["scaling_bwd"].scale)
assert_allclose_across_ranks(layer2.fp8_meta["scaling_bwd"].scale_inv)
if __name__ == "__main__":
unittest.main()