Commit 8fc9d8f1 authored by maxiao3, committed by wenjh
Browse files

Fix issues related to L0cpp tests



1. Resolve out-of-bounds issue in the types struct (TypeInfo)
2. Fix TestFusedCastFloat8Vectorwise test case failure
Signed-off-by: maxiao3 <maxiao3@sugon.com>

See merge request dcutoolkit/deeplearing/TransformerEngine!73
parent 261e476b
......@@ -21,6 +21,8 @@
#include <transformer_engine/transformer_engine.h>
#include "util/logging.h"
// Compile-time switch for the FP4 code paths in this file: evaluates to true
// only when CUDA_VERSION is at least 12080. If CUDA_VERSION is not defined,
// the preprocessor treats it as 0 inside #if, so the FP4 paths are disabled.
// NOTE(review): 12080 presumably encodes CUDA 12.8 — confirm.
#define FP4_TYPE_SUPPORTED (CUDA_VERSION >= 12080)
namespace test {
size_t create_seed_from_tensor_name(const std::string& tensor_name) {
......@@ -343,10 +345,15 @@ Tensor::Tensor(const std::string& name,
}
}
#if FP4_TYPE_SUPPORTED
const DType rowwise_type = (scaling_mode == NVTE_NVFP4_1D_SCALING) ? DType::kFloat4E2M1 : type;
const DType colwise_type = (scaling_mode == NVTE_NVFP4_1D_SCALING) ? DType::kFloat4E2M1 : type;
tensor_.set_rowwise_data(dptr_rowwise, rowwise_type, shape);
tensor_.set_columnwise_data(dptr_columnwise, colwise_type, columnwise_shape);
#else
tensor_.set_rowwise_data(dptr_rowwise, type, shape);
tensor_.set_columnwise_data(dptr_columnwise, type, columnwise_shape);
#endif
if (isFp8Type(type) || isFp4Type(type)) {
if (scaling_mode == NVTE_DELAYED_TENSOR_SCALING) {
......@@ -1041,7 +1048,11 @@ bool isFp8Type(DType type) {
}
// Tells whether `type` is DType::kFloat4E2M1.
// When FP4_TYPE_SUPPORTED evaluates to false, the enumerator is never
// referenced and every type is reported as non-FP4.
bool isFp4Type(DType type) {
#if !FP4_TYPE_SUPPORTED
  // FP4 is unavailable in this build, so nothing qualifies.
  return false;
#else
  const bool matches_e2m1 = (DType::kFloat4E2M1 == type);
  return matches_e2m1;
#endif
}
int32_t getDeviceComputeCapability() {
......
......@@ -112,7 +112,9 @@ struct TypeInfo {
struct Helper {
constexpr static DType getType() {
constexpr int i = static_cast<int>(current);
if (std::is_same<U, typename std::tuple_element<i, types>::type>::value) {
if constexpr (i >= std::tuple_size_v<types>) {
return DType::kNumTypes;
} else if (std::is_same<U, typename std::tuple_element<i, types>::type>::value) {
return current;
} else {
return Helper<U, static_cast<DType>(i + 1)>::getType();
......
......@@ -1519,6 +1519,7 @@ void quantize_transpose_vector_blockwise(const SimpleTensor& input, SimpleTensor
size_t smem_bytes = kSMemSize_Colwise * sizeof(InputType);
const size_t num_blocks_x = DIVUP(row_length, (size_t)(block_len / 2));
const size_t num_blocks_y = DIVUP(num_rows, (size_t)(block_len));
dim3 grid(num_blocks_x, num_blocks_y, 1);
if (smem_bytes >= 48 * 1024) {
cudaError_t err = cudaFuncSetAttribute(
(const void*)&block_scaled_1d_cast_transpose_kernel_colwise<
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment