Commit bd05b0dc authored by wenjh
Browse files

Merge branch 'develop_v2.10' into release_v2.10

parents 7aeb5a72 8fc9d8f1
...@@ -21,6 +21,8 @@ ...@@ -21,6 +21,8 @@
#include <transformer_engine/transformer_engine.h> #include <transformer_engine/transformer_engine.h>
#include "util/logging.h" #include "util/logging.h"
#define FP4_TYPE_SUPPORTED (CUDA_VERSION >= 12080)
namespace test { namespace test {
size_t create_seed_from_tensor_name(const std::string& tensor_name) { size_t create_seed_from_tensor_name(const std::string& tensor_name) {
...@@ -343,10 +345,15 @@ Tensor::Tensor(const std::string& name, ...@@ -343,10 +345,15 @@ Tensor::Tensor(const std::string& name,
} }
} }
#if FP4_TYPE_SUPPORTED
const DType rowwise_type = (scaling_mode == NVTE_NVFP4_1D_SCALING) ? DType::kFloat4E2M1 : type; const DType rowwise_type = (scaling_mode == NVTE_NVFP4_1D_SCALING) ? DType::kFloat4E2M1 : type;
const DType colwise_type = (scaling_mode == NVTE_NVFP4_1D_SCALING) ? DType::kFloat4E2M1 : type; const DType colwise_type = (scaling_mode == NVTE_NVFP4_1D_SCALING) ? DType::kFloat4E2M1 : type;
tensor_.set_rowwise_data(dptr_rowwise, rowwise_type, shape); tensor_.set_rowwise_data(dptr_rowwise, rowwise_type, shape);
tensor_.set_columnwise_data(dptr_columnwise, colwise_type, columnwise_shape); tensor_.set_columnwise_data(dptr_columnwise, colwise_type, columnwise_shape);
#else
tensor_.set_rowwise_data(dptr_rowwise, type, shape);
tensor_.set_columnwise_data(dptr_columnwise, type, columnwise_shape);
#endif
if (isFp8Type(type) || isFp4Type(type)) { if (isFp8Type(type) || isFp4Type(type)) {
if (scaling_mode == NVTE_DELAYED_TENSOR_SCALING) { if (scaling_mode == NVTE_DELAYED_TENSOR_SCALING) {
...@@ -1041,7 +1048,11 @@ bool isFp8Type(DType type) { ...@@ -1041,7 +1048,11 @@ bool isFp8Type(DType type) {
} }
bool isFp4Type(DType type) { bool isFp4Type(DType type) {
#if FP4_TYPE_SUPPORTED
return type == DType::kFloat4E2M1; return type == DType::kFloat4E2M1;
#else
return false;
#endif
} }
int32_t getDeviceComputeCapability() { int32_t getDeviceComputeCapability() {
......
...@@ -112,7 +112,9 @@ struct TypeInfo { ...@@ -112,7 +112,9 @@ struct TypeInfo {
struct Helper { struct Helper {
constexpr static DType getType() { constexpr static DType getType() {
constexpr int i = static_cast<int>(current); constexpr int i = static_cast<int>(current);
if (std::is_same<U, typename std::tuple_element<i, types>::type>::value) { if constexpr (i >= std::tuple_size_v<types>) {
return DType::kNumTypes;
} else if (std::is_same<U, typename std::tuple_element<i, types>::type>::value) {
return current; return current;
} else { } else {
return Helper<U, static_cast<DType>(i + 1)>::getType(); return Helper<U, static_cast<DType>(i + 1)>::getType();
......
...@@ -1519,6 +1519,7 @@ void quantize_transpose_vector_blockwise(const SimpleTensor& input, SimpleTensor ...@@ -1519,6 +1519,7 @@ void quantize_transpose_vector_blockwise(const SimpleTensor& input, SimpleTensor
size_t smem_bytes = kSMemSize_Colwise * sizeof(InputType); size_t smem_bytes = kSMemSize_Colwise * sizeof(InputType);
const size_t num_blocks_x = DIVUP(row_length, (size_t)(block_len / 2)); const size_t num_blocks_x = DIVUP(row_length, (size_t)(block_len / 2));
const size_t num_blocks_y = DIVUP(num_rows, (size_t)(block_len)); const size_t num_blocks_y = DIVUP(num_rows, (size_t)(block_len));
dim3 grid(num_blocks_x, num_blocks_y, 1);
if (smem_bytes >= 48 * 1024) { if (smem_bytes >= 48 * 1024) {
cudaError_t err = cudaFuncSetAttribute( cudaError_t err = cudaFuncSetAttribute(
(const void*)&block_scaled_1d_cast_transpose_kernel_colwise< (const void*)&block_scaled_1d_cast_transpose_kernel_colwise<
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment