Kwyss/new shape owns data (#1708)

* Reapply "Allow NVTEShape to own data." (#1703)
  This reverts commit 91405eb4.
  Signed-off-by: Keith Wyss <kwyss@nvidia.com>
* Update code so that data is replaced by an array.
  Signed-off-by: Keith Wyss <kwyss@nvidia.com>
* Specify unambiguous Tensor constructor in tests.
  Signed-off-by: Keith Wyss <kwyss@nvidia.com>
* Fix assumption in test of 2D shape.
  Signed-off-by: Keith Wyss <kwyss@nvidia.com>
* Remove row and col.
  Signed-off-by: Keith Wyss <kwyss@nvidia.com>
---------
Signed-off-by: Keith Wyss <kwyss@nvidia.com>

Kwyss/new shape owns data (#1708)
* Reapply "Allow NVTEShape to own data." (#1703)
  This reverts commit 91405eb4.
  Signed-off-by: Keith Wyss <kwyss@nvidia.com>
* Update code so that data is replaced by an array.
  Signed-off-by: Keith Wyss <kwyss@nvidia.com>
* Specify unambiguous Tensor constructor in tests.
  Signed-off-by: Keith Wyss <kwyss@nvidia.com>
* Fix assumption in test of 2D shape.
  Signed-off-by: Keith Wyss <kwyss@nvidia.com>
* Remove row and col.
  Signed-off-by: Keith Wyss <kwyss@nvidia.com>
---------
Signed-off-by: Keith Wyss <kwyss@nvidia.com>
afb70224 · kwyss-nvidia · GitHub · 21ec6e04 · afb70224 · afb70224
Unverified Commit afb70224 authored Apr 29, 2025 by kwyss-nvidia Committed by GitHub Apr 29, 2025
20 changed files
--- a/tests/cpp/operator/test_act.cu
+++ b/tests/cpp/operator/test_act.cu
@@ -116,10 +116,10 @@ void performTest(const size_t N, const size_t H) {
  DType itype = TypeInfo<IType>::dtype;
  DType otype = TypeInfo<OType>::dtype;
-  Tensor input("input", { N, H }, itype);
+  Tensor input("input", std::vector<size_t>{ N, H }, itype);
-  Tensor output("output", { N, H }, otype);
+  Tensor output("output", std::vector<size_t>{ N, H }, otype);
-  Tensor igrad("igrad", { N, H }, itype);
+  Tensor igrad("igrad", std::vector<size_t>{ N, H }, itype);
-  Tensor ograd("ograd", { N, H }, itype);
+  Tensor ograd("ograd", std::vector<size_t>{ N, H }, itype);
  fillUniform(&input);
  fillUniform(&ograd);
@@ -171,10 +171,10 @@ void performTestGLU(const size_t N, const size_t H) {
  DType itype = TypeInfo<IType>::dtype;
  DType otype = TypeInfo<OType>::dtype;
-  Tensor input("input", {N, H * 2}, itype);
+  Tensor input("input", std::vector<size_t>{N, H * 2}, itype);
-  Tensor output("output", {N, H}, otype);
+  Tensor output("output", std::vector<size_t>{N, H}, otype);
-  Tensor igrad("igrad", { N, H * 2 }, itype);
+  Tensor igrad("igrad", std::vector<size_t>{ N, H * 2 }, itype);
-  Tensor ograd("ograd", { N, H }, itype);
+  Tensor ograd("ograd", std::vector<size_t>{ N, H }, itype);
  fillUniform(&input);
  fillUniform(&ograd);

--- a/tests/cpp/operator/test_cast_dbias.cu
+++ b/tests/cpp/operator/test_cast_dbias.cu
@@ -70,7 +70,7 @@ void performTest(const std::vector<size_t>& shape) {
  Tensor output_c("output_c", shape, otype);
  // dbias has the same data type with "output grad"
-  Tensor dbias("dbias", {H}, itype);
+  Tensor dbias("dbias", std::vector<size_t>{H}, itype);
  fillUniform(&input);
  setRandomScale(&output_c);

--- a/tests/cpp/operator/test_cast_dbias_dgelu.cu
+++ b/tests/cpp/operator/test_cast_dbias_dgelu.cu
@@ -79,7 +79,7 @@ void performTest(const std::vector<size_t>& shape) {
  Tensor output_c("output_c", shape, otype);
  // dbias has the same data type with "output grad"
-  Tensor dbias("dbias", {H}, itype);
+  Tensor dbias("dbias", std::vector<size_t>{H}, itype);
  fillUniform(&input);
  fillUniform(&grad);

--- a/tests/cpp/operator/test_cast_float8blockwise.cu
+++ b/tests/cpp/operator/test_cast_float8blockwise.cu
@@ -280,7 +280,7 @@ void runTestCase(const ProcessingMethod processing_method, const std::vector<siz
  Tensor grad("grad", shape, itype);
  Tensor output_c("output_c", shape, otype, rowwise, colwise,
                  opts.block_scaling_dim == 2 ? NVTE_BLOCK_SCALING_2D : NVTE_BLOCK_SCALING_1D);
-  Tensor output_dbias("output_dbias", {cols}, itype);
+  Tensor output_dbias("output_dbias", std::vector<size_t>{cols}, itype);
  std::unique_ptr<OutputType[]> ref_output = std::make_unique<OutputType[]>(rows * cols);
  std::unique_ptr<OutputType[]> ref_output_t = std::make_unique<OutputType[]>(rows * cols);
@@ -355,7 +355,7 @@ void runTestCaseOneDimensionalBlocks(const ProcessingMethod processing_method,
  Tensor grad("grad", shape, itype);
  Tensor output_c("output_c", shape, otype, rowwise, colwise,
                  opts.block_scaling_dim == 2 ? NVTE_BLOCK_SCALING_2D : NVTE_BLOCK_SCALING_1D);
-  Tensor output_dbias("output_dbias", {cols}, itype);
+  Tensor output_dbias("output_dbias", std::vector<size_t>{cols}, itype);
  std::unique_ptr<OutputType[]> ref_output = std::make_unique<OutputType[]>(rows * cols);
  std::unique_ptr<OutputType[]> ref_output_t = std::make_unique<OutputType[]>(rows * cols);

--- a/tests/cpp/operator/test_cast_mxfp8.cu
+++ b/tests/cpp/operator/test_cast_mxfp8.cu
@@ -230,7 +230,7 @@ void performTest_x1(const ProcessingMethod processing_method,
    Tensor input("input", shape, itype);
    Tensor grad("grad", shape, itype);
    Tensor output_c("output_c", shape, otype, rowwise, colwise, NVTE_MXFP8_1D_SCALING);
-    Tensor output_dbias("output_dbias", { cols }, itype);
+    Tensor output_dbias("output_dbias", std::vector<size_t>{ cols }, itype);
    std::unique_ptr<OutputType[]> ref_output_c = std::make_unique<OutputType[]>(rows * cols);
    std::unique_ptr<InputType[]> ref_output_dbias = std::make_unique<InputType[]>(cols);
@@ -368,7 +368,7 @@ void performTest_x2(const ProcessingMethod processing_method,
    Tensor input("input", shape, itype);
    Tensor grad("grad", shape, itype);
    Tensor output("output", shape, otype, true, true, NVTE_MXFP8_1D_SCALING);
-    Tensor output_dbias("output_dbias", { cols }, itype);
+    Tensor output_dbias("output_dbias", std::vector<size_t>{ cols }, itype);
    std::unique_ptr<OutputType[]> ref_output_c_rowwise = std::make_unique<OutputType[]>(rows * cols);
    std::unique_ptr<OutputType[]> ref_output_c_colwise = std::make_unique<OutputType[]>(rows * cols);

--- a/tests/cpp/operator/test_cast_mxfp8_gated_swiglu.cu
+++ b/tests/cpp/operator/test_cast_mxfp8_gated_swiglu.cu
@@ -204,8 +204,8 @@ void performTest_x1(const size_t rows,
    // std::cout << "blocks_X: " << blocks_X << std::endl;
    // std::cout << "scales_stride: " << scales_stride << std::endl;
-    Tensor grad("grad", { rows, cols }, itype);
+    Tensor grad("grad", std::vector<size_t>{ rows, cols }, itype);
-    Tensor input("input", { rows, cols * 2 }, itype);
+    Tensor input("input", std::vector<size_t>{ rows, cols * 2 }, itype);
    const size_t output_cols = (IS_DGATED ? 2 : 1) * cols;
@@ -289,8 +289,8 @@ void performTest_x2(const size_t rows,
    DType itype = TypeInfo<IType>::dtype;
    DType otype = TypeInfo<OType>::dtype;
-    Tensor grad("grad", { rows, cols }, itype);
+    Tensor grad("grad", std::vector<size_t>{ rows, cols }, itype);
-    Tensor input("input", { rows, cols * 2 }, itype);
+    Tensor input("input", std::vector<size_t>{ rows, cols * 2 }, itype);
    const size_t output_cols = (IS_DGATED ? 2 : 1) * cols;

--- a/tests/cpp/operator/test_cast_transpose.cu
+++ b/tests/cpp/operator/test_cast_transpose.cu
@@ -47,8 +47,8 @@ void performTest(const size_t N, const size_t H) {
  DType itype = TypeInfo<InputType>::dtype;
  DType otype = TypeInfo<OutputType>::dtype;
-  Tensor input("input", { N, H }, itype);
+  Tensor input("input", std::vector<size_t>{ N, H }, itype);
-  Tensor output("output", { N, H }, otype, true, true);
+  Tensor output("output", std::vector<size_t>{ N, H }, otype, true, true);
  std::unique_ptr<OutputType[]> ref_output_c = std::make_unique<OutputType[]>(N * H);
  std::unique_ptr<OutputType[]> ref_output_t = std::make_unique<OutputType[]>(N * H);

--- a/tests/cpp/operator/test_cast_transpose_current_scaling.cu
+++ b/tests/cpp/operator/test_cast_transpose_current_scaling.cu
@@ -112,8 +112,8 @@ void performTest(const size_t N, const size_t H) {
    }
  }
-  Tensor input("input", { N, H }, itype);
+  Tensor input("input", std::vector<size_t>{ N, H }, itype);
-  Tensor output("output", { N, H }, otype, true, true);
+  Tensor output("output", std::vector<size_t>{ N, H }, otype, true, true);
  std::unique_ptr<OutputType[]> ref_output_c = std::make_unique<OutputType[]>(N * H);
  std::unique_ptr<OutputType[]> ref_output_t = std::make_unique<OutputType[]>(N * H);

--- a/tests/cpp/operator/test_cast_transpose_dbias.cu
+++ b/tests/cpp/operator/test_cast_transpose_dbias.cu
@@ -65,11 +65,11 @@ void performTest(const size_t N, const size_t H) {
  DType itype = TypeInfo<IType>::dtype;
  DType otype = TypeInfo<OType>::dtype;
-  Tensor input("input", {N, H}, itype);
+  Tensor input("input", std::vector<size_t>{N, H}, itype);
-  Tensor output("output", {N, H}, otype, true, true);
+  Tensor output("output", std::vector<size_t>{N, H}, otype, true, true);
  // dbias has the same data type with "output grad"
-  Tensor dbias("dbias", {H}, itype);
+  Tensor dbias("dbias", std::vector<size_t>{H}, itype);
  fillUniform(&input);
  setRandomScale(&output);

--- a/tests/cpp/operator/test_cast_transpose_dbias_dgelu.cu
+++ b/tests/cpp/operator/test_cast_transpose_dbias_dgelu.cu
@@ -76,12 +76,12 @@ void performTest(const size_t N, const size_t H) {
  DType itype = TypeInfo<IType>::dtype;
  DType otype = TypeInfo<OType>::dtype;
-  Tensor input("input", {N, H}, itype);
+  Tensor input("input", std::vector<size_t>{N, H}, itype);
-  Tensor gelu_input("gelu_input", {N, H}, itype);
+  Tensor gelu_input("gelu_input", std::vector<size_t>{N, H}, itype);
-  Tensor output("output", {N, H}, otype, true, true);
+  Tensor output("output", std::vector<size_t>{N, H}, otype, true, true);
  // dbias has the same data type with "output grad"
-  Tensor dbias("dbias", {H}, itype);
+  Tensor dbias("dbias", std::vector<size_t>{H}, itype);
  fillUniform(&input);
  fillUniform(&gelu_input);

--- a/tests/cpp/operator/test_cast_transpose_dgeglu.cu
+++ b/tests/cpp/operator/test_cast_transpose_dgeglu.cu
@@ -74,9 +74,9 @@ void performTest(const size_t N, const size_t H) {
  DType itype = TypeInfo<IType>::dtype;
  DType otype = TypeInfo<OType>::dtype;
-  Tensor grad("grad", {N, H}, itype);
+  Tensor grad("grad", std::vector<size_t>{N, H}, itype);
-  Tensor input("input", {N, H * 2}, itype);
+  Tensor input("input", std::vector<size_t>{N, H * 2}, itype);
-  Tensor output("output", {N, H * 2}, otype, true, true);
+  Tensor output("output", std::vector<size_t>{N, H * 2}, otype, true, true);
  fillUniform(&grad);
  fillUniform(&input);

--- a/tests/cpp/operator/test_causal_softmax.cu
+++ b/tests/cpp/operator/test_causal_softmax.cu
@@ -153,11 +153,11 @@ void performTest(
  DType itype = TypeInfo<Type>::dtype;
-  Tensor data_in("data_in", { batches, heads, rows, cols }, itype);
+  Tensor data_in("data_in", std::vector<size_t>{ batches, heads, rows, cols }, itype);
-  Tensor softmax_out("softmax_out", { batches, heads, rows, cols }, itype);
+  Tensor softmax_out("softmax_out", std::vector<size_t>{ batches, heads, rows, cols }, itype);
-  Tensor softmax_in("softmax_in", { batches, heads, rows, cols }, itype);
+  Tensor softmax_in("softmax_in", std::vector<size_t>{ batches, heads, rows, cols }, itype);
-  Tensor grads_in("grads_in", { batches, heads, rows, cols }, itype);
+  Tensor grads_in("grads_in", std::vector<size_t>{ batches, heads, rows, cols }, itype);
-  Tensor grads_out("grads_out", { batches, heads, rows, cols }, itype);
+  Tensor grads_out("grads_out", std::vector<size_t>{ batches, heads, rows, cols }, itype);
  const size_t elements_total = batches * heads * rows * cols;
  std::unique_ptr<Type[]> softmax_out_ref = std::make_unique<Type[]>(elements_total);

--- a/tests/cpp/operator/test_dequantize_mxfp8.cu
+++ b/tests/cpp/operator/test_dequantize_mxfp8.cu
@@ -214,10 +214,10 @@ void performTest_x1(const size_t rows,
    const size_t blocks_num = rowwise ? blocks_num_rowwise : blocks_num_colwise;
    const size_t scales_stride = rowwise ? blocks_X_rowwise : blocks_X_colwise;
-    Tensor input("input", { rows, cols }, itype, rowwise, colwise, NVTE_MXFP8_1D_SCALING);
+    Tensor input("input", std::vector<size_t>{ rows, cols }, itype, rowwise, colwise, NVTE_MXFP8_1D_SCALING);
    // Output data are written to the rowwise ptr regardless of the scaling direction
-    Tensor output("output", { rows, cols }, otype, true, false);
+    Tensor output("output", std::vector<size_t>{ rows, cols }, otype, true, false);
    std::unique_ptr<OutputType[]> ref_output = std::make_unique<OutputType[]>(rows * cols);
    std::unique_ptr<fp8e8m0[]> scales = std::make_unique<fp8e8m0[]>(blocks_num);
@@ -267,11 +267,11 @@ void performTest_quantize_then_dequantize(const size_t rows,
    // input --> quantized --> output (dequantized)
    // input == output
-    Tensor input("input", { rows, cols }, in_type);
+    Tensor input("input", std::vector<size_t>{ rows, cols }, in_type);
-    Tensor quantized("quantized", { rows, cols }, intermed_type, rowwise, colwise, NVTE_MXFP8_1D_SCALING);
+    Tensor quantized("quantized", std::vector<size_t>{ rows, cols }, intermed_type, rowwise, colwise, NVTE_MXFP8_1D_SCALING);
    // Output data are written to the rowwise ptr regardless of the scaling direction
-    Tensor output("output", { rows, cols }, out_type, true, false);
+    Tensor output("output", std::vector<size_t>{ rows, cols }, out_type, true, false);
    // fillCase<EncodingType>(&input, InputsFillCase::minNorm_to_maxNorm);
    fillCase<EncodingType>(&input, InputsFillCase::uniform);
@@ -333,8 +333,8 @@ void performTest_x2(const size_t rows,
    const size_t blocks_num_rowwise = blocks_Y_rowwise * blocks_X_rowwise;
    const size_t blocks_num_colwise = blocks_Y_colwise * blocks_X_colwise;
-    Tensor input("input", { rows, cols }, itype, true, true, NVTE_MXFP8_1D_SCALING);
+    Tensor input("input", std::vector<size_t>{ rows, cols }, itype, true, true, NVTE_MXFP8_1D_SCALING);
-    Tensor output("output", { rows, cols }, otype);
+    Tensor output("output", std::vector<size_t>{ rows, cols }, otype);
    std::unique_ptr<OutputType[]> ref_output_rowwise = std::make_unique<OutputType[]>(rows * cols);
    std::unique_ptr<OutputType[]> ref_output_colwise = std::make_unique<OutputType[]>(rows * cols);

--- a/tests/cpp/operator/test_multi_cast_transpose.cu
+++ b/tests/cpp/operator/test_multi_cast_transpose.cu
@@ -81,9 +81,9 @@ void performTest() {
  for (size_t tensor_id = 0; tensor_id < num_tensors; ++tensor_id) {
    const size_t height = tensor_dims[tensor_id].first;
    const size_t width = tensor_dims[tensor_id].second;
-    input_list.emplace_back(Tensor("input_" + std::to_string(tensor_id), { height, width }, itype));
+    input_list.emplace_back(Tensor("input_" + std::to_string(tensor_id), std::vector<size_t>{ height, width }, itype));
    output_list.emplace_back(Tensor("output_" + std::to_string(tensor_id),
-                                    { height, width }, otype, true, true));
+                                    std::vector<size_t>{ height, width }, otype, true, true));
    auto& input = input_list.back();
    auto& output = output_list.back();

--- a/tests/cpp/operator/test_multi_padding.cu
+++ b/tests/cpp/operator/test_multi_padding.cu
@@ -85,8 +85,8 @@ void performTest() {
    const size_t height = tensor_dims[tensor_id].first;
    const size_t width = tensor_dims[tensor_id].second;
    const size_t padded_height = (height + align - 1) / align * align;
-    input_list.emplace_back(Tensor("input_" + std::to_string(tensor_id), { height, width }, itype));
+    input_list.emplace_back(Tensor("input_" + std::to_string(tensor_id), std::vector<size_t>{ height, width }, itype));
-    output_list.emplace_back(Tensor("output_" + std::to_string(tensor_id), { padded_height, width }, otype));
+    output_list.emplace_back(Tensor("output_" + std::to_string(tensor_id), std::vector<size_t>{ padded_height, width }, otype));
    auto& input = input_list.back();
    auto& output = output_list.back();

--- a/tests/cpp/operator/test_normalization.cu
+++ b/tests/cpp/operator/test_normalization.cu
@@ -48,16 +48,16 @@ void performTest(const size_t N, const size_t H, const bool zero_centered_gamma,
    return;
  }
-  Tensor input("input", { N, H }, itype);
+  Tensor input("input", std::vector<size_t>{ N, H }, itype);
-  Tensor z("z", { N, H }, otype);
+  Tensor z("z", std::vector<size_t>{ N, H }, otype);
-  Tensor gamma("gamma", { H }, wtype);
+  Tensor gamma("gamma", std::vector<size_t>{ H }, wtype);
-  Tensor beta("beta", { H }, wtype);
+  Tensor beta("beta", std::vector<size_t>{ H }, wtype);
-  Tensor mu("mu", { N }, DType::kFloat32);
+  Tensor mu("mu", std::vector<size_t>{ N }, DType::kFloat32);
-  Tensor rsigma("rsigma", { N }, DType::kFloat32);
+  Tensor rsigma("rsigma", std::vector<size_t>{ N }, DType::kFloat32);
-  Tensor dz("dz", { N, H }, wtype);
+  Tensor dz("dz", std::vector<size_t>{ N, H }, wtype);
-  Tensor dx("dx", { N, H }, itype);
+  Tensor dx("dx", std::vector<size_t>{ N, H }, itype);
-  Tensor dgamma("dgamma", { H }, wtype);
+  Tensor dgamma("dgamma", std::vector<size_t>{ H }, wtype);
-  Tensor dbeta("dbeta", { H }, wtype);
+  Tensor dbeta("dbeta", std::vector<size_t>{ H }, wtype);
  Tensor workspace_fwd, workspace_bwd;
  fillUniform(&input);

--- a/tests/cpp/operator/test_normalization_mxfp8.cu
+++ b/tests/cpp/operator/test_normalization_mxfp8.cu
@@ -116,12 +116,12 @@ void performTest(const size_t N, const size_t H, const bool zero_centered_gamma,
  DType wtype = TypeInfo<WeightType>::dtype;
  DType otype = TypeInfo<OutputType>::dtype;
-  Tensor input("input", { N, H }, itype);
+  Tensor input("input", std::vector<size_t>{ N, H }, itype);
-  Tensor z("z", { N, H }, otype, true, is_training, NVTE_MXFP8_1D_SCALING);
+  Tensor z("z", std::vector<size_t>{ N, H }, otype, true, is_training, NVTE_MXFP8_1D_SCALING);
-  Tensor gamma("gamma", { H }, wtype);
+  Tensor gamma("gamma", std::vector<size_t>{ H }, wtype);
-  Tensor beta("beta", { H }, wtype);
+  Tensor beta("beta", std::vector<size_t>{ H }, wtype);
-  Tensor mu("mu", { N }, DType::kFloat32);
+  Tensor mu("mu", std::vector<size_t>{ N }, DType::kFloat32);
-  Tensor rsigma("rsigma", { N }, DType::kFloat32);
+  Tensor rsigma("rsigma", std::vector<size_t>{ N }, DType::kFloat32);
  Tensor workspace;
@@ -164,7 +164,7 @@ void performTest(const size_t N, const size_t H, const bool zero_centered_gamma,
    nvte_enable_zero_centered_gamma_in_weight_dtype(false);
  }
-  Tensor dequantized_output("dequantized_output", { N, H }, DType::kFloat32, true, true);
+  Tensor dequantized_output("dequantized_output", std::vector<size_t>{ N, H }, DType::kFloat32, true, true);
  dequantize_2x<OutputType, fp8e8m0>(z, dequantized_output, is_training);

--- a/tests/cpp/operator/test_qdq.cu
+++ b/tests/cpp/operator/test_qdq.cu
@@ -58,8 +58,8 @@ void performTestQ(const size_t N) {
  DType itype = TypeInfo<InputType>::dtype;
  DType otype = TypeInfo<OutputType>::dtype;
-  Tensor input("input", { N }, itype);
+  Tensor input("input", std::vector<size_t>{ N }, itype);
-  Tensor output("output", { N }, otype);
+  Tensor output("output", std::vector<size_t>{ N }, otype);
  std::unique_ptr<OutputType[]> ref_output = std::make_unique<OutputType[]>(N);
@@ -89,8 +89,8 @@ void performTestDQ(const size_t N) {
  DType itype = TypeInfo<InputType>::dtype;
  DType otype = TypeInfo<OutputType>::dtype;
-  Tensor input("input", { N }, itype);
+  Tensor input("input", std::vector<size_t>{ N }, itype);
-  Tensor output("output", { N }, otype);
+  Tensor output("output", std::vector<size_t>{ N }, otype);
  std::unique_ptr<OutputType[]> ref_output = std::make_unique<OutputType[]>(N);

--- a/tests/cpp/operator/test_transpose.cu
+++ b/tests/cpp/operator/test_transpose.cu
@@ -37,8 +37,8 @@ void performTest(const size_t N, const size_t H) {
  DType dtype = TypeInfo<Type>::dtype;
-  Tensor input("input", { N, H }, dtype);
+  Tensor input("input", std::vector<size_t>{ N, H }, dtype);
-  Tensor output("output", { H, N }, dtype);
+  Tensor output("output", std::vector<size_t>{ H, N }, dtype);
  std::unique_ptr<Type[]> ref_output = std::make_unique<Type[]>(N * H);

--- a/tests/cpp/test_common.cu
+++ b/tests/cpp/test_common.cu
@@ -112,8 +112,8 @@ struct scale_inv_meta {
  size_t type_size;
 };
-NVTEShape convertShape(const std::vector<size_t>& shape) {
+NVTEShape convertShape(const std::vector<size_t>& s) {
-  return {shape.data(), shape.size()};
+  return nvte_make_shape(s.data(), s.size());
 }
 std::pair<scale_inv_meta, scale_inv_meta> get_scales(const NVTEShape& shape,
@@ -240,7 +240,7 @@ Tensor::Tensor(const std::string& name,
  std::vector<size_t> normalized_shape_v = {product(shape, 0, shape.ndim - 1),
                                            shape.data[shape.ndim - 1]};
  NVTEShape normalized_shape = convertShape(normalized_shape_v);
-  NVTEShape columnwise_shape{nullptr, 0};
+  NVTEShape columnwise_shape = {};
  std::vector<size_t> columnwise_shape_vec;
  if (scaling_mode == NVTE_DELAYED_TENSOR_SCALING || scaling_mode == NVTE_BLOCK_SCALING_1D || scaling_mode == NVTE_BLOCK_SCALING_2D) {
@@ -257,8 +257,7 @@ Tensor::Tensor(const std::string& name,
  }
  if (columnwise) {
-    columnwise_shape.data = columnwise_shape_vec.data();
+    columnwise_shape = nvte_make_shape(columnwise_shape_vec.data(), columnwise_shape_vec.size());
-    columnwise_shape.ndim = columnwise_shape_vec.size();
  }
  tensor_ = TensorWrapper(scaling_mode);
@@ -739,8 +738,6 @@ void fillUniform(Tensor *t) {
 template<typename InputEncoding, InputsFillCase Case>
 void fillCase_special(Tensor *t) {
  const size_t size = product(t->rowwise_shape());
-  const size_t rows = t->rowwise_shape().data[0];
-  const size_t cols = t->rowwise_shape().data[1];
  if constexpr (Case == InputsFillCase::zeros) {
    TRANSFORMER_ENGINE_TYPE_SWITCH_FP16_FP32_ONLY(t->dtype(), InputType, {
@@ -760,9 +757,7 @@ void fillCase_special(Tensor *t) {
    std::uniform_real_distribution<> dis_sign(-1.0, 1.0);
    TRANSFORMER_ENGINE_TYPE_SWITCH_FP16_FP32_ONLY(t->dtype(), InputType, {
      InputType *data = t->rowwise_cpu_dptr<InputType>();
-      for (size_t i = 0; i < rows; ++i) {
+      for (size_t idx = 0; idx < size; ++idx) {
-        for (size_t j = 0; j < cols; ++j) {
-          const size_t idx = i * cols + j;
        const bool is_negative = (dis_sign(t->gen()) < 0.0);
        double val = dis(t->gen());
        if (is_negative) {
@@ -770,7 +765,6 @@ void fillCase_special(Tensor *t) {
        }
        data[idx] = static_cast<InputType>(val);
      }
-      }
    });
  }
  t->set_scale_inv(1.0);