Unverified commit a3ba4dff authored by Xin Yao, committed by GitHub

Fix cpp warnings (#1639)



* fix cpp warning
Signed-off-by: Xin Yao <xiny@nvidia.com>

* more fix
Signed-off-by: Xin Yao <xiny@nvidia.com>

---------
Signed-off-by: Xin Yao <xiny@nvidia.com>
parent ba605f18
@@ -96,8 +96,6 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK)
constexpr size_t in_mem = in_act_mem + in_gate_mem;
constexpr size_t out_act_mem = buff_size_aligned_out;
constexpr size_t out_gate_mem = buff_size_aligned_out;
- constexpr size_t out_mem = out_act_mem + out_gate_mem;
- // const size_t in_transaction_size = grad_mem + in_mem;
constexpr size_t in_transaction_size = buff_elems * sizeof(IType);
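The two removed lines above fit a pattern that recurs through the next several hunks: locals (often `constexpr`) whose last remaining use was in commented-out code, so the compiler flags them as unused. A minimal sketch of that warning class, with illustrative names that are not from this commit:

```cpp
// g++ -Wall -c example.cpp  (NVCC similarly reports "variable was declared but never referenced")
#include <cstddef>

size_t total_shmem() {
  constexpr size_t act_mem = 1024;
  constexpr size_t gate_mem = 1024;
  // constexpr size_t out_mem = act_mem + gate_mem;  // only referenced from comments:
  //                                                 // warning: unused variable 'out_mem'
  return act_mem + gate_mem;  // deleting the dead constant silences the warning
}
```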
@@ -108,7 +106,6 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK)
IType *in_gate_sh = reinterpret_cast<IType *>(dshmem + grad_mem + in_act_mem);
OType *out_act_sh = reinterpret_cast<OType *>(dshmem + grad_mem + in_mem);
OType *out_gate_sh = reinterpret_cast<OType *>(dshmem + grad_mem + in_mem + out_act_mem);
- // uint64_t *mbar = reinterpret_cast<uint64_t *>(dshmem + grad_mem + in_mem + out_mem);
const uint64_t *TMAP_grad_in = reinterpret_cast<const uint64_t *>(&tensor_map_grad);
const uint64_t *TMAP_in_act = reinterpret_cast<const uint64_t *>(&tensor_map_input_act);
@@ -289,7 +286,6 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK)
#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
constexpr bool USE_ROWWISE_SCALING = SCALE_DIM_X > 1;
constexpr bool USE_COLWISE_SCALING = SCALE_DIM_Y > 1;
- constexpr bool COMPUTE_IN_ROWWISE_SECTION = !USE_COLWISE_SCALING;
constexpr size_t SCALES_ROWWISE_PER_CHUNK_Y = CHUNK_DIM_Y; // 128
constexpr size_t SCALES_ROWWISE_PER_CHUNK_X = CHUNK_DIM_X / SCALE_DIM_X; // 4 = 128 / 32
@@ -826,8 +822,6 @@ void cast_mxfp8_gated(const Tensor &grad, const Tensor &gated_input, Tensor *out
size_t scale_stride_rowwise = USE_ROWWISE_SCALING ? output->scale_inv.shape[1] : 1;
size_t scale_stride_colwise = USE_COLWISE_SCALING ? output->columnwise_scale_inv.shape[1] : 1;
float *const amax_ptr = reinterpret_cast<float *>(output->amax.dptr);
e8m0_t *const scales_rowwise_ptr =
USE_ROWWISE_SCALING ? reinterpret_cast<e8m0_t *>(output->scale_inv.dptr) : nullptr;
e8m0_t *const scales_colwise_ptr =
......
@@ -142,7 +142,6 @@ __global__ void __launch_bounds__(MXFP8_THREADS_PER_CHUNK)
OType out_colwise_sh[MXFP8_BUFFERS_NUM][MXFP8_SHMEM_DIM_Y][MXFP8_SHMEM_DIM_X];
constexpr int shmem_buff_size = sizeof(in_sh) / MXFP8_BUFFERS_NUM;
constexpr int transaction_size = shmem_buff_size * (IS_DACT ? 2 : 1);
const bool is_master_thread = (threadIdx.x == 0);
@@ -513,7 +512,6 @@ __global__ void __launch_bounds__(FP8_THREADS_PER_CHUNK)
__shared__ alignas(128) OType out_sh[FP8_BUFFERS_NUM][FP8_SHMEM_DIM_Y][FP8_SHMEM_DIM_X];
constexpr int shmem_buff_size = sizeof(in_sh) / FP8_BUFFERS_NUM;
constexpr int transaction_size = shmem_buff_size * (IS_DACT ? 2 : 1);
const bool is_master_thread = (threadIdx.x == 0);
@@ -927,7 +925,6 @@ void mxfp8_quantize(const Tensor &input, const Tensor *act_input,
bool use_colwise_scaling = output->has_columnwise_data();
checkCuDriverContext(stream);
NVTE_CHECK(input.has_data(), "Cannot quantize tensor without rowwise data.");
const auto &input_shape = input.data.shape;
NVTE_CHECK(is_fp8_dtype(output->dtype()), "Output must have FP8 type.");
if (use_rowwise_scaling) {
......
@@ -56,7 +56,6 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK)
const size_t scales_stride) {
#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
constexpr bool USE_ROWWISE_SCALING = SCALE_DIM_X > 1;
constexpr bool USE_COLWISE_SCALING = SCALE_DIM_Y > 1;
constexpr size_t SCALES_ROWWISE_PER_CHUNK_Y = CHUNK_DIM_Y; // 128
constexpr size_t SCALES_ROWWISE_PER_CHUNK_X = CHUNK_DIM_X / SCALE_DIM_X; // 4 = 128 / 32
@@ -66,7 +65,6 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK)
constexpr size_t THREADS_PER_SCALE_X_ROWWISE =
DIVUP(SCALE_DIM_X, ELEMS_PER_THREAD); // 2 = 32 / 16
constexpr size_t SUBWARP_WIDTH = THREADS_PER_SCALE_X_ROWWISE; // 2
const int chunk_offset_Y = blockIdx.y * CHUNK_DIM_Y;
const int chunk_offset_X = blockIdx.x * CHUNK_DIM_X;
......
@@ -157,15 +157,15 @@ void CommOverlap::copy_into_buffer(py::handle input, py::handle quantizer, bool
char *ubuf_ptr = reinterpret_cast<char *>(_ubuf.dptr());
if (local_chunk) {
- if (input_tensor.numel() * _tp_size > (int64_t)_ubuf.numel())
+ if (input_tensor.numel() * _tp_size > _ubuf.numel())
NVTE_ERROR("input is larger than the local communication buffer!");
- if (input_tensor.element_size() != (int64_t)_ubuf.element_size())
+ if (input_tensor.element_size() != _ubuf.element_size())
NVTE_ERROR("input data type does not match communication buffer!");
ubuf_ptr += (_ubuf.numel() / _tp_size) * _tp_id * _ubuf.element_size();
} else {
- if (input_tensor.numel() > (int64_t)_ubuf.numel())
+ if (input_tensor.numel() > _ubuf.numel())
NVTE_ERROR("input is larger than the global communication buffer!");
- if (input_tensor.element_size() != (int64_t)_ubuf.element_size())
+ if (input_tensor.element_size() != _ubuf.element_size())
NVTE_ERROR("input data type does not match communication buffer!");
}
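These paired changes drop explicit `(int64_t)` casts. A plausible reading, given that the commit targets compiler warnings: both `numel()` calls already return the same unsigned type, so the cast itself created the signed/unsigned mismatch in the comparison. A minimal sketch under that assumption (the `TensorRef` type and values are illustrative, not the repository's):

```cpp
// g++ -Wall -c example.cpp   (-Wsign-compare is part of -Wall for C++)
#include <cstddef>
#include <cstdint>

struct TensorRef {
  std::size_t numel() const { return 1024; }  // assumed unsigned, like the real buffers
};

bool too_large(const TensorRef& input, const TensorRef& ubuf, int tp_size) {
  // return input.numel() * tp_size > (std::int64_t)ubuf.numel();  // size_t vs int64_t [-Wsign-compare]
  return input.numel() * tp_size > ubuf.numel();                   // both sides unsigned: no warning
}
```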
@@ -189,7 +189,7 @@ py::object CommOverlap::get_buffer(py::handle quantizer, bool local_chunk,
std::vector<int64_t> torch_shape;
if (shape.has_value()) {
torch_shape = shape.value();
- auto requested = product(torch_shape);
+ size_t requested = product(torch_shape);
auto expected = local_chunk ? _ubuf.numel() / _tp_size : _ubuf.numel();
NVTE_CHECK(requested == expected, "Number of elements in the requested shape (", requested,
") does not match allocated buffer size (", expected, ")!");
@@ -253,18 +253,18 @@ void CommOverlapP2P::copy_into_buffer(py::handle input, py::handle quantizer, bo
at::cuda::CUDAStream stream_main = at::cuda::getCurrentCUDAStream();
if (local_chunk) {
// Copy input to the target ubuf chunk by rank offset
- if (input_tensor.numel() * _tp_size > (int64_t)_ubuf.numel())
+ if (input_tensor.numel() * _tp_size > _ubuf.numel())
NVTE_ERROR("input is larger than the local communication buffer!");
- if (input_tensor.element_size() != (int64_t)_ubuf.element_size())
+ if (input_tensor.element_size() != _ubuf.element_size())
NVTE_ERROR("input data type does not match communication buffer!");
NVTE_CHECK_CUDA(cudaMemcpyAsync(_ubufs[_tp_id].dptr(), input_ptr,
input_tensor.numel() * input_tensor.element_size(),
cudaMemcpyDeviceToDevice, (cudaStream_t)stream_main));
} else {
- if (input_tensor.numel() > (int64_t)_ubuf.numel())
+ if (input_tensor.numel() > _ubuf.numel())
NVTE_ERROR("input is larger than the global communication buffer!");
- if (input_tensor.element_size() != (int64_t)_ubuf.element_size())
+ if (input_tensor.element_size() != _ubuf.element_size())
NVTE_ERROR("input data type does not match communication buffer!");
NVTE_CHECK_CUDA(cudaMemcpyAsync(_ubuf.dptr(), input_ptr,
input_tensor.numel() * input_tensor.element_size(),
@@ -280,7 +280,7 @@ py::object CommOverlapP2P::get_buffer(py::handle quantizer, bool local_chunk,
std::vector<int64_t> torch_shape;
if (shape.has_value()) {
torch_shape = shape.value();
- auto requested = product(torch_shape);
+ size_t requested = product(torch_shape);
auto expected = local_chunk ? _ubufs[_tp_id].numel() : _ubuf.numel();
NVTE_CHECK(requested == expected, "Number of elements in the requested shape (", requested,
") does not match allocated buffer size (", expected, ")!");
......
@@ -318,12 +318,11 @@ std::optional<std::vector<at::Tensor>> te_general_grouped_gemm(
std::vector<size_t> single_output_begins;
std::vector<size_t> single_output_ends;
- int slicing_dim;
if (single_output && D == std::nullopt) {
NVTE_ERROR("not implemented, D should be allocated for single output case.");
}
- void* output_data_ptr;
+ void* output_data_ptr = nullptr;
if (single_output) {
output_data_ptr = (*D)[0].data_ptr();
}
......
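Initializing `output_data_ptr` to `nullptr` addresses the maybe-uninitialized class of warning: the `NVTE_ERROR` guard above means the pointer is always assigned before use, but the compiler cannot prove that across an opaque call. A reduced sketch (function names here are illustrative):

```cpp
// g++ -Wall -O2 -c example.cpp   (-Wmaybe-uninitialized generally needs optimization enabled)
void fail(const char* msg);   // stand-in for NVTE_ERROR; the compiler cannot see that it throws
void consume(void* p);

void run(bool single_output, void* src) {
  if (!single_output) fail("not implemented, D should be allocated for single output case.");
  // void* out;              // warning: 'out' may be used uninitialized [-Wmaybe-uninitialized]
  void* out = nullptr;       // explicit init keeps every path well defined
  if (single_output) {
    out = src;
  }
  consume(out);
}
```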
@@ -17,7 +17,7 @@ void fused_multi_row_padding(at::Tensor input, at::Tensor output,
NVTE_CHECK(input.dim() == 2, "Dimension of input must equal 2.");
NVTE_CHECK(output.dim() == 2, "Dimension of output must equal 2.");
- const int num_tensors = input_row_list.size();
+ const auto num_tensors = input_row_list.size();
// Extract properties from PyTorch tensors
std::vector<void*> input_dptr_list, output_dptr_list;
std::vector<std::vector<size_t>> input_shape_list, output_shape_list;
......
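Switching `num_tensors` from `int` to `auto` lets it pick up the `size_t` returned by `std::vector::size()`, avoiding an implicit 64-to-32-bit narrowing. A minimal sketch (the vector's element type is illustrative):

```cpp
// g++ -Wall -Wconversion -c example.cpp
#include <cstddef>
#include <vector>

void pad_rows(const std::vector<std::size_t>& input_row_list) {
  // const int num_tensors = input_row_list.size();   // size_t -> int may truncate [-Wconversion]
  const auto num_tensors = input_row_list.size();     // deduces size_t; no conversion at all
  for (std::size_t i = 0; i < num_tensors; ++i) {
    // per-tensor work would go here
  }
}
```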
@@ -323,7 +323,8 @@ std::pair<TensorWrapper, py::object> Float8BlockQuantizer::create_tensor(
"Expected 1 or 2. Got ",
block_scaling_dim);
}
- scale_inv_rowwise = at::empty({sinv0, sinv1}, scale_opts);
+ scale_inv_rowwise =
+     at::empty({static_cast<int64_t>(sinv0), static_cast<int64_t>(sinv1)}, scale_opts);
tensor.set_rowwise_data(data_rowwise.data_ptr(), this->dtype, shape);
tensor.set_rowwise_scale_inv(scale_inv_rowwise.data_ptr(), DType::kFloat32,
std::vector<size_t>{sinv0, sinv1});
@@ -359,7 +360,8 @@ std::pair<TensorWrapper, py::object> Float8BlockQuantizer::create_tensor(
block_scaling_dim);
}
data_colwise = at::empty(torch_columnwise_shape, opts);
- scale_inv_colwise = at::empty({sinv0, sinv1}, scale_opts);
+ scale_inv_colwise =
+     at::empty({static_cast<int64_t>(sinv0), static_cast<int64_t>(sinv1)}, scale_opts);
tensor.set_columnwise_data(data_colwise.data_ptr(), this->dtype, columnwise_shape);
tensor.set_columnwise_scale_inv(scale_inv_colwise.data_ptr(), DType::kFloat32,
......
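The `static_cast<int64_t>` wrappers address narrowing inside a braced initializer list: `sinv0`/`sinv1` are presumably `size_t`, while `at::empty` takes `int64_t` extents, and a `size_t` to `int64_t` conversion inside braces is a narrowing conversion the compiler flags. A torch-free sketch of the same rule:

```cpp
// g++ -std=c++17 -Wall -c example.cpp
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<std::int64_t> make_shape(std::size_t rows, std::size_t cols) {
  // return {rows, cols};  // narrowing conversion of 'rows' from 'size_t' to 'int64_t' in braces
  return {static_cast<std::int64_t>(rows), static_cast<std::int64_t>(cols)};
}
```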
@@ -21,7 +21,7 @@ std::vector<py::object> fused_multi_quantize(std::vector<py::handle> input_list,
auto none = py::none();
// create TE tensors from input
- for (int i = 0; i < input_list.size(); i++) {
+ for (size_t i = 0; i < input_list.size(); i++) {
auto input_tensor = makeTransformerEngineTensor(input_list[i], none);
const NVTEShape input_shape = input_tensor.shape();
......
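The loop-index change is the classic sign-compare fix: `std::vector::size()` returns `size_t`, so an `int` induction variable forces a signed/unsigned comparison on every iteration. A minimal sketch:

```cpp
// g++ -Wall -c example.cpp   (-Wsign-compare is part of -Wall for C++)
#include <cstddef>
#include <vector>

int count_nonzero(const std::vector<int>& v) {
  int n = 0;
  // for (int i = 0; i < v.size(); i++)         // int vs size_t [-Wsign-compare]
  for (std::size_t i = 0; i < v.size(); i++) {  // index type matches what size() returns
    if (v[i] != 0) ++n;
  }
  return n;
}
```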