// SPDX-License-Identifier: MIT
 
#pragma once

#include <optional>

#include <torch/torch.h>

namespace aiter {

// Declaration only; the definition is not part of this header.
// `out` is the sole mutable argument; `input` and `scale` are taken by const
// reference and therefore cannot be modified through these parameters.
// The trailing shape notes are the original author's annotations.
// NOTE(review): presumably quantizes `input` into `out` using a precomputed
// single-element `scale` — confirm against the implementation.
void static_per_tensor_quant(torch::Tensor& out,          // [..., d]
                             torch::Tensor const& input,  // [..., d]
                             torch::Tensor const& scale); // [1]

// Declaration only; the definition is not part of this header.
// Unlike static_per_tensor_quant, `scale` is a non-const reference here, so
// the callee is allowed to write to it as well as to `out`; `input` is
// read-only.
// NOTE(review): presumably the scale is derived from `input` at call time and
// stored into `scale` — confirm against the implementation.
void dynamic_per_tensor_quant(torch::Tensor& out,         // [..., d]
                              torch::Tensor const& input, // [..., d]
                              torch::Tensor& scale);      // [1]

void dynamic_per_token_scaled_quant(torch::Tensor& out,         // [..., d]
                                    torch::Tensor const& input, // [..., d]
                                    torch::Tensor& scales,
                                    std::optional<at::Tensor> const& scale_ub,
                                    bool shuffle_scale                        = false,
                                    std::optional<at::Tensor> const& num_rows = std::nullopt,
                                    int num_rows_factor                       = 1);

void dynamic_per_group_scaled_quant_fp4(torch::Tensor& out,         // [..., d]
                                        torch::Tensor const& input, // [..., d]
                                        torch::Tensor& scales,
                                        int group_size                            = 32,
                                        bool shuffle_scale                        = true,
                                        std::optional<at::Tensor> const& num_rows = std::nullopt,
                                        int num_rows_factor                       = 1);

// Declaration only; the definition is not part of this header.
// `out` and `scales` are mutable references; `input` and `smooth_scale` are
// read-only. `smooth_scale_map` and `num_rows` are optional tensors that
// default to std::nullopt; shuffle_scale defaults to false and
// num_rows_factor to 1.
// NOTE(review): presumably `input` is multiplied by `smooth_scale` before a
// per-token quantization, with `smooth_scale_map` selecting which smooth
// scale applies to each row — none of this is visible here; confirm against
// the implementation.
void smooth_per_token_scaled_quant(
    torch::Tensor& out,         // [..., d]
    torch::Tensor const& input, // [..., d]
    torch::Tensor& scales,
    torch::Tensor const& smooth_scale,
    std::optional<torch::Tensor> const& smooth_scale_map = std::nullopt,
    bool shuffle_scale                                   = false,
    std::optional<torch::Tensor> const& num_rows         = std::nullopt,
    int num_rows_factor                                  = 1);

// Declaration only; the definition is not part of this header.
// `out` is the only mutable argument; `input` and `num_rows` are read-only.
// Unlike the quant entry points above, `num_rows` is a required tensor here
// rather than an optional one.
// NOTE(review): presumably only the first `num_rows` rows of `input` are
// transposed into `out`; the exact output layout is not visible here —
// confirm against the implementation.
void partial_transpose(torch::Tensor& out,         // [rows, d]
                       torch::Tensor const& input, // [rows, d]
                       torch::Tensor const& num_rows);

// Declaration only; the definition is not part of this header.
// Every tensor parameter is a non-const reference, so the signature alone
// does not tell inputs from outputs; `beta` is passed by value.
// NOTE(review): judging by the names, `scatter_tokens`, `smooth` and the two
// `experts_tokens_*` tensors look like inputs while `output` and `scales`
// look like results of a fused SwiGLU + dynamic quantization over
// mixture-of-experts tokens — confirm against the implementation, and
// consider const-qualifying the true inputs there.
void moe_swiglu_dynamic_quant(torch::Tensor& scatter_tokens,
                              torch::Tensor& smooth,
                              torch::Tensor& experts_tokens_count,
                              torch::Tensor& experts_tokens_start,
                              torch::Tensor& output,
                              torch::Tensor& scales,
                              float beta);

} // namespace aiter