quant.h 2.68 KB
Newer Older
Xiaowei.zhang's avatar
Xiaowei.zhang committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
// SPDX-License-Identifier: MIT
 
#pragma once

#include <torch/torch.h>

namespace aiter {

// Static per-tensor quantization entry point (declaration only).
//
// The definition is not visible in this header, so the notes below describe
// the signature; the semantic reading is inferred from the names and should be
// confirmed against the implementation.
//
//   out   - mutable tensor, annotated as shape [..., d]; presumably receives
//           the quantized result.
//   input - read-only tensor, annotated as shape [..., d].
//   scale - read-only tensor, annotated as shape [1]; presumably a scale
//           supplied by the caller (it is const here, unlike the dynamic
//           variant).
void static_per_tensor_quant(
    torch::Tensor& out,
    torch::Tensor const& input,
    torch::Tensor const& scale);

// Dynamic per-tensor quantization entry point (declaration only).
//
// The definition is not visible in this header; the semantic reading below is
// inferred from the names and should be confirmed against the implementation.
//
//   out   - mutable tensor, annotated as shape [..., d]; presumably receives
//           the quantized result.
//   input - read-only tensor, annotated as shape [..., d].
//   scale - mutable tensor, annotated as shape [1]; taken by non-const
//           reference, so presumably the computed scale is written here
//           (TODO confirm).
void dynamic_per_tensor_quant(
    torch::Tensor& out,
    torch::Tensor const& input,
    torch::Tensor& scale);

void dynamic_per_token_scaled_quant(torch::Tensor& out,         // [..., d]
                                    torch::Tensor const& input, // [..., d]
                                    torch::Tensor& scales,
                                    std::optional<at::Tensor> const& scale_ub,
                                    bool shuffle_scale                        = false,
                                    std::optional<at::Tensor> const& num_rows = std::nullopt,
                                    int num_rows_factor                       = 1);

void dynamic_per_group_scaled_quant_fp4(torch::Tensor& out,         // [..., d]
                                        torch::Tensor const& input, // [..., d]
                                        torch::Tensor& scales,
                                        int group_size                            = 32,
                                        bool shuffle_scale                        = true,
                                        std::optional<at::Tensor> const& num_rows = std::nullopt,
                                        int num_rows_factor                       = 1);

// Smooth per-token scaled quantization entry point (declaration only).
//
// The definition is not visible in this header; the semantic reading below is
// inferred from the names and should be confirmed against the implementation.
//
//   out              - mutable tensor, annotated as shape [..., d];
//                      presumably receives the quantized result.
//   input            - read-only tensor, annotated as shape [..., d].
//   scales           - mutable tensor; presumably receives the per-token
//                      scales (shape not stated here - TODO confirm).
//   smooth_scale     - read-only tensor; presumably the smoothing factors
//                      applied before quantization.
//   smooth_scale_map - optional tensor, defaults to std::nullopt.
//   shuffle_scale    - defaults to false; exact effect is defined by the
//                      implementation.
//   num_rows         - optional tensor, defaults to std::nullopt.
//   num_rows_factor  - defaults to 1; its relation to num_rows is defined by
//                      the implementation.
void smooth_per_token_scaled_quant(torch::Tensor& out,
                                   torch::Tensor const& input,
                                   torch::Tensor& scales,
                                   torch::Tensor const& smooth_scale,
                                   std::optional<torch::Tensor> const& smooth_scale_map = std::nullopt,
                                   bool shuffle_scale = false,
                                   std::optional<torch::Tensor> const& num_rows = std::nullopt,
                                   int num_rows_factor = 1);

// Partial transpose entry point (declaration only).
//
// The definition is not visible in this header; what "partial" means here is
// defined by the implementation and should be confirmed there.
//
//   out      - mutable tensor, annotated as shape [rows, d]; presumably
//              receives the result.
//   input    - read-only tensor, annotated as shape [rows, d].
//   num_rows - read-only tensor; presumably carries the number of rows to
//              process (TODO confirm). Unlike the quant functions above, it
//              is required, not optional.
void partial_transpose(
    torch::Tensor& out,
    torch::Tensor const& input,
    torch::Tensor const& num_rows);

47
48
49
50
51
52
53
54
// MoE SwiGLU + dynamic quantization entry point (declaration only).
//
// The definition is not visible in this header. Every tensor is taken by
// non-const reference, so the signature alone does not tell which arguments
// are inputs and which are outputs; the roles below are inferred from the
// names and should be confirmed against the implementation.
//
//   scatter_tokens       - presumably the tokens already scattered to experts.
//   smooth               - presumably smoothing factors.
//   experts_tokens_count - presumably the per-expert token counts.
//   experts_tokens_start - presumably the per-expert start offsets.
//   output               - presumably receives the quantized result.
//   scales               - presumably receives the computed scales.
//   beta                 - scalar float; its role is defined by the
//                          implementation.
void moe_swiglu_dynamic_quant(
    torch::Tensor& scatter_tokens,
    torch::Tensor& smooth,
    torch::Tensor& experts_tokens_count,
    torch::Tensor& experts_tokens_start,
    torch::Tensor& output,
    torch::Tensor& scales,
    float beta);

Xiaowei.zhang's avatar
Xiaowei.zhang committed
55
} // namespace aiter