"platforms/cuda/src/kernels/gbsaObc1.cu" did not exist on "bcf953865fe100a237c3ff68300f2f03a8deda16"
asm_flatmm_a8w8_blockscale.h 351 Bytes
Newer Older
Xiaowei.zhang's avatar
Xiaowei.zhang committed
1
2
3
4
5
6
7
8
9
10
11
12
#pragma once
// SPDX-License-Identifier: MIT
 
#include <torch/extension.h>

// Declaration of the flatmm_a8w8_blockscale_asm entry point. Only the
// prototype lives in this header; the definition is provided elsewhere.
//
// Parameters (all taken by non-const lvalue reference, so callers must pass
// named tensors rather than temporaries). The shapes below restate the
// per-parameter annotations in this declaration:
//   XQ       [M, K]
//   WQ       [N, K], annotated as laid out as [N/128, K*128]
//   x_scale  [K/128, M]
//   w_scale  [K/128, N/128]
//   out      [M, N], annotated as the fp16 output
//
// Returns a torch::Tensor by value.
//
// NOTE(review): the name suggests 8-bit activations/weights with block-wise
// scales and an assembly-backed kernel, and the 128 in the shape annotations
// looks like the scale block size -- confirm against the implementation.
// NOTE(review): the returned tensor is presumably `out`; nothing in this
// header establishes that -- confirm.
// NOTE(review): dtypes of XQ/WQ/x_scale/w_scale, divisibility requirements on
// M/N/K, and contiguity/device preconditions are not stated here -- confirm
// before relying on them.
torch::Tensor flatmm_a8w8_blockscale_asm(
    torch::Tensor &XQ,      // [M, K]
    torch::Tensor &WQ,      // [N, K] -> [N/128, K*128]
    torch::Tensor &x_scale, // [K/128, M]
    torch::Tensor &w_scale, // [K/128, N/128]
    torch::Tensor &out      // Out:[M, N] fp16
);