// gemm_w8a8.cu

#include "common.h"
#include "Tensor.h"

#include <cutlass/core_io.h>
#include <cutlass/cutlass.h>
#include <cutlass/half.h>

#include <cutlass/gemm/device/gemm.h>
#include <cutlass/numeric_types.h>

using spdlog::fmt_lib::format;

// Computes out = alpha * (input @ weight^T) + beta * out with INT8 inputs,
// INT32 accumulation, and an FP16 epilogue/output.
Tensor gemm_w8a8_fp16(Tensor input,  // INT8, shape [..., K]
                      Tensor weight, // INT8, shape [N, K]
                      Tensor out,    // FP16, shape [..., N]; allocated if invalid
                      half alpha,    // FP16 epilogue scale for the accumulator
                      half beta      // FP16 epilogue scale for the existing out
) {
    auto N = weight.size(0);
    auto K = input.size(-1);
    // flatten all leading dimensions of input into M: input is treated as [M, K]
    auto M = input.numel() / K;
    assert(weight.size(1) == K);

    spdlog::debug("gemm_w8a8: M={} K={} N={}", M, K, N);

    using ElementOutput = cutlass::half_t;
    using ElementAccumulator = int32_t;
    using ElementComputeEpilogue = cutlass::half_t;
    using ElementInputA = int8_t; // <- data type of elements in input matrix A
    using ElementInputB = int8_t; // <- data type of elements in input matrix B

    using LayoutInputA = cutlass::layout::RowMajor;
    using LayoutInputB = cutlass::layout::ColumnMajor;
    using LayoutOutput = cutlass::layout::RowMajor;

// #if CUDA_ARCH >= 800
    // SM80 INT8 tensor-core GEMM: 128x128x64 threadblock tile, 32x64x64 warp
    // tile, 16x8x32 MMA instruction shape, 3 pipeline stages, and an FP16
    // LinearCombination epilogue computing D = alpha * acc + beta * C.
    using Gemm = cutlass::gemm::device::Gemm<
        int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor,
        ElementOutput, cutlass::layout::RowMajor, ElementAccumulator,
        cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
        cutlass::gemm::GemmShape<128, 128, 64>,
        cutlass::gemm::GemmShape<32, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>,
        cutlass::epilogue::thread::LinearCombination<
            ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
            ElementAccumulator, ElementComputeEpilogue>,
        cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
// #elif CUDA_ARCH >= 750
//     using DefaultGemmCfg = cutlass::gemm::device::DefaultGemmConfiguration<
//         cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
//         ElementInputA, ElementInputB, ElementOutput, ElementAccumulator>;
//     using Gemm = cutlass::gemm::device::Gemm<
//         int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor,
//         ElementOutput, cutlass::layout::RowMajor, ElementAccumulator,
//         cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
//         DefaultGemmCfg::ThreadblockShape, DefaultGemmCfg::WarpShape,
//         DefaultGemmCfg::InstructionShape,
//         cutlass::epilogue::thread::LinearCombination<
//             ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
//             ElementAccumulator, ElementComputeEpilogue>>;
// #elif CUDA_ARCH >= 700
//     using DefaultGemmCfg = cutlass::gemm::device::DefaultGemmConfiguration<
//         cutlass::arch::OpClassSimt, cutlass::arch::Sm70,
//         ElementInputA, ElementInputB, ElementOutput, ElementAccumulator>;
//     using Gemm = cutlass::gemm::device::Gemm<
//         int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor,
//         ElementOutput, cutlass::layout::RowMajor, ElementAccumulator,
//         cutlass::arch::OpClassSimt, cutlass::arch::Sm70,
//         DefaultGemmCfg::ThreadblockShape, DefaultGemmCfg::WarpShape,
//         DefaultGemmCfg::InstructionShape,
//         cutlass::epilogue::thread::LinearCombination<
//             ElementOutput, 1, ElementAccumulator, ElementComputeEpilogue>>;
// #else
// #error "Unsupported cuda arch"
// #endif
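
    // Only the SM80 path above is active; the commented-out #elif branches
    // keep the equivalent SM75/SM70 configurations for reference.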

    auto input_size = cutlass::MatrixCoord(M, K);
    auto weight_size = cutlass::MatrixCoord(K, N);
    auto output_size = cutlass::MatrixCoord(M, N);

    auto device = input.device();
    // use the broadcasted bias as the output
    // auto out = bias.to(device).view({1, -1}).repeat({M, 1});

    if (!out.valid()) {
        auto out_shape = TensorShape(input.shape.dataExtent);
        out_shape[-1] = N;
        out = Tensor::empty(out_shape, Tensor::FP16, input.device());
    }

    // FIXME: check that input is contiguous when it has >= 3 dims
    assert(input.stride(-1) == 1);
    // assert(input.is_contiguous());
    assert(weight.is_contiguous());

    assert(out.dtype() == Tensor::FP16);
    assert(out.shape[-1] == N);
    assert(out.numel() / out.shape[-1] == M);
    assert(out.stride(-1) == 1);
    // FIXME: check that output is contiguous when it has >= 3 dims

    // constexpr int kSparse = Gemm::kSparse;
    // How many elements of A are covered per ElementE
    // constexpr int kElementsPerElementE = Gemm::kElementsPerElementE;
    // The size of individual meta data
    // constexpr int kMetaSizeInBits = Gemm::kMetaSizeInBits;
    cutlass::gemm::GemmCoord problem_size(M, N, K);

    cutlass::TensorRef<ElementInputA, LayoutInputA> input_ref(
        input.data_ptr<ElementInputA>(), LayoutInputA(input.stride(-2)));
    // weight is stored row-major as [N, K]; viewing the same buffer as a
    // packed column-major K x N matrix yields B = weight^T without an
    // explicit transpose
    cutlass::TensorRef<ElementInputB, LayoutInputB> weight_ref(
        weight.data_ptr<ElementInputB>(), LayoutInputB::packed(weight_size));
    cutlass::TensorRef<ElementOutput, LayoutOutput> out_ref(
        out.data_ptr<ElementOutput>(), LayoutOutput(out.stride(-2)));

    typename Gemm::Arguments arguments{
        problem_size, // <- problem size of matrix multiplication
        input_ref,    // <- reference to matrix A on device
        weight_ref,   // <- reference to matrix B on device
        out_ref,      // <- reference to matrix C on device
        out_ref,      // <- reference to matrix D on device
        {ElementOutput(alpha), ElementOutput(beta)}, // <- epilogue scalars: D = alpha * acc + beta * C
        1};           // <- number of split-K slices
    Gemm gemm_op;

    // Using the arguments, query for extra workspace required for matrix
    // multiplication computation
    size_t workspace_size = Gemm::get_workspace_size(arguments);

    // Allocate workspace memory
    // cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);

    BufferCUDA workspace(workspace_size);

    // Check whether CUTLASS can implement this problem size
    cutlass::Status status = gemm_op.can_implement(arguments);
    if (status != cutlass::Status::kSuccess) {
        throw std::runtime_error(format("cutlass cannot implement M={} N={} K={}", M, N, K));
    }

    // Initialize CUTLASS kernel with arguments and workspace pointer
    status = gemm_op.initialize(arguments, workspace.getPtr());
    if (status != cutlass::Status::kSuccess) {
        throw std::runtime_error("cutlass cannot initialize");
    }

    status = gemm_op();
    if (status != cutlass::Status::kSuccess) {
        throw std::runtime_error("cutlass cannot run");
    }

    return out;
}
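
// Usage sketch (illustrative; Tensor::INT8 and passing a default-constructed
// Tensor for `out` are assumptions about this codebase's Tensor API):
//
//   Tensor input  = Tensor::empty({M, K}, Tensor::INT8, device); // quantized activations
//   Tensor weight = Tensor::empty({N, K}, Tensor::INT8, device); // quantized weights
//   // an invalid `out` makes the function allocate an FP16 [M, N] result;
//   // beta = 0 ignores any existing contents of `out`
//   Tensor out = gemm_w8a8_fp16(input, weight, Tensor{}, half(1.0f), half(0.0f));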