Linear.h

#pragma once

#include "common.h"
#include "Tensor.h"
#include "Module.h"

class GEMM_F16 : public Module {
public:
    GEMM_F16(int in_features, int out_features, bool use_bias, Tensor::ScalarType dtype, Device device);

    Tensor forward(Tensor x);

public:
    const int in_features;
    const int out_features;

public:
    Tensor weight;
    Tensor bias;
};

class GEMV_AWQ : public Module {
public:
    GEMV_AWQ(int in_features, int out_features, bool use_bias, Tensor::ScalarType dtype, Device device);

    Tensor forward(Tensor x);

protected:
    virtual void loadParam(std::string key, Tensor &dst, Tensor src) override;

public:
    const int in_features;
    const int out_features;
    const int group_size;

    int lora_rank;
    float lora_scale;

    const Device device;
public:
    Tensor qweight;
    Tensor wscales;
    Tensor wzeros;
    Tensor bias;

    Tensor lora_down;
    Tensor lora_up;

    // std::shared_ptr<CUBLASWrapper> cublas;
};

class GEMM_W4A4 : public Module {
public:
    enum class FuseOptions {
        EMPTY = 0,
        GELU_QUANT,
        SILU,
    };
    struct QuantizedActivation {
        Tensor act;
        Tensor ascales;
        Tensor lora_act;
        bool is_unsigned = false;
        TensorShape actShape;
    };

public:
    GEMM_W4A4(int in_features, int out_features, bool bias, bool use_fp4, Tensor::ScalarType dtype, Device device);
    Tensor forward(Tensor x);
    Tensor forward_silu(Tensor x);
    std::variant<Tensor, QuantizedActivation> forward(Tensor x, FuseOptions fuse, GEMM_W4A4 *nextGEMM = nullptr);
    void forward(
        Tensor x, Tensor out, 
        Tensor pool = {}, Tensor norm_q = {}, Tensor norm_k = {}, Tensor rotary_emb = {}, 
        Tensor out_q = {}, Tensor out_k = {}, Tensor out_v = {}, int numTokens = 0
    );
    std::variant<Tensor, QuantizedActivation> forward_quant(QuantizedActivation qact, FuseOptions fuse, GEMM_W4A4 *nextGEMM = nullptr);
    Tensor forward_quant(QuantizedActivation qact);

public:
    QuantizedActivation quantize(Tensor x, bool fuse_glu);

public:
    const int in_features;
    const int out_features;
    const int in_features_pad;
    const int out_features_pad;
    const bool use_fp4;
    
    int lora_rank;
    std::vector<float> lora_scales; // every 16 ranks share a scale

    const Tensor::ScalarType dtype;
    const Device device;

protected:
    virtual void loadParam(std::string key, Tensor &dst, Tensor src) override;

public:
    Tensor qweight;
    Tensor wscales;
    Tensor bias;

    Tensor lora_down;
    Tensor lora_up;

    Tensor smooth;

    Tensor wtscale;
    Tensor wcscales;

    cublasHandle_t handle;
};

class GEMM_W8A8 : public Module {
public:
    struct QuantizedActivation {
        Tensor act;
        Tensor ascales;
    };
public:
    GEMM_W8A8(int in_features, int out_features, bool bias, Tensor::ScalarType dtype, Device device);

public:
    QuantizedActivation quantize(Tensor x, bool fuse_glu); 
    Tensor forward_quant(QuantizedActivation qact);
    Tensor forward(Tensor x) { return forward_quant(quantize(x, false)); }

public:
    const int in_features;
    const int out_features;
    const Tensor::ScalarType dtype;

public:
    Tensor qweight;
    Tensor wscales;
    Tensor bias;
};

class DWCONV : public Module {
public:
    DWCONV(int in_features, bool bias, Tensor::ScalarType dtype, Device device);

    Tensor forward(Tensor x);

public:
    const int in_features;

public:
    Tensor weight;
    Tensor bias;
};