Unverified Commit eb89439d authored by qinyiqun, committed by GitHub

Support Quantization (#996)



demo131 - multiple quantization-related changes (per-channel int8, QY-GPU support, and so forth)

* issue/843: success per_channel_quant_int8

* issue/843: success qy quant

* issue/843: modified quant

* Add w8a8int8 performance tests

* add infinicore op linear_w8a8i8

* w8a8 linear module functional nn

* issue/843: QY-GPU Support Int8 scale_mm (#68)

* issue/843: success qy scaled_mm

* issue/843: modified kernel.cuh as per_channel_dequant_int8.cuh

* fix parallel slicing in w8

* w8: support multiple batch size

* temp: rework quant config handling

* fix format and delete redundancy code

* fix format

* fix format

* fix format

* Refactor: add new API alongside legacy interfaces with deprecation warnings

* Add W4-related InfiniCore content, and move the quantization config into InfiniCore

* Quantized operators support the op graph

* solve cub version problem and fix code structure

* fix format

* demo131 - remove commented lines

---------
Co-authored-by: xgqdut2016 <kenan_gewei@163.com>
Co-authored-by: xgqdut2016 <140036308+xgqdut2016@users.noreply.github.com>
Co-authored-by: wooway777 <wooway777@gmail.com>
parent abab5652
[submodule "third_party/spdlog"]
path = third_party/spdlog
url = https://github.com/gabime/spdlog.git
[submodule "third_party/nlohmann_json"]
path = third_party/nlohmann_json
url = https://github.com/nlohmann/json.git
branch = master
......@@ -3,4 +3,5 @@
#include "infinicore/device_event.hpp"
#include "infinicore/nn.hpp"
#include "infinicore/ops.hpp"
#include "infinicore/quantization.hpp"
#include "infinicore/tensor.hpp"
#pragma once
#include "../ops.hpp"
#include "../quantization.hpp"
#include "module.hpp"
#include <infiniccl.h>
#include <optional>
namespace infinicore::nn {
......@@ -11,6 +13,9 @@ public:
BaseLinear(size_t in_features, size_t out_features, bool bias = true,
const DataType &dtype = DataType::F32, const Device &device = Device());
BaseLinear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias = true,
const DataType &dtype = DataType::F32, const Device &device = Device());
// Forward pass: output = input @ weight.T + bias
Tensor forward(Tensor &input) const;
......@@ -27,12 +32,17 @@ public:
// Accessors for parameters
Tensor weight() const { return weight_; }
Tensor bias() const { return bias_; }
Tensor weight_scale() const { return weight_scale_; }
Tensor weight_zeros() const { return weight_zeros_; }
protected:
// Parameters
INFINICORE_NN_PARAMETER(weight);
INFINICORE_NN_PARAMETER(bias);
INFINICORE_NN_PARAMETER(weight_scale);
INFINICORE_NN_PARAMETER(weight_zeros);
protected:
// Helper method for common forward computation
Tensor compute_linear(Tensor &input) const;
......@@ -41,6 +51,7 @@ protected:
size_t out_features_;
bool has_bias_;
DataType dtype_;
std::shared_ptr<infinicore::quantization::BaseQuantization> quantization_ = std::make_shared<infinicore::quantization::NoneQuantization>(nullptr);
};
} // namespace infinicore::nn
......@@ -52,6 +63,9 @@ public:
Linear(size_t in_features, size_t out_features, bool bias = true,
const DataType &dtype = DataType::F32, const Device &device = Device());
Linear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias = true,
const DataType &dtype = DataType::F32, const Device &device = Device());
// Forward pass: output = input @ weight.T + bias
Tensor forward(Tensor &input) const;
......@@ -65,6 +79,10 @@ public:
const DataType &dtype = DataType::F32, const Device &device = Device(),
Size tp_rank = 0, Size tp_size = 1);
ColumnParallelLinear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias = true,
const DataType &dtype = DataType::F32, const Device &device = Device(),
Size tp_rank = 0, Size tp_size = 1);
// Forward pass: output = input @ weight.T + bias
Tensor forward(Tensor &input) const;
......@@ -82,6 +100,10 @@ public:
const DataType &dtype = DataType::F32, const Device &device = Device(),
Size tp_rank = 0, Size tp_size = 1, infinicclComm_t communicator = nullptr);
RowParallelLinear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias = true,
const DataType &dtype = DataType::F32, const Device &device = Device(),
Size tp_rank = 0, Size tp_size = 1, infinicclComm_t communicator = nullptr);
// Forward pass: output = input @ weight.T + bias
Tensor forward(Tensor &input) const;
......
#pragma once
#include "../device.hpp"
#include "common/op.hpp"
#include <optional>
namespace infinicore::op {
INFINICORE_GRAPH_OP_CLASS(DequantizeAWQ, Tensor, const Tensor &, const Tensor &, const Tensor &);
void dequantize_awq_(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zeros);
} // namespace infinicore::op
#pragma once
#include "common/op.hpp"
#include <optional>
namespace infinicore::op {
Tensor linear_w4a16_awq(Tensor input, Tensor weight_packed, Tensor weight_scale, Tensor weight_zeros, std::optional<Tensor> bias);
void linear_w4a16_awq_(Tensor out, Tensor input, Tensor weight_packed, Tensor weight_scale, Tensor weight_zeros, std::optional<Tensor> bias);
} // namespace infinicore::op
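A minimal sketch, not part of this diff: on a backend without a fused W4A16 kernel, the two entry points above could in principle be composed by first materializing the weight with dequantize_awq_ and then reusing the ordinary linear op. The function name and the caller-provided scratch buffer are hypothetical.
// Hypothetical fallback, for illustration only. `weight` is a caller-provided
// scratch tensor of shape {out_features, in_features} in the activation dtype.
namespace infinicore::op {
inline Tensor linear_w4a16_awq_fallback(Tensor input, Tensor weight,
                                        Tensor weight_packed, Tensor weight_scale,
                                        Tensor weight_zeros, std::optional<Tensor> bias) {
    dequantize_awq_(weight, weight_packed, weight_scale, weight_zeros); // unpack int4 -> fp
    return linear(input, weight, bias);                                 // standard GEMM path
}
} // namespace infinicore::op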
#pragma once
#include "../graph/graph.hpp"
#include "common/op.hpp"
#include <optional>
namespace infinicore::op {
Tensor linear_w8a8i8(Tensor input, Tensor weight_packed, Tensor weight_scale, std::optional<Tensor> bias);
void linear_w8a8i8_(Tensor out, Tensor input, Tensor weight_packed, Tensor weight_scale, std::optional<Tensor> bias);
} // namespace infinicore::op
#pragma once
#include "../device.hpp"
#include "../graph/graph.hpp"
#include "common/op.hpp"
#include <optional>
namespace infinicore::op {
INFINICORE_GRAPH_OP_CLASS(PerChannelQuantI8, const Tensor &, Tensor, Tensor);
void per_channel_quant_i8_(const Tensor &x, Tensor x_packed, Tensor x_scale);
} // namespace infinicore::op
#pragma once
#include "../device.hpp"
#include "../graph/graph.hpp"
#include "common/op.hpp"
#include <optional>
namespace infinicore::op {
INFINICORE_GRAPH_OP_CLASS(I8Gemm, Tensor, const Tensor &, const Tensor &, const Tensor &, const Tensor &, std::optional<Tensor>);
void scaled_mm_i8_(Tensor c, const Tensor &a_p, const Tensor &a_s, const Tensor &b_p, const Tensor &b_s, std::optional<Tensor> bias);
} // namespace infinicore::op
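A sketch, not part of this diff, of one plausible way linear_w8a8i8 composes the two primitives above: dynamic per-row int8 quantization of the activation followed by the scale-aware int8 GEMM. The real kernel may fuse these steps; the helper name and the caller-provided buffers are hypothetical.
// Hypothetical composition using caller-provided buffers so no allocation API is assumed:
//   a_packed: int8, same shape as input; a_scale: f32, one scale per activation row;
//   out: activation dtype, shape {rows, out_features}.
namespace infinicore::op {
inline void linear_w8a8i8_sketch(Tensor out, Tensor input,
                                 Tensor a_packed, Tensor a_scale,
                                 Tensor weight_packed, Tensor weight_scale,
                                 std::optional<Tensor> bias) {
    per_channel_quant_i8_(input, a_packed, a_scale);                          // quantize activations
    scaled_mm_i8_(out, a_packed, a_scale, weight_packed, weight_scale, bias); // int8 GEMM + dequant
}
} // namespace infinicore::op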
#pragma once
#include "quantization/awq.hpp"
#include "quantization/base_quantization.hpp"
#include "quantization/compressed_tensors.hpp"
#include "quantization/none_quantizaiton.hpp"
#include "quantization/quantization_scheme.hpp"
#pragma once
#include "base_quantization.hpp"
namespace infinicore::quantization {
class AWQ : public BaseQuantization {
// This is a temporary class that currently only returns AWQ_W4A16.
// Future enhancements should parse quant_config to extract detailed quantization
// information and support multiple quantization schemes.
public:
explicit AWQ(const nlohmann::json &quant_config)
: BaseQuantization(quant_config) {};
infinicore::quantization::QuantScheme
get_quant_scheme() const override {
return infinicore::quantization::QuantScheme::AWQ_W4A16;
};
};
} // namespace infinicore::quantization
#pragma once
#include "nlohmann/json.hpp"
#include "quantization_scheme.hpp"
namespace infinicore::quantization {
class BaseQuantization {
// Base class for quantization schemes. Intended to be extended to support various quantization methods.
public:
explicit BaseQuantization(const nlohmann::json &quant_config) : quant_config_(quant_config) {};
virtual ~BaseQuantization() = default;
virtual infinicore::quantization::QuantScheme get_quant_scheme() const = 0;
protected:
nlohmann::json quant_config_;
};
} // namespace infinicore::quantization
#pragma once
#include "base_quantization.hpp"
namespace infinicore::quantization {
class CompressedTensors : public BaseQuantization {
// This is a temporary class that currently only returns COMPRESSED_TENSOR_W8A8I8.
// Future enhancements should parse quant_config to extract detailed quantization
// information and support multiple quantization schemes.
public:
explicit CompressedTensors(const nlohmann::json &quant_config)
: BaseQuantization(quant_config) {};
infinicore::quantization::QuantScheme
get_quant_scheme() const override {
return infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8;
};
};
} // namespace infinicore::quantization
#pragma once
#include "base_quantization.hpp"
namespace infinicore::quantization {
class NoneQuantization : public BaseQuantization {
// This is a temporary class that currently only returns NONE.
// Future enhancements should parse quant_config to extract detailed quantization
// information and support multiple quantization schemes.
public:
explicit NoneQuantization(const nlohmann::json &quant_config)
: BaseQuantization(quant_config) {};
infinicore::quantization::QuantScheme
get_quant_scheme() const override {
return infinicore::quantization::QuantScheme::NONE;
};
};
} // namespace infinicore::quantization
// quantization_scheme.hpp
#pragma once
namespace infinicore::quantization {
enum class QuantScheme {
NONE,
COMPRESSED_TENSOR_W8A8I8,
AWQ_W4A16,
};
} // namespace infinicore::quantization
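A sketch, not part of this diff, of how a model loader might map a checkpoint's quant_config onto the classes above; the "quant_method" field name follows the common Hugging Face convention and, like the factory name, is an assumption here.
// Hypothetical factory, for illustration only.
namespace infinicore::quantization {
inline std::shared_ptr<BaseQuantization> make_quantization(const nlohmann::json &quant_config) {
    const std::string method = quant_config.value("quant_method", "");
    if (method == "awq") {
        return std::make_shared<AWQ>(quant_config);               // QuantScheme::AWQ_W4A16
    }
    if (method == "compressed-tensors") {
        return std::make_shared<CompressedTensors>(quant_config); // QuantScheme::COMPRESSED_TENSOR_W8A8I8
    }
    return std::make_shared<NoneQuantization>(nullptr);           // QuantScheme::NONE
}
} // namespace infinicore::quantization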
......@@ -13,6 +13,7 @@
#include "infiniop/ops/flash_attention.h"
#include "infiniop/ops/gelu.h"
#include "infiniop/ops/gemm.h"
#include "infiniop/ops/int8_gemm.h"
#include "infiniop/ops/kv_caching.h"
#include "infiniop/ops/layer_norm.h"
#include "infiniop/ops/logsoftmax.h"
......@@ -22,6 +23,7 @@
#include "infiniop/ops/paged_attention.h"
#include "infiniop/ops/paged_attention_prefill.h"
#include "infiniop/ops/paged_caching.h"
#include "infiniop/ops/quant/per_channel_quant_int8.h"
#include "infiniop/ops/random_sample.h"
#include "infiniop/ops/rearrange.h"
#include "infiniop/ops/relu.h"
......
#ifndef __INFINIOP_PER_CHANNEL_QUANT_INT8_API_H__
#define __INFINIOP_PER_CHANNEL_QUANT_INT8_API_H__
#include "../../operator_descriptor.h"
typedef InfiniopDescriptor *infiniopPerChannelQuantI8Descriptor_t;
__C __export infiniStatus_t infiniopCreatePerChannelQuantI8Descriptor(infiniopHandle_t handle,
infiniopPerChannelQuantI8Descriptor_t *desc_ptr,
infiniopTensorDescriptor_t x_packed_desc,
infiniopTensorDescriptor_t x_scale_desc,
infiniopTensorDescriptor_t x_zero_desc,
infiniopTensorDescriptor_t x_desc);
__C __export infiniStatus_t infiniopGetPerChannelQuantI8WorkspaceSize(infiniopPerChannelQuantI8Descriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopPerChannelQuantI8(infiniopPerChannelQuantI8Descriptor_t desc,
void *workspace,
size_t workspace_size,
void *x_packed,
void *x_scale,
void *x_zero,
const void *x,
void *stream);
__C __export infiniStatus_t infiniopDestroyPerChannelQuantI8Descriptor(infiniopPerChannelQuantI8Descriptor_t desc);
#endif
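A sketch of the usual infiniop descriptor lifecycle for the new operator, assuming the handle, stream, tensor descriptors, and device buffers were created elsewhere; CHECK and the workspace allocator are placeholders.
// Hypothetical call sequence; CHECK() aborts on a non-success infiniStatus_t.
infiniopPerChannelQuantI8Descriptor_t desc = nullptr;
CHECK(infiniopCreatePerChannelQuantI8Descriptor(handle, &desc,
                                                x_packed_desc, x_scale_desc, x_zero_desc, x_desc));
size_t workspace_size = 0;
CHECK(infiniopGetPerChannelQuantI8WorkspaceSize(desc, &workspace_size));
void *workspace = device_malloc(workspace_size); // placeholder for your runtime's allocator
CHECK(infiniopPerChannelQuantI8(desc, workspace, workspace_size,
                                x_packed, x_scale, x_zero, x, stream));
device_free(workspace);
CHECK(infiniopDestroyPerChannelQuantI8Descriptor(desc));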
......@@ -2,6 +2,7 @@ from .causal_softmax import causal_softmax
from .embedding import embedding
from .flash_attention import flash_attention
from .linear import linear
from .linear_w8a8i8 import linear_w8a8i8
from .random_sample import random_sample
from .rms_norm import rms_norm
from .rope import RopeAlgo, rope
......@@ -19,4 +20,5 @@ __all__ = [
"rope",
"silu",
"swiglu",
"linear_w8a8i8",
]
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor
def linear_w8a8i8(
input: Tensor,
weight_packed: Tensor,
weight_scale: Tensor,
bias=None,
out=None,
) -> Tensor:
r"""Linear layer with weight quantized to int8 and input quantized to int8 with per-tensor scale."""
if out is None:
return Tensor(
_infinicore.linear_w8a8i8(
input._underlying,
weight_packed._underlying,
weight_scale._underlying,
None if bias is None else bias._underlying,
)
)
_infinicore.linear_w8a8i8_(
out._underlying,
input._underlying,
weight_packed._underlying,
weight_scale._underlying,
None if bias is None else bias._underlying,
)
return out
......@@ -3,6 +3,7 @@
#include "infinicore/ops.hpp"
#include "infinicore/ops/distributed/allreduce.hpp"
#include "infinicore/ops/linear.hpp"
#include "infinicore/ops/linear_w8a8i8.hpp"
#include <optional>
#include <spdlog/spdlog.h>
......@@ -18,21 +19,46 @@ BaseLinear::BaseLinear(size_t in_features, size_t out_features, bool bias,
device_ = device;
}
BaseLinear::BaseLinear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias,
                       const DataType &dtype, const Device &device)
    : in_features_(in_features),
      out_features_(out_features),
      quantization_(quantization),
      has_bias_(bias),
      dtype_(dtype) {
    device_ = device;
}
Tensor BaseLinear::compute_linear(Tensor &input) const {
    switch (this->quantization_->get_quant_scheme()) {
    case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: {
        Tensor input_contiguous = input->is_contiguous() ? input : input->contiguous();
        Tensor weight_packed_tensor = static_cast<const Tensor &>(weight_);
        Tensor weight_scale_tensor = static_cast<const Tensor &>(weight_scale_);
        // weight_packed should be transposed and non-contiguous.
        std::optional<Tensor> bias_opt = has_bias_ ? std::make_optional<Tensor>(static_cast<const Tensor &>(bias_)) : std::nullopt;
        auto output = infinicore::op::linear_w8a8i8(input_contiguous->contiguous(), weight_packed_tensor, weight_scale_tensor, bias_opt);
        return output;
    }
    default: {
        // Ensure input is contiguous before creating views (required for matmul)
        // This prevents hanging when input tensor has non-contiguous memory layout
        Tensor input_contiguous = input->is_contiguous() ? input : input->contiguous();
        // Use ops::linear_ directly to match Python backend's exact code path
        // This ensures identical computation and numerical results
        // Parameter inherits from Tensor, so we cast to Tensor explicitly
        Tensor weight_tensor = static_cast<const Tensor &>(weight_);
        std::optional<Tensor> bias_opt = has_bias_ ? std::make_optional<Tensor>(static_cast<const Tensor &>(bias_)) : std::nullopt;
        auto output = infinicore::op::linear(input_contiguous->contiguous(), weight_tensor->contiguous(), bias_opt);
        return output;
    }
    }
}
Tensor BaseLinear::forward(Tensor &input) const {
return compute_linear(input);
......@@ -71,6 +97,43 @@ Linear::Linear(size_t in_features, size_t out_features, bool bias,
// in_features, out_features, bias, static_cast<int>(dtype_));
}
Linear::Linear(size_t in_features, size_t out_features,
std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias,
const DataType &dtype, const Device &device)
    : BaseLinear(in_features, out_features, quantization, bias, dtype, device) {
device_ = device;
switch (this->quantization_->get_quant_scheme()) {
case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: {
INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, infinicore::DataType::I8, device));
INFINICORE_NN_PARAMETER_INIT(weight_scale, ({out_features, 1}, infinicore::DataType::F32, device));
if (bias) {
INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device));
} else {
bias_ = Parameter();
}
break;
}
default: {
// Initialize parameters using macro
INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device));
// Register bias parameter if requested
if (bias) {
INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device));
} else {
bias_ = Parameter(); // Default constructed empty parameter
}
// SPDLOG_DEBUG("Created Linear module: in_features={}, out_features={}, bias={}, dtype={}",
// in_features, out_features, bias, static_cast<int>(dtype_));
break;
}
}
}
Tensor Linear::forward(Tensor &input) const {
return BaseLinear::forward(input);
}
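A sketch, not part of this diff, of how the quantization-aware constructor might be used from application code; the sizes, config variable, and device are placeholders, and loading of the int8 weight and f32 weight_scale parameters is elided.
// Hypothetical usage of the new constructor.
auto quant = std::make_shared<infinicore::quantization::CompressedTensors>(quant_config);
infinicore::nn::Linear proj(/*in_features=*/4096, /*out_features=*/4096, quant,
                            /*bias=*/false, infinicore::DataType::F32, device);
// ... load the int8 `weight` and f32 `weight_scale` parameters from the checkpoint ...
Tensor y = proj.forward(x); // dispatches to op::linear_w8a8i8 via BaseLinear::compute_linear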
......@@ -105,6 +168,45 @@ ColumnParallelLinear::ColumnParallelLinear(size_t in_features, size_t out_featur
}
}
ColumnParallelLinear::ColumnParallelLinear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias,
const DataType &dtype, const Device &device,
Size tp_rank, Size tp_size)
    : BaseLinear(in_features, out_features, quantization, bias, dtype, device),
tp_rank_(tp_rank),
tp_size_(tp_size) {
device_ = device;
switch (this->quantization_->get_quant_scheme()) {
case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: {
INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, infinicore::DataType::I8, device, 0, tp_rank_, tp_size_));
INFINICORE_NN_PARAMETER_INIT(weight_scale, ({out_features, 1}, infinicore::DataType::F32, device, 0, tp_rank_, tp_size_));
if (bias) {
INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device, 0, 0, 1));
} else {
bias_ = Parameter();
}
break;
}
default: {
// Initialize parameters using macro
INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device,
0, tp_rank_, tp_size_));
// Register bias parameter if requested
if (bias) {
INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device,
0, tp_rank_, tp_size_));
} else {
bias_ = Parameter(); // Default constructed empty parameter
}
break;
}
}
}
Tensor ColumnParallelLinear::forward(Tensor &input) const {
return BaseLinear::forward(input);
}
......@@ -138,6 +240,46 @@ RowParallelLinear::RowParallelLinear(size_t in_features, size_t out_features, bo
}
}
RowParallelLinear::RowParallelLinear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias,
const DataType &dtype, const Device &device,
Size tp_rank, Size tp_size, infinicclComm_t communicator)
    : BaseLinear(in_features, out_features, quantization, bias, dtype, device),
tp_rank_(tp_rank),
tp_size_(tp_size), communicator_(communicator) {
device_ = device;
switch (this->quantization_->get_quant_scheme()) {
case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: {
INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, infinicore::DataType::I8, device, 1, tp_rank_, tp_size_));
INFINICORE_NN_PARAMETER_INIT(weight_scale, ({out_features, 1}, infinicore::DataType::F32, device, 0, 0, 1));
if (bias) {
INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device, 0, tp_rank_, tp_size_));
} else {
bias_ = Parameter();
}
break;
}
default: {
// Initialize parameters using macro
INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device,
1, tp_rank_, tp_size_));
// Register bias parameter if requested
if (bias && (0 == tp_rank_)) {
INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device, 0, 0, 1));
} else {
bias_ = Parameter(); // Default constructed empty parameter
}
// SPDLOG_DEBUG("Created RowParallelLinear module: in_features={}, out_features={}, bias={}, dtype={}",
// in_features, out_features, bias, static_cast<int>(dtype_));
break;
}
}
}
Tensor RowParallelLinear::forward(Tensor &input) const {
auto output = BaseLinear::forward(input);
......
#include "infinicore/ops/dequantize_awq.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(DequantizeAWQ);
DequantizeAWQ::DequantizeAWQ(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zeros) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(x, x_packed, x_scale, x_zeros);
INFINICORE_GRAPH_OP_DISPATCH(x->device().getType(), x, x_packed, x_scale, x_zeros);
}
void DequantizeAWQ::execute(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zeros) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(DequantizeAWQ, x, x_packed, x_scale, x_zeros);
}
void dequantize_awq_(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zeros) {
DequantizeAWQ::execute(x, x_packed, x_scale, x_zeros);
}
} // namespace infinicore::op