Unverified Commit 8d09630a authored by gongchensu's avatar gongchensu Committed by GitHub
Browse files

Merge branch 'demo131' into Issue/862

parents ab52dead 012df56c
[submodule "third_party/spdlog"] [submodule "third_party/spdlog"]
path = third_party/spdlog path = third_party/spdlog
url = https://github.com/gabime/spdlog.git url = https://github.com/gabime/spdlog.git
[submodule "third_party/nlohmann_json"]
path = third_party/nlohmann_json
url = https://github.com/nlohmann/json.git
branch = master
...@@ -20,6 +20,7 @@ InfiniCore 是一个跨平台统一编程工具集,为不同芯片平台的功 ...@@ -20,6 +20,7 @@ InfiniCore 是一个跨平台统一编程工具集,为不同芯片平台的功
- 天数智芯 GPU; - 天数智芯 GPU;
- 沐曦 GPU; - 沐曦 GPU;
- 海光 DCU; - 海光 DCU;
- 阿里 PPU;
- 华为昇腾 NPU; - 华为昇腾 NPU;
- 寒武纪 MLU; - 寒武纪 MLU;
- 昆仑芯 XPU; - 昆仑芯 XPU;
...@@ -103,6 +104,7 @@ python scripts/install.py [XMAKE_CONFIG_FLAGS] ...@@ -103,6 +104,7 @@ python scripts/install.py [XMAKE_CONFIG_FLAGS]
| `--qy-gpu=[y\|n]` | 是否编译QY GPU 接口实现 | n | `--qy-gpu=[y\|n]` | 是否编译QY GPU 接口实现 | n
| `--hygon-dcu=[y\|n]` | 是否编译海光 DCU 接口实现 | n | `--hygon-dcu=[y\|n]` | 是否编译海光 DCU 接口实现 | n
| `--kunlun-xpu=[y\|n]` | 是否编译昆仑 XPU 接口实现 | n | `--kunlun-xpu=[y\|n]` | 是否编译昆仑 XPU 接口实现 | n
| `--ali-ppu=[y\|n]` | 是否编译阿里 PPU 接口实现 | n
| `--ninetoothed=[y\|n]` | 是否编译九齿实现 | n | `--ninetoothed=[y\|n]` | 是否编译九齿实现 | n
| `--ccl=[y\|n]` | 是否编译 InfiniCCL 通信库接口实现 | n | `--ccl=[y\|n]` | 是否编译 InfiniCCL 通信库接口实现 | n
...@@ -187,9 +189,9 @@ pip install -e . ...@@ -187,9 +189,9 @@ pip install -e .
```bash ```bash
# 测试单算子 # 测试单算子
python test/infinicore/ops/[operator].py [--bench | --debug | --verbose] [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun | --Hygon] python test/infinicore/ops/[operator].py [--bench | --debug | --verbose] [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun | --Hygon | --ali]
# 测试全部算子 # 测试全部算子
python test/infinicore/run.py [--bench | --debug | --verbose] [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun] python test/infinicore/run.py [--bench | --debug | --verbose] [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun | --ali]
``` ```
使用 -h 查看更多参数。 使用 -h 查看更多参数。
...@@ -198,9 +200,9 @@ python test/infinicore/run.py [--bench | --debug | --verbose] [--cpu | --nvidia ...@@ -198,9 +200,9 @@ python test/infinicore/run.py [--bench | --debug | --verbose] [--cpu | --nvidia
```shell ```shell
# 测试单算子 # 测试单算子
python test/infiniop/[operator].py [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun | --Hygon] python test/infiniop/[operator].py [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun | --Hygon | --ali]
# 测试全部算子 # 测试全部算子
python scripts/python_test.py [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun | --Hygon] python scripts/python_test.py [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun | --Hygon | --ali]
``` ```
#### 通信库(InfiniCCL)测试 #### 通信库(InfiniCCL)测试
......
...@@ -47,6 +47,7 @@ typedef enum { ...@@ -47,6 +47,7 @@ typedef enum {
INFINI_DEVICE_KUNLUN = 7, INFINI_DEVICE_KUNLUN = 7,
INFINI_DEVICE_HYGON = 8, INFINI_DEVICE_HYGON = 8,
INFINI_DEVICE_QY = 9, INFINI_DEVICE_QY = 9,
INFINI_DEVICE_ALI = 10,
INFINI_DEVICE_TYPE_COUNT INFINI_DEVICE_TYPE_COUNT
} infiniDevice_t; } infiniDevice_t;
......
...@@ -3,4 +3,5 @@ ...@@ -3,4 +3,5 @@
#include "infinicore/device_event.hpp" #include "infinicore/device_event.hpp"
#include "infinicore/nn.hpp" #include "infinicore/nn.hpp"
#include "infinicore/ops.hpp" #include "infinicore/ops.hpp"
#include "infinicore/quantization.hpp"
#include "infinicore/tensor.hpp" #include "infinicore/tensor.hpp"
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
#include "../tensor.hpp" #include "../tensor.hpp"
#include <optional>
#include <type_traits> #include <type_traits>
namespace infinicore { namespace infinicore {
...@@ -24,6 +25,15 @@ inline void hash_combine(size_t &seed, Tensor tensor) { ...@@ -24,6 +25,15 @@ inline void hash_combine(size_t &seed, Tensor tensor) {
} }
} }
// Specialization for optional
template <typename T>
inline void hash_combine(size_t &seed, const std::optional<T> &opt) {
hash_combine(seed, opt.has_value());
if (opt) {
hash_combine(seed, *opt);
}
}
// Specialization for std::string // Specialization for std::string
inline void hash_combine(size_t &seed, const std::string &str) { inline void hash_combine(size_t &seed, const std::string &str) {
hash_combine(seed, std::hash<std::string>{}(str)); hash_combine(seed, std::hash<std::string>{}(str));
......
...@@ -3,6 +3,8 @@ ...@@ -3,6 +3,8 @@
#include "../device.hpp" #include "../device.hpp"
#include "../memory.hpp" #include "../memory.hpp"
#include "../graph/graph.hpp"
#include <infiniop.h> #include <infiniop.h>
#include <infinirt.h> #include <infinirt.h>
...@@ -40,6 +42,12 @@ void destroyEvent(infinirtEvent_t event); ...@@ -40,6 +42,12 @@ void destroyEvent(infinirtEvent_t event);
float elapsedTime(infinirtEvent_t start, infinirtEvent_t end); float elapsedTime(infinirtEvent_t start, infinirtEvent_t end);
void streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event); void streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event);
// Graph recording APIs
// True while operators are being captured into a graph on this context.
bool isGraphRecording();
// Begins capturing subsequently issued operators into a new graph.
void startGraphRecording();
// Appends `op` to the graph currently being recorded.
void addGraphOperator(std::shared_ptr<graph::GraphOperator> op);
// Ends recording and returns the captured graph.
std::shared_ptr<graph::Graph> stopGraphRecording();
} // namespace context } // namespace context
} // namespace infinicore } // namespace infinicore
...@@ -22,6 +22,7 @@ public: ...@@ -22,6 +22,7 @@ public:
KUNLUN = INFINI_DEVICE_KUNLUN, KUNLUN = INFINI_DEVICE_KUNLUN,
HYGON = INFINI_DEVICE_HYGON, HYGON = INFINI_DEVICE_HYGON,
QY = INFINI_DEVICE_QY, QY = INFINI_DEVICE_QY,
ALI = INFINI_DEVICE_ALI,
COUNT = INFINI_DEVICE_TYPE_COUNT, COUNT = INFINI_DEVICE_TYPE_COUNT,
}; };
......
#pragma once
#include <memory>
#include <vector>
#include "../tensor.hpp"
namespace infinicore::graph {
// Forward declarations
class GraphManager;
// Tensor wrapper used inside recorded graphs; constructed from an
// existing Tensor (conversion semantics defined out of line).
// NOTE(review): the single-argument constructor is implicit — confirm
// whether `explicit` is intended here.
class GraphTensor : public Tensor {
public:
    GraphTensor(const Tensor &);
};
// Abstract interface for a node in a recorded graph: anything that can
// be replayed via run(). Virtual destructor allows deletion through the
// base pointer held by Graph::op_list_.
class GraphOperator {
public:
    virtual void run() const = 0;
    virtual ~GraphOperator() = default;
};
// Graph operator whose plan/run/cleanup functions are resolved through
// per-device dispatch tables (see the INFINICORE_GRAPH_OP_* macros:
// `plan` produces planned_meta_, `runner_` replays it, `deleter_` frees it).
class DispatchableGraphOperator : public GraphOperator {
public:
    // Replays the operator; presumably invokes runner_ on planned_meta_
    // (implementation out of line — confirm).
    void run() const override;
    // Out-of-line destructor; expected to release planned_meta_ through
    // deleter_ — confirm against the implementation.
    ~DispatchableGraphOperator() override;

protected:
    using run_schema = void (*)(void *);
    using cleanup_schema = void (*)(void **);

    // Default-initialize to null so these pointers are never indeterminate
    // before a derived constructor runs the dispatch macro.
    void *planned_meta_ = nullptr;
    run_schema runner_ = nullptr;
    cleanup_schema deleter_ = nullptr;
};
// An executable sequence of recorded GraphOperators. May additionally hold
// an opaque, possibly device-native representation (DeviceGraph) built by
// instantiate(); all behavior is defined out of line.
class Graph {
public:
    Graph();
    ~Graph();

    // Replays the recorded operators (or the instantiated device graph —
    // confirm against the implementation).
    void run() const;

protected:
    // Appends an operator to the recorded sequence.
    void add_operator(std::shared_ptr<GraphOperator> op);
    // Presumably builds device_graph_ from op_list_ — defined out of line.
    void instantiate();

    std::vector<std::shared_ptr<GraphOperator>> op_list_;

    // GraphManager drives recording and needs the protected API above.
    friend class GraphManager;

private:
    struct DeviceGraph; // opaque; defined in the implementation file
    std::unique_ptr<DeviceGraph> device_graph_;
};
} // namespace infinicore::graph
// Declares a dispatchable graph-operator class __OP_NAME__ whose eager
// entry point is execute(...). `plan` produces an opaque planned meta,
// `run` replays it, `cleanup` releases it; each is looked up per device.
// (Comments are kept outside the macro body: a `//` inside a
// backslash-continued line would swallow the continuation.)
#define INFINICORE_GRAPH_OP_CLASS(__OP_NAME__, ...) \
class __OP_NAME__ : public graph::DispatchableGraphOperator { \
public: \
using schema = void (*)(__VA_ARGS__); \
using plan_schema = void *(*)(__VA_ARGS__); \
static common::OpDispatcher<plan_schema> &plan_dispatcher(); \
static common::OpDispatcher<run_schema> &run_dispatcher(); \
static common::OpDispatcher<cleanup_schema> &cleanup_dispatcher(); \
__OP_NAME__(__VA_ARGS__); \
static void execute(__VA_ARGS__); \
};
// Defines the three dispatcher accessors declared by
// INFINICORE_GRAPH_OP_CLASS as Meyers singletons (one table per op).
#define INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(__OP_NAME__) \
common::OpDispatcher<__OP_NAME__::plan_schema> &__OP_NAME__::plan_dispatcher() { \
static common::OpDispatcher<__OP_NAME__::plan_schema> dispatcher_; \
return dispatcher_; \
} \
common::OpDispatcher<__OP_NAME__::run_schema> &__OP_NAME__::run_dispatcher() { \
static common::OpDispatcher<__OP_NAME__::run_schema> dispatcher_; \
return dispatcher_; \
} \
common::OpDispatcher<__OP_NAME__::cleanup_schema> &__OP_NAME__::cleanup_dispatcher() { \
static common::OpDispatcher<__OP_NAME__::cleanup_schema> dispatcher_; \
return dispatcher_; \
}
// Runs the plan function for __DEVICE_TYPE__ and caches the matching
// runner/deleter on the operator (expects planned_meta_/runner_/deleter_
// in scope, i.e. inside a DispatchableGraphOperator constructor).
#define INFINICORE_GRAPH_OP_DISPATCH(__DEVICE_TYPE__, ...) \
planned_meta_ = plan_dispatcher().lookup(__DEVICE_TYPE__)(__VA_ARGS__); \
runner_ = run_dispatcher().lookup(__DEVICE_TYPE__); \
deleter_ = cleanup_dispatcher().lookup(__DEVICE_TYPE__);
// Records the operator into the current graph when recording is active,
// otherwise executes it eagerly.
#define INFINICORE_GRAPH_OP_RECORD_OR_RUN(__OP_NAME__, ...) \
auto ___op = std::make_shared<__OP_NAME__>(__VA_ARGS__); \
if (context::isGraphRecording()) { \
context::addGraphOperator(___op); \
} else { \
___op->run(); \
}
// Registers the same plan/run/cleanup functions for every device type.
// NOTE(review): declares a file-scope `static bool registered` — using
// this macro twice in one scope would redefine it; confirm usage pattern.
#define INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(__OP_NAME__, __PLAN_F__, __RUN_F__, __CLEANUP_F__) \
static bool registered = []() { \
__OP_NAME__::plan_dispatcher().registerAll(__PLAN_F__, false); \
__OP_NAME__::run_dispatcher().registerAll(__RUN_F__, false); \
__OP_NAME__::cleanup_dispatcher().registerAll(__CLEANUP_F__, false); \
return true; \
}();
#pragma once #pragma once
#include "../ops.hpp" #include "../ops.hpp"
#include "../quantization.hpp"
#include "module.hpp" #include "module.hpp"
#include <infiniccl.h> #include <infiniccl.h>
#include <optional>
namespace infinicore::nn { namespace infinicore::nn {
...@@ -11,6 +13,9 @@ public: ...@@ -11,6 +13,9 @@ public:
BaseLinear(size_t in_features, size_t out_features, bool bias = true, BaseLinear(size_t in_features, size_t out_features, bool bias = true,
const DataType &dtype = DataType::F32, const Device &device = Device()); const DataType &dtype = DataType::F32, const Device &device = Device());
BaseLinear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias = true,
const DataType &dtype = DataType::F32, const Device &device = Device());
// Forward pass: output = input @ weight.T + bias // Forward pass: output = input @ weight.T + bias
Tensor forward(Tensor &input) const; Tensor forward(Tensor &input) const;
...@@ -27,12 +32,17 @@ public: ...@@ -27,12 +32,17 @@ public:
// Accessors for parameters // Accessors for parameters
Tensor weight() const { return weight_; } Tensor weight() const { return weight_; }
Tensor bias() const { return bias_; } Tensor bias() const { return bias_; }
Tensor weight_scale() const { return weight_scale_; }
Tensor weight_zeros() const { return weight_zeros_; }
protected: protected:
// Parameters // Parameters
INFINICORE_NN_PARAMETER(weight); INFINICORE_NN_PARAMETER(weight);
INFINICORE_NN_PARAMETER(bias); INFINICORE_NN_PARAMETER(bias);
INFINICORE_NN_PARAMETER(weight_scale);
INFINICORE_NN_PARAMETER(weight_zeros);
protected: protected:
// Helper method for common forward computation // Helper method for common forward computation
Tensor compute_linear(Tensor &input) const; Tensor compute_linear(Tensor &input) const;
...@@ -41,6 +51,7 @@ protected: ...@@ -41,6 +51,7 @@ protected:
size_t out_features_; size_t out_features_;
bool has_bias_; bool has_bias_;
DataType dtype_; DataType dtype_;
std::shared_ptr<infinicore::quantization::BaseQuantization> quantization_ = std::make_shared<infinicore::quantization::NoneQuantization>(nullptr);
}; };
} // namespace infinicore::nn } // namespace infinicore::nn
...@@ -52,6 +63,9 @@ public: ...@@ -52,6 +63,9 @@ public:
Linear(size_t in_features, size_t out_features, bool bias = true, Linear(size_t in_features, size_t out_features, bool bias = true,
const DataType &dtype = DataType::F32, const Device &device = Device()); const DataType &dtype = DataType::F32, const Device &device = Device());
Linear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias = true,
const DataType &dtype = DataType::F32, const Device &device = Device());
// Forward pass: output = input @ weight.T + bias // Forward pass: output = input @ weight.T + bias
Tensor forward(Tensor &input) const; Tensor forward(Tensor &input) const;
...@@ -65,6 +79,10 @@ public: ...@@ -65,6 +79,10 @@ public:
const DataType &dtype = DataType::F32, const Device &device = Device(), const DataType &dtype = DataType::F32, const Device &device = Device(),
Size tp_rank = 0, Size tp_size = 1); Size tp_rank = 0, Size tp_size = 1);
ColumnParallelLinear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias = true,
const DataType &dtype = DataType::F32, const Device &device = Device(),
Size tp_rank = 0, Size tp_size = 1);
// Forward pass: output = input @ weight.T + bias // Forward pass: output = input @ weight.T + bias
Tensor forward(Tensor &input) const; Tensor forward(Tensor &input) const;
...@@ -82,6 +100,10 @@ public: ...@@ -82,6 +100,10 @@ public:
const DataType &dtype = DataType::F32, const Device &device = Device(), const DataType &dtype = DataType::F32, const Device &device = Device(),
Size tp_rank = 0, Size tp_size = 1, infinicclComm_t communicator = nullptr); Size tp_rank = 0, Size tp_size = 1, infinicclComm_t communicator = nullptr);
RowParallelLinear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias = true,
const DataType &dtype = DataType::F32, const Device &device = Device(),
Size tp_rank = 0, Size tp_size = 1, infinicclComm_t communicator = nullptr);
// Forward pass: output = input @ weight.T + bias // Forward pass: output = input @ weight.T + bias
Tensor forward(Tensor &input) const; Tensor forward(Tensor &input) const;
......
#pragma once #pragma once
#include "module.hpp"
#include "../ops.hpp" #include "../ops.hpp"
#include "module.hpp"
namespace infinicore::nn { namespace infinicore::nn {
...@@ -57,6 +57,21 @@ public: ...@@ -57,6 +57,21 @@ public:
*/ */
Tensor forward(const Tensor &x) const; Tensor forward(const Tensor &x) const;
/**
* @brief Forward pass: apply RMSNorm in-place with residual
*
* @param x Input tensor of shape (*, normalized_shape) where * is any number of dimensions.
* Will be modified in-place to the normalized output.
* @param residual Residual tensor to add to input before normalization.
* Will be modified in-place to the sum of input and residual.
*
* The normalization is applied over the last dimension.
* For example:
* Input: [batch, seq_len, hidden_size] -> normalize over hidden_size
* Input: [batch, hidden_size] -> normalize over hidden_size
*/
void forward_inplace(Tensor &x, Tensor &residual) const;
// Module information // Module information
size_t normalized_shape() const { return normalized_shape_; } size_t normalized_shape() const { return normalized_shape_; }
double eps() const { return eps_; } double eps() const { return eps_; }
...@@ -73,9 +88,9 @@ protected: ...@@ -73,9 +88,9 @@ protected:
INFINICORE_NN_PARAMETER(weight); INFINICORE_NN_PARAMETER(weight);
private: private:
size_t normalized_shape_; // Size of the feature dimension size_t normalized_shape_; // Size of the feature dimension
double eps_; // Epsilon for numerical stability double eps_; // Epsilon for numerical stability
DataType dtype_; // Data type for weight DataType dtype_; // Data type for weight
}; };
} // namespace infinicore::nn } // namespace infinicore::nn
...@@ -17,6 +17,47 @@ public: ...@@ -17,6 +17,47 @@ public:
GPT_NEOX = 1, // GPT-NeoX style RoPE algorithm (First half dimensions for sin, second half for cos) GPT_NEOX = 1, // GPT-NeoX style RoPE algorithm (First half dimensions for sin, second half for cos)
}; };
// Identifies which RoPE scaling strategy a ScalingConfig describes.
enum class ScalingType {
    DEFAULT = 0, // Default RoPE
    LONGROPE = 1 // Long-RoPE
};

// Base class for RoPE scaling configurations; carries only the tag that
// tells the consumer which concrete configuration it holds.
class ScalingConfig {
public:
    virtual ~ScalingConfig() = default;

    ScalingType type() const { return type_; }

protected:
    ScalingType type_ = ScalingType::DEFAULT;

    ScalingConfig(ScalingType type) : type_(type) {}
};

// Long-RoPE scaling: per-dimension frequency factors for short and long
// contexts, plus a scalar attention factor derived from the extension ratio.
class LongRopeConfig : public ScalingConfig {
protected:
    std::vector<float> short_factor_;
    std::vector<float> long_factor_;
    size_t original_max_position_embeddings_;
    float factor_;

public:
    LongRopeConfig(
        std::vector<float> short_factor,
        std::vector<float> long_factor,
        size_t original_max_position_embeddings,
        float factor = 1.0f)
        : ScalingConfig(ScalingType::LONGROPE),
          short_factor_(short_factor),
          long_factor_(long_factor),
          original_max_position_embeddings_(original_max_position_embeddings),
          factor_(attention_factor(factor, original_max_position_embeddings)) {}

    ~LongRopeConfig() override = default;

    size_t original_max_position_embeddings() const { return original_max_position_embeddings_; }
    const std::vector<float> &short_factor() const { return short_factor_; }
    const std::vector<float> &long_factor() const { return long_factor_; }
    float factor() const { return factor_; }

private:
    // Long-RoPE attention scaling: identity when no extension is requested,
    // otherwise sqrt(1 + ln(factor) / ln(original_len)).
    static float attention_factor(float factor, size_t original_len) {
        return factor == 1.0f
                   ? 1.0f
                   : static_cast<float>(std::sqrt(1 + std::log(factor) / std::log(original_len)));
    }
};
/** /**
* @brief Construct a RoPE layer * @brief Construct a RoPE layer
* *
...@@ -26,13 +67,15 @@ public: ...@@ -26,13 +67,15 @@ public:
* @param algo RoPE algorithm type (default: Algo::GPT_J) * @param algo RoPE algorithm type (default: Algo::GPT_J)
* @param dtype Data type for sin/cos cache (default: DataType::F32) * @param dtype Data type for sin/cos cache (default: DataType::F32)
* @param device Device to create the cache on * @param device Device to create the cache on
* @param scaling RoPE scaling type (default: nullptr)
*/ */
RoPE(size_t head_dim, RoPE(size_t head_dim,
size_t max_seq_len, size_t max_seq_len,
double theta = 10000.0, double theta = 10000.0,
Algo algo = Algo::GPT_J, Algo algo = Algo::GPT_J,
const DataType &dtype = DataType::F32, const DataType &dtype = DataType::F32,
const Device &device = Device()); const Device &device = Device(),
std::shared_ptr<ScalingConfig> scaling = nullptr);
/** /**
* @brief Forward pass: apply RoPE to a tensor * @brief Forward pass: apply RoPE to a tensor
...@@ -88,11 +131,12 @@ protected: ...@@ -88,11 +131,12 @@ protected:
private: private:
void initialize_cache(); void initialize_cache();
size_t head_dim_; // Dimension of each attention head size_t head_dim_; // Dimension of each attention head
size_t max_seq_len_; // Maximum sequence length size_t max_seq_len_; // Maximum sequence length
double theta_; // Base frequency for rotary embeddings double theta_; // Base frequency for rotary embeddings
Algo algo_; // RoPE algorithm type Algo algo_; // RoPE algorithm type
DataType dtype_; // Data type for cache tables DataType dtype_; // Data type for cache tables
std::shared_ptr<ScalingConfig> scaling_; // RoPE scaling type
}; };
} // namespace infinicore::nn } // namespace infinicore::nn
#pragma once #pragma once
#include "ops/add.hpp" #include "ops/add.hpp"
#include "ops/add_rms_norm.hpp"
#include "ops/attention.hpp" #include "ops/attention.hpp"
#include "ops/causal_softmax.hpp" #include "ops/causal_softmax.hpp"
#include "ops/embedding.hpp"
#include "ops/flash_attention.hpp"
#include "ops/kv_caching.hpp"
#include "ops/matmul.hpp" #include "ops/matmul.hpp"
#include "ops/ones.hpp" #include "ops/ones.hpp"
#include "ops/paged_attention.hpp"
#include "ops/paged_attention_prefill.hpp"
#include "ops/paged_caching.hpp"
#include "ops/random_sample.hpp"
#include "ops/rearrange.hpp" #include "ops/rearrange.hpp"
#include "ops/rms_norm.hpp" #include "ops/rms_norm.hpp"
#include "ops/rope.hpp" #include "ops/rope.hpp"
#include "ops/silu.hpp" #include "ops/silu.hpp"
#include "ops/silu_and_mul.hpp"
#include "ops/swiglu.hpp" #include "ops/swiglu.hpp"
#pragma once #pragma once
#include "../device.hpp" #include "../device.hpp"
#include "../graph/graph.hpp"
#include "common/op.hpp" #include "common/op.hpp"
namespace infinicore::op { namespace infinicore::op {
class Add {
public:
using schema = void (*)(Tensor, Tensor, Tensor);
static void execute(Tensor c, Tensor a, Tensor b);
static common::OpDispatcher<schema> &dispatcher();
};
Tensor add(Tensor a, Tensor b); INFINICORE_GRAPH_OP_CLASS(Add, Tensor, const Tensor &, const Tensor &);
void add_(Tensor c, Tensor a, Tensor b);
Tensor operator+(Tensor a, Tensor b); Tensor add(const Tensor &a, const Tensor &b);
void add_(Tensor c, const Tensor &a, const Tensor &b);
} // namespace infinicore::op } // namespace infinicore::op
#pragma once

#include "../device.hpp"
#include "../graph/graph.hpp" // INFINICORE_GRAPH_OP_CLASS (consistent with sibling op headers)
#include "common/op.hpp"

#include <utility>

namespace infinicore::op {

// Per-device dispatched fused Add + RMSNorm graph operator.
// Schema: (out, residual, a, b, weight, epsilon).
INFINICORE_GRAPH_OP_CLASS(AddRMSNorm, Tensor, Tensor, const Tensor &, const Tensor &, const Tensor &, float);

// Fused Add and RMS Normalization
// Returns: (normalized_result, add_result)
// The add_result can be used as residual for subsequent layers
std::pair<Tensor, Tensor> add_rms_norm(const Tensor &a, const Tensor &b, const Tensor &weight, float epsilon = 1e-5f);

// Out-parameter variant: normalized result written to `out`, the sum a+b
// written to `residual`.
void add_rms_norm_(Tensor out, Tensor residual, const Tensor &a, const Tensor &b, const Tensor &weight, float epsilon = 1e-5f);

// Fused Add and RMS Normalization (inplace)
// normalized_result will be stored in input, add_result will be stored in residual
void add_rms_norm_inplace(Tensor input, Tensor residual, const Tensor &weight, float epsilon = 1e-5f);

} // namespace infinicore::op
#pragma once #pragma once
#include "../device.hpp" #include "../device.hpp"
#include "../graph/graph.hpp"
#include "common/op.hpp" #include "common/op.hpp"
namespace infinicore::op { namespace infinicore::op {
class CausalSoftmax {
public:
using schema = void (*)(Tensor, Tensor);
static void execute(Tensor output, Tensor input);
static common::OpDispatcher<schema> &dispatcher();
};
Tensor causal_softmax(Tensor input); INFINICORE_GRAPH_OP_CLASS(CausalSoftmax, Tensor, const Tensor &);
void causal_softmax_(Tensor output, Tensor input);
Tensor causal_softmax(const Tensor &input);
void causal_softmax_(Tensor output, const Tensor &input);
} // namespace infinicore::op } // namespace infinicore::op
#pragma once

#include "../device.hpp"
#include "../graph/graph.hpp" // INFINICORE_GRAPH_OP_CLASS (consistent with sibling op headers)
#include "common/op.hpp"

#include <optional>

namespace infinicore::op {

// Per-device dispatched AWQ dequantization graph operator.
// Schema: (x, x_packed, x_scale, x_zeros).
INFINICORE_GRAPH_OP_CLASS(DequantizeAWQ, Tensor, const Tensor &, const Tensor &, const Tensor &);

// Dequantizes AWQ-packed weights `x_packed` into `x`, applying `x_scale`
// and `x_zeros` (presumably per-group scales/zero-points — confirm the
// grouping layout against the implementation).
void dequantize_awq_(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zeros);

} // namespace infinicore::op
#pragma once
#include "../../device.hpp"
#include "../../graph/graph.hpp"
#include "../common/op.hpp"
#include <infiniccl.h>
namespace infinicore::op::distributed {
// Collective all-reduce recorded as a graph operator. Holds the tensors,
// reduction op, and communicator it was constructed with via an opaque
// backend plan (planned_meta_).
// NOTE(review): user-declared destructor plus a raw owning pointer —
// copying an AllReduce would be unsafe; consider deleting copy/move.
class AllReduce : public graph::GraphOperator {
public:
    AllReduce(Tensor output, const Tensor &input, infinicclReduceOp_t op, infinicclComm_t communicator);
    ~AllReduce();

    // Replays the planned all-reduce.
    void run() const override;

    // Convenience entry point; presumably plans and runs immediately —
    // confirm against the implementation.
    static void execute(Tensor output, const Tensor &input, infinicclReduceOp_t op, infinicclComm_t communicator);

private:
    // Default-initialize to null so the pointer is never indeterminate
    // before the constructor assigns it.
    void *planned_meta_ = nullptr;
};
// Returns the all-reduced result of `input` (output allocation semantics
// defined in the implementation — presumably shaped like `input`).
Tensor allreduce(const Tensor &input, infinicclReduceOp_t op, infinicclComm_t communicator);
// All-reduces `input` into the caller-provided `output`.
void allreduce_(Tensor output, const Tensor &input, infinicclReduceOp_t op, infinicclComm_t communicator);

} // namespace infinicore::op::distributed
#pragma once #pragma once
#include "../device.hpp"
#include "../graph/graph.hpp"
#include "common/op.hpp" #include "common/op.hpp"
namespace infinicore::op { namespace infinicore::op {
Tensor embedding(Tensor input, Tensor weight); INFINICORE_GRAPH_OP_CLASS(Embedding, Tensor, const Tensor &, const Tensor &);
void embedding_(Tensor out, Tensor input, Tensor weight);
Tensor embedding(const Tensor &input, const Tensor &weight);
void embedding_(Tensor out, const Tensor &input, const Tensor &weight);
} // namespace infinicore::op } // namespace infinicore::op
#pragma once

#include "../device.hpp"
#include "../graph/graph.hpp" // INFINICORE_GRAPH_OP_CLASS (consistent with sibling op headers)
#include "common/op.hpp"

namespace infinicore::op {

// Per-device dispatched FlashAttention graph operator.
// Schema: (out, q, k, v, total_kv_len, scale, is_causal).
INFINICORE_GRAPH_OP_CLASS(FlashAttention, Tensor, const Tensor &, const Tensor &, const Tensor &, const Tensor &, float, bool);

// Returns the attention output for q/k/v. `total_kv_len` carries effective
// KV lengths, `scale` scales the QK product, `is_causal` applies a causal
// mask. (Tensor shapes are not visible here — see the implementation.)
Tensor flash_attention(const Tensor &q, const Tensor &k, const Tensor &v, const Tensor &total_kv_len, float scale, bool is_causal);

// Out-parameter variant writing the attention output into `out`.
void flash_attention_(Tensor out, const Tensor &q, const Tensor &k, const Tensor &v, const Tensor &total_kv_len, float scale, bool is_causal);

} // namespace infinicore::op
#pragma once #pragma once
#include "../device.hpp" #include "../device.hpp"
#include "../graph/graph.hpp"
#include "common/op.hpp" #include "common/op.hpp"
namespace infinicore::op { namespace infinicore::op {
class Gemm { INFINICORE_GRAPH_OP_CLASS(Gemm, Tensor, const Tensor &, const Tensor &, float, float);
public:
using schema = void (*)(Tensor, Tensor, Tensor, float, float);
static void execute(Tensor c, Tensor a, Tensor b, float alpha, float beta);
static common::OpDispatcher<schema> &dispatcher();
};
Tensor gemm(Tensor a, Tensor b, float alpha = 1.0f, float beta = 0.0f); Tensor gemm(const Tensor &a, const Tensor &b, float alpha = 1.0f, float beta = 0.0f);
void gemm_(Tensor c, Tensor a, Tensor b, float alpha, float beta); void gemm_(Tensor c, const Tensor &a, const Tensor &b, float alpha, float beta);
} // namespace infinicore::op } // namespace infinicore::op
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment