Unverified Commit 8d09630a authored by gongchensu, committed by GitHub

Merge branch 'demo131' into Issue/862

parents ab52dead 012df56c
......@@ -5,10 +5,10 @@
#include "../context_impl.hpp"
namespace infinicore {
class DeviceCachingAllocator : public MemoryAllocator {
class StreamOrderedAllocator : public MemoryAllocator {
public:
explicit DeviceCachingAllocator(Device device);
~DeviceCachingAllocator() = default;
explicit StreamOrderedAllocator(Device device);
~StreamOrderedAllocator() = default;
std::byte *allocate(size_t size) override;
void deallocate(std::byte *ptr) override;
......
#include "context_impl.hpp"
#include "internal.hpp"
#include "../utils.hpp"
......@@ -29,16 +30,18 @@ Runtime *ContextImpl::getCurrentRuntime() {
return current_runtime_;
}
Runtime *ContextImpl::getCpuRuntime() {
return runtime_table_[int(Device::Type::CPU)][0].get();
}
void ContextImpl::setDevice(Device device) {
if (device == getCurrentRuntime()->device()) {
// Do nothing if the device is already set.
return;
}
thread_local bool warn_switch_runtime = false;
if (getCurrentRuntime()->isGraphRecording() && !warn_switch_runtime) {
spdlog::warn("Switching device runtime during graph recording may break the graph!");
warn_switch_runtime = true;
}
if (runtime_table_[int(device.getType())][device.getIndex()] == nullptr) {
// Lazy initialization of runtime if never set before.
runtime_table_[int(device.getType())][device.getIndex()] = std::unique_ptr<Runtime>(new Runtime(device));
......@@ -100,11 +103,8 @@ infinirtStream_t getStream() {
}
infiniopHandle_t getInfiniopHandle(Device device) {
if (device.getType() == Device::Type::CPU) {
return ContextImpl::singleton().getCpuRuntime()->infiniopHandle();
}
if (device != getDevice()) {
throw std::runtime_error("Requested device doesn't match current runtime.");
setDevice(device);
}
return ContextImpl::singleton().getCurrentRuntime()->infiniopHandle();
}
......@@ -122,7 +122,8 @@ std::shared_ptr<Memory> allocateMemory(size_t size) {
}
std::shared_ptr<Memory> allocateHostMemory(size_t size) {
return ContextImpl::singleton().getCpuRuntime()->allocateMemory(size);
setDevice(Device::cpu());
return allocateMemory(size);
}
std::shared_ptr<Memory> allocatePinnedHostMemory(size_t size) {
......@@ -142,7 +143,8 @@ void memcpyD2D(void *dst, const void *src, size_t size, bool async) {
}
void memcpyH2H(void *dst, const void *src, size_t size) {
return ContextImpl::singleton().getCpuRuntime()->memcpyD2D(dst, src, size);
setDevice(Device::cpu());
return ContextImpl::singleton().getCurrentRuntime()->memcpyD2D(dst, src, size);
}
// Timing API implementations
......@@ -178,6 +180,27 @@ void streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event) {
ContextImpl::singleton().getCurrentRuntime()->streamWaitEvent(stream, event);
}
bool isGraphRecording() {
return ContextImpl::singleton().getCurrentRuntime()->isGraphRecording();
}
void startGraphRecording() {
ContextImpl::singleton().getCurrentRuntime()->startGraphRecording();
}
void addGraphOperator(std::shared_ptr<graph::GraphOperator> op) {
ContextImpl::singleton().getCurrentRuntime()->addGraphOperator(op);
}
std::shared_ptr<graph::Graph> stopGraphRecording() {
return ContextImpl::singleton().getCurrentRuntime()->stopGraphRecording();
}
std::shared_ptr<Memory> reinstantiateBlob(std::shared_ptr<Memory> blob) {
setDevice(blob->device());
return ContextImpl::singleton().getCurrentRuntime()->reinstantiateBlob(blob);
}
} // namespace context
} // namespace infinicore
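For orientation, a minimal sketch (not part of this commit) of how the context-level graph API added above could be used. The helper name record_add is hypothetical, op::add_ stands in for any graph-recordable operator, and error handling is omitted.

#include "infinicore/context/context.hpp"
#include "infinicore/ops/add.hpp"
#include <memory>

using namespace infinicore;

// Record a short sequence of ops, then replay it as a graph.
std::shared_ptr<graph::Graph> record_add(Tensor c, Tensor a, Tensor b) {
    context::startGraphRecording();
    op::add_(c, a, b);                          // recorded via INFINICORE_GRAPH_OP_RECORD_OR_RUN while recording is active
    auto graph = context::stopGraphRecording(); // also instantiates a device graph when USE_INFINIRT_GRAPH is defined
    graph->run();                               // launches the device graph, or replays the ops one by one
    return graph;
}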
......@@ -19,8 +19,6 @@ protected:
public:
Runtime *getCurrentRuntime();
Runtime *getCpuRuntime();
void setDevice(Device);
size_t getDeviceCount(Device::Type type);
......
#pragma once
#include "infinicore/device.hpp"
#include "infinicore/memory.hpp"
#include "infinicore/graph/graph.hpp"
namespace infinicore::context {
std::shared_ptr<Memory> reinstantiateBlob(std::shared_ptr<Memory> blob);
};
......@@ -2,19 +2,20 @@
#include "../../utils.hpp"
#include "../allocators/device_caching_allocator.hpp"
#include "../allocators/device_pinned_allocator.hpp"
#include "../allocators/host_allocator.hpp"
#include "../allocators/pinnable_block_allocator.hpp"
#include "../allocators/stream_ordered_allocator.hpp"
namespace infinicore {
Runtime::Runtime(Device device) : device_(device) {
Runtime::Runtime(Device device) : device_(device), graph_manager_(std::make_unique<graph::GraphManager>()) {
activate();
INFINICORE_CHECK_ERROR(infinirtStreamCreate(&stream_));
INFINICORE_CHECK_ERROR(infiniopCreateHandle(&infiniop_handle_));
if (device_.getType() == Device::Type::CPU) {
device_memory_allocator_ = std::make_unique<HostAllocator>();
device_memory_allocator_ = std::make_unique<PinnableBlockAllocator>(device);
} else {
device_memory_allocator_ = std::make_unique<DeviceCachingAllocator>(device);
device_memory_allocator_ = std::make_unique<PinnableBlockAllocator>(device);
pinned_host_memory_allocator_ = std::make_unique<DevicePinnedHostAllocator>(device);
}
}
......@@ -76,6 +77,15 @@ std::shared_ptr<Memory> Runtime::allocatePinnedHostMemory(size_t size) {
true);
}
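// Assumption from the code below: reinstantiateBlob() re-wraps an existing allocation
// ("blob") in a fresh Memory handle, marking its block as in-use again so that a blob
// captured during graph recording can be handed out once more; the custom deleter
// returns the pointer to the same device allocator.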
std::shared_ptr<Memory> Runtime::reinstantiateBlob(std::shared_ptr<Memory> blob) {
device_memory_allocator_.get()->mark_in_use_(blob->data(), true);
return std::make_shared<Memory>(
blob->data(), blob->size(), device_,
[alloc = device_memory_allocator_.get()](std::byte *p) {
alloc->deallocate(p);
});
}
void Runtime::memcpyH2D(void *dst, const void *src, size_t size, bool async) {
if (async) {
INFINICORE_CHECK_ERROR(infinirtMemcpyAsync(dst, src, size, INFINIRT_MEMCPY_H2D, stream_));
......@@ -144,6 +154,25 @@ void Runtime::streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event) {
INFINICORE_CHECK_ERROR(infinirtStreamWaitEvent(stream, event));
}
bool Runtime::isGraphRecording() const {
return graph_manager_->is_recording();
}
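// Editorial assumption: pin mode asks the PinnableBlockAllocator to keep blocks alive
// at stable addresses while a graph is being recorded, so device pointers baked into
// the captured graph remain valid; stopGraphRecording() turns it back off.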
void Runtime::startGraphRecording() {
device_memory_allocator_->set_pin_mode(true);
return graph_manager_->start_recording();
}
void Runtime::addGraphOperator(std::shared_ptr<graph::GraphOperator> op) {
return graph_manager_->add_operator(op);
}
std::shared_ptr<graph::Graph> Runtime::stopGraphRecording() {
auto graph = graph_manager_->stop_recording();
device_memory_allocator_->set_pin_mode(false);
return graph;
}
std::string Runtime::toString() const {
return fmt::format("Runtime({})", device_.toString());
}
......
#pragma once
#include "../allocators/memory_allocator.hpp"
#include "../allocators/pinnable_block_allocator.hpp"
#include "infinicore/context/context.hpp"
#include "../../graph/graph_manager.hpp"
#include <infiniop.h>
#include <infinirt.h>
......@@ -13,8 +16,9 @@ private:
Device device_;
infinirtStream_t stream_;
infiniopHandle_t infiniop_handle_;
std::unique_ptr<MemoryAllocator> device_memory_allocator_;
std::unique_ptr<PinnableBlockAllocator> device_memory_allocator_;
std::unique_ptr<MemoryAllocator> pinned_host_memory_allocator_;
std::unique_ptr<graph::GraphManager> graph_manager_;
protected:
Runtime(Device device);
......@@ -33,6 +37,7 @@ public:
std::shared_ptr<Memory> allocateMemory(size_t size);
std::shared_ptr<Memory> allocatePinnedHostMemory(size_t size);
std::shared_ptr<Memory> reinstantiateBlob(std::shared_ptr<Memory> blob);
void memcpyH2D(void *dst, const void *src, size_t size, bool async = true);
void memcpyD2H(void *dst, const void *src, size_t size);
......@@ -48,6 +53,12 @@ public:
float elapsedTime(infinirtEvent_t start, infinirtEvent_t end);
void streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event);
// Graph
bool isGraphRecording() const;
void startGraphRecording();
void addGraphOperator(std::shared_ptr<graph::GraphOperator> op);
std::shared_ptr<graph::Graph> stopGraphRecording();
std::string toString() const;
friend class ContextImpl;
......
......@@ -41,6 +41,8 @@ std::string Device::toString(const Type &type) {
return "KUNLUN";
case Type::HYGON:
return "HYGON";
case Type::ALI:
return "ALI";
case Type::COUNT:
return "COUNT";
default:
......
#include "graph_manager.hpp"
#include "../utils.hpp"
#include "infinicore/context/context.hpp"
#include <infinirt.h>
namespace infinicore::graph {
/* =========================
* GraphTensor
* ========================= */
GraphTensor::GraphTensor(const Tensor &tensor) : Tensor(tensor->to_blob_()) {
}
/* =========================
* GraphOperator
* ========================= */
void DispatchableGraphOperator::run() const {
runner_(planned_meta_);
}
DispatchableGraphOperator::~DispatchableGraphOperator() {
if (deleter_) {
deleter_(&planned_meta_);
}
}
/* =========================
* Graph
* ========================= */
struct Graph::DeviceGraph {
infinirtGraph_t graph;
infinirtGraphExec_t exec;
infinirtGraphNode_t node;
std::vector<char> log_buffer;
DeviceGraph() {
log_buffer.resize(4 * 1024);
}
~DeviceGraph() {
if (exec) {
infinirtGraphExecDestroy(exec);
}
if (graph) {
infinirtGraphDestroy(graph);
}
}
void launch() {
INFINICORE_CHECK_ERROR(infinirtGraphLuanch(exec, context::getStream()));
}
};
Graph::Graph() {
}
void Graph::run() const {
if (device_graph_ != nullptr && device_graph_.get()->exec != nullptr) {
device_graph_.get()->launch();
} else {
for (auto &op : op_list_) {
op->run();
}
}
}
void Graph::add_operator(std::shared_ptr<GraphOperator> op) {
op_list_.push_back(op);
}
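// Build a device-executable graph from the recorded op list: warm up with a few eager
// runs, then capture a single replay between stream begin/end capture and instantiate
// the result. If any step fails, exec stays null and Graph::run() falls back to
// replaying the operators one by one.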
void Graph::instantiate() {
// Reset device graph
device_graph_ = std::make_unique<DeviceGraph>();
// warmup
for (size_t iter = 0; iter < 5; ++iter) {
this->run();
}
infinicore::context::syncStream();
if (infinirtStreamBeginCapture(
context::getStream(),
INFINIRT_STREAM_CAPTURE_MODE_RELAXED)
!= INFINI_STATUS_SUCCESS) {
return;
}
// Run and record
this->run();
if (infinirtStreamEndCapture(
context::getStream(),
&device_graph_.get()->graph)
!= INFINI_STATUS_SUCCESS) {
return;
}
if (infinirtGraphInstantiate(
&device_graph_.get()->exec,
device_graph_.get()->graph,
&device_graph_.get()->node,
device_graph_.get()->log_buffer.data(),
device_graph_.get()->log_buffer.size())
!= INFINI_STATUS_SUCCESS) {
static bool warned_once = false;
if (!warned_once) {
warned_once = true;
spdlog::warn("Fail to instantiate device graph: {}", std::string(device_graph_.get()->log_buffer.data()));
}
}
}
Graph::~Graph() = default;
/* =========================
* GraphManager
* ========================= */
bool GraphManager::is_recording() const {
return recording_;
}
void GraphManager::start_recording() {
if (is_recording()) {
spdlog::warn("Graph is already recording. Previous recording will be dropped.");
}
recording_ = true;
graph_ = std::make_shared<Graph>();
}
void GraphManager::add_operator(std::shared_ptr<GraphOperator> op) {
INFINICORE_ASSERT(is_recording());
graph_->add_operator(op);
}
std::shared_ptr<Graph> GraphManager::stop_recording() {
if (!is_recording()) {
spdlog::warn("Graph is not recording. Please start recording first.");
return nullptr;
}
recording_ = false;
#ifdef USE_INFINIRT_GRAPH
graph_->instantiate();
#endif
return std::exchange(graph_, nullptr);
}
} // namespace infinicore::graph
#pragma once
#include "infinicore/graph/graph.hpp"
#include <memory>
#include <vector>
namespace infinicore::graph {
class GraphManager {
public:
GraphManager() = default;
~GraphManager() = default;
bool is_recording() const;
void start_recording();
void add_operator(std::shared_ptr<GraphOperator> op);
std::shared_ptr<Graph> stop_recording();
private:
std::shared_ptr<Graph> graph_;
bool recording_ = false;
};
} // namespace infinicore::graph
......@@ -43,6 +43,13 @@ Embedding::Embedding(size_t num_embeddings,
}
Tensor Embedding::forward(const Tensor &indices) const {
// TODO: Implement on-device embedding for all devices, then remove the condition and the classic approach
auto device_type = device_.getType();
if (device_type == Device::Type::NVIDIA || device_type == Device::Type::ILUVATAR || device_type == Device::Type::METAX || device_type == Device::Type::MOORE || device_type == Device::Type::ALI) {
// Use op::embedding which supports device-side input and batch dimension
return op::embedding(indices->contiguous()->to(device_), weight_);
}
// Get the shape of indices
auto indices_shape = indices->shape();
......
#include "infinicore/nn/linear.hpp"
#include "../utils.hpp"
#include "infinicore/ops.hpp"
#include "infinicore/ops/distributed/allreduce.hpp"
#include "infinicore/ops/linear.hpp"
#include "infinicore/ops/linear_w8a8i8.hpp"
#include <optional>
#include <spdlog/spdlog.h>
......@@ -17,21 +19,46 @@ BaseLinear::BaseLinear(size_t in_features, size_t out_features, bool bias,
device_ = device;
}
Tensor BaseLinear::compute_linear(Tensor &input) const {
BaseLinear::BaseLinear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias,
const DataType &dtype, const Device &device)
: in_features_(in_features),
out_features_(out_features),
quantization_(quantization),
has_bias_(bias),
dtype_(dtype) {
device_ = device;
}
// Ensure input is contiguous before creating views (required for matmul)
// This prevents hanging when input tensor has non-contiguous memory layout
Tensor input_contiguous = input->is_contiguous() ? input : input->contiguous();
Tensor BaseLinear::compute_linear(Tensor &input) const {
switch (this->quantization_->get_quant_scheme()) {
case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: {
Tensor input_contiguous = input->is_contiguous() ? input : input->contiguous();
// Use ops::linear_ directly to match Python backend's exact code path
// This ensures identical computation and numerical results
// Parameter inherits from Tensor, so we cast to Tensor explicitly
Tensor weight_tensor = static_cast<const Tensor &>(weight_);
std::optional<Tensor> bias_opt = has_bias_ ? std::make_optional<Tensor>(static_cast<const Tensor &>(bias_)) : std::nullopt;
Tensor weight_packed_tensor = static_cast<const Tensor &>(weight_);
Tensor weight_scale_tensor = static_cast<const Tensor &>(weight_scale_);
// weight_packed should be transposed and non-contiguous.
std::optional<Tensor> bias_opt = has_bias_ ? std::make_optional<Tensor>(static_cast<const Tensor &>(bias_)) : std::nullopt;
auto output = infinicore::op::linear(input_contiguous->contiguous(), weight_tensor->contiguous(), bias_opt);
return output;
}
auto output = infinicore::op::linear_w8a8i8(input_contiguous->contiguous(), weight_packed_tensor, weight_scale_tensor, bias_opt);
return output;
}
default: {
// Ensure input is contiguous before creating views (required for matmul)
// This prevents hanging when input tensor has non-contiguous memory layout
Tensor input_contiguous = input->is_contiguous() ? input : input->contiguous();
// Use ops::linear_ directly to match Python backend's exact code path
// This ensures identical computation and numerical results
// Parameter inherits from Tensor, so we cast to Tensor explicitly
Tensor weight_tensor = static_cast<const Tensor &>(weight_);
std::optional<Tensor> bias_opt = has_bias_ ? std::make_optional<Tensor>(static_cast<const Tensor &>(bias_)) : std::nullopt;
auto output = infinicore::op::linear(input_contiguous->contiguous(), weight_tensor->contiguous(), bias_opt);
return output;
}
}
} // namespace infinicore::nn
Tensor BaseLinear::forward(Tensor &input) const {
return compute_linear(input);
......@@ -70,6 +97,43 @@ Linear::Linear(size_t in_features, size_t out_features, bool bias,
// in_features, out_features, bias, static_cast<int>(dtype_));
}
Linear::Linear(size_t in_features, size_t out_features,
std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias,
const DataType &dtype, const Device &device)
: BaseLinear(in_features, out_features, quantization, bias, dtype, device_) {
device_ = device;
switch (this->quantization_->get_quant_scheme()) {
case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: {
INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, infinicore::DataType::I8, device));
INFINICORE_NN_PARAMETER_INIT(weight_scale, ({out_features, 1}, infinicore::DataType::F32, device));
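// Shape note (assumption from the parameters above): int8 weights of shape
// {out_features, in_features} with one f32 scale per output row, i.e. per-output-channel
// dequantization for the COMPRESSED_TENSOR_W8A8I8 scheme.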
if (bias) {
INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device));
} else {
bias_ = Parameter();
}
break;
}
default: {
// Initialize parameters using macro
INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device));
// Register bias parameter if requested
if (bias) {
INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device));
} else {
bias_ = Parameter(); // Default constructed empty parameter
}
// SPDLOG_DEBUG("Created Linear module: in_features={}, out_features={}, bias={}, dtype={}",
// in_features, out_features, bias, static_cast<int>(dtype_));
break;
}
}
}
Tensor Linear::forward(Tensor &input) const {
return BaseLinear::forward(input);
}
......@@ -102,9 +166,45 @@ ColumnParallelLinear::ColumnParallelLinear(size_t in_features, size_t out_featur
} else {
bias_ = Parameter(); // Default constructed empty parameter
}
}
// SPDLOG_DEBUG("Created ColumnParallelLinear module: in_features={}, out_features={}, bias={}, dtype={}",
// in_features, out_features, bias, static_cast<int>(dtype_));
ColumnParallelLinear::ColumnParallelLinear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias,
const DataType &dtype, const Device &device,
Size tp_rank, Size tp_size)
: BaseLinear(in_features, out_features, quantization, bias, dtype, device_),
tp_rank_(tp_rank),
tp_size_(tp_size) {
device_ = device;
switch (this->quantization_->get_quant_scheme()) {
case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: {
INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, infinicore::DataType::I8, device, 0, tp_rank_, tp_size_));
INFINICORE_NN_PARAMETER_INIT(weight_scale, ({out_features, 1}, infinicore::DataType::F32, device, 0, tp_rank_, tp_size_));
if (bias) {
INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device, 0, 0, 1));
} else {
bias_ = Parameter();
}
break;
}
default: {
// Initialize parameters using macro
INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device,
0, tp_rank_, tp_size_));
// Register bias parameter if requested
if (bias) {
INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device,
0, tp_rank_, tp_size_));
} else {
bias_ = Parameter(); // Default constructed empty parameter
}
break;
}
}
}
Tensor ColumnParallelLinear::forward(Tensor &input) const {
......@@ -138,26 +238,53 @@ RowParallelLinear::RowParallelLinear(size_t in_features, size_t out_features, bo
} else {
bias_ = Parameter(); // Default constructed empty parameter
}
}
// SPDLOG_DEBUG("Created RowParallelLinear module: in_features={}, out_features={}, bias={}, dtype={}",
// in_features, out_features, bias, static_cast<int>(dtype_));
RowParallelLinear::RowParallelLinear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias,
const DataType &dtype, const Device &device,
Size tp_rank, Size tp_size, infinicclComm_t communicator)
: BaseLinear(in_features, out_features, quantization, bias, dtype, device_),
tp_rank_(tp_rank),
tp_size_(tp_size), communicator_(communicator) {
device_ = device;
switch (this->quantization_->get_quant_scheme()) {
case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: {
INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, infinicore::DataType::I8, device, 1, tp_rank_, tp_size_));
INFINICORE_NN_PARAMETER_INIT(weight_scale, ({out_features, 1}, infinicore::DataType::F32, device, 0, 0, 1));
if (bias) {
INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device, 0, tp_rank_, tp_size_));
} else {
bias_ = Parameter();
}
break;
}
default: {
// Initialize parameters using macro
INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device,
1, tp_rank_, tp_size_));
// Register bias parameter if requested
if (bias && (0 == tp_rank_)) {
INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device, 0, 0, 1));
} else {
bias_ = Parameter(); // Default constructed empty parameter
}
// SPDLOG_DEBUG("Created RowParallelLinear module: in_features={}, out_features={}, bias={}, dtype={}",
// in_features, out_features, bias, static_cast<int>(dtype_));
break;
}
}
}
Tensor RowParallelLinear::forward(Tensor &input) const {
auto output = BaseLinear::forward(input);
if ((tp_size_ > 1) && (communicator_ != nullptr)) {
Size count = output->numel();
DataType type = output->dtype();
infinirtStream_t stream = infinicore::context::getStream();
INFINICORE_CHECK_ERROR(infinicclAllReduce(output->data(), output->data(), count, static_cast<infiniDtype_t>(static_cast<int>(type)),
INFINICCL_SUM, communicator_, stream));
INFINICORE_CHECK_ERROR(infinirtStreamSynchronize(stream));
// RUN_INFINI(infinirtStreamSynchronize(stream));
op::distributed::allreduce_(output, output, INFINICCL_SUM, communicator_);
}
return output;
}
......
......@@ -21,6 +21,25 @@ Tensor RMSNorm::forward(const Tensor &x) const {
return op::rms_norm(x, weight_, static_cast<float>(eps_));
}
void RMSNorm::forward_inplace(Tensor &x, Tensor &residual) const {
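// First call: no residual exists yet, so keep x as the residual and just normalize it.
// Later calls: fuse the residual add with RMSNorm where a fused kernel is available,
// otherwise fall back to a separate add_ followed by rms_norm_.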
if (!residual) {
residual = x;
x = op::rms_norm(x, weight_, static_cast<float>(eps_));
} else {
if (device_.getType() == Device::Type::CPU
|| device_.getType() == Device::Type::NVIDIA
|| device_.getType() == Device::Type::ILUVATAR
|| device_.getType() == Device::Type::METAX
|| device_.getType() == Device::Type::MOORE
|| device_.getType() == Device::Type::ALI) {
op::add_rms_norm_inplace(x, residual, weight_, static_cast<float>(eps_));
} else {
op::add_(residual, x, residual);
op::rms_norm_(x, residual, weight_, static_cast<float>(eps_));
}
}
}
std::string RMSNorm::extra_repr() const {
return "RMSNorm(normalized_shape=" + std::to_string(normalized_shape_) + ", eps=" + std::to_string(eps_) + ", dtype=" + std::to_string(static_cast<int>(dtype_)) + ")";
}
......
......@@ -16,12 +16,14 @@ RoPE::RoPE(size_t head_dim,
double theta,
Algo algo,
const DataType &dtype,
const Device &device)
const Device &device,
std::shared_ptr<ScalingConfig> scaling)
: head_dim_(head_dim),
max_seq_len_(max_seq_len),
theta_(theta),
algo_(algo),
dtype_(dtype) {
dtype_(dtype),
scaling_(scaling) {
if (head_dim % 2 != 0) {
throw std::invalid_argument("head_dim must be even for RoPE, got " + std::to_string(head_dim));
}
......@@ -54,14 +56,30 @@ void RoPE::initialize_cache() {
for (size_t j = 0; j < cache_dim; j++) {
// GPT-J style inverse frequency: theta^(-2j/head_dim)
// Compute directly in float to avoid double->float casting
float inv_freq = 1.0f / std::pow(static_cast<float>(theta_), 2.0f * static_cast<float>(j) / static_cast<float>(head_dim_));
float inv_freq;
float table_factor = 1.0f;
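// LongRoPE scaling (assumption based on the fields used below): each inverse frequency
// is divided by a per-dimension factor, short_factor within the original context window
// and long_factor beyond it, and the resulting sin/cos entries are rescaled by the
// config's factor().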
if (scaling_ == nullptr) {
inv_freq = 1.0f / std::pow(static_cast<float>(theta_), 2.0f * static_cast<float>(j) / static_cast<float>(head_dim_));
} else if (scaling_->type() == ScalingType::LONGROPE) {
std::shared_ptr<LongRopeConfig> lr = std::dynamic_pointer_cast<LongRopeConfig>(scaling_);
table_factor = lr->factor();
float _ext;
if (pos < lr->original_max_position_embeddings()) {
_ext = lr->short_factor()[j];
} else {
_ext = lr->long_factor()[j];
}
inv_freq = 1.0f / (_ext * std::pow(static_cast<float>(theta_), 2.0f * static_cast<float>(j) / static_cast<float>(head_dim_)));
} else {
inv_freq = 1.0f / std::pow(static_cast<float>(theta_), 2.0f * static_cast<float>(j) / static_cast<float>(head_dim_));
}
// Compute angle: position * inverse_frequency
float angle = static_cast<float>(pos) * inv_freq;
// Compute sin and cos directly on float
sin_data[pos * cache_dim + j] = std::sin(angle);
cos_data[pos * cache_dim + j] = std::cos(angle);
sin_data[pos * cache_dim + j] = std::sin(angle) * table_factor;
cos_data[pos * cache_dim + j] = std::cos(angle) * table_factor;
}
}
......
......@@ -3,24 +3,24 @@
namespace infinicore::op {
common::OpDispatcher<Add::schema> &Add::dispatcher() {
static common::OpDispatcher<Add::schema> dispatcher_;
return dispatcher_;
};
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Add);
void Add::execute(Tensor c, Tensor a, Tensor b) {
Add::Add(Tensor c, const Tensor &a, const Tensor &b) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b);
infinicore::context::setDevice(c->device());
dispatcher().lookup(c->device().getType())(c, a, b);
INFINICORE_GRAPH_OP_DISPATCH(c->device().getType(), c, a, b);
}
Tensor add(Tensor a, Tensor b) {
void Add::execute(Tensor c, const Tensor &a, const Tensor &b) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(Add, c, a, b);
}
Tensor add(const Tensor &a, const Tensor &b) {
auto c = Tensor::empty(a->shape(), a->dtype(), a->device());
add_(c, a, b);
return c;
}
void add_(Tensor c, Tensor a, Tensor b) {
void add_(Tensor c, const Tensor &a, const Tensor &b) {
Add::execute(c, a, b);
}
......
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/add.hpp"
#include "infinicore/ops/common/cache.hpp"
#include <infiniop.h>
#include "../infiniop_impl.hpp"
namespace infinicore::op::add_impl::infiniop {
thread_local common::OpCache<size_t, infiniopAddDescriptor_t> caches(
100, // capacity
[](infiniopAddDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyAddDescriptor(desc));
desc = nullptr;
}
});
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Add, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, c, a, b;
};
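// Plan/run/cleanup contract as this refactor appears to use it: plan() is called once to
// build a PlannedMeta (cached descriptor, workspace, and GraphTensor snapshots of the
// operands), run() replays it on the current stream, and cleanup() frees the meta when
// the owning graph operator is destroyed.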
void calculate(Tensor c, Tensor a, Tensor b) {
void *plan(Tensor c, const Tensor &a, const Tensor &b) {
size_t seed = hash_combine(c, b, a);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, Add,
seed,
c->desc(), a->desc(), b->desc());
auto desc_opt = cache.get(seed);
infiniopAddDescriptor_t desc = nullptr;
INFINIOP_WORKSPACE_TENSOR(workspace, Add, descriptor);
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateAddDescriptor(
context::getInfiniopHandle(device), &desc,
c->desc(), a->desc(), b->desc()));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(c),
graph::GraphTensor(a),
graph::GraphTensor(b)};
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetAddWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(infiniopAdd(
desc, workspace->data(), workspace_size,
c->data(), a->data(), b->data(), context::getStream()));
planned->descriptor->desc,
planned->workspace->data(),
planned->workspace->numel(),
planned->c->data(),
planned->a->data(),
planned->b->data(),
context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
static bool registered = []() {
Add::dispatcher().registerAll(&calculate, false);
return true;
}();
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Add, &plan, &run, &cleanup);
} // namespace infinicore::op::add_impl::infiniop
#include "infinicore/ops/add_rms_norm.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(AddRMSNorm);
AddRMSNorm::AddRMSNorm(Tensor y, Tensor residual_out, const Tensor &a, const Tensor &b, const Tensor &weight, float epsilon) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, residual_out, a, b, weight);
INFINICORE_GRAPH_OP_DISPATCH(y->device().getType(), y, residual_out, a, b, weight, epsilon);
}
void AddRMSNorm::execute(Tensor y, Tensor residual_out, const Tensor &a, const Tensor &b, const Tensor &weight, float epsilon) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(AddRMSNorm, y, residual_out, a, b, weight, epsilon);
}
std::pair<Tensor, Tensor> add_rms_norm(const Tensor &a, const Tensor &b, const Tensor &weight, float epsilon) {
auto y = Tensor::empty(a->shape(), a->dtype(), a->device());
auto residual_out = Tensor::empty(a->shape(), a->dtype(), a->device());
add_rms_norm_(y, residual_out, a, b, weight, epsilon);
return std::make_pair(y, residual_out);
}
void add_rms_norm_(Tensor out, Tensor residual, const Tensor &a, const Tensor &b, const Tensor &weight, float epsilon) {
AddRMSNorm::execute(out, residual, a, b, weight, epsilon);
}
void add_rms_norm_inplace(Tensor input, Tensor residual, const Tensor &weight, float epsilon) {
add_rms_norm_(input, residual, input, residual, weight, epsilon);
}
} // namespace infinicore::op
#include "infinicore/ops/add_rms_norm.hpp"
#include "../infiniop_impl.hpp"
namespace infinicore::op::add_rms_norm_impl::infiniop {
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, AddRMSNorm, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, out, residual, a, b, weight;
float epsilon;
};
void *plan(Tensor y, Tensor residual_out, const Tensor &a, const Tensor &b, const Tensor &weight, float epsilon) {
size_t seed = hash_combine(y, residual_out, a, b, weight, epsilon);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, AddRMSNorm,
seed, y->desc(), residual_out->desc(),
a->desc(), b->desc(), weight->desc(), epsilon);
INFINIOP_WORKSPACE_TENSOR(workspace, AddRMSNorm, descriptor);
auto planned = new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(y),
graph::GraphTensor(residual_out),
graph::GraphTensor(a),
graph::GraphTensor(b),
graph::GraphTensor(weight),
epsilon};
return planned;
}
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(infiniopAddRMSNorm(
planned->descriptor->desc, planned->workspace->data(), planned->workspace->numel(),
planned->out->data(), planned->residual->data(), planned->a->data(), planned->b->data(), planned->weight->data(), context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(AddRMSNorm, &plan, &run, &cleanup);
} // namespace infinicore::op::add_rms_norm_impl::infiniop
#include "infinicore/ops/causal_softmax.hpp"
#include "../../utils.hpp"
#include <stdexcept>
namespace infinicore::op {
common::OpDispatcher<CausalSoftmax::schema> &CausalSoftmax::dispatcher() {
static common::OpDispatcher<CausalSoftmax::schema> dispatcher_;
return dispatcher_;
};
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(CausalSoftmax);
void CausalSoftmax::execute(Tensor output, Tensor input) {
CausalSoftmax::CausalSoftmax(Tensor output, const Tensor &input) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
infinicore::context::setDevice(output->device());
auto device_type = output->device().getType();
auto func = dispatcher().lookup(device_type);
if (func == nullptr) {
throw std::runtime_error("No CausalSoftmax implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
}
INFINICORE_GRAPH_OP_DISPATCH(output->device().getType(), output, input);
}
func(output, input);
void CausalSoftmax::execute(Tensor output, const Tensor &input) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(CausalSoftmax, output, input);
}
Tensor causal_softmax(Tensor input) {
Shape shape = input->shape();
auto output = Tensor::empty(shape, input->dtype(), input->device());
Tensor causal_softmax(const Tensor &input) {
auto output = Tensor::empty(input->shape(), input->dtype(), input->device());
causal_softmax_(output, input);
return output;
}
void causal_softmax_(Tensor output, Tensor input) {
void causal_softmax_(Tensor output, const Tensor &input) {
CausalSoftmax::execute(output, input);
}
} // namespace infinicore::op
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/causal_softmax.hpp"
#include "infinicore/ops/common/cache.hpp"
#include <infiniop.h>
#include "../infiniop_impl.hpp"
namespace infinicore::op::causal_softmax_impl::infiniop {
thread_local common::OpCache<size_t, infiniopCausalSoftmaxDescriptor_t> caches(
100, // capacity
[](infiniopCausalSoftmaxDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyCausalSoftmaxDescriptor(desc));
desc = nullptr;
}
});
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, CausalSoftmax, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, output, input;
};
void calculate(Tensor output, Tensor input) {
void *plan(Tensor output, const Tensor &input) {
size_t seed = hash_combine(output, input);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, CausalSoftmax,
seed, output->desc(), input->desc());
auto desc_opt = cache.get(seed);
infiniopCausalSoftmaxDescriptor_t desc = nullptr;
INFINIOP_WORKSPACE_TENSOR(workspace, CausalSoftmax, descriptor);
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateCausalSoftmaxDescriptor(
context::getInfiniopHandle(device), &desc,
output->desc(), input->desc()));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(output),
graph::GraphTensor(input)};
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetCausalSoftmaxWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(infiniopCausalSoftmax(
desc, workspace->data(), workspace_size,
output->data(), input->data(), context::getStream()));
planned->descriptor->desc,
planned->workspace->data(),
planned->workspace->numel(),
planned->output->data(),
planned->input->data(),
context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
static bool registered = []() {
CausalSoftmax::dispatcher().registerAll(&calculate, false);
return true;
}();
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(CausalSoftmax, &plan, &run, &cleanup);
} // namespace infinicore::op::causal_softmax_impl::infiniop
#include "infinicore/ops/dequantize_awq.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(DequantizeAWQ);
DequantizeAWQ::DequantizeAWQ(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zeros) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(x, x_packed, x_scale, x_zeros);
INFINICORE_GRAPH_OP_DISPATCH(x->device().getType(), x, x_packed, x_scale, x_zeros);
}
void DequantizeAWQ::execute(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zeros) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(DequantizeAWQ, x, x_packed, x_scale, x_zeros);
}
void dequantize_awq_(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zeros) {
DequantizeAWQ::execute(x, x_packed, x_scale, x_zeros);
}
} // namespace infinicore::op