Unverified Commit 8d09630a authored by gongchensu's avatar gongchensu Committed by GitHub
Browse files

Merge branch 'demo131' into Issue/862

parents ab52dead 012df56c
...@@ -5,10 +5,10 @@ ...@@ -5,10 +5,10 @@
#include "../context_impl.hpp" #include "../context_impl.hpp"
namespace infinicore { namespace infinicore {
class DeviceCachingAllocator : public MemoryAllocator { class StreamOrderedAllocator : public MemoryAllocator {
public: public:
explicit DeviceCachingAllocator(Device device); explicit StreamOrderedAllocator(Device device);
~DeviceCachingAllocator() = default; ~StreamOrderedAllocator() = default;
std::byte *allocate(size_t size) override; std::byte *allocate(size_t size) override;
void deallocate(std::byte *ptr) override; void deallocate(std::byte *ptr) override;
......
#include "context_impl.hpp" #include "context_impl.hpp"
#include "internal.hpp"
#include "../utils.hpp" #include "../utils.hpp"
...@@ -29,16 +30,18 @@ Runtime *ContextImpl::getCurrentRuntime() { ...@@ -29,16 +30,18 @@ Runtime *ContextImpl::getCurrentRuntime() {
return current_runtime_; return current_runtime_;
} }
// Returns the runtime for CPU device index 0.
// NOTE(review): assumes slot [CPU][0] has already been populated — confirm
// the CPU runtime is constructed before any caller reaches this.
Runtime *ContextImpl::getCpuRuntime() {
    return runtime_table_[int(Device::Type::CPU)][0].get();
}
void ContextImpl::setDevice(Device device) { void ContextImpl::setDevice(Device device) {
if (device == getCurrentRuntime()->device()) { if (device == getCurrentRuntime()->device()) {
// Do nothing if the device is already set. // Do nothing if the device is already set.
return; return;
} }
thread_local bool warn_switch_runtime = false;
if (getCurrentRuntime()->isGraphRecording() && !warn_switch_runtime) {
spdlog::warn("Switching device runtime during graph recording may break the graph!");
warn_switch_runtime = true;
}
if (runtime_table_[int(device.getType())][device.getIndex()] == nullptr) { if (runtime_table_[int(device.getType())][device.getIndex()] == nullptr) {
// Lazy initialization of runtime if never set before. // Lazy initialization of runtime if never set before.
runtime_table_[int(device.getType())][device.getIndex()] = std::unique_ptr<Runtime>(new Runtime(device)); runtime_table_[int(device.getType())][device.getIndex()] = std::unique_ptr<Runtime>(new Runtime(device));
...@@ -100,11 +103,8 @@ infinirtStream_t getStream() { ...@@ -100,11 +103,8 @@ infinirtStream_t getStream() {
} }
infiniopHandle_t getInfiniopHandle(Device device) { infiniopHandle_t getInfiniopHandle(Device device) {
if (device.getType() == Device::Type::CPU) {
return ContextImpl::singleton().getCpuRuntime()->infiniopHandle();
}
if (device != getDevice()) { if (device != getDevice()) {
throw std::runtime_error("Requested device doesn't match current runtime."); setDevice(device);
} }
return ContextImpl::singleton().getCurrentRuntime()->infiniopHandle(); return ContextImpl::singleton().getCurrentRuntime()->infiniopHandle();
} }
...@@ -122,7 +122,8 @@ std::shared_ptr<Memory> allocateMemory(size_t size) { ...@@ -122,7 +122,8 @@ std::shared_ptr<Memory> allocateMemory(size_t size) {
} }
std::shared_ptr<Memory> allocateHostMemory(size_t size) { std::shared_ptr<Memory> allocateHostMemory(size_t size) {
return ContextImpl::singleton().getCpuRuntime()->allocateMemory(size); setDevice(Device::cpu());
return allocateMemory(size);
} }
std::shared_ptr<Memory> allocatePinnedHostMemory(size_t size) { std::shared_ptr<Memory> allocatePinnedHostMemory(size_t size) {
...@@ -142,7 +143,8 @@ void memcpyD2D(void *dst, const void *src, size_t size, bool async) { ...@@ -142,7 +143,8 @@ void memcpyD2D(void *dst, const void *src, size_t size, bool async) {
} }
void memcpyH2H(void *dst, const void *src, size_t size) { void memcpyH2H(void *dst, const void *src, size_t size) {
return ContextImpl::singleton().getCpuRuntime()->memcpyD2D(dst, src, size); setDevice(Device::cpu());
return ContextImpl::singleton().getCurrentRuntime()->memcpyD2D(dst, src, size);
} }
// Timing API implementations // Timing API implementations
...@@ -178,6 +180,27 @@ void streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event) { ...@@ -178,6 +180,27 @@ void streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event) {
ContextImpl::singleton().getCurrentRuntime()->streamWaitEvent(stream, event); ContextImpl::singleton().getCurrentRuntime()->streamWaitEvent(stream, event);
} }
// Reports whether the thread's current runtime has an open graph-recording
// session.
bool isGraphRecording() {
    auto *runtime = ContextImpl::singleton().getCurrentRuntime();
    return runtime->isGraphRecording();
}
// Opens a graph-recording session on the thread's current runtime.
void startGraphRecording() {
    auto *runtime = ContextImpl::singleton().getCurrentRuntime();
    runtime->startGraphRecording();
}
// Records one operator into the graph being captured on the thread's
// current runtime.
void addGraphOperator(std::shared_ptr<graph::GraphOperator> op) {
    auto *runtime = ContextImpl::singleton().getCurrentRuntime();
    runtime->addGraphOperator(op);
}
// Closes the recording session on the thread's current runtime and returns
// the finished graph (nullptr if none was being recorded).
std::shared_ptr<graph::Graph> stopGraphRecording() {
    auto *runtime = ContextImpl::singleton().getCurrentRuntime();
    return runtime->stopGraphRecording();
}
// Wraps an existing blob in a fresh Memory handle owned by the blob's
// device runtime. Switches the thread's current runtime to the blob's
// device first, then delegates to that runtime.
std::shared_ptr<Memory> reinstantiateBlob(std::shared_ptr<Memory> blob) {
    setDevice(blob->device());
    return ContextImpl::singleton().getCurrentRuntime()->reinstantiateBlob(blob);
}
} // namespace context } // namespace context
} // namespace infinicore } // namespace infinicore
...@@ -19,8 +19,6 @@ protected: ...@@ -19,8 +19,6 @@ protected:
public: public:
Runtime *getCurrentRuntime(); Runtime *getCurrentRuntime();
Runtime *getCpuRuntime();
void setDevice(Device); void setDevice(Device);
size_t getDeviceCount(Device::Type type); size_t getDeviceCount(Device::Type type);
......
#pragma once

#include "infinicore/device.hpp"
#include "infinicore/memory.hpp"

#include "infinicore/graph/graph.hpp"

namespace infinicore::context {

// Wraps an existing device blob in a new Memory handle whose deleter
// returns the storage to the owning runtime's allocator.
// NOTE(review): declared here separately from the main context header —
// confirm the intended include layering.
std::shared_ptr<Memory> reinstantiateBlob(std::shared_ptr<Memory> blob);

};
...@@ -2,19 +2,20 @@ ...@@ -2,19 +2,20 @@
#include "../../utils.hpp" #include "../../utils.hpp"
#include "../allocators/device_caching_allocator.hpp"
#include "../allocators/device_pinned_allocator.hpp" #include "../allocators/device_pinned_allocator.hpp"
#include "../allocators/host_allocator.hpp" #include "../allocators/host_allocator.hpp"
#include "../allocators/pinnable_block_allocator.hpp"
#include "../allocators/stream_ordered_allocator.hpp"
namespace infinicore { namespace infinicore {
Runtime::Runtime(Device device) : device_(device) { Runtime::Runtime(Device device) : device_(device), graph_manager_(std::make_unique<graph::GraphManager>()) {
activate(); activate();
INFINICORE_CHECK_ERROR(infinirtStreamCreate(&stream_)); INFINICORE_CHECK_ERROR(infinirtStreamCreate(&stream_));
INFINICORE_CHECK_ERROR(infiniopCreateHandle(&infiniop_handle_)); INFINICORE_CHECK_ERROR(infiniopCreateHandle(&infiniop_handle_));
if (device_.getType() == Device::Type::CPU) { if (device_.getType() == Device::Type::CPU) {
device_memory_allocator_ = std::make_unique<HostAllocator>(); device_memory_allocator_ = std::make_unique<PinnableBlockAllocator>(device);
} else { } else {
device_memory_allocator_ = std::make_unique<DeviceCachingAllocator>(device); device_memory_allocator_ = std::make_unique<PinnableBlockAllocator>(device);
pinned_host_memory_allocator_ = std::make_unique<DevicePinnedHostAllocator>(device); pinned_host_memory_allocator_ = std::make_unique<DevicePinnedHostAllocator>(device);
} }
} }
...@@ -76,6 +77,15 @@ std::shared_ptr<Memory> Runtime::allocatePinnedHostMemory(size_t size) { ...@@ -76,6 +77,15 @@ std::shared_ptr<Memory> Runtime::allocatePinnedHostMemory(size_t size) {
true); true);
} }
std::shared_ptr<Memory> Runtime::reinstantiateBlob(std::shared_ptr<Memory> blob) {
device_memory_allocator_.get()->mark_in_use_(blob->data(), true);
return std::make_shared<Memory>(
blob->data(), blob->size(), device_,
[alloc = device_memory_allocator_.get()](std::byte *p) {
alloc->deallocate(p);
});
}
void Runtime::memcpyH2D(void *dst, const void *src, size_t size, bool async) { void Runtime::memcpyH2D(void *dst, const void *src, size_t size, bool async) {
if (async) { if (async) {
INFINICORE_CHECK_ERROR(infinirtMemcpyAsync(dst, src, size, INFINIRT_MEMCPY_H2D, stream_)); INFINICORE_CHECK_ERROR(infinirtMemcpyAsync(dst, src, size, INFINIRT_MEMCPY_H2D, stream_));
...@@ -144,6 +154,25 @@ void Runtime::streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event) { ...@@ -144,6 +154,25 @@ void Runtime::streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event) {
INFINICORE_CHECK_ERROR(infinirtStreamWaitEvent(stream, event)); INFINICORE_CHECK_ERROR(infinirtStreamWaitEvent(stream, event));
} }
// True while this runtime's graph manager has an open recording session.
bool Runtime::isGraphRecording() const {
    return graph_manager_->is_recording();
}
// Begins graph recording. Pin mode is enabled first so allocations made
// while recording keep stable addresses for replay — TODO confirm against
// the allocator's pin-mode semantics.
void Runtime::startGraphRecording() {
    device_memory_allocator_->set_pin_mode(true);
    graph_manager_->start_recording();
}
// Forwards a recorded operator to this runtime's graph manager.
void Runtime::addGraphOperator(std::shared_ptr<graph::GraphOperator> op) {
    graph_manager_->add_operator(op);
}
// Ends graph recording and returns the finished graph. The allocator's pin
// mode is switched off only after the manager has closed the session.
std::shared_ptr<graph::Graph> Runtime::stopGraphRecording() {
    auto recorded = graph_manager_->stop_recording();
    device_memory_allocator_->set_pin_mode(false);
    return recorded;
}
std::string Runtime::toString() const { std::string Runtime::toString() const {
return fmt::format("Runtime({})", device_.toString()); return fmt::format("Runtime({})", device_.toString());
} }
......
#pragma once #pragma once
#include "../allocators/memory_allocator.hpp" #include "../allocators/pinnable_block_allocator.hpp"
#include "infinicore/context/context.hpp" #include "infinicore/context/context.hpp"
#include "../../graph/graph_manager.hpp"
#include <infiniop.h> #include <infiniop.h>
#include <infinirt.h> #include <infinirt.h>
...@@ -13,8 +16,9 @@ private: ...@@ -13,8 +16,9 @@ private:
Device device_; Device device_;
infinirtStream_t stream_; infinirtStream_t stream_;
infiniopHandle_t infiniop_handle_; infiniopHandle_t infiniop_handle_;
std::unique_ptr<MemoryAllocator> device_memory_allocator_; std::unique_ptr<PinnableBlockAllocator> device_memory_allocator_;
std::unique_ptr<MemoryAllocator> pinned_host_memory_allocator_; std::unique_ptr<MemoryAllocator> pinned_host_memory_allocator_;
std::unique_ptr<graph::GraphManager> graph_manager_;
protected: protected:
Runtime(Device device); Runtime(Device device);
...@@ -33,6 +37,7 @@ public: ...@@ -33,6 +37,7 @@ public:
std::shared_ptr<Memory> allocateMemory(size_t size); std::shared_ptr<Memory> allocateMemory(size_t size);
std::shared_ptr<Memory> allocatePinnedHostMemory(size_t size); std::shared_ptr<Memory> allocatePinnedHostMemory(size_t size);
std::shared_ptr<Memory> reinstantiateBlob(std::shared_ptr<Memory> blob);
void memcpyH2D(void *dst, const void *src, size_t size, bool async = true); void memcpyH2D(void *dst, const void *src, size_t size, bool async = true);
void memcpyD2H(void *dst, const void *src, size_t size); void memcpyD2H(void *dst, const void *src, size_t size);
...@@ -48,6 +53,12 @@ public: ...@@ -48,6 +53,12 @@ public:
float elapsedTime(infinirtEvent_t start, infinirtEvent_t end); float elapsedTime(infinirtEvent_t start, infinirtEvent_t end);
void streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event); void streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event);
// Graph
bool isGraphRecording() const;
void startGraphRecording();
void addGraphOperator(std::shared_ptr<graph::GraphOperator> op);
std::shared_ptr<graph::Graph> stopGraphRecording();
std::string toString() const; std::string toString() const;
friend class ContextImpl; friend class ContextImpl;
......
...@@ -41,6 +41,8 @@ std::string Device::toString(const Type &type) { ...@@ -41,6 +41,8 @@ std::string Device::toString(const Type &type) {
return "KUNLUN"; return "KUNLUN";
case Type::HYGON: case Type::HYGON:
return "HYGON"; return "HYGON";
case Type::ALI:
return "ALI";
case Type::COUNT: case Type::COUNT:
return "COUNT"; return "COUNT";
default: default:
......
#include "graph_manager.hpp"
#include "../utils.hpp"
#include "infinicore/context/context.hpp"
#include <infinirt.h>
namespace infinicore::graph {
/* =========================
* GraphTensor
* ========================= */
// Builds a graph-capture view of a tensor from its underlying blob via
// to_blob_(). NOTE(review): to_blob_() semantics inferred from the name
// (stable storage handle for replay) — confirm against Tensor.
GraphTensor::GraphTensor(const Tensor &tensor) : Tensor(tensor->to_blob_()) {
}
/* =========================
* GraphOperator
* ========================= */
// Replays the operator by invoking its runner with the plan-time metadata.
void DispatchableGraphOperator::run() const {
    runner_(planned_meta_);
}
// Releases plan-time state through the registered deleter, if one was set.
DispatchableGraphOperator::~DispatchableGraphOperator() {
    if (deleter_) {
        deleter_(&planned_meta_);
    }
}
/* =========================
* Graph
* ========================= */
// Device-side (infinirt) representation of a captured graph.
// The handles MUST be zero-initialized: instantiate() can return early
// before capture or instantiation completes, in which case the destructor's
// null checks below would otherwise read indeterminate pointers (UB) and
// could destroy garbage handles.
struct Graph::DeviceGraph {
    infinirtGraph_t graph = nullptr;
    infinirtGraphExec_t exec = nullptr;
    infinirtGraphNode_t node = nullptr;
    std::vector<char> log_buffer; // diagnostics buffer for instantiation errors

    DeviceGraph() {
        log_buffer.resize(4 * 1024);
    }

    ~DeviceGraph() {
        if (exec) {
            infinirtGraphExecDestroy(exec);
        }
        if (graph) {
            infinirtGraphDestroy(graph);
        }
    }

    // Launches the instantiated graph on the current context stream.
    // NOTE: "Luanch" is the runtime API's own spelling [sic].
    void launch() {
        INFINICORE_CHECK_ERROR(infinirtGraphLuanch(exec, context::getStream()));
    }
};
// Default graph: empty operator list, no device graph until instantiate().
Graph::Graph() {
}
// Executes the graph: prefers the instantiated device graph when one
// exists; otherwise replays the recorded operators one by one in insertion
// order.
void Graph::run() const {
    const bool has_device_exec = device_graph_ && device_graph_->exec != nullptr;
    if (has_device_exec) {
        device_graph_->launch();
        return;
    }
    for (const auto &recorded_op : op_list_) {
        recorded_op->run();
    }
}
// Appends an operator; replay order is insertion order.
void Graph::add_operator(std::shared_ptr<GraphOperator> new_op) {
    op_list_.emplace_back(new_op);
}
void Graph::instantiate() {
// Reset device graph
device_graph_ = std::make_unique<DeviceGraph>();
// warmup
for (size_t iter = 0; iter < 5; ++iter) {
this->run();
}
infinicore::context::syncStream();
if (infinirtStreamBeginCapture(
context::getStream(),
INFINIRT_STREAM_CAPTURE_MODE_RELAXED)
!= INFINI_STATUS_SUCCESS) {
return;
}
// Run and record
this->run();
if (infinirtStreamEndCapture(
context::getStream(),
&device_graph_.get()->graph)
!= INFINI_STATUS_SUCCESS) {
return;
}
if (infinirtGraphInstantiate(
&device_graph_.get()->exec,
device_graph_.get()->graph,
&device_graph_.get()->node,
device_graph_.get()->log_buffer.data(),
device_graph_.get()->log_buffer.size())
!= INFINI_STATUS_SUCCESS) {
static bool warned_once = false;
if (!warned_once) {
warned_once = true;
spdlog::warn("Fail to instantiate device graph: {}", std::string(device_graph_.get()->log_buffer.data()));
}
}
}
Graph::~Graph() = default;
/* =========================
* GraphManager
* ========================= */
// Session flag, flipped by start_recording()/stop_recording().
bool GraphManager::is_recording() const {
    return recording_;
}
// Opens a new recording session. A session already in progress is
// discarded (its partially-built graph is replaced) with a warning.
void GraphManager::start_recording() {
    if (recording_) {
        spdlog::warn("Graph is already recording. Previous recording will be dropped.");
    }
    graph_ = std::make_shared<Graph>();
    recording_ = true;
}
// Records one operator into the active session. It is a programming error
// to call this outside start_recording()/stop_recording().
void GraphManager::add_operator(std::shared_ptr<GraphOperator> recorded_op) {
    INFINICORE_ASSERT(is_recording());
    graph_->add_operator(recorded_op);
}
// Closes the active session and hands the finished graph to the caller.
// Returns nullptr (with a warning) if no session was open.
std::shared_ptr<Graph> GraphManager::stop_recording() {
    if (!recording_) {
        spdlog::warn("Graph is not recording. Please start recording first.");
        return nullptr;
    }
    recording_ = false;
#ifdef USE_INFINIRT_GRAPH
    // Best-effort lowering to a device graph; failures fall back to replay.
    graph_->instantiate();
#endif
    auto finished = std::exchange(graph_, nullptr);
    return finished;
}
} // namespace infinicore::graph
#pragma once

#include "infinicore/graph/graph.hpp"

#include <memory>
#include <vector>

namespace infinicore::graph {

// Accumulates GraphOperators into a Graph between start_recording() and
// stop_recording(). No internal synchronization — each Runtime owns its
// own instance.
class GraphManager {
public:
    GraphManager() = default;
    ~GraphManager() = default;

    // True while a recording session is open.
    bool is_recording() const;

    // Opens a new session; a previously unfinished one is discarded.
    void start_recording();

    // Appends an operator to the graph being recorded (requires an
    // active session).
    void add_operator(std::shared_ptr<GraphOperator> op);

    // Closes the session and returns the finished graph, or nullptr if
    // no session was active.
    std::shared_ptr<Graph> stop_recording();

private:
    std::shared_ptr<Graph> graph_; // graph under construction; null otherwise
    bool recording_ = false;       // true while a session is open
};

} // namespace infinicore::graph
...@@ -43,6 +43,13 @@ Embedding::Embedding(size_t num_embeddings, ...@@ -43,6 +43,13 @@ Embedding::Embedding(size_t num_embeddings,
} }
Tensor Embedding::forward(const Tensor &indices) const { Tensor Embedding::forward(const Tensor &indices) const {
// TODO: Implement on-device embedding for all devices, then remove the condition and the classic approach
auto device_type = device_.getType();
if (device_type == Device::Type::NVIDIA || device_type == Device::Type::ILUVATAR || device_type == Device::Type::METAX || device_type == Device::Type::MOORE || device_type == Device::Type::ALI) {
// Use op::embedding which supports device-side input and batch dimension
return op::embedding(indices->contiguous()->to(device_), weight_);
}
// Get the shape of indices // Get the shape of indices
auto indices_shape = indices->shape(); auto indices_shape = indices->shape();
......
#include "infinicore/nn/linear.hpp" #include "infinicore/nn/linear.hpp"
#include "../utils.hpp" #include "../utils.hpp"
#include "infinicore/ops.hpp" #include "infinicore/ops.hpp"
#include "infinicore/ops/distributed/allreduce.hpp"
#include "infinicore/ops/linear.hpp" #include "infinicore/ops/linear.hpp"
#include "infinicore/ops/linear_w8a8i8.hpp"
#include <optional> #include <optional>
#include <spdlog/spdlog.h> #include <spdlog/spdlog.h>
...@@ -17,21 +19,46 @@ BaseLinear::BaseLinear(size_t in_features, size_t out_features, bool bias, ...@@ -17,21 +19,46 @@ BaseLinear::BaseLinear(size_t in_features, size_t out_features, bool bias,
device_ = device; device_ = device;
} }
Tensor BaseLinear::compute_linear(Tensor &input) const { BaseLinear::BaseLinear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias,
const DataType &dtype, const Device &device)
: in_features_(in_features),
out_features_(out_features),
quantization_(quantization),
has_bias_(bias),
dtype_(dtype) {
device_ = device;
}
// Ensure input is contiguous before creating views (required for matmul) Tensor BaseLinear::compute_linear(Tensor &input) const {
// This prevents hanging when input tensor has non-contiguous memory layout switch (this->quantization_->get_quant_scheme()) {
Tensor input_contiguous = input->is_contiguous() ? input : input->contiguous(); case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: {
Tensor input_contiguous = input->is_contiguous() ? input : input->contiguous();
// Use ops::linear_ directly to match Python backend's exact code path Tensor weight_packed_tensor = static_cast<const Tensor &>(weight_);
// This ensures identical computation and numerical results Tensor weight_scale_tensor = static_cast<const Tensor &>(weight_scale_);
// Parameter inherits from Tensor, so we cast to Tensor explicitly // weight_packed should be transposed and non-contiguous.
Tensor weight_tensor = static_cast<const Tensor &>(weight_); std::optional<Tensor> bias_opt = has_bias_ ? std::make_optional<Tensor>(static_cast<const Tensor &>(bias_)) : std::nullopt;
std::optional<Tensor> bias_opt = has_bias_ ? std::make_optional<Tensor>(static_cast<const Tensor &>(bias_)) : std::nullopt;
auto output = infinicore::op::linear(input_contiguous->contiguous(), weight_tensor->contiguous(), bias_opt); auto output = infinicore::op::linear_w8a8i8(input_contiguous->contiguous(), weight_packed_tensor, weight_scale_tensor, bias_opt);
return output; return output;
} }
default: {
// Ensure input is contiguous before creating views (required for matmul)
// This prevents hanging when input tensor has non-contiguous memory layout
Tensor input_contiguous = input->is_contiguous() ? input : input->contiguous();
// Use ops::linear_ directly to match Python backend's exact code path
// This ensures identical computation and numerical results
// Parameter inherits from Tensor, so we cast to Tensor explicitly
Tensor weight_tensor = static_cast<const Tensor &>(weight_);
std::optional<Tensor> bias_opt = has_bias_ ? std::make_optional<Tensor>(static_cast<const Tensor &>(bias_)) : std::nullopt;
auto output = infinicore::op::linear(input_contiguous->contiguous(), weight_tensor->contiguous(), bias_opt);
return output;
}
}
} // namespace infinicore::nn
Tensor BaseLinear::forward(Tensor &input) const { Tensor BaseLinear::forward(Tensor &input) const {
return compute_linear(input); return compute_linear(input);
...@@ -70,6 +97,43 @@ Linear::Linear(size_t in_features, size_t out_features, bool bias, ...@@ -70,6 +97,43 @@ Linear::Linear(size_t in_features, size_t out_features, bool bias,
// in_features, out_features, bias, static_cast<int>(dtype_)); // in_features, out_features, bias, static_cast<int>(dtype_));
} }
// Quantization-aware Linear constructor.
// Registers parameters according to the quantization scheme: for
// COMPRESSED_TENSOR_W8A8I8 the weight is stored as I8 with a per-row F32
// scale; otherwise the classic dtype-typed weight/bias layout is used.
//
// BUGFIX: the base-class initializer previously passed the member `device_`,
// which is read before the base subobject is initialized (indeterminate
// value). Pass the `device` parameter instead; `device_ = device;` below is
// kept so the final state is unchanged.
Linear::Linear(size_t in_features, size_t out_features,
               std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias,
               const DataType &dtype, const Device &device)
    : BaseLinear(in_features, out_features, quantization, bias, dtype, device) {
    device_ = device;

    switch (this->quantization_->get_quant_scheme()) {
    case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: {
        // Packed int8 weights plus one F32 scale per output row.
        INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, infinicore::DataType::I8, device));
        INFINICORE_NN_PARAMETER_INIT(weight_scale, ({out_features, 1}, infinicore::DataType::F32, device));

        if (bias) {
            INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device));
        } else {
            bias_ = Parameter();
        }
        break;
    }
    default: {
        // Initialize parameters using macro
        INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device));

        // Register bias parameter if requested
        if (bias) {
            INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device));
        } else {
            bias_ = Parameter(); // Default constructed empty parameter
        }
        // SPDLOG_DEBUG("Created Linear module: in_features={}, out_features={}, bias={}, dtype={}",
        //              in_features, out_features, bias, static_cast<int>(dtype_));
        break;
    }
    }
}
Tensor Linear::forward(Tensor &input) const { Tensor Linear::forward(Tensor &input) const {
return BaseLinear::forward(input); return BaseLinear::forward(input);
} }
...@@ -102,9 +166,45 @@ ColumnParallelLinear::ColumnParallelLinear(size_t in_features, size_t out_featur ...@@ -102,9 +166,45 @@ ColumnParallelLinear::ColumnParallelLinear(size_t in_features, size_t out_featur
} else { } else {
bias_ = Parameter(); // Default constructed empty parameter bias_ = Parameter(); // Default constructed empty parameter
} }
}
// SPDLOG_DEBUG("Created ColumnParallelLinear module: in_features={}, out_features={}, bias={}, dtype={}", ColumnParallelLinear::ColumnParallelLinear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias,
// in_features, out_features, bias, static_cast<int>(dtype_)); const DataType &dtype, const Device &device,
Size tp_rank, Size tp_size)
: BaseLinear(in_features, out_features, quantization, bias, dtype, device_),
tp_rank_(tp_rank),
tp_size_(tp_size) {
device_ = device;
switch (this->quantization_->get_quant_scheme()) {
case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: {
INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, infinicore::DataType::I8, device, 0, tp_rank_, tp_size_));
INFINICORE_NN_PARAMETER_INIT(weight_scale, ({out_features, 1}, infinicore::DataType::F32, device, 0, tp_rank_, tp_size_));
if (bias) {
INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device, 0, 0, 1));
} else {
bias_ = Parameter();
}
break;
}
default: {
// Initialize parameters using macro
INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device,
0, tp_rank_, tp_size_));
// Register bias parameter if requested
if (bias) {
INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device,
0, tp_rank_, tp_size_));
} else {
bias_ = Parameter(); // Default constructed empty parameter
}
break;
}
}
} }
Tensor ColumnParallelLinear::forward(Tensor &input) const { Tensor ColumnParallelLinear::forward(Tensor &input) const {
...@@ -138,26 +238,53 @@ RowParallelLinear::RowParallelLinear(size_t in_features, size_t out_features, bo ...@@ -138,26 +238,53 @@ RowParallelLinear::RowParallelLinear(size_t in_features, size_t out_features, bo
} else { } else {
bias_ = Parameter(); // Default constructed empty parameter bias_ = Parameter(); // Default constructed empty parameter
} }
}
// SPDLOG_DEBUG("Created RowParallelLinear module: in_features={}, out_features={}, bias={}, dtype={}", RowParallelLinear::RowParallelLinear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias,
// in_features, out_features, bias, static_cast<int>(dtype_)); const DataType &dtype, const Device &device,
Size tp_rank, Size tp_size, infinicclComm_t communicator)
: BaseLinear(in_features, out_features, quantization, bias, dtype, device_),
tp_rank_(tp_rank),
tp_size_(tp_size), communicator_(communicator) {
device_ = device;
switch (this->quantization_->get_quant_scheme()) {
case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: {
INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, infinicore::DataType::I8, device, 1, tp_rank_, tp_size_));
INFINICORE_NN_PARAMETER_INIT(weight_scale, ({out_features, 1}, infinicore::DataType::F32, device, 0, 0, 1));
if (bias) {
INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device, 0, tp_rank_, tp_size_));
} else {
bias_ = Parameter();
}
break;
}
default: {
// Initialize parameters using macro
INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device,
1, tp_rank_, tp_size_));
// Register bias parameter if requested
if (bias && (0 == tp_rank_)) {
INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device, 0, 0, 1));
} else {
bias_ = Parameter(); // Default constructed empty parameter
}
// SPDLOG_DEBUG("Created RowParallelLinear module: in_features={}, out_features={}, bias={}, dtype={}",
// in_features, out_features, bias, static_cast<int>(dtype_));
break;
}
}
} }
Tensor RowParallelLinear::forward(Tensor &input) const { Tensor RowParallelLinear::forward(Tensor &input) const {
auto output = BaseLinear::forward(input); auto output = BaseLinear::forward(input);
if ((tp_size_ > 1) && (communicator_ != nullptr)) { if ((tp_size_ > 1) && (communicator_ != nullptr)) {
op::distributed::allreduce_(output, output, INFINICCL_SUM, communicator_);
Size count = output->numel();
DataType type = output->dtype();
infinirtStream_t stream = infinicore::context::getStream();
INFINICORE_CHECK_ERROR(infinicclAllReduce(output->data(), output->data(), count, static_cast<infiniDtype_t>(static_cast<int>(type)),
INFINICCL_SUM, communicator_, stream));
INFINICORE_CHECK_ERROR(infinirtStreamSynchronize(stream));
// RUN_INFINI(infinirtStreamSynchronize(stream));
} }
return output; return output;
} }
......
...@@ -21,6 +21,25 @@ Tensor RMSNorm::forward(const Tensor &x) const { ...@@ -21,6 +21,25 @@ Tensor RMSNorm::forward(const Tensor &x) const {
return op::rms_norm(x, weight_, static_cast<float>(eps_)); return op::rms_norm(x, weight_, static_cast<float>(eps_));
} }
// In-place residual + RMSNorm step.
// First call (no residual yet): the raw input becomes the residual and x is
// normalized out-of-place. Subsequent calls: devices with a fused
// add+rms_norm kernel take the single-kernel path; others add into the
// residual, then normalize x from it.
void RMSNorm::forward_inplace(Tensor &x, Tensor &residual) const {
    const float eps = static_cast<float>(eps_);

    if (!residual) {
        residual = x;
        x = op::rms_norm(x, weight_, eps);
        return;
    }

    bool has_fused_kernel;
    switch (device_.getType()) {
    case Device::Type::CPU:
    case Device::Type::NVIDIA:
    case Device::Type::ILUVATAR:
    case Device::Type::METAX:
    case Device::Type::MOORE:
    case Device::Type::ALI:
        has_fused_kernel = true;
        break;
    default:
        has_fused_kernel = false;
        break;
    }

    if (has_fused_kernel) {
        op::add_rms_norm_inplace(x, residual, weight_, eps);
    } else {
        op::add_(residual, x, residual);
        op::rms_norm_(x, residual, weight_, eps);
    }
}
std::string RMSNorm::extra_repr() const { std::string RMSNorm::extra_repr() const {
return "RMSNorm(normalized_shape=" + std::to_string(normalized_shape_) + ", eps=" + std::to_string(eps_) + ", dtype=" + std::to_string(static_cast<int>(dtype_)) + ")"; return "RMSNorm(normalized_shape=" + std::to_string(normalized_shape_) + ", eps=" + std::to_string(eps_) + ", dtype=" + std::to_string(static_cast<int>(dtype_)) + ")";
} }
......
...@@ -16,12 +16,14 @@ RoPE::RoPE(size_t head_dim, ...@@ -16,12 +16,14 @@ RoPE::RoPE(size_t head_dim,
double theta, double theta,
Algo algo, Algo algo,
const DataType &dtype, const DataType &dtype,
const Device &device) const Device &device,
std::shared_ptr<ScalingConfig> scaling)
: head_dim_(head_dim), : head_dim_(head_dim),
max_seq_len_(max_seq_len), max_seq_len_(max_seq_len),
theta_(theta), theta_(theta),
algo_(algo), algo_(algo),
dtype_(dtype) { dtype_(dtype),
scaling_(scaling) {
if (head_dim % 2 != 0) { if (head_dim % 2 != 0) {
throw std::invalid_argument("head_dim must be even for RoPE, got " + std::to_string(head_dim)); throw std::invalid_argument("head_dim must be even for RoPE, got " + std::to_string(head_dim));
} }
...@@ -54,14 +56,30 @@ void RoPE::initialize_cache() { ...@@ -54,14 +56,30 @@ void RoPE::initialize_cache() {
for (size_t j = 0; j < cache_dim; j++) { for (size_t j = 0; j < cache_dim; j++) {
// GPT-J style inverse frequency: theta^(-2j/head_dim) // GPT-J style inverse frequency: theta^(-2j/head_dim)
// Compute directly in float to avoid double->float casting // Compute directly in float to avoid double->float casting
float inv_freq = 1.0f / std::pow(static_cast<float>(theta_), 2.0f * static_cast<float>(j) / static_cast<float>(head_dim_)); float inv_freq;
float table_factor = 1.0f;
if (scaling_ == nullptr) {
inv_freq = 1.0f / std::pow(static_cast<float>(theta_), 2.0f * static_cast<float>(j) / static_cast<float>(head_dim_));
} else if (scaling_->type() == ScalingType::LONGROPE) {
std::shared_ptr<LongRopeConfig> lr = std::dynamic_pointer_cast<LongRopeConfig>(scaling_);
table_factor = lr->factor();
float _ext;
if (pos < lr->original_max_position_embeddings()) {
_ext = lr->short_factor()[j];
} else {
_ext = lr->long_factor()[j];
}
inv_freq = 1.0f / (_ext * std::pow(static_cast<float>(theta_), 2.0f * static_cast<float>(j) / static_cast<float>(head_dim_)));
} else {
inv_freq = 1.0f / std::pow(static_cast<float>(theta_), 2.0f * static_cast<float>(j) / static_cast<float>(head_dim_));
}
// Compute angle: position * inverse_frequency // Compute angle: position * inverse_frequency
float angle = static_cast<float>(pos) * inv_freq; float angle = static_cast<float>(pos) * inv_freq;
// Compute sin and cos directly on float // Compute sin and cos directly on float
sin_data[pos * cache_dim + j] = std::sin(angle); sin_data[pos * cache_dim + j] = std::sin(angle) * table_factor;
cos_data[pos * cache_dim + j] = std::cos(angle); cos_data[pos * cache_dim + j] = std::cos(angle) * table_factor;
} }
} }
......
...@@ -3,24 +3,24 @@ ...@@ -3,24 +3,24 @@
namespace infinicore::op { namespace infinicore::op {
common::OpDispatcher<Add::schema> &Add::dispatcher() { INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Add);
static common::OpDispatcher<Add::schema> dispatcher_;
return dispatcher_;
};
void Add::execute(Tensor c, Tensor a, Tensor b) { Add::Add(Tensor c, const Tensor &a, const Tensor &b) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b); INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b);
infinicore::context::setDevice(c->device()); INFINICORE_GRAPH_OP_DISPATCH(c->device().getType(), c, a, b);
dispatcher().lookup(c->device().getType())(c, a, b);
} }
Tensor add(Tensor a, Tensor b) { void Add::execute(Tensor c, const Tensor &a, const Tensor &b) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(Add, c, a, b);
}
Tensor add(const Tensor &a, const Tensor &b) {
auto c = Tensor::empty(a->shape(), a->dtype(), a->device()); auto c = Tensor::empty(a->shape(), a->dtype(), a->device());
add_(c, a, b); add_(c, a, b);
return c; return c;
} }
void add_(Tensor c, Tensor a, Tensor b) { void add_(Tensor c, const Tensor &a, const Tensor &b) {
Add::execute(c, a, b); Add::execute(c, a, b);
} }
......
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/add.hpp" #include "infinicore/ops/add.hpp"
#include "infinicore/ops/common/cache.hpp"
#include <infiniop.h> #include "../infiniop_impl.hpp"
namespace infinicore::op::add_impl::infiniop { namespace infinicore::op::add_impl::infiniop {
thread_local common::OpCache<size_t, infiniopAddDescriptor_t> caches( INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Add, 100);
100, // capacity
[](infiniopAddDescriptor_t &desc) { struct PlannedMeta {
if (desc != nullptr) { std::shared_ptr<Descriptor> descriptor;
INFINICORE_CHECK_ERROR(infiniopDestroyAddDescriptor(desc)); graph::GraphTensor workspace, c, a, b;
desc = nullptr; };
}
});
void calculate(Tensor c, Tensor a, Tensor b) { void *plan(Tensor c, const Tensor &a, const Tensor &b) {
size_t seed = hash_combine(c, b, a); size_t seed = hash_combine(c, b, a);
auto device = context::getDevice(); INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
auto &cache = caches.getCache(device); Descriptor, descriptor, Add,
seed,
c->desc(), a->desc(), b->desc());
auto desc_opt = cache.get(seed); INFINIOP_WORKSPACE_TENSOR(workspace, Add, descriptor);
infiniopAddDescriptor_t desc = nullptr;
if (!desc_opt) { return new PlannedMeta{
INFINICORE_CHECK_ERROR(infiniopCreateAddDescriptor( descriptor,
context::getInfiniopHandle(device), &desc, graph::GraphTensor(workspace),
c->desc(), a->desc(), b->desc())); graph::GraphTensor(c),
cache.put(seed, desc); graph::GraphTensor(a),
} else { graph::GraphTensor(b)};
desc = *desc_opt; }
}
size_t workspace_size = 0; void run(void *planned_meta) {
INFINICORE_CHECK_ERROR(infiniopGetAddWorkspaceSize(desc, &workspace_size)); auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
INFINICORE_CHECK_ERROR(infiniopAdd( INFINICORE_CHECK_ERROR(infiniopAdd(
desc, workspace->data(), workspace_size, planned->descriptor->desc,
c->data(), a->data(), b->data(), context::getStream())); planned->workspace->data(),
planned->workspace->numel(),
planned->c->data(),
planned->a->data(),
planned->b->data(),
context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
} }
static bool registered = []() { INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Add, &plan, &run, &cleanup);
Add::dispatcher().registerAll(&calculate, false);
return true;
}();
} // namespace infinicore::op::add_impl::infiniop } // namespace infinicore::op::add_impl::infiniop
#include "infinicore/ops/add_rms_norm.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
// Generates the per-device-type dispatcher tables (plan/run/cleanup) for AddRMSNorm.
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(AddRMSNorm);
// Graph-op constructor: asserts all operands share one device, then dispatches
// planning to the backend registered for that device type.
// NOTE(review): judging by the operand list this fuses a residual add with RMS
// normalization (residual_out = a + b, y = rmsnorm(residual_out, weight, epsilon))
// — confirm against the backend kernel documentation.
AddRMSNorm::AddRMSNorm(Tensor y, Tensor residual_out, const Tensor &a, const Tensor &b, const Tensor &weight, float epsilon) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, residual_out, a, b, weight);
INFINICORE_GRAPH_OP_DISPATCH(y->device().getType(), y, residual_out, a, b, weight, epsilon);
}
// Records the op into the current graph, or runs it eagerly when no graph is
// being recorded (behavior supplied by INFINICORE_GRAPH_OP_RECORD_OR_RUN).
void AddRMSNorm::execute(Tensor y, Tensor residual_out, const Tensor &a, const Tensor &b, const Tensor &weight, float epsilon) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(AddRMSNorm, y, residual_out, a, b, weight, epsilon);
}
// Convenience wrapper: allocates both outputs with a's shape/dtype/device and
// returns the pair {y, residual_out}.
std::pair<Tensor, Tensor> add_rms_norm(const Tensor &a, const Tensor &b, const Tensor &weight, float epsilon) {
auto y = Tensor::empty(a->shape(), a->dtype(), a->device());
auto residual_out = Tensor::empty(a->shape(), a->dtype(), a->device());
add_rms_norm_(y, residual_out, a, b, weight, epsilon);
return std::make_pair(y, residual_out);
}
// Out-parameter variant writing into caller-provided `out` and `residual`.
void add_rms_norm_(Tensor out, Tensor residual, const Tensor &a, const Tensor &b, const Tensor &weight, float epsilon) {
AddRMSNorm::execute(out, residual, a, b, weight, epsilon);
}
// In-place variant: `input` and `residual` serve as both inputs and outputs.
void add_rms_norm_inplace(Tensor input, Tensor residual, const Tensor &weight, float epsilon) {
add_rms_norm_(input, residual, input, residual, weight, epsilon);
}
} // namespace infinicore::op
#include "infinicore/ops/add_rms_norm.hpp"
#include "../infiniop_impl.hpp"
namespace infinicore::op::add_rms_norm_impl::infiniop {

// LRU-cached infiniop descriptor type for AddRMSNorm (capacity 100).
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, AddRMSNorm, 100);

// State captured at plan time and consumed by run()/cleanup().
struct PlannedMeta {
    std::shared_ptr<Descriptor> descriptor;
    graph::GraphTensor workspace, out, residual, a, b, weight;
    float epsilon;
};

// Plan phase: build (or fetch from the cache) the descriptor keyed by the
// operand hash, allocate the workspace tensor, and package every launch
// argument into a heap-allocated PlannedMeta owned by the graph.
void *plan(Tensor y, Tensor residual_out, const Tensor &a, const Tensor &b, const Tensor &weight, float epsilon) {
    size_t seed = hash_combine(y, residual_out, a, b, weight, epsilon);
    INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
        Descriptor, descriptor, AddRMSNorm,
        seed, y->desc(), residual_out->desc(),
        a->desc(), b->desc(), weight->desc(), epsilon);
    INFINIOP_WORKSPACE_TENSOR(workspace, AddRMSNorm, descriptor);

    // Ownership transfers to the caller; cleanup() frees it.
    return new PlannedMeta{
        descriptor,
        graph::GraphTensor(workspace),
        graph::GraphTensor(y),
        graph::GraphTensor(residual_out),
        graph::GraphTensor(a),
        graph::GraphTensor(b),
        graph::GraphTensor(weight),
        epsilon};
}

// Run phase: launch the fused kernel on the current stream.
void run(void *planned_meta) {
    auto *meta = static_cast<PlannedMeta *>(planned_meta);
    INFINICORE_CHECK_ERROR(infiniopAddRMSNorm(
        meta->descriptor->desc,
        meta->workspace->data(),
        meta->workspace->numel(),
        meta->out->data(),
        meta->residual->data(),
        meta->a->data(),
        meta->b->data(),
        meta->weight->data(),
        context::getStream()));
}

// Cleanup phase: destroy the PlannedMeta and clear the caller's slot.
void cleanup(void **planned_meta_ptr) {
    auto *&slot = *planned_meta_ptr;
    delete static_cast<PlannedMeta *>(slot);
    slot = nullptr;
}

INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(AddRMSNorm, &plan, &run, &cleanup);

} // namespace infinicore::op::add_rms_norm_impl::infiniop
#include "infinicore/ops/causal_softmax.hpp"

#include "../../utils.hpp"

namespace infinicore::op {

// Generates the per-device-type dispatcher tables (plan/run/cleanup).
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(CausalSoftmax);

// Graph-op constructor: asserts both tensors share one device, then dispatches
// planning to the backend registered for that device type.
CausalSoftmax::CausalSoftmax(Tensor output, const Tensor &input) {
    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
    INFINICORE_GRAPH_OP_DISPATCH(output->device().getType(), output, input);
}

// Records the op into the current graph, or runs it eagerly when no graph is
// being recorded (behavior supplied by INFINICORE_GRAPH_OP_RECORD_OR_RUN).
void CausalSoftmax::execute(Tensor output, const Tensor &input) {
    INFINICORE_GRAPH_OP_RECORD_OR_RUN(CausalSoftmax, output, input);
}

// Convenience wrapper: allocates the output with input's shape/dtype/device.
Tensor causal_softmax(const Tensor &input) {
    auto output = Tensor::empty(input->shape(), input->dtype(), input->device());
    causal_softmax_(output, input);
    return output;
}

// Out-parameter variant writing into caller-provided `output`.
void causal_softmax_(Tensor output, const Tensor &input) {
    CausalSoftmax::execute(output, input);
}

} // namespace infinicore::op
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/causal_softmax.hpp" #include "infinicore/ops/causal_softmax.hpp"
#include "infinicore/ops/common/cache.hpp"
#include <infiniop.h> #include "../infiniop_impl.hpp"
namespace infinicore::op::causal_softmax_impl::infiniop { namespace infinicore::op::causal_softmax_impl::infiniop {
thread_local common::OpCache<size_t, infiniopCausalSoftmaxDescriptor_t> caches( INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, CausalSoftmax, 100);
100, // capacity
[](infiniopCausalSoftmaxDescriptor_t &desc) { struct PlannedMeta {
if (desc != nullptr) { std::shared_ptr<Descriptor> descriptor;
INFINICORE_CHECK_ERROR(infiniopDestroyCausalSoftmaxDescriptor(desc)); graph::GraphTensor workspace, output, input;
desc = nullptr; };
}
});
void calculate(Tensor output, Tensor input) { void *plan(Tensor output, const Tensor &input) {
size_t seed = hash_combine(output, input); size_t seed = hash_combine(output, input);
auto device = context::getDevice(); INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
auto &cache = caches.getCache(device); Descriptor, descriptor, CausalSoftmax,
seed, output->desc(), input->desc());
auto desc_opt = cache.get(seed); INFINIOP_WORKSPACE_TENSOR(workspace, CausalSoftmax, descriptor);
infiniopCausalSoftmaxDescriptor_t desc = nullptr;
if (!desc_opt) { return new PlannedMeta{
INFINICORE_CHECK_ERROR(infiniopCreateCausalSoftmaxDescriptor( descriptor,
context::getInfiniopHandle(device), &desc, graph::GraphTensor(workspace),
output->desc(), input->desc())); graph::GraphTensor(output),
cache.put(seed, desc); graph::GraphTensor(input)};
} else { }
desc = *desc_opt;
}
size_t workspace_size = 0; void run(void *planned_meta) {
INFINICORE_CHECK_ERROR(infiniopGetCausalSoftmaxWorkspaceSize(desc, &workspace_size)); auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
INFINICORE_CHECK_ERROR(infiniopCausalSoftmax( INFINICORE_CHECK_ERROR(infiniopCausalSoftmax(
desc, workspace->data(), workspace_size, planned->descriptor->desc,
output->data(), input->data(), context::getStream())); planned->workspace->data(),
planned->workspace->numel(),
planned->output->data(),
planned->input->data(),
context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
} }
static bool registered = []() { INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(CausalSoftmax, &plan, &run, &cleanup);
CausalSoftmax::dispatcher().registerAll(&calculate, false);
return true;
}();
} // namespace infinicore::op::causal_softmax_impl::infiniop } // namespace infinicore::op::causal_softmax_impl::infiniop
#include "infinicore/ops/dequantize_awq.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
// Generates the per-device-type dispatcher tables (plan/run/cleanup) for DequantizeAWQ.
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(DequantizeAWQ);
// Graph-op constructor: asserts all operands share one device, then dispatches
// planning to the backend registered for x's device type.
// NOTE(review): presumably unpacks AWQ-quantized weights into x using the
// packed values, scales, and zero points — confirm against the backend kernel
// documentation.
DequantizeAWQ::DequantizeAWQ(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zeros) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(x, x_packed, x_scale, x_zeros);
INFINICORE_GRAPH_OP_DISPATCH(x->device().getType(), x, x_packed, x_scale, x_zeros);
}
// Records the op into the current graph, or runs it eagerly when no graph is
// being recorded (behavior supplied by INFINICORE_GRAPH_OP_RECORD_OR_RUN).
void DequantizeAWQ::execute(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zeros) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(DequantizeAWQ, x, x_packed, x_scale, x_zeros);
}
// Out-parameter entry point writing the dequantized result into `x`.
void dequantize_awq_(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zeros) {
DequantizeAWQ::execute(x, x_packed, x_scale, x_zeros);
}
} // namespace infinicore::op
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment