Unverified commit 986bb179 authored by Ceng, committed by GitHub

issue/695 c++ infinicore::nn::module supports bf16



* issue/695 c++ infinicore::nn::module supports bf16
Signed-off-by: Ceng23333 <441651826@qq.com>

* embedding supports f16
Signed-off-by: Ceng23333 <441651826@qq.com>

* resolve comments
Signed-off-by: Ceng23333 <441651826@qq.com>

* fix format
Signed-off-by: Ceng23333 <441651826@qq.com>

---------
Signed-off-by: Ceng23333 <441651826@qq.com>
parent 53f4bc1d
#include "infinicore/nn/embedding.hpp"
#include "infinicore/context/context.hpp"
#include "infinicore/ops.hpp"
#include <limits>
#include <spdlog/spdlog.h>
#include <stdexcept>
@@ -55,7 +56,6 @@ Tensor Embedding::forward(const Tensor &indices) const {
     // Flatten indices for sequential row copies
     auto cpu_device = Device(Device::Type::CPU, 0);
     auto indices_cpu = indices->to(cpu_device)->contiguous();
-    const auto *indices_data = reinterpret_cast<const int64_t *>(indices_cpu->data());
     // Calculate total number of lookups
     size_t num_lookups = 1;
@@ -63,17 +63,41 @@ Tensor Embedding::forward(const Tensor &indices) const {
         num_lookups *= dim;
     }
-    const size_t row_bytes = embedding_dim_ * (weight_->dtype() == DataType::F32 ? sizeof(float) : weight_->dtype() == DataType::BF16 ? sizeof(uint16_t)
-                                                                                                                                      : sizeof(float));
+    const size_t row_bytes = embedding_dim_ * dsize(weight_->dtype());
     // Source and destination base pointers
     auto *weight_base = weight_->data();
     auto *out_base = out->data();
+    // Helper lambda to read index based on dtype with bounds checking
+    auto read_index = [&](size_t i) -> int64_t {
+        auto dtype = indices_cpu->dtype();
+        if (dtype == DataType::I32) {
+            const auto *data = reinterpret_cast<const int32_t *>(indices_cpu->data());
+            return static_cast<int64_t>(data[i]);
+        } else if (dtype == DataType::I64) {
+            const auto *data = reinterpret_cast<const int64_t *>(indices_cpu->data());
+            return data[i];
+        } else if (dtype == DataType::U32) {
+            const auto *data = reinterpret_cast<const uint32_t *>(indices_cpu->data());
+            return static_cast<int64_t>(data[i]);
+        } else if (dtype == DataType::U64) {
+            const auto *data = reinterpret_cast<const uint64_t *>(indices_cpu->data());
+            uint64_t val = data[i];
+            // Check if value can fit in int64_t
+            if (val > static_cast<uint64_t>(std::numeric_limits<int64_t>::max())) {
+                throw std::out_of_range("Index value out of range for int64_t: " + std::to_string(val));
+            }
+            return static_cast<int64_t>(val);
+        } else {
+            throw std::runtime_error("Embedding indices must be integer type, got dtype=" + std::to_string(static_cast<int>(dtype)));
+        }
+    };
     if (weight_->device().getType() == Device::Type::CPU) {
         // CPU path: memcpy row by row
         for (size_t i = 0; i < num_lookups; ++i) {
-            int64_t idx = indices_data[i];
+            int64_t idx = read_index(i);
             if (idx < 0 || idx >= static_cast<int64_t>(num_embeddings_)) {
                 throw std::out_of_range(
                     "Index out of range: " + std::to_string(idx) + " (num_embeddings=" + std::to_string(num_embeddings_) + ")");
@@ -83,7 +107,7 @@ Tensor Embedding::forward(const Tensor &indices) const {
     } else {
         // Device path: use stream-ordered D2D copies
         for (size_t i = 0; i < num_lookups; ++i) {
-            int64_t idx = indices_data[i];
+            int64_t idx = read_index(i);
             if (idx < 0 || idx >= static_cast<int64_t>(num_embeddings_)) {
                 throw std::out_of_range(
                     "Index out of range: " + std::to_string(idx) + " (num_embeddings=" + std::to_string(num_embeddings_) + ")");
......
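
For context, the index widening above can be read in isolation. Below is a minimal, self-contained sketch of the same dispatch; IndexType and read_index_buffer are hypothetical stand-ins for infinicore's DataType enum and the read_index lambda, and only the widening and range-check logic mirrors the diff:

#include <cstddef>
#include <cstdint>
#include <limits>
#include <stdexcept>
#include <string>

// Hypothetical stand-in for infinicore's DataType; values are illustrative.
enum class IndexType { I32, I64, U32, U64 };

// Widen element i of a type-erased index buffer to int64_t.
// Signed and unsigned 32-bit values always fit; uint64_t must be range-checked
// because values above INT64_MAX are not representable.
int64_t read_index_buffer(const void *data, IndexType dtype, std::size_t i) {
    switch (dtype) {
    case IndexType::I32:
        return static_cast<int64_t>(static_cast<const int32_t *>(data)[i]);
    case IndexType::I64:
        return static_cast<const int64_t *>(data)[i];
    case IndexType::U32:
        return static_cast<int64_t>(static_cast<const uint32_t *>(data)[i]);
    case IndexType::U64: {
        const uint64_t val = static_cast<const uint64_t *>(data)[i];
        if (val > static_cast<uint64_t>(std::numeric_limits<int64_t>::max())) {
            throw std::out_of_range("Index value out of range for int64_t: " + std::to_string(val));
        }
        return static_cast<int64_t>(val);
    }
    }
    throw std::runtime_error("indices must be an integer type");
}

Widening every supported index type to int64_t keeps the bounds check (0 <= idx < num_embeddings_) in one place for both the CPU and device paths.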
#include "infinicore/nn/rope.hpp"
#include "../../utils.h"
#include "../utils.hpp"
#include "infinicore/ops.hpp"
#include <algorithm>
#include <cmath>
#include <functional>
#include <stdexcept>
#include <utility>
#include <vector>
namespace infinicore::nn {
@@ -50,36 +53,63 @@ void RoPE::initialize_cache() {
     for (size_t pos = 0; pos < max_seq_len_; pos++) {
         for (size_t j = 0; j < cache_dim; j++) {
             // GPT-J style inverse frequency: theta^(-2j/head_dim)
-            double inv_freq = 1.0 / std::pow(theta_, 2.0 * static_cast<double>(j) / static_cast<double>(head_dim_));
+            // Compute directly in float to avoid double->float casting
+            float inv_freq = 1.0f / std::pow(static_cast<float>(theta_), 2.0f * static_cast<float>(j) / static_cast<float>(head_dim_));
             // Compute angle: position * inverse_frequency
-            double angle = static_cast<double>(pos) * inv_freq;
-            // Compute sin and cos
-            sin_data[pos * cache_dim + j] = static_cast<float>(std::sin(angle));
-            cos_data[pos * cache_dim + j] = static_cast<float>(std::cos(angle));
+            float angle = static_cast<float>(pos) * inv_freq;
+            // Compute sin and cos directly on float
+            sin_data[pos * cache_dim + j] = std::sin(angle);
+            cos_data[pos * cache_dim + j] = std::cos(angle);
         }
     }
-    // Create CPU tensors and copy data
-    auto sin_cpu = Tensor::from_blob(sin_data.data(), {max_seq_len_, cache_dim}, DataType::F32, cpu_device);
-    auto cos_cpu = Tensor::from_blob(cos_data.data(), {max_seq_len_, cache_dim}, DataType::F32, cpu_device);
+    // Convert to target dtype on CPU (matching Python's numpy astype conversion pattern)
+    // Python: np_array.astype(ml_dtypes.bfloat16, copy=True) converts F32 -> BF16
+    if (dtype_ == DataType::F32) {
+        // Direct use of F32 data
+        auto sin_f32_cpu = Tensor::from_blob(sin_data.data(), {max_seq_len_, cache_dim}, DataType::F32, cpu_device);
+        auto cos_f32_cpu = Tensor::from_blob(cos_data.data(), {max_seq_len_, cache_dim}, DataType::F32, cpu_device);
+        sin_cache_->copy_from(sin_f32_cpu);
+        cos_cache_->copy_from(cos_f32_cpu);
+    } else if (dtype_ == DataType::BF16) {
+        // Convert F32 to BF16 using the same conversion as Python's ml_dtypes.bfloat16
+        // This uses round-to-nearest-even (matching _f32_to_bf16 implementation)
+        std::vector<bf16_t> sin_bf16_data(max_seq_len_ * cache_dim);
+        std::vector<bf16_t> cos_bf16_data(max_seq_len_ * cache_dim);
+        for (size_t i = 0; i < sin_data.size(); i++) {
+            sin_bf16_data[i] = utils::cast<bf16_t, float>(sin_data[i]);
+            cos_bf16_data[i] = utils::cast<bf16_t, float>(cos_data[i]);
+        }
+        auto sin_bf16_cpu = Tensor::from_blob(sin_bf16_data.data(), {max_seq_len_, cache_dim}, DataType::BF16, cpu_device);
+        auto cos_bf16_cpu = Tensor::from_blob(cos_bf16_data.data(), {max_seq_len_, cache_dim}, DataType::BF16, cpu_device);
+        // Copy to device
-        // Note: Cache is created with dtype_, but we compute in F32 for precision.
-        // If dtype_ != F32, copy_from will fail. For now, we only support F32 cache.
-        // TODO: Add dtype conversion support when cast operation is available
-    if (dtype_ != DataType::F32) {
+        // copy_from handles cross-device copying to target device
+        sin_cache_->copy_from(sin_bf16_cpu);
+        cos_cache_->copy_from(cos_bf16_cpu);
+    } else if (dtype_ == DataType::F16) {
+        // Convert F32 to F16
+        std::vector<fp16_t> sin_f16_data(max_seq_len_ * cache_dim);
+        std::vector<fp16_t> cos_f16_data(max_seq_len_ * cache_dim);
+        for (size_t i = 0; i < sin_data.size(); i++) {
+            sin_f16_data[i] = utils::cast<fp16_t, float>(sin_data[i]);
+            cos_f16_data[i] = utils::cast<fp16_t, float>(cos_data[i]);
+        }
+        auto sin_f16_cpu = Tensor::from_blob(sin_f16_data.data(), {max_seq_len_, cache_dim}, DataType::F16, cpu_device);
+        auto cos_f16_cpu = Tensor::from_blob(cos_f16_data.data(), {max_seq_len_, cache_dim}, DataType::F16, cpu_device);
+        sin_cache_->copy_from(sin_f16_cpu);
+        cos_cache_->copy_from(cos_f16_cpu);
+    } else {
         throw std::runtime_error(
-            "RoPE cache dtype conversion not yet supported. Please use DataType::F32 for cache. "
-            "Requested dtype: "
+            "RoPE cache dtype conversion not yet supported for dtype: "
             + std::to_string(static_cast<int>(dtype_)));
     }
-    // copy_from handles cross-device copying automatically
-    // Direct copy from CPU to target device avoids double copying
-    sin_cache_->copy_from(sin_cpu);
-    cos_cache_->copy_from(cos_cpu);
 }

 Tensor RoPE::forward(const Tensor &x, const Tensor &pos) const {
......
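
The cache fill above, stripped of tensor plumbing, is short enough to restate. A minimal sketch, assuming cache_dim is head_dim / 2 as in GPT-J-style RoPE; the names here are illustrative rather than the module's API:

#include <cmath>
#include <cstddef>
#include <vector>

// Fill sin/cos tables for GPT-J style RoPE entirely in float, so the table
// matches a float-only pipeline: angle(pos, j) = pos * theta^(-2j/head_dim).
void fill_rope_cache(std::vector<float> &sin_table, std::vector<float> &cos_table,
                     std::size_t max_seq_len, std::size_t head_dim, float theta) {
    const std::size_t cache_dim = head_dim / 2; // assumption: half-dim cache
    sin_table.assign(max_seq_len * cache_dim, 0.0f);
    cos_table.assign(max_seq_len * cache_dim, 0.0f);
    for (std::size_t pos = 0; pos < max_seq_len; ++pos) {
        for (std::size_t j = 0; j < cache_dim; ++j) {
            const float inv_freq = 1.0f / std::pow(theta, 2.0f * static_cast<float>(j) / static_cast<float>(head_dim));
            const float angle = static_cast<float>(pos) * inv_freq;
            sin_table[pos * cache_dim + j] = std::sin(angle);
            cos_table[pos * cache_dim + j] = std::cos(angle);
        }
    }
}

The F32 result is then narrowed once per element to BF16 or F16 on the CPU before the single copy_from to the target device, which avoids a second device-side conversion pass.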
@@ -376,6 +376,7 @@ target("infinicore_cpp_api")
     add_files("src/infinicore/tensor/*.cc")
     add_files("src/infinicore/nn/*.cc")
     add_files("src/infinicore/ops/*/*.cc")
+    add_files("src/utils/*.cc")
     set_installdir(INFINI_ROOT)
     add_installfiles("include/infinicore/(**.h)", {prefixdir = "include/infinicore"})
@@ -415,6 +416,7 @@ target("_infinicore")
     add_files("src/infinicore/nn/*.cc")
     add_files("src/infinicore/ops/*/*.cc")
     add_files("src/infinicore/pybind11/**.cc")
+    add_files("src/utils/*.cc")
     set_installdir("python/infinicore")
 target_end()
......
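
The two new add_files("src/utils/*.cc") entries compile the shared conversion helpers into both the C++ API target and the Python extension target; the RoPE path above relies on utils::cast<bf16_t, float> from those sources. That implementation is not shown in this diff, but the round-to-nearest-even scheme its comment references is standard and can be sketched as follows (helper names here are hypothetical, and NaN payloads are not treated specially):

#include <cstdint>
#include <cstring>

// f32 -> bf16 by truncating to the top 16 bits with round-to-nearest-even:
// add 0x7FFF plus the lowest kept bit so exact ties round to an even mantissa.
uint16_t f32_to_bf16_rne(float f) {
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits)); // type-pun safely via memcpy
    const uint32_t rounding_bias = 0x7FFFu + ((bits >> 16) & 1u);
    return static_cast<uint16_t>((bits + rounding_bias) >> 16);
}

// bf16 -> f32 is exact: a bf16 value is the top 16 bits of an f32.
float bf16_to_f32(uint16_t h) {
    uint32_t bits = static_cast<uint32_t>(h) << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}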