Unverified commit e9e5f61c authored by Jeffrey Morgan, committed by GitHub

llama: update to commit 2016f07b (#10352)

parent 11dde418
UPSTREAM=https://github.com/ggerganov/llama.cpp.git UPSTREAM=https://github.com/ggerganov/llama.cpp.git
WORKDIR=llama/vendor WORKDIR=llama/vendor
FETCH_HEAD=71e90e8813f90097701e62f7fce137d96ddf41e2 FETCH_HEAD=2016f07bd106c73699ecbaace80f55db5ed95dac
.PHONY: help .PHONY: help
help: help:
......
int LLAMA_BUILD_NUMBER = 0; int LLAMA_BUILD_NUMBER = 0;
char const *LLAMA_COMMIT = "71e90e8813f90097701e62f7fce137d96ddf41e2"; char const *LLAMA_COMMIT = "2016f07bd106c73699ecbaace80f55db5ed95dac";
char const *LLAMA_COMPILER = ""; char const *LLAMA_COMPILER = "";
char const *LLAMA_BUILD_TARGET = ""; char const *LLAMA_BUILD_TARGET = "";
...@@ -50,7 +50,6 @@ ...@@ -50,7 +50,6 @@
// tensor name constants // tensor name constants
// //
#define TN_TOKEN_EMBD "%s.token_embd.weight"
#define TN_POS_EMBD "%s.position_embd.weight" #define TN_POS_EMBD "%s.position_embd.weight"
#define TN_CLASS_EMBD "v.class_embd" #define TN_CLASS_EMBD "v.class_embd"
#define TN_PATCH_EMBD "v.patch_embd.weight" // do not rename tensor with ".0" postfix for backward compat #define TN_PATCH_EMBD "v.patch_embd.weight" // do not rename tensor with ".0" postfix for backward compat
...@@ -66,8 +65,6 @@ ...@@ -66,8 +65,6 @@
#define TN_LN_2 "%s.blk.%d.ln2.%s" #define TN_LN_2 "%s.blk.%d.ln2.%s"
#define TN_LN_PRE "%s.pre_ln.%s" #define TN_LN_PRE "%s.pre_ln.%s"
#define TN_LN_POST "%s.post_ln.%s" #define TN_LN_POST "%s.post_ln.%s"
#define TN_TEXT_PROJ "text_projection.weight"
#define TN_VIS_PROJ "visual_projection.weight"
#define TN_LLAVA_PROJ "mm.%d.%s" #define TN_LLAVA_PROJ "mm.%d.%s"
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s" #define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
......
...@@ -27,6 +27,7 @@ ...@@ -27,6 +27,7 @@
#include <sstream> #include <sstream>
#include <cinttypes> #include <cinttypes>
#include <limits> #include <limits>
#include <array>
#if defined(_WIN32) #if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN
...@@ -1719,12 +1720,24 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length ...@@ -1719,12 +1720,24 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
return true; return true;
} }
// Linear interpolation between two points // Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not
inline float clip_lerp(float s, float e, float t) { static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
return s + (e - s) * t; dst.nx = src.nx;
dst.ny = src.ny;
dst.buf.resize(src.buf.size());
// TODO @ngxson : seems like this could be done more efficiently on cgraph
for (size_t i = 0; i < src.buf.size(); ++i) {
int c = i % 3; // rgb
dst.buf[i] = (static_cast<float>(src.buf[i]) / 255.0f - mean[c]) / std[c];
}
} }
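For reference, a quick standalone check of the normalization above, using the first-channel CLIP mean/std values that appear later in this file (0.48145466f / 0.26862954f). This snippet is illustrative only and is not part of the commit:

#include <cstdio>

int main() {
    const unsigned char px = 200;                          // example red-channel byte
    const float mean = 0.48145466f, stddev = 0.26862954f;  // CLIP image_mean[0], image_std[0]
    // same formula as normalize_image_u8_to_f32: (px / 255 - mean) / std
    printf("%f\n", (px / 255.0f - mean) / stddev);         // ~1.127
    return 0;
}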
// Bilinear resize function
static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) { // set of tools to manipulate images
// in the future, we can have HW acceleration by allowing this struct to access 3rd party lib like imagick or opencv
struct image_manipulation {
// Bilinear resize function
static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) {
dst.nx = target_width; dst.nx = target_width;
dst.ny = target_height; dst.ny = target_height;
dst.buf.resize(3 * target_width * target_height); dst.buf.resize(3 * target_width * target_height);
...@@ -1742,40 +1755,25 @@ static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int ta ...@@ -1742,40 +1755,25 @@ static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int ta
float y_lerp = py - y_floor; float y_lerp = py - y_floor;
for (int c = 0; c < 3; c++) { for (int c = 0; c < 3; c++) {
float top = clip_lerp( float top = lerp(
static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]), static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]),
static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]), static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]),
x_lerp x_lerp
); );
float bottom = clip_lerp( float bottom = lerp(
static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]), static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]),
static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]), static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]),
x_lerp x_lerp
); );
dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(clip_lerp(top, bottom, y_lerp)); dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, y_lerp));
} }
} }
} }
}
// Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not
static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
dst.nx = src.nx;
dst.ny = src.ny;
dst.buf.resize(src.buf.size());
// TODO @ngxson : seems like this could be done more efficiently on cgraph
for (size_t i = 0; i < src.buf.size(); ++i) {
int c = i % 3; // rgb
dst.buf[i] = (static_cast<float>(src.buf[i]) / 255.0f - mean[c]) / std[c];
} }
}
inline int clip(int x, int lower, int upper) { // Bicubic resize function
return std::max(lower, std::min(x, upper)); // part of image will be cropped if the aspect ratio is different
} static bool bicubic_resize(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
static bool bicubic_resize(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
const int nx = img.nx; const int nx = img.nx;
const int ny = img.ny; const int ny = img.ny;
...@@ -1836,12 +1834,14 @@ static bool bicubic_resize(const clip_image_u8 & img, clip_image_u8 & dst, int t ...@@ -1836,12 +1834,14 @@ static bool bicubic_resize(const clip_image_u8 & img, clip_image_u8 & dst, int t
} }
return true; return true;
} }
// llava-1.6 type of resize_and_pad (black) // llava-1.6 type of resize_and_pad
static void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &image_output, const std::pair<int, int>& target_resolution) { // if the ratio is not 1:1, padding with pad_color will be applied
int target_width = target_resolution.first; // pad_color is applied per RGB channel, default is {0, 0, 0} (black)
int target_height = target_resolution.second; static void resize_and_pad_image(const clip_image_u8 & image, clip_image_u8 & dst, const clip_image_size & target_resolution, std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
int target_width = target_resolution.width;
int target_height = target_resolution.height;
float scale_w = static_cast<float>(target_width) / image.nx; float scale_w = static_cast<float>(target_width) / image.nx;
float scale_h = static_cast<float>(target_height) / image.ny; float scale_h = static_cast<float>(target_height) / image.ny;
...@@ -1857,13 +1857,19 @@ static void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &imag ...@@ -1857,13 +1857,19 @@ static void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &imag
} }
clip_image_u8 resized_image; clip_image_u8 resized_image;
// bilinear_resize(image, resized_image, new_width, new_height);
bicubic_resize(image, resized_image, new_width, new_height); bicubic_resize(image, resized_image, new_width, new_height);
clip_image_u8 padded_image; clip_image_u8 padded_image;
padded_image.nx = target_width; padded_image.nx = target_width;
padded_image.ny = target_height; padded_image.ny = target_height;
padded_image.buf.resize(3 * target_width * target_height, 0); // Initialize with black padded_image.buf.resize(3 * target_width * target_height);
// Fill the padded image with the fill color
for (size_t i = 0; i < padded_image.buf.size(); i += 3) {
padded_image.buf[i] = pad_color[0];
padded_image.buf[i + 1] = pad_color[1];
padded_image.buf[i + 2] = pad_color[2];
}
// Calculate padding offsets // Calculate padding offsets
int pad_x = (target_width - new_width) / 2; int pad_x = (target_width - new_width) / 2;
...@@ -1877,26 +1883,223 @@ static void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &imag ...@@ -1877,26 +1883,223 @@ static void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &imag
} }
} }
} }
image_output = std::move(padded_image); dst = std::move(padded_image);
} }
static void crop_image(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) {
dst.nx = w;
dst.ny = h;
dst.buf.resize(3 * w * h);
for (int i = 0; i < h; ++i) {
for (int j = 0; j < w; ++j) {
int src_idx = 3 * ((y + i)*image.nx + (x + j));
int dst_idx = 3 * (i*w + j);
dst.buf[dst_idx] = image.buf[src_idx];
dst.buf[dst_idx + 1] = image.buf[src_idx + 1];
dst.buf[dst_idx + 2] = image.buf[src_idx + 2];
}
}
}
private:
static inline int clip(int x, int lower, int upper) {
return std::max(lower, std::min(x, upper));
}
// Linear interpolation between two points
static inline float lerp(float s, float e, float t) {
return s + (e - s) * t;
}
};
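A minimal usage sketch for the new pad_color parameter of resize_and_pad_image, assuming the clip_image_u8 and clip_image_size types from this file; the 336x336 target and the {122, 116, 104} background mirror the llava-1.5 path further down in this diff. This is an illustration, not code from the commit:

// Sketch only: pad a non-square image to a square with the LLaVA background color.
static void pad_to_square_example(const clip_image_u8 & src, clip_image_u8 & dst) {
    const std::array<uint8_t, 3> llava_bg = {122, 116, 104}; // mean RGB * 255
    image_manipulation::resize_and_pad_image(src, dst, clip_image_size{336, 336}, llava_bg);
}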
/** /**
* implementation of LLaVA-UHD:
* - https://arxiv.org/pdf/2403.11703
* - https://github.com/thunlp/LLaVA-UHD
* - https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
*
* overview:
* - an image always has a single overview (downscaled image)
* - an image can have 0 or multiple slices, depending on the image size
* - each slice can then be considered as a separate image
*
* for example:
*
* [overview] --> [slice 1] --> [slice 2]
* | |
* +--> [slice 3] --> [slice 4]
*/
struct llava_uhd {
struct slice_coordinates {
int x;
int y;
clip_image_size size;
};
struct slice_instructions {
clip_image_size overview_size; // size of downscaled image
clip_image_size refined_size; // size of image right before slicing (must be multiple of slice size)
clip_image_size grid_size; // grid_size.width * grid_size.height = number of slices
std::vector<slice_coordinates> slices;
bool padding_refined = false; // if true, the refined image will be padded to the grid size (e.g. llava-1.6)
};
static int get_max_slices(struct clip_ctx * ctx) {
if (clip_is_minicpmv(ctx)) {
return 9;
}
return 0;
}
static slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) {
slice_instructions res;
const int patch_size = clip_get_patch_size(ctx);
const int slice_size = clip_get_image_size(ctx);
const int max_slice_nums = get_max_slices(ctx);
const int original_width = original_size.width;
const int original_height = original_size.height;
const float log_ratio = log((float)original_width / original_height);
const float ratio = (float)original_width * original_height / (slice_size * slice_size);
const int multiple = fmin(ceil(ratio), max_slice_nums);
const bool has_slices = (multiple > 1);
const bool has_pinpoints = !ctx->vision_model.hparams.image_grid_pinpoints.empty();
if (has_pinpoints) {
// has pinpoints, use them to calculate the grid size (e.g. llava-1.6)
auto refine_size = llava_uhd::select_best_resolution(
ctx->vision_model.hparams.image_grid_pinpoints,
original_size);
res.overview_size = clip_image_size{slice_size, slice_size};
res.refined_size = refine_size;
res.grid_size = clip_image_size{0, 0};
res.padding_refined = true;
for (int y = 0; y < refine_size.height; y += slice_size) {
for (int x = 0; x < refine_size.width; x += slice_size) {
slice_coordinates slice;
slice.x = x;
slice.y = y;
slice.size.width = std::min(slice_size, refine_size.width - x);
slice.size.height = std::min(slice_size, refine_size.height - y);
res.slices.push_back(slice);
if (x == 0) {
res.grid_size.width++;
}
}
res.grid_size.height++;
}
return res;
}
// no pinpoints, dynamically calculate the grid size (e.g. minicpmv)
auto best_size = get_best_resize(original_size, slice_size, patch_size, has_slices);
res.overview_size = best_size;
if (!has_slices) {
// skip slicing logic
res.refined_size = clip_image_size{0, 0};
res.grid_size = clip_image_size{0, 0};
} else {
auto best_grid = get_best_grid(max_slice_nums, multiple, log_ratio);
auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true);
res.grid_size = best_grid;
res.refined_size = refine_size;
int width = refine_size.width;
int height = refine_size.height;
int grid_x = int(width / best_grid.width);
int grid_y = int(height / best_grid.height);
for (int patches_y = 0, ic = 0;
patches_y < refine_size.height && ic < best_grid.height;
patches_y += grid_y, ic += 1) {
for (int patches_x = 0, jc = 0;
patches_x < refine_size.width && jc < best_grid.width;
patches_x += grid_x, jc += 1) {
slice_coordinates slice;
slice.x = patches_x;
slice.y = patches_y;
slice.size.width = grid_x;
slice.size.height = grid_y;
res.slices.push_back(slice);
// LOG_INF("slice %d: %d %d %d %d\n", ic, patches_i, patches_j, grid_x, grid_y);
}
}
}
return res;
}
static std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 * img, const slice_instructions & inst) {
std::vector<clip_image_u8_ptr> output;
// resize to overview size
clip_image_u8_ptr resized_img(clip_image_u8_init());
image_manipulation::bicubic_resize(*img, *resized_img, inst.overview_size.width, inst.overview_size.height);
output.push_back(std::move(resized_img));
if (inst.slices.empty()) {
// no slices, just return the resized image
return output;
}
// resize to refined size
clip_image_u8_ptr refined_img(clip_image_u8_init());
if (inst.padding_refined) {
image_manipulation::resize_and_pad_image(*img, *refined_img, inst.refined_size);
} else {
image_manipulation::bilinear_resize(*img, *refined_img, inst.refined_size.width, inst.refined_size.height);
}
// create slices
for (const auto & slice : inst.slices) {
int x = slice.x;
int y = slice.y;
int w = slice.size.width;
int h = slice.size.height;
clip_image_u8_ptr img_slice(clip_image_u8_init());
image_manipulation::crop_image(*refined_img, *img_slice, x, y, w, h);
output.push_back(std::move(img_slice));
}
return output;
}
private:
static clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
int width = original_size.width;
int height = original_size.height;
if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
float r = static_cast<float>(width) / height;
height = static_cast<int>(scale_resolution / std::sqrt(r));
width = static_cast<int>(height * r);
}
clip_image_size res;
res.width = ensure_divide(width, patch_size);
res.height = ensure_divide(height, patch_size);
return res;
}
/**
* Selects the best resolution from a list of possible resolutions based on the original size. * Selects the best resolution from a list of possible resolutions based on the original size.
* *
* @param original_size The original size of the image in the format (width, height). * @param original_size The original size of the image
* @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...]. * @param possible_resolutions A list of possible resolutions
* @return The best fit resolution in the format (width, height). * @return The best fit resolution
*/ */
static std::pair<int, int> select_best_resolution(const std::pair<int, int> & original_size, const std::vector<std::pair<int, int>> & possible_resolutions) { static clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size> & possible_resolutions) {
int original_width = original_size.first; int original_width = original_size.width;
int original_height = original_size.second; int original_height = original_size.height;
std::pair<int, int> best_fit; clip_image_size best_fit;
int max_effective_resolution = 0; int max_effective_resolution = 0;
int min_wasted_resolution = std::numeric_limits<int>::max(); int min_wasted_resolution = std::numeric_limits<int>::max();
for (const auto& resolution : possible_resolutions) { for (const auto & resolution : possible_resolutions) {
int width = resolution.first; int width = resolution.width;
int height = resolution.second; int height = resolution.height;
float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height); float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height);
int downscaled_width = static_cast<int>(original_width * scale); int downscaled_width = static_cast<int>(original_width * scale);
int downscaled_height = static_cast<int>(original_height * scale); int downscaled_height = static_cast<int>(original_height * scale);
...@@ -1911,71 +2114,45 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int> & or ...@@ -1911,71 +2114,45 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int> & or
} }
return best_fit; return best_fit;
}
static std::vector<clip_image_u8_ptr> divide_to_patches_u8(const clip_image_u8 & image, int patch_size) {
std::vector<clip_image_u8_ptr> patches;
int width = image.nx;
int height = image.ny;
for (int i = 0; i < height; i += patch_size) {
for (int j = 0; j < width; j += patch_size) {
clip_image_u8_ptr patch(clip_image_u8_init());
patch->nx = std::min(patch_size, width - j);
patch->ny = std::min(patch_size, height - i);
patch->buf.resize(3 * patch->nx * patch->ny);
for (int y = 0; y < patch->ny; ++y) {
for (int x = 0; x < patch->nx; ++x) {
for (int c = 0; c < 3; ++c) {
patch->buf[3 * (y * patch->nx + x) + c] = image.buf[3 * ((i + y) * width + (j + x)) + c];
}
}
} }
patches.push_back(std::move(patch));
// used by llava 1.6 with custom list of pinpoints
static clip_image_size select_best_resolution(const std::vector<int32_t> & pinpoints, const clip_image_size & original_size) {
std::vector<clip_image_size> possible_resolutions;
for (size_t i = 0; i < pinpoints.size(); i += 2) {
possible_resolutions.push_back(clip_image_size{pinpoints[i], pinpoints[i+1]});
} }
return select_best_resolution(original_size, possible_resolutions);
} }
return patches;
}
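As a rough illustration of the selection rule above (maximize the effective, i.e. downscaled, area, then minimize the wasted candidate area), here is how a single pinpoint would be scored. The 800x600 input and 672x672 candidate are made-up example values, not taken from the commit:

#include <algorithm>
#include <cstdio>

int main() {
    const int ow = 800, oh = 600;   // hypothetical original image size
    const int cw = 672, ch = 672;   // one candidate resolution from the pinpoint list
    const float scale   = std::min((float)cw / ow, (float)ch / oh);
    const int effective = std::min((int)(ow * scale) * (int)(oh * scale), ow * oh);
    const int wasted    = cw * ch - effective;
    printf("effective=%d wasted=%d\n", effective, wasted);
    return 0;
}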
static int ensure_divide(int length, int patch_size) { static int ensure_divide(int length, int patch_size) {
return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size); return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size);
}
static std::pair<int, int> uhd_find_best_resize(std::pair<int, int> original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
int width = original_size.first;
int height = original_size.second;
if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
float r = static_cast<float>(width) / height;
height = static_cast<int>(scale_resolution / std::sqrt(r));
width = static_cast<int>(height * r);
} }
int best_width = ensure_divide(width, patch_size);
int best_height = ensure_divide(height, patch_size);
return std::make_pair(best_width, best_height);
}
static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size, std::pair<int, int> grid, int scale_resolution, int patch_size, bool allow_upscale = false) { static clip_image_size get_refine_size(const clip_image_size & original_size, const clip_image_size & grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
int width, height; int width = original_size.width;
std::tie(width, height) = original_size; int height = original_size.height;
int grid_x, grid_y; int grid_x = grid.width;
std::tie(grid_x, grid_y) = grid; int grid_y = grid.height;
int refine_width = ensure_divide(width, grid_x); int refine_width = ensure_divide(width, grid_x);
int refine_height = ensure_divide(height, grid_y); int refine_height = ensure_divide(height, grid_y);
int grid_width = refine_width / grid_x; clip_image_size grid_size;
int grid_height = refine_height / grid_y; grid_size.width = refine_width / grid_x;
grid_size.height = refine_height / grid_y;
// auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line) auto best_grid_size = get_best_resize(grid_size, scale_resolution, patch_size, allow_upscale);
auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair int best_grid_width = best_grid_size.width;
int best_grid_width, best_grid_height; int best_grid_height = best_grid_size.height;
std::tie(best_grid_width, best_grid_height) = best_grid_size;
// std::pair<int, int> refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line) clip_image_size refine_size;
std::pair<int, int> refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); // (new line) refine_size.width = best_grid_width * grid_x;
refine_size.height = best_grid_height * grid_y;
return refine_size; return refine_size;
} }
static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) { static clip_image_size get_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
std::vector<int> candidate_split_grids_nums; std::vector<int> candidate_split_grids_nums;
for (int i : {multiple - 1, multiple, multiple + 1}) { for (int i : {multiple - 1, multiple, multiple + 1}) {
if (i == 1 || i > max_slice_nums) { if (i == 1 || i > max_slice_nums) {
...@@ -1984,124 +2161,62 @@ static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int mul ...@@ -1984,124 +2161,62 @@ static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int mul
candidate_split_grids_nums.push_back(i); candidate_split_grids_nums.push_back(i);
} }
std::vector<std::pair<int, int>> candidate_grids; std::vector<clip_image_size> candidate_grids;
for (int split_grids_nums : candidate_split_grids_nums) { for (int split_grids_nums : candidate_split_grids_nums) {
int m = 1; int m = 1;
while (m <= split_grids_nums) { while (m <= split_grids_nums) {
if (split_grids_nums % m == 0) { if (split_grids_nums % m == 0) {
candidate_grids.emplace_back(m, split_grids_nums / m); candidate_grids.push_back(clip_image_size{m, split_grids_nums / m});
} }
++m; ++m;
} }
} }
std::pair<int, int> best_grid{1, 1}; clip_image_size best_grid{1, 1};
float min_error = std::numeric_limits<float>::infinity(); float min_error = std::numeric_limits<float>::infinity();
for (const auto& grid : candidate_grids) { for (const auto& grid : candidate_grids) {
float error = std::abs(log_ratio - std::log(1.0 * grid.first / grid.second)); float error = std::abs(log_ratio - std::log(1.0 * grid.width / grid.height));
if (error < min_error) { if (error < min_error) {
best_grid = grid; best_grid = grid;
min_error = error; min_error = error;
} }
} }
return best_grid; return best_grid;
} }
};
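A hypothetical end-to-end sketch of how the llava_uhd helper above is meant to be driven (the function name is invented for illustration; clip_ctx, clip_image_u8_ptr and llava_uhd come from this file). slice_image() always returns the overview image first, followed by one entry per slice in the instructions:

static size_t count_overview_and_slices(struct clip_ctx * ctx, const clip_image_u8 * img) {
    clip_image_size original_size{img->nx, img->ny};
    auto inst = llava_uhd::get_slice_instructions(ctx, original_size);
    std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
    return imgs.size(); // == 1 + inst.slices.size()
}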
// inspired from LLaVA-UHD:
// -> https://arxiv.org/pdf/2403.11703
// -> https://github.com/thunlp/LLaVA-UHD
// -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
static std::vector<std::vector<clip_image_u8_ptr>> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) {
const std::pair<int, int> original_size={img->nx,img->ny};
const int original_width = img->nx;
const int original_height = img->ny;
const float log_ratio = log(1.0*original_width/original_height);
const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
const int multiple = fmin(ceil(ratio), max_slice_nums);
std::vector<std::vector<clip_image_u8_ptr>> images;
LOG_DBG("%s: multiple %d\n", __func__, multiple);
images.push_back(std::vector<clip_image_u8_ptr>());
if (multiple <= 1) {
auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true);
clip_image_u8_ptr source_image(clip_image_u8_init());
bicubic_resize(*img, *source_image, best_size.first, best_size.second);
// source_image = image.resize(best_size, Image.Resampling.BICUBIC)
images.back().push_back(std::move(source_image));
}
else if (multiple > 1) {
auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size);
clip_image_u8_ptr source_image(clip_image_u8_init());
bicubic_resize(*img, *source_image, best_size.first, best_size.second);
// source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
LOG_DBG("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
images.back().push_back(std::move(source_image));
std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
LOG_DBG("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
clip_image_u8_ptr refine_image(clip_image_u8_init());
bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
LOG_DBG("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
// split_to_patches
int width = refine_image->nx;
int height = refine_image->ny;
int grid_x = int(width / best_grid.first);
int grid_y = int(height / best_grid.second);
for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){
images.push_back(std::vector<clip_image_u8_ptr>());
for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){
clip_image_u8_ptr patch(clip_image_u8_init());
patch->nx = grid_x;
patch->ny = grid_y;
patch->buf.resize(3 * patch->nx * patch->ny);
for (int y = patches_i; y < patches_i + grid_y; ++y) {
for (int x = patches_j; x < patches_j + grid_x; ++x) {
const int i = 3 * (y * refine_image->nx + x);
const int j = 3 * ((y-patches_i) * patch->nx + (x-patches_j));
patch->buf[j] = refine_image->buf[i];
patch->buf[j+1] = refine_image->buf[i+1];
patch->buf[j+2] = refine_image->buf[i+2];
}
}
images.back().push_back(std::move(patch));
}
}
}
return images;
}
// TODO @ngxson : deprecate the load_image_size singleton pattern
int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) { int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
const int max_slice_nums=9; const auto inst = llava_uhd::get_slice_instructions(ctx_clip, ctx_clip->load_image_size);
const int scale_resolution=448; return inst.grid_size.width;
const int original_width = ctx_clip->load_image_size.width;
const int original_height = ctx_clip->load_image_size.height;
const float log_ratio = log(1.0*original_width/original_height);
const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
const int multiple = fmin(ceil(ratio), max_slice_nums);
std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
return best_grid.first;
} }
// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
// res_imgs memory is being allocated here, previous allocations will be freed if found // res_imgs memory is being allocated here, previous allocations will be freed if found
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) { bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) {
if (!ctx->has_vision_encoder) {
LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__);
return false;
}
clip_image_size original_size{img->nx, img->ny};
bool pad_to_square = true;
auto & params = ctx->vision_model.hparams;
// The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing
if (params.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD) {
pad_to_square = false;
}
if (clip_is_minicpmv(ctx)) { if (clip_is_minicpmv(ctx)) {
int max_slice_nums = 9; auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
std::vector<std::vector<clip_image_u8_ptr>> imgs = uhd_slice_image(img, max_slice_nums); std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
for (size_t i = 0; i < imgs.size(); ++i) { for (size_t i = 0; i < imgs.size(); ++i) {
for (size_t j = 0; j < imgs[i].size(); ++j) { // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
clip_image_f32_ptr res(clip_image_f32_init()); clip_image_f32_ptr res(clip_image_f32_init());
normalize_image_u8_to_f32(*imgs[i][j], *res, ctx->image_mean, ctx->image_std); normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std);
res_imgs->entries.push_back(std::move(res)); res_imgs->entries.push_back(std::move(res));
} }
}
return true; return true;
} }
else if (ctx->has_qwen2vl_merger) { else if (ctx->has_qwen2vl_merger) {
...@@ -2109,7 +2224,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str ...@@ -2109,7 +2224,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
auto patch_size = clip_get_patch_size(ctx) * 2; auto patch_size = clip_get_patch_size(ctx) * 2;
int nx = ceil((float)img->nx / patch_size) * patch_size; int nx = ceil((float)img->nx / patch_size) * patch_size;
int ny = ceil((float)img->ny / patch_size) * patch_size; int ny = ceil((float)img->ny / patch_size) * patch_size;
bicubic_resize(*img, resized, nx, ny); image_manipulation::bicubic_resize(*img, resized, nx, ny);
clip_image_f32_ptr img_f32(clip_image_f32_init()); clip_image_f32_ptr img_f32(clip_image_f32_init());
// clip_image_f32_ptr res(clip_image_f32_init()); // clip_image_f32_ptr res(clip_image_f32_init());
...@@ -2121,8 +2236,8 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str ...@@ -2121,8 +2236,8 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
if (ctx->has_glm_projector || ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { if (ctx->has_glm_projector || ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
clip_image_u8 resized_image; clip_image_u8 resized_image;
int32_t sz=ctx->vision_model.hparams.image_size; int sz = params.image_size;
bicubic_resize(*img, resized_image,sz,sz); image_manipulation::bicubic_resize(*img, resized_image, sz, sz);
clip_image_f32_ptr img_f32(clip_image_f32_init()); clip_image_f32_ptr img_f32(clip_image_f32_init());
//clip_image_save_to_bmp(resized_image, "resized.bmp"); //clip_image_save_to_bmp(resized_image, "resized.bmp");
normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std); normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std);
...@@ -2130,156 +2245,47 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str ...@@ -2130,156 +2245,47 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
return true; return true;
} }
bool pad_to_square = true;
if (!ctx->has_vision_encoder) {
LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__);
return false;
}
auto & params = ctx->vision_model.hparams;
// The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing
if (params.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD) {
pad_to_square = false;
}
// free the previous res_imgs if any set
res_imgs->entries.clear();
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily
if (pad_to_square && img->nx != img->ny) {
int longer_side = std::max(img->nx, img->ny); if (pad_to_square) {
// for llava-1.5, we resize image to a square, and pad the shorter side with a background color
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
const int longer_side = std::max(img->nx, img->ny);
temp->nx = longer_side; temp->nx = longer_side;
temp->ny = longer_side; temp->ny = longer_side;
temp->buf.resize(3 * longer_side * longer_side); temp->buf.resize(3 * longer_side * longer_side);
const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA (this is the mean rgb color * 255)
// fill with background color // background color in RGB from LLaVA (this is the mean rgb color * 255)
for (size_t i = 0; i < temp->buf.size(); i++) { const std::array<uint8_t, 3> pad_color = {122, 116, 104};
temp->buf[i] = bc[i % 3];
} // resize the image to the target_size
image_manipulation::resize_and_pad_image(*img, *temp, clip_image_size{params.image_size, params.image_size}, pad_color);
// copy from the input image
for (int y = 0; y < img->ny; y++) {
for (int x = 0; x < img->nx; x++) {
const int i = 3 * (y * img->nx + x);
const int j = 3 * (y * temp->nx + x);
temp->buf[j] = img->buf[i];
temp->buf[j+1] = img->buf[i+1];
temp->buf[j+2] = img->buf[i+2];
}
}
} else {
if (!params.image_grid_pinpoints.empty()) {
// "spatial_unpad" with "anyres" processing for llava-1.6
std::vector<std::pair<int, int>> possible_resolutions;
for (size_t i = 0; i < params.image_grid_pinpoints.size(); i+=2) {
possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
}
std::pair<int, int> best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions);
// clip_image_save_to_bmp(*img, "input.bmp");
resize_and_pad_image(*img, *temp, best_resolution); // we do not pad with mean-bg color anymore in llava-1.6
// clip_image_save_to_bmp(*temp, "resized.bmp");
// visually verify normalized image:
// normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std);
// {
// clip_image_u8 * temp2 = clip_image_u8_init();
// clip_image_convert_f32_to_u8(*res, *temp2);
// clip_image_save_to_bmp(*temp2, "resized_normalized_f32.bmp");
// clip_image_u8_free(temp2);
// }
std::vector<clip_image_u8_ptr> patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6)
clip_image_u8_ptr image_original_resize(clip_image_u8_init());
// bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
patches.insert(patches.begin(), std::move(image_original_resize));
for (auto & patch : patches) {
clip_image_f32_ptr res(clip_image_f32_init()); clip_image_f32_ptr res(clip_image_f32_init());
normalize_image_u8_to_f32(*patch, *res, ctx->image_mean, ctx->image_std); normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std);
res_imgs->entries.push_back(std::move(res)); res_imgs->entries.push_back(std::move(res));
}
return true; return true;
} else {
temp->nx = img->nx;
temp->ny = img->ny;
temp->buf.resize(img->buf.size());
memcpy(temp->buf.data(), img->buf.data(), temp->buf.size());
}
}
const int nx = temp->nx; } else if (!params.image_grid_pinpoints.empty()) {
const int ny = temp->ny; // "spatial_unpad" with "anyres" processing for llava-1.6
// clip_image_save_to_bmp(*temp, "resized_vanilla.bmp"); auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
const int nx2 = ctx->vision_model.hparams.image_size; for (size_t i = 0; i < imgs.size(); ++i) {
const int ny2 = ctx->vision_model.hparams.image_size; // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
clip_image_f32_ptr res(clip_image_f32_init()); clip_image_f32_ptr res(clip_image_f32_init());
res->nx = nx2; normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std);
res->ny = ny2; res_imgs->entries.push_back(std::move(res));
res->buf.resize(3 * nx2 * ny2);
const float scale = std::max(nx, ny) / (float)ctx->vision_model.hparams.image_size;
const int nx3 = int(nx / scale + 0.5f);
const int ny3 = int(ny / scale + 0.5f);
const auto & m3 = ctx->image_mean; // {0.48145466f, 0.4578275f, 0.40821073f};
const auto & s3 = ctx->image_std; // {0.26862954f, 0.26130258f, 0.27577711f};
for (int y = 0; y < ny3; y++) {
for (int x = 0; x < nx3; x++) {
for (int c = 0; c < 3; c++) {
// linear interpolation
const float sx = (x + 0.5f) * scale - 0.5f;
const float sy = (y + 0.5f) * scale - 0.5f;
const int x0 = std::max(0, (int)std::floor(sx));
const int y0 = std::max(0, (int)std::floor(sy));
const int x1 = std::min(x0 + 1, nx - 1);
const int y1 = std::min(y0 + 1, ny - 1);
const float dx = sx - x0;
const float dy = sy - y0;
const int j00 = 3 * (y0 * nx + x0) + c;
const int j01 = 3 * (y0 * nx + x1) + c;
const int j10 = 3 * (y1 * nx + x0) + c;
const int j11 = 3 * (y1 * nx + x1) + c;
const float v00 = temp->buf[j00];
const float v01 = temp->buf[j01];
const float v10 = temp->buf[j10];
const float v11 = temp->buf[j11];
const float v0 = v00 * (1.0f - dx) + v01 * dx;
const float v1 = v10 * (1.0f - dx) + v11 * dx;
const float v = v0 * (1.0f - dy) + v1 * dy;
const uint8_t v2 = std::min(std::max(std::round(v), 0.0f), 255.0f);
const int i = 3 * (y * nx3 + x) + c;
res->buf[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c];
}
}
} }
// { return true;
// clip_image_u8 * temp2 = clip_image_u8_init();
// clip_image_convert_f32_to_u8(*res, *temp2);
// clip_image_save_to_bmp(*temp2, "resized_normalized_f32_vanilla.bmp");
// clip_image_u8_free(temp2);
// }
// res_imgs.push_back(res);
res_imgs->entries.push_back(std::move(res)); }
return true; GGML_ASSERT(false && "Unknown image preprocessing type");
} }
ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) { ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
......
...@@ -145,6 +145,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = { ...@@ -145,6 +145,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" }, { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
{ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" }, { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" }, { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
...@@ -1142,6 +1144,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N ...@@ -1142,6 +1144,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" }, { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
{ LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" }, { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
{ LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" }, { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
{ LLM_TENSOR_ATTN_K_B, "blk.%d.attn_k_b" },
{ LLM_TENSOR_ATTN_V_B, "blk.%d.attn_v_b" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
...@@ -1636,23 +1640,8 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = { ...@@ -1636,23 +1640,8 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_K_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_V_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_QKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_FFN_DOWN_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_FFN_GATE_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_FFN_UP_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_Q_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_DEC_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_DEC_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
......
...@@ -149,6 +149,8 @@ enum llm_kv { ...@@ -149,6 +149,8 @@ enum llm_kv {
LLM_KV_ATTENTION_SCALE, LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS, LLM_KV_ROPE_DIMENSION_SECTIONS,
...@@ -311,6 +313,8 @@ enum llm_tensor { ...@@ -311,6 +313,8 @@ enum llm_tensor {
LLM_TENSOR_ATTN_Q_B, LLM_TENSOR_ATTN_Q_B,
LLM_TENSOR_ATTN_KV_A_MQA, LLM_TENSOR_ATTN_KV_A_MQA,
LLM_TENSOR_ATTN_KV_B, LLM_TENSOR_ATTN_KV_B,
LLM_TENSOR_ATTN_K_B,
LLM_TENSOR_ATTN_V_B,
LLM_TENSOR_ATTN_Q_A_NORM, LLM_TENSOR_ATTN_Q_A_NORM,
LLM_TENSOR_ATTN_KV_A_NORM, LLM_TENSOR_ATTN_KV_A_NORM,
LLM_TENSOR_ATTN_SUB_NORM, LLM_TENSOR_ATTN_SUB_NORM,
......
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#include <cstring> #include <cstring>
#include <stdexcept> #include <stdexcept>
#include <cinttypes> #include <cinttypes>
#include <cmath>
// //
// llama_context // llama_context
...@@ -473,7 +474,6 @@ ggml_tensor * llama_context::build_rope_shift( ...@@ -473,7 +474,6 @@ ggml_tensor * llama_context::build_rope_shift(
const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
const auto & yarn_ext_factor = cparams.yarn_ext_factor; const auto & yarn_ext_factor = cparams.yarn_ext_factor;
const auto & yarn_attn_factor = cparams.yarn_attn_factor;
const auto & yarn_beta_fast = cparams.yarn_beta_fast; const auto & yarn_beta_fast = cparams.yarn_beta_fast;
const auto & yarn_beta_slow = cparams.yarn_beta_slow; const auto & yarn_beta_slow = cparams.yarn_beta_slow;
...@@ -482,6 +482,10 @@ ggml_tensor * llama_context::build_rope_shift( ...@@ -482,6 +482,10 @@ ggml_tensor * llama_context::build_rope_shift(
const auto & n_rot = hparams.n_rot; const auto & n_rot = hparams.n_rot;
const auto & rope_type = hparams.rope_type; const auto & rope_type = hparams.rope_type;
// See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2 ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor;
ggml_tensor * tmp; ggml_tensor * tmp;
if (ggml_is_quantized(cur->type)) { if (ggml_is_quantized(cur->type)) {
......
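To make the DeepSeek2 YaRN correction above concrete, here is a tiny standalone computation of the same expression; freq_scale = 0.25f is an arbitrary example value, not something taken from this commit:

#include <cmath>
#include <cstdio>

int main() {
    const float freq_scale = 0.25f; // example only
    const float yarn_attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
    printf("yarn_attn_factor = %f\n", yarn_attn_factor); // ~0.878
    return 0;
}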
...@@ -1194,6 +1194,7 @@ ggml_tensor * llm_graph_context::build_attn_mha( ...@@ -1194,6 +1194,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
ggml_tensor * v, ggml_tensor * v,
ggml_tensor * kq_b, ggml_tensor * kq_b,
ggml_tensor * kq_mask, ggml_tensor * kq_mask,
ggml_tensor * v_mla,
bool v_trans, bool v_trans,
float kq_scale) const { float kq_scale) const {
//const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); //const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
...@@ -1205,8 +1206,6 @@ ggml_tensor * llm_graph_context::build_attn_mha( ...@@ -1205,8 +1206,6 @@ ggml_tensor * llm_graph_context::build_attn_mha(
//const auto & n_embd_head_k = hparams.n_embd_head_k; //const auto & n_embd_head_k = hparams.n_embd_head_k;
//const auto & n_embd_head_v = hparams.n_embd_head_v; //const auto & n_embd_head_v = hparams.n_embd_head_v;
const auto n_embd_head_v = v_trans ? v->ne[1] : v->ne[0];
const auto n_tokens = q->ne[1]; const auto n_tokens = q->ne[1];
const auto n_head = q->ne[2]; const auto n_head = q->ne[2];
const auto n_kv = k->ne[1]; const auto n_kv = k->ne[1];
...@@ -1235,7 +1234,12 @@ ggml_tensor * llm_graph_context::build_attn_mha( ...@@ -1235,7 +1234,12 @@ ggml_tensor * llm_graph_context::build_attn_mha(
ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
cur = ggml_reshape_2d(ctx0, cur, n_embd_head_v*n_head, n_tokens); if (v_mla) {
cur = ggml_reshape_4d(ctx0, cur, v_mla->ne[0], 1, n_head, n_tokens);
cur = ggml_mul_mat(ctx0, v_mla, cur);
}
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
} else { } else {
ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
...@@ -1273,9 +1277,14 @@ ggml_tensor * llm_graph_context::build_attn_mha( ...@@ -1273,9 +1277,14 @@ ggml_tensor * llm_graph_context::build_attn_mha(
ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); // for MLA with the absorption optimization, we need to "decompress" from MQA back to MHA
if (v_mla) {
kqv = ggml_mul_mat(ctx0, v_mla, kqv);
}
cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens); cur = ggml_cont_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
if (!cparams.offload_kqv) { if (!cparams.offload_kqv) {
// all nodes between the KV store and the attention output are run on the CPU // all nodes between the KV store and the attention output are run on the CPU
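For readers unfamiliar with the MLA absorption trick referenced above: attention is computed per head in the compressed kv_lora_rank space, and v_mla is the per-head projection that expands each head's output back to the full value head size. A plain-loop sketch of that single projection (illustrative only, independent of ggml, and not part of the commit):

#include <vector>

// out[r] = sum_c wv_b[r][c] * latent[c], for one head and one token
static std::vector<float> decompress_head(const std::vector<float> & latent,   // [kv_lora_rank]
                                          const std::vector<float> & wv_b,     // [n_embd_head_v_mla * kv_lora_rank]
                                          int n_embd_head_v_mla, int kv_lora_rank) {
    std::vector<float> out(n_embd_head_v_mla, 0.0f);
    for (int r = 0; r < n_embd_head_v_mla; ++r) {
        for (int c = 0; c < kv_lora_rank; ++c) {
            out[r] += wv_b[(size_t)r * kv_lora_rank + c] * latent[c];
        }
    }
    return out;
}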
...@@ -1310,6 +1319,7 @@ ggml_tensor * llm_graph_context::build_attn( ...@@ -1310,6 +1319,7 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * k_cur, ggml_tensor * k_cur,
ggml_tensor * v_cur, ggml_tensor * v_cur,
ggml_tensor * kq_b, ggml_tensor * kq_b,
ggml_tensor * v_mla,
float kq_scale, float kq_scale,
int il) const { int il) const {
GGML_UNUSED(n_tokens); GGML_UNUSED(n_tokens);
...@@ -1331,7 +1341,7 @@ ggml_tensor * llm_graph_context::build_attn( ...@@ -1331,7 +1341,7 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3); ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
//cb(k, "v", il); //cb(k, "v", il);
ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, false, kq_scale); ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, false, kq_scale);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
...@@ -1385,6 +1395,7 @@ ggml_tensor * llm_graph_context::build_attn( ...@@ -1385,6 +1395,7 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * k_cur, ggml_tensor * k_cur,
ggml_tensor * v_cur, ggml_tensor * v_cur,
ggml_tensor * kq_b, ggml_tensor * kq_b,
ggml_tensor * v_mla,
float kq_scale, float kq_scale,
int il) const { int il) const {
// these nodes are added to the graph together so that they are not reordered // these nodes are added to the graph together so that they are not reordered
...@@ -1470,7 +1481,7 @@ ggml_tensor * llm_graph_context::build_attn( ...@@ -1470,7 +1481,7 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_element_size(kv_self->v_l[il])*n_ctx*n_embd_head_v, ggml_element_size(kv_self->v_l[il])*n_ctx*n_embd_head_v,
0); 0);
ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_trans, kq_scale); ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, v_trans, kq_scale);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
if (wo) { if (wo) {
...@@ -1529,6 +1540,7 @@ ggml_tensor * llm_graph_context::build_attn( ...@@ -1529,6 +1540,7 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * k_cur, ggml_tensor * k_cur,
ggml_tensor * v_cur, ggml_tensor * v_cur,
ggml_tensor * kq_b, ggml_tensor * kq_b,
ggml_tensor * v_mla,
float kq_scale, float kq_scale,
int il) const { int il) const {
// these nodes are added to the graph together so that they are not reordered // these nodes are added to the graph together so that they are not reordered
...@@ -1548,7 +1560,7 @@ ggml_tensor * llm_graph_context::build_attn( ...@@ -1548,7 +1560,7 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3); ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
//cb(k, "v", il); //cb(k, "v", il);
ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, false, kq_scale); ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, false, kq_scale);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
...@@ -1717,4 +1729,3 @@ void llm_graph_context::build_pooling( ...@@ -1717,4 +1729,3 @@ void llm_graph_context::build_pooling(
ggml_build_forward_expand(gf, cur); ggml_build_forward_expand(gf, cur);
} }
...@@ -522,6 +522,7 @@ struct llm_graph_context { ...@@ -522,6 +522,7 @@ struct llm_graph_context {
ggml_tensor * v, // [n_embd_head_v, n_tokens, n_head_v] (v_trans == false) ggml_tensor * v, // [n_embd_head_v, n_tokens, n_head_v] (v_trans == false)
ggml_tensor * kq_b, ggml_tensor * kq_b,
ggml_tensor * kq_mask, ggml_tensor * kq_mask,
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
bool v_trans, bool v_trans,
float kq_scale) const; float kq_scale) const;
...@@ -536,6 +537,7 @@ struct llm_graph_context { ...@@ -536,6 +537,7 @@ struct llm_graph_context {
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
ggml_tensor * kq_b, ggml_tensor * kq_b,
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
float kq_scale, float kq_scale,
int il) const; int il) const;
...@@ -550,6 +552,7 @@ struct llm_graph_context { ...@@ -550,6 +552,7 @@ struct llm_graph_context {
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
ggml_tensor * kq_b, ggml_tensor * kq_b,
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
float kq_scale, float kq_scale,
int il) const; int il) const;
...@@ -564,6 +567,7 @@ struct llm_graph_context { ...@@ -564,6 +567,7 @@ struct llm_graph_context {
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
ggml_tensor * kq_b, ggml_tensor * kq_b,
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
float kq_scale, float kq_scale,
int il) const; int il) const;
......
...@@ -46,6 +46,10 @@ struct llama_hparams { ...@@ -46,6 +46,10 @@ struct llama_hparams {
uint32_t n_rel_attn_bkts = 0; uint32_t n_rel_attn_bkts = 0;
uint32_t n_vocab = 0; uint32_t n_vocab = 0;
// note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
uint32_t n_embd_head_k_mla = 0;
uint32_t n_embd_head_v_mla = 0;
// for WavTokenizer // for WavTokenizer
struct llama_hparams_posnet posnet; struct llama_hparams_posnet posnet;
struct llama_hparams_convnext convnext; struct llama_hparams_convnext convnext;
......
...@@ -27,7 +27,7 @@ bool llama_kv_cache_unified::init( ...@@ -27,7 +27,7 @@ bool llama_kv_cache_unified::init(
recurrent = llama_model_is_recurrent(&model); recurrent = llama_model_is_recurrent(&model);
v_trans = !recurrent && !cparams.flash_attn; v_trans = !recurrent && !cparams.flash_attn;
can_shift = !recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA can_shift = !recurrent;
LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n", LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n",
__func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, can_shift); __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, can_shift);
......
...@@ -1170,6 +1170,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { ...@@ -1170,6 +1170,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q); ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
} }
ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv); ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false);
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared); ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale); ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
...@@ -3281,8 +3283,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) { ...@@ -3281,8 +3283,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
{ {
const bool is_lite = (hparams.n_layer == 27); const bool is_lite = (hparams.n_layer == 27);
   
const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
// note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
const int64_t n_embd_head_k_mla = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
const int64_t n_embd_head_v_mla = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
const int64_t n_embd_head_qk_rope = hparams.n_rot; const int64_t n_embd_head_qk_rope = hparams.n_rot;
const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
   
const int64_t q_lora_rank = hparams.n_lora_q; const int64_t q_lora_rank = hparams.n_lora_q;
const int64_t kv_lora_rank = hparams.n_lora_kv; const int64_t kv_lora_rank = hparams.n_lora_kv;
...@@ -3308,14 +3316,22 @@ bool llama_model::load_tensors(llama_model_loader & ml) { ...@@ -3308,14 +3316,22 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
   
if (!is_lite) { if (!is_lite) {
layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0); layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0); layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
} else { } else {
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0); layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
} }
   
layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0); layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, 0);
layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0); // note: only old legacy GGUF files will have the unsplit wkv_b tensor in
if (is_mla) {
layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
} else {
layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
}
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
   
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
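Reading the shapes off this hunk: a legacy, unsplit wkv_b of shape {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)} holds, per head, a key-nope block and a value block side by side. The split tensors expected from newer conversions are those same blocks stored separately (wk_b transposed so it can be multiplied directly against q_nope):

    wk_b : {n_embd_head_qk_nope, kv_lora_rank, n_head}   // per-head K_nope projection, transposed
    wv_b : {kv_lora_rank, n_embd_head_v_mla, n_head}     // per-head V projection

The actual slicing/permutation presumably happens at GGUF conversion time; it is not part of this diff.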
   
...@@ -4394,6 +4410,8 @@ void llama_model::print_info() const { ...@@ -4394,6 +4410,8 @@ void llama_model::print_info() const {
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead); LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q); LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv); LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla);
LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla);
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared); LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
...@@ -4600,7 +4618,7 @@ struct llm_build_llama : public llm_graph_context { ...@@ -4600,7 +4618,7 @@ struct llm_build_llama : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, kq_scale, il); Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il); cb(cur, "attn_out", il);
} }
   
...@@ -4910,7 +4928,7 @@ struct llm_build_mllama: public llm_graph_context { ...@@ -4910,7 +4928,7 @@ struct llm_build_mllama: public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
// skip computing output for unused tokens // skip computing output for unused tokens
...@@ -5053,7 +5071,7 @@ struct llm_build_deci : public llm_graph_context { ...@@ -5053,7 +5071,7 @@ struct llm_build_deci : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, kq_scale, il); Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -5195,7 +5213,7 @@ struct llm_build_baichuan : public llm_graph_context { ...@@ -5195,7 +5213,7 @@ struct llm_build_baichuan : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -5310,7 +5328,7 @@ struct llm_build_xverse : public llm_graph_context { ...@@ -5310,7 +5328,7 @@ struct llm_build_xverse : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -5435,7 +5453,7 @@ struct llm_build_falcon : public llm_graph_context { ...@@ -5435,7 +5453,7 @@ struct llm_build_falcon : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -5565,7 +5583,7 @@ struct llm_build_grok : public llm_graph_context { ...@@ -5565,7 +5583,7 @@ struct llm_build_grok : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f, il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -5716,7 +5734,7 @@ struct llm_build_dbrx : public llm_graph_context { ...@@ -5716,7 +5734,7 @@ struct llm_build_dbrx : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -5830,7 +5848,7 @@ struct llm_build_starcoder : public llm_graph_context { ...@@ -5830,7 +5848,7 @@ struct llm_build_starcoder : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -5929,7 +5947,7 @@ struct llm_build_refact : public llm_graph_context { ...@@ -5929,7 +5947,7 @@ struct llm_build_refact : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -6083,7 +6101,7 @@ struct llm_build_bert : public llm_graph_context { ...@@ -6083,7 +6101,7 @@ struct llm_build_bert : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
   
if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) { if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
...@@ -6200,7 +6218,7 @@ struct llm_build_bloom : public llm_graph_context { ...@@ -6200,7 +6218,7 @@ struct llm_build_bloom : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -6341,7 +6359,7 @@ struct llm_build_mpt : public llm_graph_context { ...@@ -6341,7 +6359,7 @@ struct llm_build_mpt : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -6487,7 +6505,7 @@ struct llm_build_stablelm : public llm_graph_context { ...@@ -6487,7 +6505,7 @@ struct llm_build_stablelm : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -6610,7 +6628,7 @@ struct llm_build_qwen : public llm_graph_context { ...@@ -6610,7 +6628,7 @@ struct llm_build_qwen : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -6730,7 +6748,7 @@ struct llm_build_qwen2 : public llm_graph_context { ...@@ -6730,7 +6748,7 @@ struct llm_build_qwen2 : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -6851,7 +6869,7 @@ struct llm_build_qwen2vl : public llm_graph_context { ...@@ -6851,7 +6869,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -6978,7 +6996,7 @@ struct llm_build_qwen2moe : public llm_graph_context { ...@@ -6978,7 +6996,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -7131,7 +7149,7 @@ struct llm_build_qwen3 : public llm_graph_context { ...@@ -7131,7 +7149,7 @@ struct llm_build_qwen3 : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -7252,7 +7270,7 @@ struct llm_build_qwen3moe : public llm_graph_context { ...@@ -7252,7 +7270,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -7392,7 +7410,7 @@ struct llm_build_phi2 : public llm_graph_context { ...@@ -7392,7 +7410,7 @@ struct llm_build_phi2 : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f, il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -7521,7 +7539,7 @@ struct llm_build_phi3 : public llm_graph_context { ...@@ -7521,7 +7539,7 @@ struct llm_build_phi3 : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f, il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -7656,7 +7674,7 @@ struct llm_build_plamo : public llm_graph_context { ...@@ -7656,7 +7674,7 @@ struct llm_build_plamo : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
ggml_tensor * sa_out = cur; ggml_tensor * sa_out = cur;
   
...@@ -7763,7 +7781,7 @@ struct llm_build_gpt2 : public llm_graph_context { ...@@ -7763,7 +7781,7 @@ struct llm_build_gpt2 : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -7879,7 +7897,7 @@ struct llm_build_codeshell : public llm_graph_context { ...@@ -7879,7 +7897,7 @@ struct llm_build_codeshell : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -8008,7 +8026,7 @@ struct llm_build_orion : public llm_graph_context { ...@@ -8008,7 +8026,7 @@ struct llm_build_orion : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -8135,7 +8153,7 @@ struct llm_build_internlm2 : public llm_graph_context { ...@@ -8135,7 +8153,7 @@ struct llm_build_internlm2 : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -8332,7 +8350,7 @@ struct llm_build_minicpm3 : public llm_graph_context { ...@@ -8332,7 +8350,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
q_states, k_states, v_states, nullptr, kq_scale, il); q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -8462,7 +8480,7 @@ struct llm_build_gemma : public llm_graph_context { ...@@ -8462,7 +8480,7 @@ struct llm_build_gemma : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f, il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -8584,7 +8602,7 @@ struct llm_build_gemma2 : public llm_graph_context { ...@@ -8584,7 +8602,7 @@ struct llm_build_gemma2 : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f, il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
} }
   
cur = build_norm(cur, cur = build_norm(cur,
...@@ -8725,7 +8743,7 @@ struct llm_build_gemma3 : public llm_graph_context { ...@@ -8725,7 +8743,7 @@ struct llm_build_gemma3 : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, hparams.f_attention_scale, il); Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
} }
   
cur = build_norm(cur, cur = build_norm(cur,
...@@ -8865,7 +8883,7 @@ struct llm_build_starcoder2 : public llm_graph_context { ...@@ -8865,7 +8883,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -9200,7 +9218,7 @@ struct llm_build_command_r : public llm_graph_context { ...@@ -9200,7 +9218,7 @@ struct llm_build_command_r : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -9335,7 +9353,7 @@ struct llm_build_cohere2 : public llm_graph_context { ...@@ -9335,7 +9353,7 @@ struct llm_build_cohere2 : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -9466,7 +9484,7 @@ struct llm_build_olmo : public llm_graph_context { ...@@ -9466,7 +9484,7 @@ struct llm_build_olmo : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, nullptr, model.layers[il].wo, nullptr,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -9586,7 +9604,7 @@ struct llm_build_olmo2 : public llm_graph_context { ...@@ -9586,7 +9604,7 @@ struct llm_build_olmo2 : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
cur = build_norm(cur, cur = build_norm(cur,
...@@ -9719,7 +9737,7 @@ struct llm_build_olmoe : public llm_graph_context { ...@@ -9719,7 +9737,7 @@ struct llm_build_olmoe : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -9852,7 +9870,7 @@ struct llm_build_openelm : public llm_graph_context { ...@@ -9852,7 +9870,7 @@ struct llm_build_openelm : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -9966,7 +9984,7 @@ struct llm_build_gptneox : public llm_graph_context { ...@@ -9966,7 +9984,7 @@ struct llm_build_gptneox : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -10116,7 +10134,7 @@ struct llm_build_arctic : public llm_graph_context { ...@@ -10116,7 +10134,7 @@ struct llm_build_arctic : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -10271,7 +10289,7 @@ struct llm_build_deepseek : public llm_graph_context { ...@@ -10271,7 +10289,7 @@ struct llm_build_deepseek : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, kq_scale, il); Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -10361,15 +10379,22 @@ struct llm_build_deepseek2 : public llm_graph_context { ...@@ -10361,15 +10379,22 @@ struct llm_build_deepseek2 : public llm_graph_context {
llm_build_deepseek2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { llm_build_deepseek2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
bool is_lite = (hparams.n_layer == 27); bool is_lite = (hparams.n_layer == 27);
   
const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
// note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
const int64_t n_embd_head_k = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
const int64_t n_embd_head_v = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
const int64_t n_embd_head_qk_rope = hparams.n_rot;
const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
const uint32_t kv_lora_rank = hparams.n_lora_kv;
// We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly. // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation. // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale)); const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k)); const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(n_embd_head_k));
const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)); const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
const uint32_t n_embd_head_qk_rope = hparams.n_rot;
const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
const uint32_t kv_lora_rank = hparams.n_lora_kv;
   
ggml_tensor * cur; ggml_tensor * cur;
ggml_tensor * inpL; ggml_tensor * inpL;
...@@ -10395,16 +10420,14 @@ struct llm_build_deepseek2 : public llm_graph_context { ...@@ -10395,16 +10420,14 @@ struct llm_build_deepseek2 : public llm_graph_context {
{ {
ggml_tensor * q = NULL; ggml_tensor * q = NULL;
if (!is_lite) { if (!is_lite) {
// {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
cb(q, "q", il); cb(q, "q", il);
   
q = build_norm(q, q = build_norm(q,
model.layers[il].attn_q_a_norm, NULL, model.layers[il].attn_q_a_norm, nullptr,
LLM_NORM_RMS, il); LLM_NORM_RMS, il);
cb(q, "q", il); cb(q, "q", il);
   
// {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q); q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
cb(q, "q", il); cb(q, "q", il);
} else { } else {
...@@ -10412,96 +10435,125 @@ struct llm_build_deepseek2 : public llm_graph_context { ...@@ -10412,96 +10435,125 @@ struct llm_build_deepseek2 : public llm_graph_context {
cb(q, "q", il); cb(q, "q", il);
} }
   
// split into {n_head * n_embd_head_qk_nope, n_tokens} // split into {n_embd_head_qk_nope, n_head, n_tokens}
ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, ggml_tensor * q_nope = ggml_view_3d(ctx0, q,
ggml_row_size(q->type, hparams.n_embd_head_k), n_embd_head_qk_nope, n_head, n_tokens,
ggml_row_size(q->type, hparams.n_embd_head_k * n_head), ggml_row_size(q->type, n_embd_head_k),
ggml_row_size(q->type, n_embd_head_k) * n_head,
0); 0);
cb(q_nope, "q_nope", il); cb(q_nope, "q_nope", il);
   
// and {n_head * n_embd_head_qk_rope, n_tokens} // and {n_embd_head_qk_rope, n_head, n_tokens}
ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, ggml_tensor * q_pe = ggml_view_3d(ctx0, q,
ggml_row_size(q->type, hparams.n_embd_head_k), n_embd_head_qk_rope, n_head, n_tokens,
ggml_row_size(q->type, hparams.n_embd_head_k * n_head), ggml_row_size(q->type, n_embd_head_k),
ggml_row_size(q->type, n_embd_head_k) * n_head,
ggml_row_size(q->type, n_embd_head_qk_nope)); ggml_row_size(q->type, n_embd_head_qk_nope));
cb(q_pe, "q_pe", il); cb(q_pe, "q_pe", il);
   
// {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens} ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); cb(kv_cmpr_pe, "kv_cmpr_pe", il);
cb(kv_pe_compresseed, "kv_pe_compresseed", il);
   
// split into {kv_lora_rank, n_tokens} // split into {kv_lora_rank, n_tokens}
ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens, ggml_tensor * kv_cmpr = ggml_view_2d(ctx0, kv_cmpr_pe,
kv_pe_compresseed->nb[1], kv_lora_rank, n_tokens,
ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
0); 0);
cb(kv_compressed, "kv_compressed", il); cb(kv_cmpr, "kv_cmpr", il);
// and {n_embd_head_qk_rope, 1, n_tokens}
ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe,
n_embd_head_qk_rope, 1, n_tokens,
ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
ggml_row_size(kv_cmpr_pe->type, kv_lora_rank));
cb(k_pe, "k_pe", il);
   
// and {n_embd_head_qk_rope, n_tokens} q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr,
ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
kv_pe_compresseed->nb[1], ext_factor, attn_factor, beta_fast, beta_slow
kv_pe_compresseed->nb[1], );
ggml_row_size(kv_pe_compresseed->type, kv_lora_rank)); cb(q_pe, "q_pe", il);
k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(k_pe, "k_pe", il); cb(k_pe, "k_pe", il);
   
// TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont kv_cmpr = build_norm(kv_cmpr,
kv_compressed = ggml_cont(ctx0, kv_compressed); model.layers[il].attn_kv_a_norm, nullptr,
kv_compressed = build_norm(kv_compressed,
model.layers[il].attn_kv_a_norm, NULL,
LLM_NORM_RMS, il); LLM_NORM_RMS, il);
cb(kv_compressed, "kv_compressed", il); cb(kv_cmpr, "kv_cmpr", il);
   
// {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} if (is_mla) {
ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed); // {n_embd_head_qk_nope, n_tokens, n_head}
cb(kv, "kv", il); q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
cb(q_nope, "q_nope_perm", il);
   
// split into {n_head * n_embd_head_qk_nope, n_tokens} // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v), cb(q_nope_absorbed, "q_nope_absorbed", il);
ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
0);
cb(k_nope, "k_nope", il);
   
// and {n_head * n_embd_head_v, n_tokens} // {kv_lora_rank, n_head, n_tokens}
ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3);
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)), cb(q_nope_absorbed, "q_nope_absorbed_perm", il);
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
ggml_row_size(kv->type, (n_embd_head_qk_nope)));
cb(v_states, "v_states", il);
   
v_states = ggml_cont(ctx0, v_states); // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
cb(v_states, "v_states", il); // note: rope must go first for in-place context shifting in build_rope_shift()
ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0);
cb(Qcur, "Qcur", il);
   
v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens, kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
ggml_row_size(kv->type, hparams.n_embd_head_v * n_head), cb(kv_cmpr, "kv_cmpr_reshape", il);
// {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
ggml_tensor * Kcur = ggml_concat(ctx0, k_pe, kv_cmpr, 0);
cb(Kcur, "Kcur", il);
// {kv_lora_rank, 1, n_tokens}
ggml_tensor * Vcur = kv_cmpr;
cb(Vcur, "Vcur", il);
// note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, model.layers[il].wv_b, kq_scale, il);
} else {
ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
cb(kv, "kv", il);
// split into {n_embd_head_qk_nope, n_head, n_tokens}
ggml_tensor * k_nope = ggml_view_3d(ctx0, kv,
n_embd_head_qk_nope, n_head, n_tokens,
ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
0); 0);
cb(v_states, "v_states", il); cb(k_nope, "k_nope_view", il);
   
q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this // and {n_embd_head_v, n_head, n_tokens}
q_pe = ggml_rope_ext( ggml_tensor * Vcur = ggml_view_3d(ctx0, kv,
ctx0, q_pe, inp_pos, nullptr, n_embd_head_v, n_head, n_tokens,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
ext_factor, attn_factor_scaled, beta_fast, beta_slow ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
); ggml_row_size(kv->type, n_embd_head_qk_nope));
cb(q_pe, "q_pe", il); cb(Vcur, "Vcur_view", il);
   
// shared RoPE key Vcur = ggml_cont(ctx0, Vcur);
k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this cb(Vcur, "Vcur_cont", il);
k_pe = ggml_rope_ext(
ctx0, k_pe, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor_scaled, beta_fast, beta_slow
);
cb(k_pe, "k_pe", il);
   
ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); // note: rope must go first for in-place context shifting in build_rope_shift()
cb(q_states, "q_states", il); ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope, 0);
cb(Qcur, "Qcur", il);
   
ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
cb(k_states, "k_states", il); cb(Kcur, "Kcur", il);
   
// note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
q_states, k_states, v_states, nullptr, kq_scale, il); Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
}
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
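The algebra behind the is_mla branch above, written out as a sketch (tensor orientations simplified; ggml's mul_mat layout differs in detail):

    // per head h and cached token t, with compressed latent c_t of width kv_lora_rank:
    //   score(h,t) = q_nope(h)^T · (wk_b(h) · c_t) + q_pe(h)^T · k_pe(t)
    //              = (wk_b(h)^T · q_nope(h))^T · c_t + q_pe(h)^T · k_pe(t)
    // so q_nope is "absorbed" once per query into q_nope_absorbed = wk_b · q_nope,
    // and attention runs against the shared [k_pe | c_t] row (MQA: one KV head).
    // Per-head values are recovered afterwards:
    //   out(h) = wv_b(h) · Σ_t attn(h,t) · c_t
    // which is why wv_b is handed to build_attn as the v_mla argument.

Mathematically this is equivalent to the else branch; only where the per-head projections are applied (before vs. after the KV cache) changes.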
...@@ -10667,7 +10719,7 @@ struct llm_build_bitnet : public llm_graph_context { ...@@ -10667,7 +10719,7 @@ struct llm_build_bitnet : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
NULL, NULL, NULL, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
   
cur = build_norm(cur, cur = build_norm(cur,
model.layers[il].attn_sub_norm, NULL, model.layers[il].attn_sub_norm, NULL,
...@@ -10790,7 +10842,7 @@ struct llm_build_t5_enc : public llm_graph_context { ...@@ -10790,7 +10842,7 @@ struct llm_build_t5_enc : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo_enc, nullptr, model.layers[il].wo_enc, nullptr,
Qcur, Kcur, Vcur, kq_b, 1.0f, il); Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
} }
   
...@@ -10896,7 +10948,7 @@ struct llm_build_t5_dec : public llm_graph_context { ...@@ -10896,7 +10948,7 @@ struct llm_build_t5_dec : public llm_graph_context {
   
cur = build_attn(inp_attn_self, gf, cur = build_attn(inp_attn_self, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, kq_b, 1.0f, il); Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
} }
   
...@@ -10928,7 +10980,7 @@ struct llm_build_t5_dec : public llm_graph_context { ...@@ -10928,7 +10980,7 @@ struct llm_build_t5_dec : public llm_graph_context {
   
cur = build_attn(inp_attn_cross, gf, cur = build_attn(inp_attn_cross, gf,
model.layers[il].wo_cross, nullptr, model.layers[il].wo_cross, nullptr,
Qcur, Kcur, Vcur, nullptr, 1.0f, il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
   
//ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); //ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
...@@ -11061,7 +11113,7 @@ struct llm_build_jais : public llm_graph_context { ...@@ -11061,7 +11113,7 @@ struct llm_build_jais : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/float(n_embd_head), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -11193,7 +11245,7 @@ struct llm_build_chatglm : public llm_graph_context { ...@@ -11193,7 +11245,7 @@ struct llm_build_chatglm : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -11326,7 +11378,7 @@ struct llm_build_glm4 : public llm_graph_context { ...@@ -11326,7 +11378,7 @@ struct llm_build_glm4 : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -11470,7 +11522,7 @@ struct llm_build_nemotron : public llm_graph_context { ...@@ -11470,7 +11522,7 @@ struct llm_build_nemotron : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -11601,7 +11653,7 @@ struct llm_build_exaone : public llm_graph_context { ...@@ -11601,7 +11653,7 @@ struct llm_build_exaone : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -12503,7 +12555,7 @@ struct llm_build_chameleon : public llm_graph_context { ...@@ -12503,7 +12555,7 @@ struct llm_build_chameleon : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, nullptr, model.layers[il].wo, nullptr,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
   
if (hparams.swin_norm) { if (hparams.swin_norm) {
cur = build_norm(cur, cur = build_norm(cur,
...@@ -12690,7 +12742,7 @@ struct llm_build_solar : public llm_graph_context { ...@@ -12690,7 +12742,7 @@ struct llm_build_solar : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, kq_scale, il); Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il); cb(cur, "attn_out", il);
} }
   
...@@ -13018,7 +13070,7 @@ struct llm_build_plm : public llm_graph_context { ...@@ -13018,7 +13070,7 @@ struct llm_build_plm : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
q_states, k_states, v_states, nullptr, kq_scale, il); q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
...@@ -13141,7 +13193,7 @@ struct llm_build_bailingmoe : public llm_graph_context { ...@@ -13141,7 +13193,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
   
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_rot)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
} }
   
if (il == n_layer - 1) { if (il == n_layer - 1) {
......
...@@ -174,6 +174,8 @@ struct llama_layer { ...@@ -174,6 +174,8 @@ struct llama_layer {
struct ggml_tensor * wq_b = nullptr; struct ggml_tensor * wq_b = nullptr;
struct ggml_tensor * wkv_a_mqa = nullptr; struct ggml_tensor * wkv_a_mqa = nullptr;
struct ggml_tensor * wkv_b = nullptr; struct ggml_tensor * wkv_b = nullptr;
struct ggml_tensor * wk_b = nullptr;
struct ggml_tensor * wv_b = nullptr;
struct ggml_tensor * wq_cross = nullptr; struct ggml_tensor * wq_cross = nullptr;
struct ggml_tensor * wk_cross = nullptr; struct ggml_tensor * wk_cross = nullptr;
struct ggml_tensor * wv_cross = nullptr; struct ggml_tensor * wv_cross = nullptr;
......
...@@ -1833,6 +1833,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { ...@@ -1833,6 +1833,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (false if (false
|| t.first == "<|fim_prefix|>" // Qwen || t.first == "<|fim_prefix|>" // Qwen
|| t.first == "<fim-prefix>" || t.first == "<fim-prefix>"
|| t.first == "<fim_prefix>" // Granite
|| t.first == "<|fim▁begin|>" // DeepSeek || t.first == "<|fim▁begin|>" // DeepSeek
|| t.first == "<PRE>" || t.first == "<PRE>"
|| t.first == "▁<PRE>" // CodeLlama || t.first == "▁<PRE>" // CodeLlama
...@@ -1851,6 +1852,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { ...@@ -1851,6 +1852,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (false if (false
|| t.first == "<|fim_suffix|>" // Qwen || t.first == "<|fim_suffix|>" // Qwen
|| t.first == "<fim-suffix>" || t.first == "<fim-suffix>"
|| t.first == "<fim_suffix>" // Granite
|| t.first == "<|fim▁hole|>" // DeepSeek || t.first == "<|fim▁hole|>" // DeepSeek
|| t.first == "<SUF>" || t.first == "<SUF>"
|| t.first == "▁<SUF>" // CodeLlama || t.first == "▁<SUF>" // CodeLlama
...@@ -1869,6 +1871,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { ...@@ -1869,6 +1871,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (false if (false
|| t.first == "<|fim_middle|>" // Qwen || t.first == "<|fim_middle|>" // Qwen
|| t.first == "<fim-middle>" || t.first == "<fim-middle>"
|| t.first == "<fim_middle>" // Granite
|| t.first == "<|fim▁end|>" // DeepSeek || t.first == "<|fim▁end|>" // DeepSeek
|| t.first == "<MID>" || t.first == "<MID>"
|| t.first == "▁<MID>" // CodeLlama || t.first == "▁<MID>" // CodeLlama
...@@ -1887,6 +1890,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { ...@@ -1887,6 +1890,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (false if (false
|| t.first == "<|fim_pad|>" // Qwen || t.first == "<|fim_pad|>" // Qwen
|| t.first == "<fim-pad>" || t.first == "<fim-pad>"
|| t.first == "<fim_pad>" // Granite
|| t.first == "<PAD>" || t.first == "<PAD>"
) { ) {
special_fim_pad_id = t.second; special_fim_pad_id = t.second;
...@@ -1905,6 +1909,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { ...@@ -1905,6 +1909,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|repo_name|>" || t.first == "<|repo_name|>"
|| t.first == "<fim-repo>" || t.first == "<fim-repo>"
|| t.first == "<REPO>" || t.first == "<REPO>"
|| t.first == "<reponame>" // Granite
) { ) {
special_fim_rep_id = t.second; special_fim_rep_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
......
...@@ -65,10 +65,10 @@ index 273075f4..dd11f304 100644 ...@@ -65,10 +65,10 @@ index 273075f4..dd11f304 100644
/* .init_tensor = */ NULL, // no initialization required /* .init_tensor = */ NULL, // no initialization required
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor, /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index cec36b36..4b057973 100644 index e2617b06..242e50a7 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp --- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -530,6 +530,7 @@ static void ggml_backend_cann_buffer_free_buffer( @@ -800,6 +800,7 @@ static void ggml_backend_cann_buffer_free_buffer(
ggml_backend_cann_buffer_context* ctx = ggml_backend_cann_buffer_context* ctx =
(ggml_backend_cann_buffer_context*)buffer->context; (ggml_backend_cann_buffer_context*)buffer->context;
delete ctx; delete ctx;
...@@ -76,7 +76,7 @@ index cec36b36..4b057973 100644 ...@@ -76,7 +76,7 @@ index cec36b36..4b057973 100644
} }
/** /**
@@ -1199,6 +1200,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf @@ -1472,6 +1473,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
*/ */
static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) { static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
ACL_CHECK(aclrtFreeHost(buffer->context)); ACL_CHECK(aclrtFreeHost(buffer->context));
...@@ -85,10 +85,10 @@ index cec36b36..4b057973 100644 ...@@ -85,10 +85,10 @@ index cec36b36..4b057973 100644
/** /**
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index fafe9633..59a49560 100644 index a7febef7..31750b6f 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -533,6 +533,7 @@ struct ggml_backend_cuda_buffer_context { @@ -534,6 +534,7 @@ struct ggml_backend_cuda_buffer_context {
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
delete ctx; delete ctx;
...@@ -96,7 +96,7 @@ index fafe9633..59a49560 100644 ...@@ -96,7 +96,7 @@ index fafe9633..59a49560 100644
} }
static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) { static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
@@ -788,6 +789,7 @@ struct ggml_backend_cuda_split_buffer_context { @@ -789,6 +790,7 @@ struct ggml_backend_cuda_split_buffer_context {
static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
delete ctx; delete ctx;
...@@ -104,7 +104,7 @@ index fafe9633..59a49560 100644 ...@@ -104,7 +104,7 @@ index fafe9633..59a49560 100644
} }
static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -1061,6 +1063,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_ @@ -1062,6 +1064,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
CUDA_CHECK(cudaFreeHost(buffer->context)); CUDA_CHECK(cudaFreeHost(buffer->context));
...@@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644 ...@@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644
static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 9f1c6c6c..310afe8a 100644 index 266d8af4..12886cd3 100644
--- a/ggml/src/ggml-metal/ggml-metal.m --- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -4641,6 +4641,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) @@ -4759,6 +4759,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
} }
free(ctx); free(ctx);
...@@ -137,10 +137,10 @@ index 9f1c6c6c..310afe8a 100644 ...@@ -137,10 +137,10 @@ index 9f1c6c6c..310afe8a 100644
static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index b8b5cbd3..14d4561b 100644 index 05a2f4e6..392cc18d 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp --- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -1443,6 +1443,7 @@ struct ggml_backend_opencl_buffer_context { @@ -1940,6 +1940,7 @@ struct ggml_backend_opencl_buffer_context {
static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
delete ctx; delete ctx;
...@@ -149,10 +149,10 @@ index b8b5cbd3..14d4561b 100644 ...@@ -149,10 +149,10 @@ index b8b5cbd3..14d4561b 100644
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index 862b9b66..34536681 100644 index a0667b7d..bd83adc5 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp --- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -443,6 +443,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) { @@ -468,6 +468,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0); bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
GGML_ASSERT(status); GGML_ASSERT(status);
delete ctx; delete ctx;
...@@ -161,7 +161,7 @@ index 862b9b66..34536681 100644 ...@@ -161,7 +161,7 @@ index 862b9b66..34536681 100644
static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 3e48a924..a3d182fc 100644 index 1de34c96..4600f61e 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp --- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -316,6 +316,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try { @@ -316,6 +316,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
...@@ -189,10 +189,10 @@ index 3e48a924..a3d182fc 100644 ...@@ -189,10 +189,10 @@ index 3e48a924..a3d182fc 100644
static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 783a0ff8..8ac1e07e 100644 index 39f3cd34..c569a8a5 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -8639,6 +8639,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { @@ -8653,6 +8653,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
ggml_vk_destroy_buffer(ctx->dev_buffer); ggml_vk_destroy_buffer(ctx->dev_buffer);
delete ctx; delete ctx;
...@@ -200,7 +200,7 @@ index 783a0ff8..8ac1e07e 100644 ...@@ -200,7 +200,7 @@ index 783a0ff8..8ac1e07e 100644
} }
static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -8782,6 +8783,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe @@ -8796,6 +8797,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()"); VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
ggml_vk_host_free(vk_instance.devices[0], buffer->context); ggml_vk_host_free(vk_instance.devices[0], buffer->context);
......
...@@ -10,7 +10,7 @@ logs instead of throwing an error ...@@ -10,7 +10,7 @@ logs instead of throwing an error
1 file changed, 3 insertions(+), 11 deletions(-) 1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 464ff01e..0125ee53 100644 index 48060517..a35b498c 100644
--- a/src/llama-vocab.cpp --- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp
@@ -1491,16 +1491,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { @@ -1491,16 +1491,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
......
...@@ -11,10 +11,10 @@ instead of forcing one or the error ...@@ -11,10 +11,10 @@ instead of forcing one or the error
1 file changed, 3 insertions(+), 3 deletions(-) 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 4735e98e..65135172 100644 index 983385f8..32f59819 100644
--- a/src/llama-context.cpp --- a/src/llama-context.cpp
+++ b/src/llama-context.cpp +++ b/src/llama-context.cpp
@@ -1232,7 +1232,7 @@ int llama_context::decode(llama_batch & inp_batch) { @@ -1236,7 +1236,7 @@ int llama_context::decode(llama_batch & inp_batch) {
int64_t n_outputs_all = 0; int64_t n_outputs_all = 0;
// count outputs // count outputs
...@@ -23,7 +23,7 @@ index 4735e98e..65135172 100644 ...@@ -23,7 +23,7 @@ index 4735e98e..65135172 100644
for (uint32_t i = 0; i < n_tokens_all; ++i) { for (uint32_t i = 0; i < n_tokens_all; ++i) {
n_outputs_all += batch.logits[i] != 0; n_outputs_all += batch.logits[i] != 0;
} }
@@ -1344,7 +1344,7 @@ int llama_context::decode(llama_batch & inp_batch) { @@ -1348,7 +1348,7 @@ int llama_context::decode(llama_batch & inp_batch) {
// ggml_graph_dump_dot(gf, NULL, "llama.dot"); // ggml_graph_dump_dot(gf, NULL, "llama.dot");
//} //}
...@@ -32,7 +32,7 @@ index 4735e98e..65135172 100644 ...@@ -32,7 +32,7 @@ index 4735e98e..65135172 100644
auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr; auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr;
if (t_embd && res->get_embd_pooled()) { if (t_embd && res->get_embd_pooled()) {
@@ -1488,7 +1488,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { @@ -1492,7 +1492,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
const auto n_embd = hparams.n_embd; const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead // TODO: use a per-batch flag for logits presence instead
......
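The llama-context change above counts outputs from batch.logits regardless of the embeddings flag, so a single decode can serve both consumers. A hedged usage fragment against the public llama.h API (ctx and tokens are assumed to be set up elsewhere; this is not the patch itself):

// assumes: llama_context * ctx created with embeddings enabled; std::vector<llama_token> tokens
llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size());
if (llama_decode(ctx, batch) == 0) {
    const float * logits = llama_get_logits_ith(ctx, -1);     // next-token logits
    const float * embd   = llama_get_embeddings_ith(ctx, -1); // embedding for the same position
    (void) logits; (void) embd;
}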
...@@ -10,12 +10,12 @@ filesystems for paths that include wide characters ...@@ -10,12 +10,12 @@ filesystems for paths that include wide characters
1 file changed, 39 insertions(+) 1 file changed, 39 insertions(+)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 49c90b75..4b72ea9f 100644 index 75970615..d57b4bd6 100644
--- a/examples/llava/clip.cpp --- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp
@@ -28,6 +28,19 @@ @@ -29,6 +29,19 @@
#include <cinttypes>
#include <limits> #include <limits>
#include <array>
+#if defined(_WIN32) +#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN
...@@ -33,7 +33,7 @@ index 49c90b75..4b72ea9f 100644 ...@@ -33,7 +33,7 @@ index 49c90b75..4b72ea9f 100644
struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL}; struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
//#define CLIP_DEBUG_FUNCTIONS //#define CLIP_DEBUG_FUNCTIONS
@@ -1429,7 +1442,29 @@ struct clip_model_loader { @@ -1430,7 +1443,29 @@ struct clip_model_loader {
{ {
std::vector<uint8_t> read_buf; std::vector<uint8_t> read_buf;
...@@ -63,7 +63,7 @@ index 49c90b75..4b72ea9f 100644 ...@@ -63,7 +63,7 @@ index 49c90b75..4b72ea9f 100644
if (!fin) { if (!fin) {
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str())); throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
} }
@@ -1456,7 +1491,11 @@ struct clip_model_loader { @@ -1457,7 +1492,11 @@ struct clip_model_loader {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
} }
} }
......
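The clip.cpp change above is about opening model paths that contain non-ASCII characters on Windows. A self-contained sketch of the usual approach (the helper name is illustrative; the patch's actual code may differ):

#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <fstream>
#include <string>

// Convert a UTF-8 path to UTF-16 with MultiByteToWideChar, then open the stream
// through the wide-path overload that the MSVC standard library provides.
static std::ifstream open_utf8_path(const std::string & fname) {
    int wlen = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, nullptr, 0);
    std::wstring wfname(wlen, L'\0');
    MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, &wfname[0], wlen);
    return std::ifstream(wfname.c_str(), std::ios::binary);
}
#endif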
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com> From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 16:03:51 -0700 Date: Sun, 20 Apr 2025 16:11:09 -0700
Subject: [PATCH] solar-pro Subject: [PATCH] solar-pro
adds support for the Solar Pro architecture adds support for the Solar Pro architecture
...@@ -15,7 +15,7 @@ adds support for the Solar Pro architecture ...@@ -15,7 +15,7 @@ adds support for the Solar Pro architecture
7 files changed, 248 insertions(+) 7 files changed, 248 insertions(+)
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index a6fddc7f..0b0fedcd 100644 index 62e1480b..f754bc8f 100644
--- a/src/llama-arch.cpp --- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp +++ b/src/llama-arch.cpp
@@ -68,6 +68,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = { @@ -68,6 +68,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
...@@ -31,10 +31,10 @@ index a6fddc7f..0b0fedcd 100644 ...@@ -31,10 +31,10 @@ index a6fddc7f..0b0fedcd 100644
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" }, + { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, @@ -1482,6 +1484,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -1478,6 +1480,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
}, },
}, },
...@@ -59,7 +59,7 @@ index a6fddc7f..0b0fedcd 100644 ...@@ -59,7 +59,7 @@ index a6fddc7f..0b0fedcd 100644
{ {
LLM_ARCH_WAVTOKENIZER_DEC, LLM_ARCH_WAVTOKENIZER_DEC,
{ {
@@ -1671,6 +1691,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = { @@ -1660,6 +1680,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
// this tensor is loaded for T5, but never used // this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
...@@ -68,7 +68,7 @@ index a6fddc7f..0b0fedcd 100644 ...@@ -68,7 +68,7 @@ index a6fddc7f..0b0fedcd 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h diff --git a/src/llama-arch.h b/src/llama-arch.h
index 2c2099b3..74aa3dd0 100644 index 98ca00a1..439aaeab 100644
--- a/src/llama-arch.h --- a/src/llama-arch.h
+++ b/src/llama-arch.h +++ b/src/llama-arch.h
@@ -72,6 +72,7 @@ enum llm_arch { @@ -72,6 +72,7 @@ enum llm_arch {
...@@ -84,10 +84,10 @@ index 2c2099b3..74aa3dd0 100644 ...@@ -84,10 +84,10 @@ index 2c2099b3..74aa3dd0 100644
LLM_KV_ATTENTION_SLIDING_WINDOW, LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE, LLM_KV_ATTENTION_SCALE,
+ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, + LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
LLM_KV_ROPE_DIMENSION_COUNT, @@ -344,6 +346,7 @@ enum llm_tensor {
LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -340,6 +342,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM, LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS, LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT, LLM_TENSOR_CLS_OUT,
...@@ -115,10 +115,10 @@ index 90dfe7a7..8a667960 100644 ...@@ -115,10 +115,10 @@ index 90dfe7a7..8a667960 100644
if (il < n_layer) { if (il < n_layer) {
return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1); return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
diff --git a/src/llama-hparams.h b/src/llama-hparams.h diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 4e0b5719..c3147cbc 100644 index 80fcd65d..6e278945 100644
--- a/src/llama-hparams.h --- a/src/llama-hparams.h
+++ b/src/llama-hparams.h +++ b/src/llama-hparams.h
@@ -51,6 +51,8 @@ struct llama_hparams { @@ -55,6 +55,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr; std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr; std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
...@@ -127,7 +127,7 @@ index 4e0b5719..c3147cbc 100644 ...@@ -127,7 +127,7 @@ index 4e0b5719..c3147cbc 100644
uint32_t n_layer_dense_lead = 0; uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0; uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0; uint32_t n_lora_kv = 0;
@@ -149,6 +151,9 @@ struct llama_hparams { @@ -153,6 +155,9 @@ struct llama_hparams {
// dimension of the recurrent state embeddings // dimension of the recurrent state embeddings
uint32_t n_embd_v_s() const; uint32_t n_embd_v_s() const;
...@@ -150,10 +150,10 @@ index ea73a8a7..a012aeae 100644 ...@@ -150,10 +150,10 @@ index ea73a8a7..a012aeae 100644
llama_model_loader::llama_model_loader( llama_model_loader::llama_model_loader(
const std::string & fname, const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index b74dd72c..5fbd0055 100644 index 6b7bfecf..aba42819 100644
--- a/src/llama-model.cpp --- a/src/llama-model.cpp
+++ b/src/llama-model.cpp +++ b/src/llama-model.cpp
@@ -1372,6 +1372,21 @@ void llama_model::load_hparams(llama_model_loader & ml) { @@ -1374,6 +1374,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN; default: type = LLM_TYPE_UNKNOWN;
} }
} break; } break;
...@@ -175,7 +175,7 @@ index b74dd72c..5fbd0055 100644 ...@@ -175,7 +175,7 @@ index b74dd72c..5fbd0055 100644
case LLM_ARCH_WAVTOKENIZER_DEC: case LLM_ARCH_WAVTOKENIZER_DEC:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3701,6 +3716,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) { @@ -3717,6 +3732,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
...@@ -210,7 +210,7 @@ index b74dd72c..5fbd0055 100644 ...@@ -210,7 +210,7 @@ index b74dd72c..5fbd0055 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -12244,6 +12287,165 @@ struct llm_build_chameleon : public llm_graph_context { @@ -12296,6 +12339,165 @@ struct llm_build_chameleon : public llm_graph_context {
} }
}; };
...@@ -316,7 +316,7 @@ index b74dd72c..5fbd0055 100644 ...@@ -316,7 +316,7 @@ index b74dd72c..5fbd0055 100644
+ +
+ cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, model.layers[il].bo, + model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, kq_scale, il); + Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il); + cb(cur, "attn_out", il);
+ } + }
+ +
...@@ -376,7 +376,7 @@ index b74dd72c..5fbd0055 100644 ...@@ -376,7 +376,7 @@ index b74dd72c..5fbd0055 100644
struct llm_build_wavtokenizer_dec : public llm_graph_context { struct llm_build_wavtokenizer_dec : public llm_graph_context {
llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
ggml_tensor * cur; ggml_tensor * cur;
@@ -12993,6 +13195,10 @@ llm_graph_result_ptr llama_model::build_graph( @@ -13045,6 +13247,10 @@ llm_graph_result_ptr llama_model::build_graph(
{ {
llm = std::make_unique<llm_build_chameleon>(*this, params, gf); llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
} break; } break;
...@@ -387,7 +387,7 @@ index b74dd72c..5fbd0055 100644 ...@@ -387,7 +387,7 @@ index b74dd72c..5fbd0055 100644
case LLM_ARCH_WAVTOKENIZER_DEC: case LLM_ARCH_WAVTOKENIZER_DEC:
{ {
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf); llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
@@ -13139,6 +13345,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { @@ -13191,6 +13397,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_GRANITE: case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE: case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON: case LLM_ARCH_CHAMELEON:
...@@ -396,7 +396,7 @@ index b74dd72c..5fbd0055 100644 ...@@ -396,7 +396,7 @@ index b74dd72c..5fbd0055 100644
return LLAMA_ROPE_TYPE_NORM; return LLAMA_ROPE_TYPE_NORM;
diff --git a/src/llama-model.h b/src/llama-model.h diff --git a/src/llama-model.h b/src/llama-model.h
index 0f18dac1..e08d4ae4 100644 index fd82d106..5865d5e9 100644
--- a/src/llama-model.h --- a/src/llama-model.h
+++ b/src/llama-model.h +++ b/src/llama-model.h
@@ -62,6 +62,7 @@ enum llm_type { @@ -62,6 +62,7 @@ enum llm_type {
...@@ -407,7 +407,7 @@ index 0f18dac1..e08d4ae4 100644 ...@@ -407,7 +407,7 @@ index 0f18dac1..e08d4ae4 100644
LLM_TYPE_30B, LLM_TYPE_30B,
LLM_TYPE_32B, LLM_TYPE_32B,
LLM_TYPE_34B, LLM_TYPE_34B,
@@ -305,6 +306,8 @@ struct llama_layer { @@ -307,6 +308,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr; struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr; struct ggml_tensor * ffn_down_scale = nullptr;
......
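The solar-pro patch stores per-layer block-skip-connection flags in the hparams (n_bskcn_arr) and exposes an accessor. A hedged, self-contained sketch of how such a lookup can work; the array size and member names are assumptions drawn from the visible hunks, not the patch's exact code:

#include <array>
#include <cstdint>

struct solar_hparams_sketch {
    static constexpr uint32_t max_layers = 512; // stand-in for LLAMA_MAX_LAYERS
    uint32_t n_layer = 0;

    // n_bskcn_arr[n][il] is non-zero when skip connection n is active for layer il
    std::array<std::array<uint32_t, max_layers>, 4> n_bskcn_arr{};

    bool n_bskcn(uint32_t n, uint32_t il) const {
        return n < n_bskcn_arr.size() && il < n_layer && n_bskcn_arr[n][il] > 0;
    }
};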
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com> From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 19:27:12 -0700 Date: Sun, 20 Apr 2025 16:12:36 -0700
Subject: [PATCH] add mllama support Subject: [PATCH] add mllama support
adds support for the llama 3.2 vision architecture adds support for the llama 3.2 vision architecture
...@@ -28,7 +28,7 @@ adds support for the llama 3.2 vision architecture ...@@ -28,7 +28,7 @@ adds support for the llama 3.2 vision architecture
20 files changed, 475 insertions(+), 22 deletions(-) 20 files changed, 475 insertions(+), 22 deletions(-)
diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index 91a07e2a..13127c7b 100644 index 3d566475..654d1358 100644
--- a/examples/llava/gemma3-cli.cpp --- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp +++ b/examples/llava/gemma3-cli.cpp
@@ -106,7 +106,7 @@ struct decode_embd_batch { @@ -106,7 +106,7 @@ struct decode_embd_batch {
...@@ -79,10 +79,10 @@ index 03a22cbb..5eb40bcd 100644 ...@@ -79,10 +79,10 @@ index 03a22cbb..5eb40bcd 100644
LOG_ERR("%s : failed to eval\n", __func__); LOG_ERR("%s : failed to eval\n", __func__);
return false; return false;
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 114c274b..a0e649ad 100644 index 3fd5bebc..f0cec596 100644
--- a/examples/llava/mtmd.cpp --- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp +++ b/examples/llava/mtmd.cpp
@@ -213,7 +213,7 @@ struct decode_embd_batch { @@ -233,7 +233,7 @@ struct decode_embd_batch {
std::vector<llama_seq_id *> seq_ids; std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits; std::vector<int8_t> logits;
llama_batch batch; llama_batch batch;
...@@ -91,7 +91,7 @@ index 114c274b..a0e649ad 100644 ...@@ -91,7 +91,7 @@ index 114c274b..a0e649ad 100644
pos .resize(n_tokens); pos .resize(n_tokens);
n_seq_id.resize(n_tokens); n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1); seq_ids .resize(n_tokens + 1);
@@ -225,6 +225,7 @@ struct decode_embd_batch { @@ -245,6 +245,7 @@ struct decode_embd_batch {
/*n_tokens =*/ n_tokens, /*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr, /*tokens =*/ nullptr,
/*embd =*/ embd, /*embd =*/ embd,
...@@ -99,9 +99,9 @@ index 114c274b..a0e649ad 100644 ...@@ -99,9 +99,9 @@ index 114c274b..a0e649ad 100644
/*pos =*/ pos.data(), /*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(), /*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(), /*seq_id =*/ seq_ids.data(),
@@ -291,7 +292,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx, @@ -311,7 +312,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
int32_t n_tokens = chunk.tokens_image->n_tokens(); int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
float * embd = mtmd_get_output_embd(ctx); float * embd = mtmd_get_output_embd(ctx);
- decode_embd_batch batch_img(embd, n_tokens, n_past, 0); - decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
+ int n_embd = llama_model_n_embd(llama_get_model(lctx)); + int n_embd = llama_model_n_embd(llama_get_model(lctx));
...@@ -158,7 +158,7 @@ index 5657fbf0..f91896e4 100644 ...@@ -158,7 +158,7 @@ index 5657fbf0..f91896e4 100644
LLAMA_API void llama_free(struct llama_context * ctx); LLAMA_API void llama_free(struct llama_context * ctx);
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 0b0fedcd..c1f78618 100644 index f754bc8f..0568565f 100644
--- a/src/llama-arch.cpp --- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp +++ b/src/llama-arch.cpp
@@ -6,6 +6,7 @@ @@ -6,6 +6,7 @@
...@@ -174,10 +174,10 @@ index 0b0fedcd..c1f78618 100644 ...@@ -174,10 +174,10 @@ index 0b0fedcd..c1f78618 100644
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" }, { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" }, + { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, @@ -271,6 +273,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -269,6 +271,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
}, },
}, },
...@@ -218,7 +218,7 @@ index 0b0fedcd..c1f78618 100644 ...@@ -218,7 +218,7 @@ index 0b0fedcd..c1f78618 100644
{ {
LLM_ARCH_DECI, LLM_ARCH_DECI,
{ {
@@ -1692,6 +1728,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = { @@ -1681,6 +1717,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
// this tensor is loaded for T5, but never used // this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
{LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
...@@ -234,7 +234,7 @@ index 0b0fedcd..c1f78618 100644 ...@@ -234,7 +234,7 @@ index 0b0fedcd..c1f78618 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h diff --git a/src/llama-arch.h b/src/llama-arch.h
index 74aa3dd0..f987844d 100644 index 439aaeab..6a989034 100644
--- a/src/llama-arch.h --- a/src/llama-arch.h
+++ b/src/llama-arch.h +++ b/src/llama-arch.h
@@ -11,6 +11,7 @@ @@ -11,6 +11,7 @@
...@@ -250,10 +250,10 @@ index 74aa3dd0..f987844d 100644 ...@@ -250,10 +250,10 @@ index 74aa3dd0..f987844d 100644
LLM_KV_ATTENTION_SCALE, LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
+ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, + LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
LLM_KV_ROPE_DIMENSION_COUNT, @@ -347,6 +349,14 @@ enum llm_tensor {
LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -343,6 +345,14 @@ enum llm_tensor {
LLM_TENSOR_CLS, LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT, LLM_TENSOR_CLS_OUT,
LLM_TENSOR_BSKCN_TV, LLM_TENSOR_BSKCN_TV,
...@@ -297,10 +297,10 @@ index 01d5ca57..8682b0e6 100644 ...@@ -297,10 +297,10 @@ index 01d5ca57..8682b0e6 100644
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc); batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
} }
diff --git a/src/llama-context.cpp b/src/llama-context.cpp diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 65135172..afe6f552 100644 index 32f59819..0343ba8a 100644
--- a/src/llama-context.cpp --- a/src/llama-context.cpp
+++ b/src/llama-context.cpp +++ b/src/llama-context.cpp
@@ -858,7 +858,7 @@ float * llama_context::get_logits_ith(int32_t i) { @@ -862,7 +862,7 @@ float * llama_context::get_logits_ith(int32_t i) {
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
} }
...@@ -309,7 +309,7 @@ index 65135172..afe6f552 100644 ...@@ -309,7 +309,7 @@ index 65135172..afe6f552 100644
} catch (const std::exception & err) { } catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG #ifndef NDEBUG
@@ -979,6 +979,10 @@ void llama_context::set_warmup(bool value) { @@ -983,6 +983,10 @@ void llama_context::set_warmup(bool value) {
cparams.warmup = value; cparams.warmup = value;
} }
...@@ -320,7 +320,7 @@ index 65135172..afe6f552 100644 ...@@ -320,7 +320,7 @@ index 65135172..afe6f552 100644
void llama_context::set_adapter_lora( void llama_context::set_adapter_lora(
llama_adapter_lora * adapter, llama_adapter_lora * adapter,
float scale) { float scale) {
@@ -1054,7 +1058,7 @@ int llama_context::encode(llama_batch & inp_batch) { @@ -1058,7 +1062,7 @@ int llama_context::encode(llama_batch & inp_batch) {
const int64_t n_embd = hparams.n_embd; const int64_t n_embd = hparams.n_embd;
...@@ -329,7 +329,7 @@ index 65135172..afe6f552 100644 ...@@ -329,7 +329,7 @@ index 65135172..afe6f552 100644
const llama_ubatch ubatch = sbatch.split_simple(n_tokens); const llama_ubatch ubatch = sbatch.split_simple(n_tokens);
@@ -1194,10 +1198,9 @@ int llama_context::decode(llama_batch & inp_batch) { @@ -1198,10 +1202,9 @@ int llama_context::decode(llama_batch & inp_batch) {
const llama_batch & batch = batch_allocr.batch; const llama_batch & batch = batch_allocr.batch;
...@@ -341,7 +341,7 @@ index 65135172..afe6f552 100644 ...@@ -341,7 +341,7 @@ index 65135172..afe6f552 100644
const int64_t n_tokens_all = batch.n_tokens; const int64_t n_tokens_all = batch.n_tokens;
const int64_t n_embd = hparams.n_embd; const int64_t n_embd = hparams.n_embd;
@@ -1245,7 +1248,7 @@ int llama_context::decode(llama_batch & inp_batch) { @@ -1249,7 +1252,7 @@ int llama_context::decode(llama_batch & inp_batch) {
const bool logits_all = n_outputs_all == n_tokens_all; const bool logits_all = n_outputs_all == n_tokens_all;
...@@ -350,7 +350,7 @@ index 65135172..afe6f552 100644 ...@@ -350,7 +350,7 @@ index 65135172..afe6f552 100644
/* simple_split */ !kv_self->recurrent, /* simple_split */ !kv_self->recurrent,
/* logits_all */ logits_all); /* logits_all */ logits_all);
@@ -1479,12 +1482,11 @@ int llama_context::decode(llama_batch & inp_batch) { @@ -1483,12 +1486,11 @@ int llama_context::decode(llama_batch & inp_batch) {
int32_t llama_context::output_reserve(int32_t n_outputs) { int32_t llama_context::output_reserve(int32_t n_outputs) {
const auto & hparams = model.hparams; const auto & hparams = model.hparams;
...@@ -364,7 +364,7 @@ index 65135172..afe6f552 100644 ...@@ -364,7 +364,7 @@ index 65135172..afe6f552 100644
const auto n_embd = hparams.n_embd; const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead // TODO: use a per-batch flag for logits presence instead
@@ -1554,7 +1556,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { @@ -1558,7 +1560,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
void llama_context::output_reorder() { void llama_context::output_reorder() {
auto & out_ids = sbatch.out_ids; auto & out_ids = sbatch.out_ids;
if (!out_ids.empty()) { if (!out_ids.empty()) {
...@@ -373,7 +373,7 @@ index 65135172..afe6f552 100644 ...@@ -373,7 +373,7 @@ index 65135172..afe6f552 100644
const uint32_t n_embd = model.hparams.n_embd; const uint32_t n_embd = model.hparams.n_embd;
GGML_ASSERT((size_t) n_outputs == out_ids.size()); GGML_ASSERT((size_t) n_outputs == out_ids.size());
@@ -2061,7 +2063,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) { @@ -2065,7 +2067,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
{ {
LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__); LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
...@@ -382,7 +382,7 @@ index 65135172..afe6f552 100644 ...@@ -382,7 +382,7 @@ index 65135172..afe6f552 100644
io.write(&logits_size, sizeof(logits_size)); io.write(&logits_size, sizeof(logits_size));
@@ -2244,6 +2246,7 @@ llama_context_params llama_context_default_params() { @@ -2248,6 +2250,7 @@ llama_context_params llama_context_default_params() {
/*.offload_kqv =*/ true, /*.offload_kqv =*/ true,
/*.flash_attn =*/ false, /*.flash_attn =*/ false,
/*.no_perf =*/ true, /*.no_perf =*/ true,
...@@ -390,7 +390,7 @@ index 65135172..afe6f552 100644 ...@@ -390,7 +390,7 @@ index 65135172..afe6f552 100644
/*.abort_callback =*/ nullptr, /*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr, /*.abort_callback_data =*/ nullptr,
}; };
@@ -2371,6 +2374,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) { @@ -2375,6 +2378,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
ctx->set_warmup(warmup); ctx->set_warmup(warmup);
} }
...@@ -426,7 +426,7 @@ index 30e550f0..85ad91b9 100644 ...@@ -426,7 +426,7 @@ index 30e550f0..85ad91b9 100644
enum llama_pooling_type pooling_type; enum llama_pooling_type pooling_type;
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index cd955d63..83f3c5a8 100644 index a85e9728..d740c120 100644
--- a/src/llama-graph.cpp --- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp +++ b/src/llama-graph.cpp
@@ -546,6 +546,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { @@ -546,6 +546,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
...@@ -442,7 +442,7 @@ index cd955d63..83f3c5a8 100644 ...@@ -442,7 +442,7 @@ index cd955d63..83f3c5a8 100644
// //
// llm_graph_context // llm_graph_context
// //
@@ -1495,6 +1501,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const { @@ -1506,6 +1512,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
return (llm_graph_input_attn_cross *) res->add_input(std::move(inp)); return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
} }
...@@ -469,7 +469,7 @@ index cd955d63..83f3c5a8 100644 ...@@ -469,7 +469,7 @@ index cd955d63..83f3c5a8 100644
llm_graph_input_attn_cross * inp, llm_graph_input_attn_cross * inp,
ggml_cgraph * gf, ggml_cgraph * gf,
diff --git a/src/llama-graph.h b/src/llama-graph.h diff --git a/src/llama-graph.h b/src/llama-graph.h
index 5b6618f9..51993998 100644 index d192dc14..260a2af2 100644
--- a/src/llama-graph.h --- a/src/llama-graph.h
+++ b/src/llama-graph.h +++ b/src/llama-graph.h
@@ -86,6 +86,7 @@ public: @@ -86,6 +86,7 @@ public:
...@@ -518,7 +518,7 @@ index 8a667960..6a02de03 100644 ...@@ -518,7 +518,7 @@ index 8a667960..6a02de03 100644
+ return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end(); + return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
+} +}
diff --git a/src/llama-hparams.h b/src/llama-hparams.h diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index c3147cbc..4567a0e9 100644 index 6e278945..c8a34d52 100644
--- a/src/llama-hparams.h --- a/src/llama-hparams.h
+++ b/src/llama-hparams.h +++ b/src/llama-hparams.h
@@ -2,6 +2,8 @@ @@ -2,6 +2,8 @@
...@@ -536,9 +536,9 @@ index c3147cbc..4567a0e9 100644 ...@@ -536,9 +536,9 @@ index c3147cbc..4567a0e9 100644
uint32_t n_rel_attn_bkts = 0; uint32_t n_rel_attn_bkts = 0;
+ uint32_t n_vocab = 0; + uint32_t n_vocab = 0;
// for WavTokenizer // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
struct llama_hparams_posnet posnet; uint32_t n_embd_head_k_mla = 0;
@@ -52,6 +55,7 @@ struct llama_hparams { @@ -56,6 +59,7 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr; std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr = {}; std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr = {};
...@@ -546,7 +546,7 @@ index c3147cbc..4567a0e9 100644 ...@@ -546,7 +546,7 @@ index c3147cbc..4567a0e9 100644
uint32_t n_layer_dense_lead = 0; uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0; uint32_t n_lora_q = 0;
@@ -154,6 +158,9 @@ struct llama_hparams { @@ -158,6 +162,9 @@ struct llama_hparams {
// Block skip connection // Block skip connection
bool n_bskcn(uint32_t n, uint32_t il) const; bool n_bskcn(uint32_t n, uint32_t il) const;
...@@ -557,7 +557,7 @@ index c3147cbc..4567a0e9 100644 ...@@ -557,7 +557,7 @@ index c3147cbc..4567a0e9 100644
}; };
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index dbf5f118..9310f262 100644 index 7c9d46d8..69f8d35a 100644
--- a/src/llama-kv-cache.cpp --- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp
@@ -95,8 +95,16 @@ bool llama_kv_cache_unified::init( @@ -95,8 +95,16 @@ bool llama_kv_cache_unified::init(
...@@ -593,7 +593,7 @@ index a012aeae..2e11507d 100644 ...@@ -593,7 +593,7 @@ index a012aeae..2e11507d 100644
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) { bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
const int kid = gguf_find_key(meta.get(), key.c_str()); const int kid = gguf_find_key(meta.get(), key.c_str());
diff --git a/src/llama-model.cpp b/src/llama-model.cpp diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 5fbd0055..d5ad466e 100644 index aba42819..d051696c 100644
--- a/src/llama-model.cpp --- a/src/llama-model.cpp
+++ b/src/llama-model.cpp +++ b/src/llama-model.cpp
@@ -419,6 +419,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { @@ -419,6 +419,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
...@@ -650,7 +650,7 @@ index 5fbd0055..d5ad466e 100644 ...@@ -650,7 +650,7 @@ index 5fbd0055..d5ad466e 100644
case LLM_ARCH_DECI: case LLM_ARCH_DECI:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1548,7 +1562,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { @@ -1550,7 +1564,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_embd_head_v = hparams.n_embd_head_v; const int64_t n_embd_head_v = hparams.n_embd_head_v;
const int64_t n_ff = hparams.n_ff(); const int64_t n_ff = hparams.n_ff();
const int64_t n_embd_gqa = n_embd_v_gqa; const int64_t n_embd_gqa = n_embd_v_gqa;
...@@ -659,7 +659,7 @@ index 5fbd0055..d5ad466e 100644 ...@@ -659,7 +659,7 @@ index 5fbd0055..d5ad466e 100644
const int64_t n_token_types = vocab.n_token_types(); const int64_t n_token_types = vocab.n_token_types();
const int64_t n_rot = hparams.n_rot; const int64_t n_rot = hparams.n_rot;
const int64_t n_expert = hparams.n_expert; const int64_t n_expert = hparams.n_expert;
@@ -1801,6 +1815,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) { @@ -1803,6 +1817,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
} }
} }
} break; } break;
...@@ -712,7 +712,7 @@ index 5fbd0055..d5ad466e 100644 ...@@ -712,7 +712,7 @@ index 5fbd0055..d5ad466e 100644
case LLM_ARCH_DECI: case LLM_ARCH_DECI:
{ {
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4665,6 +4725,246 @@ struct llm_build_llama : public llm_graph_context { @@ -4683,6 +4743,246 @@ struct llm_build_llama : public llm_graph_context {
} }
}; };
...@@ -900,7 +900,7 @@ index 5fbd0055..d5ad466e 100644 ...@@ -900,7 +900,7 @@ index 5fbd0055..d5ad466e 100644
+ +
+ cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, model.layers[il].bo, + model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ +
+ if (il == n_layer - 1) { + if (il == n_layer - 1) {
+ // skip computing output for unused tokens + // skip computing output for unused tokens
...@@ -959,7 +959,7 @@ index 5fbd0055..d5ad466e 100644 ...@@ -959,7 +959,7 @@ index 5fbd0055..d5ad466e 100644
struct llm_build_deci : public llm_graph_context { struct llm_build_deci : public llm_graph_context {
llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -12965,6 +13265,10 @@ llm_graph_result_ptr llama_model::build_graph( @@ -13017,6 +13317,10 @@ llm_graph_result_ptr llama_model::build_graph(
{ {
llm = std::make_unique<llm_build_llama>(*this, params, gf); llm = std::make_unique<llm_build_llama>(*this, params, gf);
} break; } break;
...@@ -970,7 +970,7 @@ index 5fbd0055..d5ad466e 100644 ...@@ -970,7 +970,7 @@ index 5fbd0055..d5ad466e 100644
case LLM_ARCH_DECI: case LLM_ARCH_DECI:
{ {
llm = std::make_unique<llm_build_deci>(*this, params, gf); llm = std::make_unique<llm_build_deci>(*this, params, gf);
@@ -13325,6 +13629,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { @@ -13377,6 +13681,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values // use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA: case LLM_ARCH_LLAMA:
case LLM_ARCH_LLAMA4: case LLM_ARCH_LLAMA4:
...@@ -979,7 +979,7 @@ index 5fbd0055..d5ad466e 100644 ...@@ -979,7 +979,7 @@ index 5fbd0055..d5ad466e 100644
case LLM_ARCH_BAICHUAN: case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER: case LLM_ARCH_STARCODER:
diff --git a/src/llama-model.h b/src/llama-model.h diff --git a/src/llama-model.h b/src/llama-model.h
index e08d4ae4..21c4617b 100644 index 5865d5e9..72bab5be 100644
--- a/src/llama-model.h --- a/src/llama-model.h
+++ b/src/llama-model.h +++ b/src/llama-model.h
@@ -11,6 +11,7 @@ @@ -11,6 +11,7 @@
...@@ -998,7 +998,7 @@ index e08d4ae4..21c4617b 100644 ...@@ -998,7 +998,7 @@ index e08d4ae4..21c4617b 100644
LLM_TYPE_236B, LLM_TYPE_236B,
LLM_TYPE_314B, LLM_TYPE_314B,
LLM_TYPE_671B, LLM_TYPE_671B,
@@ -308,6 +310,16 @@ struct llama_layer { @@ -310,6 +312,16 @@ struct llama_layer {
struct ggml_tensor * bskcn_tv = nullptr; struct ggml_tensor * bskcn_tv = nullptr;
......
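The mllama changes above thread the model's embedding width into the image batch helper via llama_model_n_embd. A hedged sketch of submitting projected image embeddings through llama_batch.embd, as a stand-in for the patch's decode_embd_batch rather than its exact code:

#include <cstring>
#include "llama.h"

// Build an embedding-only batch of n_tokens positions and decode it.
// embd is assumed to hold n_tokens * n_embd floats produced by the vision projector.
static int decode_image_embd(llama_context * ctx, const float * embd, int32_t n_tokens, llama_pos n_past) {
    const int32_t n_embd = llama_model_n_embd(llama_get_model(ctx));

    llama_batch batch = llama_batch_init(n_tokens, n_embd, 1); // allocates n_tokens * n_embd floats in batch.embd
    batch.n_tokens = n_tokens;
    std::memcpy(batch.embd, embd, (size_t) n_tokens * n_embd * sizeof(float));
    for (int32_t i = 0; i < n_tokens; ++i) {
        batch.pos[i]       = n_past + i;
        batch.n_seq_id[i]  = 1;
        batch.seq_id[i][0] = 0;
        batch.logits[i]    = i == n_tokens - 1; // request logits only for the last position
    }
    const int ret = llama_decode(ctx, batch);
    llama_batch_free(batch);
    return ret;
}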