Commit b2b270ad authored by Devon Rifkin

Merge branch 'main' into drifkin/array-head-count-simple

parents 20c5fd39 2bb69b40
@@ -6,7 +6,7 @@ package llama
 #cgo CXXFLAGS: -std=c++17
 #cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/include
 #cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/common
-#cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/examples/llava
+#cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/tools/mtmd
 #cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/src
 #cgo CPPFLAGS: -I${SRCDIR}/../ml/backend/ggml/ggml/include
@@ -17,7 +17,6 @@ package llama
 #include "llava.h"
 #include "gguf.h"
-#include "mllama.h"
 #include "sampling_ext.h"
 extern bool llamaProgressCallback(float progress, void *user_data);
@@ -40,8 +39,8 @@ import (
 	"unsafe"

 	_ "github.com/ollama/ollama/llama/llama.cpp/common"
-	_ "github.com/ollama/ollama/llama/llama.cpp/examples/llava"
 	_ "github.com/ollama/ollama/llama/llama.cpp/src"
+	_ "github.com/ollama/ollama/llama/llama.cpp/tools/mtmd"

 	ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
 )
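The include-path and import changes track upstream llama.cpp moving its multimodal code from examples/llava to tools/mtmd. The blank imports are the usual cgo vendoring trick: each imported directory holds a stub Go package whose only job is to get the C/C++ sources next to it compiled and linked into the build. A minimal sketch of the idiom (package name and flags hypothetical):

// Stub package: exists only so `go build` compiles the C++ sources
// that live in this directory alongside it.
package mtmd

// #cgo CXXFLAGS: -std=c++17
import "C"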
@@ -510,63 +509,6 @@ func (c *ClipContext) NewEmbed(llamaContext *Context, data []byte) ([][]float32,
 	return embed, nil
 }
-type MllamaContext struct {
-	c *C.struct_mllama_ctx
-}
-
-func NewMllamaContext(llamaContext *Context, modelPath string) (*MllamaContext, error) {
-	mp := C.CString(modelPath)
-	defer C.free(unsafe.Pointer(mp))
-	c := C.mllama_model_load(mp, 1)
-	if c == nil {
-		return nil, fmt.Errorf("unable to load mllama model: %v", modelPath)
-	}
-
-	projEmbedSize := int(C.mllama_n_embd(c))
-	modelEmbedSize := llamaContext.Model().NEmbd()
-	if projEmbedSize != modelEmbedSize {
-		return nil, fmt.Errorf("projector embedding size (%d) does not match model (%d)", projEmbedSize, modelEmbedSize)
-	}
-
-	return &MllamaContext{c: c}, nil
-}
-
-func (m *MllamaContext) Free() {
-	C.mllama_free(m.c)
-}
-
-func (m *MllamaContext) NewEmbed(llamaContext *Context, data []byte, aspectRatioId int) ([][]float32, error) {
-	img := C.mllama_image_init()
-	defer C.mllama_image_free(img)
-
-	ok := bool(C.mllama_image_load_from_data(unsafe.Pointer(&data[0]), C.int(len(data)), 560, 560, 3, 4, C.int(aspectRatioId), img))
-	if !ok {
-		return nil, errors.New("unable to load mllama image data")
-	}
-
-	rows := make([]float32, m.EmbedSize(llamaContext))
-	ok = bool(C.mllama_image_encode(m.c, C.int(llamaContext.numThreads), img, (*C.float)(unsafe.Pointer(&rows[0]))))
-	if !ok {
-		return nil, errors.New("unable to make mllama embedding from image")
-	}
-
-	embed := make([][]float32, 1)
-	embed[0] = rows
-
-	return embed, nil
-}
-
-func (m *MllamaContext) EmbedSize(llamaContext *Context) int {
-	numTokens := int(C.mllama_n_positions(m.c) * C.mllama_n_tiles(m.c))
-	numEmbed := llamaContext.Model().NEmbd()
-
-	return numTokens * numEmbed
-}
-
-func (c *Context) SetCrossAttention(state bool) {
-	C.llama_set_cross_attention(c.c, C.bool(state))
-}
 func (c *Context) Synchronize() {
 	C.llama_synchronize(c.c)
 }
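The removed EmbedSize above sized one image's cross-attention embedding as numPositions × numTiles × modelEmbd floats. A worked example with assumed Llama 3.2 Vision defaults (560px tiles, 14px patches, up to 4 tiles, 4096-wide text embeddings), which is why a single image embedding cost on the order of 100 MB:

package main

import "fmt"

func main() {
	patches := (560 / 14) * (560 / 14) // 1600 patches per tile (assumed geometry)
	positions := patches + 1           // +1 class token -> 1601
	tiles := 4                         // max_num_tiles (assumed)
	nEmbd := 4096                      // text-model embedding width (assumed)

	floats := positions * tiles * nEmbd
	fmt.Println(floats)           // 26230784 float32s
	fmt.Println(floats * 4 >> 20) // ~100 MiB for one image embedding
}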
@@ -602,7 +544,7 @@ func NewSamplingContext(model *Model, params SamplingParams) (*SamplingContext,
 	cparams.penalty_last_n = C.int32_t(params.RepeatLastN)
 	cparams.penalty_repeat = C.float(params.PenaltyRepeat)
 	cparams.penalty_freq = C.float(params.PenaltyFreq)
-	cparams.penalty_present = C.float(params.PenaltyFreq)
+	cparams.penalty_present = C.float(params.PenaltyPresent)
 	cparams.seed = C.uint32_t(params.Seed)

 	grammar := C.CString(params.Grammar)
@@ -637,8 +579,8 @@ func SchemaToGrammar(schema []byte) []byte {
 	cStr := C.CString(string(schema))
 	defer C.free(unsafe.Pointer(cStr))

-	// Allocate buffer for grammar output with reasonable size
-	const maxLen = 32768 // 32KB
+	// Allocate buffer for grammar based on schema length but with upper bound
+	maxLen := max(32768, min(1024*1024, len(schema)*4))
 	buf := make([]byte, maxLen)

 	// Call C function to convert schema to grammar
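The new sizing keeps a 32 KiB floor, scales at 4× the schema length, and caps at 1 MiB. As a standalone sketch (Go 1.21+ min/max builtins):

func grammarBufLen(schemaLen int) int {
	return max(32768, min(1024*1024, schemaLen*4))
}

// grammarBufLen(500)     == 32768   // small schemas hit the floor
// grammarBufLen(100000)  == 400000  // mid-size schemas scale 4x
// grammarBufLen(5000000) == 1048576 // huge schemas hit the 1 MiB cap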
@@ -660,7 +602,7 @@ type Grammar struct {
 	mu sync.Mutex
 }

-func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogTokens []uint32) *Grammar {
+func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogTokens []int32) *Grammar {
 	cGrammar := C.CString(grammar)
 	defer C.free(unsafe.Pointer(cGrammar))
@@ -680,7 +622,7 @@ func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogToke
 		cEogTokens[i] = C.uint32_t(token)
 	}

-	g := C.grammar_init(cGrammar, (*C.uint32_t)(unsafe.Pointer(&cTokens[0])), C.size_t(len(cTokens)), (**C.char)(unsafe.Pointer(&cPieces[0])), (*C.uint32_t)(unsafe.Pointer(&cEogTokens[0])), C.size_t(len(cEogTokens)))
+	g := C.grammar_init(cGrammar, unsafe.SliceData(cTokens), C.size_t(len(cTokens)), unsafe.SliceData(cPieces), unsafe.SliceData(cEogTokens), C.size_t(len(cEogTokens)))
 	if g == nil {
 		return nil
 	}
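unsafe.SliceData (Go 1.20+) is the safer spelling of the old &s[0] pattern: it never panics, returning nil for a nil slice where indexing would panic on any zero-length slice. A sketch:

import "unsafe"

// unsafe.SliceData(s) never panics: it yields nil for a nil slice,
// where &s[0] would panic whenever len(s) == 0.
func dataPtr(s []uint32) *uint32 {
	return unsafe.SliceData(s)
}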
...
// NOTE: This is modified from clip.cpp for Mllama only
#include "mllama.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-cpu.h"
#include "ggml.h"
#include "gguf.h"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
#ifdef GGML_USE_CANN
#include "ggml-cann.h"
#endif
#ifdef GGML_USE_VULKAN
#include "ggml-vulkan.h"
#endif
#include <algorithm>
#include <cmath>
#include <cstdarg>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <stdexcept>
#include <vector>
#define REQUIRE(x) \
do { \
if (!(x)) { \
throw std::runtime_error("REQUIRE failed: " #x); \
} \
} while (0)
#define LOG(fmt, ...) fprintf(stderr, "%s: " fmt "\n", __func__, ##__VA_ARGS__)
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#if __GLIBCXX__
#include <cstdio>
#include <ext/stdio_filebuf.h>
#include <fcntl.h>
#endif
#endif
struct mllama_image {
int width;
int height;
int num_channels = 3;
int num_tiles = 4;
int aspect_ratio_id;
std::vector<float> data;
};
static std::string format(const char *fmt, ...) {
va_list args;
va_start(args, fmt);
std::vector<char> b(128);
int n = vsnprintf(b.data(), b.size(), fmt, args);
REQUIRE(n >= 0 && static_cast<size_t>(n) < b.size());
va_end(args);
return std::string(b.data(), n); // only the formatted prefix, not the whole 128-byte buffer
}
//
// utilities to get data from a gguf file
//
static int get_key_index(const gguf_context *ctx, const char *key) {
int key_index = gguf_find_key(ctx, key);
REQUIRE(key_index != -1);
return key_index;
}
static std::vector<uint32_t> get_u32_array(const gguf_context *ctx, const std::string &key) {
const int i = get_key_index(ctx, key.c_str());
const int n = gguf_get_arr_n(ctx, i);
const uint32_t *data = (uint32_t *)gguf_get_arr_data(ctx, i);
std::vector<uint32_t> s(n);
for (size_t j = 0; j < s.size(); j++) {
s[j] = data[j];
}
return s;
}
static uint32_t get_u32(const gguf_context *ctx, const std::string &key) {
return gguf_get_val_u32(ctx, get_key_index(ctx, key.c_str()));
}
static float get_f32(const gguf_context *ctx, const std::string &key) {
return gguf_get_val_f32(ctx, get_key_index(ctx, key.c_str()));
}
static std::string get_ftype(int ftype) {
return ggml_type_name(static_cast<ggml_type>(ftype));
}
//
// mllama layers
//
struct mllama_hparams {
uint32_t image_size;
uint32_t patch_size;
uint32_t hidden_size;
uint32_t n_intermediate;
uint32_t projection_dim;
uint32_t n_head;
uint32_t n_layer;
uint32_t n_global_layer;
uint32_t n_tiles;
float eps;
std::vector<bool> intermediate_layers;
};
struct mllama_layer {
// attention
struct ggml_tensor *k_w;
struct ggml_tensor *k_b;
struct ggml_tensor *q_w;
struct ggml_tensor *q_b;
struct ggml_tensor *v_w;
struct ggml_tensor *v_b;
struct ggml_tensor *o_w;
struct ggml_tensor *o_b;
struct ggml_tensor *attn_gate;
// layernorm 1
struct ggml_tensor *ln_1_w;
struct ggml_tensor *ln_1_b;
// ff
struct ggml_tensor *ff_i_w;
struct ggml_tensor *ff_i_b;
struct ggml_tensor *ff_o_w;
struct ggml_tensor *ff_o_b;
struct ggml_tensor *ff_gate;
// layernorm 2
struct ggml_tensor *ln_2_w;
struct ggml_tensor *ln_2_b;
};
struct mllama_vision_model {
struct mllama_hparams hparams;
// embeddings
struct ggml_tensor *class_embedding;
struct ggml_tensor *patch_embeddings;
struct ggml_tensor *position_embeddings;
struct ggml_tensor *position_embeddings_gate;
struct ggml_tensor *tile_position_embeddings;
struct ggml_tensor *tile_position_embeddings_gate;
struct ggml_tensor *pre_tile_position_embeddings;
struct ggml_tensor *pre_tile_position_embeddings_gate;
struct ggml_tensor *post_tile_position_embeddings;
struct ggml_tensor *post_tile_position_embeddings_gate;
struct ggml_tensor *pre_ln_w;
struct ggml_tensor *pre_ln_b;
std::vector<mllama_layer> layers;
std::vector<mllama_layer> global_layers;
struct ggml_tensor *post_ln_w;
struct ggml_tensor *post_ln_b;
struct ggml_tensor *mm_0_w;
struct ggml_tensor *mm_0_b;
};
struct mllama_ctx {
struct mllama_vision_model vision_model;
uint32_t ftype = 1;
struct gguf_context *ctx_gguf;
struct ggml_context *ctx_data;
std::vector<uint8_t> buf_compute_meta;
// memory buffers to evaluate the model
ggml_backend_buffer_t params_buffer = nullptr;
ggml_backend_t backend = nullptr;
ggml_gallocr_t compute_alloc = nullptr;
};
static ggml_tensor *mllama_image_build_encoder_layer(
struct ggml_context *ctx0, const size_t il, const struct mllama_layer &layer, struct ggml_tensor *embeddings,
const float eps, const int hidden_size, const int batch_size, const int n_head, const int d_head) {
struct ggml_tensor *cur = embeddings;
{
// layernorm1
cur = ggml_norm(ctx0, cur, eps);
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_1_w), layer.ln_1_b);
ggml_set_name(cur, format("%d pre layernorm", il).c_str());
}
{
// self-attention
struct ggml_tensor *Q = ggml_mul_mat(ctx0, layer.q_w, cur);
if (layer.q_b != nullptr) {
Q = ggml_add(ctx0, Q, layer.q_b);
}
Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, Q->ne[1], batch_size);
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
ggml_set_name(Q, format("%d query", il).c_str());
struct ggml_tensor *K = ggml_mul_mat(ctx0, layer.k_w, cur);
if (layer.k_b != nullptr) {
K = ggml_add(ctx0, K, layer.k_b);
}
K = ggml_reshape_4d(ctx0, K, d_head, n_head, K->ne[1], batch_size);
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
ggml_set_name(K, format("%d key", il).c_str());
struct ggml_tensor *V = ggml_mul_mat(ctx0, layer.v_w, cur);
if (layer.v_b != nullptr) {
V = ggml_add(ctx0, V, layer.v_b);
}
V = ggml_reshape_4d(ctx0, V, d_head, n_head, V->ne[1], batch_size);
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
ggml_set_name(V, format("%d value", il).c_str());
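// scaled dot-product attention: per head, softmax(QK^T / sqrt(d_head)) is applied to V;
// ggml_mul_mat contracts over the leading dimension, hence the permutes above and below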
struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K, Q);
KQ = ggml_scale_inplace(ctx0, KQ, 1.0f / sqrtf((float)d_head));
KQ = ggml_soft_max_inplace(ctx0, KQ);
ggml_set_name(KQ, format("%d KQ", il).c_str());
struct ggml_tensor *KQV = ggml_mul_mat(ctx0, V, KQ);
KQV = ggml_reshape_4d(ctx0, KQV, d_head, KQV->ne[1], n_head, batch_size);
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
KQV = ggml_cont_3d(ctx0, KQV, hidden_size, KQV->ne[2], batch_size);
ggml_set_name(KQV, format("%d KQV", il).c_str());
cur = ggml_mul_mat(ctx0, layer.o_w, KQV);
if (layer.o_b != nullptr) {
cur = ggml_add(ctx0, cur, layer.o_b);
}
ggml_set_name(cur, format("%d self attention", il).c_str());
if (layer.attn_gate != nullptr) {
cur = ggml_mul_inplace(ctx0, cur, layer.attn_gate);
ggml_set_name(cur, format("%d self attention gate", il).c_str());
}
}
cur = ggml_add(ctx0, cur, embeddings);
ggml_set_name(cur, format("%d residual", il).c_str());
embeddings = cur;
{
// layernorm2
cur = ggml_norm(ctx0, cur, eps);
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_2_w), layer.ln_2_b);
ggml_set_name(cur, format("%d post layernorm", il).c_str());
}
{
// feed forward
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_i_w, cur), layer.ff_i_b);
cur = ggml_gelu_inplace(ctx0, cur);
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_o_w, cur), layer.ff_o_b);
ggml_set_name(cur, format("%d feed forward", il).c_str());
if (layer.ff_gate != nullptr) {
cur = ggml_mul_inplace(ctx0, cur, layer.ff_gate);
ggml_set_name(cur, format("%d feed forward gate", il).c_str());
}
}
// residual 2
cur = ggml_add(ctx0, cur, embeddings);
ggml_set_name(cur, format("%d residual", il).c_str());
embeddings = cur;
return embeddings;
}
static ggml_cgraph *mllama_image_build_graph(mllama_ctx *ctx, const mllama_image_batch *imgs) {
const auto &model = ctx->vision_model;
const auto &hparams = model.hparams;
const int image_size = hparams.image_size;
const int image_size_width = image_size;
const int image_size_height = image_size;
const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int num_positions = num_patches + (model.class_embedding == nullptr ? 0 : 1);
const int hidden_size = hparams.hidden_size;
const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head;
const int batch_size = imgs->size;
REQUIRE(batch_size == 1);
int num_tiles = 4;
int num_channels = 3;
if (imgs->data != nullptr) {
num_tiles = imgs->data[0].num_tiles > 0 ? imgs->data[0].num_tiles : num_tiles;
num_channels = imgs->data[0].num_channels > 0 ? imgs->data[0].num_channels : num_channels;
}
struct ggml_init_params params = {
ctx->buf_compute_meta.size(), // mem_size
ctx->buf_compute_meta.data(), // mem_buffer
true, // no_alloc
};
struct ggml_context *ctx0 = ggml_init(params);
struct ggml_cgraph *gf = ggml_new_graph(ctx0);
struct ggml_tensor *inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, num_channels, num_tiles);
ggml_set_name(inp_raw, "inp_raw");
ggml_set_input(inp_raw);
struct ggml_tensor *inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, num_tiles);
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
struct ggml_tensor *aspect_ratios = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, imgs->size);
ggml_set_name(aspect_ratios, "aspect_ratios");
ggml_set_input(aspect_ratios);
if (model.pre_tile_position_embeddings != nullptr) {
struct ggml_tensor *pre_tile_position_embeddings = ggml_get_rows(ctx0, model.pre_tile_position_embeddings, aspect_ratios);
ggml_set_name(pre_tile_position_embeddings, "pre_tile_position_embeddings");
pre_tile_position_embeddings = ggml_reshape_3d(ctx0, pre_tile_position_embeddings, hidden_size, 1, num_tiles);
if (model.pre_tile_position_embeddings_gate != nullptr) {
pre_tile_position_embeddings = ggml_mul_inplace(ctx0, pre_tile_position_embeddings, model.pre_tile_position_embeddings_gate);
}
inp = ggml_add(ctx0, inp, pre_tile_position_embeddings);
}
struct ggml_tensor *embeddings = inp;
if (model.class_embedding != nullptr) {
// concat class_embeddings and patch_embeddings
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, num_tiles);
ggml_set_name(embeddings, "embeddings");
ggml_set_input(embeddings);
for (int i = 0; i < num_tiles; ++i) {
// repeat class embeddings for each tile
embeddings = ggml_acc_inplace(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], i * embeddings->nb[2]);
}
embeddings = ggml_acc_inplace(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
}
struct ggml_tensor *positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
ggml_set_name(positions, "positions");
ggml_set_input(positions);
struct ggml_tensor *position_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
if (model.position_embeddings_gate != nullptr) {
position_embd = ggml_mul_inplace(ctx0, position_embd, model.position_embeddings_gate);
}
embeddings = ggml_add(ctx0, embeddings, position_embd);
if (model.tile_position_embeddings != nullptr) {
struct ggml_tensor *tile_position_embeddings = ggml_get_rows(ctx0, model.tile_position_embeddings, aspect_ratios);
ggml_set_name(tile_position_embeddings, "tile_position_embeddings");
tile_position_embeddings = ggml_reshape_3d(ctx0, tile_position_embeddings, hidden_size, num_positions, num_tiles);
if (model.tile_position_embeddings_gate != nullptr) {
tile_position_embeddings = ggml_mul_inplace(ctx0, tile_position_embeddings, model.tile_position_embeddings_gate);
}
embeddings = ggml_add(ctx0, embeddings, tile_position_embeddings);
}
// pre-layernorm
if (model.pre_ln_w != nullptr) {
embeddings = ggml_mul(ctx0, ggml_norm(ctx0, embeddings, hparams.eps), model.pre_ln_w);
if (model.pre_ln_b != nullptr) {
embeddings = ggml_add(ctx0, embeddings, model.pre_ln_b);
}
ggml_set_name(embeddings, "pre layernorm");
}
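// pad the position dimension to a multiple of 8 before the encoder; note that
// 8 - (x % 8) % 8 parses as 8 - (x % 8), so an aligned count still gains a full block of 8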
const int num_padding_patches = 8 - (embeddings->ne[1] % 8) % 8;
embeddings = ggml_pad(ctx0, embeddings, 0, num_padding_patches, 0, 0);
embeddings = ggml_view_3d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1] * embeddings->ne[2], batch_size, embeddings->nb[1], embeddings->nb[2] * embeddings->ne[3], 0);
std::vector<struct ggml_tensor *> intermediate_embeddings;
// encoder
for (size_t il = 0; il < model.layers.size(); il++) {
if (hparams.intermediate_layers[il]) {
intermediate_embeddings.push_back(embeddings);
}
embeddings = mllama_image_build_encoder_layer(
ctx0, il, model.layers[il], embeddings,
hparams.eps, hidden_size, batch_size, n_head, d_head);
}
// post-layernorm
if (model.post_ln_w != nullptr) {
embeddings = ggml_mul(ctx0, ggml_norm(ctx0, embeddings, hparams.eps), model.post_ln_w);
if (model.post_ln_b != nullptr) {
embeddings = ggml_add(ctx0, embeddings, model.post_ln_b);
}
ggml_set_name(embeddings, "post layernorm");
}
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_positions + num_padding_patches, num_tiles);
if (model.post_tile_position_embeddings != nullptr) {
struct ggml_tensor *post_tile_position_embeddings = ggml_get_rows(ctx0, model.post_tile_position_embeddings, aspect_ratios);
ggml_set_name(post_tile_position_embeddings, "post_tile_position_embeddings");
post_tile_position_embeddings = ggml_reshape_3d(ctx0, post_tile_position_embeddings, hidden_size, 1, num_tiles);
if (model.post_tile_position_embeddings_gate != nullptr) {
post_tile_position_embeddings = ggml_mul(ctx0, post_tile_position_embeddings, model.post_tile_position_embeddings_gate);
}
embeddings = ggml_add(ctx0, embeddings, post_tile_position_embeddings);
}
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_tiles * (num_positions + num_padding_patches), 1);
// global encoder
for (size_t il = 0; il < model.global_layers.size(); il++) {
embeddings = mllama_image_build_encoder_layer(
ctx0, il, model.global_layers[il], embeddings,
hparams.eps, hidden_size, batch_size, n_head, d_head);
}
struct ggml_tensor *stacked_embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 0, hidden_size, (num_positions + num_padding_patches) * num_tiles);
for (size_t i = 0; i < intermediate_embeddings.size(); ++i) {
stacked_embeddings = ggml_concat(ctx0, stacked_embeddings, ggml_reshape_3d(ctx0, intermediate_embeddings[i], 1, intermediate_embeddings[i]->ne[0], intermediate_embeddings[i]->ne[1]), 0);
}
stacked_embeddings = ggml_reshape_4d(ctx0, stacked_embeddings, intermediate_embeddings.size() * hidden_size, num_positions + num_padding_patches, num_tiles, batch_size);
stacked_embeddings = ggml_unpad(ctx0, stacked_embeddings, 0, num_padding_patches, 0, 0);
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_positions + num_padding_patches, num_tiles);
embeddings = ggml_unpad(ctx0, embeddings, 0, num_padding_patches, 0, 0);
embeddings = ggml_concat(ctx0, embeddings, stacked_embeddings, 0);
// mllama projector
embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_0_w, embeddings), model.mm_0_b);
ggml_set_name(embeddings, "multi modal projector");
// build the graph
ggml_build_forward_expand(gf, embeddings);
ggml_free(ctx0);
return gf;
}
static struct ggml_tensor *mllama_tensor_load(struct ggml_context *ctx, const char *name, const bool optional) {
struct ggml_tensor *cur = ggml_get_tensor(ctx, name);
REQUIRE(cur != nullptr || optional);
return cur;
}
static std::vector<struct mllama_layer> mllama_layers_load(struct ggml_context *ctx, const char *prefix, const int n) {
std::vector<struct mllama_layer> layers(n);
for (size_t i = 0; i < layers.size(); i++) {
auto &layer = layers[i];
layer.ln_1_w = mllama_tensor_load(ctx, format("%s.blk.%d.ln1.weight", prefix, i).c_str(), false);
layer.ln_1_b = mllama_tensor_load(ctx, format("%s.blk.%d.ln1.bias", prefix, i).c_str(), false);
layer.ln_2_w = mllama_tensor_load(ctx, format("%s.blk.%d.ln2.weight", prefix, i).c_str(), false);
layer.ln_2_b = mllama_tensor_load(ctx, format("%s.blk.%d.ln2.bias", prefix, i).c_str(), false);
layer.k_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_k.weight", prefix, i).c_str(), false);
layer.k_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_k.bias", prefix, i).c_str(), true);
layer.q_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_q.weight", prefix, i).c_str(), false);
layer.q_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_q.bias", prefix, i).c_str(), true);
layer.v_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_v.weight", prefix, i).c_str(), false);
layer.v_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_v.bias", prefix, i).c_str(), true);
layer.o_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_out.weight", prefix, i).c_str(), false);
layer.o_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_out.bias", prefix, i).c_str(), true);
layer.ff_i_w = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_down.weight", prefix, i).c_str(), false);
layer.ff_i_b = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_down.bias", prefix, i).c_str(), false);
layer.ff_o_w = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_up.weight", prefix, i).c_str(), false);
layer.ff_o_b = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_up.bias", prefix, i).c_str(), false);
layer.attn_gate = mllama_tensor_load(ctx, format("%s.blk.%d.attn_gate", prefix, i).c_str(), true);
layer.ff_gate = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_gate", prefix, i).c_str(), true);
}
return layers;
}
// read and create ggml_context containing the tensors and their data
struct mllama_ctx *mllama_model_load(const char *fname, const int verbosity = 1) {
struct ggml_context *meta = nullptr;
struct gguf_init_params params = {
true, // no_alloc
&meta, // ctx
};
struct gguf_context *ctx = gguf_init_from_file(fname, params);
REQUIRE(ctx != nullptr);
if (verbosity >= 1) {
const int n_tensors = gguf_get_n_tensors(ctx);
const int n_kv = gguf_get_n_kv(ctx);
const std::string ftype = get_ftype(get_u32(ctx, "general.file_type"));
const int idx_desc = get_key_index(ctx, "general.description");
const std::string description = gguf_get_val_str(ctx, idx_desc);
const int idx_name = gguf_find_key(ctx, "general.name");
if (idx_name != -1) { // make name optional temporarily as some uploaded models are missing it due to a bug
const std::string name = gguf_get_val_str(ctx, idx_name);
LOG("model name: %s", name.c_str());
}
LOG("description: %s", description.c_str());
LOG("GGUF version: %d", gguf_get_version(ctx));
LOG("alignment: %zu", gguf_get_alignment(ctx));
LOG("n_tensors: %d", n_tensors);
LOG("n_kv: %d", n_kv);
LOG("ftype: %s", ftype.c_str());
LOG("");
}
const int n_tensors = gguf_get_n_tensors(ctx);
mllama_ctx *new_mllama = new mllama_ctx{};
ggml_backend_t backend = ggml_backend_init_best();
if (backend == nullptr) {
LOG("%s: failed to initialize backend\n", __func__);
mllama_free(new_mllama);
gguf_free(ctx);
return nullptr;
}
LOG("%s: using %s backend\n", __func__, ggml_backend_name(backend));
new_mllama->backend = backend;
// load tensors
{
std::vector<uint8_t> read_buf;
struct ggml_init_params params = {
(n_tensors + 1) * ggml_tensor_overhead(), // mem_size
nullptr, // mem_buffer
true, // no_alloc
};
new_mllama->ctx_data = ggml_init(params);
if (!new_mllama->ctx_data) {
LOG("ggml_init() failed");
mllama_free(new_mllama);
gguf_free(ctx);
return nullptr;
}
#ifdef _WIN32
int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
if (!wlen) {
return NULL;
}
wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, wbuf, wlen);
if (!wlen) {
free(wbuf);
return NULL;
}
#if __GLIBCXX__
int fd = _wopen(wbuf, _O_RDONLY | _O_BINARY);
__gnu_cxx::stdio_filebuf<char> buffer(fd, std::ios_base::in);
std::istream fin(&buffer);
#else // MSVC
// unused in our current build
auto fin = std::ifstream(wbuf, std::ios::binary);
#endif
free(wbuf);
#else
auto fin = std::ifstream(fname, std::ios::binary);
#endif
if (!fin) {
LOG("cannot open model file for loading tensors\n");
mllama_free(new_mllama);
gguf_free(ctx);
return nullptr;
}
// add tensors to context
for (int i = 0; i < n_tensors; ++i) {
const char *name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor *t = ggml_get_tensor(meta, name);
struct ggml_tensor *cur = ggml_dup_tensor(new_mllama->ctx_data, t);
ggml_set_name(cur, name);
}
// alloc memory and offload data
new_mllama->params_buffer = ggml_backend_alloc_ctx_tensors(new_mllama->ctx_data, new_mllama->backend);
for (int i = 0; i < n_tensors; ++i) {
const char *name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor *cur = ggml_get_tensor(new_mllama->ctx_data, name);
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
fin.seekg(offset, std::ios::beg);
if (!fin) {
LOG("failed to seek for tensor %s\n", name);
mllama_free(new_mllama);
gguf_free(ctx);
return nullptr;
}
int num_bytes = ggml_nbytes(cur);
if (ggml_backend_buffer_is_host(new_mllama->params_buffer)) {
// for the CPU and Metal backend, we can read directly into the tensor
fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
} else {
// read into a temporary buffer first, then copy to device memory
read_buf.resize(num_bytes);
fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
}
#if defined(_WIN32) && defined(__GLIBCXX__)
close(fd);
#else
fin.close();
#endif
}
// vision model
// load vision model
auto &vision_model = new_mllama->vision_model;
auto &hparams = vision_model.hparams;
hparams.hidden_size = get_u32(ctx, "mllama.vision.embedding_length");
hparams.n_head = get_u32(ctx, "mllama.vision.attention.head_count");
hparams.n_intermediate = get_u32(ctx, "mllama.vision.feed_forward_length");
hparams.n_layer = get_u32(ctx, "mllama.vision.block_count");
hparams.n_global_layer = get_u32(ctx, "mllama.vision.global.block_count");
hparams.n_tiles = get_u32(ctx, "mllama.vision.max_num_tiles");
hparams.image_size = get_u32(ctx, "mllama.vision.image_size");
hparams.patch_size = get_u32(ctx, "mllama.vision.patch_size");
hparams.projection_dim = get_u32(ctx, "mllama.vision.projection_dim");
hparams.eps = get_f32(ctx, "mllama.vision.attention.layer_norm_epsilon");
std::vector<uint32_t> intermediate_layers_indices = get_u32_array(ctx, "mllama.vision.intermediate_layers_indices");
hparams.intermediate_layers.resize(hparams.n_layer);
for (size_t i = 0; i < intermediate_layers_indices.size(); i++) {
hparams.intermediate_layers[intermediate_layers_indices[i]] = true;
}
if (verbosity >= 2) {
LOG("");
LOG("vision model hparams");
LOG("image_size %d", hparams.image_size);
LOG("patch_size %d", hparams.patch_size);
LOG("v_hidden_size %d", hparams.hidden_size);
LOG("v_n_intermediate %d", hparams.n_intermediate);
LOG("v_projection_dim %d", hparams.projection_dim);
LOG("v_n_head %d", hparams.n_head);
LOG("v_n_layer %d", hparams.n_layer);
LOG("v_n_global_layer %d", hparams.n_global_layer);
LOG("v_eps %f", hparams.eps);
}
vision_model.class_embedding = mllama_tensor_load(new_mllama->ctx_data, "v.class_embd", true);
vision_model.patch_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.patch_embd.weight", true);
vision_model.position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.position_embd.weight", true);
vision_model.position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.position_embd.gate", true);
vision_model.pre_ln_w = mllama_tensor_load(new_mllama->ctx_data, "v.pre_ln.weight", true);
vision_model.pre_ln_b = mllama_tensor_load(new_mllama->ctx_data, "v.pre_ln.bias", true);
vision_model.post_ln_w = mllama_tensor_load(new_mllama->ctx_data, "v.post_ln.weight", true);
vision_model.post_ln_b = mllama_tensor_load(new_mllama->ctx_data, "v.post_ln.bias", true);
vision_model.tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.tile_position_embd.weight", true);
vision_model.tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.tile_position_embd.gate", true);
vision_model.pre_tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.pre_tile_position_embd.weight", true);
vision_model.pre_tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.pre_tile_position_embd.gate", true);
vision_model.post_tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.post_tile_position_embd.weight", true);
vision_model.post_tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.post_tile_position_embd.gate", true);
vision_model.mm_0_w = mllama_tensor_load(new_mllama->ctx_data, "mm.0.weight", false);
vision_model.mm_0_b = mllama_tensor_load(new_mllama->ctx_data, "mm.0.bias", false);
vision_model.layers = mllama_layers_load(new_mllama->ctx_data, "v", hparams.n_layer);
vision_model.global_layers = mllama_layers_load(new_mllama->ctx_data, "v.global", hparams.n_global_layer);
ggml_free(meta);
new_mllama->ctx_gguf = ctx;
{
// measure mem requirement and allocate
new_mllama->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
new_mllama->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_mllama->backend));
struct mllama_image_batch batch;
batch.size = 1;
ggml_cgraph *gf = mllama_image_build_graph(new_mllama, &batch);
ggml_gallocr_reserve(new_mllama->compute_alloc, gf);
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_mllama->compute_alloc, 0);
LOG("compute allocated memory: %.2f MB", compute_memory_buffer_size / 1024.0 / 1024.0);
}
return new_mllama;
}
struct mllama_image *mllama_image_init() {
return new mllama_image();
}
void mllama_image_free(struct mllama_image *img) { delete img; }
void mllama_image_batch_free(struct mllama_image_batch *batch) {
if (batch->size > 0) {
delete[] batch->data;
batch->size = 0;
}
}
bool mllama_image_load_from_data(const void *data, const int n, const int width, const int height, const int num_channels, const int num_tiles, const int aspect_ratio_id, struct mllama_image *img) {
img->width = width;
img->height = height;
img->num_channels = num_channels;
img->num_tiles = num_tiles;
img->aspect_ratio_id = aspect_ratio_id;
img->data.resize(n);
memcpy(img->data.data(), data, n);
return true;
}
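// integer clamp helper; the renamed counterpart of the clip() helper in the clip.cpp this file was derived from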
inline int mllama(int x, int lower, int upper) {
return std::max(lower, std::min(x, upper));
}
void mllama_free(mllama_ctx *ctx) {
ggml_free(ctx->ctx_data);
gguf_free(ctx->ctx_gguf);
ggml_backend_buffer_free(ctx->params_buffer);
ggml_backend_free(ctx->backend);
ggml_gallocr_free(ctx->compute_alloc);
delete ctx;
}
bool mllama_image_encode(struct mllama_ctx *ctx, const int n_threads, mllama_image *img, float *vec) {
mllama_image_batch imgs{};
imgs.size = 1;
imgs.data = img;
return mllama_image_batch_encode(ctx, n_threads, &imgs, vec);
}
bool mllama_image_batch_encode(mllama_ctx *ctx, const int n_threads, const mllama_image_batch *imgs, float *vec) {
int batch_size = imgs->size;
REQUIRE(batch_size == 1);
// build the inference graph
ggml_cgraph *gf = mllama_image_build_graph(ctx, imgs);
ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
// set inputs
const auto &model = ctx->vision_model;
const auto &hparams = model.hparams;
const int image_size = hparams.image_size;
int image_size_width = image_size;
int image_size_height = image_size;
const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int num_positions = num_patches + (model.class_embedding == nullptr ? 0 : 1);
{
struct ggml_tensor *inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
ggml_backend_tensor_set(inp_raw, imgs->data[0].data.data(), 0, ggml_nbytes(inp_raw));
}
{
struct ggml_tensor *embeddings = ggml_graph_get_tensor(gf, "embeddings");
if (embeddings != nullptr) {
void *zeros = malloc(ggml_nbytes(embeddings));
memset(zeros, 0, ggml_nbytes(embeddings));
ggml_backend_tensor_set(embeddings, zeros, 0, ggml_nbytes(embeddings));
free(zeros);
}
}
{
struct ggml_tensor *positions = ggml_graph_get_tensor(gf, "positions");
if (positions != nullptr) {
int *positions_data = (int *)malloc(ggml_nbytes(positions));
for (int i = 0; i < num_positions; i++) {
positions_data[i] = i;
}
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
free(positions_data);
}
}
{
struct ggml_tensor *aspect_ratios = ggml_graph_get_tensor(gf, "aspect_ratios");
if (aspect_ratios != nullptr) {
int *aspect_ratios_data = (int *)malloc(ggml_nbytes(aspect_ratios));
aspect_ratios_data[0] = imgs->data[0].aspect_ratio_id;
ggml_backend_tensor_set(aspect_ratios, aspect_ratios_data, 0, ggml_nbytes(aspect_ratios));
free(aspect_ratios_data);
}
}
if (ggml_backend_is_cpu(ctx->backend)) {
ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
}
ggml_backend_graph_compute(ctx->backend, gf);
// the last node is the embedding tensor
struct ggml_tensor *embeddings = ggml_graph_node(gf, ggml_graph_n_nodes(gf) - 1);
// copy the embeddings to the location passed by the user
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
return true;
}
int32_t mllama_image_size(const struct mllama_ctx *ctx) {
return ctx->vision_model.hparams.image_size;
}
int32_t mllama_patch_size(const struct mllama_ctx *ctx) {
return ctx->vision_model.hparams.patch_size;
}
int32_t mllama_hidden_size(const struct mllama_ctx *ctx) {
return ctx->vision_model.hparams.hidden_size;
}
int mllama_n_patches(const struct mllama_ctx *ctx) {
const auto &hparams = ctx->vision_model.hparams;
return (hparams.image_size / hparams.patch_size) * (hparams.image_size / hparams.patch_size);
}
int mllama_n_positions(const struct mllama_ctx *ctx) {
return mllama_n_patches(ctx) + (ctx->vision_model.class_embedding == nullptr ? 0 : 1);
}
int mllama_n_tiles(const struct mllama_ctx *ctx) {
return ctx->vision_model.hparams.n_tiles;
}
int mllama_n_embd(const struct mllama_ctx *ctx) {
return ctx->vision_model.hparams.projection_dim;
}
size_t mllama_n_embd_bytes(const struct mllama_ctx *ctx) {
return mllama_n_positions(ctx) * mllama_n_embd(ctx) * mllama_n_tiles(ctx) * sizeof(float);
}
#ifndef MLLAMA_H
#define MLLAMA_H
#include <stddef.h>
#include <stdint.h>
#ifdef LLAMA_SHARED
#if defined(_WIN32) && !defined(__MINGW32__)
#ifdef LLAMA_BUILD
#define MLLAMA_API __declspec(dllexport)
#else
#define MLLAMA_API __declspec(dllimport)
#endif
#else
#define MLLAMA_API __attribute__((visibility("default")))
#endif
#else
#define MLLAMA_API
#endif
#ifdef __cplusplus
extern "C" {
#endif
struct mllama_ctx;
struct mllama_image_batch {
struct mllama_image *data;
size_t size;
};
MLLAMA_API struct mllama_ctx *mllama_model_load(const char *fname, int verbosity);
MLLAMA_API struct mllama_ctx *mllama_model_load_cpu(const char *fname, int verbosity);
MLLAMA_API void mllama_free(struct mllama_ctx *ctx);
MLLAMA_API int32_t mllama_image_size(const struct mllama_ctx *ctx);
MLLAMA_API int32_t mllama_patch_size(const struct mllama_ctx *ctx);
MLLAMA_API int32_t mllama_hidden_size(const struct mllama_ctx *ctx);
MLLAMA_API int mllama_n_patches(const struct mllama_ctx *ctx);
MLLAMA_API int mllama_n_positions(const struct mllama_ctx *ctx);
MLLAMA_API int mllama_n_tiles(const struct mllama_ctx *ctx);
MLLAMA_API int mllama_n_embd(const struct mllama_ctx *ctx);
MLLAMA_API size_t mllama_n_embd_bytes(const struct mllama_ctx *ctx);
MLLAMA_API struct mllama_image *mllama_image_init();
MLLAMA_API void mllama_image_free(struct mllama_image *img);
MLLAMA_API void mllama_image_batch_free(struct mllama_image_batch *batch);
MLLAMA_API bool mllama_image_load_from_data(const void *data, const int n, const int nx, const int ny, const int nc, const int nt, const int aspect_ratio_id, struct mllama_image *img);
MLLAMA_API bool mllama_image_encode(struct mllama_ctx *ctx, int n_threads, struct mllama_image *img, float *vec);
MLLAMA_API bool mllama_image_batch_encode(struct mllama_ctx *ctx, int n_threads, const struct mllama_image_batch *imgs, float *vec);
#ifdef __cplusplus
}
#endif
#endif // MLLAMA_H
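For reference, the lifecycle this header supported, and which the removed Go bindings above drove, was load → image init → load data → encode → free. A condensed sketch in Go (assumes the cgo preamble from the deleted llama.go; the 560×560×3×4 arguments mirror the removed NewEmbed; error checks trimmed):

// Sketch only; not the shipped API. Assumes this package cgo-includes mllama.h.
func mllamaEmbed(modelPath string, imageData []byte, aspectRatioID int) []float32 {
	mp := C.CString(modelPath)
	defer C.free(unsafe.Pointer(mp))

	ctx := C.mllama_model_load(mp, 1)
	if ctx == nil {
		return nil
	}
	defer C.mllama_free(ctx)

	img := C.mllama_image_init()
	defer C.mllama_image_free(img)
	C.mllama_image_load_from_data(unsafe.Pointer(&imageData[0]), C.int(len(imageData)), 560, 560, 3, 4, C.int(aspectRatioID), img)

	// mllama_n_embd_bytes == n_positions * n_embd * n_tiles * sizeof(float)
	out := make([]float32, int(C.mllama_n_embd_bytes(ctx))/4)
	C.mllama_image_encode(ctx, C.int(4), img, (*C.float)(unsafe.Pointer(&out[0])))
	return out
}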
@@ -24,7 +24,7 @@ problem.
 9 files changed, 21 insertions(+), 2 deletions(-)
 diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
-index 273075f4..dd11f304 100644
+index b30b4cb3..0ce73a99 100644
 --- a/ggml/src/ggml-backend.cpp
 +++ b/ggml/src/ggml-backend.cpp
 @@ -107,7 +107,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -43,7 +43,7 @@ index 273075f4..dd11f304 100644
 }
 static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-@@ -1867,6 +1867,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+@@ -1871,6 +1871,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
 static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 ggml_aligned_free(buffer->context, buffer->size);
@@ -55,7 +55,7 @@ index 273075f4..dd11f304 100644
 }
 static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
-@@ -1914,7 +1919,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
+@@ -1918,7 +1923,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
 };
 static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
@@ -85,7 +85,7 @@ index e2617b06..242e50a7 100644
 /**
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 9fb2134f..04ce764e 100644
+index b4b85abc..cb0d8528 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
 @@ -534,6 +534,7 @@ struct ggml_backend_cuda_buffer_context {
@@ -96,7 +96,7 @@ index 9fb2134f..04ce764e 100644
 }
 static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
-@@ -789,6 +790,7 @@ struct ggml_backend_cuda_split_buffer_context {
+@@ -790,6 +791,7 @@ struct ggml_backend_cuda_split_buffer_context {
 static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
 delete ctx;
@@ -104,7 +104,7 @@ index 9fb2134f..04ce764e 100644
 }
 static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -1062,6 +1064,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
+@@ -1067,6 +1069,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
 static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 CUDA_CHECK(cudaFreeHost(buffer->context));
@@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644
 static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index d92392ed..425524d0 100644
+index 576f9581..1b56f858 100644
 --- a/ggml/src/ggml-metal/ggml-metal.m
 +++ b/ggml/src/ggml-metal/ggml-metal.m
-@@ -5077,6 +5077,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
+@@ -5214,6 +5214,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
 }
 free(ctx);
@@ -149,10 +149,10 @@ index 05a2f4e6..392cc18d 100644
 static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
-index 140a775f..e33c4ba0 100644
+index 4f0abb5a..de1ec184 100644
 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp
 +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
-@@ -477,6 +477,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -483,6 +483,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
 GGML_ASSERT(status);
 delete ctx;
@@ -161,10 +161,10 @@ index 140a775f..e33c4ba0 100644
 static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
-index 66b6f2cc..e3e6deae 100644
+index 0ea72994..ae3a3c33 100644
 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp
 +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
-@@ -317,6 +317,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
+@@ -320,6 +320,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
 ggml_sycl_set_device(ctx->device);
 delete ctx;
@@ -172,7 +172,7 @@ index 66b6f2cc..e3e6deae 100644
 }
 catch (sycl::exception const &exc) {
 std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-@@ -762,6 +763,7 @@ struct ggml_backend_sycl_split_buffer_context {
+@@ -765,6 +766,7 @@ struct ggml_backend_sycl_split_buffer_context {
 static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
 delete ctx;
@@ -180,7 +180,7 @@ index 66b6f2cc..e3e6deae 100644
 }
 static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -1096,6 +1098,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
+@@ -1099,6 +1101,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
 static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 ggml_sycl_host_free(buffer->context);
@@ -189,10 +189,10 @@ index 66b6f2cc..e3e6deae 100644
 static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index c0bdb9e1..03d03064 100644
+index e2b357fd..68768029 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-@@ -8660,6 +8660,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -8962,6 +8962,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
 ggml_vk_destroy_buffer(ctx->dev_buffer);
 delete ctx;
@@ -200,7 +200,7 @@ index c0bdb9e1..03d03064 100644
 }
 static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -8803,6 +8804,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
+@@ -9105,6 +9106,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
 static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
 ggml_vk_host_free(vk_instance.devices[0], buffer->context);
...
@@ -10,10 +10,10 @@ logs instead of throwing an error
 1 file changed, 3 insertions(+), 11 deletions(-)
 diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index 50ded286..a9ee9f03 100644
+index 9389ca80..806c1b3d 100644
 --- a/src/llama-vocab.cpp
 +++ b/src/llama-vocab.cpp
-@@ -1491,16 +1491,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
+@@ -1503,16 +1503,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 if (type == LLAMA_VOCAB_TYPE_BPE) {
 add_space_prefix = false;
 clean_spaces = true;
@@ -31,8 +31,8 @@ index 50ded286..a9ee9f03 100644
 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
 } else if (
 tokenizer_pre == "llama3" ||
-@@ -1635,7 +1626,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
+@@ -1651,7 +1642,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
-pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
+pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
 clean_spaces = false;
 } else {
 - throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
...
@@ -11,10 +11,10 @@ instead of forcing one or the error
 1 file changed, 3 insertions(+), 3 deletions(-)
 diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index 5a2eef9b..9c1fe93f 100644
+index 62246c10..dca22d8b 100644
 --- a/src/llama-context.cpp
 +++ b/src/llama-context.cpp
-@@ -1225,7 +1225,7 @@ int llama_context::decode(llama_batch & inp_batch) {
+@@ -901,7 +901,7 @@ int llama_context::decode(llama_batch & inp_batch) {
 int64_t n_outputs_all = 0;
 // count outputs
@@ -23,7 +23,7 @@ index 5a2eef9b..9c1fe93f 100644
 for (uint32_t i = 0; i < n_tokens_all; ++i) {
 n_outputs_all += batch.logits[i] != 0;
 }
-@@ -1337,7 +1337,7 @@ int llama_context::decode(llama_batch & inp_batch) {
+@@ -982,7 +982,7 @@ int llama_context::decode(llama_batch & inp_batch) {
 // ggml_graph_dump_dot(gf, NULL, "llama.dot");
 //}
@@ -32,7 +32,7 @@ index 5a2eef9b..9c1fe93f 100644
 auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr;
 if (t_embd && res->get_embd_pooled()) {
-@@ -1481,7 +1481,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
+@@ -1151,7 +1151,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
 const auto n_embd = hparams.n_embd;
 // TODO: use a per-batch flag for logits presence instead
...
@@ -6,16 +6,16 @@ Subject: [PATCH] clip-unicode
 fixes loading vision models in llama.cpp on windows
 filesystems for paths that include wide characters
 ---
- examples/llava/clip.cpp | 39 +++++++++++++++++++++++++++++++++++++++
+ tools/mtmd/clip.cpp | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
-diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
+diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
-index ad3e7df1..b3218c78 100644
+index 41ba45a7..cdd8ca44 100644
---- a/examples/llava/clip.cpp
+--- a/tools/mtmd/clip.cpp
-+++ b/examples/llava/clip.cpp
++++ b/tools/mtmd/clip.cpp
-@@ -30,6 +30,19 @@
+@@ -31,6 +31,19 @@
-#include <array>
 #include <numeric>
+#include <functional>
 +#if defined(_WIN32)
 +#define WIN32_LEAN_AND_MEAN
@@ -32,8 +32,8 @@ index ad3e7df1..b3218c78 100644
 +
 struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
-//#define CLIP_DEBUG_FUNCTIONS
+enum ffn_op_type {
-@@ -1971,7 +1984,29 @@ struct clip_model_loader {
+@@ -2190,7 +2203,29 @@ struct clip_model_loader {
 {
 std::vector<uint8_t> read_buf;
@@ -63,7 +63,7 @@ index ad3e7df1..b3218c78 100644
 if (!fin) {
 throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
 }
-@@ -1998,7 +2033,11 @@ struct clip_model_loader {
+@@ -2217,7 +2252,11 @@ struct clip_model_loader {
 ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
 }
 }
...
@@ -138,7 +138,7 @@ index 7ee6a5b7..48dce407 100644
 };
 diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
-index ea73a8a7..a012aeae 100644
+index 4cce5166..7f6617fa 100644
 --- a/src/llama-model-loader.cpp
 +++ b/src/llama-model-loader.cpp
 @@ -439,6 +439,7 @@ namespace GGUFMeta {
@@ -150,10 +150,10 @@ index ea73a8a7..a012aeae 100644
 llama_model_loader::llama_model_loader(
 const std::string & fname,
 diff --git a/src/llama-model.cpp b/src/llama-model.cpp
-index 822e2bb2..572378c9 100644
+index 3a4e72a3..831b68c0 100644
 --- a/src/llama-model.cpp
 +++ b/src/llama-model.cpp
-@@ -1386,6 +1386,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
+@@ -1402,6 +1402,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 default: type = LLM_TYPE_UNKNOWN;
 }
 } break;
@@ -175,7 +175,7 @@ index 822e2bb2..572378c9 100644
 case LLM_ARCH_WAVTOKENIZER_DEC:
 {
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-@@ -3741,6 +3756,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
+@@ -3774,6 +3789,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -210,7 +210,7 @@ index 822e2bb2..572378c9 100644
 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-@@ -12342,6 +12385,165 @@ struct llm_build_chameleon : public llm_graph_context {
+@@ -12397,6 +12440,165 @@ struct llm_build_chameleon : public llm_graph_context {
 }
 };
@@ -270,7 +270,7 @@ index 822e2bb2..572378c9 100644
 + // self-attention
 + {
 + // rope freq factors for llama3; may return nullptr for llama2 and other models
-+            ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
++            ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 +
 + // compute Q and K and RoPE them
 + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -376,7 +376,7 @@ index 822e2bb2..572378c9 100644
 struct llm_build_wavtokenizer_dec : public llm_graph_context {
 llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
 ggml_tensor * cur;
-@@ -13092,6 +13294,10 @@ llm_graph_result_ptr llama_model::build_graph(
+@@ -13157,6 +13359,10 @@ llm_graph_result_ptr llama_model::build_graph(
 {
 llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
 } break;
@@ -387,7 +387,7 @@ index 822e2bb2..572378c9 100644
 case LLM_ARCH_WAVTOKENIZER_DEC:
 {
 llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
-@@ -13238,6 +13444,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
+@@ -13301,6 +13507,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
 case LLM_ARCH_GRANITE:
 case LLM_ARCH_GRANITE_MOE:
 case LLM_ARCH_CHAMELEON:
@@ -396,10 +396,10 @@ index 822e2bb2..572378c9 100644
 return LLAMA_ROPE_TYPE_NORM;
 diff --git a/src/llama-model.h b/src/llama-model.h
-index 95eca002..856e6042 100644
+index 6bdec263..43746c7d 100644
 --- a/src/llama-model.h
 +++ b/src/llama-model.h
-@@ -64,6 +64,7 @@ enum llm_type {
+@@ -65,6 +65,7 @@ enum llm_type {
 LLM_TYPE_15B,
 LLM_TYPE_16B,
 LLM_TYPE_20B,
@@ -407,7 +407,7 @@ index 95eca002..856e6042 100644
 LLM_TYPE_27B,
 LLM_TYPE_30B,
 LLM_TYPE_32B,
-@@ -311,6 +312,8 @@ struct llama_layer {
+@@ -315,6 +316,8 @@ struct llama_layer {
 struct ggml_tensor * ffn_up_scale = nullptr;
 struct ggml_tensor * ffn_down_scale = nullptr;
......
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sun, 20 Apr 2025 16:12:36 -0700
Subject: [PATCH] add mllama support
adds support for the llama 3.2 vision architecture
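For orientation (an editorial sketch, not part of the patch): mllama keeps the
standard llama decoder, but on a configurable subset of layers it swaps
self-attention for gated cross-attention over the vision encoder's output. The
helper below mirrors the llama_hparams::cross_attention_layers() membership
test added later in this patch; the free-standing function itself is
hypothetical.

#include <algorithm>
#include <cstdint>
#include <vector>

// a layer cross-attends iff its index appears in the
// "%s.attention.cross_attention_layers" GGUF array added by this patch
static bool is_cross_attention_layer(const std::vector<uint32_t> & cross_attn_layers, uint32_t il) {
    return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
}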
---
examples/llava/llava.cpp | 5 +-
examples/llava/mtmd.cpp | 6 +-
ggml/src/ggml-backend-reg.cpp | 6 +-
include/llama.h | 6 +
src/llama-arch.cpp | 44 +++++
src/llama-arch.h | 10 ++
src/llama-batch.cpp | 3 +
src/llama-context.cpp | 25 ++-
src/llama-context.h | 1 +
src/llama-cparams.h | 1 +
src/llama-graph.cpp | 25 +++
src/llama-graph.h | 12 ++
src/llama-hparams.cpp | 4 +
src/llama-hparams.h | 7 +
src/llama-kv-cache.cpp | 12 +-
src/llama-model-loader.cpp | 2 +
src/llama-model.cpp | 309 +++++++++++++++++++++++++++++++++-
src/llama-model.h | 12 ++
src/llama-quant.cpp | 4 +-
19 files changed, 473 insertions(+), 21 deletions(-)
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index c00d16ae..bab027b5 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -457,7 +457,7 @@ struct llava_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+ llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -469,6 +469,7 @@ struct llava_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
+ /*n_embd =*/ n_embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -492,7 +493,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
n_eval = n_batch;
}
float * embd = image_embed->embed+i*n_embd;
- llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
+ llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0);
if (llama_decode(ctx_llama, llava_batch.batch)) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 7081fd73..c14ac501 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -476,7 +476,7 @@ struct decode_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
pos .resize(n_tokens * n_pos_per_embd);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -487,6 +487,7 @@ struct decode_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
+ /*n_embd =*/ n_embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -610,7 +611,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
int32_t i_batch = 0;
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
float * embd = mtmd_get_output_embd(ctx);
- decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
+ int n_embd = llama_model_n_embd(llama_get_model(lctx));
+ decode_embd_batch batch_embd(embd, n_embd, n_tokens, n_past, 0);
const int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get());
const int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get());
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 405d8e31..82ae1b5b 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -178,9 +178,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_CANN
register_backend(ggml_backend_cann_reg());
#endif
-#ifdef GGML_USE_BLAS
- register_backend(ggml_backend_blas_reg());
-#endif
+// #ifdef GGML_USE_BLAS
+// register_backend(ggml_backend_blas_reg());
+// #endif
#ifdef GGML_USE_RPC
register_backend(ggml_backend_rpc_reg());
#endif
diff --git a/include/llama.h b/include/llama.h
index 06c56395..f1628e88 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -256,6 +256,7 @@ extern "C" {
llama_token * token;
float * embd;
+ int32_t n_embd;
llama_pos * pos;
int32_t * n_seq_id;
llama_seq_id ** seq_id;
@@ -358,6 +359,7 @@ extern "C" {
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
bool no_perf; // whether to measure performance timings
+ bool cross_attn; // whether to use cross attention
// Abort callback
// if it returns true, execution of llama_decode() will be aborted
@@ -459,6 +461,10 @@ extern "C" {
struct llama_context_params params),
"use llama_init_from_model instead");
+ // TODO (jmorganca): this should most likely be passed in as part of a batch
+ // and not set on the context for all batches.
+ LLAMA_API void llama_set_cross_attention(struct llama_context * ctx, bool cross_attn_state);
+
// Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx);
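A hedged usage sketch for the new entry point (illustrative only; only
llama_set_cross_attention and llama_decode are real API here, the caller
context is assumed):

// after encoding an image, enable cross attention so the decoder's
// cross-attention layers run against the freshly set image embeddings
void example_decode_with_image(struct llama_context * ctx, struct llama_batch image_embd_batch) {
    llama_set_cross_attention(ctx, true);
    llama_decode(ctx, image_embd_batch); // per the TODO above, a per-batch flag may replace this
}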
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 5ab3f572..eb7b5325 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -6,6 +6,7 @@
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_LLAMA, "llama" },
+ { LLM_ARCH_MLLAMA, "mllama" },
{ LLM_ARCH_LLAMA4, "llama4" },
{ LLM_ARCH_DECI, "deci" },
{ LLM_ARCH_FALCON, "falcon" },
@@ -144,6 +145,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
@@ -273,6 +275,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
},
},
+ {
+ LLM_ARCH_MLLAMA,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+ { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+ { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ { LLM_TENSOR_CROSS_ATTN_K_NORM, "blk.%d.cross_attn_k_norm" },
+ { LLM_TENSOR_CROSS_ATTN_K_PROJ, "blk.%d.cross_attn_k_proj" },
+ { LLM_TENSOR_CROSS_ATTN_O_PROJ, "blk.%d.cross_attn_o_proj" },
+ { LLM_TENSOR_CROSS_ATTN_Q_NORM, "blk.%d.cross_attn_q_norm" },
+ { LLM_TENSOR_CROSS_ATTN_Q_PROJ, "blk.%d.cross_attn_q_proj" },
+ { LLM_TENSOR_CROSS_ATTN_V_PROJ, "blk.%d.cross_attn_v_proj" },
+ { LLM_TENSOR_CROSS_ATTN_ATTN_GATE, "blk.%d.cross_attn_attn_gate" },
+ { LLM_TENSOR_CROSS_ATTN_MLP_GATE, "blk.%d.cross_attn_mlp_gate" },
+ },
+ },
{
LLM_ARCH_DECI,
{
@@ -1701,6 +1737,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
{LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_CROSS_ATTN_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_CROSS_ATTN_K_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CROSS_ATTN_O_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CROSS_ATTN_Q_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_CROSS_ATTN_Q_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CROSS_ATTN_V_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CROSS_ATTN_ATTN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_CROSS_ATTN_MLP_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}},
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 525c1b7d..bc8a4f0b 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -11,6 +11,7 @@
enum llm_arch {
LLM_ARCH_LLAMA,
LLM_ARCH_LLAMA4,
+ LLM_ARCH_MLLAMA,
LLM_ARCH_DECI,
LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN,
@@ -148,6 +149,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
+ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
@@ -349,6 +351,14 @@ enum llm_tensor {
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
LLM_TENSOR_BSKCN_TV,
+ LLM_TENSOR_CROSS_ATTN_K_NORM,
+ LLM_TENSOR_CROSS_ATTN_K_PROJ,
+ LLM_TENSOR_CROSS_ATTN_O_PROJ,
+ LLM_TENSOR_CROSS_ATTN_Q_NORM,
+ LLM_TENSOR_CROSS_ATTN_Q_PROJ,
+ LLM_TENSOR_CROSS_ATTN_V_PROJ,
+ LLM_TENSOR_CROSS_ATTN_ATTN_GATE,
+ LLM_TENSOR_CROSS_ATTN_MLP_GATE,
LLM_TENSOR_CONV1D,
LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_NORM,
diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp
index 01d5ca57..8682b0e6 100644
--- a/src/llama-batch.cpp
+++ b/src/llama-batch.cpp
@@ -316,6 +316,7 @@ struct llama_batch llama_batch_get_one(
/*n_tokens =*/ n_tokens,
/*tokens =*/ tokens,
/*embd =*/ nullptr,
+ /*n_embd =*/ 0,
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@@ -328,6 +329,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
/*n_tokens =*/ 0,
/*tokens =*/ nullptr,
/*embd =*/ nullptr,
+ /*n_embd =*/ 0,
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@@ -336,6 +338,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
if (embd) {
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
+ batch.n_embd = embd;
} else {
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
}
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 9c1fe93f..cd06ad91 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -851,7 +851,7 @@ float * llama_context::get_logits_ith(int32_t i) {
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
}
- return logits + j*model.vocab.n_tokens();
+ return logits + j*model.hparams.n_vocab;
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
@@ -972,6 +972,10 @@ void llama_context::set_warmup(bool value) {
cparams.warmup = value;
}
+void llama_context::set_cross_attn(bool value) {
+ cparams.cross_attn = value;
+}
+
void llama_context::set_adapter_lora(
llama_adapter_lora * adapter,
float scale) {
@@ -1047,7 +1051,7 @@ int llama_context::encode(llama_batch & inp_batch) {
const int64_t n_embd = hparams.n_embd;
- sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
+ sbatch.from_batch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true);
const llama_ubatch ubatch = sbatch.split_simple(n_tokens);
@@ -1187,10 +1191,9 @@ int llama_context::decode(llama_batch & inp_batch) {
const llama_batch & batch = batch_allocr.batch;
- const auto & vocab = model.vocab;
const auto & hparams = model.hparams;
- const int32_t n_vocab = vocab.n_tokens();
+ const int32_t n_vocab = hparams.n_vocab;
const int64_t n_tokens_all = batch.n_tokens;
const int64_t n_embd = hparams.n_embd;
@@ -1238,7 +1241,7 @@ int llama_context::decode(llama_batch & inp_batch) {
const bool logits_all = n_outputs_all == n_tokens_all;
- sbatch.from_batch(batch, n_embd,
+ sbatch.from_batch(batch, batch.n_embd,
/* simple_split */ !kv_self->recurrent,
/* logits_all */ logits_all);
@@ -1472,12 +1475,11 @@ int llama_context::decode(llama_batch & inp_batch) {
int32_t llama_context::output_reserve(int32_t n_outputs) {
const auto & hparams = model.hparams;
- const auto & vocab = model.vocab;
const int64_t n_outputs_max = std::max<int64_t>(n_outputs, n_seq_max());
const auto n_batch = cparams.n_batch;
- const auto n_vocab = vocab.n_tokens();
+ const auto n_vocab = hparams.n_vocab;
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
@@ -1545,7 +1547,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
void llama_context::output_reorder() {
auto & out_ids = sbatch.out_ids;
if (!out_ids.empty()) {
- const uint32_t n_vocab = model.vocab.n_tokens();
+ const uint32_t n_vocab = model.hparams.n_vocab;
const uint32_t n_embd = model.hparams.n_embd;
GGML_ASSERT((size_t) n_outputs == out_ids.size());
@@ -2052,7 +2054,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
{
LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
- const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens());
+ const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.hparams.n_vocab);
io.write(&logits_size, sizeof(logits_size));
@@ -2235,6 +2237,7 @@ llama_context_params llama_context_default_params() {
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false,
/*.no_perf =*/ true,
+ /*.cross_attn =*/ false,
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
};
@@ -2362,6 +2365,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
ctx->set_warmup(warmup);
}
+void llama_set_cross_attention(struct llama_context * ctx, bool cross_attention) {
+ ctx->set_cross_attn(cross_attention);
+}
+
void llama_synchronize(llama_context * ctx) {
ctx->synchronize();
}
diff --git a/src/llama-context.h b/src/llama-context.h
index 5457f077..a50c4afa 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -65,6 +65,7 @@ struct llama_context {
void set_embeddings (bool value);
void set_causal_attn(bool value);
void set_warmup(bool value);
+ void set_cross_attn(bool value);
void set_adapter_lora(
llama_adapter_lora * adapter,
diff --git a/src/llama-cparams.h b/src/llama-cparams.h
index 30e550f0..85ad91b9 100644
--- a/src/llama-cparams.h
+++ b/src/llama-cparams.h
@@ -29,6 +29,7 @@ struct llama_cparams {
bool offload_kqv;
bool flash_attn;
bool no_perf;
+ bool cross_attn;
bool warmup;
enum llama_pooling_type pooling_type;
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index fabb9ca2..b67216a4 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -560,6 +560,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
}
}
+void llm_graph_input_cross_attn_state::set_input(const llama_ubatch * ubatch) {
+ if (ubatch->embd) {
+ ggml_backend_tensor_set(cross_attn_state, ubatch->embd, 0, ggml_nbytes(cross_attn_state));
+ }
+}
+
//
// llm_graph_context
//
@@ -1532,6 +1538,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
}
+ggml_tensor * llm_graph_context::build_inp_cross_attn_state() const {
+ const int64_t n_embd = hparams.n_embd;
+
+ auto inp = std::make_unique<llm_graph_input_cross_attn_state>();
+
+ ggml_tensor * cur = nullptr;
+
+ inp->cross_attn_state = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd, 1601, 4);
+ ggml_set_input(inp->cross_attn_state);
+
+ cur = inp->cross_attn_state;
+
+ cb(cur, "inp_cross_attn_state", -1);
+
+ res->add_input(std::move(inp));
+
+ return cur;
+}
+
ggml_tensor * llm_graph_context::build_attn(
llm_graph_input_attn_cross * inp,
ggml_cgraph * gf,
diff --git a/src/llama-graph.h b/src/llama-graph.h
index d0c8d321..0fe18150 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -86,6 +86,7 @@ public:
ggml_tensor * tokens = nullptr; // I32 [n_batch]
ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch]
+ ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1601]
};
class llm_graph_input_pos : public llm_graph_input_i {
@@ -283,6 +284,16 @@ public:
const llama_cross * cross = nullptr;
};
+class llm_graph_input_cross_attn_state : public llm_graph_input_i {
+public:
+ llm_graph_input_cross_attn_state() = default;
+ virtual ~llm_graph_input_cross_attn_state() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1601]
+};
+
//
// llm_graph_result
//
@@ -491,6 +502,7 @@ struct llm_graph_context {
ggml_tensor * build_inp_cls() const;
ggml_tensor * build_inp_s_copy() const;
ggml_tensor * build_inp_s_mask() const;
+ ggml_tensor * build_inp_cross_attn_state() const;
ggml_tensor * build_inp_cross_embd() const;
ggml_tensor * build_inp_pos_bucket_enc() const;
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index 8a667960..6a02de03 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -85,3 +85,7 @@ bool llama_hparams::is_swa(uint32_t il) const {
GGML_ABORT("fatal error");
}
+
+bool llama_hparams::cross_attention_layers(uint32_t il) const {
+ return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
+}
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 48dce407..b6fc7e6d 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -2,6 +2,8 @@
#include "llama.h"
+#include <algorithm>
+
#include <array>
// bump if necessary
@@ -42,6 +44,7 @@ struct llama_hparams {
uint32_t n_expert = 0;
uint32_t n_expert_used = 0;
uint32_t n_rel_attn_bkts = 0;
+ uint32_t n_vocab = 0;
// note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
uint32_t n_embd_head_k_mla = 0;
@@ -56,6 +59,7 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr = {};
+ std::array<uint32_t, LLAMA_MAX_LAYERS> cross_attn_layers;
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
@@ -159,6 +163,9 @@ struct llama_hparams {
// Block skip connection
bool n_bskcn(uint32_t n, uint32_t il) const;
+ // cross attention layers
+ bool cross_attention_layers(uint32_t il) const;
+
bool is_swa(uint32_t il) const;
};
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 7c9d46d8..69f8d35a 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -95,8 +95,16 @@ bool llama_kv_cache_unified::init(
return false;
}
- ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
- ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
+ ggml_tensor * k, *v;
+
+ // for cross attention layers
+ if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
+ k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
+ v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
+ } else {
+ k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
+ v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
+ }
ggml_format_name(k, "cache_k_l%d", i);
ggml_format_name(v, "cache_v_l%d", i);
k_l.push_back(k);
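Side note on the magic number (an inference, not stated by the patch): 6404 is
the cross-attention sequence length, 4 image tiles x 1601 positions per tile =
6404, which lines up with the [n_embd, 1601, 4] cross_attn_state tensor created
in llama-graph.cpp; 1601 is plausibly a 40x40 patch grid plus one class token.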
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index a012aeae..2e11507d 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -315,6 +315,8 @@ namespace GGUFMeta {
return true;
}
+ template bool llama_model_loader::get_arr<std::array<unsigned int, 512>>(enum llm_kv kid, std::array<unsigned int, 512>& result, bool required);
+
template<typename T, size_t N_MAX>
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
const int kid = gguf_find_key(meta.get(), key.c_str());
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 572378c9..9d099f11 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -423,6 +423,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
// get general kv
ml.get_key(LLM_KV_GENERAL_NAME, name, false);
+ ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);
// everything past this point is not vocab-related
if (hparams.vocab_only) {
@@ -434,6 +435,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
+ ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false);
if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
@@ -457,9 +459,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
+ std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1);
ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
+ ml.get_arr(LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, hparams.cross_attn_layers, false);
// n_head_kv is optional, default to n_head
hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -512,7 +516,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
- if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
+ if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_MLLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
}
@@ -575,6 +579,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.use_kq_norm = false;
}
} break;
+ case LLM_ARCH_MLLAMA:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 40: type = LLM_TYPE_11B; break;
+ case 100: type = LLM_TYPE_90B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
case LLM_ARCH_DECI:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1562,7 +1576,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_embd_head_v = hparams.n_embd_head_v;
const int64_t n_ff = hparams.n_ff();
const int64_t n_embd_gqa = n_embd_v_gqa;
- const int64_t n_vocab = vocab.n_tokens();
+ const int64_t n_vocab = hparams.n_vocab;
const int64_t n_token_types = vocab.n_token_types();
const int64_t n_rot = hparams.n_rot;
const int64_t n_expert = hparams.n_expert;
@@ -1815,6 +1829,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
}
} break;
+ case LLM_ARCH_MLLAMA:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0);
+
+ // output
+ {
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+ }
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ if (hparams.cross_attention_layers(i)) {
+ layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128}, 0);
+ layer.cross_attn_k_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024}, 0);
+ layer.cross_attn_o_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd}, 0);
+ layer.cross_attn_q_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128}, 0);
+ layer.cross_attn_q_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd}, 0);
+ layer.cross_attn_v_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024}, 0);
+ layer.cross_attn_attn_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1}, 0);
+ layer.cross_attn_mlp_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1}, 0);
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ } else {
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ }
+ } break;
case LLM_ARCH_DECI:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4707,6 +4767,246 @@ struct llm_build_llama : public llm_graph_context {
}
};
+struct llm_build_mllama: public llm_graph_context {
+ llm_build_mllama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+ ggml_tensor * inpCAS;
+
+ inpL = build_inp_embd(model.tok_embd);
+ inpCAS = build_inp_cross_attn_state();
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv_unified();
+ const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ if (hparams.cross_attention_layers(il)) {
+ if (!ubatch.embd && !cparams.cross_attn) {
+ continue;
+ }
+
+ // cross attention layer
+ ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_q_proj, cur);
+ cb(Qcur, "Qcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ cb(Qcur, "Qcur", il);
+
+ Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3));
+ cb(Qcur, "Qcur", il);
+
+ Qcur = build_norm(Qcur, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur, * Vcur;
+ if (ubatch.embd) {
+ Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS);
+ cb(Kcur, "Kcur", il);
+
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 6404);
+ cb(Kcur, "Kcur", il);
+
+ Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
+ cb(Kcur, "Kcur", il);
+
+ Kcur = build_norm(Kcur, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur", il);
+
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self->k_l[il]));
+
+ Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS);
+ cb(Vcur, "Vcur", il);
+
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, 6404);
+ cb(Vcur, "Vcur", il);
+
+ Vcur = ggml_permute(ctx0, Vcur, 0, 2, 1, 3);
+ cb(Vcur, "Vcur", il);
+
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self->v_l[il]));
+ } else {
+ Kcur = ggml_view_tensor(ctx0, kv_self->k_l[il]);
+ cb(Kcur, "Kcur (view)", il);
+
+ Vcur = ggml_view_tensor(ctx0, kv_self->v_l[il]);
+ cb(Vcur, "Vcur (view)", il);
+ }
+
+ struct ggml_tensor * kq = ggml_mul_mat(ctx0, Kcur, Qcur);
+ cb(kq, "kq", il);
+
+ // TODO: apply causal masks
+ struct ggml_tensor * kq_soft_max = ggml_soft_max_ext(ctx0, kq, nullptr, 1.f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
+ cb(kq_soft_max, "kq_soft_max", il);
+
+ Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, Vcur));
+ cb(Vcur, "Vcur", il);
+
+ struct ggml_tensor * kqv = ggml_mul_mat(ctx0, Vcur, kq_soft_max);
+ cb(kqv, "kqv", il);
+
+ struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+ cb(kqv_merged, "kqv_merged", il);
+
+ cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens);
+ cb(cur, "kqv_merged_cont", il);
+
+ cur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_o_proj, cur);
+ cb(cur, "cur", il);
+
+ // TODO: do this in place once?
+ cur = ggml_mul(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_attn_gate));
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ // TODO: do this inplace once?
+ cur = ggml_add_inplace(ctx0, ggml_mul_inplace(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_mlp_gate)), ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ } else {
+ // self attention layer
+
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ n_tokens = n_outputs;
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+};
+
struct llm_build_deci : public llm_graph_context {
llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -13063,6 +13363,10 @@ llm_graph_result_ptr llama_model::build_graph(
{
llm = std::make_unique<llm_build_llama>(*this, params, gf);
} break;
+ case LLM_ARCH_MLLAMA:
+ {
+ llm = std::make_unique<llm_build_mllama>(*this, params, gf);
+ } break;
case LLM_ARCH_DECI:
{
llm = std::make_unique<llm_build_deci>(*this, params, gf);
@@ -13424,6 +13728,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
case LLM_ARCH_LLAMA4:
+ case LLM_ARCH_MLLAMA:
case LLM_ARCH_DECI:
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
diff --git a/src/llama-model.h b/src/llama-model.h
index 856e6042..6be91282 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -11,6 +11,7 @@
#include <string>
#include <unordered_map>
#include <vector>
+#include <stdexcept>
struct llama_cparams;
struct llama_ubatch;
@@ -73,6 +74,7 @@ enum llm_type {
LLM_TYPE_40B,
LLM_TYPE_65B,
LLM_TYPE_70B,
+ LLM_TYPE_90B,
LLM_TYPE_236B,
LLM_TYPE_290B,
LLM_TYPE_314B,
@@ -314,6 +316,16 @@ struct llama_layer {
struct ggml_tensor * bskcn_tv = nullptr;
+ // cross attention
+ struct ggml_tensor * cross_attn_k_norm = nullptr;
+ struct ggml_tensor * cross_attn_k_proj = nullptr;
+ struct ggml_tensor * cross_attn_o_proj = nullptr;
+ struct ggml_tensor * cross_attn_q_norm = nullptr;
+ struct ggml_tensor * cross_attn_q_proj = nullptr;
+ struct ggml_tensor * cross_attn_v_proj = nullptr;
+ struct ggml_tensor * cross_attn_attn_gate = nullptr;
+ struct ggml_tensor * cross_attn_mlp_gate = nullptr;
+
struct llama_layer_posnet posnet;
struct llama_layer_convnext convnext;
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 7dc54227..223e1f3f 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -639,7 +639,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
}
- GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
+ if (qs.n_attention_wv != n_attn_layer) {
+ LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv);
+ }
}
size_t total_size_org = 0;
...@@ -12,10 +12,10 @@ regex
2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 806c1b3d..10f34d33 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -298,7 +298,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
regex_exprs = {
"[\r\n]",
...
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sun, 13 Apr 2025 22:10:06 -0400
Subject: [PATCH] add unpad operator
adds the unpad operator to GGML
---
ggml/include/ggml.h | 10 +++++
ggml/src/ggml-cpu/ggml-cpu.c | 5 +++
ggml/src/ggml-cpu/ops.cpp | 55 ++++++++++++++++++++++++++++
ggml/src/ggml-cpu/ops.h | 1 +
ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++
ggml/src/ggml-cuda/pad.cu | 46 +++++++++++++++++++++++
ggml/src/ggml-cuda/pad.cuh | 1 +
ggml/src/ggml-metal/ggml-metal.m | 33 +++++++++++++++++
ggml/src/ggml-metal/ggml-metal.metal | 45 +++++++++++++++++++++++
ggml/src/ggml.c | 25 ++++++++++++-
10 files changed, 223 insertions(+), 2 deletions(-)
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 1b8603e7..53ef31b2 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -489,6 +489,7 @@ extern "C" {
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D,
+ GGML_OP_UNPAD,
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
@@ -1777,6 +1778,15 @@ extern "C" {
int p0,
int p1);
+ // unpad each dimension: [x, ..., x, y, ..., y] -> [x, ..., x]
+ GGML_API struct ggml_tensor * ggml_unpad(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int p0,
+ int p1,
+ int p2,
+ int p3);
+
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
// timesteps: [N,]
// return: [N, dim]
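A shape sketch for the new operator (illustrative, not from the patch):
ggml_unpad is the inverse of ggml_pad, so each result dimension is
a->ne[i] - p[i] and only the leading region of the source is kept.

// e.g. trim a tensor padded up to [580, 588, 3, 1] back down to [560, 560, 3, 1]
struct ggml_tensor * t = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 580, 588, 3, 1);
struct ggml_tensor * u = ggml_unpad(ctx, t, 20, 28, 0, 0);
// u->ne = {560, 560, 3, 1}; the kernels below copy only this region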
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 64405449..34624cca 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1964,6 +1964,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_pad_reflect_1d(params, tensor);
} break;
+ case GGML_OP_UNPAD:
+ {
+ ggml_compute_forward_unpad(params, tensor);
+ } break;
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
@@ -2287,6 +2291,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
+ case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 7413192b..becdae07 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -6703,6 +6703,61 @@ void ggml_compute_forward_pad_reflect_1d(
}
}
+// ggml_compute_forward_unpad
+
+static void ggml_compute_forward_unpad_f32(
+ const struct ggml_compute_params *params,
+ struct ggml_tensor *dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
+ GGML_ASSERT( dst->nb[0] == sizeof(float));
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ GGML_TENSOR_UNARY_OP_LOCALS
+
+ float * dst_ptr = (float *) dst->data;
+
+ // TODO: optimize
+
+ for (int64_t i2 = 0; i2 < ne2; ++i2) {
+ for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
+ for (int64_t i3 = 0; i3 < ne3; ++i3) {
+ const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
+
+ const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+
+ if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+ dst_ptr[dst_idx] = *src_ptr;
+ }
+ }
+ }
+ }
+ }
+}
+
+void ggml_compute_forward_unpad(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_unpad_f32(params, dst);
+ } break;
+ default:
+ {
+ GGML_ABORT("fatal error");
+ }
+ }
+}
+
// ggml_compute_forward_arange
static void ggml_compute_forward_arange_f32(
diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
index dc081b9e..a7125555 100644
--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
@@ -72,6 +72,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_unpad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 04ce764e..491acccb 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2223,6 +2223,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_PAD:
ggml_cuda_op_pad(ctx, dst);
break;
+ case GGML_OP_UNPAD:
+ ggml_cuda_op_unpad(ctx, dst);
+ break;
case GGML_OP_ARANGE:
ggml_cuda_op_arange(ctx, dst);
break;
@@ -3197,6 +3200,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_UPSCALE:
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
case GGML_OP_PAD:
+ case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_LEAKY_RELU:
diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu
index 77432b04..7d45a7e1 100644
--- a/ggml/src/ggml-cuda/pad.cu
+++ b/ggml/src/ggml-cuda/pad.cu
@@ -47,3 +47,49 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
}
+
+static __global__ void unpad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
+ // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
+ // blockIdx.y: idx of ne1
+ // blockIdx.x: idx of ne0 / BLOCK_SIZE
+ int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+ if (nidx >= ne0) {
+ return;
+ }
+
+ // operation
+ int offset_dst =
+ nidx +
+ blockIdx.y * ne0 +
+ blockIdx.z * ne0 * gridDim.y;
+ if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
+ int offset_src =
+ nidx +
+ blockIdx.y * ne00 +
+ blockIdx.z * ne00 * ne01;
+ dst[offset_dst] = x[offset_src];
+ }
+}
+
+static void unpad_f32_cuda(const float * x, float * dst,
+ const int ne00, const int ne01, const int ne02, const int ne03,
+ const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
+ int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
+ dim3 gridDim(num_blocks, ne1, ne2*ne3);
+ unpad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
+}
+
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+ const ggml_tensor * src0 = dst->src[0];
+ const float * src0_d = (const float *)src0->data;
+ float * dst_d = (float *)dst->data;
+ cudaStream_t stream = ctx.stream();
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
+ GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
+
+ unpad_f32_cuda(src0_d, dst_d,
+ src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+ dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
+}
\ No newline at end of file
diff --git a/ggml/src/ggml-cuda/pad.cuh b/ggml/src/ggml-cuda/pad.cuh
index 8fd386b0..e2ededc3 100644
--- a/ggml/src/ggml-cuda/pad.cuh
+++ b/ggml/src/ggml-cuda/pad.cuh
@@ -3,3 +3,4 @@
#define CUDA_PAD_BLOCK_SIZE 256
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 425524d0..112abef6 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -341,6 +341,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
GGML_METAL_KERNEL_TYPE_PAD_F32,
GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
+ GGML_METAL_KERNEL_TYPE_UNPAD_F32,
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@@ -1277,6 +1278,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@@ -1647,6 +1649,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_POOL_2D:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
+ case GGML_OP_UNPAD:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
case GGML_OP_LEAKY_RELU:
@@ -4047,6 +4050,36 @@ static bool ggml_metal_encode_node(
const int nth = MIN(1024, ne0);
+ [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+ } break;
+ case GGML_OP_UNPAD:
+ {
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+
+ id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UNPAD_F32].pipeline;
+
+ [encoder setComputePipelineState:pipeline];
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
+ [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
+ [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
+ [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
+ [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
+ [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+ [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+ [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+ [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
+ [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10];
+ [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11];
+ [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12];
+ [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13];
+ [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14];
+ [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
+ [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
+ [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
+
+ const int nth = MIN(1024, ne0);
+
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_ARANGE:
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 9f4147e9..6ceb3cef 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2975,6 +2975,51 @@ kernel void kernel_pad_reflect_1d_f32(
}
}
+kernel void kernel_unpad_f32(
+ device const char * src0,
+ device char * dst,
+ constant int64_t & ne00,
+ constant int64_t & ne01,
+ constant int64_t & ne02,
+ constant int64_t & ne03,
+ constant uint64_t & nb00,
+ constant uint64_t & nb01,
+ constant uint64_t & nb02,
+ constant uint64_t & nb03,
+ constant int64_t & ne0,
+ constant int64_t & ne1,
+ constant int64_t & ne2,
+ constant int64_t & ne3,
+ constant uint64_t & nb0,
+ constant uint64_t & nb1,
+ constant uint64_t & nb2,
+ constant uint64_t & nb3,
+ uint3 tgpig[[threadgroup_position_in_grid]],
+ uint3 tpitg[[thread_position_in_threadgroup]],
+ uint3 ntg[[threads_per_threadgroup]]) {
+
+ const int64_t i3 = tgpig.z;
+ const int64_t i2 = tgpig.y;
+ const int64_t i1 = tgpig.x;
+
+ const int64_t i03 = i3;
+ const int64_t i02 = i2;
+ const int64_t i01 = i1;
+
+ device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
+ device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1);
+
+ if (i1 < ne01 && i2 < ne02 && i3 < ne03) {
+ for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
+ if (i0 < ne00) {
+ dst_ptr[i0] = src0_ptr[i0];
+ }
+ }
+
+ return;
+ }
+}
+
kernel void kernel_arange_f32(
device char * dst,
constant ggml_metal_kargs_arange & args,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 7654ae17..3c57aff8 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -923,6 +923,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"UPSCALE",
"PAD",
"PAD_REFLECT_1D",
+ "UNPAD",
"ARANGE",
"TIMESTEP_EMBEDDING",
"ARGSORT",
@@ -953,7 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW",
};
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -1018,6 +1019,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"upscale(x)",
"pad(x)",
"pad_reflect_1d(x)",
+ "unpad(x)",
"arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)",
"argsort(x)",
@@ -1048,7 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)",
};
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -4270,6 +4272,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
return result;
}
+// ggml_unpad
+
+struct ggml_tensor * ggml_unpad(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int p0, int p1, int p2, int p3) {
+
+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
+ a->ne[0] - p0,
+ a->ne[1] - p1,
+ a->ne[2] - p2,
+ a->ne[3] - p3);
+
+ result->op = GGML_OP_UNPAD;
+ result->src[0] = a;
+
+ return result;
+}
+
// ggml_arange
struct ggml_tensor * ggml_arange(
...@@ -15,33 +15,139 @@ but this can leave a cache that still does not have adequate space
even after defragmentation is triggered. Instead, we should do
multiple batches of processing until everything is complete.
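A minimal sketch of the batching idea (editorial; the real loop in
llama_kv_cache_unified::update() below chunks defrag_info.moves the same way,
and build_and_run_defrag_graph is a hypothetical stand-in):

for (size_t i = 0; i < moves.size(); i += max_moves) {
    const size_t end = std::min(i + max_moves, moves.size());
    build_and_run_defrag_graph(moves.begin() + i, moves.begin() + end);
}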
---
src/llama-context.cpp | 18 ++++---
src/llama-context.h | 1 +
src/llama-kv-cache.cpp | 107 ++++++++++++++---------------------------
src/llama-kv-cache.h | 12 ++++-
4 files changed, 59 insertions(+), 79 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index c22687e4..c5948e8f 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -950,9 +950,12 @@ int llama_context::decode(llama_batch & inp_batch) {
// find KV slot
if (!kv_self->find_slot(ubatch)) {
- LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
- return 1;
+ kv_self->defrag_sched(-1.0f);
+ kv_self->update(*this);
+ if (!kv_self->find_slot(ubatch)) {
+ LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+ return 1;
+ }
}
ggml_backend_sched_reset(sched.get());
@@ -1967,9 +1970,12 @@ void llama_context::opt_epoch_iter(
// TODO: not sure if this is needed
if (!kv_self->find_slot(ubatch)) {
- LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
- GGML_ABORT("TODO: handle this error");
+ kv_self->defrag_sched(-1.0f);
+ kv_self->update(*this);
+ if (!kv_self->find_slot(ubatch)) {
+ LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+ GGML_ABORT("TODO: handle this error");
+ }
}
auto * gf = graph_init();
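Both call sites now share the same recover-and-retry shape: schedule an immediate defrag, apply it, and repeat the slot search once before giving up. Paraphrased as a free-standing helper (the names follow the diff above; this exact function does not exist in the tree):

    // Sketch of the retry logic introduced above, not a verbatim excerpt.
    static bool find_slot_with_defrag(llama_kv_cache_unified * kv, llama_context & lctx, const llama_ubatch & ubatch) {
        if (kv->find_slot(ubatch)) {
            return true;
        }
        kv->defrag_sched(-1.0f);      // negative threshold forces a defrag pass
        kv->update(lctx);             // runs the scheduled defrag graph(s)
        return kv->find_slot(ubatch); // a second failure is reported to the caller
    }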
diff --git a/src/llama-context.h b/src/llama-context.h
index c0ceacb1..0264e937 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -5,6 +5,7 @@
#include "llama-cparams.h"
#include "llama-graph.h"
#include "llama-adapter.h"
+#include "llama-kv-cache.h"
#include "ggml-cpp.h"
#include "ggml-opt.h"
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 3dcad65b..60e67b03 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -364,8 +364,6 @@ void llama_kv_cache_unified::commit() {
}
bool llama_kv_cache_unified::update(llama_context & lctx) {
- bool need_reserve = false;
-
auto * sched = lctx.get_sched();
if (has_shift) {
@@ -388,8 +386,6 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
res->set_inputs(nullptr);
lctx.graph_compute(gf, false);
-
- need_reserve = true;
}
{
@@ -403,27 +399,36 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
if (do_defrag) {
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
+ const uint32_t n_max_nodes = lctx.graph_max_nodes();
+ const uint32_t max_moves = (n_max_nodes - 2*model.hparams.n_layer)/(6*model.hparams.n_layer);
+ if (!defrag_prepare(n_max_nodes)) {
+ LLAMA_LOG_ERROR("%s: failed to prepare defragmentation\n", __func__);
+ return false;
+ }
+
+ for (std::size_t i = 0; i < defrag_info.moves.size(); i += max_moves) {
+ std::vector<struct llama_kv_defrag_move> chunk;
+ auto end = std::min(i + max_moves, defrag_info.moves.size());
+ chunk.assign(defrag_info.moves.begin() + i, defrag_info.moves.begin() + end);
- if (defrag_prepare(lctx.graph_max_nodes())) {
ggml_backend_sched_reset(sched);
auto * gf = lctx.graph_init();
- auto res = build_graph_defrag(lctx.get_cparams(), lctx.get_ctx_compute(), gf);
+ auto res = build_graph_defrag(lctx.get_cparams(), lctx.get_ctx_compute(), gf, chunk);
ggml_backend_sched_alloc_graph(sched, gf);
res->set_inputs(nullptr);
lctx.graph_compute(gf, false);
-
- need_reserve = true;
}
do_defrag = false;
}
- return need_reserve;
+ // we never need to reserve a worst case graph
+ return false;
}
void llama_kv_cache_unified::defrag_sched(float thold) {
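The chunk size above comes from the graph-node budget: every move costs six nodes per layer (a source view, a destination view, and a copy, for both K and V), and 2*n_layer nodes are held back as slack. A worked example of the same formula with assumed values (an 8192-node graph and a 32-layer model):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t n_max_nodes = 8192; // assumed graph_max_nodes() budget
        const uint32_t n_layer     = 32;   // assumed model.hparams.n_layer

        // same formula as the patch: keep 2*n_layer spare, 6 nodes per move per layer
        const uint32_t max_moves = (n_max_nodes - 2*n_layer)/(6*n_layer);
        printf("up to %u KV moves per defrag graph\n", max_moves); // prints 42
        return 0;
    }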
@@ -707,11 +712,10 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
const llama_cparams & cparams,
ggml_context * ctx,
- ggml_cgraph * gf) const {
+ ggml_cgraph * gf,
+ const std::vector<struct llama_kv_defrag_move> & moves) const {
auto res = std::make_unique<llm_graph_result>();
- const auto & ids = defrag_info.ids;
-
#if 0
// CPU defrag
//
@@ -783,32 +787,20 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
}
#else
...@@ -63,188 +169,63 @@ index cd06ad91..77177c5e 100644
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
ggml_tensor * view_k_src = ggml_view_2d(ctx, k_l[il],
- n_embd_k_gqa, nm,
+ n_embd_k_gqa, move.len,
ggml_row_size(k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(k_l[il]->type, n_embd_k_gqa*i));
+ ggml_row_size(k_l[il]->type, n_embd_k_gqa*move.src));
ggml_tensor * view_k_dst = ggml_view_2d(ctx, k_l[il],
- n_embd_k_gqa, nm,
+ n_embd_k_gqa, move.len,
ggml_row_size(k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(k_l[il]->type, n_embd_k_gqa*id));
+ ggml_row_size(k_l[il]->type, n_embd_k_gqa*move.dst));
ggml_tensor * view_v_src;
ggml_tensor * view_v_dst;
@@ -816,31 +808,29 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
if (cparams.flash_attn) {
// NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx, v_l[il],
- n_embd_v_gqa, nm,
+ n_embd_v_gqa, move.len,
ggml_row_size(v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(v_l[il]->type, n_embd_v_gqa*i));
+ ggml_row_size(v_l[il]->type, n_embd_v_gqa*move.src));
view_v_dst = ggml_view_2d(ctx, v_l[il],
- n_embd_v_gqa, nm,
+ n_embd_v_gqa, move.len,
ggml_row_size(v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(v_l[il]->type, n_embd_v_gqa*id));
+ ggml_row_size(v_l[il]->type, n_embd_v_gqa*move.dst));
} else {
view_v_src = ggml_view_2d(ctx, v_l[il],
- nm, n_embd_v_gqa,
+ move.len, n_embd_v_gqa,
ggml_row_size(v_l[il]->type, size),
- ggml_row_size(v_l[il]->type, i));
+ ggml_row_size(v_l[il]->type, move.src));
view_v_dst = ggml_view_2d(ctx, v_l[il],
- nm, n_embd_v_gqa,
+ move.len, n_embd_v_gqa,
ggml_row_size(v_l[il]->type, size),
- ggml_row_size(v_l[il]->type, id));
+ ggml_row_size(v_l[il]->type, move.dst));
}
ggml_build_forward_expand(gf, ggml_cpy(ctx, view_k_src, view_k_dst));
ggml_build_forward_expand(gf, ggml_cpy(ctx, view_v_src, view_v_dst));
}
-
- i += nm - 1;
}
#endif
//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
return res;
@@ -730,8 +713,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
void llama_context::kv_self_update() {
auto & kv = kv_self;
- bool need_reserve = false;
-
if (kv->has_shift) {
if (!kv->get_can_shift()) {
GGML_ABORT("The current context does not support K-shift");
@@ -752,8 +733,6 @@ void llama_context::kv_self_update() {
res->set_inputs(nullptr);
graph_compute(gf, false);
-
- need_reserve = true;
}
{
@@ -768,49 +747,28 @@ void llama_context::kv_self_update() {
// defragment the KV cache if needed
if (kv->do_defrag) {
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
+ const uint32_t n_max_nodes = graph_max_nodes();
+ const uint32_t max_moves = (n_max_nodes - 2*model.hparams.n_layer)/(6*model.hparams.n_layer);
+ if (!kv->defrag_prepare(n_max_nodes)) {
+ LLAMA_LOG_ERROR("%s: failed to prepare defragmentation\n", __func__);
+ return;
+ }
- if (kv->defrag_prepare(graph_max_nodes())) {
- ggml_backend_sched_reset(sched.get());
+ for (std::size_t i = 0; i < kv_self->defrag_info.moves.size(); i += max_moves) {
+ std::vector<struct llama_kv_defrag_move> chunk;
+ auto end = std::min(i + max_moves, kv_self->defrag_info.moves.size());
+ chunk.assign(kv_self->defrag_info.moves.begin() + i, kv_self->defrag_info.moves.begin() + end);
+ ggml_backend_sched_reset(sched.get());
auto * gf = graph_init();
-
- auto res = build_kv_self_defrag(ctx_compute.get(), gf);
-
+ auto res = build_kv_self_defrag(ctx_compute.get(), gf, chunk);
ggml_backend_sched_alloc_graph(sched.get(), gf);
-
res->set_inputs(nullptr);
-
graph_compute(gf, false);
-
- need_reserve = true;
}
kv->do_defrag = false;
}
-
- // reserve a worst case graph if needed
- if (need_reserve) {
- LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__);
-
- // build worst-case graph
- uint32_t n_seqs = 1; // TODO: worst-case number of sequences
- uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
-
- // simulate full KV cache
- kv_self->n = kv_self->size;
-
- llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
- llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
-
- auto * gf = graph_init();
- graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT);
-
- // initialize scheduler with the worst-case graph
- ggml_backend_sched_reset(sched.get());
- if (!ggml_backend_sched_reserve(sched.get(), gf)) {
- LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
- }
- }
}
enum llama_pooling_type llama_context::pooling_type() const {
@@ -1294,9 +1252,12 @@ int llama_context::decode(llama_batch & inp_batch) {
// find KV slot
{
if (!kv_self->find_slot(ubatch)) {
- LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
- return 1;
+ kv_self->defrag();
+ kv_self_update();
+ if (!kv_self->find_slot(ubatch)) {
+ LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+ return 1;
+ }
}
if (!kv_self->recurrent) {
diff --git a/src/llama-context.h b/src/llama-context.h
index a50c4afa..30f84bfd 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -5,6 +5,7 @@
#include "llama-cparams.h"
#include "llama-graph.h"
#include "llama-adapter.h"
+#include "llama-kv-cache.h"
#include "ggml-cpp.h"
@@ -179,7 +180,8 @@ private:
llm_graph_result_ptr build_kv_self_defrag(
ggml_context * ctx0,
- ggml_cgraph * gf) const;
+ ggml_cgraph * gf,
+ const std::vector<struct llama_kv_defrag_move> & moves) const;
// TODO: read/write lora adapters and cvec
size_t state_write_data(llama_io_write_i & io);
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 69f8d35a..35a750d3 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -781,17 +781,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
assert(n_used <= n_kv); assert(n_used <= n_kv);
...@@ -263,7 +244,7 @@ index 69f8d35a..35a750d3 100644
// determine which KV cells to move where
//
@@ -875,10 +855,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
//
// if ids[i] == i || ids[i] == n_kv, then cell i is not moved
//
...@@ -275,7 +256,7 @@ index 69f8d35a..35a750d3 100644
for (uint32_t i0 = 0; i0 < n_used; ++i0) {
const auto & cell0 = cells[i0];
@@ -927,19 +904,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
// are we moving a continuous block of memory?
bool cont = false;
...@@ -295,7 +276,7 @@ index 69f8d35a..35a750d3 100644
cont = false;
continue;
}
@@ -955,8 +924,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
head = n_used;
...@@ -307,7 +288,7 @@ index 69f8d35a..35a750d3 100644
if (!cont) {
} }
nf++; nf++;
@@ -890,22 +871,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { @@ -966,22 +937,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
} }
} }
...@@ -325,37 +306,47 @@ index 69f8d35a..35a750d3 100644 ...@@ -325,37 +306,47 @@ index 69f8d35a..35a750d3 100644
return false; return false;
} }
- LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves); - LLAMA_LOG_DEBUG("%s: (tmp log) KV defrag cell moves: %u\n", __func__, n_moves);
- -
- LLAMA_LOG_DEBUG("expected gf nodes: %u\n", 6*n_moves*n_layer); - LLAMA_LOG_DEBUG("%s: expected gf nodes: %u\n", __func__, 6*n_moves*n_layer);
+ // LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves); + // LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves);
return true; return true;
} }
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
index bf3b4b6a..928b9712 100644
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@@ -82,6 +82,13 @@ struct llama_kv_cache_guard {
private:
llama_kv_cache * kv; llama_kv_cache * kv;
}; };
+
+// block of KV slots to move when defragging +// block of KV slots to move when defragging
+struct llama_kv_defrag_move { +struct llama_kv_defrag_move {
+ uint32_t src; + uint32_t src;
+ uint32_t dst; + uint32_t dst;
+ uint32_t len; + uint32_t len;
+}; +};
+
struct llama_kv_cell {
llama_pos pos = -1;
llama_pos delta = 0;
@@ -131,7 +138,7 @@ public:
// defrag
//
// llama_kv_cache_unified
@@ -207,7 +214,7 @@ private:
// defrag
struct { struct {
- std::vector<uint32_t> ids;
+ std::vector<llama_kv_defrag_move> moves;
} defrag_info;
// return true if cells have been moved // return true if cells have been moved
@@ -249,7 +256,8 @@ private:
llm_graph_result_ptr build_graph_defrag(
const llama_cparams & cparams,
ggml_context * ctx,
- ggml_cgraph * gf) const;
+ ggml_cgraph * gf,
+ const std::vector<llama_kv_defrag_move> & moves) const;
void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
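The moves vector replaces the per-cell ids table: defrag_prepare opens a new entry when a relocated cell does not extend the previous block, and otherwise just lengthens it. A simplified model of that accumulation step (the cell scan and head bookkeeping around it are elided, so this helper is illustrative rather than the patch's code):

    #include <cstdint>
    #include <vector>

    struct kv_move { uint32_t src, dst, len; }; // mirrors llama_kv_defrag_move

    // Record that cell `src` relocates to `dst`, merging contiguous runs.
    static void record_move(std::vector<kv_move> & moves, uint32_t src, uint32_t dst) {
        if (!moves.empty() &&
            moves.back().src + moves.back().len == src &&
            moves.back().dst + moves.back().len == dst) {
            moves.back().len++;             // extend the current block
        } else {
            moves.push_back({src, dst, 1}); // start a new block
        }
    }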
...@@ -11,7 +11,7 @@ with the fastest acceleration is loaded
1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 405d8e31..4e67d243 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -157,7 +157,7 @@ struct ggml_backend_reg_entry {
...
...@@ -8,7 +8,7 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
1 file changed, 2 insertions(+)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index ddea5ad3..45918bf6 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -279,6 +279,7 @@ function(ggml_add_cpu_backend_variant tag_name)
...
...@@ -9,7 +9,7 @@ disable amx as it reduces performance on some systems
1 file changed, 4 deletions(-)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 45918bf6..0beaed86 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -296,10 +296,6 @@ if (GGML_CPU_ALL_VARIANTS)
...
...@@ -9,8 +9,8 @@ such as vocab fields
---
 ggml/include/gguf.h | 1 +
 ggml/src/gguf.cpp   | 7 +++++--
 src/llama-vocab.cpp | 4 +---
 3 files changed, 7 insertions(+), 5 deletions(-)
diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h
index 79ee2020..3efb22f0 100644
...@@ -53,15 +53,17 @@ index 381a9c7d..e45b453d 100644
}
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 10f34d33..9f5fd57b 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1469,9 +1469,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
if (precompiled_charsmap_keyidx != -1) {
const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
- GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);
-
- const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
+ const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
#ifdef IS_BIG_ENDIAN
...@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
1 file changed, 6 insertions(+)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index a30e67f2..2462d2b8 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -15,6 +15,8 @@
...@@ -20,7 +20,7 @@ index 34624cca..59bd3c62 100644
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -2841,6 +2843,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
ggml_compute_forward(&params, node);
...
...@@ -184,7 +184,7 @@ index f8c291de..2a3a62db 100644
const char * grammar_root,
bool lazy,
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 804b11e0..15a10ca8 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -1466,7 +1466,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
...
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <git@mxy.ng>
Date: Thu, 1 May 2025 13:45:12 -0700
Subject: [PATCH] add argsort and cuda copy for i32
---
ggml/src/ggml-cpu/ops.cpp | 43 ++++++++++++++
ggml/src/ggml-cuda/argsort.cu | 102 +++++++++++++++++++++++++++++++++-
ggml/src/ggml-cuda/cpy.cu | 49 ++++++++++++++++
3 files changed, 192 insertions(+), 2 deletions(-)
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index becdae07..7a44b6cf 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -6890,6 +6890,45 @@ static void ggml_compute_forward_argsort_f32(
}
}
+static void ggml_compute_forward_argsort_i32(
+ const ggml_compute_params * params,
+ ggml_tensor * dst) {
+
+ const ggml_tensor * src0 = dst->src[0];
+
+ GGML_TENSOR_UNARY_OP_LOCALS
+
+ GGML_ASSERT(nb0 == sizeof(int32_t));
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int64_t nr = ggml_nrows(src0);
+
+ ggml_sort_order order = (ggml_sort_order) ggml_get_op_params_i32(dst, 0);
+
+ for (int64_t i = ith; i < nr; i += nth) {
+ int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
+ const int32_t * src_data = (int32_t *)((char *) src0->data + i*nb01);
+
+ for (int64_t j = 0; j < ne0; j++) {
+ dst_data[j] = j;
+ }
+
+ // C doesn't have a functional sort, so we do a bubble sort instead
+ for (int64_t j = 0; j < ne0; j++) {
+ for (int64_t k = j + 1; k < ne0; k++) {
+ if ((order == GGML_SORT_ORDER_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
+ (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
+ int32_t tmp = dst_data[j];
+ dst_data[j] = dst_data[k];
+ dst_data[k] = tmp;
+ }
+ }
+ }
+ }
+}
+
void ggml_compute_forward_argsort(
const ggml_compute_params * params,
ggml_tensor * dst) {
@@ -6901,6 +6940,10 @@ void ggml_compute_forward_argsort(
{
ggml_compute_forward_argsort_f32(params, dst);
} break;
+ case GGML_TYPE_I32:
+ {
+ ggml_compute_forward_argsort_i32(params, dst);
+ } break;
default:
{
GGML_ABORT("fatal error");
diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu
index 607ded85..53b02634 100644
--- a/ggml/src/ggml-cuda/argsort.cu
+++ b/ggml/src/ggml-cuda/argsort.cu
@@ -85,13 +85,107 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
}
}
+
+template<ggml_sort_order order>
+static __global__ void k_argsort_i32_i32(const int32_t * x, int * dst, const int ncols, const int ncols_pad) {
+ extern __shared__ int shared_mem[];
+ int * indices = shared_mem;
+
+ const int tid = threadIdx.x;
+ const int row = blockIdx.y;
+
+ // Initialize all indices, handling the case where threads < ncols_pad
+ for (int i = tid; i < ncols_pad; i += blockDim.x) {
+ indices[i] = i < ncols ? i : 0; // Use 0 for padding indices
+ }
+ __syncthreads();
+
+ // Bitonic sort
+ for (int k = 2; k <= ncols_pad; k *= 2) {
+ for (int j = k/2; j > 0; j /= 2) {
+ for (int i = tid; i < ncols_pad; i += blockDim.x) {
+ const int ij = i ^ j;
+ if (ij > i) {
+ // Only compare values within the actual data range
+ if (i < ncols && ij < ncols) {
+ if ((i & k) == 0) {
+ if (order == GGML_SORT_ORDER_ASC) {
+ if (x[row * ncols + indices[i]] > x[row * ncols + indices[ij]]) {
+ int tmp = indices[i];
+ indices[i] = indices[ij];
+ indices[ij] = tmp;
+ }
+ } else {
+ if (x[row * ncols + indices[i]] < x[row * ncols + indices[ij]]) {
+ int tmp = indices[i];
+ indices[i] = indices[ij];
+ indices[ij] = tmp;
+ }
+ }
+ } else {
+ if (order == GGML_SORT_ORDER_ASC) {
+ if (x[row * ncols + indices[i]] < x[row * ncols + indices[ij]]) {
+ int tmp = indices[i];
+ indices[i] = indices[ij];
+ indices[ij] = tmp;
+ }
+ } else {
+ if (x[row * ncols + indices[i]] > x[row * ncols + indices[ij]]) {
+ int tmp = indices[i];
+ indices[i] = indices[ij];
+ indices[ij] = tmp;
+ }
+ }
+ }
+ }
+ }
+ }
+ __syncthreads();
+ }
+ }
+
+ // Write sorted indices to output, only threads handling valid data
+ for (int i = tid; i < ncols; i += blockDim.x) {
+ dst[row * ncols + i] = indices[i];
+ }
+}
+
+static void argsort_i32_i32_cuda(const int32_t * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
+ // Bitonic sort requires ncols to be power of 2
+ const int ncols_pad = next_power_of_2(ncols);
+
+ // Ensure thread count doesn't exceed maximum (typically 1024)
+ const int max_threads = 1024; // This is the typical max for most GPUs
+ const int threads_per_block = ncols_pad > max_threads ? max_threads : ncols_pad;
+
+ const dim3 block_dims(threads_per_block, 1, 1);
+ const dim3 block_nums(1, nrows, 1);
+ const size_t shared_mem = ncols_pad * sizeof(int);
+
+ // Check if shared memory size is within limits
+ const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
+
+ // Instead of logging an error, use GGML_ASSERT with a descriptive message
+ GGML_ASSERT(shared_mem <= max_shared_mem && "argsort: required shared memory exceeds device limit");
+
+ // Launch kernels with the updated thread configuration
+ if (order == GGML_SORT_ORDER_ASC) {
+ k_argsort_i32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
+ } else if (order == GGML_SORT_ORDER_DESC) {
+ k_argsort_i32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
+ } else {
+ GGML_ABORT("fatal error");
+ }
+}
+
+
void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_I32);
GGML_ASSERT( dst->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_is_contiguous(src0));
@@ -100,5 +194,9 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
- argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
+ if (src0->type == GGML_TYPE_I32) {
+ argsort_i32_i32_cuda((const int32_t *)src0_d, (int *)dst_d, ncols, nrows, order, stream);
+ } else {
+ argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
+ }
}
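Because a bitonic network only sorts power-of-two widths, the launcher pads ncols up and the kernel parks padding slots at index 0; shared memory scales with the padded width, which is what the GGML_ASSERT protects. The same bookkeeping restated host-side, with an assumed 48 KiB shared-memory limit (the real limit comes from ggml_cuda_info() and varies per device):

    #include <cstddef>

    static bool argsort_launch_fits(int ncols, size_t max_shared_mem = 48 * 1024) {
        int ncols_pad = 1;
        while (ncols_pad < ncols) {
            ncols_pad *= 2;                  // bitonic sort needs a power-of-two width
        }
        const size_t shared_mem = (size_t) ncols_pad * sizeof(int); // one index per padded column
        return shared_mem <= max_shared_mem; // e.g. ncols = 1500 -> 2048*4 = 8 KiB, fits
    }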
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index 2d46176e..47383486 100644
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -38,6 +38,13 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
*dsti = *xi;
}
+static __device__ void cpy_1_i32_i32(const char * cxi, char * cdsti) {
+ const int32_t * xi = (const int32_t *) cxi;
+ int32_t * dsti = (int32_t *) cdsti;
+
+ *dsti = *xi;
+}
+
template <cpy_kernel_t cpy_1>
static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
@@ -68,6 +75,44 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const in
cpy_1(cx + x_offset, cdst + dst_offset);
}
+// First, add this template function after the other template functions
+template <cpy_kernel_t cpy_1>
+static __global__ void cpy_i32_i32(const char * cx, char * cdst, const int ne,
+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+ const int nb12, const int nb13) {
+ const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
+
+ if (i >= ne) {
+ return;
+ }
+
+ const int64_t i03 = i/(ne00 * ne01 * ne02);
+ const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
+ const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
+ const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
+ const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
+
+ const int64_t i13 = i/(ne10 * ne11 * ne12);
+ const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
+ const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
+ const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
+ const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
+
+ cpy_1(cx + x_offset, cdst + dst_offset);
+}
+
+// Then modify the ggml_cpy_i32_i32_cuda function to use the new template
+static void ggml_cpy_i32_i32_cuda(
+ const char * cx, char * cdst, const int ne,
+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int graph_cpynode_index) {
+
+ const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+ cpy_i32_i32<cpy_1_i32_i32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+}
+
static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
const float * xi = (const float *) cxi;
block_q8_0 * dsti = (block_q8_0 *) cdsti;
@@ -631,6 +676,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
+ ggml_cpy_i32_i32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
} else {
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
ggml_type_name(src0->type), ggml_type_name(src1->type));
@@ -686,6 +733,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
return (void*) cpy_f32_f16<cpy_1_f32_f16>;
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
return (void*) cpy_f32_f16<cpy_1_f16_f32>;
+ } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
+ return (void*) cpy_i32_i32<cpy_1_i32_i32>;
} else {
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
ggml_type_name(src0->type), ggml_type_name(src1->type));
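With the I32 to I32 case wired into both ggml_cuda_cpy and ggml_cuda_cpy_fn, an int32 tensor can be duplicated through the normal graph API rather than aborting on the type check. A minimal host-side sketch (illustrative; assumes a contiguous source):

    #include "ggml.h"

    // Copy an I32 tensor into a fresh destination of the same shape.
    static struct ggml_tensor * dup_i32(struct ggml_context * ctx, struct ggml_tensor * src) {
        struct ggml_tensor * dst = ggml_new_tensor_4d(ctx, GGML_TYPE_I32,
                src->ne[0], src->ne[1], src->ne[2], src->ne[3]);
        return ggml_cpy(ctx, src, dst); // on CUDA this now dispatches to cpy_i32_i32
    }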