"...text-generation-inference.git" did not exist on "e4d31a40db0721a9b1bc9c9121f0a712d87bce1f"
Commit bab6f34d authored by Michael Yang

ml/backend/ggml: update model loading for hybrid/multi backends

use a strategy similar to llama.cpp's for deciding where tensors should be
allocated. this will be improved later to take usable memory into account
before assigning tensors
parent 0682dae0
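The core of the change is visible in the New() hunk below: free memory is summed across GPUs, turned into cumulative split fractions, and each block (plus the output layer) is assigned to the first GPU whose fraction covers it, while the input embeddings stay on the CPU. The following standalone Go program is a minimal sketch of just that arithmetic; the helper name assignLayers and the example values are hypothetical and not part of the commit, which operates on C.struct_ggml_backend_device values inside New().

package main

import "fmt"

// assignLayers maps block indices 0..blocks (index blocks stands in for the
// output layer) to GPU indices in proportion to each GPU's free memory.
func assignLayers(free []uint64, blocks int) []int {
	// Cumulative free memory, normalized into split points in (0, 1].
	var sum uint64
	cumsum := make([]uint64, len(free))
	for i, f := range free {
		sum += f
		cumsum[i] = sum
	}

	splits := make([]float64, len(free))
	for i := range splits {
		splits[i] = float64(cumsum[i]) / float64(sum)
	}

	// A layer at fraction i/(blocks+1) goes to the first GPU whose cumulative
	// share of free memory exceeds that fraction.
	assign := make([]int, blocks+1)
	for i := range assign {
		f := float64(i) / float64(blocks+1)
		for j, s := range splits {
			if f < s {
				assign[i] = j
				break
			}
		}
	}

	return assign
}

func main() {
	// Two GPUs with 24 GiB and 8 GiB free: roughly the first three quarters
	// of the blocks land on GPU 0, the rest (and the output layer) on GPU 1.
	fmt.Println(assignLayers([]uint64{24 << 30, 8 << 30}, 32))
}

In the commit itself, each assigned device contributes its own buffer type first, with the CPU/accelerator buffer types appended as a fallback list for tensors that cannot be created on that device.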
@@ -9,67 +9,46 @@ package ggml

 import "C"
 import (
+	"errors"
 	"fmt"
 	"io"
+	"iter"
 	"log/slog"
+	"maps"
 	"os"
-	"sync"
+	"slices"
+	"strconv"
+	"strings"
+	"unicode"
 	"unsafe"

 	"github.com/ollama/ollama/format"
 	fs "github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/ml"
 	"golang.org/x/sync/errgroup"
-
-	ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
 )

-type device struct {
-	d *C.struct_ggml_backend_device
-}
-
-func (d device) LogValue() slog.Value {
-	var free, total uint64
-	C.ggml_backend_dev_memory(d.d, (*C.size_t)(&free), (*C.size_t)(&total))
-
-	kind := "unknown"
-	switch C.ggml_backend_dev_type(d.d) {
-	case C.GGML_BACKEND_DEVICE_TYPE_CPU:
-		kind = "cpu"
-	case C.GGML_BACKEND_DEVICE_TYPE_GPU:
-		kind = "gpu"
-	case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
-		kind = "accel"
-	}
-
-	return slog.GroupValue(
-		slog.String("name", C.GoString(C.ggml_backend_dev_name(d.d))),
-		slog.String("description", C.GoString(C.ggml_backend_dev_description(d.d))),
-		slog.String("kind", kind),
-		slog.String("free", format.HumanBytes2(free)),
-		slog.String("total", format.HumanBytes2(total)),
-	)
-}
-
-var devices = sync.OnceValue(func() []device {
-	ggml.OnceLoad()
-
-	s := make([]device, C.ggml_backend_dev_count())
-	for i := range s {
-		s[i] = device{C.ggml_backend_dev_get(C.size_t(i))}
-	}
-
-	return s
-})
+func devices() iter.Seq[*C.struct_ggml_backend_device] {
+	return func(yield func(*C.struct_ggml_backend_device) bool) {
+		for i := range C.ggml_backend_dev_count() {
+			if !yield(C.ggml_backend_dev_get(i)) {
+				return
+			}
+		}
+	}
+}

 type Backend struct {
-	flashAttention bool
+	meta *fs.GGML

-	meta       *fs.GGML
-	cpus, gpus []Context
-	tensors    map[string]*Context
+	flashAttention bool

 	sched *C.struct_ggml_backend_sched
+	tensors map[string]*C.struct_ggml_tensor
+
+	ctxs     []*C.struct_ggml_context
+	backends []*C.struct_ggml_backend
+	bufts    []*C.struct_ggml_backend_buffer_type
 }

 func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
@@ -88,100 +67,226 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
 		"num_key_values", len(meta.KV()),
 	)

-	var cpus, gpus []Context
-	for _, d := range devices() {
-		switch C.ggml_backend_dev_type(d.d) {
-		case C.GGML_BACKEND_DEVICE_TYPE_CPU,
-			C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
-			slog.Info("cpu", "device", d)
-			cpus = append(cpus, Context{
-				ctx: C.ggml_init(C.struct_ggml_init_params{
-					mem_size: C.size_t(int(C.ggml_tensor_overhead()) * (len(meta.Tensors().Items()) + 1 + int(meta.KV().BlockCount())*2)),
-					no_alloc: true,
-				}),
-				backend: C.ggml_backend_dev_init(d.d, nil),
-			})
-		case C.GGML_BACKEND_DEVICE_TYPE_GPU:
-			slog.Info("gpu", "device", d)
-			gpus = append(gpus, Context{
-				ctx: C.ggml_init(C.struct_ggml_init_params{
-					mem_size: C.size_t(int(C.ggml_tensor_overhead()) * (len(meta.Tensors().Items()) + 1 + int(meta.KV().BlockCount())*2)),
-					no_alloc: true,
-				}),
-				backend: C.ggml_backend_dev_init(d.d, nil),
-			})
-		}
-	}
-
-	ctxFunc := func(s []Context) (*Context, error) {
-		for _, e := range s {
-			return &e, nil
-		}
-
-		return nil, fmt.Errorf("no devices available")
-	}
-
-	tensors := make(map[*fs.Tensor]*Context, len(meta.Tensors().Items()))
-	for _, t := range meta.Tensors().Items() {
-		c, err := ctxFunc(append(gpus, cpus...))
-		if err != nil {
-			return nil, err
-		}
-
-		func() {
-			tt := C.ggml_new_tensor(c.ctx, t.Kind, C.int(len(t.Shape)), (*C.int64_t)(unsafe.Pointer(&t.Shape[0])))
-
-			cname := C.CString(t.Name)
-			defer C.free(unsafe.Pointer(cname))
-			C.ggml_set_name(tt, cname)
-
-			tensors[t] = c
-		}()
-	}
-
-	for _, b := range append(gpus, cpus...) {
-		C.ggml_backend_alloc_ctx_tensors(b.ctx, b.backend)
-	}
+	type dbt struct {
+		d   *C.struct_ggml_backend_device
+		bts []*C.struct_ggml_backend_buffer_type
+	}
+
+	var cpus, accels, gpus []*C.struct_ggml_backend_device
+	for d := range devices() {
+		switch C.ggml_backend_dev_type(d) {
+		case C.GGML_BACKEND_DEVICE_TYPE_CPU:
+			cpus = append(cpus, d)
+		case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
+			accels = append(accels, d)
+		case C.GGML_BACKEND_DEVICE_TYPE_GPU:
+			gpus = append(gpus, d)
+		}
+	}
+
+	var cpuBufferTypes []*C.struct_ggml_backend_buffer_type
+	for _, d := range append(accels, append(gpus, cpus...)...) {
+		switch C.ggml_backend_dev_type(d) {
+		case C.GGML_BACKEND_DEVICE_TYPE_CPU,
+			C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
+			cpuBufferTypes = append(cpuBufferTypes, C.ggml_backend_dev_buffer_type(d))
+		}
+	}
+
+	var sum uint64
+	var cumsum []uint64
+
+	var gpuBufferTypes []dbt
+	for _, d := range gpus {
+		var free, total C.size_t
+		C.ggml_backend_dev_memory(d, &free, &total)
+		sum += uint64(free)
+		cumsum = append(cumsum, sum)
+
+		bt := C.ggml_backend_dev_buffer_type(d)
+		gpuBufferTypes = append(gpuBufferTypes, dbt{
+			d:   d,
+			bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuBufferTypes...),
+		})
+	}
+
+	splits := make([]float64, len(cumsum))
+	for i := range splits {
+		splits[i] = float64(cumsum[i]) / float64(sum)
+	}
+
+	input := dbt{C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU), cpuBufferTypes}
+	slog.Info("input layer", "device", C.GoString(C.ggml_backend_dev_name(input.d)))
+
+	var blocks int
+	for key, value := range meta.KV() {
+		if strings.HasSuffix(key, ".block_count") {
+			blocks += int(value.(uint32))
+		}
+	}
+
+	indexFunc := func(i int) func(float64) bool {
+		return func(f float64) bool {
+			return float64(i)/float64(blocks+1) < f
+		}
+	}
+
+	layers := make([]dbt, blocks)
+	for i := range layers {
+		layers[i] = gpuBufferTypes[slices.IndexFunc(splits, indexFunc(i))]
+		slog.Info("layer", "i", i, "device", C.GoString(C.ggml_backend_dev_name(layers[i].d)))
+	}
+
+	output := gpuBufferTypes[slices.IndexFunc(splits, indexFunc(blocks))]
+	slog.Info("output layer", "device", C.GoString(C.ggml_backend_dev_name(output.d)))
+
+	maxTensors := len(meta.Tensors().Items())
+	maxTensors += 1
+	maxTensors += blocks * 2
+
+	slog.Info("max tensors", "max_tensors", maxTensors)
+
+	ctxs := make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context)
+	createTensor := func(t *fs.Tensor, bts []*C.struct_ggml_backend_buffer_type) *C.struct_ggml_tensor {
+		for _, bt := range bts {
+			if _, ok := ctxs[bt]; !ok {
+				ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
+					mem_size: C.ggml_tensor_overhead() * C.size_t(maxTensors),
+					no_alloc: true,
+				})
+			}
+
+			cname := C.CString(t.Name)
+			defer C.free(unsafe.Pointer(cname))
+			if tt := C.ggml_get_tensor(ctxs[bt], cname); tt != nil {
+				return tt
+			}
+
+			tt := C.ggml_new_tensor(ctxs[bt], t.Kind, C.int(len(t.Shape)), (*C.int64_t)(unsafe.Pointer(&t.Shape[0])))
+			C.ggml_set_name(tt, cname)
+
+			slog.Debug("created tensor", "name", t.Name, "shape", t.Shape, "dtype", t.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
+			//nolint:staticcheck // TODO: check if buffer type supports this tensor
+			return tt
+		}
+
+		return nil
+	}
+
+	hasPart := func(s string, parts ...string) bool {
+		split := strings.Split(s, ".")
+		for _, part := range parts {
+			if slices.Contains(split, part) {
+				return true
+			}
+		}
+
+		return false
+	}
+
+	for _, t := range meta.Tensors().Items() {
+		switch {
+		case hasPart(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
+			createTensor(t, input.bts)
+		case hasPart(t.Name, "cls", "output", "output_norm"):
+			createTensor(t, output.bts)
+		default:
+			if i := func() int {
+				if fields := strings.FieldsFunc(t.Name, func(r rune) bool { return !unicode.IsNumber(r) }); len(fields) > 0 {
+					if i, err := strconv.Atoi(fields[0]); err == nil {
+						return i
+					}
+				}
+
+				return -1
+			}(); i >= 0 {
+				createTensor(t, layers[i].bts)
+			} else {
+				for _, layer := range layers {
+					createTensor(t, layer.bts)
+				}
+			}
+		}
+	}
+
+	bbs := make(map[*C.struct_ggml_context][]*C.struct_ggml_backend_buffer, len(ctxs))
+	for bt, c := range ctxs {
+		if C.ggml_get_first_tensor(c) == nil {
+			continue
+		}
+
+		b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
+		C.ggml_backend_buffer_set_usage(b, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS)
+		bbs[c] = append(bbs[c], b)
+	}
+
+	for bs := range maps.Values(bbs) {
+		for _, b := range bs {
+			slog.Info("model", "buffer", C.GoString(C.ggml_backend_buffer_name(b)), "size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(b))))
+		}
+	}
+
+	tensors := make(map[string]*C.struct_ggml_tensor)
+	for _, c := range ctxs {
+		for t := C.ggml_get_first_tensor(c); t != nil; t = C.ggml_get_next_tensor(c, t) {
+			tensors[C.GoString(C.ggml_get_name(t))] = t
+		}
+	}

 	sr := io.NewSectionReader(r, int64(meta.Tensors().Offset), n-int64(meta.Tensors().Offset))

 	var g errgroup.Group
-	for t, c := range tensors {
+	for _, t := range meta.Tensors().Items() {
 		g.Go(func() error {
+			tt, ok := tensors[t.Name]
+			if !ok {
+				return fmt.Errorf("unassigned tensor: %s", t.Name)
+			}
+
 			bts := make([]byte, t.Size())
 			n, err := io.ReadFull(io.NewSectionReader(sr, int64(t.Offset), int64(t.Size())), bts)
 			if err != nil {
 				return err
 			}

-			if n != int(t.Size()) {
-				return fmt.Errorf("expected %d bytes, got %d", t.Size(), n)
+			if n != len(bts) {
+				return errors.New("short read")
 			}

 			cname := C.CString(t.Name)
-			defer C.free(unsafe.Pointer(cname))
-
-			C.ggml_backend_tensor_set(C.ggml_get_tensor(c.ctx, cname), unsafe.Pointer(&bts[0]), 0, C.size_t(n))
+			C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), 0, C.size_t(t.Size()))
+			C.free(unsafe.Pointer(cname))

 			return nil
 		})
 	}

-	if err := g.Wait(); err != nil {
+	if g.Wait() != nil {
 		return nil, err
 	}

-	backends := make([]*C.struct_ggml_backend, len(gpus)+len(cpus))
-	bufts := make([]*C.struct_ggml_backend_buffer_type, len(gpus)+len(cpus))
-	for i, c := range append(gpus, cpus...) {
-		backends[i] = c.backend
-		bufts[i] = C.ggml_backend_get_default_buffer_type(c.backend)
+	var backends []*C.struct_ggml_backend
+	var bufts []*C.struct_ggml_backend_buffer_type
+	for _, d := range append(gpus, append(accels, cpus...)...) {
+		b := C.ggml_backend_dev_init(d, nil)
+		backends = append(backends, b)
+
+		bt := C.ggml_backend_get_default_buffer_type(b)
+		if d := C.ggml_backend_get_device(b); C.ggml_backend_dev_type(d) == C.GGML_BACKEND_DEVICE_TYPE_CPU && len(gpus) > 0 {
+			if hbt := C.ggml_backend_dev_host_buffer_type(d); hbt != nil {
+				bt = hbt
+			}
+		}
+
+		bufts = append(bufts, bt)
+
+		slog.Info("compute buffer", "backend", C.GoString(C.ggml_backend_name(b)), "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
 	}

 	return &Backend{
 		flashAttention: params.FlashAttention,
 		meta:           meta,
-		cpus:           cpus,
-		gpus:           gpus,
+		tensors:        tensors,
 		sched: C.ggml_backend_sched_new(
 			(*C.ggml_backend_t)(unsafe.Pointer(&backends[0])),
 			(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&bufts[0])),
@@ -201,36 +306,22 @@ func (b *Backend) Config() ml.Config {
 }

 func (b *Backend) Get(name string) ml.Tensor {
-	cname := C.CString(name)
-	defer C.free(unsafe.Pointer(cname))
-
-	for _, c := range append(b.gpus, b.cpus...) {
-		if t := C.ggml_get_tensor(c.ctx, cname); t != nil {
-			return &Tensor{b: b, t: t}
-		}
+	if t, ok := b.tensors[name]; ok {
+		return &Tensor{b: b, t: t}
 	}

 	return nil
 }

 func (b *Backend) NewContext() ml.Context {
-	nodes := max(8192, len(b.meta.Tensors().Items())*5)
-	c := C.ggml_init(C.struct_ggml_init_params{
-		mem_buffer: nil,
-		mem_size:   C.size_t(nodes)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(nodes), false),
-		no_alloc:   true,
-	})
-
-	backends := make([]*C.struct_ggml_backend, len(b.gpus)+len(b.cpus))
-	for i, c := range append(b.gpus, b.cpus...) {
-		backends[i] = c.backend
-	}
-
+	maxTensors := max(8192, len(b.meta.Tensors().Items())*5)
 	return &Context{
-		b:       b,
-		ctx:     c,
-		backend: backends[0],
-		nodes:   nodes,
+		b:          b,
+		maxTensors: maxTensors,
+		ctx: C.ggml_init(C.struct_ggml_init_params{
+			mem_size: C.size_t(maxTensors)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(maxTensors), false),
+			no_alloc: true,
+		}),
 	}
 }
@@ -243,17 +334,17 @@ func (b *Backend) CacheConfig() ml.CacheConfig {
 }

 type Context struct {
 	b *Backend
-	ctx     *C.struct_ggml_context
-	backend *C.struct_ggml_backend
+	ctx   *C.struct_ggml_context
 	graph *C.struct_ggml_cgraph
-	nodes int
+
+	maxTensors int
 }

 func (c *Context) Forward(tensors ...ml.Tensor) ml.Context {
 	if c.graph == nil {
-		c.graph = C.ggml_new_graph_custom(c.ctx, C.size_t(c.nodes), false)
+		c.graph = C.ggml_new_graph_custom(c.ctx, C.size_t(c.maxTensors), false)
 	}

 	for _, tensor := range tensors {
@@ -264,8 +355,9 @@ func (c *Context) Forward(tensors ...ml.Tensor) ml.Context {
 }

 func (c *Context) Compute(tensors ...ml.Tensor) {
-	C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph)
 	C.ggml_backend_sched_reset(c.b.sched)
+	C.ggml_backend_sched_alloc_graph(c.b.sched, c.graph)
+	C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph)

 	needSync := true
 	sync := func() {
@@ -283,19 +375,19 @@ func (c *Context) Compute(tensors ...ml.Tensor) {
 }

 func (c *Context) MaxTensors() int {
-	return c.nodes
+	return c.maxTensors
 }

 func shapeToGGML(shape []int) *C.int64_t {
 	sh := make([]C.int64_t, len(shape))
 	for i, s := range shape {
-		sh[i] = (C.int64_t)(s)
+		sh[i] = C.int64_t(s)
 	}

 	return &sh[0]
 }

-func newTensor(ctx Context, dtype ml.DType, zero bool, shape []int) ml.Tensor {
+func newTensor(ctx Context, dtype ml.DType, shape []int) ml.Tensor {
 	if len(shape) < 1 || len(shape) > 4 {
 		panic("unsupported number of dimensions")
 	}
@@ -318,20 +410,20 @@ func newTensor(ctx Context, dtype ml.DType, zero bool, shape []int) ml.Tensor {
 		panic("unsupported dtype")
 	}

-	b := C.ggml_backend_alloc_buffer(ctx.backend, C.ggml_nbytes(t))
+	b := C.ggml_backend_alloc_buffer(C.ggml_backend_sched_get_backend(ctx.b.sched, 0), C.ggml_nbytes(t))
 	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
-	if zero {
-		C.ggml_set_zero(t)
-	}
+	C.ggml_set_input(t)
 	return &Tensor{b: ctx.b, t: t}
 }

 func (c Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
-	return newTensor(c, dtype, false, shape)
+	return newTensor(c, dtype, shape)
 }

 func (c Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
-	return newTensor(c, dtype, true, shape)
+	t := newTensor(c, dtype, shape)
+	C.ggml_set_zero(t.(*Tensor).t)
+	return t
 }

 func fromSlice[S ~[]E, E float32 | int32](ctx Context, s S, shape []int, dtype uint32) (ml.Tensor, error) {
@@ -352,9 +444,10 @@ func fromSlice[S ~[]E, E float32 | int32](ctx Context, s S, shape []int, dtype u
 	}

 	t := C.ggml_new_tensor(ctx.ctx, dtype, C.int(len(shape)), shapeToGGML(shape))
-	b := C.ggml_backend_alloc_buffer(ctx.backend, C.ggml_nbytes(t))
+	b := C.ggml_backend_alloc_buffer(C.ggml_backend_sched_get_backend(ctx.b.sched, 0), C.ggml_nbytes(t))
 	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
 	C.ggml_backend_tensor_set(t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t))
+	C.ggml_set_input(t)

 	return &Tensor{b: ctx.b, t: t}, nil
 }
...
@@ -207,18 +207,27 @@ struct ggml_backend_registry {
         for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
             register_device(ggml_backend_reg_dev_get(reg, i), score);
         }
-
-        std::stable_sort(devices.begin(), devices.end(),
-            [](const auto & a, const auto & b) {
-                return a.second > b.second;
-            }
-        );
     }

     void register_device(ggml_backend_dev_t device, int score = -1) {
+        switch (ggml_backend_dev_type(device)) {
+            case GGML_BACKEND_DEVICE_TYPE_CPU:
+            case GGML_BACKEND_DEVICE_TYPE_GPU:
+                score += 1 << 16;
+            case GGML_BACKEND_DEVICE_TYPE_ACCEL:
+                score += 1 << 20;
+        }
+
 #ifndef NDEBUG
         GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
 #endif
         devices.push_back({device, score});
+
+        std::stable_sort(devices.begin(), devices.end(),
+            [](const auto & a, const auto & b) {
+                return a.second > b.second;
+            }
+        );
     }

     ggml_backend_reg_t load_backend(const std::filesystem::path & path, bool silent) {
...
@@ -12,7 +12,6 @@ import (
 )

 type Options struct {
-	RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`
 	hiddenSize, numHeads, numKVHeads int
 	eps, ropeBase, ropeScale         float32
 	ropeDim                          uint32

@@ -66,10 +65,11 @@ func New(c ml.Config) (model.Model, error) {
 }

 type SelfAttention struct {
 	Query  *nn.Linear `gguf:"attn_q"`
 	Key    *nn.Linear `gguf:"attn_k"`
 	Value  *nn.Linear `gguf:"attn_v"`
 	Output *nn.Linear `gguf:"attn_output"`
+	RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`
 }

 func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {

@@ -78,11 +78,11 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 	q := sa.Query.Forward(ctx, hiddenState)
 	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	q = q.RoPE(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+	q = q.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)

 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	k = k.RoPE(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+	k = k.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)

 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)

@@ -95,7 +95,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 }

 func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return key.RoPE(ctx, shift, m.Options.RopeFactors, m.Options.ropeDim, m.Options.ropeBase, m.Options.ropeScale), nil
+	return key.RoPE(ctx, shift, m.Layers[layer].SelfAttention.RopeFactors, m.ropeDim, m.ropeBase, m.ropeScale), nil
 }

 type MLP struct {
...