Unverified commit 58245413 authored by Michael Yang, committed by GitHub

next ollama runner (#7913)



feat: add new Ollama engine using ggml through cgo

This change introduces a new way to run pretrained models. It adds three high-level interfaces, plus a number of smaller helper interfaces, to facilitate this; a minimal sketch of how these pieces fit together follows the list below.

- `model.Model` defines the interface for a model architecture. Models such as `llama` and `mllama`, which are provided as examples, can implement the model's forward propagation in the `Forward` method. This method will be called to generate completions. This interface can be found in `model/model.go`
- `ml.Backend` defines the interface for a backend tensor library, in this case `ggml`. Among other things, a Backend is responsible for loading a pretrained model into hardware (GPU, CPU, etc) and providing an interface for Models to access loaded tensors. This interface can be found in `ml/backend.go`
- `ml.Tensor` defines the interface for a tensor and tensor operations
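
For orientation, here is a minimal sketch of how these three interfaces relate to one another. The method sets and signatures are illustrative only and are far smaller than the real definitions in `model/model.go` and `ml/backend.go`:

```go
// Illustrative sketch only, not the actual definitions from this PR.
package sketch

// Tensor is a handle to data owned by the backend; the real interface
// also carries the tensor operations (matmul, add, softmax, ...).
type Tensor interface {
	Shape() []int
}

// Backend is a tensor library (here ggml, via cgo). It loads a pretrained
// model onto hardware (GPU, CPU, ...) and exposes the loaded weights as
// Tensors that a Model can look up by name.
type Backend interface {
	Get(name string) Tensor
}

// Model is a model architecture such as llama or mllama. Forward runs one
// step of forward propagation and is what gets called to produce completions.
type Model interface {
	Forward(backend Backend, inputs []int32) (Tensor, error)
}
```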

This is the first implementation of the new engine. Follow-up PRs will implement more features:

- non-greedy sampling (#8410)
- integration with Ollama and KV caching (#8301)
- more model support (#9080) with more coming soon
Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
parent 8cf16063
package model

import (
    "bufio"
    "encoding/json"
    "math"
    "os"
    "path/filepath"
    "slices"
    "strconv"
    "strings"
    "testing"

    "github.com/google/go-cmp/cmp"
)

func llama(t testing.TB) BytePairEncoding {
    t.Helper()

    f, err := os.Open(filepath.Join("testdata", "llama3.2", "encoder.json"))
    if err != nil {
        t.Fatal(err)
    }
    defer f.Close()

    vocab := make(map[string]int32)
    if err := json.NewDecoder(f).Decode(&vocab); err != nil {
        t.Fatal(err)
    }

    types := make([]uint32, len(vocab))
    tokens := make([]string, len(vocab))
    for token, id := range vocab {
        tokens[id] = token
        types[id] = 1
    }

    for _, token := range []string{"<|begin_of_text|>", "<|end_of_text|>"} {
        if _, ok := vocab[token]; !ok {
            tokens = append(tokens, token) //nolint:makezero
            types = append(types, 3)       //nolint:makezero
            vocab[token] = int32(len(vocab))
        }
    }

    f, err = os.Open(filepath.Join("testdata", "llama3.2", "vocab.bpe"))
    if err != nil {
        t.Fatal(err)
    }
    defer f.Close()

    merges := make([]string, 0, 50000)
    scanner := bufio.NewScanner(f)
    for scanner.Scan() {
        if !strings.HasPrefix(scanner.Text(), "#") {
            merges = append(merges, scanner.Text())
        }
    }

    return NewBytePairEncoding(
        `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
        &Vocabulary{
            Values: tokens,
            Types:  types,
            Merges: merges,
        },
    )
}

func TestLlama(t *testing.T) {
    tokenizer := llama(t)

    t.Run("simple", func(t *testing.T) {
        t.Parallel()

        ids, err := tokenizer.Encode("hello world")
        if err != nil {
            t.Error(err)
        }

        if diff := cmp.Diff([]int32{15339, 1917}, ids); diff != "" {
            t.Errorf("no match (-theirs +ours):\n%s", diff)
        }

        s, err := tokenizer.Decode([]int32{15339, 1917})
        if err != nil {
            t.Fatal(err)
        }

        if s != "hello world" {
            t.Errorf("got %q, want hello world", s)
        }

        ids, err = tokenizer.Encode("hello <|end_of_text|>")
        if err != nil {
            t.Error(err)
        }

        if diff := cmp.Diff([]int32{15339, 220, 128001}, ids); diff != "" {
            t.Errorf("no match (-theirs +ours):\n%s", diff)
        }
    })

    t.Run("simple repeated", func(t *testing.T) {
        t.Parallel()

        cases := map[string][]int32{
            strings.Repeat("0", 1):  {15},
            strings.Repeat("0", 2):  {410},
            strings.Repeat("0", 3):  {931},
            strings.Repeat("0", 4):  {931, 15},
            strings.Repeat("0", 5):  {931, 410},
            strings.Repeat("0", 6):  {931, 931},
            strings.Repeat("0", 7):  {931, 931, 15},
            strings.Repeat("0", 8):  {931, 931, 410},
            strings.Repeat("0", 9):  {931, 931, 931},
            strings.Repeat("0", 10): {931, 931, 931, 15},
            strings.Repeat("0", 11): {931, 931, 931, 410},
            strings.Repeat("0", 12): {931, 931, 931, 931},
            strings.Repeat("0", 13): {931, 931, 931, 931, 15},
            strings.Repeat("0", 14): {931, 931, 931, 931, 410},
            strings.Repeat("0", 15): {931, 931, 931, 931, 931},
            strings.Repeat("0", 16): {931, 931, 931, 931, 931, 15},
            strings.Repeat("0", 17): {931, 931, 931, 931, 931, 410},
        }

        for s, want := range cases {
            ids, err := tokenizer.Encode(s)
            if err != nil {
                t.Error(err)
            }

            if diff := cmp.Diff(want, ids); diff != "" {
                t.Errorf("%q no match (-theirs +ours):\n%s", s, diff)
            }
        }
    })

t.Run("basic roundtrip", func(t *testing.T) {
t.Parallel()
cases := []string{
"hello",
"hello ",
"hello ",
" hello",
" hello ",
" hello ",
"hello world",
"请考试我的软件!12345",
}
for _, want := range cases {
ids, err := tokenizer.Encode(want)
if err != nil {
t.Error(err)
}
if got, err := tokenizer.Decode(ids); err != nil {
t.Fatal(err)
} else if got != want {
t.Errorf("got %q, want %q", got, want)
}
}
})
t.Run("special", func(t *testing.T) {
t.Parallel()
cases := map[string][]int32{
"<|begin_of_text|>A B!": {128000, 32, 426, 0},
"<|begin_of_text|>A<|end_of_text|>B!": {128000, 32, 128001, 33, 0},
"<|begin_of_text|>A<|end_of_text|>B<|begin_of_text|>!": {128000, 32, 128001, 33, 128000, 0},
"<|begin_of_text|>A<|end_of_text|>B<|begin_of_text|>!<|end_of_text|>": {128000, 32, 128001, 33, 128000, 0, 128001},
}
for s, want := range cases {
ids, err := tokenizer.Encode(s)
if err != nil {
t.Fatal(err)
}
if diff := cmp.Diff(want, ids); diff != "" {
t.Errorf("no match (-theirs +ours):\n%s", diff)
}
}
})
t.Run("split", func(t *testing.T) {
t.Parallel()
cases := map[string][]string{
"Hello World!": {"Hello", " World", "!"},
"I'm don't won't": {"I", "'m", " don", "'t", " won", "'t"},
"In 2024 there are 366 days": {"In", " ", "202", "4", " there", " are", " ", "366", " days"},
"Hello!! ...world": {"Hello", "!!", " ...", "world"},
"Hello World": {"Hello", " ", " World"},
"Hello\nWorld": {"Hello", "\n", "World"},
"Hello, WORLD!! How's it going?": {"Hello", ",", " WORLD", "!!", " How", "'s", " it", " going", "?"},
}
for s, want := range cases {
got := slices.Collect(tokenizer.split(s))
if diff := cmp.Diff(want, got); diff != "" {
t.Errorf("no match (-theirs +ours):\n%s", diff)
}
}
})
}
func BenchmarkBytePairEncoding(b *testing.B) {
    tokenizer := llama(b)
    bts, err := os.ReadFile(filepath.Join("testdata", "war-and-peace.txt"))
    if err != nil {
        b.Fatal(err)
    }

    for i := range 8 {
        n := min(int(math.Pow10(i)), len(bts))
        bts := bts[:n]
        b.Run("encode"+strconv.Itoa(n), func(b *testing.B) {
            b.ResetTimer()

            for range b.N {
                _, err := tokenizer.Encode(string(bts))
                if err != nil {
                    b.Fatal(err)
                }
            }
        })

        b.Run("decode"+strconv.Itoa(n), func(b *testing.B) {
            ids, err := tokenizer.Encode(string(bts))
            if err != nil {
                b.Fatal(err)
            }
            b.ResetTimer()

            for range b.N {
                _, err := tokenizer.Decode(ids)
                if err != nil {
                    b.Fatal(err)
                }
            }
        })

        b.Run("split"+strconv.Itoa(n), func(b *testing.B) {
            b.ResetTimer()

            for range b.N {
                slices.Collect(tokenizer.split(string(bts)))
            }
        })
    }
}
(Three additional file diffs are collapsed and not shown.)
@@ -19,7 +19,7 @@ import (
 "golang.org/x/text/encoding/unicode"
 "github.com/ollama/ollama/api"
-"github.com/ollama/ollama/llm"
+"github.com/ollama/ollama/fs/ggml"
 )
 func TestParseFileFile(t *testing.T) {
@@ -769,7 +769,7 @@ func getSHA256Digest(t *testing.T, r io.Reader) (string, int64) {
 return fmt.Sprintf("sha256:%x", h.Sum(nil)), n
 }
-func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) (string, string) {
+func createBinFile(t *testing.T, kv map[string]any, ti []ggml.Tensor) (string, string) {
 t.Helper()
 f, err := os.CreateTemp(t.TempDir(), "testbin.*.gguf")
@@ -778,7 +778,7 @@ func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) (string, st
 }
 defer f.Close()
-if err := llm.WriteGGUF(f, kv, ti); err != nil {
+if err := ggml.WriteGGUF(f, kv, ti); err != nil {
 t.Fatal(err)
 }
 // Calculate sha256 of file
...
package sample

import "gonum.org/v1/gonum/floats"

type greedy struct{}

func Greedy() Sampler {
    return greedy{}
}

// Sample returns the index of the largest value as the single sampled token id.
func (s greedy) Sample(t []float64) ([]float64, error) {
    return []float64{float64(floats.MaxIdx(t))}, nil
}

package sample

import (
    "slices"

    "gonum.org/v1/gonum/floats"
    "gonum.org/v1/gonum/stat/sampleuv"
)

// Sampler transforms a slice of values (logits or probabilities), or reduces
// it to a single sampled token id.
type Sampler interface {
    Sample([]float64) ([]float64, error)
}

type Temperature float64

// Sample divides every value by the temperature in place.
func (s Temperature) Sample(t []float64) ([]float64, error) {
    floats.Div(t, slices.Repeat([]float64{float64(s)}, len(t)))
    return t, nil
}

type softmax struct{}

func Softmax() Sampler {
    return softmax{}
}

// Sample is currently a no-op placeholder.
func (softmax) Sample(t []float64) ([]float64, error) {
    return t, nil
}

type TopK int

// Sample is currently a no-op placeholder.
func (s TopK) Sample(t []float64) ([]float64, error) {
    return t, nil
}

type TopP float32

// Sample is currently a no-op placeholder.
func (s TopP) Sample(t []float64) ([]float64, error) {
    return t, nil
}

type MinP float32

// Sample is currently a no-op placeholder.
func (s MinP) Sample(t []float64) ([]float64, error) {
    return t, nil
}

type weighed struct{}

func Weighed() Sampler {
    return weighed{}
}

// Sample draws one index from the weighted distribution defined by t.
func (s weighed) Sample(t []float64) ([]float64, error) {
    w := sampleuv.NewWeighted(t, nil)
    if v, ok := w.Take(); ok {
        return []float64{float64(v)}, nil
    }
    return t, nil
}

// Sample applies each sampler to floats in order, feeding the output of one
// into the next.
func Sample(floats []float64, samplers ...Sampler) ([]float64, error) {
    var err error
    for _, sampler := range samplers {
        floats, err = sampler.Sample(floats)
        if err != nil {
            return nil, err
        }
    }
    return floats, nil
}
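
As a usage illustration, the chain-style `Sample` helper above could be driven like this. The import path and the logit values are assumptions made for the example, not part of this change:

```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/sample" // assumed import path for the package above
)

func main() {
	// Made-up logits for a four-token vocabulary.
	logits := []float64{0.1, 2.5, 0.3, 1.7}

	// Scale by temperature, then greedily pick the highest-scoring token.
	out, err := sample.Sample(logits, sample.Temperature(0.8), sample.Greedy())
	if err != nil {
		panic(err)
	}

	fmt.Println("sampled token id:", int(out[0])) // prints 1 for these logits
}
```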
@@ -21,8 +21,8 @@ import (
 "github.com/ollama/ollama/convert"
 "github.com/ollama/ollama/envconfig"
 "github.com/ollama/ollama/format"
+"github.com/ollama/ollama/fs/ggml"
 "github.com/ollama/ollama/llama"
-"github.com/ollama/ollama/llm"
 "github.com/ollama/ollama/template"
 "github.com/ollama/ollama/types/errtypes"
 "github.com/ollama/ollama/types/model"
@@ -205,7 +205,7 @@ func detectModelTypeFromFiles(files map[string]string) string {
 return ""
 }
-ct := llm.DetectGGMLType(buf)
+ct := ggml.DetectContentType(buf)
 if ct == "gguf" {
 return "gguf"
 }
@@ -271,11 +271,11 @@ func convertFromSafetensors(files map[string]string, baseLayers []*layerGGML, is
 return nil, err
 }
-ggml, _, err := llm.DecodeGGML(bin, 0)
+f, _, err := ggml.Decode(bin, 0)
 if err != nil {
 return nil, err
 }
-layers := []*layerGGML{{layer, ggml}}
+layers := []*layerGGML{{layer, f}}
 if !isAdapter {
 return detectChatTemplate(layers)
@@ -283,13 +283,13 @@ func convertFromSafetensors(files map[string]string, baseLayers []*layerGGML, is
 return layers, nil
 }
-func kvFromLayers(baseLayers []*layerGGML) (llm.KV, error) {
+func kvFromLayers(baseLayers []*layerGGML) (ggml.KV, error) {
 for _, l := range baseLayers {
 if l.GGML != nil {
 return l.KV(), nil
 }
 }
-return llm.KV{}, fmt.Errorf("no base model was found")
+return ggml.KV{}, fmt.Errorf("no base model was found")
 }
 func createModel(r api.CreateRequest, name model.Name, baseLayers []*layerGGML, fn func(resp api.ProgressResponse)) (err error) {
@@ -306,7 +306,7 @@ func createModel(r api.CreateRequest, name model.Name, baseLayers []*layerGGML,
 if layer.GGML != nil {
 quantType := strings.ToUpper(cmp.Or(r.Quantize, r.Quantization))
 if quantType != "" && layer.GGML.Name() == "gguf" && layer.MediaType == "application/vnd.ollama.image.model" {
-want, err := llm.ParseFileType(quantType)
+want, err := ggml.ParseFileType(quantType)
 if err != nil {
 return err
 }
@@ -403,7 +403,7 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr
 ft := layer.GGML.KV().FileType()
 fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantizeType)})
-want, err := llm.ParseFileType(quantizeType)
+want, err := ggml.ParseFileType(quantizeType)
 if err != nil {
 return nil, err
 }
@@ -433,13 +433,13 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr
 return nil, err
 }
-ggml, _, err := llm.DecodeGGML(temp, 0)
+f, _, err := ggml.Decode(temp, 0)
 if err != nil {
 slog.Error(fmt.Sprintf("error decoding ggml: %s\n", err))
 return nil, err
 }
-return &layerGGML{newLayer, ggml}, nil
+return &layerGGML{newLayer, f}, nil
 }
 func ggufLayers(digest string, fn func(resp api.ProgressResponse)) ([]*layerGGML, error) {
@@ -475,7 +475,7 @@ func ggufLayers(digest string, fn func(resp api.ProgressResponse)) ([]*layerGGML
 var offset int64
 for offset < stat.Size() {
-ggml, n, err := llm.DecodeGGML(blob, 0)
+f, n, err := ggml.Decode(blob, 0)
 if errors.Is(err, io.EOF) {
 break
 } else if err != nil {
@@ -483,9 +483,9 @@ func ggufLayers(digest string, fn func(resp api.ProgressResponse)) ([]*layerGGML
 }
 mediatype := "application/vnd.ollama.image.model"
-if ggml.KV().Kind() == "adapter" {
+if f.KV().Kind() == "adapter" {
 mediatype = "application/vnd.ollama.image.adapter"
-} else if _, ok := ggml.KV()[fmt.Sprintf("%s.vision.block_count", ggml.KV().Architecture())]; ok || ggml.KV().Kind() == "projector" {
+} else if _, ok := f.KV()[fmt.Sprintf("%s.vision.block_count", f.KV().Architecture())]; ok || f.KV().Kind() == "projector" {
 mediatype = "application/vnd.ollama.image.projector"
 }
@@ -506,7 +506,7 @@ func ggufLayers(digest string, fn func(resp api.ProgressResponse)) ([]*layerGGML
 }
 }
-layers = append(layers, &layerGGML{layer, ggml})
+layers = append(layers, &layerGGML{layer, f})
 offset = n
 }
...
@@ -23,7 +23,7 @@ import (
 "github.com/ollama/ollama/api"
 "github.com/ollama/ollama/envconfig"
-"github.com/ollama/ollama/llm"
+"github.com/ollama/ollama/fs/ggml"
 "github.com/ollama/ollama/parser"
 "github.com/ollama/ollama/template"
 "github.com/ollama/ollama/types/model"
@@ -78,21 +78,21 @@ func (m *Model) CheckCapabilities(caps ...Capability) error {
 for _, cap := range caps {
 switch cap {
 case CapabilityCompletion:
-f, err := os.Open(m.ModelPath)
+r, err := os.Open(m.ModelPath)
 if err != nil {
 slog.Error("couldn't open model file", "error", err)
 continue
 }
-defer f.Close()
+defer r.Close()
 // TODO(mxyng): decode the GGML into model to avoid doing this multiple times
-ggml, _, err := llm.DecodeGGML(f, 0)
+f, _, err := ggml.Decode(r, 0)
 if err != nil {
 slog.Error("couldn't decode ggml", "error", err)
 continue
 }
-if _, ok := ggml.KV()[fmt.Sprintf("%s.pooling_type", ggml.KV().Architecture())]; ok {
+if _, ok := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]; ok {
 errs = append(errs, errCapabilityCompletion)
 }
 case CapabilityTools:
...
@@ -15,7 +15,7 @@ import (
 "text/template/parse"
 "github.com/ollama/ollama/api"
-"github.com/ollama/ollama/llm"
+"github.com/ollama/ollama/fs/ggml"
 "github.com/ollama/ollama/template"
 "github.com/ollama/ollama/types/model"
 )
@@ -24,7 +24,7 @@ var intermediateBlobs map[string]string = make(map[string]string)
 type layerGGML struct {
 Layer
-*llm.GGML
+*ggml.GGML
 }
 func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
@@ -64,12 +64,12 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
 }
 defer blob.Close()
-ggml, _, err := llm.DecodeGGML(blob, 0)
+f, _, err := ggml.Decode(blob, 0)
 if err != nil {
 return nil, err
 }
-layers = append(layers, &layerGGML{layer, ggml})
+layers = append(layers, &layerGGML{layer, f})
 default:
 layers = append(layers, &layerGGML{layer, nil})
 }
@@ -118,7 +118,7 @@ func detectContentType(r io.Reader) (string, error) {
 return "", err
 }
-if contentType := llm.DetectGGMLType(b.Bytes()); contentType != "" {
+if contentType := ggml.DetectContentType(b.Bytes()); contentType != "" {
 return contentType, nil
 }
...
@@ -30,6 +30,7 @@ import (
 "github.com/ollama/ollama/api"
 "github.com/ollama/ollama/discover"
 "github.com/ollama/ollama/envconfig"
+"github.com/ollama/ollama/fs/ggml"
 "github.com/ollama/ollama/llm"
 "github.com/ollama/ollama/model/mllama"
 "github.com/ollama/ollama/openai"
@@ -860,7 +861,7 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
 return resp, nil
 }
-func getKVData(digest string, verbose bool) (llm.KV, error) {
+func getKVData(digest string, verbose bool) (ggml.KV, error) {
 maxArraySize := 0
 if verbose {
 maxArraySize = -1
...
@@ -19,12 +19,12 @@ import (
 "github.com/ollama/ollama/api"
 "github.com/ollama/ollama/envconfig"
-"github.com/ollama/ollama/llm"
+"github.com/ollama/ollama/fs/ggml"
 )
 var stream bool = false
-func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) (string, string) {
+func createBinFile(t *testing.T, kv map[string]any, ti []ggml.Tensor) (string, string) {
 t.Helper()
 t.Setenv("OLLAMA_MODELS", cmp.Or(os.Getenv("OLLAMA_MODELS"), t.TempDir()))
@@ -36,7 +36,7 @@ func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) (string, st
 }
 defer f.Close()
-if err := llm.WriteGGUF(f, kv, ti); err != nil {
+if err := ggml.WriteGGUF(f, kv, ti); err != nil {
 t.Fatal(err)
 }
 // Calculate sha256 of file
@@ -672,7 +672,7 @@ func TestCreateDetectTemplate(t *testing.T) {
 var s Server
 t.Run("matched", func(t *testing.T) {
-_, digest := createBinFile(t, llm.KV{
+_, digest := createBinFile(t, ggml.KV{
 "tokenizer.chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
 }, nil)
 w := createRequest(t, s.CreateHandler, api.CreateRequest{
...
@@ -16,6 +16,7 @@ import (
 "github.com/ollama/ollama/api"
 "github.com/ollama/ollama/discover"
+"github.com/ollama/ollama/fs/ggml"
 "github.com/ollama/ollama/llm"
 )
@@ -45,8 +46,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error
 return
 }
-func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *llm.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
+func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *ggml.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
-return func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+return func(_ discover.GpuInfoList, _ string, _ *ggml.GGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) {
 return mock, nil
 }
 }
@@ -76,7 +77,7 @@ func TestGenerateChat(t *testing.T) {
 getGpuFn: discover.GetGPUInfo,
 getCpuFn: discover.GetCPUInfo,
 reschedDelay: 250 * time.Millisecond,
-loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
+loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
 // add small delay to simulate loading
 time.Sleep(time.Millisecond)
 req.successCh <- &runnerRef{
@@ -88,7 +89,7 @@ func TestGenerateChat(t *testing.T) {
 go s.sched.Run(context.TODO())
-_, digest := createBinFile(t, llm.KV{
+_, digest := createBinFile(t, ggml.KV{
 "general.architecture": "llama",
 "llama.block_count": uint32(1),
 "llama.context_length": uint32(8192),
@@ -98,7 +99,7 @@ func TestGenerateChat(t *testing.T) {
 "tokenizer.ggml.tokens": []string{""},
 "tokenizer.ggml.scores": []float32{0},
 "tokenizer.ggml.token_type": []int32{0},
-}, []llm.Tensor{
+}, []ggml.Tensor{
 {Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
 {Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
 {Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
@@ -154,10 +155,10 @@ func TestGenerateChat(t *testing.T) {
 })
 t.Run("missing capabilities chat", func(t *testing.T) {
-_, digest := createBinFile(t, llm.KV{
+_, digest := createBinFile(t, ggml.KV{
 "general.architecture": "bert",
 "bert.pooling_type": uint32(0),
-}, []llm.Tensor{})
+}, []ggml.Tensor{})
 w := createRequest(t, s.CreateHandler, api.CreateRequest{
 Model: "bert",
 Files: map[string]string{"bert.gguf": digest},
@@ -612,7 +613,7 @@ func TestGenerate(t *testing.T) {
 getGpuFn: discover.GetGPUInfo,
 getCpuFn: discover.GetCPUInfo,
 reschedDelay: 250 * time.Millisecond,
-loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
+loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
 // add small delay to simulate loading
 time.Sleep(time.Millisecond)
 req.successCh <- &runnerRef{
@@ -624,7 +625,7 @@ func TestGenerate(t *testing.T) {
 go s.sched.Run(context.TODO())
-_, digest := createBinFile(t, llm.KV{
+_, digest := createBinFile(t, ggml.KV{
 "general.architecture": "llama",
 "llama.block_count": uint32(1),
 "llama.context_length": uint32(8192),
@@ -634,7 +635,7 @@ func TestGenerate(t *testing.T) {
 "tokenizer.ggml.tokens": []string{""},
 "tokenizer.ggml.scores": []float32{0},
 "tokenizer.ggml.token_type": []int32{0},
-}, []llm.Tensor{
+}, []ggml.Tensor{
 {Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
 {Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
 {Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
@@ -686,10 +687,10 @@ func TestGenerate(t *testing.T) {
 })
 t.Run("missing capabilities generate", func(t *testing.T) {
-_, digest := createBinFile(t, llm.KV{
+_, digest := createBinFile(t, ggml.KV{
 "general.architecture": "bert",
 "bert.pooling_type": uint32(0),
-}, []llm.Tensor{})
+}, []ggml.Tensor{})
 w := createRequest(t, s.CreateHandler, api.CreateRequest{
 Model: "bert",
...
@@ -21,7 +21,7 @@ import (
 "unicode"
 "github.com/ollama/ollama/api"
-"github.com/ollama/ollama/llm"
+"github.com/ollama/ollama/fs/ggml"
 "github.com/ollama/ollama/openai"
 "github.com/ollama/ollama/types/model"
 "github.com/ollama/ollama/version"
@@ -654,8 +654,8 @@ func TestShow(t *testing.T) {
 var s Server
-_, digest1 := createBinFile(t, llm.KV{"general.architecture": "test"}, nil)
+_, digest1 := createBinFile(t, ggml.KV{"general.architecture": "test"}, nil)
-_, digest2 := createBinFile(t, llm.KV{"general.type": "projector", "general.architecture": "clip"}, nil)
+_, digest2 := createBinFile(t, ggml.KV{"general.type": "projector", "general.architecture": "clip"}, nil)
 createRequest(t, s.CreateHandler, api.CreateRequest{
 Name: "show-model",
...
@@ -18,6 +18,7 @@ import (
 "github.com/ollama/ollama/discover"
 "github.com/ollama/ollama/envconfig"
 "github.com/ollama/ollama/format"
+"github.com/ollama/ollama/fs/ggml"
 "github.com/ollama/ollama/llm"
 )
@@ -41,8 +42,8 @@ type Scheduler struct {
 loaded map[string]*runnerRef
 loadedMu sync.Mutex
-loadFn func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int)
+loadFn func(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel int)
-newServerFn func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
+newServerFn func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
 getGpuFn func() discover.GpuInfoList
 getCpuFn func() discover.GpuInfoList
 reschedDelay time.Duration
@@ -409,7 +410,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
 }()
 }
-func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
+func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel int) {
 if numParallel < 1 {
 numParallel = 1
 }
@@ -417,12 +418,12 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoL
 if req.sessionDuration != nil {
 sessionDuration = req.sessionDuration.Duration
 }
-llama, err := s.newServerFn(gpus, req.model.ModelPath, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
+llama, err := s.newServerFn(gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
 if err != nil {
 // some older models are not compatible with newer versions of llama.cpp
 // show a generalized compatibility error until there is a better way to
 // check for model compatibility
-if errors.Is(err, llm.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") {
+if errors.Is(err, ggml.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") {
 err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, req.model.ShortName)
 }
 slog.Info("NewLlamaServer failed", "model", req.model.ModelPath, "error", err)
@@ -685,7 +686,7 @@ func (a ByDuration) Less(i, j int) bool {
 // If the model can not be fit fully within the available GPU(s) nil is returned
 // If numParallel is <= 0, this will attempt try to optimize parallelism based on available VRAM, and adjust
 // opts.NumCtx accordingly
-func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
+func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
 var estimatedVRAM uint64
 var numParallelToTry []int
@@ -710,7 +711,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.Gpu
 req.opts.NumCtx = req.origNumCtx * p
 if !envconfig.SchedSpread() {
 for _, g := range sgl {
-if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
 *numParallel = p
 return []discover.GpuInfo{g}
@@ -726,7 +727,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.Gpu
 // Now try all the GPUs
 for _, p := range numParallelToTry {
 req.opts.NumCtx = req.origNumCtx * p
-if ok, estimatedVRAM = llm.PredictServerFit(sgl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+if ok, estimatedVRAM = llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM))
 *numParallel = p
 return sgl
@@ -737,7 +738,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.Gpu
 }
 // If multiple Libraries are detected, pick the Library which loads the most layers for the model
-func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
+func pickBestPartialFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
 if *numParallel <= 0 {
 *numParallel = 1
 req.opts.NumCtx = req.origNumCtx
@@ -749,7 +750,7 @@ func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.
 var bestEstimate uint64
 var bestFit int
 for i, gl := range byLibrary {
-_, estimatedVRAM := llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
+_, estimatedVRAM := llm.PredictServerFit(gl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
 if estimatedVRAM > bestEstimate {
 bestEstimate = estimatedVRAM
 bestFit = i
@@ -822,9 +823,9 @@ func (s *Scheduler) expireRunner(model *Model) {
 // If other runners are loaded, make sure the pending request will fit in system memory
 // If not, pick a runner to unload, else return nil and the request can be loaded
-func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList) *runnerRef {
+func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList) *runnerRef {
 slog.Debug("evaluating if CPU model load will fit in available system memory")
-estimate := llm.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts)
+estimate := llm.EstimateGPULayers(gpus, f, req.model.ProjectorPaths, req.opts)
 if estimate.TotalSize <= gpus[0].FreeMemory {
 slog.Debug("cpu inference mode, model fits in available system memory", "model", format.HumanBytes2(estimate.TotalSize), "available", format.HumanBytes2(gpus[0].FreeMemory))
 return nil
...
This diff is collapsed.
@@ -14,7 +14,7 @@ import (
 "github.com/google/go-cmp/cmp"
 "github.com/ollama/ollama/api"
-"github.com/ollama/ollama/llm"
+"github.com/ollama/ollama/fs/ggml"
 )
 func TestNamed(t *testing.T) {
@@ -33,7 +33,7 @@ func TestNamed(t *testing.T) {
 for k, v := range ss {
 t.Run(k, func(t *testing.T) {
-kv := llm.KV{"tokenizer.chat_template": v}
+kv := ggml.KV{"tokenizer.chat_template": v}
 s := kv.ChatTemplate()
 r, err := Named(s)
 if err != nil {
...