Unverified Commit 58245413 authored by Michael Yang, committed by GitHub

next ollama runner (#7913)



feat: add new Ollama engine using ggml through cgo

This change introduces a new way to run pretrained models. It adds three high-level interfaces, along with a number of smaller helper interfaces, to facilitate this.

- `model.Model` defines the interface for a model architecture. Models such as `llama` and `mllama`, which are provided as examples, implement the model's forward propagation in the `Forward` method, which is called to generate completions. This interface can be found in `model/model.go`. A simplified sketch of how the three interfaces fit together follows this list.
- `ml.Backend` defines the interface for a backend tensor library, in this case `ggml`. Among other things, a Backend is responsible for loading a pretrained model into hardware (GPU, CPU, etc.) and for providing an interface through which Models access the loaded tensors. This interface can be found in `ml/backend.go`
- `ml.Tensor` defines the interface for a tensor and its tensor operations
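
To make the shape of these interfaces concrete, here is a minimal, hypothetical sketch of how they fit together. The simplified method sets below (`Get`, `Mulmat`, and the `Forward` signature) are illustrative assumptions, not the real definitions, which live in `model/model.go` and `ml/backend.go`.

// Hypothetical, simplified stand-ins for the interfaces described above;
// the actual definitions in this change differ in detail.
package sketch

// Tensor stands in for ml.Tensor: backend-managed data plus the tensor
// operations a model needs.
type Tensor interface {
	Mulmat(other Tensor) Tensor
}

// Backend stands in for ml.Backend: it loads a pretrained model into
// hardware (GPU, CPU, etc.) and exposes the loaded tensors by name.
type Backend interface {
	Get(name string) Tensor
}

// Model stands in for model.Model: an architecture implements Forward,
// which is called to generate completions.
type Model interface {
	Forward(inputs []int32) (Tensor, error)
}

// toyModel shows how an architecture such as llama plugs in: it reads its
// weights from the Backend and combines them in Forward.
type toyModel struct {
	backend Backend
}

func (m toyModel) Forward(inputs []int32) (Tensor, error) {
	// A real model would embed the inputs and run each transformer block;
	// this sketch only touches two named tensors.
	embd := m.backend.Get("token_embd.weight")
	out := m.backend.Get("output.weight")
	return embd.Mulmat(out), nil
}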

This is the first implementation of the new engine. Follow-up PRs will implement more features:

- non-greedy sampling (#8410)
- integration with Ollama and KV caching (#8301)
- more model support (#9080) with more coming soon
Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
parent 8cf16063
package model
import (
"bufio"
"encoding/json"
"math"
"os"
"path/filepath"
"slices"
"strconv"
"strings"
"testing"
"github.com/google/go-cmp/cmp"
)
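// llama builds a BytePairEncoding from the Llama 3.2 encoder.json and
// vocab.bpe files in testdata, mirroring that model's tokenizer configuration.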
func llama(t testing.TB) BytePairEncoding {
t.Helper()
f, err := os.Open(filepath.Join("testdata", "llama3.2", "encoder.json"))
if err != nil {
t.Fatal(err)
}
defer f.Close()
vocab := make(map[string]int32)
if err := json.NewDecoder(f).Decode(&vocab); err != nil {
t.Fatal(err)
}
types := make([]uint32, len(vocab))
tokens := make([]string, len(vocab))
for token, id := range vocab {
tokens[id] = token
types[id] = 1
}
for _, token := range []string{"<|begin_of_text|>", "<|end_of_text|>"} {
if _, ok := vocab[token]; !ok {
tokens = append(tokens, token) //nolint:makezero
types = append(types, 3) //nolint:makezero
vocab[token] = int32(len(vocab))
}
}
f, err = os.Open(filepath.Join("testdata", "llama3.2", "vocab.bpe"))
if err != nil {
t.Fatal(err)
}
defer f.Close()
merges := make([]string, 0, 50000)
scanner := bufio.NewScanner(f)
for scanner.Scan() {
if !strings.HasPrefix(scanner.Text(), "#") {
merges = append(merges, scanner.Text())
}
}
return NewBytePairEncoding(
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
&Vocabulary{
Values: tokens,
Types: types,
Merges: merges,
},
)
}
func TestLlama(t *testing.T) {
tokenizer := llama(t)
t.Run("simple", func(t *testing.T) {
t.Parallel()
ids, err := tokenizer.Encode("hello world")
if err != nil {
t.Error(err)
}
if diff := cmp.Diff([]int32{15339, 1917}, ids); diff != "" {
t.Errorf("no match (-theirs +ours):\n%s", diff)
}
s, err := tokenizer.Decode([]int32{15339, 1917})
if err != nil {
t.Fatal(err)
}
if s != "hello world" {
t.Errorf("got %q, want hello world", s)
}
ids, err = tokenizer.Encode("hello <|end_of_text|>")
if err != nil {
t.Error(err)
}
if diff := cmp.Diff([]int32{15339, 220, 128001}, ids); diff != "" {
t.Errorf("no match (-theirs +ours):\n%s", diff)
}
})
t.Run("simple repeated", func(t *testing.T) {
t.Parallel()
cases := map[string][]int32{
strings.Repeat("0", 1): {15},
strings.Repeat("0", 2): {410},
strings.Repeat("0", 3): {931},
strings.Repeat("0", 4): {931, 15},
strings.Repeat("0", 5): {931, 410},
strings.Repeat("0", 6): {931, 931},
strings.Repeat("0", 7): {931, 931, 15},
strings.Repeat("0", 8): {931, 931, 410},
strings.Repeat("0", 9): {931, 931, 931},
strings.Repeat("0", 10): {931, 931, 931, 15},
strings.Repeat("0", 11): {931, 931, 931, 410},
strings.Repeat("0", 12): {931, 931, 931, 931},
strings.Repeat("0", 13): {931, 931, 931, 931, 15},
strings.Repeat("0", 14): {931, 931, 931, 931, 410},
strings.Repeat("0", 15): {931, 931, 931, 931, 931},
strings.Repeat("0", 16): {931, 931, 931, 931, 931, 15},
strings.Repeat("0", 17): {931, 931, 931, 931, 931, 410},
}
for s, want := range cases {
ids, err := tokenizer.Encode(s)
if err != nil {
t.Error(err)
}
if diff := cmp.Diff(want, ids); diff != "" {
t.Errorf("%q no match (-theirs +ours):\n%s", s, diff)
}
}
})
t.Run("basic roundtrip", func(t *testing.T) {
t.Parallel()
cases := []string{
"hello",
"hello ",
"hello ",
" hello",
" hello ",
" hello ",
"hello world",
"请考试我的软件!12345",
}
for _, want := range cases {
ids, err := tokenizer.Encode(want)
if err != nil {
t.Error(err)
}
if got, err := tokenizer.Decode(ids); err != nil {
t.Fatal(err)
} else if got != want {
t.Errorf("got %q, want %q", got, want)
}
}
})
t.Run("special", func(t *testing.T) {
t.Parallel()
cases := map[string][]int32{
"<|begin_of_text|>A B!": {128000, 32, 426, 0},
"<|begin_of_text|>A<|end_of_text|>B!": {128000, 32, 128001, 33, 0},
"<|begin_of_text|>A<|end_of_text|>B<|begin_of_text|>!": {128000, 32, 128001, 33, 128000, 0},
"<|begin_of_text|>A<|end_of_text|>B<|begin_of_text|>!<|end_of_text|>": {128000, 32, 128001, 33, 128000, 0, 128001},
}
for s, want := range cases {
ids, err := tokenizer.Encode(s)
if err != nil {
t.Fatal(err)
}
if diff := cmp.Diff(want, ids); diff != "" {
t.Errorf("no match (-theirs +ours):\n%s", diff)
}
}
})
t.Run("split", func(t *testing.T) {
t.Parallel()
cases := map[string][]string{
"Hello World!": {"Hello", " World", "!"},
"I'm don't won't": {"I", "'m", " don", "'t", " won", "'t"},
"In 2024 there are 366 days": {"In", " ", "202", "4", " there", " are", " ", "366", " days"},
"Hello!! ...world": {"Hello", "!!", " ...", "world"},
"Hello World": {"Hello", " ", " World"},
"Hello\nWorld": {"Hello", "\n", "World"},
"Hello, WORLD!! How's it going?": {"Hello", ",", " WORLD", "!!", " How", "'s", " it", " going", "?"},
}
for s, want := range cases {
got := slices.Collect(tokenizer.split(s))
if diff := cmp.Diff(want, got); diff != "" {
t.Errorf("no match (-theirs +ours):\n%s", diff)
}
}
})
}
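// BenchmarkBytePairEncoding measures Encode, Decode, and split on prefixes of
// war-and-peace.txt whose lengths grow by powers of ten.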
func BenchmarkBytePairEncoding(b *testing.B) {
tokenizer := llama(b)
bts, err := os.ReadFile(filepath.Join("testdata", "war-and-peace.txt"))
if err != nil {
b.Fatal(err)
}
for i := range 8 {
n := min(int(math.Pow10(i)), len(bts))
bts := bts[:n]
b.Run("encode"+strconv.Itoa(n), func(b *testing.B) {
b.ResetTimer()
for range b.N {
_, err := tokenizer.Encode(string(bts))
if err != nil {
b.Fatal(err)
}
}
})
b.Run("decode"+strconv.Itoa(n), func(b *testing.B) {
ids, err := tokenizer.Encode(string(bts))
if err != nil {
b.Fatal(err)
}
b.ResetTimer()
for range b.N {
_, err := tokenizer.Decode(ids)
if err != nil {
b.Fatal(err)
}
}
})
b.Run("split"+strconv.Itoa(n), func(b *testing.B) {
b.ResetTimer()
for range b.N {
slices.Collect(tokenizer.split(string(bts)))
}
})
}
}
Three source diffs could not be displayed because they are too large.
......@@ -19,7 +19,7 @@ import (
"golang.org/x/text/encoding/unicode"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/fs/ggml"
)
func TestParseFileFile(t *testing.T) {
......@@ -769,7 +769,7 @@ func getSHA256Digest(t *testing.T, r io.Reader) (string, int64) {
return fmt.Sprintf("sha256:%x", h.Sum(nil)), n
}
func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) (string, string) {
func createBinFile(t *testing.T, kv map[string]any, ti []ggml.Tensor) (string, string) {
t.Helper()
f, err := os.CreateTemp(t.TempDir(), "testbin.*.gguf")
......@@ -778,7 +778,7 @@ func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) (string, st
}
defer f.Close()
if err := llm.WriteGGUF(f, kv, ti); err != nil {
if err := ggml.WriteGGUF(f, kv, ti); err != nil {
t.Fatal(err)
}
// Calculate sha256 of file
......
package sample
import "gonum.org/v1/gonum/floats"
type greedy struct{}
func Greedy() Sampler {
return greedy{}
}
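// Sample returns the index of the largest value in t as a single-element
// slice (argmax over the logits).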
func (s greedy) Sample(t []float64) ([]float64, error) {
return []float64{float64(floats.MaxIdx(t))}, nil
}
package sample
import (
"slices"
"gonum.org/v1/gonum/floats"
"gonum.org/v1/gonum/stat/sampleuv"
)
type Sampler interface {
Sample([]float64) ([]float64, error)
}
type Temperature float64
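// Sample divides each logit by the temperature in place and returns the
// same slice.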
func (s Temperature) Sample(t []float64) ([]float64, error) {
floats.Div(t, slices.Repeat([]float64{float64(s)}, len(t)))
return t, nil
}
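// Softmax, TopK, TopP, and MinP are currently pass-through stubs that return
// their input unchanged; fuller non-greedy sampling is planned as a follow-up
// (#8410).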
type softmax struct{}
func Softmax() Sampler {
return softmax{}
}
func (softmax) Sample(t []float64) ([]float64, error) {
return t, nil
}
type TopK int
func (s TopK) Sample(t []float64) ([]float64, error) {
return t, nil
}
type TopP float32
func (s TopP) Sample(t []float64) ([]float64, error) {
return t, nil
}
type MinP float32
func (s MinP) Sample(t []float64) ([]float64, error) {
return t, nil
}
type weighed struct{}
func Weighed() Sampler {
return weighed{}
}
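// Sample treats t as a weight distribution and draws a single index from it;
// if the draw fails, it returns t unchanged.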
func (s weighed) Sample(t []float64) ([]float64, error) {
w := sampleuv.NewWeighted(t, nil)
if v, ok := w.Take(); ok {
return []float64{float64(v)}, nil
}
return t, nil
}
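// Sample applies the samplers to floats in order, passing each sampler's
// output to the next.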
func Sample(floats []float64, samplers ...Sampler) ([]float64, error) {
var err error
for _, sampler := range samplers {
floats, err = sampler.Sample(floats)
if err != nil {
return nil, err
}
}
return floats, nil
}
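As a quick illustration of how these samplers compose, here is a hedged, standalone sketch; the logit values and the github.com/ollama/ollama/sample import path are assumptions for the example, not part of this diff.

package main

import (
	"fmt"

	"github.com/ollama/ollama/sample"
)

func main() {
	// Hypothetical logits for a four-token vocabulary.
	logits := []float64{1.0, 3.0, 0.5, 2.0}

	// Scale by temperature, then take the argmax; Greedy returns the
	// winning index (1 here) as a single-element slice.
	out, err := sample.Sample(logits, sample.Temperature(0.8), sample.Greedy())
	if err != nil {
		panic(err)
	}
	fmt.Println(out) // [1]
}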
......@@ -21,8 +21,8 @@ import (
"github.com/ollama/ollama/convert"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llama"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/template"
"github.com/ollama/ollama/types/errtypes"
"github.com/ollama/ollama/types/model"
......@@ -205,7 +205,7 @@ func detectModelTypeFromFiles(files map[string]string) string {
return ""
}
ct := llm.DetectGGMLType(buf)
ct := ggml.DetectContentType(buf)
if ct == "gguf" {
return "gguf"
}
......@@ -271,11 +271,11 @@ func convertFromSafetensors(files map[string]string, baseLayers []*layerGGML, is
return nil, err
}
ggml, _, err := llm.DecodeGGML(bin, 0)
f, _, err := ggml.Decode(bin, 0)
if err != nil {
return nil, err
}
layers := []*layerGGML{{layer, ggml}}
layers := []*layerGGML{{layer, f}}
if !isAdapter {
return detectChatTemplate(layers)
......@@ -283,13 +283,13 @@ func convertFromSafetensors(files map[string]string, baseLayers []*layerGGML, is
return layers, nil
}
func kvFromLayers(baseLayers []*layerGGML) (llm.KV, error) {
func kvFromLayers(baseLayers []*layerGGML) (ggml.KV, error) {
for _, l := range baseLayers {
if l.GGML != nil {
return l.KV(), nil
}
}
return llm.KV{}, fmt.Errorf("no base model was found")
return ggml.KV{}, fmt.Errorf("no base model was found")
}
func createModel(r api.CreateRequest, name model.Name, baseLayers []*layerGGML, fn func(resp api.ProgressResponse)) (err error) {
......@@ -306,7 +306,7 @@ func createModel(r api.CreateRequest, name model.Name, baseLayers []*layerGGML,
if layer.GGML != nil {
quantType := strings.ToUpper(cmp.Or(r.Quantize, r.Quantization))
if quantType != "" && layer.GGML.Name() == "gguf" && layer.MediaType == "application/vnd.ollama.image.model" {
want, err := llm.ParseFileType(quantType)
want, err := ggml.ParseFileType(quantType)
if err != nil {
return err
}
......@@ -403,7 +403,7 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr
ft := layer.GGML.KV().FileType()
fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantizeType)})
want, err := llm.ParseFileType(quantizeType)
want, err := ggml.ParseFileType(quantizeType)
if err != nil {
return nil, err
}
......@@ -433,13 +433,13 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr
return nil, err
}
ggml, _, err := llm.DecodeGGML(temp, 0)
f, _, err := ggml.Decode(temp, 0)
if err != nil {
slog.Error(fmt.Sprintf("error decoding ggml: %s\n", err))
return nil, err
}
return &layerGGML{newLayer, ggml}, nil
return &layerGGML{newLayer, f}, nil
}
func ggufLayers(digest string, fn func(resp api.ProgressResponse)) ([]*layerGGML, error) {
......@@ -475,7 +475,7 @@ func ggufLayers(digest string, fn func(resp api.ProgressResponse)) ([]*layerGGML
var offset int64
for offset < stat.Size() {
ggml, n, err := llm.DecodeGGML(blob, 0)
f, n, err := ggml.Decode(blob, 0)
if errors.Is(err, io.EOF) {
break
} else if err != nil {
......@@ -483,9 +483,9 @@ func ggufLayers(digest string, fn func(resp api.ProgressResponse)) ([]*layerGGML
}
mediatype := "application/vnd.ollama.image.model"
if ggml.KV().Kind() == "adapter" {
if f.KV().Kind() == "adapter" {
mediatype = "application/vnd.ollama.image.adapter"
} else if _, ok := ggml.KV()[fmt.Sprintf("%s.vision.block_count", ggml.KV().Architecture())]; ok || ggml.KV().Kind() == "projector" {
} else if _, ok := f.KV()[fmt.Sprintf("%s.vision.block_count", f.KV().Architecture())]; ok || f.KV().Kind() == "projector" {
mediatype = "application/vnd.ollama.image.projector"
}
......@@ -506,7 +506,7 @@ func ggufLayers(digest string, fn func(resp api.ProgressResponse)) ([]*layerGGML
}
}
layers = append(layers, &layerGGML{layer, ggml})
layers = append(layers, &layerGGML{layer, f})
offset = n
}
......
......@@ -23,7 +23,7 @@ import (
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/parser"
"github.com/ollama/ollama/template"
"github.com/ollama/ollama/types/model"
......@@ -78,21 +78,21 @@ func (m *Model) CheckCapabilities(caps ...Capability) error {
for _, cap := range caps {
switch cap {
case CapabilityCompletion:
f, err := os.Open(m.ModelPath)
r, err := os.Open(m.ModelPath)
if err != nil {
slog.Error("couldn't open model file", "error", err)
continue
}
defer f.Close()
defer r.Close()
// TODO(mxyng): decode the GGML into model to avoid doing this multiple times
ggml, _, err := llm.DecodeGGML(f, 0)
f, _, err := ggml.Decode(r, 0)
if err != nil {
slog.Error("couldn't decode ggml", "error", err)
continue
}
if _, ok := ggml.KV()[fmt.Sprintf("%s.pooling_type", ggml.KV().Architecture())]; ok {
if _, ok := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]; ok {
errs = append(errs, errCapabilityCompletion)
}
case CapabilityTools:
......
......@@ -15,7 +15,7 @@ import (
"text/template/parse"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/template"
"github.com/ollama/ollama/types/model"
)
......@@ -24,7 +24,7 @@ var intermediateBlobs map[string]string = make(map[string]string)
type layerGGML struct {
Layer
*llm.GGML
*ggml.GGML
}
func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
......@@ -64,12 +64,12 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
}
defer blob.Close()
ggml, _, err := llm.DecodeGGML(blob, 0)
f, _, err := ggml.Decode(blob, 0)
if err != nil {
return nil, err
}
layers = append(layers, &layerGGML{layer, ggml})
layers = append(layers, &layerGGML{layer, f})
default:
layers = append(layers, &layerGGML{layer, nil})
}
......@@ -118,7 +118,7 @@ func detectContentType(r io.Reader) (string, error) {
return "", err
}
if contentType := llm.DetectGGMLType(b.Bytes()); contentType != "" {
if contentType := ggml.DetectContentType(b.Bytes()); contentType != "" {
return contentType, nil
}
......
......@@ -30,6 +30,7 @@ import (
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/model/mllama"
"github.com/ollama/ollama/openai"
......@@ -860,7 +861,7 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
return resp, nil
}
func getKVData(digest string, verbose bool) (llm.KV, error) {
func getKVData(digest string, verbose bool) (ggml.KV, error) {
maxArraySize := 0
if verbose {
maxArraySize = -1
......
......@@ -19,12 +19,12 @@ import (
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/fs/ggml"
)
var stream bool = false
func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) (string, string) {
func createBinFile(t *testing.T, kv map[string]any, ti []ggml.Tensor) (string, string) {
t.Helper()
t.Setenv("OLLAMA_MODELS", cmp.Or(os.Getenv("OLLAMA_MODELS"), t.TempDir()))
......@@ -36,7 +36,7 @@ func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) (string, st
}
defer f.Close()
if err := llm.WriteGGUF(f, kv, ti); err != nil {
if err := ggml.WriteGGUF(f, kv, ti); err != nil {
t.Fatal(err)
}
// Calculate sha256 of file
......@@ -672,7 +672,7 @@ func TestCreateDetectTemplate(t *testing.T) {
var s Server
t.Run("matched", func(t *testing.T) {
_, digest := createBinFile(t, llm.KV{
_, digest := createBinFile(t, ggml.KV{
"tokenizer.chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
}, nil)
w := createRequest(t, s.CreateHandler, api.CreateRequest{
......
......@@ -16,6 +16,7 @@ import (
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llm"
)
......@@ -45,8 +46,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error
return
}
func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *llm.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
return func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *ggml.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
return func(_ discover.GpuInfoList, _ string, _ *ggml.GGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) {
return mock, nil
}
}
......@@ -76,7 +77,7 @@ func TestGenerateChat(t *testing.T) {
getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo,
reschedDelay: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
......@@ -88,7 +89,7 @@ func TestGenerateChat(t *testing.T) {
go s.sched.Run(context.TODO())
_, digest := createBinFile(t, llm.KV{
_, digest := createBinFile(t, ggml.KV{
"general.architecture": "llama",
"llama.block_count": uint32(1),
"llama.context_length": uint32(8192),
......@@ -98,7 +99,7 @@ func TestGenerateChat(t *testing.T) {
"tokenizer.ggml.tokens": []string{""},
"tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0},
}, []llm.Tensor{
}, []ggml.Tensor{
{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
......@@ -154,10 +155,10 @@ func TestGenerateChat(t *testing.T) {
})
t.Run("missing capabilities chat", func(t *testing.T) {
_, digest := createBinFile(t, llm.KV{
_, digest := createBinFile(t, ggml.KV{
"general.architecture": "bert",
"bert.pooling_type": uint32(0),
}, []llm.Tensor{})
}, []ggml.Tensor{})
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "bert",
Files: map[string]string{"bert.gguf": digest},
......@@ -612,7 +613,7 @@ func TestGenerate(t *testing.T) {
getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo,
reschedDelay: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
......@@ -624,7 +625,7 @@ func TestGenerate(t *testing.T) {
go s.sched.Run(context.TODO())
_, digest := createBinFile(t, llm.KV{
_, digest := createBinFile(t, ggml.KV{
"general.architecture": "llama",
"llama.block_count": uint32(1),
"llama.context_length": uint32(8192),
......@@ -634,7 +635,7 @@ func TestGenerate(t *testing.T) {
"tokenizer.ggml.tokens": []string{""},
"tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0},
}, []llm.Tensor{
}, []ggml.Tensor{
{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
......@@ -686,10 +687,10 @@ func TestGenerate(t *testing.T) {
})
t.Run("missing capabilities generate", func(t *testing.T) {
_, digest := createBinFile(t, llm.KV{
_, digest := createBinFile(t, ggml.KV{
"general.architecture": "bert",
"bert.pooling_type": uint32(0),
}, []llm.Tensor{})
}, []ggml.Tensor{})
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "bert",
......
......@@ -21,7 +21,7 @@ import (
"unicode"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/openai"
"github.com/ollama/ollama/types/model"
"github.com/ollama/ollama/version"
......@@ -654,8 +654,8 @@ func TestShow(t *testing.T) {
var s Server
_, digest1 := createBinFile(t, llm.KV{"general.architecture": "test"}, nil)
_, digest2 := createBinFile(t, llm.KV{"general.type": "projector", "general.architecture": "clip"}, nil)
_, digest1 := createBinFile(t, ggml.KV{"general.architecture": "test"}, nil)
_, digest2 := createBinFile(t, ggml.KV{"general.type": "projector", "general.architecture": "clip"}, nil)
createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "show-model",
......
......@@ -18,6 +18,7 @@ import (
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llm"
)
......@@ -41,8 +42,8 @@ type Scheduler struct {
loaded map[string]*runnerRef
loadedMu sync.Mutex
loadFn func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int)
newServerFn func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
loadFn func(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel int)
newServerFn func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
getGpuFn func() discover.GpuInfoList
getCpuFn func() discover.GpuInfoList
reschedDelay time.Duration
......@@ -409,7 +410,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
}()
}
func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel int) {
if numParallel < 1 {
numParallel = 1
}
......@@ -417,12 +418,12 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoL
if req.sessionDuration != nil {
sessionDuration = req.sessionDuration.Duration
}
llama, err := s.newServerFn(gpus, req.model.ModelPath, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
llama, err := s.newServerFn(gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
if err != nil {
// some older models are not compatible with newer versions of llama.cpp
// show a generalized compatibility error until there is a better way to
// check for model compatibility
if errors.Is(err, llm.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") {
if errors.Is(err, ggml.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") {
err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, req.model.ShortName)
}
slog.Info("NewLlamaServer failed", "model", req.model.ModelPath, "error", err)
......@@ -685,7 +686,7 @@ func (a ByDuration) Less(i, j int) bool {
// If the model can not be fit fully within the available GPU(s) nil is returned
// If numParallel is <= 0, this will attempt try to optimize parallelism based on available VRAM, and adjust
// opts.NumCtx accordingly
func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
var estimatedVRAM uint64
var numParallelToTry []int
......@@ -710,7 +711,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.Gpu
req.opts.NumCtx = req.origNumCtx * p
if !envconfig.SchedSpread() {
for _, g := range sgl {
if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
*numParallel = p
return []discover.GpuInfo{g}
......@@ -726,7 +727,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.Gpu
// Now try all the GPUs
for _, p := range numParallelToTry {
req.opts.NumCtx = req.origNumCtx * p
if ok, estimatedVRAM = llm.PredictServerFit(sgl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
if ok, estimatedVRAM = llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM))
*numParallel = p
return sgl
......@@ -737,7 +738,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.Gpu
}
// If multiple Libraries are detected, pick the Library which loads the most layers for the model
func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
func pickBestPartialFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
if *numParallel <= 0 {
*numParallel = 1
req.opts.NumCtx = req.origNumCtx
......@@ -749,7 +750,7 @@ func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.
var bestEstimate uint64
var bestFit int
for i, gl := range byLibrary {
_, estimatedVRAM := llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
_, estimatedVRAM := llm.PredictServerFit(gl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
if estimatedVRAM > bestEstimate {
bestEstimate = estimatedVRAM
bestFit = i
......@@ -822,9 +823,9 @@ func (s *Scheduler) expireRunner(model *Model) {
// If other runners are loaded, make sure the pending request will fit in system memory
// If not, pick a runner to unload, else return nil and the request can be loaded
func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList) *runnerRef {
func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList) *runnerRef {
slog.Debug("evaluating if CPU model load will fit in available system memory")
estimate := llm.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts)
estimate := llm.EstimateGPULayers(gpus, f, req.model.ProjectorPaths, req.opts)
if estimate.TotalSize <= gpus[0].FreeMemory {
slog.Debug("cpu inference mode, model fits in available system memory", "model", format.HumanBytes2(estimate.TotalSize), "available", format.HumanBytes2(gpus[0].FreeMemory))
return nil
......
......@@ -15,6 +15,7 @@ import (
"github.com/ollama/ollama/app/lifecycle"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llm"
)
......@@ -37,7 +38,7 @@ func TestLoad(t *testing.T) {
ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
defer done()
s := InitScheduler(ctx)
var ggml *llm.GGML // value not used in tests
var f *ggml.GGML // value not used in tests
req := &LlmRequest{
ctx: ctx,
model: &Model{ModelPath: "foo"},
......@@ -47,11 +48,11 @@ func TestLoad(t *testing.T) {
sessionDuration: &api.Duration{Duration: 2 * time.Second},
}
// Fail to load model first
s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
return nil, errors.New("something failed to load model blah")
}
gpus := discover.GpuInfoList{}
s.load(req, ggml, gpus, 0)
s.load(req, f, gpus, 0)
require.Empty(t, req.successCh)
require.Len(t, req.errCh, 1)
s.loadedMu.Lock()
......@@ -61,10 +62,10 @@ func TestLoad(t *testing.T) {
require.Contains(t, err.Error(), "this model may be incompatible")
server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
return server, nil
}
s.load(req, ggml, gpus, 0)
s.load(req, f, gpus, 0)
select {
case err := <-req.errCh:
require.NoError(t, err)
......@@ -78,7 +79,7 @@ func TestLoad(t *testing.T) {
req.model.ModelPath = "dummy_model_path"
server.waitResp = errors.New("wait failure")
s.load(req, ggml, gpus, 0)
s.load(req, f, gpus, 0)
select {
case err := <-req.errCh:
require.Contains(t, err.Error(), "wait failure")
......@@ -99,10 +100,10 @@ type reqBundle struct {
ctxDone func()
srv *mockLlm
req *LlmRequest
ggml *llm.GGML
f *ggml.GGML
}
func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
return scenario.srv, nil
}
......@@ -115,7 +116,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
require.NoError(t, err)
defer f.Close()
require.NoError(t, llm.WriteGGUF(f, llm.KV{
require.NoError(t, ggml.WriteGGUF(f, ggml.KV{
"general.architecture": "llama",
"llama.context_length": uint32(32),
"llama.embedding_length": uint32(4096),
......@@ -125,7 +126,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
"tokenizer.ggml.tokens": []string{" "},
"tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0},
}, []llm.Tensor{
}, []ggml.Tensor{
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
}))
......@@ -133,7 +134,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
fname := f.Name()
model := &Model{Name: modelName, ModelPath: fname}
b.ggml, err = llm.LoadModel(model.ModelPath, 0)
b.f, err = llm.LoadModel(model.ModelPath, 0)
require.NoError(t, err)
if duration == nil {
......@@ -174,7 +175,7 @@ func TestRequestsSameModelSameRequest(t *testing.T) {
a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0})
b.req.model = a.req.model
b.ggml = a.ggml
b.f = a.f
s.newServerFn = a.newServer
slog.Info("a")
......@@ -218,7 +219,7 @@ func TestRequestsSimpleReloadSameModel(t *testing.T) {
b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond})
tmpModel := *a.req.model
b.req.model = &tmpModel
b.ggml = a.ggml
b.f = a.f
s.newServerFn = a.newServer
slog.Info("a")
......@@ -419,13 +420,13 @@ func TestExpireRunner(t *testing.T) {
sessionDuration: &api.Duration{Duration: 2 * time.Minute},
}
var ggml *llm.GGML
var f *ggml.GGML
gpus := discover.GpuInfoList{}
server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
return server, nil
}
s.load(req, ggml, gpus, 0)
s.load(req, f, gpus, 0)
select {
case err := <-req.errCh:
......@@ -729,9 +730,9 @@ func TestHomogeneousGPUs(t *testing.T) {
}
s.getCpuFn = getCpuFn
a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
require.Len(t, gpus, 1)
return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
return a.newServer(gpus, model, f, adapters, projectors, opts, numParallel)
}
slog.Info("a")
s.pendingReqCh <- a.req
......
......@@ -14,7 +14,7 @@ import (
"github.com/google/go-cmp/cmp"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/fs/ggml"
)
func TestNamed(t *testing.T) {
......@@ -33,7 +33,7 @@ func TestNamed(t *testing.T) {
for k, v := range ss {
t.Run(k, func(t *testing.T) {
kv := llm.KV{"tokenizer.chat_template": v}
kv := ggml.KV{"tokenizer.chat_template": v}
s := kv.ChatTemplate()
r, err := Named(s)
if err != nil {
......