package llm

import (
	"bytes"
	"fmt"
	"os"
	"testing"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/discover"
	"github.com/ollama/ollama/fs/ggml"
	"github.com/ollama/ollama/ml"
)

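// TestEstimateGPULayers writes a tiny llama-architecture GGUF file to disk
// and verifies that estimateGPULayers assigns layers sensibly for a CPU-only
// device and for two CUDA GPUs with varying amounts of free memory.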
func TestEstimateGPULayers(t *testing.T) {
	t.Setenv("OLLAMA_DEBUG", "1")
	t.Setenv("OLLAMA_KV_CACHE_TYPE", "") // Ensure default f16
	t.Setenv("OLLAMA_CONTEXT_LENGTH", "2048")

	modelName := "dummy"
	f, err := os.CreateTemp(t.TempDir(), modelName)
	require.NoError(t, err)
	defer f.Close()
	inputLayerCount := 5

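	// One dummy tensor per block layer, plus the final output tensor.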
	tensors := []*ggml.Tensor{
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
	}
	assert.Len(t, tensors, inputLayerCount+1)
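	// Write a minimal GGUF file with just enough llama metadata and
	// tokenizer data for LoadModel to parse it.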
	err = ggml.WriteGGUF(f, ggml.KV{
		"general.architecture":          "llama",
		"llama.context_length":          uint32(32),
		"llama.embedding_length":        uint32(4096),
		"llama.block_count":             uint32(inputLayerCount),
		"llama.attention.head_count":    uint32(32),
		"llama.attention.head_count_kv": uint32(32),
		"tokenizer.ggml.tokens":         []string{" "},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
	}, tensors)
	require.NoError(t, err)

	ggml, err := LoadModel(f.Name(), 0)
	if err != nil {
		t.Fatal(err)
	}

	// Simple CPU scenario
	gpus := []discover.GpuInfo{
		{
			DeviceID: ml.DeviceID{
				Library: "cpu",
			},
		},
	}
	projectors := []string{}
	opts := api.DefaultOptions()
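	// With only a CPU device available, nothing can be offloaded.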
	t.Run("cpu", func(t *testing.T) {
		estimate := estimateGPULayers(gpus, ggml, projectors, opts, 1)
		assert.Equal(t, 0, estimate.Layers)
		assert.Equal(t, uint64(0), estimate.Graph)
	})

	// Memory sizes derived from the dummy GGUF file written above.
	graphPartialOffload := uint64(202377216)
	graphFullOffload := uint64(171968512)
	layerSize := uint64(33554436)
	projectorSize := uint64(0)
	memoryLayerOutput := uint64(4)

	// Dual CUDA scenario with asymmetry
	gpuMinimumMemory := uint64(2048)
	gpus = []discover.GpuInfo{
		{
			DeviceID: ml.DeviceID{
				Library: "cuda",
			},
			MinimumMemory: gpuMinimumMemory,
		},
		{
			DeviceID: ml.DeviceID{
				Library: "cuda",
			},
			MinimumMemory: gpuMinimumMemory,
		},
	}
	// Each scenario: how many layers fit on GPU0 and GPU1, and how many
	// layers are expected to land on each. The dummy model has six
	// offloadable layers (five blocks plus the output layer), so capacities
	// beyond that cap out at a 3/3 split.
	for i, s := range []struct {
		layer0, layer1   uint64
		expect0, expect1 int
	}{
		{1, 1, 1, 1},
		{2, 1, 2, 1},
		{2, 2, 2, 2},
		{1, 2, 1, 2},
		{3, 3, 3, 3},
		{4, 4, 3, 3},
		{6, 6, 3, 3},
		{0, 3, 0, 3},
	} {
		t.Run(fmt.Sprintf("%v", s), func(t *testing.T) {
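			// Provision each GPU with just enough free memory for its
			// scenario: the minimum reserve, one layer of headroom plus
			// s.layerN full layers, and the largest graph allocation. The
			// output layer's memory is credited to GPU0 when it holds any
			// layers, otherwise to GPU1.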
			gpus[0].FreeMemory = 0
			gpus[1].FreeMemory = 0
			gpus[0].FreeMemory += projectorSize
			if s.layer0 > 0 {
				gpus[0].FreeMemory += memoryLayerOutput
			} else {
				gpus[1].FreeMemory += memoryLayerOutput
			}
			gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s.layer0*layerSize + 1
			gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1
			gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
			gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
			estimate := estimateGPULayers(gpus, ggml, projectors, opts, 1)
			assert.Equal(t, s.expect0+s.expect1, estimate.Layers, "scenario %d: %v", i, s)
			assert.Equal(t, []int{s.expect0, s.expect1}, estimate.TensorSplit, "scenario %d: %v", i, s)
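			// The per-GPU allocations must reconcile with the overall
			// estimate: under partial offload some of the model stays in
			// system memory, so VRAM usage is below the total size.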
			var layerSums uint64
			for _, b := range estimate.GPUSizes {
				layerSums += b
			}
			if estimate.Layers < inputLayerCount+1 {
				assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
				assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
			} else {
				assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
				assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
			}
		})
	}
}