Commit 6fd04ca9 authored by Daniel Hiltgen

Improve multi-gpu handling at the limit

Still not complete; our prediction needs further refinement to understand each discrete GPU's
available space so we can see how many layers fit on each one. Since we can't split a single
layer across multiple GPUs, we can't treat free space as one logical block.
parent 206797bd
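The constraint called out in the message above is the heart of the change: a layer must land entirely on one GPU, so the estimate has to check each device's free memory separately instead of summing it into one pool. A minimal sketch of that idea, separate from the commit's actual EstimateGPULayers code and using made-up sizes:

package main

import "fmt"

// placeLayers assigns whole layers to individual GPUs, since a single layer
// cannot be split across devices. Sketch only; all values are hypothetical.
func placeLayers(gpuFree []uint64, layerSizes []uint64) (perGPU []int, total int) {
	perGPU = make([]int, len(gpuFree))
	remaining := append([]uint64(nil), gpuFree...)
	for _, size := range layerSizes {
		placed := false
		for i := range remaining {
			if remaining[i] >= size { // the layer must fit entirely on this GPU
				remaining[i] -= size
				perGPU[i]++
				total++
				placed = true
				break
			}
		}
		if !placed {
			break // remaining layers overflow to the CPU
		}
	}
	return perGPU, total
}

func main() {
	gib := uint64(1 << 30)
	// 5 GiB + 2 GiB free: pooled free space (7 GiB) suggests both 3 GiB layers
	// fit, but per-GPU placement fits only one of them.
	perGPU, total := placeLayers([]uint64{5 * gib, 2 * gib}, []uint64{3 * gib, 3 * gib})
	fmt.Println(perGPU, total) // [1 0] 1
}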
@@ -27,7 +27,7 @@ const (
 	GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
 	// Direct Rendering Manager sysfs location
-	DRMDeviceDirGlob   = "/sys/class/drm/card[0-9]/device"
+	DRMDeviceDirGlob   = "/sys/class/drm/card*/device"
 	DRMTotalMemoryFile = "mem_info_vram_total"
 	DRMUsedMemoryFile  = "mem_info_vram_used"
...
@@ -246,10 +246,6 @@ func GetGPUInfo() GpuInfoList {
 		return GpuInfoList{cpus[0].GpuInfo}
 	}
-	// TODO - implement
-	// TODO refine the discovery to only gather total memory
 	// On windows we bundle the nvidia library one level above the runner dir
 	depPath := ""
 	if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
...
@@ -44,14 +44,14 @@ type CPUInfo struct {
 type CudaGPUInfo struct {
 	GpuInfo
-	index int // device index
+	index int // nolint: unused
 }
 type CudaGPUInfoList []CudaGPUInfo

 type RocmGPUInfo struct {
 	GpuInfo
-	usedFilepath string // linux
-	index        int    // device index on windows
+	usedFilepath string // nolint: unused
+	index        int    // nolint: unused
 }
 type RocmGPUInfoList []RocmGPUInfo
...
@@ -38,7 +38,7 @@ func TestMultiModelConcurrency(t *testing.T) {
 		}
 		resp = [2][]string{
 			[]string{"sunlight"},
-			[]string{"england", "english", "massachusetts", "pilgrims"},
+			[]string{"england", "english", "massachusetts", "pilgrims", "british"},
 		}
 	)
 	var wg sync.WaitGroup
@@ -229,5 +229,23 @@ func TestMultiModelStress(t *testing.T) {
 			}
 		}(i)
 	}
+	go func() {
+		for {
+			time.Sleep(2 * time.Second)
+			select {
+			case <-ctx.Done():
+				return
+			default:
+				models, err := client.ListRunning(ctx)
+				if err != nil {
+					slog.Warn("failed to list running models", "error", err)
+					continue
+				}
+				for _, m := range models.Models {
+					slog.Info("loaded model snapshot", "model", m)
+				}
+			}
+		}
+	}()
 	wg.Wait()
 }
@@ -11,7 +11,7 @@ import (
 )

 func TestContextExhaustion(t *testing.T) {
-	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) // TODO maybe shorter?
+	ctx, cancel := context.WithTimeout(context.Background(), 4*time.Minute) // Longer needed for small footprint GPUs
 	defer cancel()
 	// Set up the test data
 	req := api.GenerateRequest{
...
@@ -331,7 +331,7 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
 		[][]string{
 			[]string{"sunlight"},
 			[]string{"soil", "organic", "earth", "black", "tan"},
-			[]string{"england", "english", "massachusetts", "pilgrims"},
+			[]string{"england", "english", "massachusetts", "pilgrims", "british"},
 			[]string{"fourth", "july", "declaration", "independence"},
 			[]string{"nitrogen", "oxygen", "carbon", "dioxide"},
 		}
...
@@ -307,6 +307,7 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 		partialOffload = 4 * batch * embedding
 		partialOffload += max(
+			// 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()),
 			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV),
 			4*batch*(embedding+vocab)+embedding*vocab*105/128,
 		)
...
@@ -3,9 +3,10 @@ package llm
 import (
 	"fmt"
 	"log/slog"
+	"strconv"
+	"strings"

 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
 )
@@ -16,7 +17,8 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
 	var estimatedVRAM uint64
 	for _, gpus := range allGpus.ByLibrary() {
 		var layerCount int
-		layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)
+		estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
+		layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
 		if opts.NumGPU < 0 {
 			if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
 				return true, estimatedVRAM
@@ -30,24 +32,68 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
 	return false, estimatedVRAM
 }

+type MemoryEstimate struct {
+	// How many layers we predict we can load
+	Layers int
+
+	// The size of the graph which occupies the main GPU
+	Graph uint64
+
+	// How much VRAM will be allocated given the number of layers we predict
+	VRAMSize uint64
+
+	// The total size of the model if loaded into VRAM. If all layers are loaded, VRAMSize == TotalSize
+	TotalSize uint64
+
+	// For multi-GPU scenarios, this provides the tensor split parameter
+	TensorSplit string
+
+	// For multi-GPU scenarios, this is the size in bytes per GPU
+	GPUSizes []uint64
+}
+
 // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64, uint64) {
-	var memoryAvailable uint64
-	for _, info := range gpus {
-		memoryAvailable += info.FreeMemory
-	}
-	if envconfig.MaxVRAM > 0 {
-		memoryAvailable = envconfig.MaxVRAM
-	}
+func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
+	// Graph size for a partial offload, applies to all GPUs
+	var graphPartialOffload uint64

-	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable))
+	// Graph size when all layers are offloaded, applies to all GPUs
+	var graphFullOffload uint64

-	// TODO - this is probably wrong, first GPU vs secondaries will have different overheads
-	memoryMinimum := gpus[0].MinimumMemory
+	// Final graph offload once we know full or partial
+	var graphOffload uint64
+
+	// Projectors loaded into GPU0 only
+	var projectorSize uint64
+
+	// Conditional output size on GPU 0
+	var memoryLayerOutput uint64
+	var includeOutput bool
+
+	// One extra layer as a pad for each GPU
+	var layerBuffer uint64
+
+	// The sizes of the main layers
+	var layerSizes []uint64
+
+	// The sum of all the layer sizes (just for logging)
+	var memoryWeights uint64
+
+	// True if all the layers are loaded
+	var fullyLoaded bool
+
+	// Overflow that didn't fit into the GPU
+	var overflow uint64
+
+	availableList := make([]string, len(gpus))
+	for i, gpu := range gpus {
+		availableList[i] = format.HumanBytes2(gpu.FreeMemory)
+	}
+	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)

 	for _, projector := range projectors {
-		memoryMinimum += projectorMemoryRequirements(projector)
+		projectorSize += projectorMemoryRequirements(projector)

 		// multimodal models require at least 2048 context
 		opts.NumCtx = max(opts.NumCtx, 2048)
@@ -56,40 +102,28 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	layers := ggml.Tensors().Layers()

 	// add one layer worth of memory as a buffer
 	if blk0, ok := layers["blk.0"]; ok {
-		memoryMinimum += blk0.size()
+		layerBuffer = blk0.size()
 	}

 	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
 	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()

-	graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
+	graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
 	if graphPartialOffload == 0 {
 		graphPartialOffload = ggml.KV().GQA() * kv / 6
 	}
 	if graphFullOffload == 0 {
 		graphFullOffload = graphPartialOffload
 	}

+	graphFullOffload *= uint64(len(gpus))
+	graphPartialOffload *= uint64(len(gpus))
+
 	// on metal there's no partial offload overhead
 	if gpus[0].Library == "metal" {
 		graphPartialOffload = graphFullOffload
 	}

-	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
-	memoryRequiredTotal := memoryMinimum + graphFullOffload
-
-	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
-	memoryRequiredPartial := memoryMinimum + graphPartialOffload
-
-	var memoryLayerOutput uint64
 	if layer, ok := layers["output_norm"]; ok {
 		memoryLayerOutput += layer.size()
 	}
 	if layer, ok := layers["output"]; ok {
 		memoryLayerOutput += layer.size()
 	} else if layer, ok := layers["token_embd"]; ok {
@@ -97,38 +131,144 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	}

 	if gpus[0].Library == "metal" && opts.UseMMap {
-		// memory is preallocated for output tensors
-		memoryRequiredTotal += memoryLayerOutput
-		memoryRequiredPartial += memoryLayerOutput
+		includeOutput = true
+	} else if gpus[0].Library != "metal" || !opts.UseMMap {
+		includeOutput = true
 	}

+	gpuZeroOverhead := projectorSize
+	if includeOutput {
+		gpuZeroOverhead += memoryLayerOutput
+	}
+
+	// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
 	var layerCount int
+	layerCounts := make([]int, len(gpus))
+	gpuAllocations := make([]uint64, len(gpus))
+	type gs struct {
+		i int
+		g *gpu.GpuInfo
+	}
+	gpusWithSpace := []gs{}
+	for i := range gpus {
+		var gzo uint64
+		if len(gpusWithSpace) == 0 {
+			gzo = gpuZeroOverhead
+		}
+		// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer
+		if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerBuffer {
+			slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
+			continue
+		}
+		gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
+		gpuAllocations[i] += gpus[i].MinimumMemory + layerBuffer // We hold off on graph until we know partial vs. full
+	}
+
+	var gpuZeroID int
+	if len(gpusWithSpace) > 0 {
+		gpuZeroID = gpusWithSpace[0].i
+		gpuAllocations[gpuZeroID] += gpuZeroOverhead
+	}
+
+	layerSizes = make([]uint64, int(ggml.KV().BlockCount()))
 	for i := range int(ggml.KV().BlockCount()) {
 		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
 			memoryLayer := blk.size()

 			// KV is proportional to the number of layers
 			memoryLayer += kv / ggml.KV().BlockCount()
+			layerSizes[i] = memoryLayer
+			memoryWeights += memoryLayer
+		}
+	}

-			memoryRequiredTotal += memoryLayer
-			if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredPartial+memoryLayer) {
-				memoryRequiredPartial += memoryLayer
+	// For all the layers, find where they can fit on the GPU(s)
+	for i := range layerSizes {
+		if layerSizes[i] == 0 {
+			continue
+		}
+		if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
+			// Stop allocating on GPU(s) once we hit the users target NumGPU
+			continue
+		}
+
+		// distribute the layers across the GPU(s) that have space
+		for j := len(gpusWithSpace); j > 0; j-- {
+			g := gpusWithSpace[i%j]
+			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
+			if g.g.FreeMemory > used+layerSizes[i] {
+				gpuAllocations[g.i] += layerSizes[i]
+				layerCounts[g.i]++
 				layerCount++
+				break
+			} else {
+				gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
 			}
 		}
 	}
+	if layerCount >= int(ggml.KV().BlockCount()) {
+		fullyLoaded = true
+	} else {
+		for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
+			overflow += layerSizes[i]
+		}
+	}
+
+	// Find where the output fits
+	if includeOutput && memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
+		for j := len(gpusWithSpace); j > 0; j-- {
+			g := gpusWithSpace[layerCount%j]
+			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
+			if g.g.FreeMemory > used+memoryLayerOutput {
+				gpuAllocations[g.i] += memoryLayerOutput
+				layerCounts[g.i]++
+				layerCount++
+				break
+			}
+		}
+		if layerCount < int(ggml.KV().BlockCount())+1 {
+			fullyLoaded = false
+			overflow += memoryLayerOutput
+		}
+	}

-	if gpus[0].Library != "metal" || !opts.UseMMap {
-		// memory was not preallocated for output tensors
-		memoryRequiredTotal += memoryLayerOutput
+	// Add the applicable (full or partial) graph allocations
+	for i := range gpus {
+		if layerCounts[i] <= 0 {
+			continue
+		}
+		if fullyLoaded {
+			gpuAllocations[i] += graphFullOffload
+		} else {
+			gpuAllocations[i] += graphPartialOffload
+		}
+	}
+	if fullyLoaded {
+		graphOffload = graphFullOffload
+	} else {
+		graphOffload = graphPartialOffload
 	}

-	if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredTotal) {
-		layerCount = int(ggml.KV().BlockCount()) + 1
-		memoryRequiredPartial = memoryRequiredTotal
+	// Summaries for the log
+	var memoryRequiredPartial, memoryRequiredTotal uint64
+	for i := range gpuAllocations {
+		memoryRequiredPartial += gpuAllocations[i]
 	}
+	memoryRequiredTotal = memoryRequiredPartial + overflow

-	memoryWeights := memoryRequiredTotal - memoryMinimum - graphFullOffload - kv
+	tensorSplit := ""
+	if len(gpus) > 1 {
+		splits := make([]string, len(gpus))
+		for i, count := range layerCounts {
+			splits[i] = strconv.Itoa(count)
+		}
+		tensorSplit = strings.Join(splits, ",")
+	}
+	allocationsList := []string{}
+	for _, a := range gpuAllocations {
+		allocationsList = append(allocationsList, format.HumanBytes2(a))
+	}

 	slog.Info(
 		"offload to gpu",
@@ -136,13 +276,17 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 			"layers",
 			// requested number of layers to offload
 			"requested", opts.NumGPU,
+			// The number of layers the model has (including output)
+			"model", int(ggml.KV().BlockCount())+1,
 			// estimated number of layers that can be offloaded
-			"real", layerCount,
+			"offload", layerCount,
+			// multi-gpu split for tesnors
+			"split", tensorSplit,
 		),
 		slog.Group(
 			"memory",
-			// memory available for offloading
-			"available", format.HumanBytes2(memoryAvailable),
+			// memory available by GPU for offloading
+			"available", availableList,
 			slog.Group(
 				"required",
 				// memory required for full offloading
@@ -151,6 +295,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 				"partial", format.HumanBytes2(memoryRequiredPartial),
 				// memory of KV cache
 				"kv", format.HumanBytes2(kv),
+				// Allocations across the GPUs
+				"allocations", allocationsList,
 			),
 			slog.Group(
 				"weights",
@@ -171,12 +317,31 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		),
 	)
 	if gpus[0].Library == "cpu" {
-		return 0, 0, memoryRequiredTotal
+		return MemoryEstimate{
+			Layers:    0,
+			Graph:     0,
+			VRAMSize:  0,
+			TotalSize: memoryRequiredTotal,
+			GPUSizes:  []uint64{},
+		}
 	}
-	if memoryRequiredPartial > memoryAvailable {
+	if layerCount == 0 {
 		slog.Debug("insufficient VRAM to load any model layers")
-		return 0, 0, memoryRequiredTotal
+		return MemoryEstimate{
+			Layers:    0,
+			Graph:     0,
+			VRAMSize:  0,
+			TotalSize: memoryRequiredTotal,
+			GPUSizes:  []uint64{},
+		}
 	}

-	return layerCount, memoryRequiredPartial, memoryRequiredTotal
+	return MemoryEstimate{
+		Layers:      layerCount,
+		Graph:       graphOffload,
+		VRAMSize:    memoryRequiredPartial,
+		TotalSize:   memoryRequiredTotal,
+		TensorSplit: tensorSplit,
+		GPUSizes:    gpuAllocations,
+	}
 }
package llm

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"os"
	"testing"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/envconfig"
	"github.com/ollama/ollama/gpu"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

func TestEstimateGPULayers(t *testing.T) {
	envconfig.Debug = true
	modelName := "dummy"
	f, err := os.CreateTemp(t.TempDir(), modelName)
	assert.Nil(t, err)
	defer f.Close()
	gguf := NewGGUFV3(binary.LittleEndian)
	inputLayerCount := 5
	tensors := []Tensor{
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
		{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
		{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
		{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
		{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
	}
	assert.Equal(t, inputLayerCount+1, len(tensors))
	err = gguf.Encode(f, KV{
		"general.architecture":          "llama",
		"general.name":                  "name",
		"llama.context_length":          uint32(32),
		"llama.embedding_length":        uint32(4096),
		"llama.block_count":             uint32(inputLayerCount),
		"llama.attention.head_count":    uint32(32),
		"llama.attention.head_count_kv": uint32(32),
		"tokenizer.ggml.tokens":         []string{" "},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
	}, tensors)
	require.NoError(t, err)

	ggml, err := LoadModel(f.Name())
	require.NoError(t, err)

	// Simple CPU scenario
	gpus := []gpu.GpuInfo{
		{
			Library: "cpu",
		},
	}
	projectors := []string{}
	opts := api.DefaultOptions()
	estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
	assert.Equal(t, 0, estimate.Layers)
	assert.Equal(t, uint64(0), estimate.Graph)

	// derived from the dummy ggml file above
	graphPartialOffload := uint64(202377216)
	graphFullOffload := uint64(171968512)
	layerSize := uint64(33554436)
	projectorSize := uint64(0)
	memoryLayerOutput := uint64(4)

	// Dual CUDA scenario with assymetry
	gpuMinimumMemory := uint64(2048)
	gpus = []gpu.GpuInfo{
		{
			Library:       "cuda",
			MinimumMemory: gpuMinimumMemory,
		},
		{
			Library:       "cuda",
			MinimumMemory: gpuMinimumMemory,
		},
	}
	// Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1
	for i, s := range [][]uint64{
		{1, 1, 1, 1},
		{2, 1, 2, 1},
		{2, 2, 2, 2},
		{1, 2, 1, 2},
		{3, 3, 3, 3},
		{4, 4, 3, 3},
		{6, 6, 3, 3},
		{0, 3, 0, 3},
	} {
		gpus[0].FreeMemory = 0
		gpus[1].FreeMemory = 0
		gpus[0].FreeMemory += projectorSize + memoryLayerOutput
		gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s[0]*layerSize + 1
		gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s[1]*layerSize + 1
		gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
		gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
		assert.Equal(t, int(s[2]+s[3]), estimate.Layers, "scenario %d: %v", i, s)
		assert.Equal(t, fmt.Sprintf("%d,%d", s[2], s[3]), estimate.TensorSplit, "scenario %d: %v", i, s)
		var layerSums uint64
		for _, b := range estimate.GPUSizes {
			layerSums += b
		}
		if estimate.Layers < inputLayerCount+1 {
			assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
			assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
		} else {
			assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
			assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
		}
	}
}
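As a sanity check on the fp16 KV-cache formula used in the estimate above (2 bytes each for K and V, per context position, per layer, per KV head of width n_embd/n_head), here is the arithmetic for one hypothetical model shape; the numbers are illustrative and not taken from the commit:

package main

import "fmt"

func main() {
	// Hypothetical shape: 2048 context, 32 layers, 4096 embedding,
	// 32 attention heads, 8 KV heads (grouped-query attention).
	var numCtx, blockCount, embedding, heads, headsKV uint64 = 2048, 32, 4096, 32, 8

	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
	kv := 2 * 2 * numCtx * blockCount * embedding / heads * headsKV
	fmt.Printf("%d bytes (%d MiB)\n", kv, kv/(1<<20)) // 268435456 bytes (256 MiB)
}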
@@ -49,13 +49,11 @@ type llmServer struct {
 	status  *StatusWriter
 	options api.Options

-	// TODO - this should be broken down by GPU
-	estimatedVRAM  uint64 // Estimated usage of VRAM by the loaded model
-	estimatedTotal uint64 // Total size of model
-	totalLayers    uint64
-	gpuCount       int
-	loadDuration   time.Duration // Record how long it took the model to load
-	loadProgress   float32
+	estimate     MemoryEstimate
+	totalLayers  uint64
+	gpuCount     int
+	loadDuration time.Duration // Record how long it took the model to load
+	loadProgress float32

 	sem *semaphore.Weighted
 }
@@ -80,8 +78,7 @@ func LoadModel(model string) (*GGML, error) {
 func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) {
 	var err error
 	var cpuRunner string
-	var estimatedVRAM uint64
-	var estimatedTotal uint64
+	var estimate MemoryEstimate
 	var systemMemory uint64
 	gpuCount := len(gpus)
 	if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {
@@ -89,7 +86,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		cpuRunner = serverForCpu()
 		gpuCount = 0
-		_, _, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
+		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
 	} else {
 		if gpus[0].Library == "metal" {
 			memInfo, err := gpu.GetCPUMem()
@@ -100,20 +97,19 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 				slog.Debug("system memory", "total", format.HumanBytes2(systemMemory))
 			}
 		}
-		var layers int
-		layers, estimatedVRAM, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
+		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)

 		switch {
-		case gpus[0].Library == "metal" && estimatedVRAM > systemMemory:
+		case gpus[0].Library == "metal" && estimate.VRAMSize > systemMemory:
 			// disable partial offloading when model is greater than total system memory as this
 			// can lead to locking up the system
 			opts.NumGPU = 0
-		case gpus[0].Library != "metal" && layers == 0:
+		case gpus[0].Library != "metal" && estimate.Layers == 0:
 			// Don't bother loading into the GPU if no layers can fit
 			cpuRunner = serverForCpu()
 			gpuCount = 0
-		case opts.NumGPU < 0 && layers > 0 && gpus[0].Library != "cpu":
-			opts.NumGPU = layers
+		case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
+			opts.NumGPU = estimate.Layers
 		}
 	}
@@ -232,6 +228,14 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))

+	if estimate.TensorSplit != "" {
+		params = append(params, "--tensor-split", estimate.TensorSplit)
+	}
+
+	if estimate.TensorSplit != "" {
+		params = append(params, "--tensor-split", estimate.TensorSplit)
+	}
+
 	for i := range len(servers) {
 		dir := availableServers[servers[i]]
 		if dir == "" {
@@ -299,16 +303,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	}

 	s := &llmServer{
-		port:           port,
-		cmd:            exec.Command(server, finalParams...),
-		status:         NewStatusWriter(os.Stderr),
-		options:        opts,
-		estimatedVRAM:  estimatedVRAM,
-		estimatedTotal: estimatedTotal,
-		sem:            semaphore.NewWeighted(int64(numParallel)),
-		totalLayers:    ggml.KV().BlockCount() + 1,
-		gpuCount:       gpuCount,
-		done:           make(chan error, 1),
+		port:        port,
+		cmd:         exec.Command(server, finalParams...),
+		status:      NewStatusWriter(os.Stderr),
+		options:     opts,
+		estimate:    estimate,
+		sem:         semaphore.NewWeighted(int64(numParallel)),
+		totalLayers: ggml.KV().BlockCount() + 1,
+		gpuCount:    gpuCount,
+		done:        make(chan error, 1),
 	}

 	s.cmd.Env = os.Environ()
@@ -1004,11 +1007,11 @@ func (s *llmServer) Close() error {
 }

 func (s *llmServer) EstimatedVRAM() uint64 {
-	return s.estimatedVRAM
+	return s.estimate.VRAMSize
 }

 func (s *llmServer) EstimatedTotal() uint64 {
-	return s.estimatedTotal
+	return s.estimate.TotalSize
 }

 func parseDurationMs(ms float64) time.Duration {
...
@@ -129,6 +129,7 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
 		"tokenizer.ggml.token_type": []int32{0},
 	}, []llm.Tensor{
 		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
 	})
 	require.NoError(t, err)
...
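For reference, the --tensor-split value wired up in the server changes above is just the per-GPU layer counts joined with commas, mirroring the strconv/strings logic in the estimate. A small illustration with hypothetical counts:

package main

import (
	"fmt"
	"strconv"
	"strings"
)

func main() {
	// Hypothetical per-GPU layer counts from an estimate: 21 layers on GPU 0, 12 on GPU 1.
	layerCounts := []int{21, 12}
	splits := make([]string, len(layerCounts))
	for i, count := range layerCounts {
		splits[i] = strconv.Itoa(count)
	}
	fmt.Println("--tensor-split", strings.Join(splits, ",")) // --tensor-split 21,12
}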