Commit d5a0d8d9 authored by Jesse Gross, committed by Jesse Gross

llm: New memory management

This changes the memory allocation strategy from upfront estimation to
tracking actual allocations done by the engine and reacting to that. The
goal is to avoid issues caused by both under-estimation (crashing) and
over-estimation (low performance due to under-utilized GPUs).

It is currently opt-in and can be enabled for models running on the
Ollama engine by setting OLLAMA_NEW_ESTIMATES=1. All other configurations
are unchanged and continue to use the existing estimates.
parent ef7d26ba
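
For reference, a minimal sketch of opting in (illustrative only; the exact call site that consults these flags is not part of the hunks below). It assumes the `envconfig` accessors shown in this commit, `NewEngine()` and `NewMemoryEstimates()`:

```go
// Illustrative sketch only: how the new path might be gated on the flags
// defined in envconfig. The exact call site is not shown in this commit, and
// the new-engine requirement can also be satisfied per model rather than via
// OLLAMA_NEW_ENGINE.
package main

import (
	"fmt"

	"github.com/ollama/ollama/envconfig"
)

func main() {
	// e.g. OLLAMA_NEW_ENGINE=1 OLLAMA_NEW_ESTIMATES=1 ollama serve
	if envconfig.NewEngine() && envconfig.NewMemoryEstimates() {
		fmt.Println("tracking engine allocations instead of upfront estimates")
	} else {
		fmt.Println("using the existing upfront estimates")
	}
}
```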
...@@ -97,6 +97,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { ...@@ -97,6 +97,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
return a < b return a < b
}) })
gpuCount := 0 gpuCount := 0
gpuOrdinalID := 0
for _, match := range matches { for _, match := range matches {
slog.Debug("evaluating amdgpu node " + match) slog.Debug("evaluating amdgpu node " + match)
fp, err := os.Open(match) fp, err := os.Open(match)
...@@ -187,10 +188,6 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { ...@@ -187,10 +188,6 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
continue continue
} }
// Keep track of numeric IDs based on valid GPUs
gpuID := gpuCount
gpuCount += 1
// Look up the memory for the current node // Look up the memory for the current node
totalMemory := uint64(0) totalMemory := uint64(0)
usedMemory := uint64(0) usedMemory := uint64(0)
...@@ -269,7 +266,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { ...@@ -269,7 +266,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
if uniqueID != 0 { if uniqueID != 0 {
ID = fmt.Sprintf("GPU-%016x", uniqueID) ID = fmt.Sprintf("GPU-%016x", uniqueID)
} else { } else {
ID = strconv.Itoa(gpuID) ID = strconv.Itoa(gpuOrdinalID)
} }
gpuInfo := RocmGPUInfo{ gpuInfo := RocmGPUInfo{
...@@ -287,13 +284,40 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { ...@@ -287,13 +284,40 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
DriverMinor: driverMinor, DriverMinor: driverMinor,
}, },
usedFilepath: usedFile, usedFilepath: usedFile,
index: gpuID, index: gpuCount,
} }
// Keep track of numeric IDs based on valid GPUs
gpuCount += 1
// If the user wants to filter to a subset of devices, filter out if we aren't a match
if len(visibleDevices) > 0 {
include := false
for _, visible := range visibleDevices {
if (uniqueID != 0 && visible == gpuInfo.ID) || visible == strconv.Itoa(gpuInfo.index) {
include = true
break
}
}
if !include {
reason := "filtering out device per user request"
slog.Info(reason, "id", gpuInfo.ID, "index", gpuInfo.index, "visible_devices", visibleDevices)
unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
GpuInfo: gpuInfo.GpuInfo,
Reason: reason,
})
continue
}
}
// Ordinal IDs are based on the visible GPUs
gpuOrdinalID += 1
// iGPU detection, remove this check once we can support an iGPU variant of the rocm library // iGPU detection, remove this check once we can support an iGPU variant of the rocm library
if totalMemory < IGPUMemLimit { if totalMemory < IGPUMemLimit {
reason := "unsupported Radeon iGPU detected skipping" reason := "unsupported Radeon iGPU detected skipping"
slog.Info(reason, "id", gpuID, "total", format.HumanBytes2(totalMemory)) slog.Info(reason, "id", gpuInfo.ID, "total", format.HumanBytes2(totalMemory))
unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{ unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
GpuInfo: gpuInfo.GpuInfo, GpuInfo: gpuInfo.GpuInfo,
Reason: reason, Reason: reason,
...@@ -306,7 +330,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { ...@@ -306,7 +330,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
} }
if int(major) < minVer { if int(major) < minVer {
reason := fmt.Sprintf("amdgpu too old gfx%d%x%x", major, minor, patch) reason := fmt.Sprintf("amdgpu too old gfx%d%x%x", major, minor, patch)
slog.Warn(reason, "gpu", gpuID) slog.Warn(reason, "gpu", gpuInfo.ID)
unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{ unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
GpuInfo: gpuInfo.GpuInfo, GpuInfo: gpuInfo.GpuInfo,
Reason: reason, Reason: reason,
...@@ -315,29 +339,8 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { ...@@ -315,29 +339,8 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
continue continue
} }
slog.Debug("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory)) slog.Debug("amdgpu memory", "gpu", gpuInfo.ID, "total", format.HumanBytes2(totalMemory))
slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory)) slog.Debug("amdgpu memory", "gpu", gpuInfo.ID, "available", format.HumanBytes2(totalMemory-usedMemory))
// If the user wants to filter to a subset of devices, filter out if we aren't a match
if len(visibleDevices) > 0 {
include := false
for _, visible := range visibleDevices {
if visible == gpuInfo.ID || visible == strconv.Itoa(gpuInfo.index) {
include = true
break
}
}
if !include {
reason := "filtering out device per user request"
slog.Info(reason, "id", gpuInfo.ID, "visible_devices", visibleDevices)
unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
GpuInfo: gpuInfo.GpuInfo,
Reason: reason,
})
continue
}
}
// Final validation is gfx compatibility - load the library if we haven't already loaded it // Final validation is gfx compatibility - load the library if we haven't already loaded it
// even if the user overrides, we still need to validate the library // even if the user overrides, we still need to validate the library
......
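
This hunk moves the visible-device filter ahead of ordinal-ID assignment, so numeric IDs now count only the GPUs that survive filtering, while `index` still reflects the position among all valid nodes. A standalone sketch of the resulting numbering (hypothetical data, not the real discovery code):

```go
// Standalone sketch of the new numbering: with a filter such as
// ROCR_VISIBLE_DEVICES=1, the surviving GPU keeps index 1 but is reported
// with ordinal ID "0".
package main

import (
	"fmt"
	"slices"
	"strconv"
)

func main() {
	visibleDevices := []string{"1"} // hypothetical filter
	gpuOrdinalID := 0
	for gpuCount := 0; gpuCount < 2; gpuCount++ {
		if len(visibleDevices) > 0 &&
			!slices.Contains(visibleDevices, strconv.Itoa(gpuCount)) {
			continue // filtered out: consumes an index, not an ordinal
		}
		fmt.Printf("index=%d id=%q\n", gpuCount, strconv.Itoa(gpuOrdinalID))
		gpuOrdinalID++
	}
	// Output: index=1 id="0"
}
```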
...@@ -185,6 +185,8 @@ var ( ...@@ -185,6 +185,8 @@ var (
ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096) ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
// Auth enables authentication between the Ollama client and server // Auth enables authentication between the Ollama client and server
UseAuth = Bool("OLLAMA_AUTH") UseAuth = Bool("OLLAMA_AUTH")
// Enable the new memory estimation logic
NewMemoryEstimates = Bool("OLLAMA_NEW_ESTIMATES")
) )
func String(s string) func() string { func String(s string) func() string {
...@@ -270,6 +272,7 @@ func AsMap() map[string]EnvVar { ...@@ -270,6 +272,7 @@ func AsMap() map[string]EnvVar {
"OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"}, "OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
"OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4096)"}, "OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4096)"},
"OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"}, "OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
"OLLAMA_NEW_ESTIMATES": {"OLLAMA_NEW_ESTIMATES", NewMemoryEstimates(), "Enable the new memory estimation logic"},
// Informational // Informational
"HTTP_PROXY": {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"}, "HTTP_PROXY": {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},
......
...@@ -480,6 +480,8 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) { ...@@ -480,6 +480,8 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
} }
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) { func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
context *= uint64(numParallel)
embedding := f.KV().EmbeddingLength() embedding := f.KV().EmbeddingLength()
heads := f.KV().HeadCountMax() heads := f.KV().HeadCountMax()
headsKV := f.KV().HeadCountKVMax() headsKV := f.KV().HeadCountKVMax()
......
...@@ -62,6 +62,22 @@ func BackendInit() { ...@@ -62,6 +62,22 @@ func BackendInit() {
C.llama_backend_init() C.llama_backend_init()
} }
func EnumerateGPUs() []string {
var ids []string
for i := range C.ggml_backend_dev_count() {
device := C.ggml_backend_dev_get(i)
if C.ggml_backend_dev_type(device) == C.GGML_BACKEND_DEVICE_TYPE_GPU {
var props C.struct_ggml_backend_dev_props
C.ggml_backend_dev_get_props(device, &props)
ids = append(ids, C.GoString(props.id))
}
}
return ids
}
func GetModelArch(modelPath string) (string, error) { func GetModelArch(modelPath string) (string, error) {
mp := C.CString(modelPath) mp := C.CString(modelPath)
defer C.free(unsafe.Pointer(mp)) defer C.free(unsafe.Pointer(mp))
......
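
`EnumerateGPUs` reports the ggml device IDs of all GPU backends; the runner's new `/load` handler (later in this commit) matches those IDs against the requested `GPULayers` to derive a tensor split. A hedged sketch of that use, assuming the repo's `llama` and `ml` packages:

```go
// Sketch of the intended use (it mirrors the /load handler added later in
// this commit): map ggml device IDs onto per-GPU layer counts.
package main

import (
	"fmt"

	"github.com/ollama/ollama/llama"
	"github.com/ollama/ollama/ml"
)

func main() {
	llama.BackendInit()
	gpuIDs := llama.EnumerateGPUs()

	// Hypothetical assignment: three layers on the first reported GPU.
	var gpuLayers ml.GPULayersList
	if len(gpuIDs) > 0 {
		gpuLayers = ml.GPULayersList{{ID: gpuIDs[0], Layers: []int{0, 1, 2}}}
	}

	tensorSplit := make([]float32, len(gpuIDs))
	numGPU := 0
	for i := range gpuIDs {
		for _, layers := range gpuLayers {
			if gpuIDs[i] == layers.ID {
				tensorSplit[i] = float32(len(layers.Layers))
				numGPU += len(layers.Layers)
			}
		}
	}
	fmt.Println("tensor split:", tensorSplit, "offloaded layers:", numGPU)
}
```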
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com>
Date: Sun, 22 Jun 2025 09:22:05 -0700
Subject: [PATCH] temporary prevent rocm+cuda mixed loading
---
ggml/src/ggml-backend-reg.cpp | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 3040b2aa..f1e9c180 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -581,8 +581,16 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
ggml_backend_load_best("blas", silent, dir_path);
ggml_backend_load_best("cann", silent, dir_path);
- ggml_backend_load_best("cuda", silent, dir_path);
- ggml_backend_load_best("hip", silent, dir_path);
+
+ // Avoid mixed hip+cuda configurations
+ const char * hip_devices = std::getenv("HIP_VISIBLE_DEVICES");
+ const char * rocr_devices = std::getenv("ROCR_VISIBLE_DEVICES");
+ if (!hip_devices && !rocr_devices) {
+ ggml_backend_load_best("cuda", silent, dir_path);
+ } else {
+ ggml_backend_load_best("hip", silent, dir_path);
+ }
+
ggml_backend_load_best("metal", silent, dir_path);
ggml_backend_load_best("rpc", silent, dir_path);
ggml_backend_load_best("sycl", silent, dir_path);
...@@ -4,7 +4,7 @@ import ( ...@@ -4,7 +4,7 @@ import (
"fmt" "fmt"
"log/slog" "log/slog"
"os" "os"
"strconv" "sort"
"strings" "strings"
"github.com/ollama/ollama/api" "github.com/ollama/ollama/api"
...@@ -14,13 +14,79 @@ import ( ...@@ -14,13 +14,79 @@ import (
"github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/fs/ggml"
) )
// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits
// The list of GPUs returned will always be the same brand (library)
// If the model can not be fit fully within the available GPU(s) nil is returned
func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []string, adapters []string, opts api.Options, gpus discover.GpuInfoList, numParallel int) discover.GpuInfoList {
for _, gl := range gpus.ByLibrary() {
sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)
// TODO - potentially sort by performance capability, existing models loaded, etc.
// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
// Note: at present, this will favor most current available VRAM descending and ignoring faster GPU speed in mixed setups
sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))
if !envconfig.SchedSpread() {
// Try to pack into as few GPUs as possible, starting from 1 GPU
for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
gpuSubset := sgl[:numGPUs]
ok, estimatedVRAM := PredictServerFit(gpuSubset, f, adapters, projectors, opts, numParallel)
if ok {
slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
"model", modelPath,
"library", sgl[0].Library,
"parallel", numParallel,
"required", format.HumanBytes2(estimatedVRAM),
"gpus", numGPUs)
return gpuSubset
}
}
} else {
// TODO future refinements
// - if multiple Libraries, see if any single GPU in any Library will fit
// - try subsets of GPUs instead of just falling back to 1 or all in a family
// Now try all the GPUS (OLLAMA_SCHED_SPREAD is set)
if ok, estimatedVRAM := PredictServerFit(sgl, f, adapters, projectors, opts, numParallel); ok {
slog.Info("new model will fit in available VRAM, loading",
"model", modelPath,
"library", sgl[0].Library,
"parallel", numParallel,
"required", format.HumanBytes2(estimatedVRAM),
"gpus", len(sgl))
return sgl
}
}
}
return nil
}
// If multiple Libraries are detected, pick the Library which loads the most layers for the model
func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []string, opts api.Options, gpus discover.GpuInfoList, numParallel int) discover.GpuInfoList {
byLibrary := gpus.ByLibrary()
if len(byLibrary) <= 1 {
return gpus
}
var bestEstimate uint64
var bestFit int
for i, gl := range byLibrary {
_, estimatedVRAM := PredictServerFit(gl, f, adapters, projectors, opts, numParallel)
if estimatedVRAM > bestEstimate {
bestEstimate = estimatedVRAM
bestFit = i
}
}
return byLibrary[bestFit]
}
// This algorithm looks for a complete fit to determine if we need to unload other models // This algorithm looks for a complete fit to determine if we need to unload other models
func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) { func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
// Split up the GPUs by type and try them // Split up the GPUs by type and try them
var estimatedVRAM uint64 var estimatedVRAM uint64
for _, gpus := range allGpus.ByLibrary() { for _, gpus := range allGpus.ByLibrary() {
var layerCount int var layerCount int
estimate := EstimateGPULayers(gpus, f, projectors, opts, numParallel) estimate := estimateGPULayers(gpus, f, projectors, opts, numParallel)
layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
if opts.NumGPU < 0 { if opts.NumGPU < 0 {
if layerCount > 0 && layerCount >= int(f.KV().BlockCount()+1) { if layerCount > 0 && layerCount >= int(f.KV().BlockCount()+1) {
...@@ -49,7 +115,7 @@ type MemoryEstimate struct { ...@@ -49,7 +115,7 @@ type MemoryEstimate struct {
TotalSize uint64 TotalSize uint64
// For multi-GPU scenarios, this provides the tensor split parameter // For multi-GPU scenarios, this provides the tensor split parameter
TensorSplit string TensorSplit []int
// For multi-GPU scenarios, this is the size in bytes per GPU // For multi-GPU scenarios, this is the size in bytes per GPU
GPUSizes []uint64 GPUSizes []uint64
...@@ -71,7 +137,7 @@ type MemoryEstimate struct { ...@@ -71,7 +137,7 @@ type MemoryEstimate struct {
// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
// The GPUs provided must all be the same Library // The GPUs provided must all be the same Library
func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate { func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate {
// Graph size for a partial offload, applies to all GPUs // Graph size for a partial offload, applies to all GPUs
var graphPartialOffload uint64 var graphPartialOffload uint64
...@@ -112,13 +178,9 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin ...@@ -112,13 +178,9 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
for _, projector := range projectors { for _, projector := range projectors {
llamaEngineProjectorWeights += projectorMemoryRequirements(projector) llamaEngineProjectorWeights += projectorMemoryRequirements(projector)
// multimodal models require at least 2048 context
opts.NumCtx = max(opts.NumCtx, 2048)
} }
if llamaEngineProjectorWeights == 0 { if llamaEngineProjectorWeights == 0 {
ollamaEngineProjectorWeights, ollamaEngineProjectorGraph = f.VisionGraphSize() ollamaEngineProjectorWeights, ollamaEngineProjectorGraph = f.VisionGraphSize()
opts.NumCtx = max(opts.NumCtx, 2048)
} }
layers := f.Tensors().GroupLayers() layers := f.Tensors().GroupLayers()
...@@ -184,7 +246,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin ...@@ -184,7 +246,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer // Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
var layerCount int var layerCount int
layerCounts := make([]int, len(gpus)) tensorSplit := make([]int, len(gpus))
gpuAllocations := make([]uint64, len(gpus)) gpuAllocations := make([]uint64, len(gpus))
type gs struct { type gs struct {
i int i int
...@@ -248,7 +310,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin ...@@ -248,7 +310,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload) used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
if g.g.FreeMemory > overhead+used+layerSize { if g.g.FreeMemory > overhead+used+layerSize {
gpuAllocations[g.i] += layerSize gpuAllocations[g.i] += layerSize
layerCounts[g.i]++ tensorSplit[g.i]++
layerCount++ layerCount++
break break
} else { } else {
...@@ -273,7 +335,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin ...@@ -273,7 +335,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload) used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
if g.g.FreeMemory > overhead+used+memoryLastLayer { if g.g.FreeMemory > overhead+used+memoryLastLayer {
gpuAllocations[g.i] += memoryLastLayer gpuAllocations[g.i] += memoryLastLayer
layerCounts[g.i]++ tensorSplit[g.i]++
layerCount++ layerCount++
break break
} }
...@@ -288,7 +350,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin ...@@ -288,7 +350,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
// Add the applicable (full or partial) graph allocations // Add the applicable (full or partial) graph allocations
for i := range gpus { for i := range gpus {
if layerCounts[i] <= 0 { if tensorSplit[i] <= 0 {
continue continue
} }
if fullyLoaded { if fullyLoaded {
...@@ -310,14 +372,6 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin ...@@ -310,14 +372,6 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
} }
memoryRequiredTotal = memoryRequiredPartial + overflow memoryRequiredTotal = memoryRequiredPartial + overflow
tensorSplit := ""
if len(gpus) > 1 {
splits := make([]string, len(gpus))
for i, count := range layerCounts {
splits[i] = strconv.Itoa(count)
}
tensorSplit = strings.Join(splits, ",")
}
allocationsList := []string{} allocationsList := []string{}
for _, a := range gpuAllocations { for _, a := range gpuAllocations {
allocationsList = append(allocationsList, format.HumanBytes2(a)) allocationsList = append(allocationsList, format.HumanBytes2(a))
......
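
Note that `MemoryEstimate.TensorSplit` is now a per-GPU layer count (`[]int`) instead of a preformatted string. If a caller still needs the old comma-separated form, it can be derived; the helper below is hypothetical and not part of this commit:

```go
// Hypothetical helper: render the new []int tensor split in the old "2,1"
// comma-separated form (only meaningful for more than one GPU).
package main

import (
	"fmt"
	"strconv"
	"strings"
)

func tensorSplitString(split []int) string {
	if len(split) < 2 {
		return ""
	}
	parts := make([]string, len(split))
	for i, count := range split {
		parts[i] = strconv.Itoa(count)
	}
	return strings.Join(parts, ",")
}

func main() {
	fmt.Println(tensorSplitString([]int{2, 1})) // "2,1"
}
```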
...@@ -61,7 +61,7 @@ func TestEstimateGPULayers(t *testing.T) { ...@@ -61,7 +61,7 @@ func TestEstimateGPULayers(t *testing.T) {
projectors := []string{} projectors := []string{}
opts := api.DefaultOptions() opts := api.DefaultOptions()
t.Run("cpu", func(t *testing.T) { t.Run("cpu", func(t *testing.T) {
estimate := EstimateGPULayers(gpus, ggml, projectors, opts, 1) estimate := estimateGPULayers(gpus, ggml, projectors, opts, 1)
assert.Equal(t, 0, estimate.Layers) assert.Equal(t, 0, estimate.Layers)
assert.Equal(t, uint64(0), estimate.Graph) assert.Equal(t, uint64(0), estimate.Graph)
}) })
...@@ -88,7 +88,7 @@ func TestEstimateGPULayers(t *testing.T) { ...@@ -88,7 +88,7 @@ func TestEstimateGPULayers(t *testing.T) {
// Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1 // Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1
for i, s := range []struct { for i, s := range []struct {
layer0, layer1 uint64 layer0, layer1 uint64
expect0, expect1 uint64 expect0, expect1 int
}{ }{
{1, 1, 1, 1}, {1, 1, 1, 1},
{2, 1, 2, 1}, {2, 1, 2, 1},
...@@ -112,9 +112,9 @@ func TestEstimateGPULayers(t *testing.T) { ...@@ -112,9 +112,9 @@ func TestEstimateGPULayers(t *testing.T) {
gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1 gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1
gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload) gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload) gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
estimate := EstimateGPULayers(gpus, ggml, projectors, opts, 1) estimate := estimateGPULayers(gpus, ggml, projectors, opts, 1)
assert.Equal(t, int(s.expect0+s.expect1), estimate.Layers, "scenario %d: %v", i, s) assert.Equal(t, s.expect0+s.expect1, estimate.Layers, "scenario %d: %v", i, s)
assert.Equal(t, fmt.Sprintf("%d,%d", s.expect0, s.expect1), estimate.TensorSplit, "scenario %d: %v", i, s) assert.Equal(t, []int{s.expect0, s.expect1}, estimate.TensorSplit, "scenario %d: %v", i, s)
var layerSums uint64 var layerSums uint64
for _, b := range estimate.GPUSizes { for _, b := range estimate.GPUSizes {
layerSums += b layerSums += b
......
This diff is collapsed.
...@@ -8,9 +8,178 @@ import ( ...@@ -8,9 +8,178 @@ import (
"testing" "testing"
"github.com/ollama/ollama/api" "github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/ml"
"golang.org/x/sync/semaphore" "golang.org/x/sync/semaphore"
) )
func TestLLMServerFitGPU(t *testing.T) {
type gpu struct {
library string
free int
}
tests := []struct {
name string
gpus []gpu
layers []int
numGPU int
requireFull bool
expected ml.GPULayersList
expectedErr error
}{
{
name: "No GPU",
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{},
},
{
name: "Full single GPU",
gpus: []gpu{{free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{0, 1, 2}}},
},
{
name: "Partial single GPU",
gpus: []gpu{{free: 256 * format.MebiByte}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{1, 2}}},
},
{
name: "Single GPU with numGPU 1",
gpus: []gpu{{free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 1,
expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{1}}},
},
{
name: "Single GPU with numGPU 0",
gpus: []gpu{{free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 0,
expected: ml.GPULayersList{},
},
{
name: "Single GPU with numGPU 999",
gpus: []gpu{{free: 256 * format.MebiByte}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: 999,
expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{0, 1, 2, 3}}},
},
{
name: "Multi GPU fits on one",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1, 2}}},
},
{
name: "Multi GPU split",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0}}, {ID: "gpu0", Layers: []int{1, 2}}},
},
{
name: "Multi GPU partial",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{1}}},
},
{
name: "Multi GPU numGPU 1",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 1,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{1}}},
},
{
name: "Multi GPU numGPU 2",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 2,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0}}, {ID: "gpu0", Layers: []int{1}}},
},
{
name: "Multi GPU numGPU 999",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
numGPU: 999,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1}}, {ID: "gpu0", Layers: []int{2}}},
},
{
name: "Multi GPU different libraries",
gpus: []gpu{{library: "cuda", free: 128 * format.MebiByte}, {library: "rocm", free: 256 * format.MebiByte}},
layers: []int{128 * format.MebiByte, 128 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1}}},
},
{
name: "requireFull",
gpus: []gpu{{free: 256 * format.MebiByte}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: -1,
requireFull: true,
expectedErr: ErrLoadRequiredFull,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
var systemInfo discover.SystemInfo
systemInfo.System.TotalMemory = format.GibiByte
systemInfo.System.FreeMemory = 512 * format.MebiByte
systemInfo.System.FreeSwap = 256 * format.MebiByte
gpus := make(discover.GpuInfoList, len(tt.gpus))
for i := range tt.gpus {
gpus[i].ID = fmt.Sprintf("gpu%d", i)
gpus[i].Library = tt.gpus[i].library
gpus[i].FreeMemory = uint64(tt.gpus[i].free)
}
s := &ollamaServer{
llmServer: llmServer{
totalLayers: uint64(len(tt.layers)),
options: api.Options{
Runner: api.Runner{
NumGPU: tt.numGPU,
},
},
},
}
s.mem = &ml.BackendMemory{CPU: ml.DeviceMemory{
Weights: make([]ml.Memory, s.totalLayers),
Cache: make([]ml.Memory, s.totalLayers),
}, GPUs: make([]ml.DeviceMemory, len(gpus))}
for i := range tt.layers {
s.mem.CPU.Weights[i].Size = uint64(tt.layers[i])
}
for i := range s.mem.GPUs {
s.mem.GPUs[i].ID = fmt.Sprintf("gpu%d", i)
s.mem.GPUs[i].Weights = make([]ml.Memory, s.totalLayers)
s.mem.GPUs[i].Cache = make([]ml.Memory, s.totalLayers)
}
gpuLayers, err := s.createLayout(systemInfo, gpus, s.mem, tt.requireFull, 0)
if err != tt.expectedErr {
t.Fatalf("fitGPU returned error: %v", err)
}
if gpuLayers.Hash() != tt.expected.Hash() {
t.Errorf("fitGPU assigned %v, want %v", gpuLayers, tt.expected)
}
})
}
}
func TestLLMServerCompletionFormat(t *testing.T) { func TestLLMServerCompletionFormat(t *testing.T) {
// This test was written to fix an already deployed issue. It is a bit // This test was written to fix an already deployed issue. It is a bit
// of a mess, and but it's good enough, until we can refactoring the // of a mess, and but it's good enough, until we can refactoring the
......
...@@ -5,12 +5,14 @@ import ( ...@@ -5,12 +5,14 @@ import (
"context" "context"
"encoding/binary" "encoding/binary"
"fmt" "fmt"
"hash/maphash"
"log/slog" "log/slog"
"math" "math"
"slices" "slices"
"strconv" "strconv"
"strings" "strings"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs" "github.com/ollama/ollama/fs"
) )
...@@ -58,19 +60,89 @@ type CacheConfig struct { ...@@ -58,19 +60,89 @@ type CacheConfig struct {
MaskBatchPadding int MaskBatchPadding int
} }
// GPULayers is a set of layers to be allocated on a single GPU
type GPULayers struct {
// ID is the identifier of the GPU, as reported in DeviceMemory
ID string
// Layers is a set of layer indicies to load
Layers []int
}
func (g GPULayers) String() string {
if len(g.Layers) == 0 {
return ""
}
slices.Sort(g.Layers)
contiguous := true
base := g.Layers[0]
for i := range g.Layers {
if g.Layers[i] != base+i {
contiguous = false
break
}
}
if contiguous {
return fmt.Sprintf("ID:%v Layers:%v(%v..%v)", g.ID, len(g.Layers), g.Layers[0], g.Layers[len(g.Layers)-1])
} else {
return fmt.Sprintf("ID:%v Layers:%v%v", g.ID, len(g.Layers), g.Layers)
}
}
// GPULayersList is a set of layer allocations across multiple GPUs
type GPULayersList []GPULayers
func (l GPULayersList) String() string {
if l.Sum() > 0 {
return fmt.Sprintf("%v%v", l.Sum(), []GPULayers(l))
} else {
return fmt.Sprintf("%v", []GPULayers(l))
}
}
// Sum is the total number of layers assigned across all GPUs
func (l GPULayersList) Sum() int {
var sum int
for _, g := range l {
sum += len(g.Layers)
}
return sum
}
var h maphash.Hash
// Hash is an identifier of this layer assignment
func (l GPULayersList) Hash() uint64 {
h.Reset()
for _, g := range l {
if len(g.Layers) > 0 {
h.WriteString(g.ID)
for _, l := range g.Layers {
binary.Write(&h, binary.NativeEndian, int64(l))
}
}
}
return h.Sum64()
}
// BackendParams controls how the backend loads and executes models // BackendParams controls how the backend loads and executes models
type BackendParams struct { type BackendParams struct {
// AllocMemory causes the backend to allocate memory for the model. If
// false, this is only being used for discovering the required amount of
// memory and cannot load the model for running.
AllocMemory bool
// NumThreads sets the number of threads to use if running on the CPU // NumThreads sets the number of threads to use if running on the CPU
NumThreads int NumThreads int
// MainGPU is the index of the primary GPU to use // GPULayers is the set of layers to offload to GPUs
MainGPU int GPULayers GPULayersList
// NumGPULayers is the number of layers to offload to GPUs
NumGPULayers int
// TensorSplit is the fraction of the model to offload to each GPU
TensorSplit []float32
// FlashAttention indicates that we should use a fused flash attention kernel // FlashAttention indicates that we should use a fused flash attention kernel
FlashAttention bool FlashAttention bool
...@@ -141,6 +213,28 @@ type DeviceMemory struct { ...@@ -141,6 +213,28 @@ type DeviceMemory struct {
Graph Memory Graph Memory
} }
// Allocated returns the total size of the memory that has been successfully
// allocated on this device
func (m DeviceMemory) Allocated() uint64 {
var mem uint64
for _, w := range m.Weights {
if w.Status == Allocated {
mem += w.Size
}
}
for _, c := range m.Cache {
if c.Status == Allocated {
mem += c.Size
}
}
if m.Graph.Status == Allocated {
mem += m.Graph.Size
}
return mem
}
func memoryPresent(mem []Memory) bool { func memoryPresent(mem []Memory) bool {
return slices.ContainsFunc(mem, func(m Memory) bool { return m.Size != 0 }) return slices.ContainsFunc(mem, func(m Memory) bool { return m.Size != 0 })
} }
...@@ -197,6 +291,58 @@ func (m BackendMemory) LogValue() slog.Value { ...@@ -197,6 +291,58 @@ func (m BackendMemory) LogValue() slog.Value {
return slog.GroupValue(attrs...) return slog.GroupValue(attrs...)
} }
func sumMemory(mem []Memory) uint64 {
var sum uint64
for _, m := range mem {
sum += m.Size
}
return sum
}
// Log prints a high level summary of the memory (allocated or not)
func (m BackendMemory) Log(level slog.Level) {
var total uint64
for _, gpu := range m.GPUs {
if sum := sumMemory(gpu.Weights); sum > 0 {
slog.Log(context.TODO(), level, "model weights", "device", gpu.Name, "size", format.HumanBytes2(sum))
total += sum
}
}
if sum := m.InputWeights.Size + sumMemory(m.CPU.Weights); sum > 0 {
slog.Log(context.TODO(), level, "model weights", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
total += sum
}
for _, gpu := range m.GPUs {
if sum := sumMemory(gpu.Cache); sum > 0 {
slog.Log(context.TODO(), level, "kv cache", "device", gpu.Name, "size", format.HumanBytes2(sum))
total += sum
}
}
if sum := sumMemory(m.CPU.Cache); sum > 0 {
slog.Log(context.TODO(), level, "kv cache", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
total += sum
}
for _, gpu := range m.GPUs {
if sum := gpu.Graph.Size; sum > 0 {
slog.Log(context.TODO(), level, "compute graph", "device", gpu.Name, "size", format.HumanBytes2(sum))
total += sum
}
}
if sum := m.CPU.Graph.Size; sum > 0 {
slog.Log(context.TODO(), level, "compute graph", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
total += sum
}
if total > 0 {
slog.Log(context.TODO(), level, "total memory", "size", format.HumanBytes2(total))
}
}
var backends = make(map[string]func(string, BackendParams) (Backend, error)) var backends = make(map[string]func(string, BackendParams) (Backend, error))
func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) { func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) {
......
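
A minimal usage sketch for the new `GPULayers`/`GPULayersList` types (the layer assignment here is made up):

```go
// Minimal usage sketch of the layer-assignment types added above.
package main

import (
	"fmt"

	"github.com/ollama/ollama/ml"
)

func main() {
	layout := ml.GPULayersList{
		{ID: "GPU-0", Layers: []int{0, 1, 2}},
		{ID: "GPU-1", Layers: []int{3, 4}},
	}
	fmt.Println(layout)        // e.g. 5[ID:GPU-0 Layers:3(0..2) ID:GPU-1 Layers:2(3..4)]
	fmt.Println(layout.Sum())  // 5
	fmt.Println(layout.Hash()) // stable identifier for this particular assignment
}
```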
...@@ -10,6 +10,7 @@ import "C" ...@@ -10,6 +10,7 @@ import "C"
import ( import (
"context" "context"
"errors"
"fmt" "fmt"
"io" "io"
"log/slog" "log/slog"
...@@ -62,12 +63,21 @@ var initDevices = sync.OnceFunc(func() { ...@@ -62,12 +63,21 @@ var initDevices = sync.OnceFunc(func() {
} }
}) })
type layerDevice struct {
d C.ggml_backend_dev_t
bt C.ggml_backend_buffer_type_t
}
type Backend struct { type Backend struct {
// modelPath is the location of the model data // modelPath is the location of the model data
modelPath string modelPath string
meta *fsggml.GGML meta *fsggml.GGML
// allocMemory means that memory should be allocated for tensors and not
// just a dry run
allocMemory bool
// tensorLoadTargets maps from the name of the tensor in the file // tensorLoadTargets maps from the name of the tensor in the file
// to the name that is used by the model definition // to the name that is used by the model definition
tensorLoadTargets map[string][]string tensorLoadTargets map[string][]string
...@@ -78,11 +88,14 @@ type Backend struct { ...@@ -78,11 +88,14 @@ type Backend struct {
tensors map[string]*C.struct_ggml_tensor tensors map[string]*C.struct_ggml_tensor
// input is the backend used for inputs // input is the backend buffer type used for inputs
input C.ggml_backend_buffer_type_t input C.ggml_backend_buffer_type_t
// output is the backend device used for outputs
output C.ggml_backend_dev_t
// layers is the backend used for repeating layers // layers is the backend used for repeating layers
layers map[int]C.ggml_backend_buffer_type_t layers map[int]layerDevice
// requiredMemory is the cumulative memory allocations needed by the backend // requiredMemory is the cumulative memory allocations needed by the backend
requiredMemory *ml.BackendMemory requiredMemory *ml.BackendMemory
...@@ -99,6 +112,8 @@ type Backend struct { ...@@ -99,6 +112,8 @@ type Backend struct {
weightBuffers map[*C.struct_ggml_context]C.ggml_backend_buffer_t weightBuffers map[*C.struct_ggml_context]C.ggml_backend_buffer_t
} }
var once sync.Once
func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
r, err := os.Open(modelPath) r, err := os.Open(modelPath)
if err != nil { if err != nil {
...@@ -111,15 +126,17 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { ...@@ -111,15 +126,17 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
return nil, err return nil, err
} }
slog.Info( once.Do(func() {
"", slog.Info(
"architecture", meta.KV().Architecture(), "",
"file_type", meta.KV().FileType(), "architecture", meta.KV().Architecture(),
"name", meta.KV().String("general.name"), "file_type", meta.KV().FileType(),
"description", meta.KV().String("general.description"), "name", meta.KV().String("general.name"),
"num_tensors", len(meta.Tensors().Items()), "description", meta.KV().String("general.description"),
"num_key_values", len(meta.KV()), "num_tensors", len(meta.Tensors().Items()),
) "num_key_values", len(meta.KV()),
)
})
initDevices() initDevices()
...@@ -139,7 +156,10 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { ...@@ -139,7 +156,10 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
switch C.ggml_backend_dev_type(d) { switch C.ggml_backend_dev_type(d) {
case C.GGML_BACKEND_DEVICE_TYPE_CPU, case C.GGML_BACKEND_DEVICE_TYPE_CPU,
C.GGML_BACKEND_DEVICE_TYPE_ACCEL: C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, C.ggml_backend_dev_buffer_type(d)) bt := C.ggml_backend_dev_buffer_type(d)
cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, bt)
C.ggml_backend_buft_set_alloc(bt, C.bool(params.AllocMemory))
btDeviceMemory[C.ggml_backend_dev_buffer_type(d)] = &requiredMemory.CPU btDeviceMemory[C.ggml_backend_dev_buffer_type(d)] = &requiredMemory.CPU
} }
} }
...@@ -160,6 +180,8 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { ...@@ -160,6 +180,8 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
d: d, d: d,
bts: append([]C.ggml_backend_buffer_type_t{bt}, cpuDeviceBufferType.bts...), bts: append([]C.ggml_backend_buffer_type_t{bt}, cpuDeviceBufferType.bts...),
}) })
C.ggml_backend_buft_set_alloc(bt, C.bool(params.AllocMemory))
btDeviceMemory[bt] = &requiredMemory.GPUs[i] btDeviceMemory[bt] = &requiredMemory.GPUs[i]
requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d)) requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d))
var props C.struct_ggml_backend_dev_props var props C.struct_ggml_backend_dev_props
...@@ -169,56 +191,25 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { ...@@ -169,56 +191,25 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1) requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1)
} }
useDefaultSplit := true
for _, s := range params.TensorSplit {
if s != 0 {
useDefaultSplit = false
break
}
}
// calculate splits
splits := make([]float32, len(gpus))
if useDefaultSplit {
// default: split on free memory
for i := range splits {
var free, total C.size_t
C.ggml_backend_dev_memory(gpus[i], &free, &total)
splits[i] = float32(free)
}
} else {
splits = params.TensorSplit
}
var sum float32
// cumulative sum of all splits
for i := range splits {
sum += splits[i]
splits[i] = sum
}
// normalize splits
for i := range splits {
splits[i] /= sum
}
// inputs always use cpu // inputs always use cpu
input := cpuDeviceBufferType input := cpuDeviceBufferType
// define a range of gpu layers. anything outside of this range is assigned to the cpu assignLayer := func(layer int) deviceBufferType {
gpuRangeStart := max(0, blocks-params.NumGPULayers) for _, p := range params.GPULayers {
gpuRangeStop := min(gpuRangeStart+params.NumGPULayers, blocks+1) for _, l := range p.Layers {
assignLayer := func(i int) deviceBufferType { if l == layer {
if i < gpuRangeStart || i >= gpuRangeStop { for i := range requiredMemory.GPUs {
return cpuDeviceBufferType if requiredMemory.GPUs[i].ID == p.ID {
} return gpuDeviceBufferTypes[i]
}
}
index := slices.IndexFunc(splits, func(f float32) bool { return float32(i-gpuRangeStart)/float32(gpuRangeStop-gpuRangeStart) < f }) return cpuDeviceBufferType
if index < 0 || index >= len(gpuDeviceBufferTypes) { }
return cpuDeviceBufferType }
} }
return gpuDeviceBufferTypes[index] return cpuDeviceBufferType
} }
// repeating layers are assigned based on their index in reverse order, e.g. i / (block_count + 1) // repeating layers are assigned based on their index in reverse order, e.g. i / (block_count + 1)
...@@ -284,7 +275,9 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { ...@@ -284,7 +275,9 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
size := pad(C.ggml_backend_buft_get_alloc_size(bt, tt), C.ggml_backend_buft_get_alignment(bt)) size := pad(C.ggml_backend_buft_get_alloc_size(bt, tt), C.ggml_backend_buft_get_alignment(bt))
if layer == -1 { if layer == -1 {
// Assume that InputWeights can be allocated - they're always in system memory and can't be moved in any case // Assume that InputWeights can be allocated - they're always in system memory and can't be moved in any case
requiredMemory.InputWeights.Status = ml.Allocated if params.AllocMemory {
requiredMemory.InputWeights.Status = ml.Allocated
}
requiredMemory.InputWeights.Size += uint64(size) requiredMemory.InputWeights.Size += uint64(size)
} else { } else {
btDeviceMemory[bt].Weights[layer].Size += uint64(size) btDeviceMemory[bt].Weights[layer].Size += uint64(size)
...@@ -355,12 +348,14 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { ...@@ -355,12 +348,14 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
} }
b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt) b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
for i := range btDeviceMemory[bt].Weights { if params.AllocMemory {
if btDeviceMemory[bt].Weights[i].Size != 0 { for i := range btDeviceMemory[bt].Weights {
if b != nil { if btDeviceMemory[bt].Weights[i].Size != 0 {
btDeviceMemory[bt].Weights[i].Status = ml.Allocated if b != nil {
} else { btDeviceMemory[bt].Weights[i].Status = ml.Allocated
btDeviceMemory[bt].Weights[i].Status = ml.Failed } else {
btDeviceMemory[bt].Weights[i].Status = ml.Failed
}
} }
} }
} }
...@@ -381,28 +376,9 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { ...@@ -381,28 +376,9 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
bbs[c] = b bbs[c] = b
} }
// Mimic llama runner logs summarizing layers and memory
gpuLayers := 0
for _, layer := range layers {
if C.ggml_backend_dev_type(layer.d) == C.GGML_BACKEND_DEVICE_TYPE_GPU {
gpuLayers++
}
}
slog.Info(fmt.Sprintf("offloading %d repeating layers to GPU", gpuLayers))
switch C.ggml_backend_dev_type(output.d) {
case C.GGML_BACKEND_DEVICE_TYPE_CPU:
slog.Info("offloading output layer to CPU")
case C.GGML_BACKEND_DEVICE_TYPE_GPU:
slog.Info("offloading output layer to GPU")
gpuLayers++
case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
slog.Info("offloading output layer to ACCEL")
}
slog.Info(fmt.Sprintf("offloaded %d/%d layers to GPU", gpuLayers, len(layers)+1))
for bs := range maps.Values(bbs) { for bs := range maps.Values(bbs) {
slog.Info("model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)), "size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(bs)))) slog.Log(context.TODO(), logutil.LevelTrace, "model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)),
"size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(bs))))
} }
// map tensor names to tensors for easy lookup later // map tensor names to tensors for easy lookup later
...@@ -423,6 +399,13 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { ...@@ -423,6 +399,13 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
b := backends[d] b := backends[d]
bt := C.ggml_backend_get_default_buffer_type(b) bt := C.ggml_backend_get_default_buffer_type(b)
// Always include CPU as a fallback but otherwise, just use the devices where we assigned layers
if !slices.Contains(cpuDeviceBufferType.bts, bt) {
if c, ok := ctxs[bt]; !ok || C.ggml_get_first_tensor(c) == nil {
continue
}
}
deviceBufferTypes[d] = bt deviceBufferTypes[d] = bt
schedBackends = append(schedBackends, b) schedBackends = append(schedBackends, b)
...@@ -437,6 +420,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { ...@@ -437,6 +420,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
maxGraphNodes := max(8192, len(meta.Tensors().Items())*5) maxGraphNodes := max(8192, len(meta.Tensors().Items())*5)
return &Backend{ return &Backend{
modelPath: modelPath, modelPath: modelPath,
allocMemory: params.AllocMemory,
flashAttention: params.FlashAttention, flashAttention: params.FlashAttention,
meta: meta, meta: meta,
tensorLoadTargets: targets, tensorLoadTargets: targets,
...@@ -452,10 +436,14 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { ...@@ -452,10 +436,14 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
schedBackends: schedBackends, schedBackends: schedBackends,
schedBufts: schedBufts, schedBufts: schedBufts,
input: deviceBufferTypes[input.d], input: deviceBufferTypes[input.d],
layers: func() map[int]C.ggml_backend_buffer_type_t { output: output.d,
m := make(map[int]C.ggml_backend_buffer_type_t) layers: func() map[int]layerDevice {
m := make(map[int]layerDevice)
for i, layer := range layers { for i, layer := range layers {
m[i] = deviceBufferTypes[layer.d] m[i] = layerDevice{
d: layer.d,
bt: deviceBufferTypes[layer.d],
}
} }
return m return m
}(), }(),
...@@ -484,6 +472,30 @@ func (b *Backend) Close() { ...@@ -484,6 +472,30 @@ func (b *Backend) Close() {
} }
func (b *Backend) Load(ctx context.Context, progress func(float32)) error { func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
if !b.allocMemory {
return errors.New("cannot load model without memory allocation")
}
// Mimic llama runner logs summarizing layers and memory
gpuLayers := 0
for layer := range maps.Values(b.layers) {
if C.ggml_backend_dev_type(layer.d) == C.GGML_BACKEND_DEVICE_TYPE_GPU {
gpuLayers++
}
}
slog.Info(fmt.Sprintf("offloading %d repeating layers to GPU", gpuLayers))
switch C.ggml_backend_dev_type(b.output) {
case C.GGML_BACKEND_DEVICE_TYPE_CPU:
slog.Info("offloading output layer to CPU")
case C.GGML_BACKEND_DEVICE_TYPE_GPU:
slog.Info("offloading output layer to GPU")
gpuLayers++
case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
slog.Info("offloading output layer to ACCEL")
}
slog.Info(fmt.Sprintf("offloaded %d/%d layers to GPU", gpuLayers, len(b.layers)+1))
var doneBytes atomic.Uint64 var doneBytes atomic.Uint64
totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset
...@@ -730,11 +742,11 @@ func (c *Context) Input() ml.Context { ...@@ -730,11 +742,11 @@ func (c *Context) Input() ml.Context {
} }
func (c *Context) Layer(i int) ml.Context { func (c *Context) Layer(i int) ml.Context {
if buft, ok := c.b.layers[i]; ok { if layer, ok := c.b.layers[i]; ok {
return &Context{ return &Context{
b: c.b, b: c.b,
ctx: c.ctx, ctx: c.ctx,
buft: buft, buft: layer.bt,
allocatedBuffers: c.allocatedBuffers, allocatedBuffers: c.allocatedBuffers,
maxGraphNodes: c.maxGraphNodes, maxGraphNodes: c.maxGraphNodes,
layer: i, layer: i,
...@@ -792,14 +804,16 @@ func (c *Context) Reserve() { ...@@ -792,14 +804,16 @@ func (c *Context) Reserve() {
graph := &c.b.btDeviceMemory[c.b.schedBufts[i]].Graph graph := &c.b.btDeviceMemory[c.b.schedBufts[i]].Graph
graph.Size += uint64(bufferStatus.size) graph.Size += uint64(bufferStatus.size)
if bufferStatus.allocated && graph.Status != ml.Failed { if c.b.allocMemory {
graph.Status = ml.Allocated if bufferStatus.allocated && graph.Status != ml.Failed {
} else { graph.Status = ml.Allocated
graph.Status = ml.Failed } else {
graph.Status = ml.Failed
}
} }
slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])), "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])), slog.Log(context.TODO(), logutil.LevelTrace, "compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])),
"size", format.HumanBytes2(uint64(bufferStatus.size))) "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])), "size", format.HumanBytes2(uint64(bufferStatus.size)))
} }
if !reserved { if !reserved {
...@@ -868,10 +882,12 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor { ...@@ -868,10 +882,12 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
cache := &c.b.btDeviceMemory[c.buft].Cache[c.layer] cache := &c.b.btDeviceMemory[c.buft].Cache[c.layer]
cache.Size += uint64(size) cache.Size += uint64(size)
if b != nil { if c.b.allocMemory {
cache.Status = ml.Allocated if b != nil {
} else { cache.Status = ml.Allocated
cache.Status = ml.Failed } else {
cache.Status = ml.Failed
}
} }
} }
...@@ -890,7 +906,9 @@ func (c *Context) Empty(dtype ml.DType, shape ...int) ml.Tensor { ...@@ -890,7 +906,9 @@ func (c *Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
func (c *Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor { func (c *Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
t := c.newTensor(dtype, shape) t := c.newTensor(dtype, shape)
C.ggml_set_zero(t.(*Tensor).t) if c.b.allocMemory {
C.ggml_set_zero(t.(*Tensor).t)
}
return t return t
} }
...@@ -915,7 +933,7 @@ func (c *Context) FromFloatSlice(s []float32, shape ...int) ml.Tensor { ...@@ -915,7 +933,7 @@ func (c *Context) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
t := c.newTensor(ml.DTypeF32, shape) t := c.newTensor(ml.DTypeF32, shape)
if len(s) > 0 { if c.b.allocMemory && len(s) > 0 {
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t)) C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
} }
...@@ -927,7 +945,7 @@ func (c *Context) FromIntSlice(s []int32, shape ...int) ml.Tensor { ...@@ -927,7 +945,7 @@ func (c *Context) FromIntSlice(s []int32, shape ...int) ml.Tensor {
t := c.newTensor(ml.DTypeI32, shape) t := c.newTensor(ml.DTypeI32, shape)
if len(s) > 0 { if c.b.allocMemory && len(s) > 0 {
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t)) C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
} }
...@@ -1550,7 +1568,7 @@ func (t *Tensor) Clamp(ctx ml.Context, min, max float32) ml.Tensor { ...@@ -1550,7 +1568,7 @@ func (t *Tensor) Clamp(ctx ml.Context, min, max float32) ml.Tensor {
func (c Context) FromBytes(dtype ml.DType, s []uint8, shape ...int) ml.Tensor { func (c Context) FromBytes(dtype ml.DType, s []uint8, shape ...int) ml.Tensor {
// Unchecked to handle quantized types // Unchecked to handle quantized types
t := c.newTensor(dtype, shape) t := c.newTensor(dtype, shape)
if len(s) > 0 { if c.b.allocMemory && len(s) > 0 {
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t)) C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
} }
......
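
The `AllocMemory` flag is the core of the new strategy: a first pass with `AllocMemory: false` only records per-device requirements in `ml.BackendMemory`, and the server then proposes a layout before committing a real load (that orchestration lives in the collapsed diff above). A rough sketch of reading back a dry-run result and packing layers onto a GPU with known free memory; the fit logic here is illustrative, not the server's actual `createLayout`:

```go
// Illustrative sketch: after a dry run with AllocMemory=false has filled in
// ml.BackendMemory, per-layer sizes can be read back and greedily packed onto
// a GPU with known free memory.
package main

import (
	"fmt"

	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/ml"
)

// layerSize sums the recorded weight and cache sizes for one layer across devices.
func layerSize(mem *ml.BackendMemory, layer int) uint64 {
	size := mem.CPU.Weights[layer].Size + mem.CPU.Cache[layer].Size
	for _, gpu := range mem.GPUs {
		size += gpu.Weights[layer].Size + gpu.Cache[layer].Size
	}
	return size
}

func main() {
	// Hypothetical dry-run result: three layers of 50 MiB each, reported on the CPU.
	mem := &ml.BackendMemory{CPU: ml.DeviceMemory{
		Weights: make([]ml.Memory, 3),
		Cache:   make([]ml.Memory, 3),
	}}
	for i := range mem.CPU.Weights {
		mem.CPU.Weights[i].Size = 50 * format.MebiByte
	}

	free := uint64(128 * format.MebiByte) // hypothetical GPU free memory
	assigned := ml.GPULayers{ID: "gpu0"}
	for i := 0; i < 3; i++ {
		if s := layerSize(mem, i); s <= free {
			free -= s
			assigned.Layers = append(assigned.Layers, i)
		}
	}
	fmt.Println("proposed layout:", ml.GPULayersList{assigned})
}
```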
...@@ -581,16 +581,8 @@ void ggml_backend_load_all_from_path(const char * dir_path) { ...@@ -581,16 +581,8 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
ggml_backend_load_best("blas", silent, dir_path); ggml_backend_load_best("blas", silent, dir_path);
ggml_backend_load_best("cann", silent, dir_path); ggml_backend_load_best("cann", silent, dir_path);
ggml_backend_load_best("cuda", silent, dir_path);
// Avoid mixed hip+cuda configurations ggml_backend_load_best("hip", silent, dir_path);
const char * hip_devices = std::getenv("HIP_VISIBLE_DEVICES");
const char * rocr_devices = std::getenv("ROCR_VISIBLE_DEVICES");
if (!hip_devices && !rocr_devices) {
ggml_backend_load_best("cuda", silent, dir_path);
} else {
ggml_backend_load_best("hip", silent, dir_path);
}
ggml_backend_load_best("metal", silent, dir_path); ggml_backend_load_best("metal", silent, dir_path);
ggml_backend_load_best("rpc", silent, dir_path); ggml_backend_load_best("rpc", silent, dir_path);
ggml_backend_load_best("sycl", silent, dir_path); ggml_backend_load_best("sycl", silent, dir_path);
......
...@@ -12,7 +12,6 @@ import ( ...@@ -12,7 +12,6 @@ import (
"net/http" "net/http"
"os" "os"
"regexp" "regexp"
"runtime"
"strconv" "strconv"
"strings" "strings"
"sync" "sync"
...@@ -216,6 +215,12 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input, error) ...@@ -216,6 +215,12 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input, error)
} }
type Server struct { type Server struct {
// modelPath is the location of the model to be loaded
modelPath string
// loadMu prevents more than one load attempt from occurring at a time
loadMu sync.Mutex
// is the server ready to process requests? // is the server ready to process requests?
// protects access to model and image // protects access to model and image
ready sync.WaitGroup ready sync.WaitGroup
...@@ -723,21 +728,12 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) { ...@@ -723,21 +728,12 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) {
} }
} }
type multiLPath []string // loadModel allocates memory based on the given parameters and loads the weights. The
// memory allocated is worst case for text models but not for vision.
func (m *multiLPath) Set(value string) error {
*m = append(*m, value)
return nil
}
func (m *multiLPath) String() string {
return strings.Join(*m, ", ")
}
func (s *Server) loadModel( func (s *Server) loadModel(
params llama.ModelParams, params llama.ModelParams,
mpath string, mpath string,
lpath multiLPath, lpath []string,
ppath string, ppath string,
kvSize int, kvSize int,
kvCacheType string, kvCacheType string,
...@@ -757,12 +753,10 @@ func (s *Server) loadModel( ...@@ -757,12 +753,10 @@ func (s *Server) loadModel(
panic(err) panic(err)
} }
if lpath.String() != "" { for _, path := range lpath {
for _, path := range lpath { err := s.model.ApplyLoraFromFile(s.lc, path, 1.0, threads)
err := s.model.ApplyLoraFromFile(s.lc, path, 1.0, threads) if err != nil {
if err != nil { panic(err)
panic(err)
}
} }
} }
...@@ -783,26 +777,81 @@ func (s *Server) loadModel( ...@@ -783,26 +777,81 @@ func (s *Server) loadModel(
s.ready.Done() s.ready.Done()
} }
// load is the handler called by the Ollama server to process different
// load operations
func (s *Server) load(w http.ResponseWriter, r *http.Request) {
s.loadMu.Lock()
defer s.loadMu.Unlock()
w.Header().Set("Content-Type", "application/json")
if s.status != llm.ServerStatusLaunched {
http.Error(w, "model already loaded", http.StatusInternalServerError)
return
}
var req llm.LoadRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(w, "bad request", http.StatusBadRequest)
return
}
slog.Info("load", "request", req)
switch req.Operation {
// LoadOperationFit and LoadOperationAlloc have no meaning here - just return a successful response
case llm.LoadOperationCommit:
s.batchSize = req.BatchSize
s.parallel = req.Parallel
s.seqs = make([]*Sequence, s.parallel)
s.seqsSem = semaphore.NewWeighted(int64(s.parallel))
gpuIDs := llama.EnumerateGPUs()
tensorSplit := make([]float32, len(gpuIDs))
numGPU := 0
for i := range gpuIDs {
for _, layers := range req.GPULayers {
if gpuIDs[i] == layers.ID {
tensorSplit[i] = float32(len(layers.Layers))
numGPU += len(layers.Layers)
}
}
}
params := llama.ModelParams{
NumGpuLayers: numGPU,
MainGpu: req.MainGPU,
UseMmap: req.UseMmap && len(req.LoraPath) == 0,
TensorSplit: tensorSplit,
Progress: func(progress float32) {
s.progress = progress
},
}
s.status = llm.ServerStatusLoadingModel
go s.loadModel(params, s.modelPath, req.LoraPath, req.ProjectorPath, req.KvSize, req.KvCacheType, req.FlashAttention, req.NumThreads, req.MultiUserCache)
case llm.LoadOperationClose:
// No-op for us
if err := json.NewEncoder(w).Encode(&llm.LoadResponse{}); err != nil {
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
}
return
}
resp := llm.LoadResponse{Success: true}
if err := json.NewEncoder(w).Encode(&resp); err != nil {
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
return
}
}
func Execute(args []string) error { func Execute(args []string) error {
fs := flag.NewFlagSet("runner", flag.ExitOnError) fs := flag.NewFlagSet("runner", flag.ExitOnError)
mpath := fs.String("model", "", "Path to model binary file") mpath := fs.String("model", "", "Path to model binary file")
ppath := fs.String("mmproj", "", "Path to projector binary file")
parallel := fs.Int("parallel", 1, "Number of sequences to handle simultaneously")
batchSize := fs.Int("batch-size", 512, "Batch size")
nGpuLayers := fs.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
mainGpu := fs.Int("main-gpu", 0, "Main GPU")
flashAttention := fs.Bool("flash-attn", false, "Enable flash attention")
kvSize := fs.Int("ctx-size", 2048, "Context (or KV cache) size")
kvCacheType := fs.String("kv-cache-type", "", "quantization type for KV cache (default: f16)")
port := fs.Int("port", 8080, "Port to expose the server on") port := fs.Int("port", 8080, "Port to expose the server on")
threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
_ = fs.Bool("verbose", false, "verbose output (default: disabled)") _ = fs.Bool("verbose", false, "verbose output (default: disabled)")
noMmap := fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
var lpaths multiLPath
fs.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)")
fs.Usage = func() { fs.Usage = func() {
fmt.Fprintf(fs.Output(), "Runner usage\n") fmt.Fprintf(fs.Output(), "Runner usage\n")
...@@ -817,35 +866,11 @@ func Execute(args []string) error { ...@@ -817,35 +866,11 @@ func Execute(args []string) error {
llama.BackendInit() llama.BackendInit()
server := &Server{ server := &Server{
batchSize: *batchSize, modelPath: *mpath,
parallel: *parallel, status: llm.ServerStatusLaunched,
seqs: make([]*Sequence, *parallel),
seqsSem: semaphore.NewWeighted(int64(*parallel)),
status: llm.ServerStatusLoadingModel,
}
var tensorSplitFloats []float32
if *tensorSplit != "" {
splits := strings.Split(*tensorSplit, ",")
tensorSplitFloats = make([]float32, len(splits))
for i, s := range splits {
f, _ := strconv.ParseFloat(s, 32)
tensorSplitFloats[i] = float32(f)
}
}
params := llama.ModelParams{
NumGpuLayers: *nGpuLayers,
MainGpu: *mainGpu,
UseMmap: !*noMmap && lpaths.String() == "",
TensorSplit: tensorSplitFloats,
Progress: func(progress float32) {
server.progress = progress
},
} }
server.ready.Add(1) server.ready.Add(1)
go server.loadModel(params, *mpath, lpaths, *ppath, *kvSize, *kvCacheType, *flashAttention, *threads, *multiUserCache)
server.cond = sync.NewCond(&server.mu) server.cond = sync.NewCond(&server.mu)
...@@ -863,6 +888,7 @@ func Execute(args []string) error { ...@@ -863,6 +888,7 @@ func Execute(args []string) error {
defer listener.Close() defer listener.Close()
mux := http.NewServeMux() mux := http.NewServeMux()
mux.HandleFunc("POST /load", server.load)
mux.HandleFunc("/embedding", server.embeddings) mux.HandleFunc("/embedding", server.embeddings)
mux.HandleFunc("/completion", server.completion) mux.HandleFunc("/completion", server.completion)
mux.HandleFunc("/health", server.health) mux.HandleFunc("/health", server.health)
......
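
With this change the runner starts with only `-model` and `-port` and receives everything else through the new `POST /load` endpoint. A hedged client-side sketch of committing a load, using the `llm.LoadRequest` fields referenced by the handler above (the `GPULayers` field is assumed to be an `ml.GPULayersList`, as in the tests earlier in this commit):

```go
// Hedged client-side sketch of the new load protocol.
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"

	"github.com/ollama/ollama/llm"
	"github.com/ollama/ollama/ml"
)

func main() {
	req := llm.LoadRequest{
		Operation:   llm.LoadOperationCommit,
		BatchSize:   512,
		Parallel:    1,
		KvSize:      4096,
		KvCacheType: "f16",
		NumThreads:  8,
		UseMmap:     true,
		// Hypothetical assignment; field type assumed to be ml.GPULayersList.
		GPULayers: ml.GPULayersList{{ID: "GPU-0", Layers: []int{0, 1, 2}}},
	}

	body, err := json.Marshal(req)
	if err != nil {
		panic(err)
	}

	// The runner registers this handler via mux.HandleFunc("POST /load", ...).
	resp, err := http.Post("http://127.0.0.1:8080/load", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var result llm.LoadResponse
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		panic(err)
	}
	fmt.Println("load success:", result.Success)
}
```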