Commit 3fe74fba authored by Jesse Gross, committed by Jesse Gross

llm: Use first layer as memory buffer in estimation

This is a partial revert of 0478d440 "Fixed over vram allocation due to
small initial layer sizes."

Previously we used the size of the first layer as an extra reserved
amount of space to buffer our memory estimates. The above commit
changed this to use the largest layer. However, that hurt performance
on more models than the original commit fixed.

This is just a heuristic with no ideal solution, so this change goes
back to the historic behavior.

Fixes: #10765, #10756, #10752, #10726
parent 1a0cfd08
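
For context, here is a minimal, self-contained Go sketch contrasting the two
heuristics. It is illustrative only: the layer type and the maxLayerBuffer /
firstLayerBuffer helpers are hypothetical stand-ins for ggml.Layer and for the
inline logic in EstimateGPULayers shown in the diff below.

package main

// Requires Go 1.23+ for maps.Values / slices.Collect.
import (
	"cmp"
	"fmt"
	"maps"
	"slices"
)

// layer is a hypothetical stand-in for ggml.Layer; only Size matters here.
type layer struct{ size uint64 }

func (l layer) Size() uint64 { return l.size }

// maxLayerBuffer reserves the size of the largest layer, as commit 0478d440
// did. It never under-reserves, but on models whose later layers are larger
// than typical it over-reserves, costing offloaded layers and performance.
func maxLayerBuffer(layers map[string]layer) uint64 {
	return slices.MaxFunc(slices.Collect(maps.Values(layers)), func(a, b layer) int {
		return cmp.Compare(a.Size(), b.Size())
	}).Size()
}

// firstLayerBuffer reserves the size of blk.0, the historic behavior this
// commit restores. It can under-reserve when blk.0 is unusually small.
func firstLayerBuffer(layers map[string]layer) uint64 {
	if blk0, ok := layers["blk.0"]; ok {
		return blk0.Size()
	}
	return 0
}

func main() {
	layers := map[string]layer{
		"blk.0": {size: 300 << 20}, // 300 MiB
		"blk.1": {size: 600 << 20}, // 600 MiB, the largest layer
	}
	// Prints "600 300": the max-layer heuristic reserves twice as much
	// for this (made-up) model as the first-layer heuristic does.
	fmt.Println(maxLayerBuffer(layers)>>20, firstLayerBuffer(layers)>>20)
}

Neither choice is correct for every model; the commit simply picks the variant
that regressed fewer models in practice.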
 package llm

 import (
-	"cmp"
 	"fmt"
 	"log/slog"
-	"maps"
 	"os"
-	"slices"
 	"strconv"
 	"strings"
...
@@ -125,10 +122,12 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	}

 	layers := f.Tensors().GroupLayers()
-	// add one layer (chosing the max layer) worth of memory as a buffer
-	layerSize = slices.MaxFunc(slices.Collect(maps.Values(layers)), func(a, b ggml.Layer) int {
-		return cmp.Compare(a.Size(), b.Size())
-	}).Size()
+	// add one layer worth of memory as a buffer
+	if blk0, ok := layers["blk.0"]; ok {
+		layerSize = blk0.Size()
+	} else {
+		slog.Warn("model missing blk.0 layer size")
+	}

 	var kvct string
 	if envconfig.FlashAttention() &&
...