Commit 4372d0bf authored by Jesse Gross, committed by Jesse Gross

llamarunner: Respect device ordering for offloaded layers

We used to control the way that llama.cpp saw devices using
CUDA_VISIBLE_DEVICES or similar. This would ensure that the layers
offloaded to a device were actually the ones intended. This is
particularly important because we might reorder devices based on
free memory or performance.

When we started explicitly scheduling layers, this logic went
away but the llamarunner didn't have any way to set the correct
order of devices. This meant that the correct number of layers
would be assigned to a device but not necessarily the layers
that were expected. This change sets up the devices correctly
based on the offload information.
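
The failure mode is easiest to see with a toy example. The sketch below
(hypothetical device names and layer counts, not part of this change)
shows how per-device layer counts applied in llama.cpp's enumeration
order put the right number of layers on the wrong devices:

package main

import "fmt"

// Suppose the scheduler sorted devices by free memory and assigned
// layers 0-9 to "gpu1" and layers 10-19 to "gpu0", but llama.cpp
// enumerates devices as [gpu0, gpu1]. With only counts to go on,
// llama.cpp fills devices in its own order.
func main() {
    enumOrder := []string{"gpu0", "gpu1"} // llama.cpp's native ordering
    split := []int{10, 10}                // layer counts, in enumeration order

    layer := 0
    for i, n := range split {
        fmt.Printf("%s gets layers %d-%d\n", enumOrder[i], layer, layer+n-1)
        layer += n
    }
    // Prints gpu0 for layers 0-9, even though the scheduler wanted them
    // on gpu1. Passing llama.cpp an explicit device list in schedule
    // order makes the counts and the devices line up.
}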
parent 31361c4d
@@ -63,8 +63,13 @@ func BackendInit() {
 	C.llama_backend_init()
 }
 
-func EnumerateGPUs() []ml.DeviceID {
-	var ids []ml.DeviceID
+type Devices struct {
+	ml.DeviceID
+	LlamaID uint64
+}
+
+func EnumerateGPUs() []Devices {
+	var ids []Devices
 	for i := range C.ggml_backend_dev_count() {
 		device := C.ggml_backend_dev_get(i)
@@ -74,9 +79,12 @@ func EnumerateGPUs() []ml.DeviceID {
 		C.GGML_BACKEND_DEVICE_TYPE_IGPU:
 			var props C.struct_ggml_backend_dev_props
 			C.ggml_backend_dev_get_props(device, &props)
-			ids = append(ids, ml.DeviceID{
-				ID:      C.GoString(props.id),
-				Library: C.GoString(props.library),
+			ids = append(ids, Devices{
+				DeviceID: ml.DeviceID{
+					ID:      C.GoString(props.id),
+					Library: C.GoString(props.library),
+				},
+				LlamaID: uint64(i),
 			})
 		}
 	}
@@ -231,6 +239,7 @@ func (c *Context) GetLogitsIth(i int) []float32 {
 }
 
 type ModelParams struct {
+	Devices      []uint64
 	NumGpuLayers int
 	MainGpu      int
 	UseMmap      bool
@@ -254,6 +263,21 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
 	cparams.use_mmap = C.bool(params.UseMmap)
 	cparams.vocab_only = C.bool(params.VocabOnly)
 
+	var devices []C.ggml_backend_dev_t
+	for _, llamaID := range params.Devices {
+		devices = append(devices, C.ggml_backend_dev_get(C.size_t(llamaID)))
+	}
+
+	if len(devices) > 0 {
+		devices = append(devices, C.ggml_backend_dev_t(C.NULL))
+
+		devicesData := &devices[0]
+		var devicesPin runtime.Pinner
+		devicesPin.Pin(devicesData)
+		defer devicesPin.Unpin()
+
+		cparams.devices = devicesData
+	}
+
 	if len(params.TensorSplit) > 0 {
 		tensorSplitData := &params.TensorSplit[0]
...
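
In the last hunk above, cparams.devices hands llama.cpp a pointer into
Go-managed memory, so the slice's backing array must not move while C
code holds that pointer; the runtime.Pinner exists for exactly this.
A minimal standalone sketch of the same pattern, with a toy C function
standing in for llama.cpp:

package main

/*
#include <stddef.h>
// Toy stand-in for a C API that retains the pointer it is given.
static const void *saved;
static void keep(const void *p) { saved = p; }
*/
import "C"

import (
    "runtime"
    "unsafe"
)

func main() {
    buf := make([]C.int, 4)
    data := &buf[0]

    // Pinning the first element pins the whole backing array, so the
    // GC cannot move it while C retains the pointer; Unpin releases it.
    var pin runtime.Pinner
    pin.Pin(data)
    defer pin.Unpin()

    C.keep(unsafe.Pointer(data))
}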
@@ -8,6 +8,7 @@ import (
 	"hash/maphash"
 	"io"
 	"log/slog"
+	"math"
 	"net/http"
 	"runtime"
 	"slices"
@@ -28,6 +29,22 @@ type GPULayers struct {
 	Layers []int
 }
 
+// FirstLayer returns the smallest layer index scheduled on this GPU, or MaxInt when empty.
+func (g GPULayers) FirstLayer() int {
+	if len(g.Layers) == 0 {
+		return math.MaxInt
+	}
+
+	first := g.Layers[0]
+	for i := 1; i < len(g.Layers); i++ {
+		if g.Layers[i] < first {
+			first = g.Layers[i]
+		}
+	}
+
+	return first
+}
+
 func (g GPULayers) String() string {
 	if len(g.Layers) == 0 {
 		return ""
@@ -54,6 +71,17 @@ func (g GPULayers) String() string {
 // GPULayersList is a set of layer allocations across multiple GPUs
 type GPULayersList []GPULayers
 
+func (l GPULayersList) Len() int      { return len(l) }
+func (l GPULayersList) Swap(i, j int) { l[i], l[j] = l[j], l[i] }
+
+// Less sorts by the ordering of the layers offloaded
+func (l GPULayersList) Less(i, j int) bool {
+	li := l[i].FirstLayer()
+	lj := l[j].FirstLayer()
+	return li < lj
+}
+
 func (l GPULayersList) String() string {
 	if l.Sum() > 0 {
 		return fmt.Sprintf("%v%v", l.Sum(), []GPULayers(l))
...
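
Len, Swap, and Less make GPULayersList satisfy sort.Interface, which
the runner below relies on when it calls sort.Sort(req.GPULayers). A
self-contained sketch of that sorting behavior, with the types trimmed
down to just what the example needs:

package main

import (
    "fmt"
    "math"
    "sort"
)

// Trimmed-down copies of the types above, enough to demonstrate sorting.
type GPULayers struct {
    ID     string
    Layers []int
}

// FirstLayer returns the smallest layer index, or MaxInt when empty.
func (g GPULayers) FirstLayer() int {
    if len(g.Layers) == 0 {
        return math.MaxInt
    }
    first := g.Layers[0]
    for _, l := range g.Layers[1:] {
        if l < first {
            first = l
        }
    }
    return first
}

type GPULayersList []GPULayers

func (l GPULayersList) Len() int           { return len(l) }
func (l GPULayersList) Swap(i, j int)      { l[i], l[j] = l[j], l[i] }
func (l GPULayersList) Less(i, j int) bool { return l[i].FirstLayer() < l[j].FirstLayer() }

func main() {
    layers := GPULayersList{
        {ID: "gpu1", Layers: []int{10, 11, 12}},
        {ID: "gpu0", Layers: []int{0, 1, 2}},
        {ID: "igpu", Layers: nil}, // empty device sorts last via MaxInt
    }
    sort.Sort(layers)
    for _, g := range layers {
        fmt.Println(g.ID, g.Layers)
    }
    // Prints gpu0 first, then gpu1, then the empty device, matching
    // the order in which layers were scheduled.
}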
@@ -12,6 +12,7 @@ import (
 	"net/http"
 	"os"
 	"regexp"
+	"sort"
 	"strconv"
 	"strings"
 	"sync"
@@ -900,19 +901,24 @@ func (s *Server) load(w http.ResponseWriter, r *http.Request) {
 	s.seqs = make([]*Sequence, s.parallel)
 	s.seqsSem = semaphore.NewWeighted(int64(s.parallel))
 
-	gpuIDs := llama.EnumerateGPUs()
-	tensorSplit := make([]float32, len(gpuIDs))
 	numGPU := 0
-	for i := range gpuIDs {
-		for _, layers := range req.GPULayers {
-			if gpuIDs[i] == layers.DeviceID {
-				tensorSplit[i] = float32(len(layers.Layers))
+	var tensorSplit []float32
+	var llamaIDs []uint64
+
+	gpuIDs := llama.EnumerateGPUs()
+	sort.Sort(req.GPULayers)
+	for _, layers := range req.GPULayers {
+		for i := range gpuIDs {
+			if gpuIDs[i].DeviceID == layers.DeviceID {
 				numGPU += len(layers.Layers)
+				tensorSplit = append(tensorSplit, float32(len(layers.Layers)))
+				llamaIDs = append(llamaIDs, gpuIDs[i].LlamaID)
 			}
 		}
 	}
 
 	params := llama.ModelParams{
+		Devices:      llamaIDs,
 		NumGpuLayers: numGPU,
 		MainGpu:      req.MainGPU,
 		UseMmap:      req.UseMmap && len(req.LoraPath) == 0,
...