"git@developer.sourcefind.cn:OpenDAS/torch-cluster.git" did not exist on "d85bc4fb304bed80725c45cb23d08b8468cf469d"
Commit bee2f4a3 authored by Daniel Hiltgen

Record GPU usage information

This records more GPU usage information for eventual UX inclusion.
parent 88cf1544
@@ -53,6 +53,8 @@ func HumanBytes(b int64) string {
 func HumanBytes2(b uint64) string {
 	switch {
+	case b >= GibiByte:
+		return fmt.Sprintf("%.1f GiB", float64(b)/GibiByte)
 	case b >= MebiByte:
 		return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
 	case b >= KibiByte:
...
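For reference, a self-contained sketch of the formatting pattern this hunk extends: largest unit first, falling through to smaller ones. The constant values below are an assumption (the diff only shows the names); the usual binary powers are used.

```go
package main

import "fmt"

// Assumed values for the constants named in the diff.
const (
	KibiByte = 1 << 10
	MebiByte = 1 << 20
	GibiByte = 1 << 30
)

// humanBytes2 mirrors the post-commit shape of HumanBytes2.
func humanBytes2(b uint64) string {
	switch {
	case b >= GibiByte:
		return fmt.Sprintf("%.1f GiB", float64(b)/GibiByte)
	case b >= MebiByte:
		return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
	case b >= KibiByte:
		return fmt.Sprintf("%.1f KiB", float64(b)/KibiByte)
	default:
		return fmt.Sprintf("%d B", b)
	}
}

func main() {
	fmt.Println(humanBytes2(6*GibiByte + 400*MebiByte)) // 6.4 GiB
	fmt.Println(humanBytes2(512 * MebiByte))            // 512.0 MiB
}
```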
@@ -25,7 +25,7 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
 	// Split up the GPUs by type and try them
 	for _, gpus := range allGpus.ByLibrary() {
 		var layerCount int
-		layerCount, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
+		layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)
 		if opts.NumGPU < 0 {
 			if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
 				return true, estimatedVRAM
@@ -39,12 +39,9 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
 	return false, estimatedVRAM
 }

-// Given a model and one or more GPU targets, predict how many layers and bytes we can load
+// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64) {
-	if gpus[0].Library == "cpu" {
-		return 0, 0
-	}
+func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64, uint64) {
 	var memoryAvailable uint64
 	for _, info := range gpus {
 		memoryAvailable += info.FreeMemory
@@ -93,11 +90,6 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
 	memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers["blk.0"].size()

-	if memoryRequiredPartial > memoryAvailable {
-		slog.Debug("insufficient VRAM to load any model layers")
-		return 0, 0
-	}
-
 	var memoryLayerOutput uint64
 	if layer, ok := layers["output_norm"]; ok {
 		memoryLayerOutput += layer.size()
@@ -181,5 +173,13 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		),
 	),
 	)
-	return layerCount, uint64(memoryRequiredPartial)
+
+	if gpus[0].Library == "cpu" {
+		return 0, 0, memoryRequiredTotal
+	}
+	if memoryRequiredPartial > memoryAvailable {
+		slog.Debug("insufficient VRAM to load any model layers")
+		return 0, 0, memoryRequiredTotal
+	}
+	return layerCount, memoryRequiredPartial, memoryRequiredTotal
 }
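To make the new three-value contract concrete, here is a hedged sketch of the post-commit behavior: the total model size is always reported, even when zero layers fit. The types and the estimation math are simplified stand-ins for what the diff shows, not the real llm package.

```go
package main

import "fmt"

// Stand-in for gpu.GpuInfo, reduced to the two fields this diff touches.
type GpuInfo struct {
	Library    string
	FreeMemory uint64
}

// estimateGPULayers mirrors the post-commit contract of EstimateGPULayers:
// (layers offloaded, VRAM required for that partial load, total model size).
// The CPU and insufficient-VRAM cases now fall through to the end so the
// total is still computed and returned.
func estimateGPULayers(gpus []GpuInfo, layerSize, layerCount, overhead uint64) (int, uint64, uint64) {
	total := overhead + layerSize*layerCount
	if gpus[0].Library == "cpu" {
		return 0, 0, total // nothing offloaded, but total is still useful
	}
	var available uint64
	for _, g := range gpus {
		available += g.FreeMemory
	}
	if overhead+layerSize > available {
		return 0, 0, total // not enough VRAM for even one layer
	}
	fit := (available - overhead) / layerSize
	if fit > layerCount {
		fit = layerCount
	}
	return int(fit), overhead + layerSize*fit, total
}

func main() {
	gpus := []GpuInfo{{Library: "cuda", FreeMemory: 8 << 30}}
	layers, vram, total := estimateGPULayers(gpus, 512<<20, 33, 1<<30)
	fmt.Printf("layers=%d vram=%.1fGiB total=%.1fGiB\n",
		layers, float64(vram)/(1<<30), float64(total)/(1<<30))
}
```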
@@ -49,7 +49,10 @@ type llmServer struct {
 	options api.Options

 	// TODO - this should be broken down by GPU
 	estimatedVRAM uint64 // Estimated usage of VRAM by the loaded model
+	estimatedTotal uint64 // Total size of model
+	totalLayers uint64
+	gpuCount int

 	sem *semaphore.Weighted
 }
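The commit message points at "eventual UX inclusion"; a purely hypothetical consumer of these new fields might render them as a one-line summary. Nothing below exists in this commit; it only illustrates what the recorded numbers enable.

```go
package main

import "fmt"

// Hypothetical trimmed-down view of llmServer, limited to the fields
// this commit adds or touches.
type llmServer struct {
	estimatedVRAM  uint64 // estimated VRAM used by the loaded model
	estimatedTotal uint64 // total size of the model
	totalLayers    uint64 // block count + 1
	gpuCount       int
}

// offloadSummary is an illustrative helper (not in the diff) showing how
// the recorded numbers could feed a UX line like "14/33 layers on 1 GPU".
func offloadSummary(s llmServer, offloadedLayers int) string {
	return fmt.Sprintf("%d/%d layers on %d GPU(s), ~%.1f GiB VRAM of %.1f GiB total",
		offloadedLayers, s.totalLayers, s.gpuCount,
		float64(s.estimatedVRAM)/(1<<30), float64(s.estimatedTotal)/(1<<30))
}

func main() {
	s := llmServer{estimatedVRAM: 8 << 30, estimatedTotal: 17 << 30, totalLayers: 33, gpuCount: 1}
	fmt.Println(offloadSummary(s, 14))
}
```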
@@ -83,12 +86,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	cpuRunner := ""
 	var estimatedVRAM uint64
+	var estimatedTotal uint64
 	var systemMemory uint64
+	gpuCount := len(gpus)
 	if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {
 		// TODO evaluate system memory to see if we should block the load, or force an unload of another CPU runner
 		cpuRunner = serverForCpu()
+		gpuCount = 0
 	} else {
 		if gpus[0].Library == "metal" {
 			memInfo, err := gpu.GetCPUMem()
@@ -100,7 +106,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		}
 	}

 	var layers int
-	layers, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
+	layers, estimatedVRAM, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
 	if gpus[0].Library == "metal" && estimatedVRAM > systemMemory {
 		// disable partial offloading when model is greater than total system memory as this
@@ -133,6 +139,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		} else {
 			slog.Info("user override", "OLLAMA_LLM_LIBRARY", demandLib, "path", serverPath)
 			servers = []string{demandLib}
+			if strings.HasPrefix(demandLib, "cpu") {
+				// Omit the GPU flag to silence the warning
+				opts.NumGPU = -1
+			}
 		}
 	}
@@ -214,6 +224,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 			continue
 		}

+		if strings.HasPrefix(servers[i], "cpu") {
+			// TODO if we tried a gpu runner first, and it failed, record the error and bubble that back up
+			gpuCount = 0
+		}
+
 		// Find an available port, retry on each iteration in case the failure was a port conflict race
 		port := 0
 		if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
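The context lines above use the standard Go idiom for picking a free port: bind to port 0 and read back what the kernel assigned. A minimal sketch of that idiom follows; the retry-per-iteration comment in the diff exists because the pattern is inherently racy.

```go
package main

import (
	"fmt"
	"net"
)

// freePort asks the OS for an ephemeral port by listening on port 0,
// then immediately releases it. Another process can grab the port
// between Close and the real bind, hence the retry loop in the diff.
func freePort() (int, error) {
	a, err := net.ResolveTCPAddr("tcp", "localhost:0")
	if err != nil {
		return 0, err
	}
	l, err := net.ListenTCP("tcp", a)
	if err != nil {
		return 0, err
	}
	defer l.Close()
	return l.Addr().(*net.TCPAddr).Port, nil
}

func main() {
	port, err := freePort()
	if err != nil {
		panic(err)
	}
	fmt.Println("picked port", port)
}
```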
@@ -267,12 +282,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	}

 	s := &llmServer{
 		port:           port,
 		cmd:            exec.Command(server, finalParams...),
 		status:         NewStatusWriter(os.Stderr),
 		options:        opts,
 		estimatedVRAM:  estimatedVRAM,
+		estimatedTotal: estimatedTotal,
 		sem:            semaphore.NewWeighted(int64(numParallel)),
+		totalLayers:    ggml.KV().BlockCount() + 1,
+		gpuCount:       gpuCount,
 	}
 	s.cmd.Env = os.Environ()
...
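The sem field initialized here is a weighted semaphore from golang.org/x/sync/semaphore, created with numParallel slots so at most that many requests run against the server at once. A minimal sketch of the gating pattern, with hypothetical request IDs:

```go
package main

import (
	"context"
	"fmt"
	"sync"

	"golang.org/x/sync/semaphore"
)

func main() {
	const numParallel = 2
	sem := semaphore.NewWeighted(int64(numParallel))

	var wg sync.WaitGroup
	for i := 0; i < 5; i++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			// Acquire blocks until one of the numParallel slots is free.
			if err := sem.Acquire(context.Background(), 1); err != nil {
				return
			}
			defer sem.Release(1)
			fmt.Println("request", id, "running")
		}(i)
	}
	wg.Wait()
}
```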