Unverified Commit a50a87a7 authored by Jeffrey Morgan's avatar Jeffrey Morgan Committed by GitHub
Browse files

partial offloading: allow flash attention and disable mmap (#4734)

* partial offloading: allow flash attention and disable mmap

* allow mmap with num_gpu=0
parent e91d0ef7
...@@ -191,35 +191,38 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr ...@@ -191,35 +191,38 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--memory-f32") params = append(params, "--memory-f32")
} }
if opts.UseMLock {
params = append(params, "--mlock")
}
if !opts.UseMMap {
params = append(params, "--no-mmap")
}
if opts.UseNUMA {
params = append(params, "--numa")
}
flashAttnEnabled := envconfig.FlashAttention flashAttnEnabled := envconfig.FlashAttention
// partial offloading does not support flash attention
if uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
flashAttnEnabled = false
}
// only cuda (compute capability 7+) and metal support flash attention
for _, g := range gpus { for _, g := range gpus {
// only cuda (compute capability 7+) and metal support flash attention
if g.Library != "metal" && (g.Library != "cuda" || g.DriverMajor < 7) { if g.Library != "metal" && (g.Library != "cuda" || g.DriverMajor < 7) {
flashAttnEnabled = false flashAttnEnabled = false
} }
// mmap has issues with partial offloading on metal
if g.Library == "metal" &&
uint64(opts.NumGPU) > 0 &&
uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
opts.UseMMap = false
}
} }
if flashAttnEnabled { if flashAttnEnabled {
params = append(params, "--flash-attn") params = append(params, "--flash-attn")
} }
if !opts.UseMMap {
params = append(params, "--no-mmap")
}
if opts.UseMLock {
params = append(params, "--mlock")
}
if opts.UseNUMA {
params = append(params, "--numa")
}
numParallel := envconfig.NumParallel numParallel := envconfig.NumParallel
// TODO (jmorganca): multimodal models don't support parallel yet // TODO (jmorganca): multimodal models don't support parallel yet
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment