"torchvision/git@developer.sourcefind.cn:OpenDAS/vision.git" did not exist on "b82d8833c2a872acf1d73541032c57762dc5f0cc"
Commit bd6a7d5e authored and committed by Jesse Gross

ollamarunner: Pass runner performance parameters to backends

Currently the following parameters are in the runner but not used:
 - numGPULayers
 - mainGPU
 - threads
 - tensorSplit

This passes them through to the backend, where they would actually
be used. However, the GGML backend does not yet do anything with
them.
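
To illustrate the flow end to end, here is a minimal, self-contained sketch of the pattern this commit introduces: flag values are collected into a params struct, handed to the registered backend constructor, and are then visible inside the backend. The types mirror the diff below, but the real ml.Backend interface is reduced to a single method for brevity, and fakeBackend is a stand-in invented for this example, not part of the commit.

package main

import (
	"fmt"
	"os"
)

// BackendParams mirrors the struct added in this commit.
type BackendParams struct {
	NumThreads   int       // CPU threads to use
	MainGPU      int       // index of the primary GPU
	NumGPULayers int       // layers to offload to GPUs
	TensorSplit  []float32 // per-GPU proportion of the model
}

// Backend stands in for ml.Backend, reduced to one method for brevity.
type Backend interface {
	SystemInfo() string
}

var backends = make(map[string]func(*os.File, BackendParams) (Backend, error))

// RegisterBackend and NewBackend follow the new signatures from the diff below.
func RegisterBackend(name string, f func(*os.File, BackendParams) (Backend, error)) {
	if _, ok := backends[name]; ok {
		panic("backend: backend already registered")
	}
	backends[name] = f
}

func NewBackend(f *os.File, params BackendParams) (Backend, error) {
	if backend, ok := backends["ggml"]; ok {
		return backend(f, params)
	}
	return nil, fmt.Errorf("unsupported backend")
}

// fakeBackend is invented for this sketch; it records the params so we can
// confirm they arrived intact.
type fakeBackend struct{ params BackendParams }

func (b *fakeBackend) SystemInfo() string {
	return fmt.Sprintf("threads=%d main_gpu=%d gpu_layers=%d",
		b.params.NumThreads, b.params.MainGPU, b.params.NumGPULayers)
}

func main() {
	RegisterBackend("ggml", func(f *os.File, params BackendParams) (Backend, error) {
		return &fakeBackend{params: params}, nil
	})

	// The runner would build params from its command-line flags.
	b, err := NewBackend(nil, BackendParams{NumThreads: 8, NumGPULayers: 32})
	if err != nil {
		panic(err)
	}
	fmt.Println(b.SystemInfo()) // threads=8 main_gpu=0 gpu_layers=32
}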
parent 14b5a9a1
@@ -26,9 +26,24 @@ type Backend interface {
 	SystemInfo() string
 }
 
-var backends = make(map[string]func(*os.File) (Backend, error))
+// BackendParams controls how the backend loads and executes models
+type BackendParams struct {
+	// NumThreads sets the number of threads to use if running on the CPU
+	NumThreads int
+
+	// MainGPU is the index of the primary GPU to use
+	MainGPU int
+
+	// NumGPULayers is the number of layers to offload to GPUs
+	NumGPULayers int
+
+	// TensorSplit is the fraction of the model to offload to each GPU
+	TensorSplit []float32
+}
+
+var backends = make(map[string]func(*os.File, BackendParams) (Backend, error))
 
-func RegisterBackend(name string, f func(*os.File) (Backend, error)) {
+func RegisterBackend(name string, f func(*os.File, BackendParams) (Backend, error)) {
 	if _, ok := backends[name]; ok {
 		panic("backend: backend already registered")
 	}
@@ -36,9 +51,9 @@ func RegisterBackend(name string, f func(*os.File) (Backend, error)) {
 	backends[name] = f
 }
 
-func NewBackend(f *os.File) (Backend, error) {
+func NewBackend(f *os.File, params BackendParams) (Backend, error) {
 	if backend, ok := backends["ggml"]; ok {
-		return backend(f)
+		return backend(f, params)
 	}
 
 	return nil, fmt.Errorf("unsupported backend")
...
@@ -84,7 +84,7 @@ type Backend struct {
 	tensors map[string]*Context
 }
 
-func New(r *os.File) (ml.Backend, error) {
+func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
 	meta, n, err := fs.Decode(r, -1)
 	if err != nil {
 		return nil, err
...
@@ -70,14 +70,14 @@ func Register(name string, f func(ml.Config) (Model, error)) {
 }
 
 // New initializes a new model instance with the provided configuration based on the metadata in the model file
-func New(modelPath string) (Model, error) {
+func New(modelPath string, params ml.BackendParams) (Model, error) {
 	r, err := os.Open(modelPath)
 	if err != nil {
 		return nil, err
 	}
 	defer r.Close()
 
-	b, err := ml.NewBackend(r)
+	b, err := ml.NewBackend(r, params)
 	if err != nil {
 		return nil, err
 	}
...
@@ -25,6 +25,7 @@ import (
 	"golang.org/x/sync/semaphore"
 
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/model"
 	"github.com/ollama/ollama/runner/common"
 	"github.com/ollama/ollama/sample"
@@ -801,6 +802,7 @@ func (m *multiLPath) String() string {
 func (s *Server) loadModel(
 	mpath string,
+	params ml.BackendParams,
 	lpath multiLPath,
 	parallel int,
 	kvCacheType string,
@@ -808,12 +810,12 @@ func (s *Server) loadModel(
 	multiUserCache bool,
 ) {
 	var err error
-	s.model, err = model.New(mpath)
+	s.model, err = model.New(mpath, params)
 	if err != nil {
 		panic(err)
 	}
 
-	slog.Info("system", "info", s.model.Backend().SystemInfo() /* "threads", *threads */)
+	slog.Info("system", "info", s.model.Backend().SystemInfo(), "threads", params.NumThreads)
 
 	// TODO(jessegross): LoRA loading
 	if lpath.String() != "" {
@@ -843,17 +845,17 @@ func Execute(args []string) error {
 	mpath := fs.String("model", "", "Path to model binary file")
 	parallel := fs.Int("parallel", 1, "Number of sequences to handle simultaneously")
 	batchSize := fs.Int("batch-size", 512, "Batch size")
-	_ = fs.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
-	_ = fs.Int("main-gpu", 0, "Main GPU")
+	numGPULayers := fs.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
+	mainGPU := fs.Int("main-gpu", 0, "Main GPU")
 	_ = fs.Bool("flash-attn", false, "Enable flash attention")
 	kvSize := fs.Int("ctx-size", 2048, "Context (or KV cache) size")
 	kvCacheType := fs.String("kv-cache-type", "", "quantization type for KV cache (default: f16)")
 	port := fs.Int("port", 8080, "Port to expose the server on")
-	_ = fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
+	threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
 	verbose := fs.Bool("verbose", false, "verbose output (default: disabled)")
 	_ = fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
 	_ = fs.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
-	_ = fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
+	tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
 	multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
 
 	var lpaths multiLPath
@@ -890,15 +892,11 @@ func Execute(args []string) error {
 	}
 
 	// TODO(jessegross): Parameters that need to be implemented:
-	// n-gpu-layers
-	// main-gpu
 	// flash-attn
-	// threads
 	// no-mmap
 	// mlock
-	// tensor-split
 
-	/*var tensorSplitFloats []float32
+	var tensorSplitFloats []float32
 	if *tensorSplit != "" {
 		stringFloats := regexp.MustCompile(",").Split(*tensorSplit, -1)
@@ -907,10 +905,17 @@ func Execute(args []string) error {
 		f, _ := strconv.ParseFloat(s, 32)
 		tensorSplitFloats = append(tensorSplitFloats, float32(f))
 	}
-	}*/
+	}
+
+	params := ml.BackendParams{
+		NumThreads:   *threads,
+		NumGPULayers: *numGPULayers,
+		MainGPU:      *mainGPU,
+		TensorSplit:  tensorSplitFloats,
+	}
 
 	server.ready.Add(1)
-	go server.loadModel(*mpath, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache)
+	go server.loadModel(*mpath, params, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache)
 
 	server.cond = sync.NewCond(&server.mu)
...
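
For reference, the tensor-split parsing that this commit un-comments converts the comma-separated --tensor-split flag into a []float32 of proportions. A small standalone sketch of the same logic; parseTensorSplit is a name invented for this example, the real code inlines the loop in Execute:

package main

import (
	"fmt"
	"regexp"
	"strconv"
)

// parseTensorSplit repeats the parsing enabled above: a comma-separated
// list of proportions becomes a []float32. Unparseable fields become 0,
// matching the ignored ParseFloat error in the original.
func parseTensorSplit(s string) []float32 {
	var out []float32
	if s != "" {
		for _, field := range regexp.MustCompile(",").Split(s, -1) {
			f, _ := strconv.ParseFloat(field, 32)
			out = append(out, float32(f))
		}
	}
	return out
}

func main() {
	// "3,1" expresses a 3:1 proportion, e.g. three quarters of the model
	// on one GPU and one quarter on another.
	fmt.Println(parseTensorSplit("3,1")) // [3 1]
}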