Unverified commit 5c191276, authored by Michael Yang and committed by GitHub

Merge pull request #5473 from ollama/mxyng/environ

fix: environ lookup
parents 71399aa6 85d9d73a
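
The change across the hunks below is uniform: envconfig settings are now read through functions (envconfig.Origins(), envconfig.Debug(), envconfig.MaxQueue(), and so on) that look up the environment at call time, rather than through package-level variables that envconfig.LoadConfig() had to populate up front. A minimal before/after sketch of that pattern, using hypothetical names rather than the real envconfig implementation:

package main

import (
    "fmt"
    "os"
)

// Old style: the environment is read once, at package init, so a later
// os.Setenv or t.Setenv is invisible without an explicit reload step.
var noPruneVar = os.Getenv("OLLAMA_NOPRUNE") != ""

// New style: every call consults the current environment.
func noPrune() bool {
    return os.Getenv("OLLAMA_NOPRUNE") != ""
}

func main() {
    os.Setenv("OLLAMA_NOPRUNE", "1")
    fmt.Println(noPruneVar) // false: captured before the environment changed
    fmt.Println(noPrune())  // true: looked up at call time
}

Because the lookup happens on every call, tests can rely on t.Setenv alone, which is why the LoadConfig() calls and envconfig imports disappear from the test files below.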
@@ -1053,7 +1053,7 @@ func (s *Server) GenerateRoutes() http.Handler {
     for _, prop := range openAIProperties {
         config.AllowHeaders = append(config.AllowHeaders, "x-stainless-"+prop)
     }
-    config.AllowOrigins = envconfig.AllowOrigins
+    config.AllowOrigins = envconfig.Origins()
     r := gin.Default()
     r.Use(
@@ -1098,7 +1098,7 @@ func (s *Server) GenerateRoutes() http.Handler {
 func Serve(ln net.Listener) error {
     level := slog.LevelInfo
-    if envconfig.Debug {
+    if envconfig.Debug() {
         level = slog.LevelDebug
     }
@@ -1126,7 +1126,7 @@ func Serve(ln net.Listener) error {
         return err
     }
-    if !envconfig.NoPrune {
+    if !envconfig.NoPrune() {
        // clean up unused layers and manifests
        if err := PruneLayers(); err != nil {
            return err
...
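The Serve hunk above now derives the log level from envconfig.Debug() at startup. A rough standalone sketch of that wiring, with debugEnabled as a hypothetical stand-in for the real accessor (this is not ollama code):

package main

import (
    "log/slog"
    "os"
    "strconv"
)

// debugEnabled plays the role of envconfig.Debug(): it consults the
// environment each time it is called.
func debugEnabled() bool {
    v, err := strconv.ParseBool(os.Getenv("OLLAMA_DEBUG"))
    return err == nil && v
}

func main() {
    // Mirrors the Serve hunk: pick the slog level from the accessor.
    level := slog.LevelInfo
    if debugEnabled() {
        level = slog.LevelDebug
    }

    handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: level})
    slog.SetDefault(slog.New(handler))

    slog.Debug("only emitted when OLLAMA_DEBUG is truthy")
    slog.Info("serving")
}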
@@ -15,7 +15,6 @@ import (
     "github.com/gin-gonic/gin"
     "github.com/ollama/ollama/api"
-    "github.com/ollama/ollama/envconfig"
     "github.com/ollama/ollama/llm"
 )
@@ -89,7 +88,6 @@ func TestCreateFromBin(t *testing.T) {
     p := t.TempDir()
     t.Setenv("OLLAMA_MODELS", p)
-    envconfig.LoadConfig()
     var s Server
     w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -117,7 +115,6 @@ func TestCreateFromModel(t *testing.T) {
     p := t.TempDir()
     t.Setenv("OLLAMA_MODELS", p)
-    envconfig.LoadConfig()
     var s Server
     w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -160,7 +157,6 @@ func TestCreateRemovesLayers(t *testing.T) {
     p := t.TempDir()
     t.Setenv("OLLAMA_MODELS", p)
-    envconfig.LoadConfig()
     var s Server
     w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -209,7 +205,6 @@ func TestCreateUnsetsSystem(t *testing.T) {
     p := t.TempDir()
     t.Setenv("OLLAMA_MODELS", p)
-    envconfig.LoadConfig()
     var s Server
     w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -267,7 +262,6 @@ func TestCreateMergeParameters(t *testing.T) {
     p := t.TempDir()
     t.Setenv("OLLAMA_MODELS", p)
-    envconfig.LoadConfig()
     var s Server
     w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -372,7 +366,6 @@ func TestCreateReplacesMessages(t *testing.T) {
     p := t.TempDir()
     t.Setenv("OLLAMA_MODELS", p)
-    envconfig.LoadConfig()
     var s Server
     w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -450,7 +443,6 @@ func TestCreateTemplateSystem(t *testing.T) {
     p := t.TempDir()
     t.Setenv("OLLAMA_MODELS", p)
-    envconfig.LoadConfig()
     var s Server
     w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -534,7 +526,6 @@ func TestCreateLicenses(t *testing.T) {
     p := t.TempDir()
     t.Setenv("OLLAMA_MODELS", p)
-    envconfig.LoadConfig()
     var s Server
     w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -582,7 +573,6 @@ func TestCreateDetectTemplate(t *testing.T) {
     p := t.TempDir()
     t.Setenv("OLLAMA_MODELS", p)
-    envconfig.LoadConfig()
     var s Server
     t.Run("matched", func(t *testing.T) {
...
@@ -10,7 +10,6 @@ import (
     "github.com/gin-gonic/gin"
     "github.com/ollama/ollama/api"
-    "github.com/ollama/ollama/envconfig"
     "github.com/ollama/ollama/types/model"
 )
@@ -19,7 +18,6 @@ func TestDelete(t *testing.T) {
     p := t.TempDir()
     t.Setenv("OLLAMA_MODELS", p)
-    envconfig.LoadConfig()
     var s Server
...
@@ -9,14 +9,12 @@ import (
     "github.com/gin-gonic/gin"
     "github.com/ollama/ollama/api"
-    "github.com/ollama/ollama/envconfig"
 )

 func TestList(t *testing.T) {
     gin.SetMode(gin.TestMode)
     t.Setenv("OLLAMA_MODELS", t.TempDir())
-    envconfig.LoadConfig()
     expectNames := []string{
         "mistral:7b-instruct-q4_0",
...
@@ -19,7 +19,6 @@ import (
     "github.com/stretchr/testify/require"
     "github.com/ollama/ollama/api"
-    "github.com/ollama/ollama/envconfig"
     "github.com/ollama/ollama/llm"
     "github.com/ollama/ollama/openai"
     "github.com/ollama/ollama/parser"
@@ -347,7 +346,6 @@ func Test_Routes(t *testing.T) {
     }
     t.Setenv("OLLAMA_MODELS", t.TempDir())
-    envconfig.LoadConfig()
     s := &Server{}
     router := s.GenerateRoutes()
@@ -378,7 +376,6 @@ func Test_Routes(t *testing.T) {
 func TestCase(t *testing.T) {
     t.Setenv("OLLAMA_MODELS", t.TempDir())
-    envconfig.LoadConfig()
     cases := []string{
         "mistral",
@@ -458,7 +455,6 @@ func TestCase(t *testing.T) {
 func TestShow(t *testing.T) {
     t.Setenv("OLLAMA_MODELS", t.TempDir())
-    envconfig.LoadConfig()
     var s Server
...
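All of the test hunks above make the same two-line change: the envconfig import and the envconfig.LoadConfig() call go away, because once settings are looked up at call time, t.Setenv is sufficient on its own, and Go restores the previous value when the test finishes. A hypothetical test illustrating why no reload step is needed (not one of the ollama tests):

package example

import (
    "os"
    "testing"
)

// modelsDir stands in for an envconfig-style accessor: it reads the
// environment on every call instead of caching a value at init.
func modelsDir() string {
    return os.Getenv("OLLAMA_MODELS")
}

func TestModelsDirFromEnv(t *testing.T) {
    p := t.TempDir()
    t.Setenv("OLLAMA_MODELS", p) // no envconfig.LoadConfig() equivalent required

    if got := modelsDir(); got != p {
        t.Fatalf("modelsDir() = %q, want %q", got, p)
    }
}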
@@ -5,9 +5,11 @@ import (
     "errors"
     "fmt"
     "log/slog"
+    "os"
     "reflect"
     "runtime"
     "sort"
+    "strconv"
     "strings"
     "sync"
     "time"
@@ -59,11 +61,12 @@ var defaultParallel = 4
 var ErrMaxQueue = fmt.Errorf("server busy, please try again. maximum pending requests exceeded")

 func InitScheduler(ctx context.Context) *Scheduler {
+    maxQueue := envconfig.MaxQueue()
     sched := &Scheduler{
-        pendingReqCh:  make(chan *LlmRequest, envconfig.MaxQueuedRequests),
-        finishedReqCh: make(chan *LlmRequest, envconfig.MaxQueuedRequests),
-        expiredCh:     make(chan *runnerRef, envconfig.MaxQueuedRequests),
-        unloadedCh:    make(chan interface{}, envconfig.MaxQueuedRequests),
+        pendingReqCh:  make(chan *LlmRequest, maxQueue),
+        finishedReqCh: make(chan *LlmRequest, maxQueue),
+        expiredCh:     make(chan *runnerRef, maxQueue),
+        unloadedCh:    make(chan interface{}, maxQueue),
         loaded:        make(map[string]*runnerRef),
         newServerFn:   llm.NewLlamaServer,
         getGpuFn:      gpu.GetGPUInfo,
@@ -126,7 +129,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
                slog.Debug("pending request cancelled or timed out, skipping scheduling")
                continue
            }
-           numParallel := envconfig.NumParallel
+           numParallel := int(envconfig.NumParallel())
            // TODO (jmorganca): multimodal models don't support parallel yet
            // see https://github.com/ollama/ollama/issues/4165
            if len(pending.model.ProjectorPaths) > 0 && numParallel != 1 {
@@ -148,7 +151,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
                    pending.useLoadedRunner(runner, s.finishedReqCh)
                    break
                }
-           } else if envconfig.MaxRunners > 0 && loadedCount >= envconfig.MaxRunners {
+           } else if envconfig.MaxRunners() > 0 && loadedCount >= int(envconfig.MaxRunners()) {
                slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
                runnerToExpire = s.findRunnerToUnload()
            } else {
@@ -161,7 +164,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
                    gpus = s.getGpuFn()
                }
-               if envconfig.MaxRunners <= 0 {
+               if envconfig.MaxRunners() <= 0 {
                    // No user specified MaxRunners, so figure out what automatic setting to use
                    // If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs
                    // if any GPU has unreliable free memory reporting, 1x the number of GPUs
@@ -173,11 +176,13 @@ func (s *Scheduler) processPending(ctx context.Context) {
                        }
                    }
                    if allReliable {
-                       envconfig.MaxRunners = defaultModelsPerGPU * len(gpus)
+                       // HACK
+                       os.Setenv("OLLAMA_MAX_LOADED_MODELS", strconv.Itoa(defaultModelsPerGPU*len(gpus)))
                        slog.Debug("updating default concurrency", "OLLAMA_MAX_LOADED_MODELS", envconfig.MaxRunners, "gpu_count", len(gpus))
                    } else {
+                       // HACK
+                       os.Setenv("OLLAMA_MAX_LOADED_MODELS", strconv.Itoa(len(gpus)))
                        slog.Info("one or more GPUs detected that are unable to accurately report free memory - disabling default concurrency")
-                       envconfig.MaxRunners = len(gpus)
                    }
                }
@@ -404,7 +409,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList,
     if numParallel < 1 {
         numParallel = 1
     }
-    sessionDuration := envconfig.KeepAlive
+    sessionDuration := envconfig.KeepAlive()
     if req.sessionDuration != nil {
         sessionDuration = req.sessionDuration.Duration
     }
@@ -699,7 +704,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList,
     // First attempt to fit the model into a single GPU
     for _, p := range numParallelToTry {
         req.opts.NumCtx = req.origNumCtx * p
-       if !envconfig.SchedSpread {
+       if !envconfig.SchedSpread() {
            for _, g := range sgl {
                if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
                    slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
...
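In sched.go, InitScheduler now reads envconfig.MaxQueue() once into a local and sizes all four channels from it, and the computed concurrency default is pushed back through os.Setenv (marked HACK in the diff) so that later envconfig.MaxRunners() lookups observe it. A small sketch of the channel-sizing half of that change, with queueDepth and its fallback value as assumed stand-ins for the real accessor:

package main

import (
    "fmt"
    "os"
    "strconv"
)

// queueDepth plays the role of envconfig.MaxQueue(): read at call time,
// with a fallback when the variable is unset or unparsable.
func queueDepth() int {
    if v, err := strconv.Atoi(os.Getenv("OLLAMA_MAX_QUEUE")); err == nil && v > 0 {
        return v
    }
    return 512
}

type request struct{ model string }

func main() {
    // Snapshot the setting once per scheduler construction and reuse the
    // local, mirroring the maxQueue := envconfig.MaxQueue() line above.
    maxQueue := queueDepth()
    pending := make(chan *request, maxQueue)
    finished := make(chan *request, maxQueue)

    fmt.Println(cap(pending), cap(finished))
}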
@@ -12,7 +12,6 @@ import (
     "github.com/ollama/ollama/api"
     "github.com/ollama/ollama/app/lifecycle"
-    "github.com/ollama/ollama/envconfig"
     "github.com/ollama/ollama/format"
     "github.com/ollama/ollama/gpu"
     "github.com/ollama/ollama/llm"
@@ -272,7 +271,7 @@ func TestRequestsMultipleLoadedModels(t *testing.T) {
     c.req.opts.NumGPU = 0 // CPU load, will be allowed
     d := newScenarioRequest(t, ctx, "ollama-model-3c", 30, nil) // Needs prior unloaded
-    envconfig.MaxRunners = 1
+    t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
     s.newServerFn = a.newServer
     slog.Info("a")
     s.pendingReqCh <- a.req
@@ -291,7 +290,7 @@ func TestRequestsMultipleLoadedModels(t *testing.T) {
     require.Len(t, s.loaded, 1)
     s.loadedMu.Unlock()
-    envconfig.MaxRunners = 0
+    t.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
     s.newServerFn = b.newServer
     slog.Info("b")
     s.pendingReqCh <- b.req
@@ -362,7 +361,7 @@ func TestGetRunner(t *testing.T) {
     a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 2 * time.Millisecond})
     b := newScenarioRequest(t, ctx, "ollama-model-1b", 10, &api.Duration{Duration: 2 * time.Millisecond})
     c := newScenarioRequest(t, ctx, "ollama-model-1c", 10, &api.Duration{Duration: 2 * time.Millisecond})
-    envconfig.MaxQueuedRequests = 1
+    t.Setenv("OLLAMA_MAX_QUEUE", "1")
     s := InitScheduler(ctx)
     s.getGpuFn = getGpuFn
     s.getCpuFn = getCpuFn
...