Unverified commit 5c191276 authored by Michael Yang, committed by GitHub

Merge pull request #5473 from ollama/mxyng/environ

fix: environ lookup
parents 71399aa6 85d9d73a
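
The diff below switches envconfig reads from package-level variables (populated once by `envconfig.LoadConfig`) to accessor functions such as `envconfig.Origins()`, `envconfig.Debug()`, `envconfig.MaxQueue()` and `envconfig.MaxRunners()`, which read the environment at lookup time. The tests drop the `LoadConfig` call and configure behaviour through `t.Setenv` instead. Below is a minimal sketch of that accessor pattern; the `Uint` helper and its default are illustrative rather than ollama's actual envconfig code, though `MaxRunners` and `OLLAMA_MAX_LOADED_MODELS` mirror names that appear in the diff.

```go
// Sketch of an env-var-backed accessor; illustrative, not the real envconfig package.
package main

import (
	"fmt"
	"os"
	"strconv"
)

// Uint returns a function that re-reads the environment on every call,
// so a later os.Setenv (or t.Setenv in a test) is picked up without a
// separate LoadConfig step.
func Uint(key string, fallback uint) func() uint {
	return func() uint {
		if s := os.Getenv(key); s != "" {
			if n, err := strconv.ParseUint(s, 10, 32); err == nil {
				return uint(n)
			}
		}
		return fallback
	}
}

// MaxRunners mirrors the name used in the diff; 0 means "no user
// override, let the scheduler pick a default".
var MaxRunners = Uint("OLLAMA_MAX_LOADED_MODELS", 0)

func main() {
	fmt.Println(MaxRunners()) // 0 unless the variable is already set

	os.Setenv("OLLAMA_MAX_LOADED_MODELS", "4")
	fmt.Println(MaxRunners()) // 4: the accessor sees the new value immediately
}
```

Because the closure consults `os.Getenv` on every call, the scheduler's `os.Setenv` write-back and the tests' `t.Setenv` overrides shown below take effect without any reload step.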
@@ -1053,7 +1053,7 @@ func (s *Server) GenerateRoutes() http.Handler {
 for _, prop := range openAIProperties {
 config.AllowHeaders = append(config.AllowHeaders, "x-stainless-"+prop)
 }
-config.AllowOrigins = envconfig.AllowOrigins
+config.AllowOrigins = envconfig.Origins()
 r := gin.Default()
 r.Use(
@@ -1098,7 +1098,7 @@ func (s *Server) GenerateRoutes() http.Handler {
 func Serve(ln net.Listener) error {
 level := slog.LevelInfo
-if envconfig.Debug {
+if envconfig.Debug() {
 level = slog.LevelDebug
 }
@@ -1126,7 +1126,7 @@ func Serve(ln net.Listener) error {
 return err
 }
-if !envconfig.NoPrune {
+if !envconfig.NoPrune() {
 // clean up unused layers and manifests
 if err := PruneLayers(); err != nil {
 return err
@@ -15,7 +15,6 @@ import (
 "github.com/gin-gonic/gin"
 "github.com/ollama/ollama/api"
-"github.com/ollama/ollama/envconfig"
 "github.com/ollama/ollama/llm"
 )
@@ -89,7 +88,6 @@ func TestCreateFromBin(t *testing.T) {
 p := t.TempDir()
 t.Setenv("OLLAMA_MODELS", p)
-envconfig.LoadConfig()
 var s Server
 w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -117,7 +115,6 @@ func TestCreateFromModel(t *testing.T) {
 p := t.TempDir()
 t.Setenv("OLLAMA_MODELS", p)
-envconfig.LoadConfig()
 var s Server
 w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -160,7 +157,6 @@ func TestCreateRemovesLayers(t *testing.T) {
 p := t.TempDir()
 t.Setenv("OLLAMA_MODELS", p)
-envconfig.LoadConfig()
 var s Server
 w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -209,7 +205,6 @@ func TestCreateUnsetsSystem(t *testing.T) {
 p := t.TempDir()
 t.Setenv("OLLAMA_MODELS", p)
-envconfig.LoadConfig()
 var s Server
 w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -267,7 +262,6 @@ func TestCreateMergeParameters(t *testing.T) {
 p := t.TempDir()
 t.Setenv("OLLAMA_MODELS", p)
-envconfig.LoadConfig()
 var s Server
 w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -372,7 +366,6 @@ func TestCreateReplacesMessages(t *testing.T) {
 p := t.TempDir()
 t.Setenv("OLLAMA_MODELS", p)
-envconfig.LoadConfig()
 var s Server
 w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -450,7 +443,6 @@ func TestCreateTemplateSystem(t *testing.T) {
 p := t.TempDir()
 t.Setenv("OLLAMA_MODELS", p)
-envconfig.LoadConfig()
 var s Server
 w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -534,7 +526,6 @@ func TestCreateLicenses(t *testing.T) {
 p := t.TempDir()
 t.Setenv("OLLAMA_MODELS", p)
-envconfig.LoadConfig()
 var s Server
 w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -582,7 +573,6 @@ func TestCreateDetectTemplate(t *testing.T) {
 p := t.TempDir()
 t.Setenv("OLLAMA_MODELS", p)
-envconfig.LoadConfig()
 var s Server
 t.Run("matched", func(t *testing.T) {
@@ -10,7 +10,6 @@ import (
 "github.com/gin-gonic/gin"
 "github.com/ollama/ollama/api"
-"github.com/ollama/ollama/envconfig"
 "github.com/ollama/ollama/types/model"
 )
@@ -19,7 +18,6 @@ func TestDelete(t *testing.T) {
 p := t.TempDir()
 t.Setenv("OLLAMA_MODELS", p)
-envconfig.LoadConfig()
 var s Server
@@ -9,14 +9,12 @@ import (
 "github.com/gin-gonic/gin"
 "github.com/ollama/ollama/api"
-"github.com/ollama/ollama/envconfig"
 )
 func TestList(t *testing.T) {
 gin.SetMode(gin.TestMode)
 t.Setenv("OLLAMA_MODELS", t.TempDir())
-envconfig.LoadConfig()
 expectNames := []string{
 "mistral:7b-instruct-q4_0",
@@ -19,7 +19,6 @@ import (
 "github.com/stretchr/testify/require"
 "github.com/ollama/ollama/api"
-"github.com/ollama/ollama/envconfig"
 "github.com/ollama/ollama/llm"
 "github.com/ollama/ollama/openai"
 "github.com/ollama/ollama/parser"
@@ -347,7 +346,6 @@ func Test_Routes(t *testing.T) {
 }
 t.Setenv("OLLAMA_MODELS", t.TempDir())
-envconfig.LoadConfig()
 s := &Server{}
 router := s.GenerateRoutes()
@@ -378,7 +376,6 @@ func Test_Routes(t *testing.T) {
 func TestCase(t *testing.T) {
 t.Setenv("OLLAMA_MODELS", t.TempDir())
-envconfig.LoadConfig()
 cases := []string{
 "mistral",
@@ -458,7 +455,6 @@ func TestCase(t *testing.T) {
 func TestShow(t *testing.T) {
 t.Setenv("OLLAMA_MODELS", t.TempDir())
-envconfig.LoadConfig()
 var s Server
@@ -5,9 +5,11 @@ import (
 "errors"
 "fmt"
 "log/slog"
+"os"
 "reflect"
 "runtime"
 "sort"
+"strconv"
 "strings"
 "sync"
 "time"
@@ -59,11 +61,12 @@ var defaultParallel = 4
 var ErrMaxQueue = fmt.Errorf("server busy, please try again. maximum pending requests exceeded")
 func InitScheduler(ctx context.Context) *Scheduler {
+maxQueue := envconfig.MaxQueue()
 sched := &Scheduler{
-pendingReqCh: make(chan *LlmRequest, envconfig.MaxQueuedRequests),
-finishedReqCh: make(chan *LlmRequest, envconfig.MaxQueuedRequests),
-expiredCh: make(chan *runnerRef, envconfig.MaxQueuedRequests),
-unloadedCh: make(chan interface{}, envconfig.MaxQueuedRequests),
+pendingReqCh: make(chan *LlmRequest, maxQueue),
+finishedReqCh: make(chan *LlmRequest, maxQueue),
+expiredCh: make(chan *runnerRef, maxQueue),
+unloadedCh: make(chan interface{}, maxQueue),
 loaded: make(map[string]*runnerRef),
 newServerFn: llm.NewLlamaServer,
 getGpuFn: gpu.GetGPUInfo,
@@ -126,7 +129,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 slog.Debug("pending request cancelled or timed out, skipping scheduling")
 continue
 }
-numParallel := envconfig.NumParallel
+numParallel := int(envconfig.NumParallel())
 // TODO (jmorganca): multimodal models don't support parallel yet
 // see https://github.com/ollama/ollama/issues/4165
 if len(pending.model.ProjectorPaths) > 0 && numParallel != 1 {
@@ -148,7 +151,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 pending.useLoadedRunner(runner, s.finishedReqCh)
 break
 }
-} else if envconfig.MaxRunners > 0 && loadedCount >= envconfig.MaxRunners {
+} else if envconfig.MaxRunners() > 0 && loadedCount >= int(envconfig.MaxRunners()) {
 slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
 runnerToExpire = s.findRunnerToUnload()
 } else {
@@ -161,7 +164,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 gpus = s.getGpuFn()
 }
-if envconfig.MaxRunners <= 0 {
+if envconfig.MaxRunners() <= 0 {
 // No user specified MaxRunners, so figure out what automatic setting to use
 // If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs
 // if any GPU has unreliable free memory reporting, 1x the number of GPUs
@@ -173,11 +176,13 @@ func (s *Scheduler) processPending(ctx context.Context) {
 }
 }
 if allReliable {
-envconfig.MaxRunners = defaultModelsPerGPU * len(gpus)
+// HACK
+os.Setenv("OLLAMA_MAX_LOADED_MODELS", strconv.Itoa(defaultModelsPerGPU*len(gpus)))
 slog.Debug("updating default concurrency", "OLLAMA_MAX_LOADED_MODELS", envconfig.MaxRunners, "gpu_count", len(gpus))
 } else {
+// HACK
+os.Setenv("OLLAMA_MAX_LOADED_MODELS", strconv.Itoa(len(gpus)))
 slog.Info("one or more GPUs detected that are unable to accurately report free memory - disabling default concurrency")
-envconfig.MaxRunners = len(gpus)
 }
 }
@@ -404,7 +409,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList,
 if numParallel < 1 {
 numParallel = 1
 }
-sessionDuration := envconfig.KeepAlive
+sessionDuration := envconfig.KeepAlive()
 if req.sessionDuration != nil {
 sessionDuration = req.sessionDuration.Duration
 }
@@ -699,7 +704,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
 // First attempt to fit the model into a single GPU
 for _, p := range numParallelToTry {
 req.opts.NumCtx = req.origNumCtx * p
-if !envconfig.SchedSpread {
+if !envconfig.SchedSpread() {
 for _, g := range sgl {
 if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
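A note on the `// HACK` lines above: with `MaxRunners()` now reading `OLLAMA_MAX_LOADED_MODELS` at call time, the scheduler can no longer store its auto-computed concurrency in a package variable, so it writes the value back into the process environment where later lookups will find it. A self-contained sketch of that round trip; `maxRunners`, `defaultModelsPerGPU` and `gpuCount` here are illustrative stand-ins, not the scheduler's real code:

```go
package main

import (
	"fmt"
	"os"
	"strconv"
)

// maxRunners stands in for envconfig.MaxRunners(): it reads the
// environment on every call instead of caching a value at startup.
func maxRunners() int {
	n, _ := strconv.Atoi(os.Getenv("OLLAMA_MAX_LOADED_MODELS"))
	return n
}

func main() {
	const defaultModelsPerGPU = 3 // illustrative value
	gpuCount := 2                 // illustrative GPU count

	if maxRunners() <= 0 {
		// No user override: persist the computed default where the
		// accessor will find it on subsequent calls.
		os.Setenv("OLLAMA_MAX_LOADED_MODELS", strconv.Itoa(defaultModelsPerGPU*gpuCount))
	}

	// Prints 6 when the variable was not already set in the environment.
	fmt.Println("effective max loaded models:", maxRunners())
}
```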
@@ -12,7 +12,6 @@ import (
 "github.com/ollama/ollama/api"
 "github.com/ollama/ollama/app/lifecycle"
-"github.com/ollama/ollama/envconfig"
 "github.com/ollama/ollama/format"
 "github.com/ollama/ollama/gpu"
 "github.com/ollama/ollama/llm"
@@ -272,7 +271,7 @@ func TestRequestsMultipleLoadedModels(t *testing.T) {
 c.req.opts.NumGPU = 0 // CPU load, will be allowed
 d := newScenarioRequest(t, ctx, "ollama-model-3c", 30, nil) // Needs prior unloaded
-envconfig.MaxRunners = 1
+t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
 s.newServerFn = a.newServer
 slog.Info("a")
 s.pendingReqCh <- a.req
@@ -291,7 +290,7 @@ func TestRequestsMultipleLoadedModels(t *testing.T) {
 require.Len(t, s.loaded, 1)
 s.loadedMu.Unlock()
-envconfig.MaxRunners = 0
+t.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
 s.newServerFn = b.newServer
 slog.Info("b")
 s.pendingReqCh <- b.req
@@ -362,7 +361,7 @@ func TestGetRunner(t *testing.T) {
 a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 2 * time.Millisecond})
 b := newScenarioRequest(t, ctx, "ollama-model-1b", 10, &api.Duration{Duration: 2 * time.Millisecond})
 c := newScenarioRequest(t, ctx, "ollama-model-1c", 10, &api.Duration{Duration: 2 * time.Millisecond})
-envconfig.MaxQueuedRequests = 1
+t.Setenv("OLLAMA_MAX_QUEUE", "1")
 s := InitScheduler(ctx)
 s.getGpuFn = getGpuFn
 s.getCpuFn = getCpuFn
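The test changes follow the same shift: rather than assigning to package globals such as `envconfig.MaxRunners`, which leaked state between tests, they set the underlying environment variable with `t.Setenv`, which the accessors observe and which the testing package restores automatically when the test ends. A minimal sketch of the pattern under the same assumptions as the sketches above; `maxRunners` and `TestMaxRunnersOverride` are hypothetical, not tests from this change:

```go
package sketch

import (
	"os"
	"strconv"
	"testing"
)

// maxRunners is an illustrative stand-in for envconfig.MaxRunners().
func maxRunners() int {
	n, _ := strconv.Atoi(os.Getenv("OLLAMA_MAX_LOADED_MODELS"))
	return n
}

func TestMaxRunnersOverride(t *testing.T) {
	// t.Setenv restores the previous value when the test finishes,
	// so no manual cleanup (or global reset) is needed.
	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")

	if got := maxRunners(); got != 1 {
		t.Fatalf("expected at most 1 loaded model, got %d", got)
	}
}
```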