Commit 55cd3ddc authored by Michael Yang

bool

parent 66fe77f0
@@ -157,7 +157,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 		return err
 	}
 
-	if envconfig.NoHistory {
+	if envconfig.NoHistory() {
 		scanner.HistoryDisable()
 	}
...
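Each boolean setting is now read through a getter function instead of a package-level variable populated by LoadConfig, which is why this call site gains parentheses. A minimal, self-contained sketch of the pattern (a simplification for illustration; the real helper is the envconfig.Bool function introduced in the next file):

```go
package main

import (
	"fmt"
	"os"
	"strconv"
	"strings"
)

// Bool returns a getter that re-reads the environment variable k on
// every call, so callers no longer depend on LoadConfig having run.
func Bool(k string) func() bool {
	return func() bool {
		// Trim surrounding quotes and spaces, mirroring getenv.
		if s := strings.Trim(os.Getenv(k), "\"' "); s != "" {
			b, err := strconv.ParseBool(s)
			if err != nil {
				return true // any non-empty, non-parseable value counts as true
			}
			return b
		}
		return false
	}
}

var NoHistory = Bool("OLLAMA_NOHISTORY")

func main() {
	os.Setenv("OLLAMA_NOHISTORY", "1")
	fmt.Println(NoHistory()) // true; note the call parentheses at the use site
}
```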
@@ -17,21 +17,6 @@ import (
 
 var ErrInvalidHostPort = errors.New("invalid port specified in OLLAMA_HOST")
 
-// Debug returns true if the OLLAMA_DEBUG environment variable is set to a truthy value.
-func Debug() bool {
-	if s := clean("OLLAMA_DEBUG"); s != "" {
-		b, err := strconv.ParseBool(s)
-		if err != nil {
-			// non-empty value is truthy
-			return true
-		}
-		return b
-	}
-	return false
-}
-
 // Host returns the scheme and host. Host can be configured via the OLLAMA_HOST environment variable.
 // Default is scheme "http" and host "127.0.0.1:11434"
 func Host() *url.URL {
@@ -77,7 +62,7 @@ func Host() *url.URL {
 
 // Origins returns a list of allowed origins. Origins can be configured via the OLLAMA_ORIGINS environment variable.
 func Origins() (origins []string) {
-	if s := clean("OLLAMA_ORIGINS"); s != "" {
+	if s := getenv("OLLAMA_ORIGINS"); s != "" {
 		origins = strings.Split(s, ",")
 	}
@@ -114,9 +99,37 @@ func Models() string {
 	return filepath.Join(home, ".ollama", "models")
 }
 
+func Bool(k string) func() bool {
+	return func() bool {
+		if s := getenv(k); s != "" {
+			b, err := strconv.ParseBool(s)
+			if err != nil {
+				return true
+			}
+			return b
+		}
+		return false
+	}
+}
+
+var (
+	// Debug enabled additional debug information.
+	Debug = Bool("OLLAMA_DEBUG")
+	// FlashAttention enables the experimental flash attention feature.
+	FlashAttention = Bool("OLLAMA_FLASH_ATTENTION")
+	// NoHistory disables readline history.
+	NoHistory = Bool("OLLAMA_NOHISTORY")
+	// NoPrune disables pruning of model blobs on startup.
+	NoPrune = Bool("OLLAMA_NOPRUNE")
+	// SchedSpread allows scheduling models across all GPUs.
+	SchedSpread = Bool("OLLAMA_SCHED_SPREAD")
+	// IntelGPU enables experimental Intel GPU detection.
+	IntelGPU = Bool("OLLAMA_INTEL_GPU")
+)
+
 var (
-	// Experimental flash attention
-	FlashAttention bool
 	// Set via OLLAMA_KEEP_ALIVE in the environment
 	KeepAlive time.Duration
 	// Set via OLLAMA_LLM_LIBRARY in the environment
@@ -125,22 +138,12 @@ var (
 	MaxRunners int
 	// Set via OLLAMA_MAX_QUEUE in the environment
 	MaxQueuedRequests int
-	// Set via OLLAMA_MODELS in the environment
-	ModelsDir string
-	// Set via OLLAMA_NOHISTORY in the environment
-	NoHistory bool
-	// Set via OLLAMA_NOPRUNE in the environment
-	NoPrune bool
 	// Set via OLLAMA_NUM_PARALLEL in the environment
 	NumParallel int
 	// Set via OLLAMA_RUNNERS_DIR in the environment
 	RunnersDir string
-	// Set via OLLAMA_SCHED_SPREAD in the environment
-	SchedSpread bool
 	// Set via OLLAMA_TMPDIR in the environment
 	TmpDir string
-	// Set via OLLAMA_INTEL_GPU in the environment
-	IntelGpu bool
 	// Set via CUDA_VISIBLE_DEVICES in the environment
 	CudaVisibleDevices string
@@ -163,19 +166,19 @@ type EnvVar struct {
 
 func AsMap() map[string]EnvVar {
 	ret := map[string]EnvVar{
 		"OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
-		"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention, "Enabled flash attention"},
+		"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
 		"OLLAMA_HOST": {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
 		"OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive, "The duration that models stay loaded in memory (default \"5m\")"},
 		"OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"},
 		"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"},
 		"OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"},
 		"OLLAMA_MODELS": {"OLLAMA_MODELS", Models(), "The path to the models directory"},
-		"OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"},
-		"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"},
+		"OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory(), "Do not preserve readline history"},
+		"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
 		"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests"},
 		"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
 		"OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"},
-		"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"},
+		"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
 		"OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir, "Location for temporary files"},
 	}
 	if runtime.GOOS != "darwin" {
@@ -184,7 +187,7 @@ func AsMap() map[string]EnvVar {
 		ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices, "Set which AMD devices are visible"}
 		ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal, "Set which AMD devices are visible"}
 		ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion, "Override the gfx used for all detected AMD GPUs"}
-		ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGpu, "Enable experimental Intel GPU detection"}
+		ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"}
 	}
 	return ret
 }
@@ -197,8 +200,8 @@ func Values() map[string]string {
 	return vals
 }
 
-// Clean quotes and spaces from the value
-func clean(key string) string {
+// getenv returns an environment variable stripped of leading and trailing quotes or spaces
+func getenv(key string) string {
 	return strings.Trim(os.Getenv(key), "\"' ")
 }
@@ -213,14 +216,7 @@ func init() {
 }
 
 func LoadConfig() {
-	if fa := clean("OLLAMA_FLASH_ATTENTION"); fa != "" {
-		d, err := strconv.ParseBool(fa)
-		if err == nil {
-			FlashAttention = d
-		}
-	}
-
-	RunnersDir = clean("OLLAMA_RUNNERS_DIR")
+	RunnersDir = getenv("OLLAMA_RUNNERS_DIR")
 	if runtime.GOOS == "windows" && RunnersDir == "" {
 		// On Windows we do not carry the payloads inside the main executable
 		appExe, err := os.Executable()
@@ -256,11 +252,11 @@ func LoadConfig() {
 		}
 	}
 
-	TmpDir = clean("OLLAMA_TMPDIR")
-	LLMLibrary = clean("OLLAMA_LLM_LIBRARY")
+	TmpDir = getenv("OLLAMA_TMPDIR")
+	LLMLibrary = getenv("OLLAMA_LLM_LIBRARY")
 
-	if onp := clean("OLLAMA_NUM_PARALLEL"); onp != "" {
+	if onp := getenv("OLLAMA_NUM_PARALLEL"); onp != "" {
 		val, err := strconv.Atoi(onp)
 		if err != nil {
 			slog.Error("invalid setting, ignoring", "OLLAMA_NUM_PARALLEL", onp, "error", err)
@@ -269,24 +265,7 @@ func LoadConfig() {
 		}
 	}
 
-	if nohistory := clean("OLLAMA_NOHISTORY"); nohistory != "" {
-		NoHistory = true
-	}
-
-	if spread := clean("OLLAMA_SCHED_SPREAD"); spread != "" {
-		s, err := strconv.ParseBool(spread)
-		if err == nil {
-			SchedSpread = s
-		} else {
-			SchedSpread = true
-		}
-	}
-
-	if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" {
-		NoPrune = true
-	}
-
-	maxRunners := clean("OLLAMA_MAX_LOADED_MODELS")
+	maxRunners := getenv("OLLAMA_MAX_LOADED_MODELS")
 	if maxRunners != "" {
 		m, err := strconv.Atoi(maxRunners)
 		if err != nil {
@@ -305,20 +284,16 @@ func LoadConfig() {
 		}
 	}
 
-	ka := clean("OLLAMA_KEEP_ALIVE")
+	ka := getenv("OLLAMA_KEEP_ALIVE")
 	if ka != "" {
 		loadKeepAlive(ka)
 	}
 
-	if set, err := strconv.ParseBool(clean("OLLAMA_INTEL_GPU")); err == nil {
-		IntelGpu = set
-	}
-
-	CudaVisibleDevices = clean("CUDA_VISIBLE_DEVICES")
-	HipVisibleDevices = clean("HIP_VISIBLE_DEVICES")
-	RocrVisibleDevices = clean("ROCR_VISIBLE_DEVICES")
-	GpuDeviceOrdinal = clean("GPU_DEVICE_ORDINAL")
-	HsaOverrideGfxVersion = clean("HSA_OVERRIDE_GFX_VERSION")
+	CudaVisibleDevices = getenv("CUDA_VISIBLE_DEVICES")
+	HipVisibleDevices = getenv("HIP_VISIBLE_DEVICES")
+	RocrVisibleDevices = getenv("ROCR_VISIBLE_DEVICES")
+	GpuDeviceOrdinal = getenv("GPU_DEVICE_ORDINAL")
+	HsaOverrideGfxVersion = getenv("HSA_OVERRIDE_GFX_VERSION")
 }
 
 func loadKeepAlive(ka string) {
...
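With the Bool-backed settings, the environment is consulted at call time rather than cached once by LoadConfig, so a value changed mid-process is observed on the next read. A short sketch of that behavior (the import path github.com/ollama/ollama/envconfig is assumed here):

```go
package main

import (
	"fmt"
	"os"

	"github.com/ollama/ollama/envconfig" // assumed import path
)

func main() {
	os.Setenv("OLLAMA_DEBUG", "0")
	fmt.Println(envconfig.Debug()) // false

	// The getter re-reads the environment on each call, so no
	// LoadConfig re-run is needed to pick up the new value.
	os.Setenv("OLLAMA_DEBUG", "1")
	fmt.Println(envconfig.Debug()) // true
}
```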
@@ -20,8 +20,8 @@ func TestSmoke(t *testing.T) {
 	require.True(t, Debug())
 
 	t.Setenv("OLLAMA_FLASH_ATTENTION", "1")
-	LoadConfig()
-	require.True(t, FlashAttention)
+	require.True(t, FlashAttention())
 
 	t.Setenv("OLLAMA_KEEP_ALIVE", "")
 	LoadConfig()
 	require.Equal(t, 5*time.Minute, KeepAlive)
@@ -162,3 +162,27 @@ func TestOrigins(t *testing.T) {
 		})
 	}
 }
+
+func TestBool(t *testing.T) {
+	cases := map[string]struct {
+		value  string
+		expect bool
+	}{
+		"empty":     {"", false},
+		"true":      {"true", true},
+		"false":     {"false", false},
+		"1":         {"1", true},
+		"0":         {"0", false},
+		"random":    {"random", true},
+		"something": {"something", true},
+	}
+
+	for name, tt := range cases {
+		t.Run(name, func(t *testing.T) {
+			t.Setenv("OLLAMA_BOOL", tt.value)
+			if b := Bool("OLLAMA_BOOL"); b() != tt.expect {
+				t.Errorf("%s: expected %t, got %t", name, tt.expect, b())
+			}
+		})
+	}
+}
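The "random" and "something" cases pin down the fallback behavior: strconv.ParseBool accepts only "1", "t", "T", "TRUE", "true", "True", "0", "f", "F", "FALSE", "false", and "False", and Bool treats any other non-empty value as true. A quick standard-library sketch of why that is:

```go
package main

import (
	"fmt"
	"strconv"
)

func main() {
	for _, s := range []string{"true", "0", "random"} {
		b, err := strconv.ParseBool(s)
		// "random" yields a non-nil err, which Bool maps to true.
		fmt.Println(s, b, err)
	}
}
```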
@@ -302,7 +302,7 @@ func GetGPUInfo() GpuInfoList {
 	}
 
 	// Intel
-	if envconfig.IntelGpu {
+	if envconfig.IntelGPU() {
 		oHandles = initOneAPIHandles()
 		// On windows we bundle the oneapi library one level above the runner dir
 		depPath = ""
...
@@ -221,7 +221,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--memory-f32")
 	}
 
-	flashAttnEnabled := envconfig.FlashAttention
+	flashAttnEnabled := envconfig.FlashAttention()
 	for _, g := range gpus {
 		// only cuda (compute capability 7+) and metal support flash attention
...
@@ -644,7 +644,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 		return err
 	}
 
-	if !envconfig.NoPrune && old != nil {
+	if !envconfig.NoPrune() && old != nil {
 		if err := old.RemoveLayers(); err != nil {
 			return err
 		}
@@ -883,7 +883,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 	// build deleteMap to prune unused layers
 	deleteMap := make(map[string]struct{})
-	if !envconfig.NoPrune {
+	if !envconfig.NoPrune() {
 		manifest, _, err = GetManifest(mp)
 		if err != nil && !errors.Is(err, os.ErrNotExist) {
 			return err
...
@@ -1121,7 +1121,7 @@ func Serve(ln net.Listener) error {
 		return err
 	}
 
-	if !envconfig.NoPrune {
+	if !envconfig.NoPrune() {
 		// clean up unused layers and manifests
 		if err := PruneLayers(); err != nil {
 			return err
...
@@ -695,7 +695,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP
 	// First attempt to fit the model into a single GPU
 	for _, p := range numParallelToTry {
 		req.opts.NumCtx = req.origNumCtx * p
-		if !envconfig.SchedSpread {
+		if !envconfig.SchedSpread() {
 			for _, g := range sgl {
 				if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 					slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
...