Unverified Commit dcfb7a10 authored by Michael Yang, committed by GitHub

next build (#8539)



* add build to .dockerignore

* test: only build one arch

* add build to .gitignore

* fix ccache path

* filter amdgpu targets

* only filter if autodetecting

* Don't clobber gpu list for default runner

This ensures the GPU specific environment variables are set properly

* explicitly set CXX compiler for HIP

* Update build_windows.ps1

This isn't complete, but is close.  Dependencies are missing, and it only builds the "default" preset.

* build: add ollama subdir

* add .git to .dockerignore

* docs: update development.md

* update build_darwin.sh

* remove unused scripts

* llm: add cwd and build/lib/ollama to library paths

* default DYLD_LIBRARY_PATH to LD_LIBRARY_PATH in runner on macOS

* add additional cmake output vars for msvc

* interim edits to make server detection logic work with dll directories like lib/ollama/cuda_v12

* remove unnecessary filepath.Dir, cleanup

* add hardware-specific directory to path

* use absolute server path

* build: linux arm

* cmake install targets

* remove unused files

* ml: visit each library path once

* build: skip cpu variants on arm

* build: install cpu targets

* build: fix workflow

* shorter names

* fix rocblas install

* docs: clean up development.md

* consistent build dir removal in development.md

* silence -Wimplicit-function-declaration build warnings in ggml-cpu

* update readme

* update development readme

* llm: update library lookup logic now that there is one runner (#8587)

* tweak development.md

* update docs

* add windows cuda/rocm tests

---------
Co-authored-by: jmorganca <jmorganca@gmail.com>
Co-authored-by: Daniel Hiltgen <daniel@ollama.com>
parent 2ef3c803
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 3 Dec 2024 21:30:51 -0800
Subject: [PATCH] relative include paths
---
ggml/src/ggml-cpu/ggml-cpu.c | 2 +-
ggml/src/ggml-cpu/ggml-cpu.cpp | 3 +--
ggml/src/ggml-quants.c | 2 +-
3 files changed, 3 insertions(+), 4 deletions(-)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index b307d554..4eb39c52 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -10,7 +10,7 @@
#include "ggml-quants.h"
#include "ggml-cpu-quants.h"
#include "ggml-threading.h"
-#include "amx/amx.h"
+#include "amx.h"
#include "ggml.h"
#if defined(_MSC_VER) || defined(__MINGW32__)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
index f11399cc..2a8b40ce 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -4,8 +4,7 @@
#include "ggml-cpu-aarch64.h"
#include "ggml-cpu-traits.h"
#include "ggml-impl.h"
-#include "amx/amx.h"
-
+#include "amx.h"
#include <cctype>
#include <string>
#include <vector>
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 7918388a..e2ed84e4 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -3,7 +3,7 @@
#include "ggml-quants.h"
#include "ggml-impl.h"
-#include "ggml-cpu/ggml-cpu-impl.h"
+#include "ggml-cpu-impl.h"
#include "ggml-cpu.h"
#include <math.h>
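These include fixes exist because Ollama's vendoring flattens the llama.cpp tree: the vendoring rules later in this commit copy files such as ggml/src/ggml-cpu/amx/amx.h and ggml/src/ggml-cpu/ggml-cpu-impl.h into llama/ by basename, so includes that keep their original subdirectory prefixes would no longer resolve.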
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Tue, 14 Jan 2025 12:01:24 -0800
Subject: [PATCH] sort devices by score
---
ggml/src/ggml-backend-reg.cpp | 21 +++++++++++++--------
1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 899d16f2..ac5cda07 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -150,7 +150,7 @@ struct ggml_backend_reg_entry {
struct ggml_backend_registry {
std::vector<ggml_backend_reg_entry> backends;
- std::vector<ggml_backend_dev_t> devices;
+ std::vector<std::pair<ggml_backend_dev_t, int>> devices;
ggml_backend_registry() {
#ifdef GGML_USE_CUDA
@@ -195,7 +195,7 @@ struct ggml_backend_registry {
}
}
- void register_backend(ggml_backend_reg_t reg, dl_handle_ptr handle = nullptr) {
+ void register_backend(ggml_backend_reg_t reg, int score = -1, dl_handle_ptr handle = nullptr) {
if (!reg) {
return;
}
@@ -206,15 +206,15 @@ struct ggml_backend_registry {
#endif
backends.push_back({ reg, std::move(handle) });
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
- register_device(ggml_backend_reg_dev_get(reg, i));
+ register_device(ggml_backend_reg_dev_get(reg, i), score);
}
}
- void register_device(ggml_backend_dev_t device) {
+ void register_device(ggml_backend_dev_t device, int score = -1) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
#endif
- devices.push_back(device);
+ devices.push_back({device, score});
}
ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
@@ -257,7 +257,7 @@ struct ggml_backend_registry {
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
- register_backend(reg, std::move(handle));
+ register_backend(reg, score_fn ? score_fn() : -1, std::move(handle));
return reg;
}
@@ -280,7 +280,7 @@ struct ggml_backend_registry {
// remove devices
devices.erase(
std::remove_if(devices.begin(), devices.end(),
- [reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }),
+ [reg](std::pair<ggml_backend_dev_t, int> dev) { return ggml_backend_dev_backend_reg(dev.first) == reg; }),
devices.end());
// remove backend
@@ -338,7 +338,12 @@ size_t ggml_backend_dev_count() {
ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
GGML_ASSERT(index < ggml_backend_dev_count());
- return get_reg().devices[index];
+ auto devices = get_reg().devices;
+ if (!std::is_heap(devices.begin(), devices.end())) {
+ std::make_heap(devices.begin(), devices.end(), [](const auto & a, const auto & b) { return a.second < b.second; });
+ }
+
+ return devices[index].first;
}
ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
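To make the intent of this patch concrete: each device now carries the score its backend's score_fn reported at load time, and ggml_backend_dev_get(0) should return a device from the best-scoring backend. A rough Go rendering of that selection rule (the real code is the C++ above; these types are illustrative stand-ins only):

```go
package main

import (
	"fmt"
	"sort"
)

// device pairs a backend device with the score its backend reported;
// -1 marks statically registered backends that report no score.
type device struct {
	name  string
	score int
}

func main() {
	devices := []device{
		{name: "cpu", score: -1},
		{name: "cuda_v11", score: 90},
		{name: "cuda_v12", score: 100},
	}

	// Order by descending score so index 0 is the preferred device,
	// which is what the make_heap comparator in the patch arranges
	// for the front of the device list.
	sort.SliceStable(devices, func(i, j int) bool {
		return devices[i].score > devices[j].score
	})

	fmt.Println(devices[0].name) // cuda_v12
}
```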
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Tue, 14 Jan 2025 15:59:04 -0800
Subject: [PATCH] add phony target ggml-cpu for all cpu variants
---
ggml/src/CMakeLists.txt | 2 ++
1 file changed, 2 insertions(+)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 84101c32..72b488dd 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -278,6 +278,7 @@ function(ggml_add_cpu_backend_variant tag_name)
endforeach()
ggml_add_cpu_backend_variant_impl(${tag_name})
+ add_dependencies(ggml-cpu ggml-cpu-${tag_name})
endfunction()
ggml_add_backend(CPU)
@@ -286,6 +287,7 @@ if (GGML_CPU_ALL_VARIANTS)
if (NOT GGML_BACKEND_DL)
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
endif()
+ add_custom_target(ggml-cpu)
ggml_add_cpu_backend_variant(sandybridge AVX)
ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 FMA)
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512)
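With the aggregate target in place, a single cmake --build build --target ggml-cpu builds every enabled CPU variant (sandybridge, haswell, skylakex, and so on) instead of each ggml-cpu-<tag> target having to be named individually.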
#pragma once
#include <stdint.h>
#include <stdbool.h>
#ifdef __cplusplus
extern "C" {
#endif
bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t, int64_t, int64_t,
const void *, int64_t, const void *, int64_t, void *, int64_t,
int, int, int);
#ifdef __cplusplus
}
#endif
llm/server.go
@@ -29,7 +29,6 @@ import (
	"github.com/ollama/ollama/envconfig"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/llama"
-	"github.com/ollama/ollama/runners"
)

type LlamaServer interface {
@@ -91,8 +90,6 @@ func LoadModel(model string, maxArraySize int) (*GGML, error) {
// The gpu list must be a single family.
func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
	var err error
-	var cpuRunner string
-	var estimate MemoryEstimate
	var systemTotalMemory uint64
	var systemFreeMemory uint64
	var systemSwapFreeMemory uint64
@@ -107,12 +104,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
	if opts.NumGPU == 0 {
		gpus = discover.GetCPUInfo()
	}
-	if len(gpus) == 1 && gpus[0].Library == "cpu" {
-		cpuRunner = runners.ServerForCpu()
-		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
-	} else {
-		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
+	estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
+	if len(gpus) > 1 || gpus[0].Library != "cpu" {
		switch {
		case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:
			// disable partial offloading when model is greater than total system memory as this
@@ -120,7 +114,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
			opts.NumGPU = 0
		case gpus[0].Library != "metal" && estimate.Layers == 0:
			// Don't bother loading into the GPU if no layers can fit
-			cpuRunner = runners.ServerForCpu()
			gpus = discover.GetCPUInfo()
		case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
			opts.NumGPU = estimate.Layers
@@ -140,36 +133,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
	estimate.log()

-	// Loop through potential servers
-	finalErr := errors.New("no suitable llama servers found")
-
-	availableServers := runners.GetAvailableServers()
-
-	var servers []string
-	if cpuRunner != "" {
-		servers = []string{cpuRunner}
-	} else {
-		servers = runners.ServersForGpu(gpus[0].RunnerName()) // All GPUs in the list are matching Library and Variant
-	}
-	demandLib := envconfig.LLMLibrary()
-	if demandLib != "" {
-		serverPath := availableServers[demandLib]
-		if serverPath == "" {
-			slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib))
-		} else {
-			slog.Info("user override", "OLLAMA_LLM_LIBRARY", demandLib, "path", serverPath)
-			servers = []string{demandLib}
-			if strings.HasPrefix(demandLib, "cpu") || (!(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") && demandLib == runners.BuiltinName()) {
-				// Omit the GPU flag to silence the warning
-				opts.NumGPU = -1
-			}
-		}
-	}
-
-	if len(servers) == 0 {
-		return nil, fmt.Errorf("no servers found for %v", gpus)
-	}
	params := []string{
		"--model", model,
		"--ctx-size", strconv.Itoa(opts.NumCtx),
@@ -270,21 +233,49 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
		params = append(params, "--multiuser-cache")
	}

-	for i := range servers {
-		builtin := servers[i] == runners.BuiltinName()
-		server := availableServers[servers[i]]
-		if server == "" {
-			// Shouldn't happen
-			finalErr = fmt.Errorf("[%d] server %s not listed in available servers %v", i, servers[i], availableServers)
-			slog.Error("server list inconsistent", "error", finalErr)
-			continue
-		}
-
-		if strings.HasPrefix(servers[i], "cpu") || (builtin && !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64")) {
-			gpus = discover.GetCPUInfo()
-		}
-
-		// Find an availableServers port, retry on each iteration in case the failure was a port conflict race
+	// get available libraries
+	if err != nil {
+		return nil, fmt.Errorf("could not get libollama dir: %w", err)
+	}
+
+	entries, err := os.ReadDir(discover.LibOllamaPath)
+	if err != nil {
+		return nil, fmt.Errorf("could not read libollama dir: %w", err)
+	}
+
+	libs := make(map[string]string)
+	for _, entry := range entries {
+		if entry.IsDir() {
+			libs[entry.Name()] = filepath.Join(discover.LibOllamaPath, entry.Name())
+		}
+	}
+
+	lib := gpus[0].RunnerName()
+	requested := envconfig.LLMLibrary()
+	if libs[requested] != "" {
+		slog.Info("using requested gpu library", "requested", requested)
+		lib = requested
+	}
+
+	var compatible []string
+	for k := range libs {
+		// exact match first
+		if k == lib {
+			compatible = append([]string{k}, compatible...)
+			continue
+		}
+
+		// then match the family (e.g. 'cuda')
+		if strings.Split(k, "_")[0] == strings.Split(lib, "_")[0] {
+			compatible = append(compatible, k)
+		}
+	}
+	slog.Debug("compatible gpu libraries", "compatible", compatible)
+
+	// iterate through compatible GPU libraries such as 'cuda_v12', 'cuda_v11', 'rocm', etc.
+	// adding each library's respective path to the LD_LIBRARY_PATH, until finally running
+	// without any LD_LIBRARY_PATH flags
+	for {
		port := 0
		if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
			var l *net.TCPListener
@@ -305,25 +296,45 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
		if runtime.GOOS == "windows" {
			pathEnv = "PATH"
		}

-		// Start with the server directory for the LD_LIBRARY_PATH/PATH
-		libraryPaths := []string{filepath.Dir(server)}
-
+		var libraryPaths []string
		if libraryPath, ok := os.LookupEnv(pathEnv); ok {
-			// favor our bundled library dependencies over system libraries
			libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
		}

+		if len(compatible) > 0 {
+			c := compatible[0]
+			if libpath, ok := libs[c]; ok {
+				slog.Debug("adding gpu library", "path", libpath)
+				libraryPaths = append(libraryPaths, libpath)
+			}
+		}
+
		// Note: we always put the dependency path first
		// since this was the exact version we compiled/linked against
		if gpus[0].DependencyPath != nil {
+			slog.Debug("adding gpu dependency paths", "paths", gpus[0].DependencyPath)
			// assume gpus from the same library have the same dependency path
			libraryPaths = append(gpus[0].DependencyPath, libraryPaths...)
		}

+		// finally, add the root library path
+		libraryPaths = append(libraryPaths, discover.LibOllamaPath)
+
+		exe, err := os.Executable()
+		if err != nil {
+			return nil, fmt.Errorf("unable to lookup executable path: %w", err)
+		}
+
+		exe, err = filepath.EvalSymlinks(exe)
+		if err != nil {
+			return nil, fmt.Errorf("unable to evaluate symlinks for executable path: %w", err)
+		}
+
		// TODO - once fully switched to the Go runner, load the model here for tokenize/detokenize cgo access
		s := &llmServer{
			port:      port,
-			cmd:       exec.Command(server, finalParams...),
+			cmd:       exec.Command(exe, finalParams...),
			status:    NewStatusWriter(os.Stderr),
			options:   opts,
			modelPath: model,
@@ -394,17 +405,17 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
		}

		if err = s.cmd.Start(); err != nil {
-			// Detect permission denied and augment the message about noexec
-			if errors.Is(err, os.ErrPermission) {
-				finalErr = fmt.Errorf("unable to start server %w. %s may have noexec set. Set OLLAMA_TMPDIR for server to a writable executable directory", err, server)
-				continue
-			}
-			msg := ""
+			var msg string
			if s.status != nil && s.status.LastErrMsg != "" {
				msg = s.status.LastErrMsg
			}
-			err = fmt.Errorf("error starting the external llama server: %v %s", err, msg)
-			finalErr = err
+			err := fmt.Errorf("error starting runner: %v %s", err, msg)
+			if len(compatible) == 0 {
+				return nil, err
+			}
+
+			slog.Warn("unable to start runner with compatible gpu", "error", err, "compatible", compatible)
+			compatible = compatible[1:]
			continue
		}
@@ -413,7 +424,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
			err := s.cmd.Wait()
			// Favor a more detailed message over the process exit status
			if err != nil && s.status != nil && s.status.LastErrMsg != "" {
-				slog.Debug("llama runner terminated", "error", err)
+				slog.Error("llama runner terminated", "error", err)
				if strings.Contains(s.status.LastErrMsg, "unknown model") {
					s.status.LastErrMsg = "this model is not supported by your version of Ollama. You may need to upgrade"
				}
@@ -425,9 +436,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
		return s, nil
	}
-
-	slog.Error("unable to load any llama server", "error", finalErr)
-	return nil, finalErr
}
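The discovery logic above is the core of the new single-runner design: instead of choosing among prebuilt runner executables, the server re-executes the ollama binary itself and only varies which GPU library directory is placed on the loader path. A minimal standalone Go sketch of the directory scan and compatibility filter (the "lib/ollama" path and the "cuda_v12" runner name are hypothetical stand-ins for the discover package values):

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
	"strings"
)

// compatibleLibraries mirrors the scan in NewLlamaServer: every subdirectory
// of the lib/ollama dir is a candidate GPU library, the exact runner name is
// preferred, and anything sharing the family prefix (e.g. "cuda") follows.
func compatibleLibraries(libOllamaPath, lib string) ([]string, error) {
	entries, err := os.ReadDir(libOllamaPath)
	if err != nil {
		return nil, fmt.Errorf("could not read libollama dir: %w", err)
	}

	libs := make(map[string]string)
	for _, entry := range entries {
		if entry.IsDir() {
			libs[entry.Name()] = filepath.Join(libOllamaPath, entry.Name())
		}
	}

	var compatible []string
	for k := range libs {
		if k == lib {
			// exact match goes to the front of the list
			compatible = append([]string{k}, compatible...)
			continue
		}
		// then match the family, so cuda_v11 is a fallback for cuda_v12
		if strings.Split(k, "_")[0] == strings.Split(lib, "_")[0] {
			compatible = append(compatible, k)
		}
	}
	return compatible, nil
}

func main() {
	libs, err := compatibleLibraries("lib/ollama", "cuda_v12")
	if err != nil {
		fmt.Println("scan failed:", err)
		return
	}
	// the server tries libs[0] first and pops entries off on startup failure
	fmt.Println(libs)
}
```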
macapp/forge.config.ts
@@ -18,8 +18,8 @@ const config: ForgeConfig = {
    asar: true,
    icon: './assets/icon.icns',
    extraResource: [
-      '../dist/ollama',
-      '../dist/darwin-amd64/lib',
+      path.join(__dirname, '../dist/darwin/ollama'),
+      ...fs.readdirSync(path.join(__dirname, '../dist/darwin/amd64')).map(f => path.join(__dirname, '../dist/darwin/amd64', f)),
      path.join(__dirname, './assets/iconTemplate.png'),
      path.join(__dirname, './assets/iconTemplate@2x.png'),
      path.join(__dirname, './assets/iconUpdateTemplate.png'),
@@ -43,7 +43,7 @@ const config: ForgeConfig = {
      }
      : {}),
    osxUniversal: {
-      x64ArchFiles: '**/ollama*',
+      x64ArchFiles: '*',
    },
  },
  rebuildConfig: {},
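For context on the forge.config.ts change: Electron Forge copies each extraResource entry into the packaged app's Resources directory, and x64ArchFiles tells @electron/universal which bundled files may remain single-architecture when the x64 and arm64 apps are merged into a universal binary.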
# Build the discrete cpu runner(s) for the platform which do not rely on 3rd party GPU libraries
include make/common-defs.make
CPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERSION)\" $(TARGET_LDFLAGS)"
ifeq ($(ARCH),amd64)
ifeq ($(origin CUSTOM_CPU_FLAGS),undefined)
RUNNERS = cpu_avx cpu_avx2
endif
endif
DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(RUNNERS)))
BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(RUNNERS)))
cpu: $(BUILD_RUNNERS)
dist: $(DIST_RUNNERS)
$(RUNNERS_BUILD_DIR)/cpu_avx/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS="avx"
$(RUNNERS_BUILD_DIR)/cpu_avx/ollama_llama_server$(EXE_EXT): ./llama/*.go ./llama/runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
@-mkdir -p $(dir $@)
GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./cmd/runner
$(RUNNERS_BUILD_DIR)/cpu_avx2/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS="avx avx2"
$(RUNNERS_BUILD_DIR)/cpu_avx2/ollama_llama_server$(EXE_EXT): ./llama/*.go ./llama/runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
@-mkdir -p $(dir $@)
GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./cmd/runner
$(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
@-mkdir -p $(dir $@)
cp $< $@
clean:
rm -f $(BUILD_RUNNERS) $(DIST_RUNNERS)
.PHONY: clean cpu dist
# Handy debugging for make variables
print-%:
@echo '$*=$($*)'
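The print-% pattern rule is a common GNU make debugging trick: invoked against this makefile, make print-RUNNERS echoes RUNNERS=cpu_avx cpu_avx2 on an amd64 host with no CUSTOM_CPU_FLAGS set.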
# Build rules for CUDA v11 runner
include make/common-defs.make
include make/cuda-v11-defs.make
GPU_RUNNER_VARIANT := _v11
GPU_COMPILER=$(CUDA_11_COMPILER)
CUDA_ARCHITECTURES?=50;52;53;60;61;62;70;72;75;80;86
GPU_LIB_DIR = $(CUDA_11_LIB_DIR)
CGO_EXTRA_LDFLAGS = $(CUDA_11_CGO_EXTRA_LDFLAGS)
include make/cuda.make
include make/gpu.make
\ No newline at end of file
# Build rules for CUDA v12 runner
include make/common-defs.make
include make/cuda-v12-defs.make
GPU_RUNNER_VARIANT := _v12
GPU_COMPILER=$(CUDA_12_COMPILER)
CUDA_ARCHITECTURES?=60;61;62;70;72;75;80;86;87;89;90;90a
GPU_LIB_DIR = $(CUDA_12_LIB_DIR)
CGO_EXTRA_LDFLAGS = $(CUDA_12_CGO_EXTRA_LDFLAGS)
include make/cuda.make
include make/gpu.make
\ No newline at end of file
# Makefile for building top-level ollama binary
include make/common-defs.make
exe: $(OLLAMA_EXE)
dist_exe dist_ollama: $(DIST_OLLAMA_EXE)
GO_DEPS=$(foreach dir,$(shell go list -deps -f '{{.Dir}}' . ),$(wildcard $(dir)/*.go))
CPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERSION)\" $(EXTRA_GOLDFLAGS) $(TARGET_LDFLAGS)"
$(OLLAMA_EXE) $(DIST_OLLAMA_EXE): TARGET_CPU_FLAGS=$(CUSTOM_CPU_FLAGS)
$(OLLAMA_EXE) $(DIST_OLLAMA_EXE): $(COMMON_SRCS) $(COMMON_HDRS) $(GO_DEPS)
GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath $(if $(CUSTOM_CPU_FLAGS),-tags $(subst $(space),$(comma),$(CUSTOM_CPU_FLAGS))) -o $@ .
.PHONY: ollama dist_ollama exe dist_exe
# Handy debugging for make variables
print-%:
@echo '$*=$($*)'
# Build rules for ROCm runner
#
# Note: at present we only support a single ROCm version (whichever is default on the build system)
# unlike CUDA where we'll build both a v11 and v12 variant.
include make/common-defs.make
include make/rocm-defs.make
HIP_ARCHS_COMMON := gfx900 gfx940 gfx941 gfx942 gfx1010 gfx1012 gfx1030 gfx1100 gfx1101 gfx1102
HIP_ARCHS_LINUX := gfx906:xnack- gfx908:xnack- gfx90a:xnack+ gfx90a:xnack-
ifeq ($(OS),windows)
GPU_LIB_DIR := $(shell cygpath -m -s "$(HIP_PATH)/bin")
CGO_EXTRA_LDFLAGS := -L$(shell cygpath -m -s "$(HIP_PATH)/lib")
HIP_ARCHS?=$(HIP_ARCHS_COMMON)
GPU_COMPILER_CFLAGS = $(CFLAGS) -D_WIN32_WINNT=0x602
GPU_COMPILER_CXXFLAGS = $(CXXFLAGS) -D_WIN32_WINNT=0x602
else ifeq ($(OS),linux)
GPU_LIB_DIR := $(strip $(shell ls -d $(HIP_PATH)/lib64 2>/dev/null || ls -d $(HIP_PATH)/lib 2>/dev/null))
CGO_EXTRA_LDFLAGS := -L$(GPU_LIB_DIR)
HIP_ARCHS?=$(HIP_ARCHS_COMMON) $(HIP_ARCHS_LINUX)
GPU_COMPILER_CFLAGS = $(CFLAGS) -fPIC -D_GNU_SOURCE
GPU_COMPILER_CXXFLAGS = $(CXXFLAGS) -fPIC -D_GNU_SOURCE
endif
GPU_COMPILER=$(HIP_COMPILER)
# TODO future multi-variant support for ROCm
# ROCM_VERSION = $(subst $(space),.,$(wordlist 1,2,$(subst .,$(space),$(word 3,$(subst -,$(space),$(filter HIP version: %,$(shell $(GPU_COMPILER) --version)))))))
# ifneq (,$(ROCM_VERSION))
# GPU_RUNNER_VARIANT = _v$(ROCM_VERSION)
# endif
GPU_RUNNER_GO_TAGS := rocm
GPU_RUNNER_NAME := rocm$(GPU_RUNNER_VARIANT)
GPU_RUNNER_DRIVER_LIB_LINK := -lamdhip64
GPU_RUNNER_LIBS_SHORT := hipblas rocblas
# Note: ROCm requires an extra step of discovering and copying the transitive dependencies on linux
ifeq ($(OS),windows)
ROCM_DIST_DEPS_DIR = ./dist/$(OS)-$(ARCH)/lib/ollama
GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT),$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
else ifeq ($(OS),linux)
ROCM_DIST_DEPS_DIR = ./dist/$(OS)-$(ARCH)-rocm/lib/ollama
GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
ROCM_TRANSITIVE_LIBS_INITIAL = $(sort $(shell ldd $(GPU_LIBS) | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf))
GPU_TRANSITIVE_LIBS = $(sort $(shell readlink -f $(ROCM_TRANSITIVE_LIBS_INITIAL)) $(ROCM_TRANSITIVE_LIBS_INITIAL))
FILTERED_GPU_TRANSITIVE_LIBS=$(sort $(filter-out $(addprefix %,$(notdir $(GPU_LIBS))), $(GPU_TRANSITIVE_LIBS)))
GPU_DIST_TRANSITIVE_LIB_DEPS = $(sort $(addprefix $(ROCM_DIST_DEPS_DIR)/,$(notdir $(FILTERED_GPU_TRANSITIVE_LIBS))))
endif
GPU_DIST_LIB_DEPS= $(sort $(addprefix $(ROCM_DIST_DEPS_DIR)/,$(notdir $(GPU_LIBS))))
ROCBLAS_DIST_DEP_MANIFEST = $(ROCM_DIST_DEPS_DIR)/rocblas/library/TensileManifest.txt
ifeq ($(OS),linux)
GPU_COMPILER_FPIC := -fPIC -Wno-unused-function -std=gnu++17
else ifeq ($(OS),windows)
GPU_COMPILER_FPIC := -Xclang --dependent-lib=msvcrt
endif
GPU_RUNNER_ARCH_FLAGS := $(foreach arch,$(subst ;,$(space),$(HIP_ARCHS)),--offload-arch=$(arch))
# HIPCC uses clang which requires avx512 -> -mavx512f -mavx512dq -mavx512bw
GPU_VECTOR_FLAGS=$(if $(filter avx512,$(GPU_RUNNER_CPU_FLAGS)),avx512f avx512dq avx512bw) $(filter-out avx512,$(GPU_RUNNER_CPU_FLAGS))
GPU_COMPILER_CUFLAGS = \
$(GPU_COMPILER_FPIC) \
$(addprefix -m,$(GPU_VECTOR_FLAGS)) \
-mf16c \
-mfma \
-c \
-O3 \
-DGGML_USE_CUDA \
-DGGML_BUILD=1 \
-DGGML_BACKEND_BUILD=1 \
-DGGML_SHARED=1 \
-DGGML_BACKEND_SHARED=1 \
-DGGML_CUDA_DMMV_X=32 \
-DGGML_CUDA_MMV_Y=1 \
-DGGML_SCHED_MAX_COPIES=4 \
-DGGML_USE_HIP \
-DGGML_USE_LLAMAFILE \
-DHIP_FAST_MATH \
-D__HIP_PLATFORM_AMD__=1 \
-D__HIP_ROCclr__=1 \
-DNDEBUG \
-DK_QUANTS_PER_ITERATION=2 \
-D_CRT_SECURE_NO_WARNINGS \
-D_GNU_SOURCE \
-D_XOPEN_SOURCE=600 \
-DUSE_PROF_API=1 \
-std=gnu++17 \
-x hip \
-mllvm=-amdgpu-early-inline-all=true \
-mllvm=-amdgpu-function-calls=false \
-Wno-expansion-to-defined \
-Wno-invalid-noreturn \
-Wno-ignored-attributes \
-Wno-pass-failed \
-Wno-deprecated-declarations \
-Wno-unused-result \
-I./llama/
# Workaround buggy P2P copy on some windows multi-GPU setups
# This workaround breaks linux systems with small system RAM, so only enable on windows
ifeq ($(OS),windows)
GPU_COMPILER_CUFLAGS += -DGGML_CUDA_NO_PEER_COPY=1
endif
include make/gpu.make
# Adjust the rules from gpu.make to handle the ROCm dependencies properly
$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(ROCBLAS_DIST_DEP_MANIFEST) $(GPU_DIST_TRANSITIVE_LIB_DEPS)
$(ROCBLAS_DIST_DEP_MANIFEST):
@-mkdir -p $(dir $@)
@echo "Copying rocblas library..."
(cd $(GPU_LIB_DIR)/rocblas/library/ && tar cf - . ) | (cd $(dir $@) && tar xf - )
@echo "rocblas library copy complete"
$(GPU_DIST_TRANSITIVE_LIB_DEPS):
@-mkdir -p $(dir $@)
$(CP) $(dir $(filter %$(notdir $@),$(GPU_TRANSITIVE_LIBS)))/$(notdir $@) $(dir $@)
# Helpers for managing our vendored llama.cpp repo and patch set
REPO_ROOT:=./
DEST_DIR:=./llama/
include $(DEST_DIR)vendoring
LLAMACPP_REPO := ./llama/vendor/
# Relative to the vendor dir
VENDOR_RELATIVE_PATCH_DIR := ../patches/
help-sync:
@echo "The following make targets will help you update llama.cpp to a new base commit, or work on new features/fixes"
@echo ""
@echo " make apply-patches # Establish the tracking repo if not already present, reset to the base commit, and apply our patch set"
@echo " make sync # Vendor llama.cpp and ggml from the tracking repo working tree"
@echo " make sync-clean # Remove all vendored files"
@echo " make create-patches # Generate the patch set based on the current commits in the tracking repo since the base commit"
@echo ""
@echo "For more details on the workflow, see the Vendoring section in 'docs/development.md'"
apply-patches: $(LLAMACPP_REPO)
@if ! git -C $(LLAMACPP_REPO) --no-pager diff --exit-code ; then \
echo "ERROR: Your llama.cpp repo is dirty. The apply-patches target requires a clean working tree"; \
echo "To clobber: git -C $(LLAMACPP_REPO) reset --hard HEAD" ; \
exit 1; \
fi
@echo "Checking out $(LLAMACPP_BASE_COMMIT)"
@git -C $(LLAMACPP_REPO) checkout -q $(LLAMACPP_BASE_COMMIT) || \
git -C $(LLAMACPP_REPO) fetch --all && git -C $(LLAMACPP_REPO) checkout -q $(LLAMACPP_BASE_COMMIT)
@echo "Applying ollama patches..."
@cd $(LLAMACPP_REPO) && git -c 'user.name=nobody' -c 'user.email=<>' am -3 $(VENDOR_RELATIVE_PATCH_DIR)*.patch || \
echo "Please resolve the conflicts in $(LLAMACPP_REPO), and run 'git am --continue' to continue applying subsequent patches"
@echo ""
@echo "The tracking repo $(LLAMACPP_REPO) is now in a detached state with all patches applied."
@echo "Don't forget to commit any changes you make and run 'make create-patches' "
$(LLAMACPP_REPO):
@echo "Cloning llama.cpp to $(LLAMACPP_REPO)"
git clone https://github.com/ggerganov/llama.cpp.git $@
create-patches: $(LLAMACPP_REPO)
@if ! git -C $(LLAMACPP_REPO) --no-pager diff --exit-code ; then \
echo "ERROR: Your llama.cpp repo is dirty. You must commit any pending changes for format-patch to generate patches"; \
exit 1; \
fi
@cd $(LLAMACPP_REPO) && git format-patch --no-signature --no-numbered --zero-commit -o $(VENDOR_RELATIVE_PATCH_DIR) $(LLAMACPP_BASE_COMMIT)
# Vendoring template logic
EXCLUDED_FILES=sgemm.cpp sgemm.h sampling_ext.cpp sampling_ext.h stb_image.h json.hpp llama_darwin.c base64.hpp
OLLAMA_NATIVE_FILES=mllama.cpp mllama.h llama_darwin.c sampling_ext.cpp sampling_ext.h
define vendor_file
$(strip $(addprefix $(2),$(notdir $1))) : $(addprefix $(LLAMACPP_REPO),$(1))
ifneq ($$(filter-out $(EXCLUDED_FILES),$(notdir $1)),)
@echo "vendoring $1"; \
mkdir -p $$(dir $$@) && \
echo "/**" > $$@ && \
echo " * llama.cpp - commit $$(LLAMACPP_BASE_COMMIT) - do not edit this file" >> $$@ && \
echo " *" >> $$@ && \
sed 's/^/ * /' <$(LLAMACPP_REPO)/LICENSE | sed 's/ *$$$$//' >> $$@ && \
echo " */" >> $$@ && \
echo "" >> $$@ && \
cat $$< >> $$@
else
@echo "vendoring $1"; \
mkdir -p $$(dir $$@) && \
cat $$< > $$@
endif
VENDORED_FILES += $(strip $(addprefix $(2),$(notdir $1)))
endef
# llama.cpp files -> llama/
LLAMACPP_FILES=\
src/unicode.cpp \
src/unicode.h \
src/unicode-data.cpp \
src/unicode-data.h \
src/llama.cpp \
src/llama-adapter.cpp \
src/llama-adapter.h \
src/llama-arch.cpp \
src/llama-arch.h \
src/llama-batch.cpp \
src/llama-batch.h \
src/llama-chat.cpp \
src/llama-chat.h \
src/llama-context.cpp \
src/llama-context.h \
src/llama-cparams.cpp \
src/llama-cparams.h \
src/llama-grammar.cpp \
src/llama-grammar.h \
src/llama-hparams.cpp \
src/llama-hparams.h \
src/llama-impl.cpp \
src/llama-impl.h \
src/llama-kv-cache.cpp \
src/llama-kv-cache.h \
src/llama-mmap.cpp \
src/llama-mmap.h \
src/llama-model-loader.cpp \
src/llama-model-loader.h \
src/llama-model.cpp \
src/llama-model.h \
src/llama-quant.cpp \
src/llama-quant.h \
src/llama-sampling.cpp \
src/llama-sampling.h \
src/llama-vocab.cpp \
src/llama-vocab.h \
include/llama.h \
include/llama-cpp.h \
ggml/include/ggml-cpu.h \
ggml/src/ggml-cpu/llamafile/sgemm.cpp \
ggml/src/ggml-cpu/llamafile/sgemm.h
$(foreach name,$(LLAMACPP_FILES),$(eval $(call vendor_file,$(name),$(DEST_DIR))))
# llama.cpp files -> llama/llamafile
LLAMAFILE_FILES= \
ggml/src/ggml-cpu/llamafile/sgemm.h
$(foreach name,$(LLAMAFILE_FILES),$(eval $(call vendor_file,$(name),$(DEST_DIR)llamafile/)))
# ggml files -> llama/
GGML_FILES= \
ggml/src/ggml.c \
ggml/include/ggml.h \
ggml/src/ggml-quants.c \
ggml/src/ggml-quants.h \
ggml/src/ggml-metal/ggml-metal.metal \
ggml/include/ggml-metal.h \
ggml/src/ggml-impl.h \
ggml/src/ggml-threading.h \
ggml/include/ggml-cuda.h \
ggml/src/ggml-backend-reg.cpp \
ggml/src/ggml-metal/ggml-metal-impl.h \
ggml/src/ggml-common.h \
ggml/include/ggml-backend.h \
ggml/src/ggml-backend.cpp \
ggml/src/ggml-backend-impl.h \
ggml/include/ggml-alloc.h \
ggml/src/ggml-alloc.c \
ggml/include/ggml-blas.h \
ggml/include/ggml-cpp.h \
ggml/src/ggml-threading.cpp \
ggml/src/ggml-blas/ggml-blas.cpp \
ggml/src/ggml-cpu/ggml-cpu.c \
ggml/src/ggml-cpu/ggml-cpu.cpp \
ggml/src/ggml-cpu/ggml-cpu-aarch64.h \
ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp \
ggml/src/ggml-cpu/ggml-cpu-quants.h \
ggml/src/ggml-cpu/ggml-cpu-quants.c \
ggml/src/ggml-cpu/ggml-cpu-impl.h \
ggml/src/ggml-cpu/ggml-cpu-traits.h \
ggml/src/ggml-cpu/ggml-cpu-traits.cpp \
ggml/src/ggml-cpu/amx/amx.h \
ggml/src/ggml-cpu/amx/amx.cpp \
ggml/src/ggml-cpu/amx/mmq.cpp \
ggml/src/ggml-cpu/amx/mmq.h
$(foreach name,$(GGML_FILES),$(eval $(call vendor_file,$(name),$(DEST_DIR))))
$(DEST_DIR)ggml-metal-embed.metal: $(DEST_DIR)ggml-common.h $(DEST_DIR)ggml-metal-impl.h
@sed -e '/__embed_ggml-common.h__/r $(DEST_DIR)/ggml-common.h' \
-e '/__embed_ggml-common.h__/d' \
< $(DEST_DIR)/ggml-metal.metal \
> $(DEST_DIR)/ggml-metal-embed.metal.tmp
@sed -e '/#include "ggml-metal-impl.h"/r $(DEST_DIR)/ggml-metal-impl.h' \
-e '/#include "ggml-metal-impl.h"/d' \
< $(DEST_DIR)/ggml-metal-embed.metal.tmp \
> $(DEST_DIR)/ggml-metal-embed.metal
@rm $(DEST_DIR)/ggml-metal-embed.metal.tmp
VENDORED_FILES += $(DEST_DIR)ggml-metal-embed.metal
# TODO generalize renaming pattern if we have more of these
$(DEST_DIR)ggml-metal_darwin_arm64.m : $(LLAMACPP_REPO)ggml/src/ggml-metal/ggml-metal.m
@echo "vendoring $(subst $(LLAMACPP_REPO),,$<)"; \
mkdir -p $(dir $@) && \
echo "/**" > $@ && \
echo " * llama.cpp - commit $(LLAMACPP_BASE_COMMIT) - do not edit this file" >> $@ && \
echo " *" >> $@ && \
sed 's/^/ * /' <$(LLAMACPP_REPO)/LICENSE | sed 's/ *$$//' >> $@ && \
echo " */" >> $@ && \
echo "" >> $@ && \
cat $< >> $@
VENDORED_FILES += $(DEST_DIR)ggml-metal_darwin_arm64.m
# ggml-cuda -> llama/ggml-cuda/
GGML_CUDA_FILES= ggml/src/ggml-cuda/*.cu ggml/src/ggml-cuda/*.cuh
GGML_CUDA_FILES_EXPANDED = $(addprefix ggml/src/ggml-cuda/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_CUDA_FILES)))))
$(foreach name,$(GGML_CUDA_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DEST_DIR)ggml-cuda/)))
GGML_TEMPLATE_FILES= ggml/src/ggml-cuda/template-instances/*.cu
GGML_TEMPLATE_FILES_EXPANDED = $(addprefix ggml/src/ggml-cuda/template-instances/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_TEMPLATE_FILES)))))
$(foreach name,$(GGML_TEMPLATE_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DEST_DIR)ggml-cuda/template-instances/)))
GGML_VENDOR_FILES= ggml/src/ggml-cuda/vendors/*.h
GGML_VENDOR_FILES_EXPANDED=$(addprefix ggml/src/ggml-cuda/vendors/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_VENDOR_FILES)))))
$(foreach name,$(GGML_VENDOR_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DEST_DIR)ggml-cuda/vendors/)))
# llava -> llama/
LAVA_FILES= \
examples/llava/clip.cpp \
examples/llava/clip.h \
examples/llava/llava.cpp \
examples/llava/llava.h \
common/log.h \
common/log.cpp \
common/stb_image.h
# These files are mostly used by the llava code
# and shouldn't be necessary once we use clip.cpp directly
LAVA_FILES+= \
common/common.cpp \
common/common.h \
common/sampling.cpp \
common/sampling.h \
common/json.hpp \
common/json-schema-to-grammar.cpp \
common/json-schema-to-grammar.h \
common/base64.hpp
$(foreach name,$(LAVA_FILES),$(eval $(call vendor_file,$(name),$(DEST_DIR))))
$(DEST_DIR)build-info.cpp:
@echo "Generating $@"
@echo "int LLAMA_BUILD_NUMBER = 0;" > $@
@echo "char const *LLAMA_COMMIT = \"$(LLAMACPP_BASE_COMMIT)\";" >> $@
@echo "char const *LLAMA_COMPILER = \"\";" >> $@
@echo "char const *LLAMA_BUILD_TARGET = \"\";" >> $@
VENDORED_FILES += $(DEST_DIR)build-info.cpp
sync: $(LLAMACPP_REPO) .WAIT $(VENDORED_FILES) .WAIT remove-stale-files
sync-clean:
rm -f $(VENDORED_FILES) $(EXTRA_NATIVE_FILES)
PATS=*.c *.h *.cpp *.m *.metal *.cu *.cuh
NATIVE_DIRS=$(DEST_DIR) $(DEST_DIR)llamafile/ $(DEST_DIR)ggml-cuda/ $(DEST_DIR)ggml-cuda/template-instances/ $(DEST_DIR)ggml-cuda/vendors/
ALL_NATIVE_FILES=$(foreach dir,$(NATIVE_DIRS),$(wildcard $(addprefix $(dir),$(PATS))))
EXTRA_NATIVE_FILES=$(filter-out $(VENDORED_FILES) $(addprefix $(DEST_DIR),$(OLLAMA_NATIVE_FILES)), $(ALL_NATIVE_FILES))
remove-stale-files:
@rm -f $(EXTRA_NATIVE_FILES)
.PHONY: help-sync apply-patches sync create-patches remove-stale-files .WAIT
# Handy debugging for make variables
print-%:
@echo '$*=$($*)'
# Targets to assist in running tests
include make/common-defs.make
test:
cd .. && go test ./...
integration: $(OLLAMA_EXE)
cd .. && go test --tags=integration ./integration -v
lint:
cd .. && golangci-lint run -v
# Note: in this makefile we error instead of building to allow more fine-grain control of testing flows
$(OLLAMA_EXE):
@echo ""
@echo "ERROR: You must build ollama first - use 'make all' to build the ollama binaries"
@echo ""
@exit 1
\ No newline at end of file
# Common definitions for the various Makefiles
# No rules are defined here so this is safe to include at the beginning of other makefiles
OS := $(shell uname -s)
ARCH ?= $(subst aarch64,arm64,$(subst x86_64,amd64,$(shell uname -m)))
ifneq (,$(findstring MINGW,$(OS))$(findstring MSYS,$(OS)))
OS := windows
ARCH := $(shell systeminfo 2>/dev/null | grep "System Type" | grep ARM64 > /dev/null && echo "arm64" || echo "amd64" )
else ifeq ($(OS),Linux)
OS := linux
else ifeq ($(OS),Darwin)
OS := darwin
endif
comma:= ,
empty:=
space:= $(empty) $(empty)
uc = $(subst a,A,$(subst b,B,$(subst c,C,$(subst d,D,$(subst e,E,$(subst f,F,$(subst g,G,$(subst h,H,$(subst i,I,$(subst j,J,$(subst k,K,$(subst l,L,$(subst m,M,$(subst n,N,$(subst o,O,$(subst p,P,$(subst q,Q,$(subst r,R,$(subst s,S,$(subst t,T,$(subst u,U,$(subst v,V,$(subst w,W,$(subst x,X,$(subst y,Y,$(subst z,Z,$1))))))))))))))))))))))))))
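The uc macro upper-cases a word one letter at a time through nested subst calls, since GNU make has no built-in uppercase function; cuda.make uses it on Windows to turn CPU flag names like avx2 into the /arch:AVX2 spelling MSVC expects via CPU_FLAG_PREFIX.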
export CGO_CFLAGS_ALLOW = -mfma|-mf16c
export CGO_CXXFLAGS_ALLOW = -mfma|-mf16c
export HIP_PLATFORM = amd
export CGO_ENABLED=1
BUILD_DIR = ./llama/build/$(OS)-$(ARCH)
DIST_BASE = ./dist/$(OS)-$(ARCH)
ifeq ($(OS),windows)
# Absolute paths with cygpath to convert to 8.3 without spaces
PWD="$(shell pwd)"
DIST_OLLAMA_EXE=$(DIST_BASE)/ollama$(EXE_EXT)
else
CCACHE:=$(shell command -v ccache 2>/dev/null || echo "")
DIST_OLLAMA_EXE=$(DIST_BASE)/bin/ollama$(EXE_EXT)
endif
DIST_LIB_DIR = $(DIST_BASE)/lib/ollama
RUNNERS_DIST_DIR = $(DIST_LIB_DIR)/runners
RUNNERS_BUILD_DIR = $(BUILD_DIR)/runners
VERSION?=$(shell git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")
# Conditionally enable ccache for cgo builds too
ifneq ($(CCACHE),)
CC?=$(CCACHE) gcc
CXX?=$(CCACHE) g++
export CC
export CXX
endif
# Override in environment to tune CPU vector flags
ifeq ($(ARCH),amd64)
ifeq ($(origin CUSTOM_CPU_FLAGS),undefined)
GPU_RUNNER_CPU_FLAGS=avx
GPU_RUNNER_EXTRA_VARIANT=_avx
else
GPU_RUNNER_CPU_FLAGS=$(subst $(comma),$(space),$(CUSTOM_CPU_FLAGS))
endif
endif
ifeq ($(OS),windows)
CP := cp
OBJ_EXT := obj
SHARED_EXT := dll
EXE_EXT := .exe
SHARED_PREFIX :=
CPU_FLAG_PREFIX := /arch:
ifneq ($(HIP_PATH),)
# If HIP_PATH has spaces, hipcc trips over them when subprocessing
HIP_PATH := $(shell cygpath -m -s "$(patsubst %\,%,$(HIP_PATH))")
export HIP_PATH
endif
else ifeq ($(OS),linux)
CP := cp -df
OBJ_EXT := o
SHARED_EXT := so
SHARED_PREFIX := lib
CPU_FLAG_PREFIX := -m
else
OBJ_EXT := o
SHARED_EXT := so
CPU_FLAG_PREFIX := -m
CP := cp -df
endif
COMMON_SRCS := \
$(wildcard ./llama/*.c) \
$(wildcard ./llama/*.cpp)
COMMON_HDRS := \
$(wildcard ./llama/*.h) \
$(wildcard ./llama/*.hpp)
OLLAMA_EXE=./ollama$(EXE_EXT)
\ No newline at end of file
# Common definitions for the various Makefiles which set cuda settings
# No rules are defined here so this is safe to include at the beginning of other makefiles
ifeq ($(OS),windows)
CUDA_PATH?=$(shell cygpath -m -s "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\" 2>/dev/null)unknown
CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\\.." 2>/dev/null))
CUDA_11_PATH:=$(shell ls -d $(CUDA_BASE_DIR)/v11.? 2>/dev/null)
CUDA_11_COMPILER:=$(wildcard $(CUDA_11_PATH)/bin/nvcc.exe)
CUDA_11_LIB_DIR = $(strip $(shell ls -d $(CUDA_11_PATH)/bin 2>/dev/null))
CUDA_11_CGO_EXTRA_LDFLAGS = -L"$(CUDA_11_PATH)/lib/x64"
else ifeq ($(OS),linux)
CUDA_PATH?=/usr/local/cuda
CUDA_11_PATH:=$(shell ls -d $(CUDA_PATH)-11 2>/dev/null)
CUDA_11_COMPILER:=$(wildcard $(CUDA_11_PATH)/bin/nvcc)
CUDA_11_LIB_DIR=$(strip $(shell ls -d $(CUDA_11_PATH)/lib64 2>/dev/null || ls -d $(CUDA_11_PATH)/lib 2>/dev/null))
CUDA_11_CGO_EXTRA_LDFLAGS = -L"$(CUDA_11_LIB_DIR)" -L"$(CUDA_11_LIB_DIR)/stubs"
endif
# Common definitions for the various Makefiles which set cuda settings
# No rules are defined here so this is safe to include at the beginning of other makefiles
ifeq ($(OS),windows)
CUDA_PATH?=$(shell cygpath -m -s "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\" 2>/dev/null)unknown
CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\\.." 2>/dev/null))
CUDA_12_PATH:=$(shell ls -d $(CUDA_BASE_DIR)/v12.? 2>/dev/null)
CUDA_12_COMPILER:=$(wildcard $(CUDA_12_PATH)/bin/nvcc.exe)
CUDA_12_LIB_DIR = $(strip $(shell ls -d $(CUDA_12_PATH)/bin 2>/dev/null))
CUDA_12_CGO_EXTRA_LDFLAGS = -L"$(CUDA_12_PATH)/lib/x64"
else ifeq ($(OS),linux)
CUDA_PATH?=/usr/local/cuda
CUDA_12_PATH:=$(shell ls -d $(CUDA_PATH)-12 2>/dev/null)
CUDA_12_COMPILER:=$(wildcard $(CUDA_12_PATH)/bin/nvcc)
CUDA_12_LIB_DIR=$(strip $(shell ls -d $(CUDA_12_PATH)/lib64 2>/dev/null || ls -d $(CUDA_12_PATH)/lib 2>/dev/null))
CUDA_12_CGO_EXTRA_LDFLAGS = -L"$(CUDA_12_LIB_DIR)" -L"$(CUDA_12_LIB_DIR)/stubs"
endif
# Common definitions for all cuda versions
ifndef GPU_RUNNER_VARIANT
dummy:
$(error This makefile is not meant to build directly, but instead included in other Makefiles that set required variables)
endif
GPU_RUNNER_NAME := cuda$(GPU_RUNNER_VARIANT)
GPU_RUNNER_GO_TAGS := cuda cuda$(GPU_RUNNER_VARIANT)
GPU_RUNNER_DRIVER_LIB_LINK := -lcuda
GPU_RUNNER_LIBS_SHORT := cublas cudart cublasLt
ifeq ($(OS),windows)
# On windows, nvcc uses msvc which does not support avx512vbmi avx512vnni avx512bf16, but macros can turn them on
GPU_VECTOR_FLAGS=$(call uc,$(filter-out avx512bf16,$(filter-out avx512vnni,$(filter-out avx512vbmi,$(GPU_RUNNER_CPU_FLAGS)))))
GPU_COMPILER_EXTRA_FLAGS=$(if $(filter avx512vbmi,$(GPU_RUNNER_CPU_FLAGS)),-D__AVX512VBMI__)
GPU_COMPILER_EXTRA_FLAGS+=$(if $(filter avx512vnni,$(GPU_RUNNER_CPU_FLAGS)),-D__AVX512VNNI__)
GPU_COMPILER_EXTRA_FLAGS+=$(if $(filter avx512bf16,$(GPU_RUNNER_CPU_FLAGS)),-D__AVX512BF16__)
GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT),$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
GPU_COMPILER_CFLAGS = $(CFLAGS) -D_WIN32_WINNT=0x602
GPU_COMPILER_CXXFLAGS = $(CXXFLAGS) -D_WIN32_WINNT=0x602
else ifeq ($(OS),linux)
# On linux, nvcc requires avx512 -> -mavx512f -mavx512dq -mavx512bw
GPU_VECTOR_FLAGS=$(if $(filter avx512,$(GPU_RUNNER_CPU_FLAGS)),avx512f avx512dq avx512bw) $(filter-out avx512,$(GPU_RUNNER_CPU_FLAGS))
GPU_COMPILER_EXTRA_FLAGS = -fPIC -Wno-unused-function -std=c++17
GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
GPU_COMPILER_CFLAGS = $(CFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
GPU_COMPILER_CXXFLAGS = $(CXXFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
endif
GPU_DIST_LIB_DEPS= $(sort $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_LIBS))))
GPU_RUNNER_ARCH_FLAGS := $(foreach arch,$(subst ;,$(space),$(CUDA_ARCHITECTURES)),--generate-code=arch=compute_$(arch)$(comma)code=[compute_$(arch)$(comma)sm_$(arch)]) \
-DGGML_CUDA_USE_GRAPHS=1
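Each --generate-code entry asks nvcc to embed both SASS (sm_NN) and PTX (compute_NN) for that architecture, so the runner carries precompiled kernels for the listed GPUs and can still JIT the PTX on newer ones.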
GPU_COMPILER_CUFLAGS = \
$(GPU_COMPILER_EXTRA_FLAGS) \
-Xcompiler "$(addprefix $(CPU_FLAG_PREFIX),$(GPU_VECTOR_FLAGS))" \
-t2 \
-DGGML_CUDA_DMMV_X=32 \
-DGGML_CUDA_MMV_Y=1 \
-DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \
-DGGML_USE_CUDA=1 \
-DGGML_SHARED=1 \
-DGGML_BACKEND_SHARED=1 \
-DGGML_BUILD=1 \
-DGGML_BACKEND_BUILD=1 \
-DGGML_USE_LLAMAFILE \
-DK_QUANTS_PER_ITERATION=2 \
-DNDEBUG \
-D_GNU_SOURCE \
-D_XOPEN_SOURCE=600 \
-Wno-deprecated-gpu-targets \
--forward-unknown-to-host-compiler \
-use_fast_math \
-I./llama/ \
-O3
# Generalized GPU runner build
ifndef GPU_RUNNER_NAME
dummy:
$(error This makefile is not meant to build directly, but instead included in other Makefiles that set required variables)
endif
GPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERSION)\" $(EXTRA_GOLDFLAGS) $(TARGET_LDFLAGS)"
# TODO Unify how we handle dependencies in the dist/packaging and install flow
# today, cuda is bundled, but rocm is split out. Should split them each out by runner
DIST_GPU_RUNNER_DEPS_DIR = $(DIST_LIB_DIR)
GPU_RUNNER_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT))))
GPU_RUNNER_SRCS := \
$(filter-out $(wildcard llama/ggml-cuda/fattn*.cu),$(wildcard llama/ggml-cuda/*.cu)) \
$(wildcard llama/ggml-cuda/template-instances/mmq*.cu) \
llama/ggml.c llama/ggml-backend.cpp llama/ggml-alloc.c llama/ggml-quants.c llama/sgemm.cpp llama/ggml-threading.cpp
GPU_RUNNER_HDRS := \
$(wildcard llama/ggml-cuda/*.cuh)
# Conditional flags and components to speed up developer builds
ifneq ($(OLLAMA_FAST_BUILD),)
GPU_COMPILER_CUFLAGS += \
-DGGML_DISABLE_FLASH_ATTN
else
GPU_RUNNER_SRCS += \
$(wildcard llama/ggml-cuda/fattn*.cu) \
$(wildcard llama/ggml-cuda/template-instances/fattn-wmma*.cu) \
$(wildcard llama/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu) \
$(wildcard llama/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu) \
$(wildcard llama/ggml-cuda/template-instances/fattn-vec*f16-f16.cu)
endif
GPU_RUNNER_OBJS := $(GPU_RUNNER_SRCS:.cu=.$(GPU_RUNNER_NAME).$(OBJ_EXT))
GPU_RUNNER_OBJS := $(GPU_RUNNER_OBJS:.c=.$(GPU_RUNNER_NAME).$(OBJ_EXT))
GPU_RUNNER_OBJS := $(addprefix $(BUILD_DIR)/,$(GPU_RUNNER_OBJS:.cpp=.$(GPU_RUNNER_NAME).$(OBJ_EXT)))
DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)))
BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)))
$(GPU_RUNNER_NAME): $(BUILD_RUNNERS)
dist: $(DIST_RUNNERS)
# Build targets
$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cu
@-mkdir -p $(dir $@)
$(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CFLAGS) $(GPU_COMPILER_CUFLAGS) $(GPU_RUNNER_ARCH_FLAGS) -o $@ $<
$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.c
@-mkdir -p $(dir $@)
$(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CFLAGS) -o $@ $<
$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cpp
@-mkdir -p $(dir $@)
$(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CXXFLAGS) -o $@ $<
$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): TARGET_CGO_LDFLAGS = $(CGO_EXTRA_LDFLAGS) -L"$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/"
$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) ./llama/*.go ./llama/runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
@-mkdir -p $(dir $@)
GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./cmd/runner
$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS)
@-mkdir -p $(dir $@)
$(CCACHE) $(GPU_COMPILER) --shared -L$(GPU_LIB_DIR) $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@
# Distribution targets
$(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
@-mkdir -p $(dir $@)
$(CP) $< $@
$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) $(GPU_DIST_LIB_DEPS)
$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT)
@-mkdir -p $(dir $@)
$(CP) $< $@
$(GPU_DIST_LIB_DEPS):
@-mkdir -p $(dir $@)
$(CP) $(GPU_LIB_DIR)/$(notdir $@) $(dir $@)
clean:
rm -f $(GPU_RUNNER_OBJS) $(BUILD_RUNNERS) $(DIST_RUNNERS)
.PHONY: clean $(GPU_RUNNER_NAME)
# Handy debugging for make variables
print-%:
@echo '$*=$($*)'