Unverified Commit dcfb7a10 authored by Michael Yang, committed by GitHub

next build (#8539)



* add build to .dockerignore

* test: only build one arch

* add build to .gitignore

* fix ccache path

* filter amdgpu targets

* only filter if autodetecting

* Don't clobber gpu list for default runner

This ensures the GPU-specific environment variables are set properly

* explicitly set CXX compiler for HIP

* Update build_windows.ps1

This isn't complete, but it's close. Dependencies are missing, and it only builds the "default" preset.

* build: add ollama subdir

* add .git to .dockerignore

* docs: update development.md

* update build_darwin.sh

* remove unused scripts

* llm: add cwd and build/lib/ollama to library paths

* default DYLD_LIBRARY_PATH to LD_LIBRARY_PATH in runner on macOS

* add additional cmake output vars for msvc

* interim edits to make server detection logic work with dll directories like lib/ollama/cuda_v12

* remove unnecessary filepath.Dir, cleanup

* add hardware-specific directory to path

* use absolute server path

* build: linux arm

* cmake install targets

* remove unused files

* ml: visit each library path once

* build: skip cpu variants on arm

* build: install cpu targets

* build: fix workflow

* shorter names

* fix rocblas install

* docs: clean up development.md

* consistent build dir removal in development.md

* silence -Wimplicit-function-declaration build warnings in ggml-cpu

* update readme

* update development readme

* llm: update library lookup logic now that there is one runner (#8587) (a sketch of the new lookup appears below)

* tweak development.md

* update docs

* add windows cuda/rocm tests

---------
Co-authored-by: jmorganca <jmorganca@gmail.com>
Co-authored-by: Daniel Hiltgen <daniel@ollama.com>
parent 2ef3c803
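The central runtime change in this commit is that a single runner binary now discovers GPU libraries under lib/ollama (for example lib/ollama/cuda_v12) at startup, rather than shipping one prebuilt server per backend. Below is a condensed, self-contained Go sketch of that selection step; it mirrors the server.go hunks later in this diff, but the path and runner name passed in main are illustrative stand-ins.

package main

import (
	"fmt"
	"os"
	"path/filepath"
	"strings"
)

// pickGPULibraries condenses the selection logic from the server.go diff in
// this commit: scan lib/ollama for per-backend subdirectories (cuda_v12,
// cuda_v11, rocm, ...), prefer an exact match for the detected runner name,
// then accept anything in the same family (the prefix before "_").
func pickGPULibraries(libOllamaPath, runnerName string) ([]string, error) {
	entries, err := os.ReadDir(libOllamaPath)
	if err != nil {
		return nil, fmt.Errorf("could not read libollama dir: %w", err)
	}

	libs := make(map[string]string)
	for _, entry := range entries {
		if entry.IsDir() {
			libs[entry.Name()] = filepath.Join(libOllamaPath, entry.Name())
		}
	}

	var compatible []string
	for name := range libs {
		switch {
		case name == runnerName: // exact match sorts first
			compatible = append([]string{name}, compatible...)
		case strings.Split(name, "_")[0] == strings.Split(runnerName, "_")[0]:
			compatible = append(compatible, name) // same family, e.g. cuda_v11 for cuda_v12
		}
	}
	return compatible, nil
}

func main() {
	compatible, err := pickGPULibraries("/usr/local/lib/ollama", "cuda_v12") // illustrative path
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(compatible)
}

Exact matches sort to the head of the list, so the retry loop in server.go (shown later in this diff) attempts the best candidate first before falling back within the same family.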
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 3 Dec 2024 21:30:51 -0800
Subject: [PATCH] relative include paths
---
ggml/src/ggml-cpu/ggml-cpu.c | 2 +-
ggml/src/ggml-cpu/ggml-cpu.cpp | 3 +--
ggml/src/ggml-quants.c | 2 +-
3 files changed, 3 insertions(+), 4 deletions(-)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index b307d554..4eb39c52 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -10,7 +10,7 @@
#include "ggml-quants.h"
#include "ggml-cpu-quants.h"
#include "ggml-threading.h"
-#include "amx/amx.h"
+#include "amx.h"
#include "ggml.h"
#if defined(_MSC_VER) || defined(__MINGW32__)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
index f11399cc..2a8b40ce 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -4,8 +4,7 @@
#include "ggml-cpu-aarch64.h"
#include "ggml-cpu-traits.h"
#include "ggml-impl.h"
-#include "amx/amx.h"
-
+#include "amx.h"
#include <cctype>
#include <string>
#include <vector>
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 7918388a..e2ed84e4 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -3,7 +3,7 @@
#include "ggml-quants.h"
#include "ggml-impl.h"
-#include "ggml-cpu/ggml-cpu-impl.h"
+#include "ggml-cpu-impl.h"
#include "ggml-cpu.h"
#include <math.h>
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Tue, 14 Jan 2025 12:01:24 -0800
Subject: [PATCH] sort devices by score
---
ggml/src/ggml-backend-reg.cpp | 21 +++++++++++++--------
1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 899d16f2..ac5cda07 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -150,7 +150,7 @@ struct ggml_backend_reg_entry {
struct ggml_backend_registry {
std::vector<ggml_backend_reg_entry> backends;
- std::vector<ggml_backend_dev_t> devices;
+ std::vector<std::pair<ggml_backend_dev_t, int>> devices;
ggml_backend_registry() {
#ifdef GGML_USE_CUDA
@@ -195,7 +195,7 @@ struct ggml_backend_registry {
}
}
- void register_backend(ggml_backend_reg_t reg, dl_handle_ptr handle = nullptr) {
+ void register_backend(ggml_backend_reg_t reg, int score = -1, dl_handle_ptr handle = nullptr) {
if (!reg) {
return;
}
@@ -206,15 +206,15 @@ struct ggml_backend_registry {
#endif
backends.push_back({ reg, std::move(handle) });
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
- register_device(ggml_backend_reg_dev_get(reg, i));
+ register_device(ggml_backend_reg_dev_get(reg, i), score);
}
}
- void register_device(ggml_backend_dev_t device) {
+ void register_device(ggml_backend_dev_t device, int score = -1) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
#endif
- devices.push_back(device);
+ devices.push_back({device, score});
}
ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
@@ -257,7 +257,7 @@ struct ggml_backend_registry {
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
- register_backend(reg, std::move(handle));
+ register_backend(reg, score_fn ? score_fn() : -1, std::move(handle));
return reg;
}
@@ -280,7 +280,7 @@ struct ggml_backend_registry {
// remove devices
devices.erase(
std::remove_if(devices.begin(), devices.end(),
- [reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }),
+ [reg](std::pair<ggml_backend_dev_t, int> dev) { return ggml_backend_dev_backend_reg(dev.first) == reg; }),
devices.end());
// remove backend
@@ -338,7 +338,12 @@ size_t ggml_backend_dev_count() {
ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
GGML_ASSERT(index < ggml_backend_dev_count());
- return get_reg().devices[index];
+ auto devices = get_reg().devices;
+ if (!std::is_heap(devices.begin(), devices.end())) {
+ std::make_heap(devices.begin(), devices.end(), [](const auto & a, const auto & b) { return a.second < b.second; });
+ }
+
+ return devices[index].first;
}
ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
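For context on what the score buys: statically registered devices keep the default score of -1, while dynamically loaded backends report how well their build matches the host, and enumeration then surfaces the highest-scoring device first. The patch implements this in C++ with std::make_heap over (device, score) pairs; the Go fragment below is only a hedged rendering of the ordering rule, with invented device names and scores.

package main

import (
	"fmt"
	"sort"
)

// device pairs a backend device with the score its backend reported when it
// was loaded; -1 is the default for statically registered backends.
type device struct {
	name  string
	score int
}

func main() {
	devices := []device{
		{"CPU", -1},
		{"cpu_haswell", 64}, // invented scores, purely for illustration
		{"cpu_sandybridge", 21},
	}

	// The patch's intent: reorder so the highest score is enumerated first.
	// (The C++ code uses std::make_heap, which only guarantees the front.)
	sort.SliceStable(devices, func(i, j int) bool {
		return devices[i].score > devices[j].score
	})

	fmt.Println(devices[0].name) // prints cpu_haswell
}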
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Tue, 14 Jan 2025 15:59:04 -0800
Subject: [PATCH] add phony target ggml-cpu for all cpu variants
---
ggml/src/CMakeLists.txt | 2 ++
1 file changed, 2 insertions(+)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 84101c32..72b488dd 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -278,6 +278,7 @@ function(ggml_add_cpu_backend_variant tag_name)
endforeach()
ggml_add_cpu_backend_variant_impl(${tag_name})
+ add_dependencies(ggml-cpu ggml-cpu-${tag_name})
endfunction()
ggml_add_backend(CPU)
@@ -286,6 +287,7 @@ if (GGML_CPU_ALL_VARIANTS)
if (NOT GGML_BACKEND_DL)
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
endif()
+ add_custom_target(ggml-cpu)
ggml_add_cpu_backend_variant(sandybridge AVX)
ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 FMA)
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512)
#pragma once
#include <stdint.h>
#include <stdbool.h>
#ifdef __cplusplus
extern "C" {
#endif
bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t, int64_t, int64_t,
const void *, int64_t, const void *, int64_t, void *, int64_t,
int, int, int);
#ifdef __cplusplus
}
#endif
@@ -29,7 +29,6 @@ import (
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/llama"
"github.com/ollama/ollama/runners"
)
type LlamaServer interface {
@@ -91,8 +90,6 @@ func LoadModel(model string, maxArraySize int) (*GGML, error) {
// The gpu list must be a single family.
func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
var err error
var cpuRunner string
var estimate MemoryEstimate
var systemTotalMemory uint64
var systemFreeMemory uint64
var systemSwapFreeMemory uint64
@@ -107,12 +104,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
if opts.NumGPU == 0 {
gpus = discover.GetCPUInfo()
}
if len(gpus) == 1 && gpus[0].Library == "cpu" {
cpuRunner = runners.ServerForCpu()
estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
} else {
estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
if len(gpus) > 1 || gpus[0].Library != "cpu" {
switch {
case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:
// disable partial offloading when model is greater than total system memory as this
@@ -120,7 +114,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
opts.NumGPU = 0
case gpus[0].Library != "metal" && estimate.Layers == 0:
// Don't bother loading into the GPU if no layers can fit
cpuRunner = runners.ServerForCpu()
gpus = discover.GetCPUInfo()
case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
opts.NumGPU = estimate.Layers
@@ -140,36 +133,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
estimate.log()
// Loop through potential servers
finalErr := errors.New("no suitable llama servers found")
availableServers := runners.GetAvailableServers()
var servers []string
if cpuRunner != "" {
servers = []string{cpuRunner}
} else {
servers = runners.ServersForGpu(gpus[0].RunnerName()) // All GPUs in the list are matching Library and Variant
}
demandLib := envconfig.LLMLibrary()
if demandLib != "" {
serverPath := availableServers[demandLib]
if serverPath == "" {
slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib))
} else {
slog.Info("user override", "OLLAMA_LLM_LIBRARY", demandLib, "path", serverPath)
servers = []string{demandLib}
if strings.HasPrefix(demandLib, "cpu") || (!(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") && demandLib == runners.BuiltinName()) {
// Omit the GPU flag to silence the warning
opts.NumGPU = -1
}
}
}
if len(servers) == 0 {
return nil, fmt.Errorf("no servers found for %v", gpus)
}
params := []string{
"--model", model,
"--ctx-size", strconv.Itoa(opts.NumCtx),
@@ -270,21 +233,49 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
params = append(params, "--multiuser-cache")
}
for i := range servers {
builtin := servers[i] == runners.BuiltinName()
server := availableServers[servers[i]]
if server == "" {
// Shouldn't happen
finalErr = fmt.Errorf("[%d] server %s not listed in available servers %v", i, servers[i], availableServers)
slog.Error("server list inconsistent", "error", finalErr)
// get available libraries
if err != nil {
return nil, fmt.Errorf("could not get libollama dir: %w", err)
}
entries, err := os.ReadDir(discover.LibOllamaPath)
if err != nil {
return nil, fmt.Errorf("could not read libollama dir: %w", err)
}
libs := make(map[string]string)
for _, entry := range entries {
if entry.IsDir() {
libs[entry.Name()] = filepath.Join(discover.LibOllamaPath, entry.Name())
}
}
lib := gpus[0].RunnerName()
requested := envconfig.LLMLibrary()
if libs[requested] != "" {
slog.Info("using requested gpu library", "requested", requested)
lib = requested
}
var compatible []string
for k := range libs {
// exact match first
if k == lib {
compatible = append([]string{k}, compatible...)
continue
}
if strings.HasPrefix(servers[i], "cpu") || (builtin && !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64")) {
gpus = discover.GetCPUInfo()
// then match the family (e.g. 'cuda')
if strings.Split(k, "_")[0] == strings.Split(lib, "_")[0] {
compatible = append(compatible, k)
}
}
slog.Debug("compatible gpu libraries", "compatible", compatible)
// Find an availableServers port, retry on each iteration in case the failure was a port conflict race
// iterate through compatible GPU libraries such as 'cuda_v12', 'cuda_v11', 'rocm', etc.
// adding each library's respective path to the LD_LIBRARY_PATH, until finally running
// without any LD_LIBRARY_PATH flags
for {
port := 0
if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
var l *net.TCPListener
@@ -305,25 +296,45 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
if runtime.GOOS == "windows" {
pathEnv = "PATH"
}
// Start with the server directory for the LD_LIBRARY_PATH/PATH
libraryPaths := []string{filepath.Dir(server)}
var libraryPaths []string
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
// favor our bundled library dependencies over system libraries
libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
}
if len(compatible) > 0 {
c := compatible[0]
if libpath, ok := libs[c]; ok {
slog.Debug("adding gpu library", "path", libpath)
libraryPaths = append(libraryPaths, libpath)
}
}
// Note: we always put the dependency path first
// since this was the exact version we compiled/linked against
if gpus[0].DependencyPath != nil {
slog.Debug("adding gpu dependency paths", "paths", gpus[0].DependencyPath)
// assume gpus from the same library have the same dependency path
libraryPaths = append(gpus[0].DependencyPath, libraryPaths...)
}
// finally, add the root library path
libraryPaths = append(libraryPaths, discover.LibOllamaPath)
exe, err := os.Executable()
if err != nil {
return nil, fmt.Errorf("unable to lookup executable path: %w", err)
}
exe, err = filepath.EvalSymlinks(exe)
if err != nil {
return nil, fmt.Errorf("unable to evaluate symlinks for executable path: %w", err)
}
// TODO - once fully switched to the Go runner, load the model here for tokenize/detokenize cgo access
s := &llmServer{
port: port,
cmd: exec.Command(server, finalParams...),
cmd: exec.Command(exe, finalParams...),
status: NewStatusWriter(os.Stderr),
options: opts,
modelPath: model,
@@ -394,17 +405,17 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
}
if err = s.cmd.Start(); err != nil {
// Detect permission denied and augment the message about noexec
if errors.Is(err, os.ErrPermission) {
finalErr = fmt.Errorf("unable to start server %w. %s may have noexec set. Set OLLAMA_TMPDIR for server to a writable executable directory", err, server)
continue
}
msg := ""
var msg string
if s.status != nil && s.status.LastErrMsg != "" {
msg = s.status.LastErrMsg
}
err = fmt.Errorf("error starting the external llama server: %v %s", err, msg)
finalErr = err
err := fmt.Errorf("error starting runner: %v %s", err, msg)
if len(compatible) == 0 {
return nil, err
}
slog.Warn("unable to start runner with compatible gpu", "error", err, "compatible", compatible)
compatible = compatible[1:]
continue
}
@@ -413,7 +424,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
err := s.cmd.Wait()
// Favor a more detailed message over the process exit status
if err != nil && s.status != nil && s.status.LastErrMsg != "" {
slog.Debug("llama runner terminated", "error", err)
slog.Error("llama runner terminated", "error", err)
if strings.Contains(s.status.LastErrMsg, "unknown model") {
s.status.LastErrMsg = "this model is not supported by your version of Ollama. You may need to upgrade"
}
@@ -425,9 +436,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
return s, nil
}
slog.Error("unable to load any llama server", "error", finalErr)
return nil, finalErr
}
type ServerStatus int
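The reworked loop above deserves a plain statement: the runner is launched with the best compatible library directory prepended to the loader path, and each startup failure drops that directory and retries, ending with a bare attempt carrying no extra paths. A stripped-down sketch of that retry shape, assuming a POSIX system and using /bin/true as a stand-in for the runner binary:

package main

import (
	"fmt"
	"os"
	"os/exec"
)

// startRunner sketches the fallback loop from the diff above: try the best
// compatible GPU library first and drop it on failure. The real code also
// rebuilds the full parameter list and port selection on each attempt.
func startRunner(exe string, compatible []string, libs map[string]string) (*exec.Cmd, error) {
	for {
		cmd := exec.Command(exe) // placeholder: the real invocation passes runner flags
		if len(compatible) > 0 {
			cmd.Env = append(os.Environ(), "LD_LIBRARY_PATH="+libs[compatible[0]])
		}
		if err := cmd.Start(); err != nil {
			if len(compatible) == 0 {
				return nil, err // nothing left to fall back to
			}
			compatible = compatible[1:] // retry with the next library
			continue
		}
		return cmd, nil
	}
}

func main() {
	cmd, err := startRunner("/bin/true", nil, nil) // /bin/true stands in for the runner
	if err != nil {
		fmt.Println(err)
		return
	}
	_ = cmd.Wait() // reap the child; the real server tracks it for status
	fmt.Println("started pid", cmd.Process.Pid)
}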
@@ -18,8 +18,8 @@ const config: ForgeConfig = {
asar: true,
icon: './assets/icon.icns',
extraResource: [
'../dist/ollama',
'../dist/darwin-amd64/lib',
path.join(__dirname, '../dist/darwin/ollama'),
...fs.readdirSync(path.join(__dirname, '../dist/darwin/amd64')).map(f => path.join(__dirname, '../dist/darwin/amd64', f)),
path.join(__dirname, './assets/iconTemplate.png'),
path.join(__dirname, './assets/iconTemplate@2x.png'),
path.join(__dirname, './assets/iconUpdateTemplate.png'),
......@@ -43,7 +43,7 @@ const config: ForgeConfig = {
}
: {}),
osxUniversal: {
x64ArchFiles: '**/ollama*',
x64ArchFiles: '*',
},
},
rebuildConfig: {},
# Build the discrete cpu runner(s) for the platform which do not rely on 3rd party GPU libraries
include make/common-defs.make
CPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERSION)\" $(TARGET_LDFLAGS)"
ifeq ($(ARCH),amd64)
ifeq ($(origin CUSTOM_CPU_FLAGS),undefined)
RUNNERS = cpu_avx cpu_avx2
endif
endif
DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(RUNNERS)))
BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(RUNNERS)))
cpu: $(BUILD_RUNNERS)
dist: $(DIST_RUNNERS)
$(RUNNERS_BUILD_DIR)/cpu_avx/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS="avx"
$(RUNNERS_BUILD_DIR)/cpu_avx/ollama_llama_server$(EXE_EXT): ./llama/*.go ./llama/runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
@-mkdir -p $(dir $@)
GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./cmd/runner
$(RUNNERS_BUILD_DIR)/cpu_avx2/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS="avx avx2"
$(RUNNERS_BUILD_DIR)/cpu_avx2/ollama_llama_server$(EXE_EXT): ./llama/*.go ./llama/runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
@-mkdir -p $(dir $@)
GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./cmd/runner
$(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
@-mkdir -p $(dir $@)
cp $< $@
clean:
rm -f $(BUILD_RUNNERS) $(DIST_RUNNERS)
.PHONY: clean cpu dist
# Handy debugging for make variables
print-%:
@echo '$*=$($*)'
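The -tags plumbing above leans on Go build constraints: each CPU runner variant is the same program compiled with a different tag, which pulls in differently flagged cgo sources. The file below is a hypothetical illustration of that gating (the file name, package, and constant are inventions); note that common-defs.make exports CGO_CFLAGS_ALLOW precisely because -mfma and -mf16c fall outside cgo's default flag allowlist.

//go:build avx2

// cpu_avx2.go: only compiled when built with `go build -tags avx2` (or
// `-tags avx,avx2`), which is what the cpu_avx2 rule above arranges via
// TARGET_CPU_FLAGS.
package runner

/*
#cgo CFLAGS: -mavx -mavx2 -mfma -mf16c
*/
import "C"

// Variant reports which CPU feature set this binary was built for.
const Variant = "cpu_avx2"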
# Build rules for CUDA v11 runner
include make/common-defs.make
include make/cuda-v11-defs.make
GPU_RUNNER_VARIANT := _v11
GPU_COMPILER=$(CUDA_11_COMPILER)
CUDA_ARCHITECTURES?=50;52;53;60;61;62;70;72;75;80;86
GPU_LIB_DIR = $(CUDA_11_LIB_DIR)
CGO_EXTRA_LDFLAGS = $(CUDA_11_CGO_EXTRA_LDFLAGS)
include make/cuda.make
include make/gpu.make
\ No newline at end of file
# Build rules for CUDA v12 runner
include make/common-defs.make
include make/cuda-v12-defs.make
GPU_RUNNER_VARIANT := _v12
GPU_COMPILER=$(CUDA_12_COMPILER)
CUDA_ARCHITECTURES?=60;61;62;70;72;75;80;86;87;89;90;90a
GPU_LIB_DIR = $(CUDA_12_LIB_DIR)
CGO_EXTRA_LDFLAGS = $(CUDA_12_CGO_EXTRA_LDFLAGS)
include make/cuda.make
include make/gpu.make
\ No newline at end of file
# Makefile for building top-level ollama binary
include make/common-defs.make
exe: $(OLLAMA_EXE)
dist_exe dist_ollama: $(DIST_OLLAMA_EXE)
GO_DEPS=$(foreach dir,$(shell go list -deps -f '{{.Dir}}' . ),$(wildcard $(dir)/*.go))
CPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERSION)\" $(EXTRA_GOLDFLAGS) $(TARGET_LDFLAGS)"
$(OLLAMA_EXE) $(DIST_OLLAMA_EXE): TARGET_CPU_FLAGS=$(CUSTOM_CPU_FLAGS)
$(OLLAMA_EXE) $(DIST_OLLAMA_EXE): $(COMMON_SRCS) $(COMMON_HDRS) $(GO_DEPS)
GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath $(if $(CUSTOM_CPU_FLAGS),-tags $(subst $(space),$(comma),$(CUSTOM_CPU_FLAGS))) -o $@ .
.PHONY: ollama dist_ollama exe dist_exe
# Handy debugging for make variables
print-%:
@echo '$*=$($*)'
# Build rules for ROCm runner
#
# Note: at present we only support a single ROCm version (whichever is default on the build system)
# unlike CUDA where we'll build both a v11 and v12 variant.
include make/common-defs.make
include make/rocm-defs.make
HIP_ARCHS_COMMON := gfx900 gfx940 gfx941 gfx942 gfx1010 gfx1012 gfx1030 gfx1100 gfx1101 gfx1102
HIP_ARCHS_LINUX := gfx906:xnack- gfx908:xnack- gfx90a:xnack+ gfx90a:xnack-
ifeq ($(OS),windows)
GPU_LIB_DIR := $(shell cygpath -m -s "$(HIP_PATH)/bin")
CGO_EXTRA_LDFLAGS := -L$(shell cygpath -m -s "$(HIP_PATH)/lib")
HIP_ARCHS?=$(HIP_ARCHS_COMMON)
GPU_COMPILER_CFLAGS = $(CFLAGS) -D_WIN32_WINNT=0x602
GPU_COMPILER_CXXFLAGS = $(CXXFLAGS) -D_WIN32_WINNT=0x602
else ifeq ($(OS),linux)
GPU_LIB_DIR := $(strip $(shell ls -d $(HIP_PATH)/lib64 2>/dev/null || ls -d $(HIP_PATH)/lib 2>/dev/null))
CGO_EXTRA_LDFLAGS := -L$(GPU_LIB_DIR)
HIP_ARCHS?=$(HIP_ARCHS_COMMON) $(HIP_ARCHS_LINUX)
GPU_COMPILER_CFLAGS = $(CFLAGS) -fPIC -D_GNU_SOURCE
GPU_COMPILER_CXXFLAGS = $(CXXFLAGS) -fPIC -D_GNU_SOURCE
endif
GPU_COMPILER=$(HIP_COMPILER)
# TODO future multi-variant support for ROCm
# ROCM_VERSION = $(subst $(space),.,$(wordlist 1,2,$(subst .,$(space),$(word 3,$(subst -,$(space),$(filter HIP version: %,$(shell $(GPU_COMPILER) --version)))))))
# ifneq (,$(ROCM_VERSION))
# GPU_RUNNER_VARIANT = _v$(ROCM_VERSION)
# endif
GPU_RUNNER_GO_TAGS := rocm
GPU_RUNNER_NAME := rocm$(GPU_RUNNER_VARIANT)
GPU_RUNNER_DRIVER_LIB_LINK := -lamdhip64
GPU_RUNNER_LIBS_SHORT := hipblas rocblas
# Note: ROCm requires an extra step of discovering and copying the transitive dependencies on linux
ifeq ($(OS),windows)
ROCM_DIST_DEPS_DIR = ./dist/$(OS)-$(ARCH)/lib/ollama
GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT),$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
else ifeq ($(OS),linux)
ROCM_DIST_DEPS_DIR = ./dist/$(OS)-$(ARCH)-rocm/lib/ollama
GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
ROCM_TRANSITIVE_LIBS_INITIAL = $(sort $(shell ldd $(GPU_LIBS) | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf))
GPU_TRANSITIVE_LIBS = $(sort $(shell readlink -f $(ROCM_TRANSITIVE_LIBS_INITIAL)) $(ROCM_TRANSITIVE_LIBS_INITIAL))
FILTERED_GPU_TRANSITIVE_LIBS=$(sort $(filter-out $(addprefix %,$(notdir $(GPU_LIBS))), $(GPU_TRANSITIVE_LIBS)))
GPU_DIST_TRANSITIVE_LIB_DEPS = $(sort $(addprefix $(ROCM_DIST_DEPS_DIR)/,$(notdir $(FILTERED_GPU_TRANSITIVE_LIBS))))
endif
GPU_DIST_LIB_DEPS= $(sort $(addprefix $(ROCM_DIST_DEPS_DIR)/,$(notdir $(GPU_LIBS))))
ROCBLAS_DIST_DEP_MANIFEST = $(ROCM_DIST_DEPS_DIR)/rocblas/library/TensileManifest.txt
ifeq ($(OS),linux)
GPU_COMPILER_FPIC := -fPIC -Wno-unused-function -std=gnu++17
else ifeq ($(OS),windows)
GPU_COMPILER_FPIC := -Xclang --dependent-lib=msvcrt
endif
GPU_RUNNER_ARCH_FLAGS := $(foreach arch,$(subst ;,$(space),$(HIP_ARCHS)),--offload-arch=$(arch))
# HIPCC uses clang which requires avx512 -> -mavx512f -mavx512dq -mavx512bw
GPU_VECTOR_FLAGS=$(if $(filter avx512,$(GPU_RUNNER_CPU_FLAGS)),avx512f avx512dq avx512bw) $(filter-out avx512,$(GPU_RUNNER_CPU_FLAGS))
GPU_COMPILER_CUFLAGS = \
$(GPU_COMPILER_FPIC) \
$(addprefix -m,$(GPU_VECTOR_FLAGS)) \
-mf16c \
-mfma \
-c \
-O3 \
-DGGML_USE_CUDA \
-DGGML_BUILD=1 \
-DGGML_BACKEND_BUILD=1 \
-DGGML_SHARED=1 \
-DGGML_BACKEND_SHARED=1 \
-DGGML_CUDA_DMMV_X=32 \
-DGGML_CUDA_MMV_Y=1 \
-DGGML_SCHED_MAX_COPIES=4 \
-DGGML_USE_HIP \
-DGGML_USE_LLAMAFILE \
-DHIP_FAST_MATH \
-D__HIP_PLATFORM_AMD__=1 \
-D__HIP_ROCclr__=1 \
-DNDEBUG \
-DK_QUANTS_PER_ITERATION=2 \
-D_CRT_SECURE_NO_WARNINGS \
-D_GNU_SOURCE \
-D_XOPEN_SOURCE=600 \
-DUSE_PROF_API=1 \
-std=gnu++17 \
-x hip \
-mllvm=-amdgpu-early-inline-all=true \
-mllvm=-amdgpu-function-calls=false \
-Wno-expansion-to-defined \
-Wno-invalid-noreturn \
-Wno-ignored-attributes \
-Wno-pass-failed \
-Wno-deprecated-declarations \
-Wno-unused-result \
-I./llama/
# Workaround buggy P2P copy on some windows multi-GPU setups
# This workaround breaks linux systems with small system RAM, so only enable on windows
ifeq ($(OS),windows)
GPU_COMPILER_CUFLAGS += -DGGML_CUDA_NO_PEER_COPY=1
endif
include make/gpu.make
# Adjust the rules from gpu.make to handle the ROCm dependencies properly
$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(ROCBLAS_DIST_DEP_MANIFEST) $(GPU_DIST_TRANSITIVE_LIB_DEPS)
$(ROCBLAS_DIST_DEP_MANIFEST):
@-mkdir -p $(dir $@)
@echo "Copying rocblas library..."
(cd $(GPU_LIB_DIR)/rocblas/library/ && tar cf - . ) | (cd $(dir $@) && tar xf - )
@echo "rocblas library copy complete"
$(GPU_DIST_TRANSITIVE_LIB_DEPS):
@-mkdir -p $(dir $@)
$(CP) $(dir $(filter %$(notdir $@),$(GPU_TRANSITIVE_LIBS)))/$(notdir $@) $(dir $@)
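The transitive-dependency discovery above shells out to ldd, takes the resolved path after each "=>", and keeps only ROCm-adjacent libraries. A rough Go equivalent of that step, for illustration only (the library path in main is a guess at a typical ROCm install):

package main

import (
	"fmt"
	"os/exec"
	"strings"
)

// transitiveDeps mimics the ROCM_TRANSITIVE_LIBS_INITIAL rule above: run ldd
// on a shared library, take the resolved path after "=>", and keep only
// libraries matching the same keywords the Makefile greps for.
func transitiveDeps(library string) ([]string, error) {
	out, err := exec.Command("ldd", library).Output()
	if err != nil {
		return nil, err
	}

	keywords := []string{"rocm", "amdgpu", "libtinfo", "libnuma", "libelf"}
	var deps []string
	for _, line := range strings.Split(string(out), "\n") {
		// ldd lines look like: "libnuma.so.1 => /lib/x86_64-linux-gnu/libnuma.so.1 (0x...)"
		_, after, found := strings.Cut(line, "=>")
		if !found {
			continue
		}
		fields := strings.Fields(after)
		if len(fields) == 0 {
			continue
		}
		for _, kw := range keywords {
			if strings.Contains(fields[0], kw) {
				deps = append(deps, fields[0])
				break
			}
		}
	}
	return deps, nil
}

func main() {
	deps, err := transitiveDeps("/opt/rocm/lib/librocblas.so") // illustrative path
	if err != nil {
		fmt.Println("ldd failed:", err)
		return
	}
	fmt.Println(deps)
}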
# Helpers for managing our vendored llama.cpp repo and patch set
REPO_ROOT:=./
DEST_DIR:=./llama/
include $(DEST_DIR)vendoring
LLAMACPP_REPO := ./llama/vendor/
# Relative to the vendor dir
VENDOR_RELATIVE_PATCH_DIR := ../patches/
help-sync:
@echo "The following make targets will help you update llama.cpp to a new base commit, or work on new features/fixes"
@echo ""
@echo " make apply-patches # Establish the tracking repo if not already present, reset to the base commit, and apply our patch set"
@echo " make sync # Vendor llama.cpp and ggml from the tracking repo working tree"
@echo " make sync-clean # Remove all vendored files"
@echo " make create-patches # Generate the patch set based on the current commits in the tracking repo since the base commit"
@echo ""
@echo "For more details on the workflow, see the Vendoring section in 'docs/development.md'"
apply-patches: $(LLAMACPP_REPO)
@if ! git -C $(LLAMACPP_REPO) --no-pager diff --exit-code ; then \
echo "ERROR: Your llama.cpp repo is dirty. The apply-patches target requires a clean working tree"; \
echo "To clobber: git -C $(LLAMACPP_REPO) reset --hard HEAD" ; \
exit 1; \
fi
@echo "Checking out $(LLAMACPP_BASE_COMMIT)"
@git -C $(LLAMACPP_REPO) checkout -q $(LLAMACPP_BASE_COMMIT) || \
git -C $(LLAMACPP_REPO) fetch --all && git -C $(LLAMACPP_REPO) checkout -q $(LLAMACPP_BASE_COMMIT)
@echo "Applying ollama patches..."
@cd $(LLAMACPP_REPO) && git -c 'user.name=nobody' -c 'user.email=<>' am -3 $(VENDOR_RELATIVE_PATCH_DIR)*.patch || \
echo "Please resolve the conflicts in $(LLAMACPP_REPO), and run 'git am --continue' to continue applying subsequent patches"
@echo ""
@echo "The tracking repo $(LLAMACPP_REPO) is now in a detached state with all patches applied."
@echo "Don't forget to commit any changes you make and run 'make create-patches' "
$(LLAMACPP_REPO):
@echo "Cloning llama.cpp to $(LLAMACPP_REPO)"
git clone https://github.com/ggerganov/llama.cpp.git $@
create-patches: $(LLAMACPP_REPO)
@if ! git -C $(LLAMACPP_REPO) --no-pager diff --exit-code ; then \
echo "ERROR: Your llama.cpp repo is dirty. You must commit any pending changes for format-patch to generate patches"; \
exit 1; \
fi
@cd $(LLAMACPP_REPO) && git format-patch --no-signature --no-numbered --zero-commit -o $(VENDOR_RELATIVE_PATCH_DIR) $(LLAMACPP_BASE_COMMIT)
# Vendoring template logic
EXCLUDED_FILES=sgemm.cpp sgemm.h sampling_ext.cpp sampling_ext.h stb_image.h json.hpp llama_darwin.c base64.hpp
OLLAMA_NATIVE_FILES=mllama.cpp mllama.h llama_darwin.c sampling_ext.cpp sampling_ext.h
define vendor_file
$(strip $(addprefix $(2),$(notdir $1))) : $(addprefix $(LLAMACPP_REPO),$(1))
ifneq ($$(filter-out $(EXCLUDED_FILES),$(notdir $1)),)
@echo "vendoring $1"; \
mkdir -p $$(dir $$@) && \
echo "/**" > $$@ && \
echo " * llama.cpp - commit $$(LLAMACPP_BASE_COMMIT) - do not edit this file" >> $$@ && \
echo " *" >> $$@ && \
sed 's/^/ * /' <$(LLAMACPP_REPO)/LICENSE | sed 's/ *$$$$//' >> $$@ && \
echo " */" >> $$@ && \
echo "" >> $$@ && \
cat $$< >> $$@
else
@echo "vendoring $1"; \
mkdir -p $$(dir $$@) && \
cat $$< > $$@
endif
VENDORED_FILES += $(strip $(addprefix $(2),$(notdir $1)))
endef
# llama.cpp files -> llama/
LLAMACPP_FILES=\
src/unicode.cpp \
src/unicode.h \
src/unicode-data.cpp \
src/unicode-data.h \
src/llama.cpp \
src/llama-adapter.cpp \
src/llama-adapter.h \
src/llama-arch.cpp \
src/llama-arch.h \
src/llama-batch.cpp \
src/llama-batch.h \
src/llama-chat.cpp \
src/llama-chat.h \
src/llama-context.cpp \
src/llama-context.h \
src/llama-cparams.cpp \
src/llama-cparams.h \
src/llama-grammar.cpp \
src/llama-grammar.h \
src/llama-hparams.cpp \
src/llama-hparams.h \
src/llama-impl.cpp \
src/llama-impl.h \
src/llama-kv-cache.cpp \
src/llama-kv-cache.h \
src/llama-mmap.cpp \
src/llama-mmap.h \
src/llama-model-loader.cpp \
src/llama-model-loader.h \
src/llama-model.cpp \
src/llama-model.h \
src/llama-quant.cpp \
src/llama-quant.h \
src/llama-sampling.cpp \
src/llama-sampling.h \
src/llama-vocab.cpp \
src/llama-vocab.h \
include/llama.h \
include/llama-cpp.h \
ggml/include/ggml-cpu.h \
ggml/src/ggml-cpu/llamafile/sgemm.cpp \
ggml/src/ggml-cpu/llamafile/sgemm.h
$(foreach name,$(LLAMACPP_FILES),$(eval $(call vendor_file,$(name),$(DEST_DIR))))
# llama.cpp files -> llama/llamafile
LLAMAFILE_FILES= \
ggml/src/ggml-cpu/llamafile/sgemm.h
$(foreach name,$(LLAMAFILE_FILES),$(eval $(call vendor_file,$(name),$(DEST_DIR)llamafile/)))
# ggml files -> llama/
GGML_FILES= \
ggml/src/ggml.c \
ggml/include/ggml.h \
ggml/src/ggml-quants.c \
ggml/src/ggml-quants.h \
ggml/src/ggml-metal/ggml-metal.metal \
ggml/include/ggml-metal.h \
ggml/src/ggml-impl.h \
ggml/src/ggml-threading.h \
ggml/include/ggml-cuda.h \
ggml/src/ggml-backend-reg.cpp \
ggml/src/ggml-metal/ggml-metal-impl.h \
ggml/src/ggml-common.h \
ggml/include/ggml-backend.h \
ggml/src/ggml-backend.cpp \
ggml/src/ggml-backend-impl.h \
ggml/include/ggml-alloc.h \
ggml/src/ggml-alloc.c \
ggml/include/ggml-blas.h \
ggml/include/ggml-cpp.h \
ggml/src/ggml-threading.cpp \
ggml/src/ggml-blas/ggml-blas.cpp \
ggml/src/ggml-cpu/ggml-cpu.c \
ggml/src/ggml-cpu/ggml-cpu.cpp \
ggml/src/ggml-cpu/ggml-cpu-aarch64.h \
ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp \
ggml/src/ggml-cpu/ggml-cpu-quants.h \
ggml/src/ggml-cpu/ggml-cpu-quants.c \
ggml/src/ggml-cpu/ggml-cpu-impl.h \
ggml/src/ggml-cpu/ggml-cpu-traits.h \
ggml/src/ggml-cpu/ggml-cpu-traits.cpp \
ggml/src/ggml-cpu/amx/amx.h \
ggml/src/ggml-cpu/amx/amx.cpp \
ggml/src/ggml-cpu/amx/mmq.cpp \
ggml/src/ggml-cpu/amx/mmq.h
$(foreach name,$(GGML_FILES),$(eval $(call vendor_file,$(name),$(DEST_DIR))))
$(DEST_DIR)ggml-metal-embed.metal: $(DEST_DIR)ggml-common.h $(DEST_DIR)ggml-metal-impl.h
@sed -e '/__embed_ggml-common.h__/r $(DEST_DIR)/ggml-common.h' \
-e '/__embed_ggml-common.h__/d' \
< $(DEST_DIR)/ggml-metal.metal \
> $(DEST_DIR)/ggml-metal-embed.metal.tmp
@sed -e '/#include "ggml-metal-impl.h"/r $(DEST_DIR)/ggml-metal-impl.h' \
-e '/#include "ggml-metal-impl.h"/d' \
< $(DEST_DIR)/ggml-metal-embed.metal.tmp \
> $(DEST_DIR)/ggml-metal-embed.metal
@rm $(DEST_DIR)/ggml-metal-embed.metal.tmp
VENDORED_FILES += $(DEST_DIR)ggml-metal-embed.metal
# TODO generalize renaming pattern if we have more of these
$(DEST_DIR)ggml-metal_darwin_arm64.m : $(LLAMACPP_REPO)ggml/src/ggml-metal/ggml-metal.m
@echo "vendoring $(subst $(LLAMACPP_REPO),,$<)"; \
mkdir -p $(dir $@) && \
echo "/**" > $@ && \
echo " * llama.cpp - commit $(LLAMACPP_BASE_COMMIT) - do not edit this file" >> $@ && \
echo " *" >> $@ && \
sed 's/^/ * /' <$(LLAMACPP_REPO)/LICENSE | sed 's/ *$$//' >> $@ && \
echo " */" >> $@ && \
echo "" >> $@ && \
cat $< >> $@
VENDORED_FILES += $(DEST_DIR)ggml-metal_darwin_arm64.m
# ggml-cuda -> llama/ggml-cuda/
GGML_CUDA_FILES= ggml/src/ggml-cuda/*.cu ggml/src/ggml-cuda/*.cuh
GGML_CUDA_FILES_EXPANDED = $(addprefix ggml/src/ggml-cuda/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_CUDA_FILES)))))
$(foreach name,$(GGML_CUDA_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DEST_DIR)ggml-cuda/)))
GGML_TEMPLATE_FILES= ggml/src/ggml-cuda/template-instances/*.cu
GGML_TEMPLATE_FILES_EXPANDED = $(addprefix ggml/src/ggml-cuda/template-instances/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_TEMPLATE_FILES)))))
$(foreach name,$(GGML_TEMPLATE_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DEST_DIR)ggml-cuda/template-instances/)))
GGML_VENDOR_FILES= ggml/src/ggml-cuda/vendors/*.h
GGML_VENDOR_FILES_EXPANDED=$(addprefix ggml/src/ggml-cuda/vendors/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_VENDOR_FILES)))))
$(foreach name,$(GGML_VENDOR_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DEST_DIR)ggml-cuda/vendors/)))
# llava -> llama/
LAVA_FILES= \
examples/llava/clip.cpp \
examples/llava/clip.h \
examples/llava/llava.cpp \
examples/llava/llava.h \
common/log.h \
common/log.cpp \
common/stb_image.h
# These files are mostly used by the llava code
# and shouldn't be necessary once we use clip.cpp directly
LAVA_FILES+= \
common/common.cpp \
common/common.h \
common/sampling.cpp \
common/sampling.h \
common/json.hpp \
common/json-schema-to-grammar.cpp \
common/json-schema-to-grammar.h \
common/base64.hpp
$(foreach name,$(LAVA_FILES),$(eval $(call vendor_file,$(name),$(DEST_DIR))))
$(DEST_DIR)build-info.cpp:
@echo "Generating $@"
@echo "int LLAMA_BUILD_NUMBER = 0;" > $@
@echo "char const *LLAMA_COMMIT = \"$(LLAMACPP_BASE_COMMIT)\";" >> $@
@echo "char const *LLAMA_COMPILER = \"\";" >> $@
@echo "char const *LLAMA_BUILD_TARGET = \"\";" >> $@
VENDORED_FILES += $(DEST_DIR)build-info.cpp
sync: $(LLAMACPP_REPO) .WAIT $(VENDORED_FILES) .WAIT remove-stale-files
sync-clean:
rm -f $(VENDORED_FILES) $(EXTRA_NATIVE_FILES)
PATS=*.c *.h *.cpp *.m *.metal *.cu *.cuh
NATIVE_DIRS=$(DEST_DIR) $(DEST_DIR)llamafile/ $(DEST_DIR)ggml-cuda/ $(DEST_DIR)ggml-cuda/template-instances/ $(DEST_DIR)ggml-cuda/vendors/
ALL_NATIVE_FILES=$(foreach dir,$(NATIVE_DIRS),$(wildcard $(addprefix $(dir),$(PATS))))
EXTRA_NATIVE_FILES=$(filter-out $(VENDORED_FILES) $(addprefix $(DEST_DIR),$(OLLAMA_NATIVE_FILES)), $(ALL_NATIVE_FILES))
remove-stale-files:
@rm -f $(EXTRA_NATIVE_FILES)
.PHONY: help-sync apply-patches sync create-patches remove-stale-files .WAIT
# Handy debugging for make variables
print-%:
@echo '$*=$($*)'
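The vendor_file template above is the heart of the sync flow: every vendored file is copied with a comment header recording the base commit and embedding the upstream LICENSE, unless it is listed in EXCLUDED_FILES. A minimal Go sketch of that transformation, with placeholder paths and commit:

package main

import (
	"fmt"
	"os"
	"strings"
)

// vendorFile reproduces the effect of the vendor_file template above:
// copy src to dst with a comment header recording the base commit and
// embedding the upstream LICENSE, so vendored files are self-describing.
func vendorFile(src, dst, license, baseCommit string) error {
	body, err := os.ReadFile(src)
	if err != nil {
		return err
	}
	lic, err := os.ReadFile(license)
	if err != nil {
		return err
	}

	var b strings.Builder
	b.WriteString("/**\n")
	fmt.Fprintf(&b, " * llama.cpp - commit %s - do not edit this file\n *\n", baseCommit)
	for _, line := range strings.Split(strings.TrimRight(string(lic), "\n"), "\n") {
		b.WriteString(strings.TrimRight(" * "+line, " ") + "\n") // strip trailing spaces like the sed above
	}
	b.WriteString(" */\n\n")
	b.Write(body)

	return os.WriteFile(dst, []byte(b.String()), 0o644)
}

func main() {
	// All arguments are placeholders for illustration.
	if err := vendorFile("llama/vendor/src/llama.cpp", "llama/llama.cpp", "llama/vendor/LICENSE", "deadbeef"); err != nil {
		fmt.Println(err)
	}
}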
# Targets to assist in running tests
include make/common-defs.make
test:
cd .. && go test ./...
integration: $(OLLAMA_EXE)
cd .. && go test --tags=integration ./integration -v
lint:
cd .. && golangci-lint run -v
# Note: in this makefile we error instead of building to allow more fine-grain control of testing flows
$(OLLAMA_EXE):
@echo ""
@echo "ERROR: You must build ollama first - use 'make all' to build the ollama binaries"
@echo ""
@exit 1
\ No newline at end of file
# Common definitions for the various Makefiles
# No rules are defined here so this is safe to include at the beginning of other makefiles
OS := $(shell uname -s)
ARCH ?= $(subst aarch64,arm64,$(subst x86_64,amd64,$(shell uname -m)))
ifneq (,$(findstring MINGW,$(OS))$(findstring MSYS,$(OS)))
OS := windows
ARCH := $(shell systeminfo 2>/dev/null | grep "System Type" | grep ARM64 > /dev/null && echo "arm64" || echo "amd64" )
else ifeq ($(OS),Linux)
OS := linux
else ifeq ($(OS),Darwin)
OS := darwin
endif
comma:= ,
empty:=
space:= $(empty) $(empty)
uc = $(subst a,A,$(subst b,B,$(subst c,C,$(subst d,D,$(subst e,E,$(subst f,F,$(subst g,G,$(subst h,H,$(subst i,I,$(subst j,J,$(subst k,K,$(subst l,L,$(subst m,M,$(subst n,N,$(subst o,O,$(subst p,P,$(subst q,Q,$(subst r,R,$(subst s,S,$(subst t,T,$(subst u,U,$(subst v,V,$(subst w,W,$(subst x,X,$(subst y,Y,$(subst z,Z,$1))))))))))))))))))))))))))
export CGO_CFLAGS_ALLOW = -mfma|-mf16c
export CGO_CXXFLAGS_ALLOW = -mfma|-mf16c
export HIP_PLATFORM = amd
export CGO_ENABLED=1
BUILD_DIR = ./llama/build/$(OS)-$(ARCH)
DIST_BASE = ./dist/$(OS)-$(ARCH)
ifeq ($(OS),windows)
# Absolute paths with cygpath to convert to 8.3 without spaces
PWD="$(shell pwd)"
DIST_OLLAMA_EXE=$(DIST_BASE)/ollama$(EXE_EXT)
else
CCACHE:=$(shell command -v ccache 2>/dev/null || echo "")
DIST_OLLAMA_EXE=$(DIST_BASE)/bin/ollama$(EXE_EXT)
endif
DIST_LIB_DIR = $(DIST_BASE)/lib/ollama
RUNNERS_DIST_DIR = $(DIST_LIB_DIR)/runners
RUNNERS_BUILD_DIR = $(BUILD_DIR)/runners
VERSION?=$(shell git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")
# Conditionally enable ccache for cgo builds too
ifneq ($(CCACHE),)
CC?=$(CCACHE) gcc
CXX?=$(CCACHE) g++
export CC
export CXX
endif
# Override in environment to tune CPU vector flags
ifeq ($(ARCH),amd64)
ifeq ($(origin CUSTOM_CPU_FLAGS),undefined)
GPU_RUNNER_CPU_FLAGS=avx
GPU_RUNNER_EXTRA_VARIANT=_avx
else
GPU_RUNNER_CPU_FLAGS=$(subst $(comma),$(space),$(CUSTOM_CPU_FLAGS))
endif
endif
ifeq ($(OS),windows)
CP := cp
OBJ_EXT := obj
SHARED_EXT := dll
EXE_EXT := .exe
SHARED_PREFIX :=
CPU_FLAG_PREFIX := /arch:
ifneq ($(HIP_PATH),)
# If HIP_PATH has spaces, hipcc trips over them when subprocessing
HIP_PATH := $(shell cygpath -m -s "$(patsubst %\,%,$(HIP_PATH))")
export HIP_PATH
endif
else ifeq ($(OS),linux)
CP := cp -df
OBJ_EXT := o
SHARED_EXT := so
SHARED_PREFIX := lib
CPU_FLAG_PREFIX := -m
else
OBJ_EXT := o
SHARED_EXT := so
CPU_FLAG_PREFIX := -m
CP := cp -df
endif
COMMON_SRCS := \
$(wildcard ./llama/*.c) \
$(wildcard ./llama/*.cpp)
COMMON_HDRS := \
$(wildcard ./llama/*.h) \
$(wildcard ./llama/*.hpp)
OLLAMA_EXE=./ollama$(EXE_EXT)
\ No newline at end of file
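The VERSION line above derives the version string from git describe and strips the leading v. The same derivation in Go, as a small sketch (the flags are copied verbatim from the Makefile line):

package main

import (
	"fmt"
	"os/exec"
	"strings"
)

func main() {
	// Mirrors: git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g"
	out, err := exec.Command("git", "describe", "--tags", "--first-parent",
		"--abbrev=7", "--long", "--dirty", "--always").Output()
	if err != nil {
		fmt.Println("git describe failed:", err)
		return
	}
	version := strings.TrimPrefix(strings.TrimSpace(string(out)), "v")
	fmt.Println(version)
}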
# Common definitions for the various Makefiles which set cuda settings
# No rules are defined here so this is safe to include at the beginning of other makefiles
ifeq ($(OS),windows)
CUDA_PATH?=$(shell cygpath -m -s "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\" 2>/dev/null)unknown
CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\\.." 2>/dev/null))
CUDA_11_PATH:=$(shell ls -d $(CUDA_BASE_DIR)/v11.? 2>/dev/null)
CUDA_11_COMPILER:=$(wildcard $(CUDA_11_PATH)/bin/nvcc.exe)
CUDA_11_LIB_DIR = $(strip $(shell ls -d $(CUDA_11_PATH)/bin 2>/dev/null))
CUDA_11_CGO_EXTRA_LDFLAGS = -L"$(CUDA_11_PATH)/lib/x64"
else ifeq ($(OS),linux)
CUDA_PATH?=/usr/local/cuda
CUDA_11_PATH:=$(shell ls -d $(CUDA_PATH)-11 2>/dev/null)
CUDA_11_COMPILER:=$(wildcard $(CUDA_11_PATH)/bin/nvcc)
CUDA_11_LIB_DIR=$(strip $(shell ls -d $(CUDA_11_PATH)/lib64 2>/dev/null || ls -d $(CUDA_11_PATH)/lib 2>/dev/null))
CUDA_11_CGO_EXTRA_LDFLAGS = -L"$(CUDA_11_LIB_DIR)" -L"$(CUDA_11_LIB_DIR)/stubs"
endif
# Common definitions for the various Makefiles which set cuda settings
# No rules are defined here so this is safe to include at the beginning of other makefiles
ifeq ($(OS),windows)
CUDA_PATH?=$(shell cygpath -m -s "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\" 2>/dev/null)unknown
CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\\.." 2>/dev/null))
CUDA_12_PATH:=$(shell ls -d $(CUDA_BASE_DIR)/v12.? 2>/dev/null)
CUDA_12_COMPILER:=$(wildcard $(CUDA_12_PATH)/bin/nvcc.exe)
CUDA_12_LIB_DIR = $(strip $(shell ls -d $(CUDA_12_PATH)/bin 2>/dev/null))
CUDA_12_CGO_EXTRA_LDFLAGS = -L"$(CUDA_12_PATH)/lib/x64"
else ifeq ($(OS),linux)
CUDA_PATH?=/usr/local/cuda
CUDA_12_PATH:=$(shell ls -d $(CUDA_PATH)-12 2>/dev/null)
CUDA_12_COMPILER:=$(wildcard $(CUDA_12_PATH)/bin/nvcc)
CUDA_12_LIB_DIR=$(strip $(shell ls -d $(CUDA_12_PATH)/lib64 2>/dev/null || ls -d $(CUDA_12_PATH)/lib 2>/dev/null))
CUDA_12_CGO_EXTRA_LDFLAGS = -L"$(CUDA_12_LIB_DIR)" -L"$(CUDA_12_LIB_DIR)/stubs"
endif
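Both defs files follow one probe pattern: guess the versioned toolkit directory, then confirm nvcc exists inside it, leaving the variables empty when no toolkit is present so the corresponding runner simply isn't built. A rough Go rendering of the Linux branch (the base path is the conventional /usr/local/cuda):

package main

import (
	"fmt"
	"os"
	"path/filepath"
)

// findCUDA mirrors the CUDA_12_PATH / CUDA_12_COMPILER probing above:
// look for a versioned toolkit directory next to the base path and check
// that the nvcc binary actually exists inside it.
func findCUDA(base string, major int) (dir, nvcc string, ok bool) {
	matches, _ := filepath.Glob(fmt.Sprintf("%s-%d*", base, major))
	for _, m := range matches {
		compiler := filepath.Join(m, "bin", "nvcc")
		if _, err := os.Stat(compiler); err == nil {
			return m, compiler, true
		}
	}
	return "", "", false
}

func main() {
	if dir, nvcc, ok := findCUDA("/usr/local/cuda", 12); ok {
		fmt.Println("toolkit:", dir, "compiler:", nvcc)
	} else {
		fmt.Println("no CUDA 12 toolkit found")
	}
}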
# Common definitions for all cuda versions
ifndef GPU_RUNNER_VARIANT
dummy:
$(error This makefile is not meant to build directly, but instead included in other Makefiles that set required variables)
endif
GPU_RUNNER_NAME := cuda$(GPU_RUNNER_VARIANT)
GPU_RUNNER_GO_TAGS := cuda cuda$(GPU_RUNNER_VARIANT)
GPU_RUNNER_DRIVER_LIB_LINK := -lcuda
GPU_RUNNER_LIBS_SHORT := cublas cudart cublasLt
ifeq ($(OS),windows)
# On windows, nvcc uses msvc which does not support avx512vbmi avx512vnni avx512bf16, but macros can turn them on
GPU_VECTOR_FLAGS=$(call uc,$(filter-out avx512bf16,$(filter-out avx512vnni,$(filter-out avx512vbmi,$(GPU_RUNNER_CPU_FLAGS)))))
GPU_COMPILER_EXTRA_FLAGS=$(if $(filter avx512vbmi,$(GPU_RUNNER_CPU_FLAGS)),-D__AVX512VBMI__)
GPU_COMPILER_EXTRA_FLAGS+=$(if $(filter avx512vnni,$(GPU_RUNNER_CPU_FLAGS)),-D__AVX512VNNI__)
GPU_COMPILER_EXTRA_FLAGS+=$(if $(filter avx512bf16,$(GPU_RUNNER_CPU_FLAGS)),-D__AVX512BF16__)
GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT),$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
GPU_COMPILER_CFLAGS = $(CFLAGS) -D_WIN32_WINNT=0x602
GPU_COMPILER_CXXFLAGS = $(CXXFLAGS) -D_WIN32_WINNT=0x602
else ifeq ($(OS),linux)
# On linux, nvcc requires avx512 -> -mavx512f -mavx512dq -mavx512bw
GPU_VECTOR_FLAGS=$(if $(filter avx512,$(GPU_RUNNER_CPU_FLAGS)),avx512f avx512dq avx512bw) $(filter-out avx512,$(GPU_RUNNER_CPU_FLAGS))
GPU_COMPILER_EXTRA_FLAGS = -fPIC -Wno-unused-function -std=c++17
GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
GPU_COMPILER_CFLAGS = $(CFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
GPU_COMPILER_CXXFLAGS = $(CXXFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
endif
GPU_DIST_LIB_DEPS= $(sort $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_LIBS))))
GPU_RUNNER_ARCH_FLAGS := $(foreach arch,$(subst ;,$(space),$(CUDA_ARCHITECTURES)),--generate-code=arch=compute_$(arch)$(comma)code=[compute_$(arch)$(comma)sm_$(arch)]) \
-DGGML_CUDA_USE_GRAPHS=1
GPU_COMPILER_CUFLAGS = \
$(GPU_COMPILER_EXTRA_FLAGS) \
-Xcompiler "$(addprefix $(CPU_FLAG_PREFIX),$(GPU_VECTOR_FLAGS))" \
-t2 \
-DGGML_CUDA_DMMV_X=32 \
-DGGML_CUDA_MMV_Y=1 \
-DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \
-DGGML_USE_CUDA=1 \
-DGGML_SHARED=1 \
-DGGML_BACKEND_SHARED=1 \
-DGGML_BUILD=1 \
-DGGML_BACKEND_BUILD=1 \
-DGGML_USE_LLAMAFILE \
-DK_QUANTS_PER_ITERATION=2 \
-DNDEBUG \
-D_GNU_SOURCE \
-D_XOPEN_SOURCE=600 \
-Wno-deprecated-gpu-targets \
--forward-unknown-to-host-compiler \
-use_fast_math \
-I./llama/ \
-O3
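The GPU_RUNNER_ARCH_FLAGS expansion above turns the semicolon-separated CUDA_ARCHITECTURES list into one --generate-code flag per architecture. The Go snippet below makes that $(foreach ...) concrete; it is a sketch of the flag shape, not project code:

package main

import (
	"fmt"
	"strings"
)

// archFlags expands a semicolon-separated CUDA_ARCHITECTURES list into the
// nvcc flags produced by the $(foreach ...) above, e.g.
// "86" -> "--generate-code=arch=compute_86,code=[compute_86,sm_86]".
func archFlags(architectures string) []string {
	var flags []string
	for _, arch := range strings.Split(architectures, ";") {
		flags = append(flags,
			fmt.Sprintf("--generate-code=arch=compute_%s,code=[compute_%s,sm_%s]", arch, arch, arch))
	}
	return flags
}

func main() {
	fmt.Println(strings.Join(archFlags("60;61;70;75;80;86"), " "))
}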
# Generalized GPU runner build
ifndef GPU_RUNNER_NAME
dummy:
$(error This makefile is not meant to build directly, but instead included in other Makefiles that set required variables)
endif
GPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERSION)\" $(EXTRA_GOLDFLAGS) $(TARGET_LDFLAGS)"
# TODO Unify how we handle dependencies in the dist/packaging and install flow
# today, cuda is bundled, but rocm is split out. Should split them each out by runner
DIST_GPU_RUNNER_DEPS_DIR = $(DIST_LIB_DIR)
GPU_RUNNER_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT))))
GPU_RUNNER_SRCS := \
$(filter-out $(wildcard llama/ggml-cuda/fattn*.cu),$(wildcard llama/ggml-cuda/*.cu)) \
$(wildcard llama/ggml-cuda/template-instances/mmq*.cu) \
llama/ggml.c llama/ggml-backend.cpp llama/ggml-alloc.c llama/ggml-quants.c llama/sgemm.cpp llama/ggml-threading.cpp
GPU_RUNNER_HDRS := \
$(wildcard llama/ggml-cuda/*.cuh)
# Conditional flags and components to speed up developer builds
ifneq ($(OLLAMA_FAST_BUILD),)
GPU_COMPILER_CUFLAGS += \
-DGGML_DISABLE_FLASH_ATTN
else
GPU_RUNNER_SRCS += \
$(wildcard llama/ggml-cuda/fattn*.cu) \
$(wildcard llama/ggml-cuda/template-instances/fattn-wmma*.cu) \
$(wildcard llama/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu) \
$(wildcard llama/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu) \
$(wildcard llama/ggml-cuda/template-instances/fattn-vec*f16-f16.cu)
endif
GPU_RUNNER_OBJS := $(GPU_RUNNER_SRCS:.cu=.$(GPU_RUNNER_NAME).$(OBJ_EXT))
GPU_RUNNER_OBJS := $(GPU_RUNNER_OBJS:.c=.$(GPU_RUNNER_NAME).$(OBJ_EXT))
GPU_RUNNER_OBJS := $(addprefix $(BUILD_DIR)/,$(GPU_RUNNER_OBJS:.cpp=.$(GPU_RUNNER_NAME).$(OBJ_EXT)))
DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)))
BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)))
$(GPU_RUNNER_NAME): $(BUILD_RUNNERS)
dist: $(DIST_RUNNERS)
# Build targets
$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cu
@-mkdir -p $(dir $@)
$(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CFLAGS) $(GPU_COMPILER_CUFLAGS) $(GPU_RUNNER_ARCH_FLAGS) -o $@ $<
$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.c
@-mkdir -p $(dir $@)
$(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CFLAGS) -o $@ $<
$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cpp
@-mkdir -p $(dir $@)
$(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CXXFLAGS) -o $@ $<
$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): TARGET_CGO_LDFLAGS = $(CGO_EXTRA_LDFLAGS) -L"$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/"
$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) ./llama/*.go ./llama/runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
@-mkdir -p $(dir $@)
GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./cmd/runner
$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS)
@-mkdir -p $(dir $@)
$(CCACHE) $(GPU_COMPILER) --shared -L$(GPU_LIB_DIR) $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@
# Distribution targets
$(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
@-mkdir -p $(dir $@)
$(CP) $< $@
$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) $(GPU_DIST_LIB_DEPS)
$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT)
@-mkdir -p $(dir $@)
$(CP) $< $@
$(GPU_DIST_LIB_DEPS):
@-mkdir -p $(dir $@)
$(CP) $(GPU_LIB_DIR)/$(notdir $@) $(dir $@)
clean:
rm -f $(GPU_RUNNER_OBJS) $(BUILD_RUNNERS) $(DIST_RUNNERS)
.PHONY: clean $(GPU_RUNNER_NAME)
# Handy debugging for make variables
print-%:
@echo '$*=$($*)'