Unverified Commit 4879a234 authored by Daniel Hiltgen's avatar Daniel Hiltgen Committed by GitHub
Browse files

build: Make target improvements (#7499)

* llama: wire up builtin runner

This adds a new entrypoint into the ollama CLI to run the cgo built runner.
On Mac arm64, this will have GPU support, but on all other platforms it will
be the lowest common denominator CPU build.  After we fully transition
to the new Go runners more tech-debt can be removed and we can stop building
the "default" runner via make and rely on the builtin always.

* build: Make target improvements

Add a few new targets and help for building locally.
This also adjusts the runner lookup to favor local builds, then
runners relative to the executable, and finally payloads.

* Support customized CPU flags for runners

This implements a simplified custom CPU flags pattern for the runners.
When built without overrides, the runner name contains the vector flag
we check for (AVX) to ensure we don't try to run on unsupported systems
and crash.  If the user builds a customized set, we omit the naming
scheme and don't check for compatibility.  This avoids checking
requirements at runtime, so that logic has been removed as well.  This
can be used to build GPU runners with no vector flags, or CPU/GPU
runners with additional flags (e.g. AVX512) enabled.

* Use relative paths

If the user checks out the repo in a path that contains spaces, make gets
really confused so use relative paths for everything in-repo to avoid breakage.

* Remove payloads from main binary

* install: clean up prior libraries

This removes support for v0.3.6 and older versions (before the tar bundle)
and ensures we clean up prior libraries before extracting the bundle(s).
Without this change, runners and dependent libraries could leak when we
update and lead to subtle runtime errors.
parent 63269668
...@@ -4,22 +4,25 @@ ...@@ -4,22 +4,25 @@
# unlike CUDA where we'll build both a v11 and v12 variant. # unlike CUDA where we'll build both a v11 and v12 variant.
include make/common-defs.make include make/common-defs.make
include make/rocm-defs.make
HIP_ARCHS_COMMON := gfx900 gfx940 gfx941 gfx942 gfx1010 gfx1012 gfx1030 gfx1100 gfx1101 gfx1102 HIP_ARCHS_COMMON := gfx900 gfx940 gfx941 gfx942 gfx1010 gfx1012 gfx1030 gfx1100 gfx1101 gfx1102
HIP_ARCHS_LINUX := gfx906:xnack- gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- HIP_ARCHS_LINUX := gfx906:xnack- gfx908:xnack- gfx90a:xnack+ gfx90a:xnack-
ifeq ($(OS),windows) ifeq ($(OS),windows)
GPU_LIB_DIR_WIN := $(shell cygpath -m -s "$(HIP_PATH)/bin") GPU_LIB_DIR := $(shell cygpath -m -s "$(HIP_PATH)/bin")
CGO_EXTRA_LDFLAGS_WIN := -L$(shell cygpath -m -s "$(HIP_PATH)/lib") CGO_EXTRA_LDFLAGS := -L$(shell cygpath -m -s "$(HIP_PATH)/lib")
GPU_COMPILER_WIN := $(HIP_PATH)/bin/hipcc.bin.exe HIP_ARCHS?=$(HIP_ARCHS_COMMON)
GPU_COMPILER:=$(GPU_COMPILER_WIN) GPU_COMPILER_CFLAGS = $(CFLAGS) -D_WIN32_WINNT=0x602
GPU_COMPILER_CXXFLAGS = $(CXXFLAGS) -D_WIN32_WINNT=0x602
else ifeq ($(OS),linux) else ifeq ($(OS),linux)
GPU_LIB_DIR_LINUX := $(HIP_PATH)/lib GPU_LIB_DIR := $(strip $(shell ls -d $(HIP_PATH)/lib64 2>/dev/null || ls -d $(HIP_PATH)/lib 2>/dev/null))
GPU_COMPILER_LINUX := $(shell X=$$(which hipcc 2>/dev/null) && echo $$X) CGO_EXTRA_LDFLAGS := -L$(GPU_LIB_DIR)
GPU_COMPILER:=$(GPU_COMPILER_LINUX) HIP_ARCHS?=$(HIP_ARCHS_COMMON) $(HIP_ARCHS_LINUX)
ROCM_TRANSITIVE_LIBS_INITIAL = $(sort $(shell ldd $(GPU_LIBS) | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf)) GPU_COMPILER_CFLAGS = $(CFLAGS) -fPIC -D_GNU_SOURCE
GPU_TRANSITIVE_LIBS = $(sort $(shell readlink -f $(ROCM_TRANSITIVE_LIBS_INITIAL)) $(ROCM_TRANSITIVE_LIBS_INITIAL)) GPU_COMPILER_CXXFLAGS = $(CXXFLAGS) -fPIC -D_GNU_SOURCE
endif endif
GPU_COMPILER=$(HIP_COMPILER)
# TODO future multi-variant support for ROCm # TODO future multi-variant support for ROCm
# ROCM_VERSION = $(subst $(space),.,$(wordlist 1,2,$(subst .,$(space),$(word 3,$(subst -,$(space),$(filter HIP version: %,$(shell $(GPU_COMPILER) --version))))))) # ROCM_VERSION = $(subst $(space),.,$(wordlist 1,2,$(subst .,$(space),$(word 3,$(subst -,$(space),$(filter HIP version: %,$(shell $(GPU_COMPILER) --version)))))))
...@@ -31,36 +34,37 @@ GPU_RUNNER_GO_TAGS := rocm ...@@ -31,36 +34,37 @@ GPU_RUNNER_GO_TAGS := rocm
GPU_RUNNER_NAME := rocm$(GPU_RUNNER_VARIANT) GPU_RUNNER_NAME := rocm$(GPU_RUNNER_VARIANT)
GPU_RUNNER_DRIVER_LIB_LINK := -lamdhip64 GPU_RUNNER_DRIVER_LIB_LINK := -lamdhip64
GPU_RUNNER_LIBS_SHORT := hipblas rocblas GPU_RUNNER_LIBS_SHORT := hipblas rocblas
GPU_PATH_ROOT_WIN=$(dir $(GPU_LIB_DIR_WIN))
GPU_PATH_ROOT_LINUX=$(dir $(GPU_LIB_DIR_LINUX))
GPU_COMPILER_CFLAGS_WIN = $(CFLAGS) -D_WIN32_WINNT=0x602
GPU_COMPILER_CFLAGS_LINUX = $(CFLAGS) -fPIC -D_GNU_SOURCE
GPU_COMPILER_CXXFLAGS_WIN = $(CXXFLAGS) -D_WIN32_WINNT=0x602
GPU_COMPILER_CXXFLAGS_LINUX = $(CXXFLAGS) -fPIC -D_GNU_SOURCE
GPU_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT),$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))) # Note: ROCm requires an extra step of discovering and copying the transitive dependencies on linux
ifeq ($(OS),windows) ifeq ($(OS),windows)
ROCM_DIST_DEPS_DIR = $(abspath $(SRC_DIR)/../dist/$(OS)-$(ARCH))/lib/ollama ROCM_DIST_DEPS_DIR = ./dist/$(OS)-$(ARCH)/lib/ollama
GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT),$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
else ifeq ($(OS),linux) else ifeq ($(OS),linux)
ROCM_DIST_DEPS_DIR = $(abspath $(SRC_DIR)/../dist/$(OS)-$(ARCH)-rocm)/lib/ollama ROCM_DIST_DEPS_DIR = ./dist/$(OS)-$(ARCH)-rocm/lib/ollama
GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
ROCM_TRANSITIVE_LIBS_INITIAL = $(sort $(shell ldd $(GPU_LIBS) | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf))
GPU_TRANSITIVE_LIBS = $(sort $(shell readlink -f $(ROCM_TRANSITIVE_LIBS_INITIAL)) $(ROCM_TRANSITIVE_LIBS_INITIAL))
FILTERED_GPU_TRANSITIVE_LIBS=$(sort $(filter-out $(addprefix %,$(notdir $(GPU_LIBS))), $(GPU_TRANSITIVE_LIBS)))
GPU_DIST_TRANSITIVE_LIB_DEPS = $(sort $(addprefix $(ROCM_DIST_DEPS_DIR)/,$(notdir $(FILTERED_GPU_TRANSITIVE_LIBS))))
endif endif
GPU_DIST_DEPS_LIBS= $(sort $(addprefix $(ROCM_DIST_DEPS_DIR)/,$(notdir $(GPU_LIBS)) $(notdir $(GPU_TRANSITIVE_LIBS)))) GPU_DIST_LIB_DEPS= $(sort $(addprefix $(ROCM_DIST_DEPS_DIR)/,$(notdir $(GPU_LIBS))))
ROCBLAS_DIST_DEP_MANIFEST = $(ROCM_DIST_DEPS_DIR)/rocblas/library/TensileManifest.txt ROCBLAS_DIST_DEP_MANIFEST = $(ROCM_DIST_DEPS_DIR)/rocblas/library/TensileManifest.txt
ifeq ($(OS),linux) ifeq ($(OS),linux)
GPU_COMPILER_FPIC := -fPIC -Wno-unused-function -std=gnu++11 GPU_COMPILER_FPIC := -fPIC -Wno-unused-function -std=gnu++11
GPU_RUNNER_ARCH_FLAGS := $(foreach arch, $(HIP_ARCHS_COMMON) $(HIP_ARCHS_LINUX), --offload-arch=$(arch))
else ifeq ($(OS),windows) else ifeq ($(OS),windows)
GPU_COMPILER_FPIC := -Xclang --dependent-lib=msvcrt GPU_COMPILER_FPIC := -Xclang --dependent-lib=msvcrt
GPU_RUNNER_ARCH_FLAGS := $(foreach arch, $(HIP_ARCHS_COMMON), --offload-arch=$(arch))
endif endif
GPU_RUNNER_ARCH_FLAGS := $(foreach arch,$(subst ;,$(space),$(HIP_ARCHS)),--offload-arch=$(arch))
# HIPCC uses clang which requires avx512 -> -mavx512f -mavx512dq -mavx512bw
GPU_VECTOR_FLAGS=$(if $(filter avx512,$(GPU_RUNNER_CPU_FLAGS)),avx512f avx512dq avx512bw) $(filter-out avx512,$(GPU_RUNNER_CPU_FLAGS))
GPU_COMPILER_CUFLAGS = \ GPU_COMPILER_CUFLAGS = \
$(GPU_COMPILER_FPIC) \ $(GPU_COMPILER_FPIC) \
$(addprefix -m,$(GPU_RUNNER_CPU_FLAGS)) \ $(addprefix -m,$(GPU_VECTOR_FLAGS)) \
-mf16c \ -mf16c \
-mfma \ -mfma \
-parallel-jobs=2 \
-c \ -c \
-O3 \ -O3 \
-DGGML_USE_CUDA \ -DGGML_USE_CUDA \
...@@ -90,7 +94,7 @@ GPU_COMPILER_CUFLAGS = \ ...@@ -90,7 +94,7 @@ GPU_COMPILER_CUFLAGS = \
-Wno-pass-failed \ -Wno-pass-failed \
-Wno-deprecated-declarations \ -Wno-deprecated-declarations \
-Wno-unused-result \ -Wno-unused-result \
-I. -I./llama/
# Workaround buggy P2P copy on some windows multi-GPU setups # Workaround buggy P2P copy on some windows multi-GPU setups
# This workaround breaks linux systems with small system RAM, so only enable on windows # This workaround breaks linux systems with small system RAM, so only enable on windows
...@@ -101,9 +105,13 @@ endif ...@@ -101,9 +105,13 @@ endif
include make/gpu.make include make/gpu.make
# Adjust the rules from gpu.make to handle the ROCm dependencies properly # Adjust the rules from gpu.make to handle the ROCm dependencies properly
$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(ROCBLAS_DIST_DEP_MANIFEST) $(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(ROCBLAS_DIST_DEP_MANIFEST) $(GPU_DIST_TRANSITIVE_LIB_DEPS)
$(ROCBLAS_DIST_DEP_MANIFEST): $(ROCBLAS_DIST_DEP_MANIFEST):
@-mkdir -p $(dir $@) @-mkdir -p $(dir $@)
@echo "Copying rocblas library..." @echo "Copying rocblas library..."
cd $(GPU_LIB_DIR)/rocblas/library/ && tar cf - . | (cd $(dir $@) && tar xf - ) (cd $(GPU_LIB_DIR)/rocblas/library/ && tar cf - . ) | (cd $(dir $@) && tar xf - )
@echo "rocblas library copy complete" @echo "rocblas library copy complete"
$(GPU_DIST_TRANSITIVE_LIB_DEPS):
@-mkdir -p $(dir $@)
$(CP) $(dir $(filter %$(notdir $@),$(GPU_TRANSITIVE_LIBS)))/$(notdir $@) $(dir $@)
# Helpers for managing our vendored llama.cpp repo and patch set # Helpers for managing our vendored llama.cpp repo and patch set
REPO_ROOT:=$(dir $(patsubst %/,%,$(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST)))))))) REPO_ROOT:=./
DST_DIR:=$(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST)))))) DEST_DIR:=./llama/
include $(REPO_ROOT)llama/vendoring include $(DEST_DIR)vendoring
LLAMACPP_REPO := $(REPO_ROOT)llama/vendor/ LLAMACPP_REPO := ./llama/vendor/
LLAMACPP_PATCH_DIR := $(DST_DIR)patches/ # Relative to the vendor dir
VENDOR_RELATIVE_PATCH_DIR := ../patches/
help-sync: help-sync:
@echo "The following make targets will help you update llama.cpp to a new base commit, or work on new features/fixes" @echo "The following make targets will help you update llama.cpp to a new base commit, or work on new features/fixes"
@echo "" @echo ""
@echo "\tmake apply-patches # Establish the tracking repo if not already present, reset to the base commit, and apply our patch set" @echo " make apply-patches # Establish the tracking repo if not already present, reset to the base commit, and apply our patch set"
@echo "\tmake sync # Vendor llama.cpp and ggml from the tracking repo working tree" @echo " make sync # Vendor llama.cpp and ggml from the tracking repo working tree"
@echo "\tmake create-patches # Generate the patch set based on the current commits in the tracking repo since the base commit" @echo " make sync-clean # Remove all vendored files"
@echo " make create-patches # Generate the patch set based on the current commits in the tracking repo since the base commit"
@echo "" @echo ""
@echo "For more details on the workflow, see the Vendoring section in ../docs/development.md" @echo "For more details on the workflow, see the Vendoring section in 'docs/development.md'"
apply-patches: $(LLAMACPP_REPO) apply-patches: $(LLAMACPP_REPO)
@if ! git -C $(LLAMACPP_REPO) --no-pager diff --exit-code ; then \ @if ! git -C $(LLAMACPP_REPO) --no-pager diff --exit-code ; then \
...@@ -29,7 +31,7 @@ apply-patches: $(LLAMACPP_REPO) ...@@ -29,7 +31,7 @@ apply-patches: $(LLAMACPP_REPO)
@git -C $(LLAMACPP_REPO) checkout -q $(LLAMACPP_BASE_COMMIT) || \ @git -C $(LLAMACPP_REPO) checkout -q $(LLAMACPP_BASE_COMMIT) || \
git -C $(LLAMACPP_REPO) fetch --all && git -C $(LLAMACPP_REPO) checkout -q $(LLAMACPP_BASE_COMMIT) git -C $(LLAMACPP_REPO) fetch --all && git -C $(LLAMACPP_REPO) checkout -q $(LLAMACPP_BASE_COMMIT)
@echo "Applying ollama patches..." @echo "Applying ollama patches..."
@git -c 'user.name=nobody' -c 'user.email=<>' -C $(LLAMACPP_REPO) am -3 $(LLAMACPP_PATCH_DIR)/*.patch || \ @cd $(LLAMACPP_REPO) && git -c 'user.name=nobody' -c 'user.email=<>' am -3 $(VENDOR_RELATIVE_PATCH_DIR)*.patch || \
echo "Please resolve the conflicts in $(LLAMACPP_REPO), and run 'git am --continue' to continue applying subsequent patches" echo "Please resolve the conflicts in $(LLAMACPP_REPO), and run 'git am --continue' to continue applying subsequent patches"
@echo "" @echo ""
@echo "The tracking repo $(LLAMACPP_REPO) is now in a detached state with all patches applied." @echo "The tracking repo $(LLAMACPP_REPO) is now in a detached state with all patches applied."
...@@ -44,7 +46,7 @@ create-patches: $(LLAMACPP_REPO) ...@@ -44,7 +46,7 @@ create-patches: $(LLAMACPP_REPO)
echo "ERROR: Your llama.cpp repo is dirty. You must commit any pending changes for format-patch to generate patches"; \ echo "ERROR: Your llama.cpp repo is dirty. You must commit any pending changes for format-patch to generate patches"; \
exit 1; \ exit 1; \
fi fi
git -C $(LLAMACPP_REPO) format-patch --no-signature --no-numbered --zero-commit -o $(LLAMACPP_PATCH_DIR) $(LLAMACPP_BASE_COMMIT) @cd $(LLAMACPP_REPO) && git format-patch --no-signature --no-numbered --zero-commit -o $(VENDOR_RELATIVE_PATCH_DIR) $(LLAMACPP_BASE_COMMIT)
# Vendoring template logic # Vendoring template logic
EXCLUDED_FILES=sgemm.cpp sgemm.h sampling_ext.cpp sampling_ext.h stb_image.h json.hpp llama_darwin.c base64.hpp EXCLUDED_FILES=sgemm.cpp sgemm.h sampling_ext.cpp sampling_ext.h stb_image.h json.hpp llama_darwin.c base64.hpp
...@@ -86,12 +88,12 @@ LLAMACPP_FILES=\ ...@@ -86,12 +88,12 @@ LLAMACPP_FILES=\
include/llama.h \ include/llama.h \
ggml/src/llamafile/sgemm.cpp \ ggml/src/llamafile/sgemm.cpp \
ggml/src/llamafile/sgemm.h ggml/src/llamafile/sgemm.h
$(foreach name,$(LLAMACPP_FILES),$(eval $(call vendor_file,$(name),$(DST_DIR)))) $(foreach name,$(LLAMACPP_FILES),$(eval $(call vendor_file,$(name),$(DEST_DIR))))
# llama.cpp files -> llama/llamafile # llama.cpp files -> llama/llamafile
LLAMAFILE_FILES= \ LLAMAFILE_FILES= \
ggml/src/llamafile/sgemm.h ggml/src/llamafile/sgemm.h
$(foreach name,$(LLAMAFILE_FILES),$(eval $(call vendor_file,$(name),$(DST_DIR)llamafile/))) $(foreach name,$(LLAMAFILE_FILES),$(eval $(call vendor_file,$(name),$(DEST_DIR)llamafile/)))
# ggml files -> llama/ # ggml files -> llama/
GGML_FILES= \ GGML_FILES= \
...@@ -115,10 +117,10 @@ GGML_FILES= \ ...@@ -115,10 +117,10 @@ GGML_FILES= \
ggml/src/ggml-cpu-impl.h \ ggml/src/ggml-cpu-impl.h \
ggml/include/ggml-blas.h \ ggml/include/ggml-blas.h \
ggml/src/ggml-blas.cpp ggml/src/ggml-blas.cpp
$(foreach name,$(GGML_FILES),$(eval $(call vendor_file,$(name),$(DST_DIR)))) $(foreach name,$(GGML_FILES),$(eval $(call vendor_file,$(name),$(DEST_DIR))))
# TODO generalize renaming pattern if we have more of these # TODO generalize renaming pattern if we have more of these
$(DST_DIR)ggml-metal_darwin_arm64.m : $(LLAMACPP_REPO)ggml/src/ggml-metal.m $(DEST_DIR)ggml-metal_darwin_arm64.m : $(LLAMACPP_REPO)ggml/src/ggml-metal.m
@echo "vendoring $(subst $(LLAMACPP_REPO),,$<)"; \ @echo "vendoring $(subst $(LLAMACPP_REPO),,$<)"; \
mkdir -p $(dir $@) && \ mkdir -p $(dir $@) && \
echo "/**" > $@ && \ echo "/**" > $@ && \
...@@ -128,20 +130,20 @@ $(DST_DIR)ggml-metal_darwin_arm64.m : $(LLAMACPP_REPO)ggml/src/ggml-metal.m ...@@ -128,20 +130,20 @@ $(DST_DIR)ggml-metal_darwin_arm64.m : $(LLAMACPP_REPO)ggml/src/ggml-metal.m
echo " */" >> $@ && \ echo " */" >> $@ && \
echo "" >> $@ && \ echo "" >> $@ && \
cat $< >> $@ cat $< >> $@
VENDORED_FILES += $(DST_DIR)ggml-metal_darwin_arm64.m VENDORED_FILES += $(DEST_DIR)ggml-metal_darwin_arm64.m
# ggml-cuda -> llama/ggml-cuda/ # ggml-cuda -> llama/ggml-cuda/
GGML_CUDA_FILES= ggml/src/ggml-cuda/*.cu ggml/src/ggml-cuda/*.cuh GGML_CUDA_FILES= ggml/src/ggml-cuda/*.cu ggml/src/ggml-cuda/*.cuh
GGML_CUDA_FILES_EXPANDED = $(addprefix ggml/src/ggml-cuda/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_CUDA_FILES))))) GGML_CUDA_FILES_EXPANDED = $(addprefix ggml/src/ggml-cuda/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_CUDA_FILES)))))
$(foreach name,$(GGML_CUDA_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DST_DIR)ggml-cuda/))) $(foreach name,$(GGML_CUDA_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DEST_DIR)ggml-cuda/)))
GGML_TEMPLATE_FILES= ggml/src/ggml-cuda/template-instances/*.cu GGML_TEMPLATE_FILES= ggml/src/ggml-cuda/template-instances/*.cu
GGML_TEMPLATE_FILES_EXPANDED = $(addprefix ggml/src/ggml-cuda/template-instances/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_TEMPLATE_FILES))))) GGML_TEMPLATE_FILES_EXPANDED = $(addprefix ggml/src/ggml-cuda/template-instances/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_TEMPLATE_FILES)))))
$(foreach name,$(GGML_TEMPLATE_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DST_DIR)ggml-cuda/template-instances/))) $(foreach name,$(GGML_TEMPLATE_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DEST_DIR)ggml-cuda/template-instances/)))
GGML_VENDOR_FILES= ggml/src/ggml-cuda/vendors/*.h GGML_VENDOR_FILES= ggml/src/ggml-cuda/vendors/*.h
GGML_VENDOR_FILES_EXPANDED=$(addprefix ggml/src/ggml-cuda/vendors/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_VENDOR_FILES))))) GGML_VENDOR_FILES_EXPANDED=$(addprefix ggml/src/ggml-cuda/vendors/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_VENDOR_FILES)))))
$(foreach name,$(GGML_VENDOR_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DST_DIR)ggml-cuda/vendors/))) $(foreach name,$(GGML_VENDOR_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DEST_DIR)ggml-cuda/vendors/)))
# llava -> llama/ # llava -> llama/
LAVA_FILES= \ LAVA_FILES= \
...@@ -163,27 +165,30 @@ LAVA_FILES+= \ ...@@ -163,27 +165,30 @@ LAVA_FILES+= \
common/json-schema-to-grammar.cpp \ common/json-schema-to-grammar.cpp \
common/json-schema-to-grammar.h \ common/json-schema-to-grammar.h \
common/base64.hpp common/base64.hpp
$(foreach name,$(LAVA_FILES),$(eval $(call vendor_file,$(name),$(DST_DIR)))) $(foreach name,$(LAVA_FILES),$(eval $(call vendor_file,$(name),$(DEST_DIR))))
$(DST_DIR)build-info.cpp: $(DEST_DIR)build-info.cpp:
@echo "Generating $@" @echo "Generating $@"
@echo "int LLAMA_BUILD_NUMBER = 0;" > $@ @echo "int LLAMA_BUILD_NUMBER = 0;" > $@
@echo "char const *LLAMA_COMMIT = \"$(LLAMACPP_BASE_COMMIT)\";" >> $@ @echo "char const *LLAMA_COMMIT = \"$(LLAMACPP_BASE_COMMIT)\";" >> $@
@echo "char const *LLAMA_COMPILER = \"\";" >> $@ @echo "char const *LLAMA_COMPILER = \"\";" >> $@
@echo "char const *LLAMA_BUILD_TARGET = \"\";" >> $@ @echo "char const *LLAMA_BUILD_TARGET = \"\";" >> $@
VENDORED_FILES += $(DST_DIR)build-info.cpp VENDORED_FILES += $(DEST_DIR)build-info.cpp
sync: $(LLAMACPP_REPO) .WAIT $(VENDORED_FILES) .WAIT remove-stale-files sync: $(LLAMACPP_REPO) .WAIT $(VENDORED_FILES) .WAIT remove-stale-files
sync-clean:
rm -f $(VENDORED_FILES) $(EXTRA_NATIVE_FILES)
PATS=*.c *.h *.cpp *.m *.metal *.cu *.cuh PATS=*.c *.h *.cpp *.m *.metal *.cu *.cuh
NATIVE_DIRS=$(DST_DIR) $(DST_DIR)llamafile/ $(DST_DIR)ggml-cuda/ $(DST_DIR)ggml-cuda/template-instances/ $(DST_DIR)ggml-cuda/vendors/ NATIVE_DIRS=$(DEST_DIR) $(DEST_DIR)llamafile/ $(DEST_DIR)ggml-cuda/ $(DEST_DIR)ggml-cuda/template-instances/ $(DEST_DIR)ggml-cuda/vendors/
ALL_NATIVE_FILES=$(foreach dir,$(NATIVE_DIRS),$(wildcard $(addprefix $(dir),$(PATS)))) ALL_NATIVE_FILES=$(foreach dir,$(NATIVE_DIRS),$(wildcard $(addprefix $(dir),$(PATS))))
EXTRA_NATIVE_FILES=$(filter-out $(VENDORED_FILES) $(addprefix $(DST_DIR),$(OLLAMA_NATIVE_FILES)), $(ALL_NATIVE_FILES)) EXTRA_NATIVE_FILES=$(filter-out $(VENDORED_FILES) $(addprefix $(DEST_DIR),$(OLLAMA_NATIVE_FILES)), $(ALL_NATIVE_FILES))
remove-stale-files: remove-stale-files:
@rm -f $(EXTRA_NATIVE_FILES) @rm -f $(EXTRA_NATIVE_FILES)
.PHONY: help-sync apply-patches sync create-patches remove-stale-fails .WAIT .PHONY: help-sync apply-patches sync create-patches remove-stale-fails .WAIT
# Handy debugging for make variables # Handy debugging for make variables
......
# Targets to assist in running tests
include make/common-defs.make
test:
cd .. && go test ./...
integration: $(OLLAMA_EXE)
cd .. && go test --tags=integration ./integration -v
lint:
cd .. && golangci-lint run -v
# Note: in this makefile we error instead of building to allow more fine-grain control of testing flows
$(OLLAMA_EXE):
@echo ""
@echo "ERROR: You must build ollama first - use 'make all' to build the ollama binaries"
@echo ""
@exit 1
\ No newline at end of file
...@@ -21,37 +21,43 @@ export CGO_CXXFLAGS_ALLOW = -mfma|-mf16c ...@@ -21,37 +21,43 @@ export CGO_CXXFLAGS_ALLOW = -mfma|-mf16c
export HIP_PLATFORM = amd export HIP_PLATFORM = amd
export CGO_ENABLED=1 export CGO_ENABLED=1
SRC_DIR := $(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST)))))) BUILD_DIR = ./llama/build/$(OS)-$(ARCH)
BUILD_DIR = $(SRC_DIR)build/$(OS)-$(ARCH) DIST_BASE = ./dist/$(OS)-$(ARCH)
DIST_BASE = $(abspath $(SRC_DIR)/../dist/$(OS)-$(ARCH))
ifeq ($(OS),windows)
# Absolute paths with cygpath to convert to 8.3 without spaces
PWD="$(shell pwd)"
DIST_OLLAMA_EXE=$(DIST_BASE)/ollama$(EXE_EXT)
else
CCACHE:=$(shell command -v ccache 2>/dev/null || echo "")
DIST_OLLAMA_EXE=$(DIST_BASE)/bin/ollama$(EXE_EXT)
endif
DIST_LIB_DIR = $(DIST_BASE)/lib/ollama DIST_LIB_DIR = $(DIST_BASE)/lib/ollama
RUNNERS_DIST_DIR = $(DIST_LIB_DIR)/runners RUNNERS_DIST_DIR = $(DIST_LIB_DIR)/runners
RUNNERS_PAYLOAD_DIR = $(abspath $(SRC_DIR)/../build/$(OS)/$(ARCH))
RUNNERS_BUILD_DIR = $(BUILD_DIR)/runners RUNNERS_BUILD_DIR = $(BUILD_DIR)/runners
DEFAULT_RUNNER := $(if $(and $(filter darwin,$(OS)),$(filter arm64,$(ARCH))),metal,cpu)
GZIP:=$(shell command -v pigz 2>/dev/null || echo "gzip")
ifneq ($(OS),windows)
CCACHE:=$(shell command -v ccache 2>/dev/null || echo "")
endif
VERSION?=$(shell git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g") VERSION?=$(shell git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")
# Conditionally enable ccache for cgo builds too # Conditionally enable ccache for cgo builds too
ifneq ($(CCACHE),) ifneq ($(CCACHE),)
CC=$(CCACHE) gcc CC?=$(CCACHE) gcc
CXX=$(CCACHE) g++ CXX?=$(CCACHE) g++
export CC export CC
export CXX export CXX
endif endif
# Override in environment space separated to tune GPU runner CPU vector flags # Override in environment to tune CPU vector flags
ifeq ($(ARCH),amd64) ifeq ($(ARCH),amd64)
GPU_RUNNER_CPU_FLAGS ?= avx ifeq ($(origin CUSTOM_CPU_FLAGS),undefined)
GPU_RUNNER_CPU_FLAGS=avx
GPU_RUNNER_EXTRA_VARIANT=_avx
else
GPU_RUNNER_CPU_FLAGS=$(subst $(comma),$(space),$(CUSTOM_CPU_FLAGS))
endif
endif endif
ifeq ($(OS),windows) ifeq ($(OS),windows)
CP := cp CP := cp
SRC_DIR := $(shell cygpath -m -s "$(SRC_DIR)")
OBJ_EXT := obj OBJ_EXT := obj
SHARED_EXT := dll SHARED_EXT := dll
EXE_EXT := .exe EXE_EXT := .exe
...@@ -63,22 +69,23 @@ ifneq ($(HIP_PATH),) ...@@ -63,22 +69,23 @@ ifneq ($(HIP_PATH),)
export HIP_PATH export HIP_PATH
endif endif
else ifeq ($(OS),linux) else ifeq ($(OS),linux)
CP := cp -af CP := cp -df
OBJ_EXT := o OBJ_EXT := o
SHARED_EXT := so SHARED_EXT := so
SHARED_PREFIX := lib SHARED_PREFIX := lib
CPU_FLAG_PREFIX := -m CPU_FLAG_PREFIX := -m
HIP_PATH?=/opt/rocm
else else
OBJ_EXT := o OBJ_EXT := o
SHARED_EXT := so SHARED_EXT := so
CPU_FLAG_PREFIX := -m CPU_FLAG_PREFIX := -m
CP := cp -af CP := cp -df
endif endif
COMMON_SRCS := \ COMMON_SRCS := \
$(wildcard *.c) \ $(wildcard ./llama/*.c) \
$(wildcard *.cpp) $(wildcard ./llama/*.cpp)
COMMON_HDRS := \ COMMON_HDRS := \
$(wildcard *.h) \ $(wildcard ./llama/*.h) \
$(wildcard *.hpp) $(wildcard ./llama/*.hpp)
OLLAMA_EXE=./ollama$(EXE_EXT)
\ No newline at end of file
# Common definitions for the various Makefiles which set cuda settings
# No rules are defined here so this is safe to include at the beginning of other makefiles
ifeq ($(OS),windows)
CUDA_PATH?=$(shell cygpath -m -s "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\" 2>/dev/null)unknown
CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\\.." 2>/dev/null))
CUDA_11_PATH:=$(shell ls -d $(CUDA_BASE_DIR)/v11.? 2>/dev/null)
CUDA_11_COMPILER:=$(wildcard $(CUDA_11_PATH)/bin/nvcc.exe)
CUDA_11_LIB_DIR = $(strip $(shell ls -d $(CUDA_11_PATH)/bin 2>/dev/null))
CUDA_11_CGO_EXTRA_LDFLAGS = -L"$(CUDA_11_PATH)/lib/x64"
else ifeq ($(OS),linux)
CUDA_PATH?=/usr/local/cuda
CUDA_11_PATH:=$(shell ls -d $(CUDA_PATH)-11 2>/dev/null)
CUDA_11_COMPILER:=$(wildcard $(CUDA_11_PATH)/bin/nvcc)
CUDA_11_LIB_DIR=$(strip $(shell ls -d $(CUDA_11_PATH)/lib64 2>/dev/null || ls -d $(CUDA_11_PATH)/lib 2>/dev/null))
CUDA_11_CGO_EXTRA_LDFLAGS = -L"$(CUDA_11_LIB_DIR)" -L"$(CUDA_11_LIB_DIR)/stubs"
endif
# Common definitions for the various Makefiles which set cuda settings
# No rules are defined here so this is safe to include at the beginning of other makefiles
ifeq ($(OS),windows)
CUDA_PATH?=$(shell cygpath -m -s "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\" 2>/dev/null)unknown
CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\\.." 2>/dev/null))
CUDA_12_PATH:=$(shell ls -d $(CUDA_BASE_DIR)/v12.? 2>/dev/null)
CUDA_12_COMPILER:=$(wildcard $(CUDA_12_PATH)/bin/nvcc.exe)
CUDA_12_LIB_DIR = $(strip $(shell ls -d $(CUDA_12_PATH)/bin 2>/dev/null))
CUDA_12_CGO_EXTRA_LDFLAGS = -L"$(CUDA_12_PATH)/lib/x64"
else ifeq ($(OS),linux)
CUDA_PATH?=/usr/local/cuda
CUDA_12_PATH:=$(shell ls -d $(CUDA_PATH)-12 2>/dev/null)
CUDA_12_COMPILER:=$(wildcard $(CUDA_12_PATH)/bin/nvcc)
CUDA_12_LIB_DIR=$(strip $(shell ls -d $(CUDA_12_PATH)/lib64 2>/dev/null || ls -d $(CUDA_12_PATH)/lib 2>/dev/null))
CUDA_12_CGO_EXTRA_LDFLAGS = -L"$(CUDA_12_LIB_DIR)" -L"$(CUDA_12_LIB_DIR)/stubs"
endif
...@@ -10,27 +10,31 @@ GPU_RUNNER_NAME := cuda$(GPU_RUNNER_VARIANT) ...@@ -10,27 +10,31 @@ GPU_RUNNER_NAME := cuda$(GPU_RUNNER_VARIANT)
GPU_RUNNER_GO_TAGS := cuda cuda$(GPU_RUNNER_VARIANT) GPU_RUNNER_GO_TAGS := cuda cuda$(GPU_RUNNER_VARIANT)
GPU_RUNNER_DRIVER_LIB_LINK := -lcuda GPU_RUNNER_DRIVER_LIB_LINK := -lcuda
GPU_RUNNER_LIBS_SHORT := cublas cudart cublasLt GPU_RUNNER_LIBS_SHORT := cublas cudart cublasLt
GPU_LIB_DIR_WIN = $(GPU_PATH_ROOT_WIN)/bin
GPU_LIB_DIR_LINUX = $(GPU_PATH_ROOT_LINUX)/lib64
CGO_EXTRA_LDFLAGS_WIN = -L"$(GPU_PATH_ROOT_WIN)/lib/x64"
GPU_COMPILER_WIN = $(GPU_PATH_ROOT_WIN)/bin/nvcc
GPU_COMPILER_LINUX = $(GPU_PATH_ROOT_LINUX)/bin/nvcc
GPU_COMPILER_CFLAGS_WIN = $(CFLAGS) -D_WIN32_WINNT=0x602
GPU_COMPILER_CFLAGS_LINUX = $(CFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
GPU_COMPILER_CXXFLAGS_WIN = $(CXXFLAGS) -D_WIN32_WINNT=0x602
GPU_COMPILER_CXXFLAGS_LINUX = $(CXXFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT)*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
GPU_DIST_DEPS_LIBS= $(sort $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_LIBS))))
ifeq ($(OS),linux) ifeq ($(OS),windows)
CUDA_PATH?=/usr/local/cuda # On windows, nvcc uses msvc which does not support avx512vbmi avx512vnni avx512bf16, but macros can turn them on
GPU_COMPILER_FPIC = -fPIC -Wno-unused-function -std=c++11 GPU_VECTOR_FLAGS=$(call uc,$(filter-out avx512bf16,$(filter-out avx512vnni,$(filter-out avx512vbmi,$(GPU_RUNNER_CPU_FLAGS)))))
GPU_COMPILER_EXTRA_FLAGS=$(if $(filter avx512vbmi,$(GPU_RUNNER_CPU_FLAGS)),-D__AVX512VBMI__)
GPU_COMPILER_EXTRA_FLAGS+=$(if $(filter avx512vnni,$(GPU_RUNNER_CPU_FLAGS)),-D__AVX512VNNI__)
GPU_COMPILER_EXTRA_FLAGS+=$(if $(filter avx512bf16,$(GPU_RUNNER_CPU_FLAGS)),-D__AVX512BF16__)
GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT),$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
GPU_COMPILER_CFLAGS = $(CFLAGS) -D_WIN32_WINNT=0x602
GPU_COMPILER_CXXFLAGS = $(CXXFLAGS) -D_WIN32_WINNT=0x602
else ifeq ($(OS),linux)
# On linux, nvcc requires avx512 -> -mavx512f -mavx512dq -mavx512bw
GPU_VECTOR_FLAGS=$(if $(filter avx512,$(GPU_RUNNER_CPU_FLAGS)),avx512f avx512dq avx512bw) $(filter-out avx512,$(GPU_RUNNER_CPU_FLAGS))
GPU_COMPILER_EXTRA_FLAGS = -fPIC -Wno-unused-function -std=c++11
GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
GPU_COMPILER_CFLAGS = $(CFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
GPU_COMPILER_CXXFLAGS = $(CXXFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
endif endif
GPU_DIST_LIB_DEPS= $(sort $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_LIBS))))
GPU_RUNNER_ARCH_FLAGS := $(foreach arch,$(subst ;,$(space),$(CUDA_ARCHITECTURES)),--generate-code=arch=compute_$(arch)$(comma)code=[compute_$(arch)$(comma)sm_$(arch)]) \ GPU_RUNNER_ARCH_FLAGS := $(foreach arch,$(subst ;,$(space),$(CUDA_ARCHITECTURES)),--generate-code=arch=compute_$(arch)$(comma)code=[compute_$(arch)$(comma)sm_$(arch)]) \
-DGGML_CUDA_USE_GRAPHS=1 -DGGML_CUDA_USE_GRAPHS=1
GPU_COMPILER_CUFLAGS = \ GPU_COMPILER_CUFLAGS = \
$(GPU_COMPILER_FPIC) \ $(GPU_COMPILER_EXTRA_FLAGS) \
-Xcompiler "$(addprefix $(CPU_FLAG_PREFIX),$(_OS_GPU_RUNNER_CPU_FLAGS))" \ -Xcompiler "$(addprefix $(CPU_FLAG_PREFIX),$(GPU_VECTOR_FLAGS))" \
-t2 \ -t2 \
-DGGML_CUDA_DMMV_X=32 \ -DGGML_CUDA_DMMV_X=32 \
-DGGML_CUDA_MMV_Y=1 \ -DGGML_CUDA_MMV_Y=1 \
...@@ -46,5 +50,5 @@ GPU_COMPILER_CUFLAGS = \ ...@@ -46,5 +50,5 @@ GPU_COMPILER_CUFLAGS = \
-Wno-deprecated-gpu-targets \ -Wno-deprecated-gpu-targets \
--forward-unknown-to-host-compiler \ --forward-unknown-to-host-compiler \
-use_fast_math \ -use_fast_math \
-I. \ -I./llama/ \
-O3 -O3
...@@ -5,42 +5,22 @@ dummy: ...@@ -5,42 +5,22 @@ dummy:
$(error This makefile is not meant to build directly, but instead included in other Makefiles that set required variables) $(error This makefile is not meant to build directly, but instead included in other Makefiles that set required variables)
endif endif
ifeq ($(OS),windows) GPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERSION)\" \"-X=github.com/ollama/ollama/llama.CpuFeatures=$(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS))\" $(EXTRA_GOLDLAGS) $(TARGET_LDFLAGS)"
GPU_COMPILER:=$(GPU_COMPILER_WIN)
GPU_LIB_DIR:=$(GPU_LIB_DIR_WIN)
CGO_EXTRA_LDFLAGS:=$(CGO_EXTRA_LDFLAGS_WIN)
GPU_COMPILER_CFLAGS = $(GPU_COMPILER_CFLAGS_WIN)
GPU_COMPILER_CXXFLAGS = $(GPU_COMPILER_CXXFLAGS_WIN)
else ifeq ($(OS),linux)
GPU_COMPILER:=$(GPU_COMPILER_LINUX)
GPU_LIB_DIR:=$(GPU_LIB_DIR_LINUX)
CGO_EXTRA_LDFLAGS:=$(CGO_EXTRA_LDFLAGS_LINUX)
GPU_COMPILER_CFLAGS = $(GPU_COMPILER_CFLAGS_LINUX)
GPU_COMPILER_CXXFLAGS = $(GPU_COMPILER_CXXFLAGS_LINUX)
endif
GPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERSION)\" \"-X=github.com/ollama/ollama/llama.CpuFeatures=$(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS))\" $(TARGET_LDFLAGS)"
# TODO Unify how we handle dependencies in the dist/packaging and install flow # TODO Unify how we handle dependencies in the dist/packaging and install flow
# today, cuda is bundled, but rocm is split out. Should split them each out by runner # today, cuda is bundled, but rocm is split out. Should split them each out by runner
DIST_GPU_RUNNER_DEPS_DIR = $(DIST_LIB_DIR) DIST_GPU_RUNNER_DEPS_DIR = $(DIST_LIB_DIR)
ifeq ($(OS),windows)
_OS_GPU_RUNNER_CPU_FLAGS=$(call uc,$(GPU_RUNNER_CPU_FLAGS))
else ifeq ($(OS),linux)
_OS_GPU_RUNNER_CPU_FLAGS=$(GPU_RUNNER_CPU_FLAGS)
endif
GPU_RUNNER_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))) GPU_RUNNER_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT))))
DIST_GPU_RUNNER_LIB_DEPS = $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_RUNNER_LIBS)))
GPU_RUNNER_SRCS := \ GPU_RUNNER_SRCS := \
ggml-cuda.cu \ llama/ggml-cuda.cu \
$(filter-out $(wildcard ggml-cuda/fattn*.cu),$(wildcard ggml-cuda/*.cu)) \ $(filter-out $(wildcard llama/ggml-cuda/fattn*.cu),$(wildcard llama/ggml-cuda/*.cu)) \
$(wildcard ggml-cuda/template-instances/mmq*.cu) \ $(wildcard llama/ggml-cuda/template-instances/mmq*.cu) \
ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp ggml-aarch64.c llama/ggml.c llama/ggml-backend.c llama/ggml-alloc.c llama/ggml-quants.c llama/sgemm.cpp llama/ggml-aarch64.c
GPU_RUNNER_HDRS := \ GPU_RUNNER_HDRS := \
$(wildcard ggml-cuda/*.cuh) $(wildcard llama/ggml-cuda/*.cuh)
# Conditional flags and components to speed up developer builds # Conditional flags and components to speed up developer builds
...@@ -49,25 +29,24 @@ ifneq ($(OLLAMA_FAST_BUILD),) ...@@ -49,25 +29,24 @@ ifneq ($(OLLAMA_FAST_BUILD),)
-DGGML_DISABLE_FLASH_ATTN -DGGML_DISABLE_FLASH_ATTN
else else
GPU_RUNNER_SRCS += \ GPU_RUNNER_SRCS += \
$(wildcard ggml-cuda/fattn*.cu) \ $(wildcard llama/ggml-cuda/fattn*.cu) \
$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu) \ $(wildcard llama/ggml-cuda/template-instances/fattn-wmma*.cu) \
$(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu) \ $(wildcard llama/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu) \
$(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu) \ $(wildcard llama/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu) \
$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu) $(wildcard llama/ggml-cuda/template-instances/fattn-vec*f16-f16.cu)
endif endif
GPU_RUNNER_OBJS := $(GPU_RUNNER_SRCS:.cu=.$(GPU_RUNNER_NAME).$(OBJ_EXT)) GPU_RUNNER_OBJS := $(GPU_RUNNER_SRCS:.cu=.$(GPU_RUNNER_NAME).$(OBJ_EXT))
GPU_RUNNER_OBJS := $(GPU_RUNNER_OBJS:.c=.$(GPU_RUNNER_NAME).$(OBJ_EXT)) GPU_RUNNER_OBJS := $(GPU_RUNNER_OBJS:.c=.$(GPU_RUNNER_NAME).$(OBJ_EXT))
GPU_RUNNER_OBJS := $(addprefix $(BUILD_DIR)/,$(GPU_RUNNER_OBJS:.cpp=.$(GPU_RUNNER_NAME).$(OBJ_EXT))) GPU_RUNNER_OBJS := $(addprefix $(BUILD_DIR)/,$(GPU_RUNNER_OBJS:.cpp=.$(GPU_RUNNER_NAME).$(OBJ_EXT)))
DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME))) DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)))
ifneq ($(OS),windows) BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)))
PAYLOAD_RUNNERS = $(addprefix $(RUNNERS_PAYLOAD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT).gz,$(GPU_RUNNER_NAME)))
endif
BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)))
$(GPU_RUNNER_NAME): $(BUILD_RUNNERS)
$(GPU_RUNNER_NAME): $(BUILD_RUNNERS) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS) dist: $(DIST_RUNNERS)
# Build targets # Build targets
$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cu $(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cu
...@@ -79,11 +58,11 @@ $(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.c ...@@ -79,11 +58,11 @@ $(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.c
$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cpp $(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cpp
@-mkdir -p $(dir $@) @-mkdir -p $(dir $@)
$(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CXXFLAGS) -o $@ $< $(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CXXFLAGS) -o $@ $<
$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): TARGET_CGO_LDFLAGS = -L"$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/" $(CGO_EXTRA_LDFLAGS) $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): TARGET_CGO_LDFLAGS = $(CGO_EXTRA_LDFLAGS) -L"$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/"
$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS) $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) ./llama/*.go ./llama/runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
@-mkdir -p $(dir $@) @-mkdir -p $(dir $@)
GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./runner GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./cmd/runner
$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(DIST_GPU_RUNNER_LIB_DEPS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS) $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS)
@-mkdir -p $(dir $@) @-mkdir -p $(dir $@)
$(CCACHE) $(GPU_COMPILER) --shared -L$(GPU_LIB_DIR) $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@ $(CCACHE) $(GPU_COMPILER) --shared -L$(GPU_LIB_DIR) $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@
...@@ -91,27 +70,16 @@ $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME). ...@@ -91,27 +70,16 @@ $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).
$(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/% $(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
@-mkdir -p $(dir $@) @-mkdir -p $(dir $@)
$(CP) $< $@ $(CP) $< $@
$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(DIST_LIB_DIR)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) $(GPU_DIST_DEPS_LIBS) $(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) $(GPU_DIST_LIB_DEPS)
$(DIST_LIB_DIR)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) $(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT)
@-mkdir -p $(dir $@) @-mkdir -p $(dir $@)
$(CP) $< $@ $(CP) $< $@
$(DIST_GPU_RUNNER_LIB_DEPS): $(GPU_DIST_LIB_DEPS):
@-mkdir -p $(dir $@) @-mkdir -p $(dir $@)
$(CP) $(GPU_LIB_DIR)/$(notdir $@) $(dir $@) $(CP) $(GPU_LIB_DIR)/$(notdir $@) $(dir $@)
$(GPU_DIST_DEPS_LIBS):
@-mkdir -p $(dir $@)
$(CP) $(dir $(filter %$(notdir $@),$(GPU_LIBS) $(GPU_TRANSITIVE_LIBS)))/$(notdir $@) $(dir $@)
# Payload targets
$(RUNNERS_PAYLOAD_DIR)/%/ollama_llama_server.gz: $(RUNNERS_BUILD_DIR)/%/ollama_llama_server
@-mkdir -p $(dir $@)
${GZIP} --best -c $< > $@
$(RUNNERS_PAYLOAD_DIR)/$(GPU_RUNNER_NAME)/%.gz: $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/%
@-mkdir -p $(dir $@)
${GZIP} --best -c $< > $@
clean: clean:
rm -f $(GPU_RUNNER_OBJS) $(BUILD_RUNNERS) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS) rm -f $(GPU_RUNNER_OBJS) $(BUILD_RUNNERS) $(DIST_RUNNERS)
.PHONY: clean $(GPU_RUNNER_NAME) .PHONY: clean $(GPU_RUNNER_NAME)
......
# Common definitions for the various Makefiles which set cuda settings
# No rules are defined here so this is safe to include at the beginning of other makefiles
ifeq ($(OS),windows)
HIP_COMPILER:=$(wildcard $(HIP_PATH)/bin/hipcc.bin.exe)
else ifeq ($(OS),linux)
HIP_PATH?=$(shell ls -d /opt/rocm 2>/dev/null)
HIP_COMPILER:=$(wildcard $(HIP_PATH)/bin/hipcc)
endif
package runners package runners
import ( import (
"compress/gzip"
"errors"
"fmt"
"io"
"io/fs"
"log/slog" "log/slog"
"os" "os"
"path/filepath" "path/filepath"
"runtime" "runtime"
"slices" "slices"
"strconv"
"strings" "strings"
"sync" "sync"
"syscall"
"golang.org/x/sync/errgroup" "golang.org/x/sys/cpu"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/envconfig"
) )
const (
binGlob = "*/*/*/*"
)
var ( var (
lock sync.Mutex
runnersDir = "" runnersDir = ""
once = sync.Once{}
) )
// Return the location where runners are stored type CPUCapability uint32
// If runners are payloads, this will either extract them
// or refresh them if any have disappeared due to tmp cleaners
func Refresh(payloadFS fs.FS) (string, error) {
lock.Lock()
defer lock.Unlock()
var err error
// Wire up extra logging on our first load // Override at build time when building base GPU runners
if runnersDir == "" { // var GPURunnerCPUCapability = CPUCapabilityAVX
defer func() {
var runners []string
for v := range GetAvailableServers(runnersDir) {
runners = append(runners, v)
}
slog.Info("Dynamic LLM libraries", "runners", runners)
slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
}()
}
if hasPayloads(payloadFS) { const (
if runnersDir == "" { CPUCapabilityNone CPUCapability = iota
runnersDir, err = extractRunners(payloadFS) CPUCapabilityAVX
} else { CPUCapabilityAVX2
err = refreshRunners(payloadFS, runnersDir) // TODO AVX512
} )
} else if runnersDir == "" {
runnersDir, err = locateRunners()
}
return runnersDir, err
}
func Cleanup(payloadFS fs.FS) { func (c CPUCapability) String() string {
lock.Lock() switch c {
defer lock.Unlock() case CPUCapabilityAVX:
if hasPayloads(payloadFS) && runnersDir != "" { return "avx"
// We want to fully clean up the tmpdir parent of the payloads dir case CPUCapabilityAVX2:
tmpDir := filepath.Clean(filepath.Join(runnersDir, "..")) return "avx2"
slog.Debug("cleaning up", "dir", tmpDir) default:
err := os.RemoveAll(tmpDir) return "no vector extensions"
if err != nil {
slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
}
} }
} }
func locateRunners() (string, error) { func GetCPUCapability() CPUCapability {
exe, err := os.Executable() if cpu.X86.HasAVX2 {
if err != nil { return CPUCapabilityAVX2
return "", err
}
cwd, err := os.Getwd()
if err != nil {
return "", err
}
var paths []string
for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe()), cwd} {
paths = append(paths,
root,
filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH),
)
} }
if cpu.X86.HasAVX {
// Try a few variations to improve developer experience when building from source in the local tree return CPUCapabilityAVX
for _, path := range paths {
candidate := filepath.Join(path, "lib", "ollama", "runners")
if _, err := os.Stat(candidate); err == nil {
return candidate, nil
}
} }
return "", fmt.Errorf("unable to locate runners in any search path %v", paths) // else LCD
return CPUCapabilityNone
} }
// Return true if we're carying nested payloads for the runners // Return the location where runners were located
func hasPayloads(payloadFS fs.FS) bool { // empty string indicates only builtin is present
files, err := fs.Glob(payloadFS, binGlob) func Locate() string {
if err != nil || len(files) == 0 || (len(files) == 1 && strings.Contains(files[0], "placeholder")) { once.Do(locateRunnersOnce)
return false return runnersDir
}
return true
} }
func extractRunners(payloadFS fs.FS) (string, error) { // searches for runners in a prioritized set of locations
cleanupTmpDirs() // 1. local build, with executable at the top of the tree
tmpDir, err := os.MkdirTemp(envconfig.TmpDir(), "ollama") // 2. lib directory relative to executable
if err != nil { func locateRunnersOnce() {
return "", fmt.Errorf("failed to generate tmp dir: %w", err) exe, err := os.Executable()
}
// Track our pid so we can clean up orphaned tmpdirs
n := filepath.Join(tmpDir, "ollama.pid")
if err := os.WriteFile(n, []byte(strconv.Itoa(os.Getpid())), 0o644); err != nil {
slog.Warn("failed to write pid file", "file", n, "error", err)
}
// We create a distinct subdirectory for payloads within the tmpdir
// This will typically look like /tmp/ollama3208993108/runners on linux
rDir := filepath.Join(tmpDir, "runners")
slog.Info("extracting embedded files", "dir", rDir)
return rDir, refreshRunners(payloadFS, rDir)
}
func refreshRunners(payloadFS fs.FS, rDir string) error {
// extract or refresh server libraries
err := extractFiles(payloadFS, rDir, binGlob)
if err != nil { if err != nil {
return fmt.Errorf("extract binaries: %v", err) slog.Debug("runner locate", "error", err)
} }
return nil
}
// extract extracts the embedded files to the target directory
func extractFiles(payloadFS fs.FS, targetDir string, glob string) error {
files, err := fs.Glob(payloadFS, glob)
if err != nil || len(files) == 0 {
// Should not happen
return fmt.Errorf("extractFiles called without payload present")
}
if err := os.MkdirAll(targetDir, 0o755); err != nil {
return fmt.Errorf("extractFiles could not mkdir %s: %v", targetDir, err)
}
g := new(errgroup.Group)
// $OS/$GOARCH/$RUNNER/$FILE paths := []string{
for _, file := range files { filepath.Join(filepath.Dir(exe), "llama", "build", runtime.GOOS+"-"+runtime.GOARCH, "runners"),
filename := file filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe(), "lib", "ollama", "runners"),
runner := filepath.Base(filepath.Dir(filename))
slog.Debug("extracting", "runner", runner, "payload", filename)
g.Go(func() error {
srcf, err := payloadFS.Open(filename)
if err != nil {
return err
}
defer srcf.Close()
src := io.Reader(srcf)
if strings.HasSuffix(filename, ".gz") {
src, err = gzip.NewReader(src)
if err != nil {
return fmt.Errorf("decompress payload %s: %v", filename, err)
}
filename = strings.TrimSuffix(filename, ".gz")
}
runnerDir := filepath.Join(targetDir, runner)
if err := os.MkdirAll(runnerDir, 0o755); err != nil {
return fmt.Errorf("extractFiles could not mkdir %s: %v", runnerDir, err)
}
base := filepath.Base(filename)
destFilename := filepath.Join(runnerDir, base)
_, err = os.Stat(destFilename)
switch {
case errors.Is(err, os.ErrNotExist):
destFile, err := os.OpenFile(destFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write payload %s: %v", filename, err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, src); err != nil {
return fmt.Errorf("copy payload %s: %v", filename, err)
}
case err != nil:
return fmt.Errorf("stat payload %s: %v", filename, err)
}
return nil
})
} }
for _, path := range paths {
err = g.Wait() if _, err := os.Stat(path); err == nil {
if err != nil { runnersDir = path
slog.Error("failed to extract files", "error", err) slog.Debug("runners located", "dir", runnersDir)
// If we fail to extract, the payload dir is most likely unusable, so cleanup whatever we extracted return
err := os.RemoveAll(targetDir)
if err != nil {
slog.Warn("failed to cleanup incomplete payload dir", "dir", targetDir, "error", err)
} }
return err
} }
return nil // Fall back to built-in
slog.Debug("no dynamic runners detected, using only built-in")
runnersDir = ""
} }
// Best effort to clean up prior tmpdirs // Return the well-known name of the builtin runner for the given platform
func cleanupTmpDirs() { func BuiltinName() string {
tmpDir := envconfig.TmpDir() if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
if tmpDir == "" { return "metal"
tmpDir = os.TempDir()
}
matches, err := filepath.Glob(filepath.Join(tmpDir, "ollama*", "ollama.pid"))
if err != nil {
return
}
for _, match := range matches {
raw, err := os.ReadFile(match)
if errors.Is(err, os.ErrNotExist) {
slog.Debug("not a ollama runtime directory, skipping", "path", match)
continue
} else if err != nil {
slog.Warn("could not read ollama.pid, skipping", "path", match, "error", err)
continue
}
pid, err := strconv.Atoi(string(raw))
if err != nil {
slog.Warn("invalid pid, skipping", "path", match, "error", err)
continue
}
p, err := os.FindProcess(pid)
if err == nil && !errors.Is(p.Signal(syscall.Signal(0)), os.ErrProcessDone) {
slog.Warn("process still running, skipping", "pid", pid, "path", match)
continue
}
if err := os.Remove(match); err != nil {
slog.Warn("could not cleanup stale pidfile", "path", match, "error", err)
}
runners := filepath.Join(filepath.Dir(match), "runners")
if err := os.RemoveAll(runners); err != nil {
slog.Warn("could not cleanup stale runners", "path", runners, "error", err)
}
if err := os.Remove(filepath.Dir(match)); err != nil {
slog.Warn("could not cleanup stale tmpdir", "path", filepath.Dir(match), "error", err)
}
} }
return "cpu"
} }
// directory names are the name of the runner and may contain an optional // directory names are the name of the runner and may contain an optional
// variant prefixed with '_' as the separator. For example, "cuda_v11" and // variant prefixed with '_' as the separator. For example, "cuda_v11" and
// "cuda_v12" or "cpu" and "cpu_avx2". Any library without a variant is the // "cuda_v12" or "cpu" and "cpu_avx2". Any library without a variant is the
// lowest common denominator // lowest common denominator
func GetAvailableServers(payloadsDir string) map[string]string { func GetAvailableServers() map[string]string {
if payloadsDir == "" { once.Do(locateRunnersOnce)
slog.Error("empty runner dir")
return nil servers := make(map[string]string)
exe, err := os.Executable()
if err == nil {
servers[BuiltinName()] = exe
}
if runnersDir == "" {
return servers
} }
// glob payloadsDir for files that start with ollama_ // glob runnersDir for files that start with ollama_
pattern := filepath.Join(payloadsDir, "*", "ollama_*") pattern := filepath.Join(runnersDir, "*", "ollama_*")
files, err := filepath.Glob(pattern) files, err := filepath.Glob(pattern)
if err != nil { if err != nil {
...@@ -289,96 +119,88 @@ func GetAvailableServers(payloadsDir string) map[string]string { ...@@ -289,96 +119,88 @@ func GetAvailableServers(payloadsDir string) map[string]string {
return nil return nil
} }
servers := make(map[string]string)
for _, file := range files { for _, file := range files {
slog.Debug("availableServers : found", "file", file) slog.Debug("availableServers : found", "file", file)
servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file) runnerName := filepath.Base(filepath.Dir(file))
// Special case for our GPU runners - if compiled with standard AVX flag
// detect incompatible system
// Custom builds will omit this and its up to the user to ensure compatibility
parsed := strings.Split(runnerName, "_")
if len(parsed) == 3 && parsed[2] == "avx" && !cpu.X86.HasAVX {
slog.Info("GPU runner incompatible with host system, CPU does not have AVX", "runner", runnerName)
continue
}
servers[runnerName] = file
} }
return servers return servers
} }
// serversForGpu returns a list of compatible servers give the provided GPU // serversForGpu returns a list of compatible servers give the provided GPU library/variant
// info, ordered by performance. assumes Init() has been called func ServersForGpu(requested string) []string {
// TODO - switch to metadata based mapping
func ServersForGpu(info discover.GpuInfo) []string {
// glob workDir for files that start with ollama_ // glob workDir for files that start with ollama_
availableServers := GetAvailableServers(runnersDir) availableServers := GetAvailableServers()
requested := info.Library
if info.Variant != discover.CPUCapabilityNone.String() { // Short circuit if the only option is built-in
requested += "_" + info.Variant if _, ok := availableServers[BuiltinName()]; ok && len(availableServers) == 1 {
return []string{BuiltinName()}
} }
bestCPUVariant := GetCPUCapability()
requestedLib := strings.Split(requested, "_")[0]
servers := []string{} servers := []string{}
// exact match first // exact match first
for a := range availableServers { for a := range availableServers {
if a == requested { short := a
parsed := strings.Split(a, "_")
if len(parsed) == 3 {
// Strip off optional _avx for comparison
short = parsed[0] + "_" + parsed[1]
}
if a == requested || short == requested {
servers = []string{a} servers = []string{a}
if a == "metal" {
return servers
}
break
} }
} }
alt := []string{} // If no exact match, then try without variant
if len(servers) == 0 {
// Then for GPUs load alternates and sort the list for consistent load ordering alt := []string{}
if info.Library != "cpu" {
for a := range availableServers { for a := range availableServers {
if info.Library == strings.Split(a, "_")[0] && a != requested { if requestedLib == strings.Split(a, "_")[0] && a != requested {
alt = append(alt, a) alt = append(alt, a)
} }
} }
slices.Sort(alt) slices.Sort(alt)
servers = append(servers, alt...) servers = append(servers, alt...)
} }
if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") { // Finally append the best CPU option if found, then builtin
// Load up the best CPU variant if not primary requested if bestCPUVariant != CPUCapabilityNone {
if info.Library != "cpu" { for cmp := range availableServers {
variant := discover.GetCPUCapability() if cmp == "cpu_"+bestCPUVariant.String() {
// If no variant, then we fall back to default servers = append(servers, cmp)
// If we have a variant, try that if we find an exact match break
// Attempting to run the wrong CPU instructions will panic the
// process
if variant != discover.CPUCapabilityNone {
for cmp := range availableServers {
if cmp == "cpu_"+variant.String() {
servers = append(servers, cmp)
break
}
}
} else {
servers = append(servers, "cpu")
} }
} }
if len(servers) == 0 {
servers = []string{"cpu"}
}
} }
servers = append(servers, BuiltinName())
return servers return servers
} }
// Return the optimal server for this CPU architecture // Return the optimal server for this CPU architecture
func ServerForCpu() string { func ServerForCpu() string {
if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" { if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
return "metal" return BuiltinName()
} }
variant := discover.GetCPUCapability() variant := GetCPUCapability()
availableServers := GetAvailableServers(runnersDir) availableServers := GetAvailableServers()
if variant != discover.CPUCapabilityNone { if variant != CPUCapabilityNone {
for cmp := range availableServers { for cmp := range availableServers {
if cmp == "cpu_"+variant.String() { if cmp == "cpu_"+variant.String() {
return cmp return cmp
} }
} }
} }
return "cpu" return BuiltinName()
} }
package runners
import (
"log/slog"
"os"
"path"
"runtime"
"strings"
"testing"
"testing/fstest"
)
func TestRefreshRunners(t *testing.T) {
slog.SetLogLoggerLevel(slog.LevelDebug)
payloadFS := fstest.MapFS{
path.Join(runtime.GOOS, runtime.GOARCH, "foo", "ollama_llama_server"): {Data: []byte("hello, world\n")},
}
tmpDir, err := os.MkdirTemp("", "testing")
if err != nil {
t.Fatalf("failed to make tmp dir %s", err)
}
t.Setenv("OLLAMA_TMPDIR", tmpDir)
rDir, err := Refresh(payloadFS)
if err != nil {
t.Fatalf("failed to extract to %s %s", tmpDir, err)
}
if !strings.Contains(rDir, tmpDir) {
t.Fatalf("runner dir %s was not in tmp dir %s", rDir, tmpDir)
}
// spot check results
servers := GetAvailableServers(rDir)
if len(servers) < 1 {
t.Fatalf("expected at least 1 server")
}
// Refresh contents
rDir, err = extractRunners(payloadFS)
if err != nil {
t.Fatalf("failed to extract to %s %s", tmpDir, err)
}
if !strings.Contains(rDir, tmpDir) {
t.Fatalf("runner dir %s was not in tmp dir %s", rDir, tmpDir)
}
cleanupTmpDirs()
Cleanup(payloadFS)
}
...@@ -14,16 +14,14 @@ export CGO_CFLAGS=-mmacosx-version-min=11.3 ...@@ -14,16 +14,14 @@ export CGO_CFLAGS=-mmacosx-version-min=11.3
export CGO_CXXFLAGS=-mmacosx-version-min=11.3 export CGO_CXXFLAGS=-mmacosx-version-min=11.3
export CGO_LDFLAGS=-mmacosx-version-min=11.3 export CGO_LDFLAGS=-mmacosx-version-min=11.3
for TARGETARCH in arm64 amd64; do rm -rf llama/build dist/darwin-*
echo "Building Go runner darwin $TARGETARCH" echo "Building darwin arm64"
rm -rf llama/build GOOS=darwin ARCH=arm64 GOARCH=arm64 make -j 8 dist
GOOS=darwin ARCH=$TARGETARCH GOARCH=$TARGETARCH make -C llama -j 8 echo "Building darwin amd64 with AVX enabled"
CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -trimpath -o dist/ollama-darwin-$TARGETARCH GOOS=darwin ARCH=amd64 GOARCH=amd64 CUSTOM_CPU_FLAGS="avx" make -j 8 dist
CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -trimpath -cover -o dist/ollama-darwin-$TARGETARCH-cov
done
lipo -create -output dist/ollama dist/darwin-arm64/bin/ollama dist/darwin-amd64/bin/ollama
lipo -create -output dist/ollama dist/ollama-darwin-arm64 dist/ollama-darwin-amd64
rm -f dist/ollama-darwin-arm64 dist/ollama-darwin-amd64
if [ -n "$APPLE_IDENTITY" ]; then if [ -n "$APPLE_IDENTITY" ]; then
codesign --deep --force --options=runtime --sign "$APPLE_IDENTITY" --timestamp dist/ollama codesign --deep --force --options=runtime --sign "$APPLE_IDENTITY" --timestamp dist/ollama
else else
......
...@@ -82,7 +82,7 @@ function buildOllama() { ...@@ -82,7 +82,7 @@ function buildOllama() {
if ($null -eq ${env:OLLAMA_SKIP_GENERATE}) { if ($null -eq ${env:OLLAMA_SKIP_GENERATE}) {
write-host "Building ollama runners" write-host "Building ollama runners"
Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}" Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}"
& make -C llama -j 12 & make -j 12 dist
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
} else { } else {
write-host "Skipping generate step with OLLAMA_SKIP_GENERATE set" write-host "Skipping generate step with OLLAMA_SKIP_GENERATE set"
......
...@@ -71,29 +71,20 @@ for BINDIR in /usr/local/bin /usr/bin /bin; do ...@@ -71,29 +71,20 @@ for BINDIR in /usr/local/bin /usr/bin /bin; do
done done
OLLAMA_INSTALL_DIR=$(dirname ${BINDIR}) OLLAMA_INSTALL_DIR=$(dirname ${BINDIR})
if [ -d "$OLLAMA_INSTALL_DIR/lib/ollama" ] ; then
status "Cleaning up old version at $OLLAMA_INSTALL_DIR/lib/ollama"
$SUDO rm -rf "$OLLAMA_INSTALL_DIR/lib/ollama"
fi
status "Installing ollama to $OLLAMA_INSTALL_DIR" status "Installing ollama to $OLLAMA_INSTALL_DIR"
$SUDO install -o0 -g0 -m755 -d $BINDIR $SUDO install -o0 -g0 -m755 -d $BINDIR
$SUDO install -o0 -g0 -m755 -d "$OLLAMA_INSTALL_DIR" $SUDO install -o0 -g0 -m755 -d "$OLLAMA_INSTALL_DIR"
if curl -I --silent --fail --location "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" >/dev/null ; then status "Downloading Linux ${ARCH} bundle"
status "Downloading Linux ${ARCH} bundle" curl --fail --show-error --location --progress-bar \
curl --fail --show-error --location --progress-bar \ "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" | \
"https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" | \ $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
$SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR" if [ "$OLLAMA_INSTALL_DIR/bin/ollama" != "$BINDIR/ollama" ] ; then
BUNDLE=1 status "Making ollama accessible in the PATH in $BINDIR"
if [ "$OLLAMA_INSTALL_DIR/bin/ollama" != "$BINDIR/ollama" ] ; then $SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama"
status "Making ollama accessible in the PATH in $BINDIR"
$SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama"
fi
else
status "Downloading Linux ${ARCH} CLI"
curl --fail --show-error --location --progress-bar -o "$TEMP_DIR/ollama"\
"https://ollama.com/download/ollama-linux-${ARCH}${VER_PARAM}"
$SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $OLLAMA_INSTALL_DIR/ollama
BUNDLE=0
if [ "$OLLAMA_INSTALL_DIR/ollama" != "$BINDIR/ollama" ] ; then
status "Making ollama accessible in the PATH in $BINDIR"
$SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama"
fi
fi fi
# Check for NVIDIA JetPack systems with additional downloads # Check for NVIDIA JetPack systems with additional downloads
...@@ -230,31 +221,11 @@ if ! check_gpu lspci nvidia && ! check_gpu lshw nvidia && ! check_gpu lspci amdg ...@@ -230,31 +221,11 @@ if ! check_gpu lspci nvidia && ! check_gpu lshw nvidia && ! check_gpu lspci amdg
fi fi
if check_gpu lspci amdgpu || check_gpu lshw amdgpu; then if check_gpu lspci amdgpu || check_gpu lshw amdgpu; then
if [ $BUNDLE -ne 0 ]; then status "Downloading Linux ROCm ${ARCH} bundle"
status "Downloading Linux ROCm ${ARCH} bundle" curl --fail --show-error --location --progress-bar \
curl --fail --show-error --location --progress-bar \ "https://ollama.com/download/ollama-linux-${ARCH}-rocm.tgz${VER_PARAM}" | \
"https://ollama.com/download/ollama-linux-${ARCH}-rocm.tgz${VER_PARAM}" | \ $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
$SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
install_success
status "AMD GPU ready."
exit 0
fi
# Look for pre-existing ROCm v6 before downloading the dependencies
for search in "${HIP_PATH:-''}" "${ROCM_PATH:-''}" "/opt/rocm" "/usr/lib64"; do
if [ -n "${search}" ] && [ -e "${search}/libhipblas.so.2" -o -e "${search}/lib/libhipblas.so.2" ]; then
status "Compatible AMD GPU ROCm library detected at ${search}"
install_success
exit 0
fi
done
status "Downloading AMD GPU dependencies..."
$SUDO rm -rf /usr/share/ollama/lib
$SUDO chmod o+x /usr/share/ollama
$SUDO install -o ollama -g ollama -m 755 -d /usr/share/ollama/lib/rocm
curl --fail --show-error --location --progress-bar "https://ollama.com/download/ollama-linux-amd64-rocm.tgz${VER_PARAM}" \
| $SUDO tar zx --owner ollama --group ollama -C /usr/share/ollama/lib/rocm .
install_success install_success
status "AMD GPU ready." status "AMD GPU ready."
exit 0 exit 0
......
...@@ -27,7 +27,6 @@ import ( ...@@ -27,7 +27,6 @@ import (
"golang.org/x/sync/errgroup" "golang.org/x/sync/errgroup"
"github.com/ollama/ollama/api" "github.com/ollama/ollama/api"
"github.com/ollama/ollama/build"
"github.com/ollama/ollama/discover" "github.com/ollama/ollama/discover"
"github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/llm" "github.com/ollama/ollama/llm"
...@@ -1264,13 +1263,16 @@ func Serve(ln net.Listener) error { ...@@ -1264,13 +1263,16 @@ func Serve(ln net.Listener) error {
srvr.Close() srvr.Close()
schedDone() schedDone()
sched.unloadAllRunners() sched.unloadAllRunners()
runners.Cleanup(build.EmbedFS)
done() done()
}() }()
if _, err := runners.Refresh(build.EmbedFS); err != nil { // Locate and log what runners are present at startup
return fmt.Errorf("unable to initialize llm runners %w", err) var runnerNames []string
for v := range runners.GetAvailableServers() {
runnerNames = append(runnerNames, v)
} }
slog.Info("Dynamic LLM libraries", "runners", runnerNames)
slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
s.sched.Run(schedCtx) s.sched.Run(schedCtx)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment