build: Make target improvements (#7499)

* llama: wire up builtin runner This adds a new entrypoint into the ollama CLI to run the cgo built runner. On Mac arm64, this will have GPU support, but on all other platforms it will be the lowest common denominator CPU build. After we fully transition to the new Go runners more tech-debt can be removed and we can stop building the "default" runner via make and rely on the builtin always. * build: Make target improvements Add a few new targets and help for building locally. This also adjusts the runner lookup to favor local builds, then runners relative to the executable, and finally payloads. * Support customized CPU flags for runners This implements a simplified custom CPU flags pattern for the runners. When built without overrides, the runner name contains the vector flag we check for (AVX) to ensure we don't try to run on unsupported systems and crash. If the user builds a customized set, we omit the naming scheme and don't check for compatibility. This avoids checking requirements at runtime, so that logic has been removed as well. This can be used to build GPU runners with no vector flags, or CPU/GPU runners with additional flags (e.g. AVX512) enabled. * Use relative paths If the user checks out the repo in a path that contains spaces, make gets really confused so use relative paths for everything in-repo to avoid breakage. * Remove payloads from main binary * install: clean up prior libraries This removes support for v0.3.6 and older versions (before the tar bundle) and ensures we clean up prior libraries before extracting the bundle(s). Without this change, runners and dependent libraries could leak when we update and lead to subtle runtime errors.

build: Make target improvements (#7499)
* llama: wire up builtin runner This adds a new entrypoint into the ollama CLI to run the cgo built runner. On Mac arm64, this will have GPU support, but on all other platforms it will be the lowest common denominator CPU build. After we fully transition to the new Go runners more tech-debt can be removed and we can stop building the "default" runner via make and rely on the builtin always. * build: Make target improvements Add a few new targets and help for building locally. This also adjusts the runner lookup to favor local builds, then runners relative to the executable, and finally payloads. * Support customized CPU flags for runners This implements a simplified custom CPU flags pattern for the runners. When built without overrides, the runner name contains the vector flag we check for (AVX) to ensure we don't try to run on unsupported systems and crash. If the user builds a customized set, we omit the naming scheme and don't check for compatibility. This avoids checking requirements at runtime, so that logic has been removed as well. This can be used to build GPU runners with no vector flags, or CPU/GPU runners with additional flags (e.g. AVX512) enabled. * Use relative paths If the user checks out the repo in a path that contains spaces, make gets really confused so use relative paths for everything in-repo to avoid breakage. * Remove payloads from main binary * install: clean up prior libraries This removes support for v0.3.6 and older versions (before the tar bundle) and ensures we clean up prior libraries before extracting the bundle(s). Without this change, runners and dependent libraries could leak when we update and lead to subtle runtime errors.
4879a234 · Daniel Hiltgen · GitHub · 63269668 · 4879a234 · 4879a234
Unverified Commit 4879a234 authored Dec 10, 2024 by Daniel Hiltgen Committed by GitHub Dec 10, 2024
15 changed files
--- a/llama/make/Makefile.rocm
+++ b/llama/make/Makefile.rocm
@@ -4,22 +4,25 @@
 # unlike CUDA where we'll build both a v11 and v12 variant.

 include make/common-defs.make
+include make/rocm-defs.make

 HIP_ARCHS_COMMON := gfx900 gfx940 gfx941 gfx942 gfx1010 gfx1012 gfx1030 gfx1100 gfx1101 gfx1102
 HIP_ARCHS_LINUX := gfx906:xnack- gfx908:xnack- gfx90a:xnack+ gfx90a:xnack-

 ifeq ($(OS),windows)
-	GPU_LIB_DIR_WIN := $(shell cygpath -m -s "$(HIP_PATH)/bin")
-	CGO_EXTRA_LDFLAGS_WIN := -L$(shell cygpath -m -s "$(HIP_PATH)/lib")
-	GPU_COMPILER_WIN := $(HIP_PATH)/bin/hipcc.bin.exe
-	GPU_COMPILER:=$(GPU_COMPILER_WIN)
+	GPU_LIB_DIR := $(shell cygpath -m -s "$(HIP_PATH)/bin")
+	CGO_EXTRA_LDFLAGS := -L$(shell cygpath -m -s "$(HIP_PATH)/lib")
+	HIP_ARCHS?=$(HIP_ARCHS_COMMON)
+	GPU_COMPILER_CFLAGS = $(CFLAGS) -D_WIN32_WINNT=0x602
+	GPU_COMPILER_CXXFLAGS = $(CXXFLAGS) -D_WIN32_WINNT=0x602
 else ifeq ($(OS),linux)
-	GPU_LIB_DIR_LINUX := $(HIP_PATH)/lib
-	GPU_COMPILER_LINUX := $(shell X=$$(which hipcc 2>/dev/null) && echo $$X)
-	GPU_COMPILER:=$(GPU_COMPILER_LINUX)
-	ROCM_TRANSITIVE_LIBS_INITIAL = $(sort $(shell ldd $(GPU_LIBS) | grep "=>" | cut -f2 -d= | cut -f2 -d' '  | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf))
-	GPU_TRANSITIVE_LIBS = $(sort $(shell readlink -f $(ROCM_TRANSITIVE_LIBS_INITIAL)) $(ROCM_TRANSITIVE_LIBS_INITIAL))
+	GPU_LIB_DIR := $(strip $(shell ls -d $(HIP_PATH)/lib64 2>/dev/null || ls -d $(HIP_PATH)/lib 2>/dev/null))
+	CGO_EXTRA_LDFLAGS := -L$(GPU_LIB_DIR)
+	HIP_ARCHS?=$(HIP_ARCHS_COMMON) $(HIP_ARCHS_LINUX)
+	GPU_COMPILER_CFLAGS = $(CFLAGS) -fPIC -D_GNU_SOURCE
+	GPU_COMPILER_CXXFLAGS = $(CXXFLAGS) -fPIC -D_GNU_SOURCE
 endif
+GPU_COMPILER=$(HIP_COMPILER)

 # TODO future multi-variant support for ROCm
 # ROCM_VERSION = $(subst $(space),.,$(wordlist 1,2,$(subst .,$(space),$(word 3,$(subst -,$(space),$(filter HIP version: %,$(shell $(GPU_COMPILER) --version)))))))
@@ -31,36 +34,37 @@ GPU_RUNNER_GO_TAGS := rocm
 GPU_RUNNER_NAME := rocm$(GPU_RUNNER_VARIANT)
 GPU_RUNNER_DRIVER_LIB_LINK := -lamdhip64
 GPU_RUNNER_LIBS_SHORT := hipblas rocblas
-GPU_PATH_ROOT_WIN=$(dir $(GPU_LIB_DIR_WIN))
-GPU_PATH_ROOT_LINUX=$(dir $(GPU_LIB_DIR_LINUX))
-GPU_COMPILER_CFLAGS_WIN = $(CFLAGS) -D_WIN32_WINNT=0x602
-GPU_COMPILER_CFLAGS_LINUX = $(CFLAGS) -fPIC -D_GNU_SOURCE
-GPU_COMPILER_CXXFLAGS_WIN = $(CXXFLAGS) -D_WIN32_WINNT=0x602
-GPU_COMPILER_CXXFLAGS_LINUX = $(CXXFLAGS) -fPIC -D_GNU_SOURCE

-GPU_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT),$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT))))
+# Note: ROCm requires an extra step of discovering and copying the transitive dependencies on linux
 ifeq ($(OS),windows)
-	ROCM_DIST_DEPS_DIR = $(abspath $(SRC_DIR)/../dist/$(OS)-$(ARCH))/lib/ollama
+	ROCM_DIST_DEPS_DIR = ./dist/$(OS)-$(ARCH)/lib/ollama
+	GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT),$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
 else ifeq ($(OS),linux)
-	ROCM_DIST_DEPS_DIR = $(abspath $(SRC_DIR)/../dist/$(OS)-$(ARCH)-rocm)/lib/ollama
+	ROCM_DIST_DEPS_DIR = ./dist/$(OS)-$(ARCH)-rocm/lib/ollama
+	GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
+	ROCM_TRANSITIVE_LIBS_INITIAL = $(sort $(shell ldd $(GPU_LIBS) | grep "=>" | cut -f2 -d= | cut -f2 -d' '  | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf))
+	GPU_TRANSITIVE_LIBS = $(sort $(shell readlink -f $(ROCM_TRANSITIVE_LIBS_INITIAL)) $(ROCM_TRANSITIVE_LIBS_INITIAL))
+	FILTERED_GPU_TRANSITIVE_LIBS=$(sort $(filter-out $(addprefix %,$(notdir $(GPU_LIBS))), $(GPU_TRANSITIVE_LIBS)))
+	GPU_DIST_TRANSITIVE_LIB_DEPS = $(sort $(addprefix $(ROCM_DIST_DEPS_DIR)/,$(notdir $(FILTERED_GPU_TRANSITIVE_LIBS))))
 endif
-GPU_DIST_DEPS_LIBS= $(sort $(addprefix $(ROCM_DIST_DEPS_DIR)/,$(notdir $(GPU_LIBS)) $(notdir $(GPU_TRANSITIVE_LIBS))))
+GPU_DIST_LIB_DEPS= $(sort $(addprefix $(ROCM_DIST_DEPS_DIR)/,$(notdir $(GPU_LIBS))))
 ROCBLAS_DIST_DEP_MANIFEST = $(ROCM_DIST_DEPS_DIR)/rocblas/library/TensileManifest.txt

 ifeq ($(OS),linux)
 	GPU_COMPILER_FPIC := -fPIC -Wno-unused-function -std=gnu++11
-	GPU_RUNNER_ARCH_FLAGS := $(foreach arch, $(HIP_ARCHS_COMMON) $(HIP_ARCHS_LINUX), --offload-arch=$(arch))
 else ifeq ($(OS),windows)
 	GPU_COMPILER_FPIC := -Xclang --dependent-lib=msvcrt
-	GPU_RUNNER_ARCH_FLAGS := $(foreach arch, $(HIP_ARCHS_COMMON), --offload-arch=$(arch))
 endif
+GPU_RUNNER_ARCH_FLAGS := $(foreach arch,$(subst ;,$(space),$(HIP_ARCHS)),--offload-arch=$(arch))
+
+# HIPCC uses clang which requires avx512 -> -mavx512f -mavx512dq -mavx512bw
+GPU_VECTOR_FLAGS=$(if $(filter avx512,$(GPU_RUNNER_CPU_FLAGS)),avx512f avx512dq avx512bw) $(filter-out avx512,$(GPU_RUNNER_CPU_FLAGS))

 GPU_COMPILER_CUFLAGS = \
 	$(GPU_COMPILER_FPIC) \
-	$(addprefix -m,$(GPU_RUNNER_CPU_FLAGS)) \
+	$(addprefix -m,$(GPU_VECTOR_FLAGS)) \
 	-mf16c \
 	-mfma \
-	-parallel-jobs=2 \
 	-c \
 	-O3 \
 	-DGGML_USE_CUDA \
@@ -90,7 +94,7 @@ GPU_COMPILER_CUFLAGS = \
 	-Wno-pass-failed \
 	-Wno-deprecated-declarations \
 	-Wno-unused-result \
-	-I.
+	-I./llama/

 # Workaround buggy P2P copy on some windows multi-GPU setups
 # This workaround breaks linux systems with small system RAM, so only enable on windows
@@ -101,9 +105,13 @@ endif
 include make/gpu.make

 # Adjust the rules from gpu.make to handle the ROCm dependencies properly
-$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(ROCBLAS_DIST_DEP_MANIFEST)
+$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(ROCBLAS_DIST_DEP_MANIFEST) $(GPU_DIST_TRANSITIVE_LIB_DEPS)
 $(ROCBLAS_DIST_DEP_MANIFEST):
 	@-mkdir -p $(dir $@)
 	@echo "Copying rocblas library..."
-	cd $(GPU_LIB_DIR)/rocblas/library/ && tar cf - . | (cd $(dir $@) && tar xf - )
+	(cd $(GPU_LIB_DIR)/rocblas/library/ && tar cf - . ) | (cd $(dir $@) && tar xf - )
 	@echo "rocblas library copy complete"
+
+$(GPU_DIST_TRANSITIVE_LIB_DEPS):
+	@-mkdir -p $(dir $@)
+	$(CP) $(dir $(filter %$(notdir $@),$(GPU_TRANSITIVE_LIBS)))/$(notdir $@) $(dir $@)
--- a/llama/make/Makefile.sync
+++ b/llama/make/Makefile.sync
 # Helpers for managing our vendored llama.cpp repo and patch set

-REPO_ROOT:=$(dir $(patsubst %/,%,$(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))))
-DST_DIR:=$(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))
+REPO_ROOT:=./
+DEST_DIR:=./llama/

-include $(REPO_ROOT)llama/vendoring
+include $(DEST_DIR)vendoring

-LLAMACPP_REPO := $(REPO_ROOT)llama/vendor/
+LLAMACPP_REPO := ./llama/vendor/

-LLAMACPP_PATCH_DIR := $(DST_DIR)patches/
+# Relative to the vendor dir
+VENDOR_RELATIVE_PATCH_DIR := ../patches/


 help-sync:
 	@echo "The following make targets will help you update llama.cpp to a new base commit, or work on new features/fixes"
 	@echo ""
-	@echo "\tmake apply-patches   # Establish the tracking repo if not already present, reset to the base commit, and apply our patch set"
-	@echo "\tmake sync            # Vendor llama.cpp and ggml from the tracking repo working tree"
-	@echo "\tmake create-patches  # Generate the patch set based on the current commits in the tracking repo since the base commit"
+	@echo "	make apply-patches	# Establish the tracking repo if not already present, reset to the base commit, and apply our patch set"
+	@echo "	make sync		# Vendor llama.cpp and ggml from the tracking repo working tree"
+	@echo "	make sync-clean		# Remove all vendored files"
+	@echo "	make create-patches	# Generate the patch set based on the current commits in the tracking repo since the base commit"
 	@echo ""
-	@echo "For more details on the workflow, see the Vendoring section in ../docs/development.md"
+	@echo "For more details on the workflow, see the Vendoring section in 'docs/development.md'"

 apply-patches: $(LLAMACPP_REPO)
 	@if ! git -C $(LLAMACPP_REPO) --no-pager diff --exit-code ; then \
@@ -29,7 +31,7 @@ apply-patches: $(LLAMACPP_REPO)
 	@git -C $(LLAMACPP_REPO) checkout -q $(LLAMACPP_BASE_COMMIT) || \
 		git -C $(LLAMACPP_REPO) fetch --all && git -C $(LLAMACPP_REPO) checkout -q $(LLAMACPP_BASE_COMMIT)
 	@echo "Applying ollama patches..."
-	@git -c 'user.name=nobody' -c 'user.email=<>' -C $(LLAMACPP_REPO) am -3 $(LLAMACPP_PATCH_DIR)/*.patch || \
+	@cd $(LLAMACPP_REPO) && git -c 'user.name=nobody' -c 'user.email=<>' am -3 $(VENDOR_RELATIVE_PATCH_DIR)*.patch || \
 		echo "Please resolve the conflicts in $(LLAMACPP_REPO), and run 'git am --continue' to continue applying subsequent patches"
 	@echo ""
 	@echo "The tracking repo $(LLAMACPP_REPO) is now in a detached state with all patches applied."
@@ -44,7 +46,7 @@ create-patches: $(LLAMACPP_REPO)
  		echo "ERROR: Your llama.cpp repo is dirty.  You must commit any pending changes for format-patch to generate patches"; \
  		exit 1; \
 	fi
-	git -C $(LLAMACPP_REPO) format-patch --no-signature --no-numbered --zero-commit -o $(LLAMACPP_PATCH_DIR) $(LLAMACPP_BASE_COMMIT)
+	@cd $(LLAMACPP_REPO) && git format-patch --no-signature --no-numbered --zero-commit -o $(VENDOR_RELATIVE_PATCH_DIR) $(LLAMACPP_BASE_COMMIT)

 # Vendoring template logic
 EXCLUDED_FILES=sgemm.cpp sgemm.h sampling_ext.cpp sampling_ext.h stb_image.h json.hpp llama_darwin.c base64.hpp
@@ -86,12 +88,12 @@ LLAMACPP_FILES=\
 	include/llama.h \
 	ggml/src/llamafile/sgemm.cpp \
 	ggml/src/llamafile/sgemm.h
-$(foreach name,$(LLAMACPP_FILES),$(eval $(call vendor_file,$(name),$(DST_DIR))))
+$(foreach name,$(LLAMACPP_FILES),$(eval $(call vendor_file,$(name),$(DEST_DIR))))

 # llama.cpp files -> llama/llamafile
 LLAMAFILE_FILES= \
 	ggml/src/llamafile/sgemm.h
-$(foreach name,$(LLAMAFILE_FILES),$(eval $(call vendor_file,$(name),$(DST_DIR)llamafile/)))
+$(foreach name,$(LLAMAFILE_FILES),$(eval $(call vendor_file,$(name),$(DEST_DIR)llamafile/)))

 # ggml files -> llama/
 GGML_FILES= \
@@ -115,10 +117,10 @@ GGML_FILES= \
 	ggml/src/ggml-cpu-impl.h \
 	ggml/include/ggml-blas.h \
 	ggml/src/ggml-blas.cpp
-$(foreach name,$(GGML_FILES),$(eval $(call vendor_file,$(name),$(DST_DIR))))
+$(foreach name,$(GGML_FILES),$(eval $(call vendor_file,$(name),$(DEST_DIR))))

 # TODO generalize renaming pattern if we have more of these
-$(DST_DIR)ggml-metal_darwin_arm64.m : $(LLAMACPP_REPO)ggml/src/ggml-metal.m
+$(DEST_DIR)ggml-metal_darwin_arm64.m : $(LLAMACPP_REPO)ggml/src/ggml-metal.m
 	@echo "vendoring $(subst $(LLAMACPP_REPO),,$<)"; \
 		mkdir -p $(dir $@) && \
 		echo "/**" > $@ && \
@@ -128,20 +130,20 @@ $(DST_DIR)ggml-metal_darwin_arm64.m : $(LLAMACPP_REPO)ggml/src/ggml-metal.m
 		echo " */" >> $@ && \
 		echo "" >> $@ && \
 		cat $< >> $@
-VENDORED_FILES += $(DST_DIR)ggml-metal_darwin_arm64.m
+VENDORED_FILES += $(DEST_DIR)ggml-metal_darwin_arm64.m

 # ggml-cuda -> llama/ggml-cuda/
 GGML_CUDA_FILES= ggml/src/ggml-cuda/*.cu ggml/src/ggml-cuda/*.cuh
 GGML_CUDA_FILES_EXPANDED = $(addprefix ggml/src/ggml-cuda/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_CUDA_FILES)))))
-$(foreach name,$(GGML_CUDA_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DST_DIR)ggml-cuda/)))
+$(foreach name,$(GGML_CUDA_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DEST_DIR)ggml-cuda/)))

 GGML_TEMPLATE_FILES= ggml/src/ggml-cuda/template-instances/*.cu
 GGML_TEMPLATE_FILES_EXPANDED = 	$(addprefix ggml/src/ggml-cuda/template-instances/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_TEMPLATE_FILES)))))
-$(foreach name,$(GGML_TEMPLATE_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DST_DIR)ggml-cuda/template-instances/)))
+$(foreach name,$(GGML_TEMPLATE_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DEST_DIR)ggml-cuda/template-instances/)))

 GGML_VENDOR_FILES= ggml/src/ggml-cuda/vendors/*.h
 GGML_VENDOR_FILES_EXPANDED=$(addprefix ggml/src/ggml-cuda/vendors/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_VENDOR_FILES)))))
-$(foreach name,$(GGML_VENDOR_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DST_DIR)ggml-cuda/vendors/)))
+$(foreach name,$(GGML_VENDOR_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DEST_DIR)ggml-cuda/vendors/)))

 # llava -> llama/
 LAVA_FILES= \
@@ -163,27 +165,30 @@ LAVA_FILES+= \
 	common/json-schema-to-grammar.cpp \
 	common/json-schema-to-grammar.h \
 	common/base64.hpp
-$(foreach name,$(LAVA_FILES),$(eval $(call vendor_file,$(name),$(DST_DIR))))
+$(foreach name,$(LAVA_FILES),$(eval $(call vendor_file,$(name),$(DEST_DIR))))

-$(DST_DIR)build-info.cpp:
+$(DEST_DIR)build-info.cpp:
 	@echo "Generating $@"
 	@echo "int LLAMA_BUILD_NUMBER = 0;" > $@
 	@echo "char const *LLAMA_COMMIT = \"$(LLAMACPP_BASE_COMMIT)\";" >> $@
 	@echo "char const *LLAMA_COMPILER = \"\";" >> $@
 	@echo "char const *LLAMA_BUILD_TARGET = \"\";" >> $@
-VENDORED_FILES += $(DST_DIR)build-info.cpp
+VENDORED_FILES += $(DEST_DIR)build-info.cpp


 sync: $(LLAMACPP_REPO) .WAIT $(VENDORED_FILES) .WAIT remove-stale-files

+sync-clean:
+	rm -f $(VENDORED_FILES) $(EXTRA_NATIVE_FILES)
+
 PATS=*.c *.h *.cpp *.m *.metal *.cu *.cuh
-NATIVE_DIRS=$(DST_DIR) $(DST_DIR)llamafile/ $(DST_DIR)ggml-cuda/ $(DST_DIR)ggml-cuda/template-instances/ $(DST_DIR)ggml-cuda/vendors/
+NATIVE_DIRS=$(DEST_DIR) $(DEST_DIR)llamafile/ $(DEST_DIR)ggml-cuda/ $(DEST_DIR)ggml-cuda/template-instances/ $(DEST_DIR)ggml-cuda/vendors/
 ALL_NATIVE_FILES=$(foreach dir,$(NATIVE_DIRS),$(wildcard $(addprefix $(dir),$(PATS))))
-EXTRA_NATIVE_FILES=$(filter-out $(VENDORED_FILES) $(addprefix $(DST_DIR),$(OLLAMA_NATIVE_FILES)), $(ALL_NATIVE_FILES))
+EXTRA_NATIVE_FILES=$(filter-out $(VENDORED_FILES) $(addprefix $(DEST_DIR),$(OLLAMA_NATIVE_FILES)), $(ALL_NATIVE_FILES))
 remove-stale-files:
 	@rm -f $(EXTRA_NATIVE_FILES)

-.PHONY: help-sync apply-patches sync create-patches remove-stale-fails .WAIT 
+.PHONY: help-sync apply-patches sync sync-clean create-patches remove-stale-files .WAIT


 # Handy debugging for make variables

--- a/make/Makefile.test
+++ b/make/Makefile.test
+# Targets to assist in running tests
+
+include make/common-defs.make
+
+# These targets are commands, not files. Declare them phony so that a file or
+# directory sharing a name with one of them (the integration recipe itself refers
+# to an ./integration directory) can never make the target look up to date.
+.PHONY: test integration lint
+
+# NOTE(review): each recipe below first changes to the parent directory, while the
+# include above and OLLAMA_EXE are written relative to the directory make is invoked
+# from - confirm that "cd .." still lands in the intended Go module root.
+
+# Run the Go tests for every package.
+test:
+	cd .. && go test ./...
+
+# Run the integration tests verbosely; needs an already built ollama binary.
+integration: $(OLLAMA_EXE)
+	cd .. && go test --tags=integration ./integration -v
+
+# Run golangci-lint verbosely.
+lint:
+	cd .. && golangci-lint run -v
+
+# Note: in this makefile we error instead of building to allow more fine-grain control of testing flows
+$(OLLAMA_EXE):
+	@echo ""
+	@echo "ERROR: You must build ollama first - use 'make all' to build the ollama binaries"
+	@echo ""
+	@exit 1
\ No newline at end of file
--- a/llama/make/common-defs.make
+++ b/llama/make/common-defs.make
@@ -21,37 +21,43 @@ export CGO_CXXFLAGS_ALLOW = -mfma|-mf16c
 export HIP_PLATFORM = amd
 export CGO_ENABLED=1

-SRC_DIR := $(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))
-BUILD_DIR = $(SRC_DIR)build/$(OS)-$(ARCH)
-DIST_BASE = $(abspath $(SRC_DIR)/../dist/$(OS)-$(ARCH))
+BUILD_DIR = ./llama/build/$(OS)-$(ARCH)
+DIST_BASE = ./dist/$(OS)-$(ARCH)
+
+ifeq ($(OS),windows)
+	# Absolute paths with cygpath to convert to 8.3 without spaces
+	PWD="$(shell pwd)"
+	DIST_OLLAMA_EXE=$(DIST_BASE)/ollama$(EXE_EXT)
+else
+	CCACHE:=$(shell command -v ccache 2>/dev/null || echo "")
+	DIST_OLLAMA_EXE=$(DIST_BASE)/bin/ollama$(EXE_EXT)
+endif
 DIST_LIB_DIR = $(DIST_BASE)/lib/ollama
 RUNNERS_DIST_DIR = $(DIST_LIB_DIR)/runners
-RUNNERS_PAYLOAD_DIR = $(abspath $(SRC_DIR)/../build/$(OS)/$(ARCH))
 RUNNERS_BUILD_DIR = $(BUILD_DIR)/runners
-DEFAULT_RUNNER := $(if $(and $(filter darwin,$(OS)),$(filter arm64,$(ARCH))),metal,cpu)
-GZIP:=$(shell command -v pigz 2>/dev/null || echo "gzip")
-ifneq ($(OS),windows)
-	CCACHE:=$(shell command -v ccache 2>/dev/null || echo "")
-endif
 VERSION?=$(shell git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")

 # Conditionally enable ccache for cgo builds too
 ifneq ($(CCACHE),)
-	CC=$(CCACHE) gcc
-	CXX=$(CCACHE) g++
+	CC?=$(CCACHE) gcc
+	CXX?=$(CCACHE) g++
 	export CC
 	export CXX
 endif


-# Override in environment space separated to tune GPU runner CPU vector flags
+# Override in environment to tune CPU vector flags
 ifeq ($(ARCH),amd64)
-	GPU_RUNNER_CPU_FLAGS ?= avx
+ifeq ($(origin CUSTOM_CPU_FLAGS),undefined)
+	GPU_RUNNER_CPU_FLAGS=avx
+	GPU_RUNNER_EXTRA_VARIANT=_avx
+else
+	GPU_RUNNER_CPU_FLAGS=$(subst $(comma),$(space),$(CUSTOM_CPU_FLAGS))
+endif
 endif

 ifeq ($(OS),windows)
 	CP := cp
-	SRC_DIR := $(shell cygpath -m -s "$(SRC_DIR)")
 	OBJ_EXT := obj
 	SHARED_EXT := dll
 	EXE_EXT := .exe
@@ -63,22 +69,23 @@ ifneq ($(HIP_PATH),)
 	export HIP_PATH
 endif
 else ifeq ($(OS),linux)
-	CP := cp -af
+	CP := cp -df
 	OBJ_EXT := o
 	SHARED_EXT := so
 	SHARED_PREFIX := lib
 	CPU_FLAG_PREFIX := -m
-	HIP_PATH?=/opt/rocm
 else
 	OBJ_EXT := o
 	SHARED_EXT := so
 	CPU_FLAG_PREFIX := -m
-	CP := cp -af
+	CP := cp -df
 endif

 COMMON_SRCS := \
-	$(wildcard *.c) \
-	$(wildcard *.cpp)
+	$(wildcard ./llama/*.c) \
+	$(wildcard ./llama/*.cpp)
 COMMON_HDRS := \
-	$(wildcard *.h) \
-	$(wildcard *.hpp)
+	$(wildcard ./llama/*.h) \
+	$(wildcard ./llama/*.hpp)
+
+OLLAMA_EXE=./ollama$(EXE_EXT)
\ No newline at end of file
--- a/make/cuda-v11-defs.make
+++ b/make/cuda-v11-defs.make
+# Common definitions for the various Makefiles which set cuda settings
+# No rules are defined here so this is safe to include at the beginning of other makefiles
+
+ifeq ($(OS),windows)
+	CUDA_PATH?=$(shell cygpath -m -s "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\" 2>/dev/null)unknown
+	CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\\.." 2>/dev/null))
+	CUDA_11_PATH:=$(shell ls -d $(CUDA_BASE_DIR)/v11.? 2>/dev/null)
+	CUDA_11_COMPILER:=$(wildcard $(CUDA_11_PATH)/bin/nvcc.exe)
+	CUDA_11_LIB_DIR = $(strip $(shell ls -d $(CUDA_11_PATH)/bin 2>/dev/null))
+	CUDA_11_CGO_EXTRA_LDFLAGS = -L"$(CUDA_11_PATH)/lib/x64"
+else ifeq ($(OS),linux)
+	CUDA_PATH?=/usr/local/cuda
+	CUDA_11_PATH:=$(shell ls -d $(CUDA_PATH)-11 2>/dev/null)
+	CUDA_11_COMPILER:=$(wildcard $(CUDA_11_PATH)/bin/nvcc)
+	CUDA_11_LIB_DIR=$(strip $(shell ls -d $(CUDA_11_PATH)/lib64 2>/dev/null || ls -d $(CUDA_11_PATH)/lib 2>/dev/null))
+	CUDA_11_CGO_EXTRA_LDFLAGS = -L"$(CUDA_11_LIB_DIR)" -L"$(CUDA_11_LIB_DIR)/stubs"
+endif
--- a/make/cuda-v12-defs.make
+++ b/make/cuda-v12-defs.make
+# Common definitions for the various Makefiles which set cuda settings
+# No rules are defined here so this is safe to include at the beginning of other makefiles
+
+ifeq ($(OS),windows)
+	CUDA_PATH?=$(shell cygpath -m -s "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\" 2>/dev/null)unknown
+	CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\\.." 2>/dev/null))
+	CUDA_12_PATH:=$(shell ls -d $(CUDA_BASE_DIR)/v12.? 2>/dev/null)
+	CUDA_12_COMPILER:=$(wildcard $(CUDA_12_PATH)/bin/nvcc.exe)
+	CUDA_12_LIB_DIR = $(strip $(shell ls -d $(CUDA_12_PATH)/bin 2>/dev/null))
+	CUDA_12_CGO_EXTRA_LDFLAGS = -L"$(CUDA_12_PATH)/lib/x64"
+else ifeq ($(OS),linux)
+	CUDA_PATH?=/usr/local/cuda
+	CUDA_12_PATH:=$(shell ls -d $(CUDA_PATH)-12 2>/dev/null)
+	CUDA_12_COMPILER:=$(wildcard $(CUDA_12_PATH)/bin/nvcc)
+	CUDA_12_LIB_DIR=$(strip $(shell ls -d $(CUDA_12_PATH)/lib64 2>/dev/null || ls -d $(CUDA_12_PATH)/lib 2>/dev/null))
+	CUDA_12_CGO_EXTRA_LDFLAGS = -L"$(CUDA_12_LIB_DIR)" -L"$(CUDA_12_LIB_DIR)/stubs"
+endif
--- a/llama/make/cuda.make
+++ b/llama/make/cuda.make
@@ -10,27 +10,31 @@ GPU_RUNNER_NAME := cuda$(GPU_RUNNER_VARIANT)
 GPU_RUNNER_GO_TAGS := cuda cuda$(GPU_RUNNER_VARIANT)
 GPU_RUNNER_DRIVER_LIB_LINK := -lcuda
 GPU_RUNNER_LIBS_SHORT := cublas cudart cublasLt
-GPU_LIB_DIR_WIN = $(GPU_PATH_ROOT_WIN)/bin
-GPU_LIB_DIR_LINUX = $(GPU_PATH_ROOT_LINUX)/lib64
-CGO_EXTRA_LDFLAGS_WIN = -L"$(GPU_PATH_ROOT_WIN)/lib/x64"
-GPU_COMPILER_WIN = $(GPU_PATH_ROOT_WIN)/bin/nvcc
-GPU_COMPILER_LINUX = $(GPU_PATH_ROOT_LINUX)/bin/nvcc
-GPU_COMPILER_CFLAGS_WIN = $(CFLAGS) -D_WIN32_WINNT=0x602
-GPU_COMPILER_CFLAGS_LINUX = $(CFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
-GPU_COMPILER_CXXFLAGS_WIN = $(CXXFLAGS) -D_WIN32_WINNT=0x602
-GPU_COMPILER_CXXFLAGS_LINUX = $(CXXFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
-GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT)*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
-GPU_DIST_DEPS_LIBS= $(sort $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_LIBS))))

-ifeq ($(OS),linux)
-	CUDA_PATH?=/usr/local/cuda
-	GPU_COMPILER_FPIC = -fPIC -Wno-unused-function -std=c++11
+ifeq ($(OS),windows)
+	# On windows, nvcc uses msvc which does not support avx512vbmi avx512vnni avx512bf16, but macros can turn them on
+	GPU_VECTOR_FLAGS=$(call uc,$(filter-out avx512bf16,$(filter-out avx512vnni,$(filter-out avx512vbmi,$(GPU_RUNNER_CPU_FLAGS)))))
+	GPU_COMPILER_EXTRA_FLAGS=$(if $(filter avx512vbmi,$(GPU_RUNNER_CPU_FLAGS)),-D__AVX512VBMI__)
+	GPU_COMPILER_EXTRA_FLAGS+=$(if $(filter avx512vnni,$(GPU_RUNNER_CPU_FLAGS)),-D__AVX512VNNI__)
+	GPU_COMPILER_EXTRA_FLAGS+=$(if $(filter avx512bf16,$(GPU_RUNNER_CPU_FLAGS)),-D__AVX512BF16__)
+	GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT),$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
+	GPU_COMPILER_CFLAGS = $(CFLAGS) -D_WIN32_WINNT=0x602
+	GPU_COMPILER_CXXFLAGS = $(CXXFLAGS) -D_WIN32_WINNT=0x602
+else ifeq ($(OS),linux)
+	# On linux, nvcc requires avx512 -> -mavx512f -mavx512dq -mavx512bw
+	GPU_VECTOR_FLAGS=$(if $(filter avx512,$(GPU_RUNNER_CPU_FLAGS)),avx512f avx512dq avx512bw) $(filter-out avx512,$(GPU_RUNNER_CPU_FLAGS))
+	GPU_COMPILER_EXTRA_FLAGS = -fPIC -Wno-unused-function -std=c++11
+	GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
+	GPU_COMPILER_CFLAGS = $(CFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
+	GPU_COMPILER_CXXFLAGS = $(CXXFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
 endif
+GPU_DIST_LIB_DEPS= $(sort $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_LIBS))))
+
 GPU_RUNNER_ARCH_FLAGS := $(foreach arch,$(subst ;,$(space),$(CUDA_ARCHITECTURES)),--generate-code=arch=compute_$(arch)$(comma)code=[compute_$(arch)$(comma)sm_$(arch)]) \
 	-DGGML_CUDA_USE_GRAPHS=1
 GPU_COMPILER_CUFLAGS = \
-	$(GPU_COMPILER_FPIC) \
-	-Xcompiler "$(addprefix $(CPU_FLAG_PREFIX),$(_OS_GPU_RUNNER_CPU_FLAGS))" \
+	$(GPU_COMPILER_EXTRA_FLAGS) \
+	-Xcompiler "$(addprefix $(CPU_FLAG_PREFIX),$(GPU_VECTOR_FLAGS))" \
 	-t2 \
 	-DGGML_CUDA_DMMV_X=32 \
 	-DGGML_CUDA_MMV_Y=1 \
@@ -46,5 +50,5 @@ GPU_COMPILER_CUFLAGS = \
 	-Wno-deprecated-gpu-targets \
 	--forward-unknown-to-host-compiler \
 	-use_fast_math \
-	-I. \
+	-I./llama/  \
 	-O3
--- a/llama/make/gpu.make
+++ b/llama/make/gpu.make
@@ -5,42 +5,22 @@ dummy:
 	$(error This makefile is not meant to build directly, but instead included in other Makefiles that set required variables)
 endif

-ifeq ($(OS),windows)
-	GPU_COMPILER:=$(GPU_COMPILER_WIN)
-	GPU_LIB_DIR:=$(GPU_LIB_DIR_WIN)
-	CGO_EXTRA_LDFLAGS:=$(CGO_EXTRA_LDFLAGS_WIN)
-	GPU_COMPILER_CFLAGS = $(GPU_COMPILER_CFLAGS_WIN)
-	GPU_COMPILER_CXXFLAGS = $(GPU_COMPILER_CXXFLAGS_WIN)
-else ifeq ($(OS),linux)
-	GPU_COMPILER:=$(GPU_COMPILER_LINUX)
-	GPU_LIB_DIR:=$(GPU_LIB_DIR_LINUX)
-	CGO_EXTRA_LDFLAGS:=$(CGO_EXTRA_LDFLAGS_LINUX)
-	GPU_COMPILER_CFLAGS = $(GPU_COMPILER_CFLAGS_LINUX)
-	GPU_COMPILER_CXXFLAGS = $(GPU_COMPILER_CXXFLAGS_LINUX)
-endif
-
-GPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERSION)\" \"-X=github.com/ollama/ollama/llama.CpuFeatures=$(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS))\" $(TARGET_LDFLAGS)"
+GPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERSION)\" \"-X=github.com/ollama/ollama/llama.CpuFeatures=$(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS))\" $(EXTRA_GOLDLAGS) $(TARGET_LDFLAGS)"

 # TODO Unify how we handle dependencies in the dist/packaging and install flow
 # today, cuda is bundled, but rocm is split out.  Should split them each out by runner
 DIST_GPU_RUNNER_DEPS_DIR = $(DIST_LIB_DIR)

-ifeq ($(OS),windows)
-	_OS_GPU_RUNNER_CPU_FLAGS=$(call uc,$(GPU_RUNNER_CPU_FLAGS))
-else ifeq ($(OS),linux)
-	_OS_GPU_RUNNER_CPU_FLAGS=$(GPU_RUNNER_CPU_FLAGS)
-endif

 GPU_RUNNER_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT))))
-DIST_GPU_RUNNER_LIB_DEPS = $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_RUNNER_LIBS)))

 GPU_RUNNER_SRCS := \
-	ggml-cuda.cu \
-	$(filter-out $(wildcard ggml-cuda/fattn*.cu),$(wildcard ggml-cuda/*.cu)) \
-	$(wildcard ggml-cuda/template-instances/mmq*.cu) \
-	ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp ggml-aarch64.c
+	llama/ggml-cuda.cu \
+	$(filter-out $(wildcard llama/ggml-cuda/fattn*.cu),$(wildcard llama/ggml-cuda/*.cu)) \
+	$(wildcard llama/ggml-cuda/template-instances/mmq*.cu) \
+	llama/ggml.c llama/ggml-backend.c llama/ggml-alloc.c llama/ggml-quants.c llama/sgemm.cpp llama/ggml-aarch64.c
 GPU_RUNNER_HDRS := \
-	$(wildcard ggml-cuda/*.cuh)
+	$(wildcard llama/ggml-cuda/*.cuh)


 # Conditional flags and components to speed up developer builds
@@ -49,25 +29,24 @@ ifneq ($(OLLAMA_FAST_BUILD),)
 		-DGGML_DISABLE_FLASH_ATTN
 else
 	GPU_RUNNER_SRCS += \
-		$(wildcard ggml-cuda/fattn*.cu) \
-		$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu) \
-		$(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu) \
-		$(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu) \
-		$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu)
+		$(wildcard llama/ggml-cuda/fattn*.cu) \
+		$(wildcard llama/ggml-cuda/template-instances/fattn-wmma*.cu) \
+		$(wildcard llama/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu) \
+		$(wildcard llama/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu) \
+		$(wildcard llama/ggml-cuda/template-instances/fattn-vec*f16-f16.cu)
 endif

 GPU_RUNNER_OBJS := $(GPU_RUNNER_SRCS:.cu=.$(GPU_RUNNER_NAME).$(OBJ_EXT))
 GPU_RUNNER_OBJS := $(GPU_RUNNER_OBJS:.c=.$(GPU_RUNNER_NAME).$(OBJ_EXT))
 GPU_RUNNER_OBJS := $(addprefix $(BUILD_DIR)/,$(GPU_RUNNER_OBJS:.cpp=.$(GPU_RUNNER_NAME).$(OBJ_EXT)))

-DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)))
-ifneq ($(OS),windows)
-PAYLOAD_RUNNERS = $(addprefix $(RUNNERS_PAYLOAD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT).gz,$(GPU_RUNNER_NAME)))
-endif
-BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)))
+DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)))
+BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)))
+

+$(GPU_RUNNER_NAME): $(BUILD_RUNNERS) 

-$(GPU_RUNNER_NAME): $(BUILD_RUNNERS) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS)
+dist: $(DIST_RUNNERS)

 # Build targets
 $(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cu
@@ -79,11 +58,11 @@ $(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.c
 $(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cpp
 	@-mkdir -p $(dir $@)
 	$(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CXXFLAGS) -o $@ $<
-$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): TARGET_CGO_LDFLAGS = -L"$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/" $(CGO_EXTRA_LDFLAGS)
-$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
+$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): TARGET_CGO_LDFLAGS = $(CGO_EXTRA_LDFLAGS) -L"$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/"
+$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) ./llama/*.go ./llama/runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
 	@-mkdir -p $(dir $@)
-	GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie  $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./runner
-$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(DIST_GPU_RUNNER_LIB_DEPS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS)
+	GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./cmd/runner
+$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS)
 	@-mkdir -p $(dir $@)
 	$(CCACHE) $(GPU_COMPILER) --shared -L$(GPU_LIB_DIR) $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@

@@ -91,27 +70,16 @@ $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).
 $(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
 	@-mkdir -p $(dir $@)
 	$(CP) $< $@
-$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(DIST_LIB_DIR)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) $(GPU_DIST_DEPS_LIBS)
-$(DIST_LIB_DIR)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT)
+$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) $(GPU_DIST_LIB_DEPS)
+$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT)
 	@-mkdir -p $(dir $@)
 	$(CP) $< $@
-$(DIST_GPU_RUNNER_LIB_DEPS): 
+$(GPU_DIST_LIB_DEPS):
 	@-mkdir -p $(dir $@)
 	$(CP) $(GPU_LIB_DIR)/$(notdir $@) $(dir $@)
-$(GPU_DIST_DEPS_LIBS): 
-	@-mkdir -p $(dir $@)
-	$(CP) $(dir $(filter %$(notdir $@),$(GPU_LIBS) $(GPU_TRANSITIVE_LIBS)))/$(notdir $@) $(dir $@)
-
-# Payload targets
-$(RUNNERS_PAYLOAD_DIR)/%/ollama_llama_server.gz: $(RUNNERS_BUILD_DIR)/%/ollama_llama_server 
-	@-mkdir -p $(dir $@)
-	${GZIP} --best -c $< > $@
-$(RUNNERS_PAYLOAD_DIR)/$(GPU_RUNNER_NAME)/%.gz: $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/%
-	@-mkdir -p $(dir $@)
-	${GZIP} --best -c $< > $@

 clean: 
-	rm -f $(GPU_RUNNER_OBJS) $(BUILD_RUNNERS) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS)
+	rm -f $(GPU_RUNNER_OBJS) $(BUILD_RUNNERS) $(DIST_RUNNERS)

 .PHONY: clean $(GPU_RUNNER_NAME)


--- a/make/rocm-defs.make
+++ b/make/rocm-defs.make
+# Common definitions for the various Makefiles which set ROCm settings
+# No rules are defined here so this is safe to include at the beginning of other makefiles
+
+# Locate the HIP compiler for the host OS. HIP_COMPILER is left empty when
+# hipcc is not found. The assignments are indented with spaces, not tabs, so
+# they can never be parsed as recipe lines if this file is included after a rule.
+ifeq ($(OS),windows)
+  # HIP_PATH is not defaulted here; it must already be set (e.g. in the environment)
+  HIP_COMPILER:=$(wildcard $(HIP_PATH)/bin/hipcc.bin.exe)
+else ifeq ($(OS),linux)
+  # Default to /opt/rocm when it exists; $(wildcard) avoids forking a shell
+  # each time this recursively-expanded (?=) variable is referenced
+  HIP_PATH?=$(wildcard /opt/rocm)
+  HIP_COMPILER:=$(wildcard $(HIP_PATH)/bin/hipcc)
+endif
--- a/runners/common.go
+++ b/runners/common.go
 package runners

 import (
-	"compress/gzip"
-	"errors"
-	"fmt"
-	"io"
-	"io/fs"
 	"log/slog"
 	"os"
 	"path/filepath"
 	"runtime"
 	"slices"
-	"strconv"
 	"strings"
 	"sync"
-	"syscall"

-	"golang.org/x/sync/errgroup"
+	"golang.org/x/sys/cpu"

-	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 )

-const (
-	binGlob = "*/*/*/*"
-)
-
 var (
-	lock       sync.Mutex
 	runnersDir = ""
+	once       = sync.Once{}
 )

-// Return the location where runners are stored
-// If runners are payloads, this will either extract them
-// or refresh them if any have disappeared due to tmp cleaners
-func Refresh(payloadFS fs.FS) (string, error) {
-	lock.Lock()
-	defer lock.Unlock()
-	var err error
+type CPUCapability uint32

-	// Wire up extra logging on our first load
-	if runnersDir == "" {
-		defer func() {
-			var runners []string
-			for v := range GetAvailableServers(runnersDir) {
-				runners = append(runners, v)
-			}
-			slog.Info("Dynamic LLM libraries", "runners", runners)
-			slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
-		}()
-	}
+// Override at build time when building base GPU runners
+// var GPURunnerCPUCapability = CPUCapabilityAVX

-	if hasPayloads(payloadFS) {
-		if runnersDir == "" {
-			runnersDir, err = extractRunners(payloadFS)
-		} else {
-			err = refreshRunners(payloadFS, runnersDir)
-		}
-	} else if runnersDir == "" {
-		runnersDir, err = locateRunners()
-	}
-
-	return runnersDir, err
-}
+const (
+	CPUCapabilityNone CPUCapability = iota
+	CPUCapabilityAVX
+	CPUCapabilityAVX2
+	// TODO AVX512
+)

-func Cleanup(payloadFS fs.FS) {
-	lock.Lock()
-	defer lock.Unlock()
-	if hasPayloads(payloadFS) && runnersDir != "" {
-		// We want to fully clean up the tmpdir parent of the payloads dir
-		tmpDir := filepath.Clean(filepath.Join(runnersDir, ".."))
-		slog.Debug("cleaning up", "dir", tmpDir)
-		err := os.RemoveAll(tmpDir)
-		if err != nil {
-			slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
-		}
+func (c CPUCapability) String() string {
+	switch c {
+	case CPUCapabilityAVX:
+		return "avx"
+	case CPUCapabilityAVX2:
+		return "avx2"
+	default:
+		return "no vector extensions"
 	}
 }

-func locateRunners() (string, error) {
-	exe, err := os.Executable()
-	if err != nil {
-		return "", err
-	}
-
-	cwd, err := os.Getwd()
-	if err != nil {
-		return "", err
-	}
-
-	var paths []string
-	for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe()), cwd} {
-		paths = append(paths,
-			root,
-			filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
-			filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH),
-		)
+func GetCPUCapability() CPUCapability {
+	if cpu.X86.HasAVX2 {
+		return CPUCapabilityAVX2
 	}
-
-	// Try a few variations to improve developer experience when building from source in the local tree
-	for _, path := range paths {
-		candidate := filepath.Join(path, "lib", "ollama", "runners")
-		if _, err := os.Stat(candidate); err == nil {
-			return candidate, nil
-		}
+	if cpu.X86.HasAVX {
+		return CPUCapabilityAVX
 	}
-	return "", fmt.Errorf("unable to locate runners in any search path %v", paths)
+	// otherwise fall back to the lowest common denominator (no vector extensions)
+	return CPUCapabilityNone
 }

-// Return true if we're carying nested payloads for the runners
-func hasPayloads(payloadFS fs.FS) bool {
-	files, err := fs.Glob(payloadFS, binGlob)
-	if err != nil || len(files) == 0 || (len(files) == 1 && strings.Contains(files[0], "placeholder")) {
-		return false
-	}
-	return true
+// Locate returns the directory where runners were found.
+// An empty string indicates only the builtin runner is present.
+func Locate() string {
+	once.Do(locateRunnersOnce)
+	return runnersDir
 }

-func extractRunners(payloadFS fs.FS) (string, error) {
-	cleanupTmpDirs()
-	tmpDir, err := os.MkdirTemp(envconfig.TmpDir(), "ollama")
-	if err != nil {
-		return "", fmt.Errorf("failed to generate tmp dir: %w", err)
-	}
-	// Track our pid so we can clean up orphaned tmpdirs
-	n := filepath.Join(tmpDir, "ollama.pid")
-	if err := os.WriteFile(n, []byte(strconv.Itoa(os.Getpid())), 0o644); err != nil {
-		slog.Warn("failed to write pid file", "file", n, "error", err)
-	}
-	// We create a distinct subdirectory for payloads within the tmpdir
-	// This will typically look like /tmp/ollama3208993108/runners on linux
-	rDir := filepath.Join(tmpDir, "runners")
-
-	slog.Info("extracting embedded files", "dir", rDir)
-	return rDir, refreshRunners(payloadFS, rDir)
-}
-
-func refreshRunners(payloadFS fs.FS, rDir string) error {
-	// extract or refresh server libraries
-	err := extractFiles(payloadFS, rDir, binGlob)
+// searches for runners in a prioritized set of locations
+// 1. local build, with executable at the top of the tree
+// 2. lib directory relative to executable
+func locateRunnersOnce() {
+	exe, err := os.Executable()
 	if err != nil {
-		return fmt.Errorf("extract binaries: %v", err)
+		slog.Debug("runner locate", "error", err)
 	}
-	return nil
-}
-
-// extract extracts the embedded files to the target directory
-func extractFiles(payloadFS fs.FS, targetDir string, glob string) error {
-	files, err := fs.Glob(payloadFS, glob)
-	if err != nil || len(files) == 0 {
-		// Should not happen
-		return fmt.Errorf("extractFiles called without payload present")
-	}
-
-	if err := os.MkdirAll(targetDir, 0o755); err != nil {
-		return fmt.Errorf("extractFiles could not mkdir %s: %v", targetDir, err)
-	}
-
-	g := new(errgroup.Group)

-	// $OS/$GOARCH/$RUNNER/$FILE
-	for _, file := range files {
-		filename := file
-
-		runner := filepath.Base(filepath.Dir(filename))
-
-		slog.Debug("extracting", "runner", runner, "payload", filename)
-
-		g.Go(func() error {
-			srcf, err := payloadFS.Open(filename)
-			if err != nil {
-				return err
-			}
-			defer srcf.Close()
-
-			src := io.Reader(srcf)
-			if strings.HasSuffix(filename, ".gz") {
-				src, err = gzip.NewReader(src)
-				if err != nil {
-					return fmt.Errorf("decompress payload %s: %v", filename, err)
-				}
-				filename = strings.TrimSuffix(filename, ".gz")
-			}
-
-			runnerDir := filepath.Join(targetDir, runner)
-			if err := os.MkdirAll(runnerDir, 0o755); err != nil {
-				return fmt.Errorf("extractFiles could not mkdir %s: %v", runnerDir, err)
-			}
-
-			base := filepath.Base(filename)
-			destFilename := filepath.Join(runnerDir, base)
-
-			_, err = os.Stat(destFilename)
-			switch {
-			case errors.Is(err, os.ErrNotExist):
-				destFile, err := os.OpenFile(destFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
-				if err != nil {
-					return fmt.Errorf("write payload %s: %v", filename, err)
-				}
-				defer destFile.Close()
-				if _, err := io.Copy(destFile, src); err != nil {
-					return fmt.Errorf("copy payload %s: %v", filename, err)
-				}
-			case err != nil:
-				return fmt.Errorf("stat payload %s: %v", filename, err)
-			}
-			return nil
-		})
+	paths := []string{
+		filepath.Join(filepath.Dir(exe), "llama", "build", runtime.GOOS+"-"+runtime.GOARCH, "runners"),
+		filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe(), "lib", "ollama", "runners"),
 	}
-
-	err = g.Wait()
-	if err != nil {
-		slog.Error("failed to extract files", "error", err)
-		// If we fail to extract, the payload dir is most likely unusable, so cleanup whatever we extracted
-		err := os.RemoveAll(targetDir)
-		if err != nil {
-			slog.Warn("failed to cleanup incomplete payload dir", "dir", targetDir, "error", err)
+	for _, path := range paths {
+		if _, err := os.Stat(path); err == nil {
+			runnersDir = path
+			slog.Debug("runners located", "dir", runnersDir)
+			return
 		}
-		return err
 	}
-	return nil
+	// Fall back to built-in
+	slog.Debug("no dynamic runners detected, using only built-in")
+	runnersDir = ""
 }

-// Best effort to clean up prior tmpdirs
-func cleanupTmpDirs() {
-	tmpDir := envconfig.TmpDir()
-	if tmpDir == "" {
-		tmpDir = os.TempDir()
-	}
-	matches, err := filepath.Glob(filepath.Join(tmpDir, "ollama*", "ollama.pid"))
-	if err != nil {
-		return
-	}
-
-	for _, match := range matches {
-		raw, err := os.ReadFile(match)
-		if errors.Is(err, os.ErrNotExist) {
-			slog.Debug("not a ollama runtime directory, skipping", "path", match)
-			continue
-		} else if err != nil {
-			slog.Warn("could not read ollama.pid, skipping", "path", match, "error", err)
-			continue
-		}
-
-		pid, err := strconv.Atoi(string(raw))
-		if err != nil {
-			slog.Warn("invalid pid, skipping", "path", match, "error", err)
-			continue
-		}
-
-		p, err := os.FindProcess(pid)
-		if err == nil && !errors.Is(p.Signal(syscall.Signal(0)), os.ErrProcessDone) {
-			slog.Warn("process still running, skipping", "pid", pid, "path", match)
-			continue
-		}
-
-		if err := os.Remove(match); err != nil {
-			slog.Warn("could not cleanup stale pidfile", "path", match, "error", err)
-		}
-
-		runners := filepath.Join(filepath.Dir(match), "runners")
-		if err := os.RemoveAll(runners); err != nil {
-			slog.Warn("could not cleanup stale runners", "path", runners, "error", err)
-		}
-
-		if err := os.Remove(filepath.Dir(match)); err != nil {
-			slog.Warn("could not cleanup stale tmpdir", "path", filepath.Dir(match), "error", err)
-		}
+// Return the well-known name of the builtin runner for the given platform
+func BuiltinName() string {
+	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
+		return "metal"
 	}
+	return "cpu"
 }

 // directory names are the name of the runner and may contain an optional
 // variant prefixed with '_' as the separator. For example, "cuda_v11" and
 // "cuda_v12" or "cpu" and "cpu_avx2". Any library without a variant is the
 // lowest common denominator
-func GetAvailableServers(payloadsDir string) map[string]string {
-	if payloadsDir == "" {
-		slog.Error("empty runner dir")
-		return nil
+func GetAvailableServers() map[string]string {
+	once.Do(locateRunnersOnce)
+
+	servers := make(map[string]string)
+	exe, err := os.Executable()
+	if err == nil {
+		servers[BuiltinName()] = exe
+	}
+
+	if runnersDir == "" {
+		return servers
 	}

-	// glob payloadsDir for files that start with ollama_
-	pattern := filepath.Join(payloadsDir, "*", "ollama_*")
+	// glob runnersDir for files that start with ollama_
+	pattern := filepath.Join(runnersDir, "*", "ollama_*")

 	files, err := filepath.Glob(pattern)
 	if err != nil {
@@ -289,96 +119,88 @@ func GetAvailableServers(payloadsDir string) map[string]string {
 		return nil
 	}

-	servers := make(map[string]string)
 	for _, file := range files {
 		slog.Debug("availableServers : found", "file", file)
-		servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file)
+		runnerName := filepath.Base(filepath.Dir(file))
+		// Special case for our GPU runners - if compiled with standard AVX flag
+		// detect incompatible system
+		// Custom builds will omit this and it's up to the user to ensure compatibility
+		parsed := strings.Split(runnerName, "_")
+		if len(parsed) == 3 && parsed[2] == "avx" && !cpu.X86.HasAVX {
+			slog.Info("GPU runner incompatible with host system, CPU does not have AVX", "runner", runnerName)
+			continue
+		}
+		servers[runnerName] = file
 	}

 	return servers
 }

-// serversForGpu returns a list of compatible servers give the provided GPU
-// info, ordered by performance. assumes Init() has been called
-// TODO - switch to metadata based mapping
-func ServersForGpu(info discover.GpuInfo) []string {
+// ServersForGpu returns a list of compatible servers given the provided GPU library/variant
+func ServersForGpu(requested string) []string {
 	// glob workDir for files that start with ollama_
-	availableServers := GetAvailableServers(runnersDir)
-	requested := info.Library
-	if info.Variant != discover.CPUCapabilityNone.String() {
-		requested += "_" + info.Variant
+	availableServers := GetAvailableServers()
+
+	// Short circuit if the only option is built-in
+	if _, ok := availableServers[BuiltinName()]; ok && len(availableServers) == 1 {
+		return []string{BuiltinName()}
 	}

+	bestCPUVariant := GetCPUCapability()
+	requestedLib := strings.Split(requested, "_")[0]
 	servers := []string{}

 	// exact match first
 	for a := range availableServers {
-		if a == requested {
+		short := a
+		parsed := strings.Split(a, "_")
+		if len(parsed) == 3 {
+			// Strip off optional _avx for comparison
+			short = parsed[0] + "_" + parsed[1]
+		}
+		if a == requested || short == requested {
 			servers = []string{a}
-
-			if a == "metal" {
-				return servers
-			}
-
-			break
 		}
 	}

-	alt := []string{}
-
-	// Then for GPUs load alternates and sort the list for consistent load ordering
-	if info.Library != "cpu" {
+	// If no exact match, then try without variant
+	if len(servers) == 0 {
+		alt := []string{}
 		for a := range availableServers {
-			if info.Library == strings.Split(a, "_")[0] && a != requested {
+			if requestedLib == strings.Split(a, "_")[0] && a != requested {
 				alt = append(alt, a)
 			}
 		}
-
 		slices.Sort(alt)
 		servers = append(servers, alt...)
 	}

-	if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") {
-		// Load up the best CPU variant if not primary requested
-		if info.Library != "cpu" {
-			variant := discover.GetCPUCapability()
-			// If no variant, then we fall back to default
-			// If we have a variant, try that if we find an exact match
-			// Attempting to run the wrong CPU instructions will panic the
-			// process
-			if variant != discover.CPUCapabilityNone {
-				for cmp := range availableServers {
-					if cmp == "cpu_"+variant.String() {
-						servers = append(servers, cmp)
-						break
-					}
-				}
-			} else {
-				servers = append(servers, "cpu")
+	// Finally append the best CPU option if found, then builtin
+	if bestCPUVariant != CPUCapabilityNone {
+		for cmp := range availableServers {
+			if cmp == "cpu_"+bestCPUVariant.String() {
+				servers = append(servers, cmp)
+				break
 			}
 		}
-
-		if len(servers) == 0 {
-			servers = []string{"cpu"}
-		}
 	}
-
+	servers = append(servers, BuiltinName())
 	return servers
 }

 // Return the optimal server for this CPU architecture
 func ServerForCpu() string {
 	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
-		return "metal"
+		return BuiltinName()
 	}
-	variant := discover.GetCPUCapability()
-	availableServers := GetAvailableServers(runnersDir)
-	if variant != discover.CPUCapabilityNone {
+	variant := GetCPUCapability()
+	availableServers := GetAvailableServers()
+	if variant != CPUCapabilityNone {
 		for cmp := range availableServers {
 			if cmp == "cpu_"+variant.String() {
 				return cmp
 			}
 		}
 	}
-	return "cpu"
+	return BuiltinName()
 }
--- a/runners/runners_test.go
+++ b/runners/runners_test.go
-package runners
-
-import (
-	"log/slog"
-	"os"
-	"path"
-	"runtime"
-	"strings"
-	"testing"
-	"testing/fstest"
-)
-
-func TestRefreshRunners(t *testing.T) {
-	slog.SetLogLoggerLevel(slog.LevelDebug)
-
-	payloadFS := fstest.MapFS{
-		path.Join(runtime.GOOS, runtime.GOARCH, "foo", "ollama_llama_server"): {Data: []byte("hello, world\n")},
-	}
-	tmpDir, err := os.MkdirTemp("", "testing")
-	if err != nil {
-		t.Fatalf("failed to make tmp dir %s", err)
-	}
-	t.Setenv("OLLAMA_TMPDIR", tmpDir)
-	rDir, err := Refresh(payloadFS)
-	if err != nil {
-		t.Fatalf("failed to extract to %s %s", tmpDir, err)
-	}
-	if !strings.Contains(rDir, tmpDir) {
-		t.Fatalf("runner dir %s was not in tmp dir %s", rDir, tmpDir)
-	}
-
-	// spot check results
-	servers := GetAvailableServers(rDir)
-	if len(servers) < 1 {
-		t.Fatalf("expected at least 1 server")
-	}
-
-	// Refresh contents
-	rDir, err = extractRunners(payloadFS)
-	if err != nil {
-		t.Fatalf("failed to extract to %s %s", tmpDir, err)
-	}
-	if !strings.Contains(rDir, tmpDir) {
-		t.Fatalf("runner dir %s was not in tmp dir %s", rDir, tmpDir)
-	}
-
-	cleanupTmpDirs()
-
-	Cleanup(payloadFS)
-}
--- a/scripts/build_darwin.sh
+++ b/scripts/build_darwin.sh
@@ -14,16 +14,14 @@ export CGO_CFLAGS=-mmacosx-version-min=11.3
 export CGO_CXXFLAGS=-mmacosx-version-min=11.3
 export CGO_LDFLAGS=-mmacosx-version-min=11.3

-for TARGETARCH in arm64 amd64; do
-    echo "Building Go runner darwin $TARGETARCH"
-    rm -rf llama/build
-    GOOS=darwin ARCH=$TARGETARCH GOARCH=$TARGETARCH make -C llama -j 8
-    CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -trimpath -o dist/ollama-darwin-$TARGETARCH
-    CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -trimpath -cover -o dist/ollama-darwin-$TARGETARCH-cov
-done
-
-lipo -create -output dist/ollama dist/ollama-darwin-arm64 dist/ollama-darwin-amd64
-rm -f dist/ollama-darwin-arm64 dist/ollama-darwin-amd64
+rm -rf llama/build dist/darwin-*
+echo "Building darwin arm64"
+GOOS=darwin ARCH=arm64 GOARCH=arm64 make -j 8 dist
+echo "Building darwin amd64 with AVX enabled"
+GOOS=darwin ARCH=amd64 GOARCH=amd64 CUSTOM_CPU_FLAGS="avx" make -j 8 dist
+
+
+lipo -create -output dist/ollama dist/darwin-arm64/bin/ollama dist/darwin-amd64/bin/ollama
 if [ -n "$APPLE_IDENTITY" ]; then
    codesign --deep --force --options=runtime --sign "$APPLE_IDENTITY" --timestamp dist/ollama
 else

--- a/scripts/build_windows.ps1
+++ b/scripts/build_windows.ps1
@@ -82,7 +82,7 @@ function buildOllama() {
    if ($null -eq ${env:OLLAMA_SKIP_GENERATE}) {
        write-host "Building ollama runners"
        Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}"
-        & make -C llama -j 12
+        & make -j 12 dist
        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
    } else {
        write-host "Skipping generate step with OLLAMA_SKIP_GENERATE set"

--- a/scripts/install.sh
+++ b/scripts/install.sh
@@ -71,29 +71,20 @@ for BINDIR in /usr/local/bin /usr/bin /bin; do
 done
 OLLAMA_INSTALL_DIR=$(dirname ${BINDIR})

+if [ -d "$OLLAMA_INSTALL_DIR/lib/ollama" ] ; then
+    status "Cleaning up old version at $OLLAMA_INSTALL_DIR/lib/ollama"
+    $SUDO rm -rf "$OLLAMA_INSTALL_DIR/lib/ollama"
+fi
 status "Installing ollama to $OLLAMA_INSTALL_DIR"
 $SUDO install -o0 -g0 -m755 -d $BINDIR
 $SUDO install -o0 -g0 -m755 -d "$OLLAMA_INSTALL_DIR"
-if curl -I --silent --fail --location "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" >/dev/null ; then
-    status "Downloading Linux ${ARCH} bundle"
-    curl --fail --show-error --location --progress-bar \
-        "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" | \
-        $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
-    BUNDLE=1
-    if [ "$OLLAMA_INSTALL_DIR/bin/ollama" != "$BINDIR/ollama" ] ; then
-        status "Making ollama accessible in the PATH in $BINDIR"
-        $SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama"
-    fi
-else
-    status "Downloading Linux ${ARCH} CLI"
-    curl --fail --show-error --location --progress-bar -o "$TEMP_DIR/ollama"\
-    "https://ollama.com/download/ollama-linux-${ARCH}${VER_PARAM}"
-    $SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $OLLAMA_INSTALL_DIR/ollama
-    BUNDLE=0
-    if [ "$OLLAMA_INSTALL_DIR/ollama" != "$BINDIR/ollama" ] ; then
-        status "Making ollama accessible in the PATH in $BINDIR"
-        $SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama"
-    fi
+status "Downloading Linux ${ARCH} bundle"
+curl --fail --show-error --location --progress-bar \
+    "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" | \
+    $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
+if [ "${OLLAMA_INSTALL_DIR%/}/bin/ollama" != "$BINDIR/ollama" ] ; then # %/ drops a trailing slash so an install dir of "/" still compares equal to /bin
+    status "Making ollama accessible in the PATH in $BINDIR"
+    $SUDO ln -sf "${OLLAMA_INSTALL_DIR%/}/bin/ollama" "$BINDIR/ollama" # link the same path tested above, not $OLLAMA_INSTALL_DIR/ollama
 fi

 # Check for NVIDIA JetPack systems with additional downloads
@@ -230,31 +221,11 @@ if ! check_gpu lspci nvidia && ! check_gpu lshw nvidia && ! check_gpu lspci amdg
 fi

 if check_gpu lspci amdgpu || check_gpu lshw amdgpu; then
-    if [ $BUNDLE -ne 0 ]; then
-        status "Downloading Linux ROCm ${ARCH} bundle"
-        curl --fail --show-error --location --progress-bar \
-            "https://ollama.com/download/ollama-linux-${ARCH}-rocm.tgz${VER_PARAM}" | \
-            $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
-
-        install_success
-        status "AMD GPU ready."
-        exit 0
-    fi
-    # Look for pre-existing ROCm v6 before downloading the dependencies
-    for search in "${HIP_PATH:-''}" "${ROCM_PATH:-''}" "/opt/rocm" "/usr/lib64"; do
-        if [ -n "${search}" ] && [ -e "${search}/libhipblas.so.2" -o -e "${search}/lib/libhipblas.so.2" ]; then
-            status "Compatible AMD GPU ROCm library detected at ${search}"
-            install_success
-            exit 0
-        fi
-    done
+    status "Downloading Linux ROCm ${ARCH} bundle"
+    curl --fail --show-error --location --progress-bar \
+        "https://ollama.com/download/ollama-linux-${ARCH}-rocm.tgz${VER_PARAM}" | \
+        $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"

-    status "Downloading AMD GPU dependencies..."
-    $SUDO rm -rf /usr/share/ollama/lib
-    $SUDO chmod o+x /usr/share/ollama
-    $SUDO install -o ollama -g ollama -m 755 -d /usr/share/ollama/lib/rocm
-    curl --fail --show-error --location --progress-bar "https://ollama.com/download/ollama-linux-amd64-rocm.tgz${VER_PARAM}" \
-        | $SUDO tar zx --owner ollama --group ollama -C /usr/share/ollama/lib/rocm .
    install_success
    status "AMD GPU ready."
    exit 0

--- a/server/routes.go
+++ b/server/routes.go
@@ -27,7 +27,6 @@ import (
 	"golang.org/x/sync/errgroup"

 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/build"
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/llm"
@@ -1264,13 +1263,16 @@ func Serve(ln net.Listener) error {
 		srvr.Close()
 		schedDone()
 		sched.unloadAllRunners()
-		runners.Cleanup(build.EmbedFS)
 		done()
 	}()

-	if _, err := runners.Refresh(build.EmbedFS); err != nil {
-		return fmt.Errorf("unable to initialize llm runners %w", err)
+	// Locate and log what runners are present at startup
+	var runnerNames []string
+	for v := range runners.GetAvailableServers() {
+		runnerNames = append(runnerNames, v)
 	}
+	slog.Info("Dynamic LLM libraries", "runners", runnerNames)
+	slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")

 	s.sched.Run(schedCtx)