Makefile.rocm 4.64 KB
Newer Older
1
2
3
4
5
6
# Build rules for ROCm runner
#
# Note: at present we only support a single ROCm version (whichever is default on the build system)
# unlike CUDA where we'll build both a v11 and v12 variant.

include make/common-defs.make
7
include make/rocm-defs.make
8

9
10
HIP_ARCHS_COMMON := gfx928 gfx906
HIP_ARCHS_LINUX := gfx928 gfx906
11
12

ifeq ($(OS),windows)
13
14
15
16
17
	GPU_LIB_DIR := $(shell cygpath -m -s "$(HIP_PATH)/bin")
	CGO_EXTRA_LDFLAGS := -L$(shell cygpath -m -s "$(HIP_PATH)/lib")
	HIP_ARCHS?=$(HIP_ARCHS_COMMON)
	GPU_COMPILER_CFLAGS = $(CFLAGS) -D_WIN32_WINNT=0x602
	GPU_COMPILER_CXXFLAGS = $(CXXFLAGS) -D_WIN32_WINNT=0x602
18
else ifeq ($(OS),linux)
19
20
21
22
23
	GPU_LIB_DIR := $(strip $(shell ls -d $(HIP_PATH)/lib64 2>/dev/null || ls -d $(HIP_PATH)/lib 2>/dev/null))
	CGO_EXTRA_LDFLAGS := -L$(GPU_LIB_DIR)
	HIP_ARCHS?=$(HIP_ARCHS_COMMON) $(HIP_ARCHS_LINUX)
	GPU_COMPILER_CFLAGS = $(CFLAGS) -fPIC -D_GNU_SOURCE
	GPU_COMPILER_CXXFLAGS = $(CXXFLAGS) -fPIC -D_GNU_SOURCE
24
endif
25
GPU_COMPILER=$(HIP_COMPILER)
26
27
28
29
30
31
32
33
34
35
36
37

# TODO future multi-variant support for ROCm
# ROCM_VERSION = $(subst $(space),.,$(wordlist 1,2,$(subst .,$(space),$(word 3,$(subst -,$(space),$(filter HIP version: %,$(shell $(GPU_COMPILER) --version)))))))
# ifneq (,$(ROCM_VERSION))
# 	GPU_RUNNER_VARIANT = _v$(ROCM_VERSION)
# endif

GPU_RUNNER_GO_TAGS := rocm
GPU_RUNNER_NAME := rocm$(GPU_RUNNER_VARIANT)
GPU_RUNNER_DRIVER_LIB_LINK := -lamdhip64
GPU_RUNNER_LIBS_SHORT := hipblas rocblas

38
# Note: ROCm requires an extra step of discovering and copying the transitive dependencies on linux
39
ifeq ($(OS),windows)
40
41
	ROCM_DIST_DEPS_DIR = ./dist/$(OS)-$(ARCH)/lib/ollama
	GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT),$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
42
else ifeq ($(OS),linux)
43
44
45
46
47
48
	ROCM_DIST_DEPS_DIR = ./dist/$(OS)-$(ARCH)-rocm/lib/ollama
	GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
	ROCM_TRANSITIVE_LIBS_INITIAL = $(sort $(shell ldd $(GPU_LIBS) | grep "=>" | cut -f2 -d= | cut -f2 -d' '  | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf))
	GPU_TRANSITIVE_LIBS = $(sort $(shell readlink -f $(ROCM_TRANSITIVE_LIBS_INITIAL)) $(ROCM_TRANSITIVE_LIBS_INITIAL))
	FILTERED_GPU_TRANSITIVE_LIBS=$(sort $(filter-out $(addprefix %,$(notdir $(GPU_LIBS))), $(GPU_TRANSITIVE_LIBS)))
	GPU_DIST_TRANSITIVE_LIB_DEPS = $(sort $(addprefix $(ROCM_DIST_DEPS_DIR)/,$(notdir $(FILTERED_GPU_TRANSITIVE_LIBS))))
49
endif
50
GPU_DIST_LIB_DEPS= $(sort $(addprefix $(ROCM_DIST_DEPS_DIR)/,$(notdir $(GPU_LIBS))))
51
52
53
ROCBLAS_DIST_DEP_MANIFEST = $(ROCM_DIST_DEPS_DIR)/rocblas/library/TensileManifest.txt

ifeq ($(OS),linux)
54
	GPU_COMPILER_FPIC := -fPIC -Wno-unused-function -std=gnu++17
55
56
57
else ifeq ($(OS),windows)
	GPU_COMPILER_FPIC := -Xclang --dependent-lib=msvcrt
endif
58
59
60
61
GPU_RUNNER_ARCH_FLAGS := $(foreach arch,$(subst ;,$(space),$(HIP_ARCHS)),--offload-arch=$(arch))

# HIPCC uses clang which requires avx512 -> -mavx512f -mavx512dq -mavx512bw
GPU_VECTOR_FLAGS=$(if $(filter avx512,$(GPU_RUNNER_CPU_FLAGS)),avx512f avx512dq avx512bw) $(filter-out avx512,$(GPU_RUNNER_CPU_FLAGS))
62
63
64

GPU_COMPILER_CUFLAGS = \
	$(GPU_COMPILER_FPIC) \
65
	$(addprefix -m,$(GPU_VECTOR_FLAGS)) \
66
67
	-mf16c \
	-mfma \
68
69
70
71
	-c \
	-O3 \
	-DGGML_USE_CUDA \
	-DGGML_BUILD=1 \
72
	-DGGML_BACKEND_BUILD=1 \
73
	-DGGML_SHARED=1 \
74
	-DGGML_BACKEND_SHARED=1 \
75
76
77
	-DGGML_CUDA_DMMV_X=32 \
	-DGGML_CUDA_MMV_Y=1 \
	-DGGML_SCHED_MAX_COPIES=4 \
78
	-DGGML_USE_HIP \
79
80
81
82
83
84
85
86
87
	-DGGML_USE_LLAMAFILE \
	-DHIP_FAST_MATH \
	-D__HIP_PLATFORM_AMD__=1 \
	-D__HIP_ROCclr__=1 \
	-DNDEBUG \
	-DK_QUANTS_PER_ITERATION=2 \
	-D_CRT_SECURE_NO_WARNINGS \
	-D_GNU_SOURCE \
	-D_XOPEN_SOURCE=600 \
88
	-DUSE_PROF_API=1 \
89
	-std=gnu++17 \
90
	-x hip \
91
92
	#-mllvm=-amdgpu-early-inline-all=true \
	#-mllvm=-amdgpu-function-calls=false \
93
94
95
96
97
98
	-Wno-expansion-to-defined \
	-Wno-invalid-noreturn \
	-Wno-ignored-attributes \
	-Wno-pass-failed \
	-Wno-deprecated-declarations \
	-Wno-unused-result \
99
	-I./llama/
100

101
102
103
104
105
106
# Workaround buggy P2P copy on some windows multi-GPU setups
# This workaround breaks linux systems with small system RAM, so only enable on windows
ifeq ($(OS),windows)
	GPU_COMPILER_CUFLAGS += -DGGML_CUDA_NO_PEER_COPY=1
endif

107
108
109
include make/gpu.make

# Adjust the rules from gpu.make to handle the ROCm dependencies properly
110
$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(ROCBLAS_DIST_DEP_MANIFEST) $(GPU_DIST_TRANSITIVE_LIB_DEPS)
111
112
113
$(ROCBLAS_DIST_DEP_MANIFEST):
	@-mkdir -p $(dir $@)
	@echo "Copying rocblas library..."
114
	cd $(HIP_PATH)/../rocblas/lib/ && tar cf - . | (cd $(dir $@) && tar xf - )
115
	@echo "rocblas library copy complete"
116
117
118
119

$(GPU_DIST_TRANSITIVE_LIB_DEPS):
	@-mkdir -p $(dir $@)
	$(CP) $(dir $(filter %$(notdir $@),$(GPU_TRANSITIVE_LIBS)))/$(notdir $@) $(dir $@)