Adapt to Ollama version 0.5.7, which supports DeepSeek-R1 inference

84e5b6ff · xuxzh1 · a420a453 · 84e5b6ff · 84e5b6ff · 84e5b6ff
Commit 84e5b6ff authored Feb 04, 2025 by xuxzh1 🎱
6 changed files
--- a/README.md
+++ b/README.md
--- a/discover/amd_linux.go
+++ b/discover/amd_linux.go
@@ -22,7 +22,7 @@ import (
 // Discovery logic for AMD/ROCm GPUs
 const (
-	DriverVersionFile     = "/sys/module/amdgpu/version"
+	DriverVersionFile     = "/sys/module/hydcu/version"
 	AMDNodesSysfsDir      = "/sys/class/kfd/kfd/topology/nodes/"
 	GPUPropertiesFileGlob = AMDNodesSysfsDir + "*/properties"
@@ -42,8 +42,8 @@ const (
 var (
 	// Used to validate if the given ROCm lib is usable
-	ROCmLibGlobs          = []string{"libhipblas.so.2*", "rocblas"} // TODO - probably include more coverage of files here...
+	ROCmLibGlobs          = []string{"libhipblas.so.0.1", "rocblas"} // TODO - probably include more coverage of files here...
-	RocmStandardLocations = []string{"/opt/rocm/lib", "/usr/lib64"}
+	RocmStandardLocations = []string{"/opt/dtk/lib", "/usr/lib64"}
 )
 // Gather GPU information from the amdgpu driver if any supported GPUs are detected

--- a/llama/ggml-cuda/mmvq.cu
+++ b/llama/ggml-cuda/mmvq.cu
@@ -168,7 +168,7 @@ static void mul_mat_vec_q_cuda(
    int64_t nwarps = 1;
    int64_t rows_per_cuda_block = 1;
-    if (ggml_cuda_info().devices[id].cc < GGML_CUDA_CC_CDNA || ggml_cuda_info().devices[id].cc == GGML_CUDA_CC_RDNA1) { // NVIDIA and AMD older than RDNA2 but not CDNA
+    if (ggml_cuda_info().devices[id].cc < 1001030) { // every device below 1001030 (presumably GGML_CUDA_CC_RDNA2 - TODO confirm and use the named constant); NOTE(review): the explicit CDNA cut-off of the previous condition is gone - confirm CDNA-class devices are meant to take this path
        switch(ncols_y) {
            case 1:
                nwarps = 4;

--- a/llama/ggml-cuda/norm.cu
+++ b/llama/ggml-cuda/norm.cu
@@ -27,7 +27,7 @@
 #include "norm.cuh"
 template <int block_size>
-static __global__ void norm_f32(const float * x, float * dst, const int ncols, const float eps) {
+static __global__ void __launch_bounds__(1024) norm_f32(const float * x, float * dst, const int ncols, const float eps) {
    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    const int tid = threadIdx.x;
@@ -63,7 +63,7 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols, c
 }
 template <int block_size>
-static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
+static __global__ void __launch_bounds__(1024) group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
    // blockIdx.x: num_groups idx
    // threadIdx.x: block_size idx
    int start = blockIdx.x * group_size;
@@ -124,7 +124,7 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr
 }
 template <int block_size>
-static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
+static __global__ void __launch_bounds__(1024) rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    const int tid = threadIdx.x;

--- a/make/Makefile.rocm
+++ b/make/Makefile.rocm
@@ -6,8 +6,8 @@
 include make/common-defs.make
 include make/rocm-defs.make
-HIP_ARCHS_COMMON := gfx900 gfx940 gfx941 gfx942 gfx1010 gfx1012 gfx1030 gfx1100 gfx1101 gfx1102
+HIP_ARCHS_COMMON := gfx928 gfx906
-HIP_ARCHS_LINUX := gfx906:xnack- gfx908:xnack- gfx90a:xnack+ gfx90a:xnack-
+HIP_ARCHS_LINUX := gfx928 gfx906
 ifeq ($(OS),windows)
 	GPU_LIB_DIR := $(shell cygpath -m -s "$(HIP_PATH)/bin")
@@ -88,8 +88,6 @@ GPU_COMPILER_CUFLAGS = \
 	-DUSE_PROF_API=1 \
 	-std=gnu++17 \
 	-x hip \
-	-mllvm=-amdgpu-early-inline-all=true \
-	-mllvm=-amdgpu-function-calls=false \
 	-Wno-expansion-to-defined \
 	-Wno-invalid-noreturn \
 	-Wno-ignored-attributes \
@@ -111,7 +109,8 @@ $(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_s
+# Each side of the tar pipe changes directory inside its own subshell, so a relative destination directory is still resolved from the directory make runs in.
 $(ROCBLAS_DIST_DEP_MANIFEST):
 	@-mkdir -p $(dir $@)
 	@echo "Copying rocblas library..."
-	(cd $(GPU_LIB_DIR)/rocblas/library/ && tar cf - . ) | (cd $(dir $@) && tar xf - )
+	(cd $(HIP_PATH)/../rocblas/lib/ && tar cf - . ) | (cd $(dir $@) && tar xf - )
 	@echo "rocblas library copy complete"
 $(GPU_DIST_TRANSITIVE_LIB_DEPS):

--- a/make/gpu.make
+++ b/make/gpu.make
 # Generalized GPU runner build
+INCLUDES = -I$(CURDIR)/llama
+GPU_COMPILER_CFLAGS += $(INCLUDES)
 ifndef GPU_RUNNER_NAME
 dummy:
 	$(error This makefile is not meant to build directly, but instead included in other Makefiles that set required variables)