OpenDAS / ollama · Commit b3befef4

[Rocm]Decouple the driving dependencies && [Bugfix]accuracy problem

Authored Apr 01, 2025 by xuxz · Parent 9de895f0
Showing 11 changed files with 30 additions and 30 deletions:

- README.md (+1 −1)
- discover/amd_linux.go (+16 −16)
- llama/ggml-cuda/binbcast.cu (+1 −1)
- llama/ggml-cuda/convert.cu (+1 −1)
- llama/ggml-cuda/cpy.cu (+1 −1)
- llama/ggml-cuda/ggml-cuda.cu (+1 −1)
- llama/ggml-cuda/mmvq.cu (+4 −4)
- llama/ggml-cuda/quantize.cu (+1 −1)
- llama/ggml-cuda/rope.cu (+1 −1)
- llama/ggml-cuda/unary.cu (+1 −1)
- make/Makefile.rocm (+2 −2)
README.md (+1 −1)

````diff
@@ -13,7 +13,7 @@ Ollama can quickly deploy mainstream models.
 ##### Docker
 ```bash
-docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.4.1-ubuntu22.04-dtk25.04-py3.10
+docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.4.1-ubuntu22.04-dtk25.04-py3.10-fixpy
 docker run -i -t -d --device=/dev/kfd --privileged --network=host --device=/dev/dri --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v <project path (absolute)>:/home -v /opt/hyhal:/opt/hyhal:ro --group-add video --shm-size 16G --name {container name} {image ID}
 ```
````
discover/amd_linux.go (+16 −16)

```diff
@@ -55,11 +55,11 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 	}
 
 	// Opportunistic logging of driver version to aid in troubleshooting
-	driverMajor, driverMinor, err := AMDDriverVersion()
-	if err != nil {
-		// TODO - if we see users crash and burn with the upstreamed kernel this can be adjusted to hard-fail rocm support and fallback to CPU
-		slog.Warn("ollama recommends running the https://www.amd.com/en/support/linux-drivers", "error", err)
-	}
+	// driverMajor, driverMinor, err := AMDDriverVersion()
+	// if err != nil {
+	// 	// TODO - if we see users crash and burn with the upstreamed kernel this can be adjusted to hard-fail rocm support and fallback to CPU
+	// 	slog.Warn("ollama recommends running the https://www.amd.com/en/support/linux-drivers", "error", err)
+	// }
 
 	// Determine if the user has already pre-selected which GPUs to look at, then ignore the others
 	var visibleDevices []string
@@ -284,8 +284,8 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 				Name:          name,
 				Compute:       fmt.Sprintf("gfx%d%x%x", major, minor, patch),
 				MinimumMemory: rocmMinimumMemory,
-				DriverMajor:   driverMajor,
-				DriverMinor:   driverMinor,
+				// DriverMajor:   driverMajor,
+				// DriverMinor:   driverMinor,
 			},
 			usedFilepath: usedFile,
 			index:        gpuID,
@@ -415,15 +415,15 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 // Quick check for AMD driver so we can skip amdgpu discovery if not present
 func AMDDetected() bool {
 	// Some driver versions (older?) don't have a version file, so just lookup the parent dir
-	sysfsDir := filepath.Dir(DriverVersionFile)
-	_, err := os.Stat(sysfsDir)
-	if errors.Is(err, os.ErrNotExist) {
-		slog.Debug("amdgpu driver not detected " + sysfsDir)
-		return false
-	} else if err != nil {
-		slog.Debug("error looking up amd driver", "path", sysfsDir, "error", err)
-		return false
-	}
+	// sysfsDir := filepath.Dir(DriverVersionFile)
+	// _, err := os.Stat(sysfsDir)
+	// if errors.Is(err, os.ErrNotExist) {
+	// 	slog.Debug("amdgpu driver not detected " + sysfsDir)
+	// 	return false
+	// } else if err != nil {
+	// 	slog.Debug("error looking up amd driver", "path", sysfsDir, "error", err)
+	// 	return false
+	// }
 	return true
 }
```
llama/ggml-cuda/binbcast.cu (+1 −1)

```diff
@@ -49,7 +49,7 @@ static __device__ __forceinline__ float op_div(const float a, const float b) {
 }
 
 template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
-static __global__ __launch_bounds__(1024) void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
+static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
         int ne0, int ne1, int ne2, int ne3,
         int ne10, int ne11, int ne12, int ne13,
         /*int s0, */ int s1, int s2, int s3,
```
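Apart from mmvq.cu (below), every ggml-cuda change in this commit is the same one-liner: an earlier DCU patch had stamped `__launch_bounds__(1024)` onto these kernels, and this commit removes it again (the identical change recurs in convert.cu, cpy.cu, ggml-cuda.cu, quantize.cu, rope.cu, and unary.cu). For background, a minimal sketch not taken from this repo, with hypothetical kernel names: `__launch_bounds__(N)` promises the compiler the kernel is never launched with more than N threads per block, which in turn forces it to cap registers per thread so an N-thread block can be resident; a blanket N of 1024 can therefore introduce register spills in kernels that never actually run that wide.

```cuda
// Minimal sketch, not from this repo: the same kernel with and without the
// qualifier this commit removes. Kernel names are hypothetical.
#include <cuda_runtime.h>

// Bounded: guaranteed launchable with 1024-thread blocks, but the compiler
// must fit each thread into the register budget that implies (about 64
// registers per thread on GPUs with 64K-register multiprocessors).
static __global__ __launch_bounds__(1024) void scale_bounded(const float * x, float * y, int n) {
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) {
        y[i] = 2.0f * x[i];
    }
}

// Unbounded: register allocation is left to the compiler's heuristics. A
// 1024-thread launch may then fail with "too many resources requested for
// launch", but typical smaller blocks can run without the forced spills.
static __global__ void scale_unbounded(const float * x, float * y, int n) {
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) {
        y[i] = 2.0f * x[i];
    }
}
```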
llama/ggml-cuda/convert.cu (+1 −1)

```diff
@@ -596,7 +596,7 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t
 }
 
 template <typename src_t, typename dst_t>
-static __global__ __launch_bounds__(1024) void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
+static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
     const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
```
llama/ggml-cuda/cpy.cu (+1 −1)

```diff
@@ -57,7 +57,7 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
 }
 
 template <cpy_kernel_t cpy_1>
-static __global__ __launch_bounds__(1024) void cpy_f32_f16(const char * cx, char * cdst, const int ne,
+static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
         const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
         const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
         const int nb12, const int nb13) {
```
llama/ggml-cuda/ggml-cuda.cu (+1 −1)

```diff
@@ -1581,7 +1581,7 @@ static void ggml_cuda_op_mul_mat(
     }
 }
 
-static __global__ __launch_bounds__(1024) void k_compute_batched_ptrs(
+static __global__ void k_compute_batched_ptrs(
         const half * src0_as_f16, const half * src1_as_f16, char * dst,
         const void ** ptrs_src, void ** ptrs_dst,
         int64_t ne12, int64_t ne13,
```
llama/ggml-cuda/mmvq.cu (+4 −4)

```diff
@@ -75,8 +75,8 @@
 template <ggml_type type, int ncols_y>
 // #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
-// tell the compiler to use as many registers as it wants, see nwarps definition below
-//__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
+// tell the compiler to use as many registers as it wants, see nwarps definition below
+__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
 // #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void mul_mat_vec_q(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int nrows_dst) {
@@ -92,7 +92,7 @@
     constexpr int rows_per_cuda_block = 1;
 #else
     constexpr int nwarps              = ncols_y <= 4 ? 4 : 2;
-    constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 1;
+    constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
 
     const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
@@ -176,7 +176,7 @@
         case 3:
         case 4:
             nwarps = 4;
-            rows_per_cuda_block = 1;
+            rows_per_cuda_block = 2;
             break;
         case 5:
         case 6:
```
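mmvq.cu is where the accuracy-related changes land: the tuned upstream `__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)` is re-enabled, and `rows_per_cuda_block` goes back from 1 to the upstream llama.cpp value of 2 when `ncols_y > 1`. A sketch of how those constants shape the launch, simplified from upstream `mul_mat_vec_q` host code; `WARP_SIZE = 32` and the helper name are assumptions, not code from this repo:

```cuda
#include <cuda_runtime.h>

constexpr int WARP_SIZE = 32;  // assumed here; the real value comes from the ggml headers

// Hypothetical helper: how nwarps and rows_per_cuda_block translate into a
// launch shape. The restored __launch_bounds__(nwarps*WARP_SIZE, 1) names
// exactly the block size computed below, so the compiler budgets registers
// for the block size that is actually launched.
static void mmvq_launch_shape(int ncols_y, int nrows_x, dim3 * block_dims, int * nblocks) {
    const int nwarps              = ncols_y <= 4 ? 4 : 2;
    const int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;  // value restored by this commit
    *block_dims = dim3(WARP_SIZE, nwarps, 1);
    *nblocks    = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block;  // ceil division
}
```

The kernel's row indexing and final reduction assume the same `rows_per_cuda_block` the host uses to size the grid, so the two must agree; the page itself does not say which of the mmvq changes fixed the accuracy problem named in the commit title.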
llama/ggml-cuda/quantize.cu (+1 −1)

```diff
@@ -27,7 +27,7 @@
 #include "quantize.cuh"
 #include <cstdint>
 
-static __global__ __launch_bounds__(1024) void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) {
+static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) {
     const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
 
     if (ix0 >= kx0_padded) {
```
llama/ggml-cuda/rope.cu (+1 −1)

```diff
@@ -60,7 +60,7 @@ static __device__ void rope_yarn(
 }
 
 template<typename T, bool has_ff>
-static __global__ __launch_bounds__(1024) void rope_norm(
+static __global__ void rope_norm(
         const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
         float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors) {
     const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
```
llama/ggml-cuda/unary.cu (+1 −1)

```diff
@@ -68,7 +68,7 @@ static __global__ void gelu_quick_f32(const float * x, float * dst, int k) {
     dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i])));
 }
 
-static __global__ __launch_bounds__(1024) void silu_f32(const float * x, float * dst, const int k) {
+static __global__ void silu_f32(const float * x, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
```
make/Makefile.rocm (+2 −2)

```diff
@@ -6,8 +6,8 @@
 include make/common-defs.make
 include make/rocm-defs.make
 
-HIP_ARCHS_COMMON := gfx928 gfx906
-HIP_ARCHS_LINUX  := gfx928 gfx906
+HIP_ARCHS_COMMON := gfx928 gfx906 gfx936
+HIP_ARCHS_LINUX  := gfx928 gfx906 gfx936
 
 ifeq ($(OS),windows)
 	GPU_LIB_DIR := $(shell cygpath -m -s "$(HIP_PATH)/bin")
```