[update] common.cuh && quantize.cu

b452da57 · xuxzh1 · ac4166cb · b452da57 · b452da57 · b452da57
Commit b452da57 authored Apr 02, 2025 by xuxzh1 🎱
3 changed files
--- a/README.md
+++ b/README.md
@@ -44,7 +44,6 @@ go env -w GOPROXY=https://goproxy.cn,direct
 export LIBRARY_PATH=/opt/dtk/lib:$LIBRARY_PATH
 cmake -B build
 cmake --build build
-go build .
 ```
 ## 运行

--- a/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh
@@ -384,7 +384,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
 static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
-#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
+#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2) || true
    c = __builtin_amdgcn_sdot4(a, b, c, false);
 #elif defined(RDNA3)
    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);

--- a/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cu
@@ -33,8 +33,11 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
        return;
    }
-    reinterpret_cast<half&>(y[ib].ds.x) = d;
+    ggml_half2 ds = {d, sum};
-    reinterpret_cast<half&>(y[ib].ds.y) = sum;
+    y[ib].ds = ds;
+    // reinterpret_cast<half&>(y[ib].ds.x) = d;
+    // reinterpret_cast<half&>(y[ib].ds.y) = sum;
 }
 template <mmq_q8_1_ds_layout ds_layout>