OpenDAS / ollama

Commit 09a693cf, authored Feb 20, 2025 by xuxzh1

opt1

parent 84e5b6ff
Showing 7 changed files with 141 additions and 20 deletions (+141 -20)
DeepSeekQ1_Modelfile         +2  -0
llama/ggml-cuda/common.cuh   +1  -1
llama/ggml-cuda/mmvq.cu      +17 -13
llama/ggml-cuda/vecdotq.cuh  +6  -6
make/Makefile.rocm           +2  -0
test.py                      +38 -0
threadtest.py                +75 -0
DeepSeekQ1_Modelfile  0 → 100644

FROM /models/DeepSeek-R1-GGUF/DeepSeek-R1-UD-IQ1_M/DeepSeek-R1-UD-IQ1_M.gguf
TEMPLATE "<|User|>{{ .Prompt }}<|Assistant|>"
\ No newline at end of file
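A Modelfile like this is typically registered with ollama create <model-name> -f DeepSeekQ1_Modelfile and then queried with ollama run <model-name>; the TEMPLATE line wraps each prompt in DeepSeek-R1's <|User|>/<|Assistant|> chat markers.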
llama/ggml-cuda/common.cuh

...
@@ -332,7 +332,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
 static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
-#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
+#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2) || defined(CDNA)
     c = __builtin_amdgcn_sdot4(a, b, c, false);
 #elif defined(RDNA3)
     c = __builtin_amdgcn_sudot4(true, a, true, b, c, false);
...
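For context, ggml_cuda_dp4a computes a packed 4x int8 dot product with accumulate, and the one-line change above lets CDNA GPUs map it onto the single __builtin_amdgcn_sdot4 instruction instead of a scalar fallback. A minimal reference sketch of the semantics (hypothetical name, not part of the commit):

// Sketch of what ggml_cuda_dp4a computes: treat each int as four signed
// 8-bit lanes, multiply lane-wise, and accumulate into c. On the GPUs
// matched by the #if above, __builtin_amdgcn_sdot4 does this in one
// instruction instead of this scalar loop.
__device__ __forceinline__ int dp4a_ref(const int a, const int b, int c) {
    const int8_t * a8 = (const int8_t *) &a;
    const int8_t * b8 = (const int8_t *) &b;
    for (int k = 0; k < 4; ++k) {
        c += a8[k] * b8[k];
    }
    return c;
}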
llama/ggml-cuda/mmvq.cu

...
@@ -106,29 +106,30 @@ static __global__ void mul_mat_vec_q(
     float tmp[ncols_y][rows_per_cuda_block] = {0.0f};

     const block_q8_1 * y = (const block_q8_1 *) vy;

     for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
         const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx

         // x block quant index when casting the quants to int
         const int kqs = vdr * (tid % (qi/vdr));

-#pragma unroll
+#pragma unroll ncols_y
         for (int j = 0; j < ncols_y; ++j) {
-#pragma unroll
+#pragma unroll rows_per_cuda_block
             for (int i = 0; i < rows_per_cuda_block; ++i) {
-                tmp[j][i] += vec_dot_q_cuda(vx, &y[j*blocks_per_col_y + kby], (row0 + i)*blocks_per_row_x + kbx, kqs);
+                //tmp[j][i] += vec_dot_q_cuda(vx, &y[j*blocks_per_col_y + kby], (row0 + i)*blocks_per_row_x + kbx, kqs);
+                atomicAdd(&tmp[j][i], vec_dot_q_cuda(vx, &y[j*blocks_per_col_y + kby], (row0 + i)*blocks_per_row_x + kbx, kqs));
             }
         }
     }

     __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE];
     if (threadIdx.y > 0) {
-#pragma unroll
+#pragma unroll ncols_y
         for (int j = 0; j < ncols_y; ++j) {
-#pragma unroll
+#pragma unroll rows_per_cuda_block
             for (int i = 0; i < rows_per_cuda_block; ++i) {
-                tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
+                //tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
+                atomicExch(&tmp_shared[threadIdx.y-1][j][i][threadIdx.x], tmp[j][i]);
             }
         }
     }
...
@@ -138,19 +139,22 @@ static __global__ void mul_mat_vec_q(
     }

     // sum up partial sums and write back result
-#pragma unroll
+#pragma unroll ncols_y
     for (int j = 0; j < ncols_y; ++j) {
-#pragma unroll
+#pragma unroll rows_per_cuda_block
         for (int i = 0; i < rows_per_cuda_block; ++i) {
-#pragma unroll
+#pragma unroll nwarps-1
             for (int l = 0; l < nwarps-1; ++l) {
-                tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
+                //tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
+                atomicAdd(&tmp[j][i], tmp_shared[l][j][i][threadIdx.x]);
             }
-            tmp[j][i] = warp_reduce_sum(tmp[j][i]);
+            //tmp[j][i] = warp_reduce_sum(tmp[j][i]);
+            atomicExch(&tmp[j][i], warp_reduce_sum(tmp[j][i]));
         }

         if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < nrows_dst)) {
-            dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
+            //dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
+            atomicExch(&dst[j*nrows_dst + row0 + threadIdx.x], tmp[j][threadIdx.x]);
         }
     }
 }
...
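The commit swaps the plain accumulations for atomicAdd/atomicExch while keeping the originals as comments, but the cross-lane reduction still funnels through warp_reduce_sum. A minimal sketch of that shuffle-based pattern, assuming a 32-lane warp and a hypothetical name (the in-tree helper may differ in detail):

// Butterfly reduction: after the loop every lane holds the warp-wide sum.
__device__ __forceinline__ float warp_reduce_sum_sketch(float x) {
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        // each lane adds the value held by the lane `offset` positions away
        x += __shfl_xor_sync(0xffffffff, x, offset, 32);
    }
    return x;
}

Note that #pragma unroll ncols_y and the similar forms above require the trip count to be a compile-time constant, which holds here because these are template parameters of the kernel.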
llama/ggml-cuda/vecdotq.cuh

...
@@ -387,16 +387,16 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
     float sumf_d = 0.0f;
     float sumf_m = 0.0f;

-#pragma unroll
+#pragma unroll QR4_K
     for (int i = 0; i < QR4_K; ++i) {
         const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
         const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;

         const int dot1 = ggml_cuda_dp4a(v1i, u[2*i+1], ggml_cuda_dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
         const int dot2 = ggml_cuda_dp4a(0x01010101, u[2*i+1], ggml_cuda_dp4a(0x01010101, u[2*i+0], 0)); // sum of u

-        sumf_d += d8[i] * (dot1 * sc[i]);
-        sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
+        atomicAdd(&sumf_d, d8[i] * (dot1 * sc[i]));
+        atomicAdd(&sumf_m, d8[i] * (dot2 * m[i]));
+        //sumf_d += d8[i] * (dot1 * sc[i]);
+        //sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
     }

     const float2 dm4f = __half22float2(dm4);
...
@@ -728,6 +728,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     const uint8_t * sc = (const uint8_t *) aux;
     const uint8_t * m  = sc + 2;

+#pragma unroll QR4_K
     for (int i = 0; i < QR4_K; ++i) {
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
         d8[i] = __low2float(bq8i->ds);
...
@@ -736,7 +737,6 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
         u[2*i+0] = q8[0];
         u[2*i+1] = q8[4];
     }

     return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
 }
...
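One detail worth spelling out: in dot2 above, dp4a against 0x01010101 multiplies each q8_1 byte by 1, so the nested calls simply sum the eight signed bytes of u[2*i+0] and u[2*i+1]. A tiny sketch of the identity (hypothetical helper, not in the tree):

// dp4a(0x01010101, u, 0) == b0 + b1 + b2 + b3, the four signed bytes of u,
// because every lane of the first operand is 1.
__device__ int byte_sum_ref(const int u) {
    const int8_t * b = (const int8_t *) &u;
    return b[0] + b[1] + b[2] + b[3];
}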
make/Makefile.rocm

...
@@ -76,6 +76,8 @@ GPU_COMPILER_CUFLAGS = \
	-DGGML_CUDA_MMV_Y=1 \
	-DGGML_SCHED_MAX_COPIES=4 \
	-DGGML_USE_HIP \
	--gpu-max-threads-per-block=1024 \
	-DCDNA \
	-DGGML_USE_LLAMAFILE \
	-DHIP_FAST_MATH \
	-D__HIP_PLATFORM_AMD__=1 \
...
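Adding -DCDNA to the ROCm compile flags is what arms the || defined(CDNA) case introduced in llama/ggml-cuda/common.cuh above, so CDNA-class accelerators (the MI100/MI200 family) take the __builtin_amdgcn_sdot4 path.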
test.py  0 → 100644

import requests

# Define the request URL
url = "http://localhost:11434/api/generate"

for num_batch in (1, 2, 4):
    for num_predict in (128, 128):  # 128 appears twice, so each batch size is measured twice
        # Define the request's JSON payload
        data = {
            "model": "deepseek-r1:70b",
            "prompt": "hi",
            "stream": False,
            "raw": True,
            "keep_alive": "1h",
            "options": {
                "num_predict": num_predict,
                "num_batch": num_batch,
                "seed": 42,
                "stop": []
            }
        }
        # Send the POST request
        response = requests.post(url, json=data)
        # Print the response content
        if response.status_code == 200:
            response_json = response.json()
            prompt_tokens = response_json["prompt_eval_count"]
            generate_tokens = response_json["eval_count"]
            # durations are reported in nanoseconds, hence the 10**9 factor
            prefill_throughput = response_json["prompt_eval_count"] / response_json["prompt_eval_duration"] * (10 ** 9)
            generate_throughput = response_json["eval_count"] / response_json["eval_duration"] * (10 ** 9)
            print(f"batch : {num_batch}\n"
                  f"prompt_tokens : {prompt_tokens}\n"
                  f"generate_tokens : {generate_tokens}\n"
                  f"prefill_throughput : {round(prefill_throughput, 2)}\n"
                  f"generate_throughput : {round(generate_throughput, 2)}")
            #print(response.json())
            print("====================================")
        else:
            print(f"Request failed, status code: {response.status_code}")
            print("====================================")
\ No newline at end of file
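The script assumes an ollama server already listening on localhost:11434 with deepseek-r1:70b pulled. ollama reports prompt_eval_duration and eval_duration in nanoseconds, which is why both throughput figures are scaled by 10**9 to obtain tokens per second.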
threadtest.py  0 → 100644

import requests
import time
import concurrent.futures

# Define the request URL and payload
for concurrent_requests in (1, 2, 4):
    for num_predict in (128, 128):  # 128 appears twice, so each concurrency level is measured twice
        url = "http://localhost:11434/api/generate"
        headers = {"Content-Type": "application/json"}
        # "hi " * 510 corresponds to 512 tokens; "hi " * 998 corresponds to 1000 tokens
        payload = {
            "model": "deepseek-r1:671b",
            "prompt": "hi",
            "stream": False,
            "raw": True,
            "keep_alive": "1h",
            "options": {
                "num_predict": num_predict,
                "seed": 42,
                "stop": []
            }
        }

        # Function that sends a single request
        def send_request():
            start_time = time.time()  # record the request start time
            response = requests.post(url, headers=headers, json=payload)  # send the request
            end_time = time.time()  # record the request end time
            if response.status_code == 200:
                response_data = response.json()
                completion_tokens = response_data["eval_count"]
                elapsed_time = end_time - start_time
                return completion_tokens, elapsed_time, response_data
            else:
                print(f"Request failed, status code: {response.status_code}")
                return 0, 0, None  # keep the tuple shape so the result loop below still works

        # Number of concurrent requests (adjust as needed)
        concurrent_requests = concurrent_requests

        # Record the overall start time
        total_start_time = time.time()

        # Send the requests concurrently from a thread pool
        with concurrent.futures.ThreadPoolExecutor(max_workers=concurrent_requests) as executor:
            futures = [executor.submit(send_request) for _ in range(concurrent_requests)]
            results = [future.result() for future in concurrent.futures.as_completed(futures)]

        # Record the overall end time
        total_end_time = time.time()

        for result in results:
            response_data = result[2]
            completion_tokens = result[0]
            elapsed_time = result[1]
            if response_data is None:
                continue  # skip failed requests
            print(f"Request finished: generated tokens = {completion_tokens}, "
                  f"elapsed = {elapsed_time:.2f} s, "
                  f"generation speed: {completion_tokens / elapsed_time:.2f}, "
                  f"response: {response_data}")

        # Total generated tokens and total elapsed time
        total_completion_tokens = sum(result[0] for result in results)
        total_elapsed_time = total_end_time - total_start_time

        # Overall generation speed (tokens/second)
        if total_elapsed_time > 0:
            overall_speed = total_completion_tokens / total_elapsed_time
            print(f"batch_size : {concurrent_requests}")
            print(f"Total generated tokens: {total_completion_tokens}")
            print(f"Total elapsed time: {total_elapsed_time:.2f} s")
            print(f"Overall generation speed: {overall_speed:.2f} tokens/s")
        else:
            print("Total elapsed time too short to compute generation speed")
        print("================num_predict====================")
    print("================concurrent_requests====================")
\ No newline at end of file
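As with test.py, a local ollama server with deepseek-r1:671b available is assumed. Since "raw": True sends the prompt verbatim without the model's chat template, and each request is timed around its own POST, the overall tokens/s figure reflects end-to-end serving throughput under concurrency rather than pure decode speed.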