update README

f38c5ca1 · xuxzh1 · 09a693cf · f38c5ca1 · f38c5ca1 · 09a693cf
Commit f38c5ca1 authored Feb 20, 2025 by xuxzh1 🎱
Hide whitespace changes
Inline Side-by-side

Showing 4 changed files with 7 additions and 120 deletions

README.md +5 -5

llama/ggml-cuda/mmvq.cu +2 -2

test.py +0 -38

threadtest.py +0 -75

No files found.
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ docker run -i -t -d  --device=/dev/kfd --privileged --network=host --device=/dev
 1、下载源码

 ```bash
-git clone -b 0.5.7 http://developer.sourcefind.cn/codes/OpenDAS/ollama.git
+git clone -b 0.5.7 http://developer.sourcefind.cn/codes/OpenDAS/ollama.git --depth=1
 cd ollama
 ```

@@ -30,8 +30,8 @@ cd ollama
 ##### 安装go

 ```bash
-wget https://go.dev/dl/go1.22.8.linux-amd64.tar.gz
-tar -C /usr/local -xzf go1.22.8.linux-amd64.tar.gz
+wget https://golang.google.cn/dl/go1.23.4.linux-amd64.tar.gz
+tar -C /usr/local -xzf go1.23.4.linux-amd64.tar.gz
 export PATH=$PATH:/usr/local/go/bin

 # 修改go下载源，提升速度（按需设置）
@@ -49,7 +49,7 @@ go build .
 ## 运行

 ```bash
-export HSA_OVERRIDE_GFX_VERSION=设备型号（如: gfx906对应9.0.6；k100ai gfx928对应9.2.8）
+export HSA_OVERRIDE_GFX_VERSION=设备型号（如: Z100L gfx906对应9.0.6；K100 gfx926对应9.2.6；K100AI gfx928对应9.2.8）
 export ROCR_VISIBLE_DEVICES=所有设备号（0,1,2,3,4,5,6,...）/选择设备号
 ./ollama serve  （选择可用设备，可通过上条命令输出结果查看）
 # 新增fa和kv cache量化
@@ -60,7 +60,7 @@ OLLAMA_FLASH_ATTENTION=1 OLLAMA_KV_CACHE_TYPE=q4_0 ./ollama serve
 ## deepseek-r1模型推理

 ```
-export HSA_OVERRIDE_GFX_VERSION=设备型号（如: gfx906对应9.0.6；k100ai gfx928对应9.2.8）
+export HSA_OVERRIDE_GFX_VERSION=设备型号（如: Z100L gfx906对应9.0.6；K100 gfx926对应9.2.6；K100AI gfx928对应9.2.8）
 ./ollama serve
 ./ollama run deepseek-r1:671b
 ```

--- a/llama/ggml-cuda/mmvq.cu
+++ b/llama/ggml-cuda/mmvq.cu
@@ -88,7 +88,7 @@ static __global__ void mul_mat_vec_q(

    constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);

-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3)) 
    constexpr int nwarps              = 1;
    constexpr int rows_per_cuda_block = 1;
 #else
@@ -143,7 +143,7 @@ static __global__ void mul_mat_vec_q(
    for (int j = 0; j < ncols_y; ++j) {
 #pragma unroll rows_per_cuda_block
        for (int i = 0; i < rows_per_cuda_block; ++i) {
-#pragma unroll nwarps-1
+#pragma unroll 
            for (int l = 0; l < nwarps-1; ++l) {
                //tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
                atomicAdd(&tmp[j][i], tmp_shared[l][j][i][threadIdx.x]);

--- a/test.py
+++ b/test.py
-import requests
-
-# 定义请求的 URL
-url = "http://localhost:11434/api/generate"
-for num_batch in(1,2,4):
-    for num_predict in (128,128):
-    # 定义请求的 JSON 数据
-        data = {
-            "model" : "deepseek-r1:70b",
-            "prompt" : "hi",
-            "stream" : False,
-            "raw" : True,
-            "keep_alive" : "1h",
-            "options": {
-                "num_predict": num_predict,
-                "num_batch": num_batch,
-                "seed" : 42,
-                "stop" : []
-            }
-        }
-
-        # 发送 POST 请求
-        response = requests.post(url, json=data)
-
-        # 打印响应内容
-        if response.status_code == 200:
-            respose_josn = response.json()
-            
-            prompt_tokens = respose_josn["prompt_eval_count"]
-            generate_tokens = respose_josn["eval_count"]
-            prefill_throughput = respose_josn["prompt_eval_count"] / respose_josn["prompt_eval_duration"] * (10**9)
-            generate_throughput = respose_josn["eval_count"] / respose_josn["eval_duration"] * (10**9)
-            print (f"batch : {num_batch}\nprompt_tokens : {prompt_tokens}\ngenerate_tokens : {generate_tokens}\nprefill_throughput : {round(prefill_throughput,2)}\ngenerate_throughput : {round(generate_throughput,2)}")
-            #print(response.json())
-            print("====================================")
-        else:
-            print(f"请求失败，状态码: {response.status_code}")
-            print("====================================")
\ No newline at end of file
--- a/threadtest.py
+++ b/threadtest.py
-import requests
-import time
-import concurrent.futures
-
-# 定义请求的URL和payload
-for concurrent_requests in (1,2,4):
-    for num_predict in (128,128):
-        url = "http://localhost:11434/api/generate"
-        headers = {
-            "Content-Type": "application/json"
-        }
-        #"hi "*510对应512tokens "hi "*998对应1000tokens
-        payload = {
-            "model" : "deepseek-r1:671b",
-                    "prompt" : "hi",
-                    "stream" : False,
-                    "raw" : True,
-                    "keep_alive" : "1h",
-                    "options": {
-                        "num_predict": num_predict,
-                        "seed" : 42,
-                        "stop" : []
-                    }
-        }
-
-        # 定义发送单个请求的函数
-        def send_request():
-            start_time = time.time()  # 记录请求开始时间
-            response = requests.post(url, headers=headers, json=payload)  # 发送请求
-            end_time = time.time()  # 记录请求结束时间
-
-            if response.status_code == 200:
-                response_data = response.json()
-                completion_tokens = response_data["eval_count"]
-                elapsed_time = end_time - start_time
-                return completion_tokens, elapsed_time, response_data
-            else:
-                print(f"请求失败，状态码: {response.status_code}")
-                return 0, 0
-
-        # 定义并发请求的数量
-        concurrent_requests = concurrent_requests # 可以根据需要调整并发数
-
-        # 记录总开始时间
-        total_start_time = time.time()
-
-        # 使用线程池并发发送请求
-        with concurrent.futures.ThreadPoolExecutor(max_workers=concurrent_requests) as executor:
-            futures = [executor.submit(send_request) for _ in range(concurrent_requests)]
-            results = [future.result() for future in concurrent.futures.as_completed(futures)]
-
-        # 记录总结束时间
-        total_end_time = time.time()
-
-        for result in results:
-            response_data = result[2]
-            completion_tokens = result[0]
-            elapsed_time = result[1]
-            print(f"请求完成: 生成 tokens = {completion_tokens}, 耗时 = {elapsed_time:.2f} 秒, 生成速度：{completion_tokens/elapsed_time:.2f}, 响应内容：{response_data}")
-
-        # 计算总生成 tokens 和总耗时
-        total_completion_tokens = sum(result[0] for result in results)
-        total_elapsed_time = total_end_time - total_start_time
-
-        # 计算整体生成速度（tokens/秒）
-        if total_elapsed_time > 0:
-            overall_speed = total_completion_tokens / total_elapsed_time
-            print(f"batch_size : {concurrent_requests}" )
-            print(f"总生成 tokens: {total_completion_tokens}")
-            print(f"总耗时: {total_elapsed_time:.2f} 秒")
-            print(f"整体生成速度: {overall_speed:.2f} tokens/秒")
-        else:
-            print("总耗时过短，无法计算生成速度")  
-        print("================num_predict====================")
-    print("================concurrent_requests====================") 
\ No newline at end of file