Commit f38c5ca1 authored by xuxzh1

update README

parent 09a693cf
@@ -21,7 +21,7 @@ docker run -i -t -d --device=/dev/kfd --privileged --network=host --device=/dev
1. Download the source code
```bash
-git clone -b 0.5.7 http://developer.sourcefind.cn/codes/OpenDAS/ollama.git
+git clone -b 0.5.7 http://developer.sourcefind.cn/codes/OpenDAS/ollama.git --depth=1
cd ollama
```
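The added `--depth=1` makes this a shallow clone, which downloads far less history. If the full history is needed later, it can be restored with standard git (nothing specific to this repo):

```bash
# standard git; fetches the history omitted by --depth=1
git fetch --unshallow
```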
@@ -30,8 +30,8 @@ cd ollama
##### Install Go
```bash
-wget https://go.dev/dl/go1.22.8.linux-amd64.tar.gz
-tar -C /usr/local -xzf go1.22.8.linux-amd64.tar.gz
+wget https://golang.google.cn/dl/go1.23.4.linux-amd64.tar.gz
+tar -C /usr/local -xzf go1.23.4.linux-amd64.tar.gz
export PATH=$PATH:/usr/local/go/bin
# Switch the Go module download source to speed things up (set as needed)
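# For example (assumption: the goproxy.cn mirror; any GOPROXY mirror works):
go env -w GOPROXY=https://goproxy.cn,direct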
@@ -49,7 +49,7 @@ go build .
## Run
```bash
-export HSA_OVERRIDE_GFX_VERSION=<device model> (e.g. gfx906 maps to 9.0.6; k100ai gfx928 maps to 9.2.8)
+export HSA_OVERRIDE_GFX_VERSION=<device model> (e.g. Z100L gfx906 maps to 9.0.6; K100 gfx926 maps to 9.2.6; K100AI gfx928 maps to 9.2.8)
export ROCR_VISIBLE_DEVICES=<all device IDs (0,1,2,3,4,5,6,...) or a selection of them>
./ollama serve (to pick usable devices, check the output of the command above)
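# Concrete example (assumption: a single K100AI card, i.e. gfx928):
#   export HSA_OVERRIDE_GFX_VERSION=9.2.8
#   export ROCR_VISIBLE_DEVICES=0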
# New: flash attention (FA) and KV cache quantization
@@ -60,7 +60,7 @@ OLLAMA_FLASH_ATTENTION=1 OLLAMA_KV_CACHE_TYPE=q4_0 ./ollama serve
## deepseek-r1 model inference
```
-export HSA_OVERRIDE_GFX_VERSION=<device model> (e.g. gfx906 maps to 9.0.6; k100ai gfx928 maps to 9.2.8)
+export HSA_OVERRIDE_GFX_VERSION=<device model> (e.g. Z100L gfx906 maps to 9.0.6; K100 gfx926 maps to 9.2.6; K100AI gfx928 maps to 9.2.8)
./ollama serve
./ollama run deepseek-r1:671b
```
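Once `./ollama serve` is up, a quick sanity check can go straight at the HTTP API (a minimal sketch; this is the stock ollama `/api/generate` endpoint, which the benchmark scripts further down also use):

```bash
curl http://localhost:11434/api/generate \
  -d '{"model": "deepseek-r1:671b", "prompt": "hi", "stream": false}'
```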
@@ -143,7 +143,7 @@ static __global__ void mul_mat_vec_q(
    for (int j = 0; j < ncols_y; ++j) {
#pragma unroll rows_per_cuda_block
        for (int i = 0; i < rows_per_cuda_block; ++i) {
-#pragma unroll nwarps-1
+#pragma unroll
            for (int l = 0; l < nwarps-1; ++l) {
                //tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
                atomicAdd(&tmp[j][i], tmp_shared[l][j][i][threadIdx.x]);
import requests

# Request URL
url = "http://localhost:11434/api/generate"

for num_batch in (1, 2, 4):
    for num_predict in (128, 128):  # runs the same setting twice (the first pass can serve as a warm-up)
        # Request JSON payload
        data = {
            "model": "deepseek-r1:70b",
            "prompt": "hi",
            "stream": False,
            "raw": True,
            "keep_alive": "1h",
            "options": {
                "num_predict": num_predict,
                "num_batch": num_batch,
                "seed": 42,
                "stop": []
            }
        }
        # Send the POST request
        response = requests.post(url, json=data)
        # Print the response metrics
        if response.status_code == 200:
            response_json = response.json()
            prompt_tokens = response_json["prompt_eval_count"]
            generate_tokens = response_json["eval_count"]
            # durations are reported in nanoseconds, hence the 10**9 factor for tokens/s
            prefill_throughput = response_json["prompt_eval_count"] / response_json["prompt_eval_duration"] * (10**9)
            generate_throughput = response_json["eval_count"] / response_json["eval_duration"] * (10**9)
            print(f"batch : {num_batch}\nprompt_tokens : {prompt_tokens}\ngenerate_tokens : {generate_tokens}\n"
                  f"prefill_throughput : {round(prefill_throughput, 2)}\ngenerate_throughput : {round(generate_throughput, 2)}")
            # print(response.json())
            print("====================================")
        else:
            print(f"Request failed, status code: {response.status_code}")
            print("====================================")
\ No newline at end of file
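To run the single-request benchmark above against a live server, save it to a file and invoke it with Python; the filename here is hypothetical, since the commit view does not show the script's path:

```bash
# hypothetical filename -- the diff does not show where the script lives
python3 bench_single.py
```

The second script below follows the same pattern but fans requests out through a thread pool to measure concurrent throughput.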
import requests
import time
import concurrent.futures

# Request URL and payload
for concurrent_requests in (1, 2, 4):  # concurrency levels; adjust as needed
    for num_predict in (128, 128):  # runs the same setting twice (the first pass can serve as a warm-up)
        url = "http://localhost:11434/api/generate"
        headers = {
            "Content-Type": "application/json"
        }
        # "hi " * 510 yields ~512 tokens; "hi " * 998 yields ~1000 tokens
        payload = {
            "model": "deepseek-r1:671b",
            "prompt": "hi",
            "stream": False,
            "raw": True,
            "keep_alive": "1h",
            "options": {
                "num_predict": num_predict,
                "seed": 42,
                "stop": []
            }
        }

        # Send a single request and time it
        def send_request():
            start_time = time.time()  # request start time
            response = requests.post(url, headers=headers, json=payload)
            end_time = time.time()  # request end time
            if response.status_code == 200:
                response_data = response.json()
                completion_tokens = response_data["eval_count"]
                elapsed_time = end_time - start_time
                return completion_tokens, elapsed_time, response_data
            else:
                print(f"Request failed, status code: {response.status_code}")
                return 0, 0, None  # keep the 3-tuple shape so the unpacking below stays valid

        # Record the overall start time
        total_start_time = time.time()
        # Send the requests concurrently from a thread pool
        with concurrent.futures.ThreadPoolExecutor(max_workers=concurrent_requests) as executor:
            futures = [executor.submit(send_request) for _ in range(concurrent_requests)]
            results = [future.result() for future in concurrent.futures.as_completed(futures)]
        # Record the overall end time
        total_end_time = time.time()

        for completion_tokens, elapsed_time, response_data in results:
            if elapsed_time > 0:  # skip failed requests
                print(f"Request done: generated tokens = {completion_tokens}, elapsed = {elapsed_time:.2f} s, "
                      f"speed: {completion_tokens/elapsed_time:.2f}, response: {response_data}")

        # Total generated tokens and total elapsed time
        total_completion_tokens = sum(result[0] for result in results)
        total_elapsed_time = total_end_time - total_start_time
        # Overall generation speed (tokens/s)
        if total_elapsed_time > 0:
            overall_speed = total_completion_tokens / total_elapsed_time
            print(f"batch_size : {concurrent_requests}")
            print(f"Total generated tokens: {total_completion_tokens}")
            print(f"Total elapsed: {total_elapsed_time:.2f} s")
            print(f"Overall generation speed: {overall_speed:.2f} tokens/s")
        else:
            print("Total elapsed time too short to compute generation speed")
        print("================num_predict====================")
    print("================concurrent_requests====================")
\ No newline at end of file
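For the concurrent benchmark to see real batching on the server side, ollama has to be allowed to process requests in parallel. A sketch, assuming this fork keeps upstream ollama's `OLLAMA_NUM_PARALLEL` environment variable (worth verifying against this repo's code):

```bash
# assumption: the fork honors upstream ollama's OLLAMA_NUM_PARALLEL setting
OLLAMA_NUM_PARALLEL=4 ./ollama serve
```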