Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
f38c5ca1
Commit
f38c5ca1
authored
Feb 20, 2025
by
xuxzh1
🎱
Browse files
update README
parent
09a693cf
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
7 additions
and
120 deletions
+7
-120
README.md
README.md
+5
-5
llama/ggml-cuda/mmvq.cu
llama/ggml-cuda/mmvq.cu
+2
-2
test.py
test.py
+0
-38
threadtest.py
threadtest.py
+0
-75
No files found.
README.md
View file @
f38c5ca1
...
...
@@ -21,7 +21,7 @@ docker run -i -t -d --device=/dev/kfd --privileged --network=host --device=/dev
1、下载源码
```
bash
git clone
-b
0.5.7 http://developer.sourcefind.cn/codes/OpenDAS/ollama.git
git clone
-b
0.5.7 http://developer.sourcefind.cn/codes/OpenDAS/ollama.git
--depth
=
1
cd
ollama
```
...
...
@@ -30,8 +30,8 @@ cd ollama
##### 安装go
```
bash
wget https://go
.dev
/dl/go1.2
2.8
.linux-amd64.tar.gz
tar
-C
/usr/local
-xzf
go1.2
2.8
.linux-amd64.tar.gz
wget https://go
lang.google.cn
/dl/go1.2
3.4
.linux-amd64.tar.gz
tar
-C
/usr/local
-xzf
go1.2
3.4
.linux-amd64.tar.gz
export
PATH
=
$PATH
:/usr/local/go/bin
# 修改go下载源,提升速度(按需设置)
...
...
@@ -49,7 +49,7 @@ go build .
## 运行
```
bash
export
HSA_OVERRIDE_GFX_VERSION
=
设备型号(如: gfx906对应9.0.6;
k
100
ai
gfx928对应9.2.8)
export
HSA_OVERRIDE_GFX_VERSION
=
设备型号(如:
Z100L
gfx906对应9.0.6;
K100 gfx926对应9.2.6;K
100
AI
gfx928对应9.2.8)
export
ROCR_VISIBLE_DEVICES
=
所有设备号(0,1,2,3,4,5,6,...)/选择设备号
./ollama serve (选择可用设备,可通过上条命令输出结果查看)
# 新增fa和kv cache量化
...
...
@@ -60,7 +60,7 @@ OLLAMA_FLASH_ATTENTION=1 OLLAMA_KV_CACHE_TYPE=q4_0 ./ollama serve
## deepseek-r1模型推理
```
export HSA_OVERRIDE_GFX_VERSION=设备型号(如: gfx906对应9.0.6;
k
100
ai
gfx928对应9.2.8)
export HSA_OVERRIDE_GFX_VERSION=设备型号(如:
Z100L
gfx906对应9.0.6;
K100 gfx926对应9.2.6;K
100
AI
gfx928对应9.2.8)
./ollama serve
./ollama run deepseek-r1:671b
```
...
...
llama/ggml-cuda/mmvq.cu
View file @
f38c5ca1
...
...
@@ -88,7 +88,7 @@ static __global__ void mul_mat_vec_q(
constexpr
vec_dot_q_cuda_t
vec_dot_q_cuda
=
get_vec_dot_q_cuda
(
type
);
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
constexpr
int
nwarps
=
1
;
constexpr
int
rows_per_cuda_block
=
1
;
#else
...
...
@@ -143,7 +143,7 @@ static __global__ void mul_mat_vec_q(
for
(
int
j
=
0
;
j
<
ncols_y
;
++
j
)
{
#pragma unroll rows_per_cuda_block
for
(
int
i
=
0
;
i
<
rows_per_cuda_block
;
++
i
)
{
#pragma unroll
nwarps-1
#pragma unroll
for
(
int
l
=
0
;
l
<
nwarps
-
1
;
++
l
)
{
//tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
atomicAdd
(
&
tmp
[
j
][
i
],
tmp_shared
[
l
][
j
][
i
][
threadIdx
.
x
]);
...
...
test.py
deleted
100644 → 0
View file @
09a693cf
"""Benchmark the generate endpoint across several ``num_batch`` settings.

For every (num_batch, num_predict) combination the script sends one blocking,
non-streaming POST request and prints the token counts together with the
prefill and generation throughput derived from the metrics in the response.
"""
import requests

# URL every request is posted to.
url = "http://localhost:11434/api/generate"

# The throughput formulas scale by 10**9, i.e. durations are treated as
# nanoseconds and the printed figures are tokens per second.
NANOSECONDS_PER_SECOND = 10 ** 9


def tokens_per_second(token_count, duration_ns):
    """Return ``token_count / duration_ns`` scaled to tokens per second.

    Args:
        token_count: Number of tokens processed.
        duration_ns: Time spent, in the unit the response reports (scaled
            here as nanoseconds).

    Returns:
        The throughput as a float, or ``0.0`` when ``duration_ns`` is zero or
        missing, so one degenerate response cannot abort the whole sweep with
        a ``ZeroDivisionError``.
    """
    if not duration_ns:
        return 0.0
    return token_count / duration_ns * NANOSECONDS_PER_SECOND


for num_batch in (1, 2, 4):
    # 128 is listed twice, so every batch size is measured two times.
    # NOTE(review): presumably the first run acts as a warm-up - confirm.
    for num_predict in (128, 128):
        # JSON body of the request.
        data = {
            "model": "deepseek-r1:70b",
            "prompt": "hi",
            "stream": False,
            "raw": True,
            "keep_alive": "1h",
            "options": {
                "num_predict": num_predict,
                "num_batch": num_batch,
                "seed": 42,
                "stop": [],
            },
        }

        # Send the POST request.
        response = requests.post(url, json=data)

        # Print the metrics, or the status code when the request failed.
        if response.status_code == 200:
            response_json = response.json()
            # NOTE(review): .get() tolerates metric fields that are absent
            # from the response; confirm against the server's API contract
            # whether zero-valued metrics can be omitted.
            prompt_tokens = response_json.get("prompt_eval_count", 0)
            generate_tokens = response_json.get("eval_count", 0)
            prefill_throughput = tokens_per_second(
                prompt_tokens, response_json.get("prompt_eval_duration", 0)
            )
            generate_throughput = tokens_per_second(
                generate_tokens, response_json.get("eval_duration", 0)
            )
            print(
                f"batch : {num_batch}\n"
                f"prompt_tokens : {prompt_tokens}\n"
                f"generate_tokens : {generate_tokens}\n"
                f"prefill_throughput : {round(prefill_throughput, 2)}\n"
                f"generate_throughput : {round(generate_throughput, 2)}"
            )
            print("====================================")
        else:
            print(f"请求失败,状态码: {response.status_code}")
            print("====================================")
\ No newline at end of file
threadtest.py
deleted
100644 → 0
View file @
09a693cf
"""Measure generate-endpoint throughput under concurrent load.

For every (concurrent_requests, num_predict) combination the script fires
``concurrent_requests`` identical non-streaming requests from a thread pool,
prints the per-request token count and latency, and then prints the aggregate
tokens-per-second figure for the whole batch.
"""
import concurrent.futures
import time

import requests

# Request target and headers are the same for every request, so they are
# defined once instead of being rebuilt on every loop iteration.
url = "http://localhost:11434/api/generate"
headers = {"Content-Type": "application/json"}


def send_request():
    """Send one generate request using the module-level ``payload``.

    Returns:
        A ``(completion_tokens, elapsed_seconds, response_data)`` tuple. For a
        non-200 response the tuple is ``(0, 0, None)``; it always has three
        elements because the reporting loop reads all three positions.
    """
    start_time = time.time()  # request start
    response = requests.post(url, headers=headers, json=payload)
    end_time = time.time()  # request end

    if response.status_code != 200:
        print(f"请求失败,状态码: {response.status_code}")
        return 0, 0, None

    response_data = response.json()
    return response_data["eval_count"], end_time - start_time, response_data


for concurrent_requests in (1, 2, 4):
    # 128 is listed twice, so every concurrency level is measured two times.
    for num_predict in (128, 128):
        # "hi " * 510 corresponds to 512 tokens; "hi " * 998 to 1000 tokens.
        payload = {
            "model": "deepseek-r1:671b",
            "prompt": "hi",
            "stream": False,
            "raw": True,
            "keep_alive": "1h",
            "options": {
                "num_predict": num_predict,
                "seed": 42,
                "stop": [],
            },
        }

        # Wall-clock start of the whole batch.
        total_start_time = time.time()

        # Fire all requests concurrently from a thread pool and wait for
        # every one of them to finish.
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=concurrent_requests
        ) as executor:
            futures = [
                executor.submit(send_request)
                for _ in range(concurrent_requests)
            ]
            results = [
                future.result()
                for future in concurrent.futures.as_completed(futures)
            ]

        # Wall-clock end of the whole batch.
        total_end_time = time.time()

        for completion_tokens, elapsed_time, response_data in results:
            # A failed request reports zero elapsed time; guard the division
            # so one failure does not abort the report.
            speed = completion_tokens / elapsed_time if elapsed_time > 0 else 0.0
            print(
                f"请求完成: 生成 tokens = {completion_tokens}, "
                f"耗时 = {elapsed_time:.2f} 秒, "
                f"生成速度: {speed:.2f}, "
                f"响应内容: {response_data}"
            )

        # Total generated tokens and total elapsed time for the batch.
        total_completion_tokens = sum(result[0] for result in results)
        total_elapsed_time = total_end_time - total_start_time

        # Overall generation speed in tokens per second.
        if total_elapsed_time > 0:
            overall_speed = total_completion_tokens / total_elapsed_time
            print(f"batch_size : {concurrent_requests}")
            print(f"总生成 tokens: {total_completion_tokens}")
            print(f"总耗时: {total_elapsed_time:.2f} 秒")
            print(f"整体生成速度: {overall_speed:.2f} tokens/秒")
        else:
            print("总耗时过短,无法计算生成速度")
        print("================num_predict====================")
    print("================concurrent_requests====================")
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment