OpenDAS / ollama

Commit 09a693cf, authored Feb 20, 2025 by xuxzh1

opt1

parent 84e5b6ff
Showing 7 changed files with 141 additions and 20 deletions (+141 -20)
DeepSeekQ1_Modelfile         +2  -0
llama/ggml-cuda/common.cuh   +1  -1
llama/ggml-cuda/mmvq.cu      +17 -13
llama/ggml-cuda/vecdotq.cuh  +6  -6
make/Makefile.rocm           +2  -0
test.py                      +38 -0
threadtest.py                +75 -0
DeepSeekQ1_Modelfile  0 → 100644

FROM /models/DeepSeek-R1-GGUF/DeepSeek-R1-UD-IQ1_M/DeepSeek-R1-UD-IQ1_M.gguf
TEMPLATE "<|User|>{{ .Prompt }}<|Assistant|>"
\ No newline at end of file
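A Modelfile like this is typically registered with ollama create <model-name> -f DeepSeekQ1_Modelfile and then queried with ollama run <model-name>; the TEMPLATE line wraps each prompt in DeepSeek-R1's <|User|>/<|Assistant|> chat markers.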
llama/ggml-cuda/common.cuh

...
@@ -332,7 +332,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
 static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
-#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
+#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2) || defined(CDNA)
     c = __builtin_amdgcn_sdot4(a, b, c, false);
 #elif defined(RDNA3)
     c = __builtin_amdgcn_sudot4(true, a, true, b, c, false);
...
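For context, ggml_cuda_dp4a computes a packed 4x int8 dot product with accumulate, and the one-line change above lets CDNA GPUs map it onto the single __builtin_amdgcn_sdot4 instruction instead of a scalar fallback. A minimal reference sketch of the semantics (hypothetical name, not part of the commit):

// Sketch of what ggml_cuda_dp4a computes: treat each int as four signed
// 8-bit lanes, multiply lane-wise, and accumulate into c. On the GPUs
// matched by the #if above, __builtin_amdgcn_sdot4 does this in one
// instruction instead of this scalar loop.
__device__ __forceinline__ int dp4a_ref(const int a, const int b, int c) {
    const int8_t * a8 = (const int8_t *) &a;
    const int8_t * b8 = (const int8_t *) &b;
    for (int k = 0; k < 4; ++k) {
        c += a8[k] * b8[k];
    }
    return c;
}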
llama/ggml-cuda/mmvq.cu

...
@@ -106,29 +106,30 @@ static __global__ void mul_mat_vec_q(
     float tmp[ncols_y][rows_per_cuda_block] = {0.0f};

     const block_q8_1 * y = (const block_q8_1 *) vy;

     for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
         const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx

         // x block quant index when casting the quants to int
         const int kqs = vdr * (tid % (qi/vdr));

-#pragma unroll
+#pragma unroll ncols_y
         for (int j = 0; j < ncols_y; ++j) {
-#pragma unroll
+#pragma unroll rows_per_cuda_block
             for (int i = 0; i < rows_per_cuda_block; ++i) {
-                tmp[j][i] += vec_dot_q_cuda(vx, &y[j*blocks_per_col_y + kby], (row0 + i)*blocks_per_row_x + kbx, kqs);
+                //tmp[j][i] += vec_dot_q_cuda(vx, &y[j*blocks_per_col_y + kby], (row0 + i)*blocks_per_row_x + kbx, kqs);
+                atomicAdd(&tmp[j][i], vec_dot_q_cuda(vx, &y[j*blocks_per_col_y + kby], (row0 + i)*blocks_per_row_x + kbx, kqs));
             }
         }
     }

     __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE];
     if (threadIdx.y > 0) {
-#pragma unroll
+#pragma unroll ncols_y
         for (int j = 0; j < ncols_y; ++j) {
-#pragma unroll
+#pragma unroll rows_per_cuda_block
             for (int i = 0; i < rows_per_cuda_block; ++i) {
-                tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
+                //tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
+                atomicExch(&tmp_shared[threadIdx.y-1][j][i][threadIdx.x], tmp[j][i]);
             }
         }
     }
...
@@ -138,19 +139,22 @@ static __global__ void mul_mat_vec_q(
     }

     // sum up partial sums and write back result
-#pragma unroll
+#pragma unroll ncols_y
     for (int j = 0; j < ncols_y; ++j) {
-#pragma unroll
+#pragma unroll rows_per_cuda_block
         for (int i = 0; i < rows_per_cuda_block; ++i) {
-#pragma unroll
+#pragma unroll nwarps-1
             for (int l = 0; l < nwarps-1; ++l) {
-                tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
+                //tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
+                atomicAdd(&tmp[j][i], tmp_shared[l][j][i][threadIdx.x]);
             }
-            tmp[j][i] = warp_reduce_sum(tmp[j][i]);
+            //tmp[j][i] = warp_reduce_sum(tmp[j][i]);
+            atomicExch(&tmp[j][i], warp_reduce_sum(tmp[j][i]));
         }

         if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < nrows_dst)) {
-            dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
+            //dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
+            atomicExch(&dst[j*nrows_dst + row0 + threadIdx.x], tmp[j][threadIdx.x]);
         }
     }
 }
...
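The commit swaps the plain accumulations for atomicAdd/atomicExch while keeping the originals as comments, but the cross-lane reduction still funnels through warp_reduce_sum. A minimal sketch of that shuffle-based pattern, assuming a 32-lane warp and a hypothetical name (the in-tree helper may differ in detail):

// Butterfly reduction: after the loop every lane holds the warp-wide sum.
__device__ __forceinline__ float warp_reduce_sum_sketch(float x) {
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        // each lane adds the value held by the lane `offset` positions away
        x += __shfl_xor_sync(0xffffffff, x, offset, 32);
    }
    return x;
}

Note that #pragma unroll ncols_y and the similar forms above require the trip count to be a compile-time constant, which holds here because these are template parameters of the kernel.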
llama/ggml-cuda/vecdotq.cuh

...
@@ -387,16 +387,16 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
     float sumf_d = 0.0f;
     float sumf_m = 0.0f;

-#pragma unroll
+#pragma unroll QR4_K
     for (int i = 0; i < QR4_K; ++i) {
         const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
         const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;

         const int dot1 = ggml_cuda_dp4a(v1i, u[2*i+1], ggml_cuda_dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
         const int dot2 = ggml_cuda_dp4a(0x01010101, u[2*i+1], ggml_cuda_dp4a(0x01010101, u[2*i+0], 0)); // sum of u

-        sumf_d += d8[i] * (dot1 * sc[i]);
-        sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
+        atomicAdd(&sumf_d, d8[i] * (dot1 * sc[i]));
+        atomicAdd(&sumf_m, d8[i] * (dot2 * m[i]));
+        //sumf_d += d8[i] * (dot1 * sc[i]);
+        //sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
     }

     const float2 dm4f = __half22float2(dm4);
...
@@ -728,6 +728,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     const uint8_t * sc = (const uint8_t *) aux;
     const uint8_t * m  = sc + 2;

+#pragma unroll QR4_K
     for (int i = 0; i < QR4_K; ++i) {
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
         d8[i] = __low2float(bq8i->ds);
...
@@ -736,7 +737,6 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
         u[2*i+0] = q8[0];
         u[2*i+1] = q8[4];
     }

     return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
 }
...
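One detail worth spelling out: in dot2 above, dp4a against 0x01010101 multiplies each q8_1 byte by 1, so the nested calls simply sum the eight signed bytes of u[2*i+0] and u[2*i+1]. A tiny sketch of the identity (hypothetical helper, not in the tree):

// dp4a(0x01010101, u, 0) == b0 + b1 + b2 + b3, the four signed bytes of u,
// because every lane of the first operand is 1.
__device__ int byte_sum_ref(const int u) {
    const int8_t * b = (const int8_t *) &u;
    return b[0] + b[1] + b[2] + b[3];
}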
make/Makefile.rocm

...
@@ -76,6 +76,8 @@ GPU_COMPILER_CUFLAGS = \
	-DGGML_CUDA_MMV_Y=1 \
	-DGGML_SCHED_MAX_COPIES=4 \
	-DGGML_USE_HIP \
	--gpu-max-threads-per-block=1024 \
	-DCDNA \
	-DGGML_USE_LLAMAFILE \
	-DHIP_FAST_MATH \
	-D__HIP_PLATFORM_AMD__=1 \
...
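Adding -DCDNA to the ROCm compile flags is what arms the || defined(CDNA) case introduced in llama/ggml-cuda/common.cuh above, so CDNA-class accelerators (the MI100/MI200 family) take the __builtin_amdgcn_sdot4 path.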
test.py  0 → 100644

import requests

# Define the request URL
url = "http://localhost:11434/api/generate"

for num_batch in (1, 2, 4):
    for num_predict in (128, 128):  # 128 appears twice, so each batch size is measured twice
        # Define the request's JSON payload
        data = {
            "model": "deepseek-r1:70b",
            "prompt": "hi",
            "stream": False,
            "raw": True,
            "keep_alive": "1h",
            "options": {
                "num_predict": num_predict,
                "num_batch": num_batch,
                "seed": 42,
                "stop": []
            }
        }
        # Send the POST request
        response = requests.post(url, json=data)
        # Print the response content
        if response.status_code == 200:
            response_json = response.json()
            prompt_tokens = response_json["prompt_eval_count"]
            generate_tokens = response_json["eval_count"]
            # durations are reported in nanoseconds, hence the 10**9 factor
            prefill_throughput = response_json["prompt_eval_count"] / response_json["prompt_eval_duration"] * (10 ** 9)
            generate_throughput = response_json["eval_count"] / response_json["eval_duration"] * (10 ** 9)
            print(f"batch : {num_batch}\n"
                  f"prompt_tokens : {prompt_tokens}\n"
                  f"generate_tokens : {generate_tokens}\n"
                  f"prefill_throughput : {round(prefill_throughput, 2)}\n"
                  f"generate_throughput : {round(generate_throughput, 2)}")
            #print(response.json())
            print("====================================")
        else:
            print(f"Request failed, status code: {response.status_code}")
            print("====================================")
\ No newline at end of file
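The script assumes an ollama server already listening on localhost:11434 with deepseek-r1:70b pulled. ollama reports prompt_eval_duration and eval_duration in nanoseconds, which is why both throughput figures are scaled by 10**9 to obtain tokens per second.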
threadtest.py  0 → 100644

import requests
import time
import concurrent.futures

# Define the request URL and payload
for concurrent_requests in (1, 2, 4):
    for num_predict in (128, 128):  # 128 appears twice, so each concurrency level is measured twice
        url = "http://localhost:11434/api/generate"
        headers = {"Content-Type": "application/json"}
        # "hi " * 510 corresponds to 512 tokens; "hi " * 998 corresponds to 1000 tokens
        payload = {
            "model": "deepseek-r1:671b",
            "prompt": "hi",
            "stream": False,
            "raw": True,
            "keep_alive": "1h",
            "options": {
                "num_predict": num_predict,
                "seed": 42,
                "stop": []
            }
        }

        # Function that sends a single request
        def send_request():
            start_time = time.time()  # record the request start time
            response = requests.post(url, headers=headers, json=payload)  # send the request
            end_time = time.time()  # record the request end time
            if response.status_code == 200:
                response_data = response.json()
                completion_tokens = response_data["eval_count"]
                elapsed_time = end_time - start_time
                return completion_tokens, elapsed_time, response_data
            else:
                print(f"Request failed, status code: {response.status_code}")
                return 0, 0, None  # keep the tuple shape so the result loop below still works

        # Number of concurrent requests (adjust as needed)
        concurrent_requests = concurrent_requests

        # Record the overall start time
        total_start_time = time.time()

        # Send the requests concurrently from a thread pool
        with concurrent.futures.ThreadPoolExecutor(max_workers=concurrent_requests) as executor:
            futures = [executor.submit(send_request) for _ in range(concurrent_requests)]
            results = [future.result() for future in concurrent.futures.as_completed(futures)]

        # Record the overall end time
        total_end_time = time.time()

        for result in results:
            response_data = result[2]
            completion_tokens = result[0]
            elapsed_time = result[1]
            if response_data is None:
                continue  # skip failed requests
            print(f"Request finished: generated tokens = {completion_tokens}, "
                  f"elapsed = {elapsed_time:.2f} s, "
                  f"generation speed: {completion_tokens / elapsed_time:.2f}, "
                  f"response: {response_data}")

        # Total generated tokens and total elapsed time
        total_completion_tokens = sum(result[0] for result in results)
        total_elapsed_time = total_end_time - total_start_time

        # Overall generation speed (tokens/second)
        if total_elapsed_time > 0:
            overall_speed = total_completion_tokens / total_elapsed_time
            print(f"batch_size : {concurrent_requests}")
            print(f"Total generated tokens: {total_completion_tokens}")
            print(f"Total elapsed time: {total_elapsed_time:.2f} s")
            print(f"Overall generation speed: {overall_speed:.2f} tokens/s")
        else:
            print("Total elapsed time too short to compute generation speed")
        print("================num_predict====================")
    print("================concurrent_requests====================")
\ No newline at end of file
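As with test.py, a local ollama server with deepseek-r1:671b available is assumed. Since "raw": True sends the prompt verbatim without the model's chat template, and each request is timed around its own POST, the overall tokens/s figure reflects end-to-end serving throughput under concurrency rather than pure decode speed.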