Commit 09a693cf authored by xuxzh1

opt1

parent 84e5b6ff
FROM /models/DeepSeek-R1-GGUF/DeepSeek-R1-UD-IQ1_M/DeepSeek-R1-UD-IQ1_M.gguf
TEMPLATE "<|User|>{{ .Prompt }}<|Assistant|>"
\ No newline at end of file
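
This Modelfile points ollama at a local IQ1_M-quantized DeepSeek-R1 GGUF and replaces the default prompt template with the model's raw <|User|>/<|Assistant|> turn markers. A minimal registration sketch (the tag name is an assumption; the benchmark scripts below expect deepseek-r1:70b or deepseek-r1:671b): ollama create deepseek-r1:671b -f Modelfile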
@@ -332,7 +332,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
 static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
-#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
+#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2) || defined(CDNA)
     c = __builtin_amdgcn_sdot4(a, b, c, false);
 #elif defined(RDNA3)
     c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
......
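
For context: ggml_cuda_dp4a computes a 4-way signed-int8 dot product with 32-bit accumulate, and __builtin_amdgcn_sdot4 maps it onto AMD's hardware dot instruction. The guard previously matched gfx906/gfx908/gfx90a only through their per-arch macros; defined(CDNA) lets the build opt in explicitly via a compile flag (see the Makefile change below). A scalar sketch of the semantics, roughly what ggml's generic fallback does:

// Reference semantics of dp4a: four packed int8 lanes per operand,
// multiplied lane-wise and accumulated into a 32-bit integer.
static __device__ __forceinline__ int dp4a_ref(const int a, const int b, int c) {
    const int8_t * a8 = (const int8_t *) &a;
    const int8_t * b8 = (const int8_t *) &b;
    for (int k = 0; k < 4; ++k) {
        c += a8[k] * b8[k];
    }
    return c;
}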
@@ -106,29 +106,30 @@ static __global__ void mul_mat_vec_q(
     float tmp[ncols_y][rows_per_cuda_block] = {0.0f};
 
     const block_q8_1 * y = (const block_q8_1 *) vy;
 
     for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
         const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx
 
         // x block quant index when casting the quants to int
         const int kqs = vdr * (tid % (qi/vdr));
 
-#pragma unroll
+#pragma unroll ncols_y
         for (int j = 0; j < ncols_y; ++j) {
-#pragma unroll
+#pragma unroll rows_per_cuda_block
             for (int i = 0; i < rows_per_cuda_block; ++i) {
-                tmp[j][i] += vec_dot_q_cuda(vx, &y[j*blocks_per_col_y + kby], (row0 + i)*blocks_per_row_x + kbx, kqs);
+                //tmp[j][i] += vec_dot_q_cuda(vx, &y[j*blocks_per_col_y + kby], (row0 + i)*blocks_per_row_x + kbx, kqs);
+                atomicAdd(&tmp[j][i], vec_dot_q_cuda(vx, &y[j*blocks_per_col_y + kby], (row0 + i)*blocks_per_row_x + kbx, kqs));
             }
         }
     }
 
     __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE];
     if (threadIdx.y > 0) {
-#pragma unroll
+#pragma unroll ncols_y
         for (int j = 0; j < ncols_y; ++j) {
-#pragma unroll
+#pragma unroll rows_per_cuda_block
             for (int i = 0; i < rows_per_cuda_block; ++i) {
-                tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
+                //tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
+                atomicExch(&tmp_shared[threadIdx.y-1][j][i][threadIdx.x], tmp[j][i]);
             }
         }
     }
@@ -138,19 +139,22 @@ static __global__ void mul_mat_vec_q(
     }
 
     // sum up partial sums and write back result
-#pragma unroll
+#pragma unroll ncols_y
     for (int j = 0; j < ncols_y; ++j) {
-#pragma unroll
+#pragma unroll rows_per_cuda_block
         for (int i = 0; i < rows_per_cuda_block; ++i) {
-#pragma unroll
+#pragma unroll nwarps-1
             for (int l = 0; l < nwarps-1; ++l) {
-                tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
+                //tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
+                atomicAdd(&tmp[j][i], tmp_shared[l][j][i][threadIdx.x]);
             }
-            tmp[j][i] = warp_reduce_sum(tmp[j][i]);
+            //tmp[j][i] = warp_reduce_sum(tmp[j][i]);
+            atomicExch(&tmp[j][i], warp_reduce_sum(tmp[j][i]));
         }
 
         if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < nrows_dst)) {
-            dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
+            //dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
+            atomicExch(&dst[j*nrows_dst + row0 + threadIdx.x], tmp[j][threadIdx.x]);
         }
     }
 }
......
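
For reference, the warp_reduce_sum used above is ggml's shuffle-based butterfly reduction, roughly:

static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll
    for (int offset = WARP_SIZE/2; offset > 0; offset >>= 1) {
        // exchange partial sums between lanes whose IDs differ in one bit
        x += __shfl_xor_sync(0xffffffff, x, offset, 32);
    }
    return x;
}

After log2(WARP_SIZE) steps every lane holds the same total for each (j, i), which is why the thread with threadIdx.x == i can store tmp[j][threadIdx.x] to dst without further synchronization.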
@@ -387,16 +387,16 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
     float sumf_d = 0.0f;
     float sumf_m = 0.0f;
 
-#pragma unroll
+#pragma unroll QR4_K
     for (int i = 0; i < QR4_K; ++i) {
         const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
         const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
 
         const int dot1 = ggml_cuda_dp4a(v1i, u[2*i+1], ggml_cuda_dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
         const int dot2 = ggml_cuda_dp4a(0x01010101, u[2*i+1], ggml_cuda_dp4a(0x01010101, u[2*i+0], 0)); // sum of u
 
-        sumf_d += d8[i] * (dot1 * sc[i]);
-        sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
+        atomicAdd(&sumf_d, d8[i] * (dot1 * sc[i]));
+        atomicAdd(&sumf_m, d8[i] * (dot2 * m[i]));
+        //sumf_d += d8[i] * (dot1 * sc[i]);
+        //sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
     }
 
     const float2 dm4f = __half22float2(dm4);
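
Background on the nibble unpacking in this loop: each 32-bit word v[k] packs eight 4-bit q4_K quants, and with QR4_K == 2 the shift by 4*i plus the 0x0F0F0F0F mask peels off either the low or the high nibble of every byte as four packed 8-bit lanes, ready for dp4a against the q8_1 values in u. A worked example with an illustrative value:

// v packs bytes 0x21 0x31 0x41 0x51 (LSB first):
// low nibbles {1,1,1,1}, high nibbles {2,3,4,5}
const int v  = 0x51413121;
const int lo = (v >> 0) & 0x0F0F0F0F; // 0x01010101 -> lanes {1,1,1,1}
const int hi = (v >> 4) & 0x0F0F0F0F; // 0x05040302 -> lanes {2,3,4,5}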
@@ -728,6 +728,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     const uint8_t * sc = (const uint8_t *)aux;
     const uint8_t * m = sc + 2;
 
+#pragma unroll QR4_K
     for (int i = 0; i < QR4_K; ++i) {
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
         d8[i] = __low2float(bq8i->ds);
@@ -736,7 +737,6 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
         u[2*i+0] = q8[0];
         u[2*i+1] = q8[4];
     }
 
     return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
 }
......
@@ -76,6 +76,8 @@ GPU_COMPILER_CUFLAGS = \
 	-DGGML_CUDA_MMV_Y=1 \
 	-DGGML_SCHED_MAX_COPIES=4 \
 	-DGGML_USE_HIP \
+	--gpu-max-threads-per-block=1024 \
+	-DCDNA \
 	-DGGML_USE_LLAMAFILE \
 	-DHIP_FAST_MATH \
 	-D__HIP_PLATFORM_AMD__=1 \
......
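
Two notes on these flags: -DCDNA is what activates the sdot4 branch added to ggml_cuda_dp4a in the first hunk (nothing auto-detects the architecture, so the define asserts the build targets gfx908/gfx90a-class GPUs), and --gpu-max-threads-per-block=1024 raises HIP-Clang's default kernel launch bounds from 256 to the 1024 threads per block that the CUDA backend assumes.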
import requests

# URL of the local ollama generate endpoint
url = "http://localhost:11434/api/generate"

for num_batch in (1, 2, 4):
    for num_predict in (128, 128):  # each setting is run twice
        # JSON payload for the request
        data = {
            "model" : "deepseek-r1:70b",
            "prompt" : "hi",
            "stream" : False,
            "raw" : True,
            "keep_alive" : "1h",
            "options": {
                "num_predict": num_predict,
                "num_batch": num_batch,
                "seed" : 42,
                "stop" : []
            }
        }
        # Send the POST request
        response = requests.post(url, json=data)
        # Print the throughput metrics
        if response.status_code == 200:
            response_json = response.json()
            prompt_tokens = response_json["prompt_eval_count"]
            generate_tokens = response_json["eval_count"]
            # ollama reports durations in nanoseconds, hence the 10**9 factor
            prefill_throughput = response_json["prompt_eval_count"] / response_json["prompt_eval_duration"] * (10**9)
            generate_throughput = response_json["eval_count"] / response_json["eval_duration"] * (10**9)
            print(f"batch : {num_batch}\nprompt_tokens : {prompt_tokens}\ngenerate_tokens : {generate_tokens}\nprefill_throughput : {round(prefill_throughput, 2)}\ngenerate_throughput : {round(generate_throughput, 2)}")
            #print(response.json())
            print("====================================")
        else:
            print(f"Request failed, status code: {response.status_code}")
            print("====================================")
\ No newline at end of file
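
The script above sweeps ollama's num_batch option and reports prefill (prompt_eval_*) and decode (eval_*) throughput separately; the repeated 128 in the num_predict tuple just runs each configuration twice.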
import requests
import time
import concurrent.futures

# Sweep the number of concurrent requests; adjust the tuples as needed
for concurrent_requests in (1, 2, 4):
    for num_predict in (128, 128):  # each setting is run twice
        url = "http://localhost:11434/api/generate"
        headers = {
            "Content-Type": "application/json"
        }
        # "hi " * 510 corresponds to 512 tokens, "hi " * 998 to 1000 tokens
        payload = {
            "model" : "deepseek-r1:671b",
            "prompt" : "hi",
            "stream" : False,
            "raw" : True,
            "keep_alive" : "1h",
            "options": {
                "num_predict": num_predict,
                "seed" : 42,
                "stop" : []
            }
        }

        # Send a single request and time it
        def send_request():
            start_time = time.time()  # request start time
            response = requests.post(url, headers=headers, json=payload)
            end_time = time.time()    # request end time
            if response.status_code == 200:
                response_data = response.json()
                completion_tokens = response_data["eval_count"]
                elapsed_time = end_time - start_time
                return completion_tokens, elapsed_time, response_data
            else:
                print(f"Request failed, status code: {response.status_code}")
                return 0, 0, None

        # Record the overall start time
        total_start_time = time.time()

        # Fire the requests concurrently from a thread pool
        with concurrent.futures.ThreadPoolExecutor(max_workers=concurrent_requests) as executor:
            futures = [executor.submit(send_request) for _ in range(concurrent_requests)]
            results = [future.result() for future in concurrent.futures.as_completed(futures)]

        # Record the overall end time
        total_end_time = time.time()

        for completion_tokens, elapsed_time, response_data in results:
            if elapsed_time > 0:
                print(f"Request finished: generated tokens = {completion_tokens}, elapsed = {elapsed_time:.2f} s, speed = {completion_tokens/elapsed_time:.2f}, response: {response_data}")

        # Total generated tokens and total wall-clock time
        total_completion_tokens = sum(result[0] for result in results)
        total_elapsed_time = total_end_time - total_start_time

        # Aggregate generation speed (tokens/second)
        if total_elapsed_time > 0:
            overall_speed = total_completion_tokens / total_elapsed_time
            print(f"batch_size : {concurrent_requests}")
            print(f"Total generated tokens: {total_completion_tokens}")
            print(f"Total elapsed time: {total_elapsed_time:.2f} s")
            print(f"Overall generation speed: {overall_speed:.2f} tokens/s")
        else:
            print("Total elapsed time too short to compute a speed")
        print("================num_predict====================")
    print("================concurrent_requests====================")
\ No newline at end of file
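
Note that overall_speed divides the summed eval_count of all in-flight requests by the batch's wall-clock time, so it measures aggregate server throughput at each concurrency level; the per-request lines report individual latency and speed.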