Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tsoc
hg-misc-tools
Commits
cc91f72b
Commit
cc91f72b
authored
Jan 30, 2026
by
one
Browse files
Update GEMV kernels
parent
a781cad3
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
147 additions
and
93 deletions
+147
-93
gemv/gemv_bf16.h
gemv/gemv_bf16.h
+77
-67
gemv/gemv_utils.h
gemv/gemv_utils.h
+2
-11
gemv/main.cpp
gemv/main.cpp
+16
-3
gemv/run-all.sh
gemv/run-all.sh
+18
-12
gemv/run-rocblas.sh
gemv/run-rocblas.sh
+34
-0
No files found.
gemv/gemv_bf16.h
View file @
cc91f72b
...
...
@@ -17,11 +17,11 @@
* 平台相关的 Shared Memory / LDS
*/
#if defined(__HIP_PLATFORM_AMD__)
// Hygon/AMD: 64KB LDS per CU
constexpr
int
MAX_SHMEM_BYTES_PER_BLOCK
=
65536
;
// Hygon/AMD: 64KB LDS per CU
constexpr
int
MAX_SHMEM_BYTES_PER_BLOCK
=
65536
;
#else
// Nvidia: 48KB
constexpr
int
MAX_SHMEM_BYTES_PER_BLOCK
=
49152
;
// Nvidia: 48KB
constexpr
int
MAX_SHMEM_BYTES_PER_BLOCK
=
49152
;
#endif
/**
...
...
@@ -91,7 +91,7 @@ __device__ __forceinline__ bf16_x8 load_128b(const hip_bfloat16 *src) {
}
}
/** y = alpha * A^T * x +
0
* y
/** y = alpha * A^T * x +
beta
* y
* Naive 实现:
* - JKI
* - 每个线程算一个输出,即 I 循环的一次迭代
...
...
@@ -99,7 +99,7 @@ __device__ __forceinline__ bf16_x8 load_128b(const hip_bfloat16 *src) {
__global__
void
gemv_bf16_TN_naive
(
int
M
,
int
K
,
const
float
alpha
,
const
hip_bfloat16
*
__restrict__
A
,
int
lda
,
const
hip_bfloat16
*
__restrict__
x
,
const
float
beta
,
// 0
const
float
beta
,
hip_bfloat16
*
__restrict__
y
)
{
int
m
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
// output
if
(
m
>=
M
)
...
...
@@ -113,12 +113,14 @@ __global__ void gemv_bf16_TN_naive(int M, int K, const float alpha,
float
val_x
=
static_cast
<
float
>
(
x
[
k
]);
sum
+=
val_a
*
val_x
;
}
y
[
m
]
=
hip_bfloat16
(
alpha
*
sum
);
float
y_original
=
static_cast
<
float
>
(
y
[
m
]);
y
[
m
]
=
hip_bfloat16
(
alpha
*
sum
+
beta
*
y_original
);
return
;
}
/** y = alpha * A^T * x +
0
* y
/** y = alpha * A^T * x +
beta
* y
* 向量化实现:
* - JKI
* - 每个线程算一个输出,即 I 循环的一次迭代。
...
...
@@ -128,7 +130,7 @@ template <bool USE_NTL = false>
__global__
void
gemv_bf16_TN_vec
(
int
M
,
int
K
,
const
float
alpha
,
const
hip_bfloat16
*
__restrict__
A
,
int
lda
,
const
hip_bfloat16
*
__restrict__
x
,
const
float
beta
,
// 0
const
float
beta
,
hip_bfloat16
*
__restrict__
y
)
{
int
m
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
// output
if
(
m
>=
M
)
...
...
@@ -149,12 +151,13 @@ __global__ void gemv_bf16_TN_vec(int M, int K, const float alpha,
}
}
y
[
m
]
=
hip_bfloat16
(
alpha
*
sum
);
float
y_original
=
static_cast
<
float
>
(
y
[
m
]);
y
[
m
]
=
hip_bfloat16
(
alpha
*
sum
+
beta
*
y_original
);
return
;
}
/** y = alpha * A^T * x +
0
* y
/** y = alpha * A^T * x +
beta
* y
* Warp 归约:
* - JKI
* - 每个 warp 算一个输出,相当于用 warp size 作为 stride 沿着 K 方向 tiling。
...
...
@@ -163,7 +166,7 @@ __global__ void gemv_bf16_TN_vec(int M, int K, const float alpha,
__global__
void
gemv_bf16_TN_warp
(
int
M
,
int
K
,
const
float
alpha
,
const
hip_bfloat16
*
__restrict__
A
,
int
lda
,
const
hip_bfloat16
*
__restrict__
x
,
const
float
beta
,
// 0
const
float
beta
,
hip_bfloat16
*
__restrict__
y
)
{
int
warp_id
=
threadIdx
.
x
/
WARP_SIZE
;
int
lane_id
=
threadIdx
.
x
%
WARP_SIZE
;
...
...
@@ -172,9 +175,9 @@ __global__ void gemv_bf16_TN_warp(int M, int K, const float alpha,
if
(
m
>=
M
)
return
;
const
int
stride
=
WARP_SIZE
;
const
hip_bfloat16
*
row_ptr
=
A
+
m
*
lda
;
float
sum
=
0.0
f
;
const
int
stride
=
WARP_SIZE
;
for
(
int
k
=
lane_id
;
k
<
K
;
k
+=
stride
)
{
float
val_a
=
static_cast
<
float
>
(
row_ptr
[
k
]);
...
...
@@ -189,24 +192,24 @@ __global__ void gemv_bf16_TN_warp(int M, int K, const float alpha,
// Lane 0 负责写回
if
(
lane_id
==
0
)
{
y
[
m
]
=
hip_bfloat16
(
alpha
*
sum
);
float
y_original
=
static_cast
<
float
>
(
y
[
m
]);
y
[
m
]
=
hip_bfloat16
(
alpha
*
sum
+
beta
*
y_original
);
}
return
;
}
/** y = alpha * A^T * x +
0
* y
/** y = alpha * A^T * x +
beta
* y
* Vec + warp:
* - JKI
* - 每个线程每次读 VEC_WIDTH 个 bf16 数据(矩阵 A 可用 non-temporal load)。
* - 每个 warp 算一个输出,warp 内归约。
*/
template
<
bool
USE_NTL
=
false
>
__global__
void
gemv_bf16_TN_vec_warp
(
int
M
,
int
K
,
const
float
alpha
,
const
hip_bfloat16
*
__restrict__
A
,
int
lda
,
const
hip_bfloat16
*
__restrict__
x
,
const
float
beta
,
// 0
__global__
void
gemv_bf16_TN_vec_warp
(
int
M
,
int
K
,
const
float
alpha
,
const
hip_bfloat16
*
__restrict__
A
,
int
lda
,
const
hip_bfloat16
*
__restrict__
x
,
const
float
beta
,
hip_bfloat16
*
__restrict__
y
)
{
int
warp_id
=
threadIdx
.
x
/
WARP_SIZE
;
int
lane_id
=
threadIdx
.
x
%
WARP_SIZE
;
...
...
@@ -215,9 +218,9 @@ __global__ void gemv_bf16_TN_vec_warp(int M, int K, const float alpha,
if
(
m
>=
M
)
return
;
const
int
stride
=
WARP_SIZE
*
VEC_WIDTH
;
const
hip_bfloat16
*
row_ptr
=
A
+
m
*
lda
;
float
sum
=
0.0
f
;
const
int
stride
=
WARP_SIZE
*
VEC_WIDTH
;
for
(
int
k
=
lane_id
*
VEC_WIDTH
;
k
<
K
;
k
+=
stride
)
{
bf16_x8
a_vec
=
load_128b
<
USE_NTL
>
(
&
row_ptr
[
k
]);
...
...
@@ -237,13 +240,14 @@ __global__ void gemv_bf16_TN_vec_warp(int M, int K, const float alpha,
// Lane 0 负责写回
if
(
lane_id
==
0
)
{
y
[
m
]
=
hip_bfloat16
(
alpha
*
sum
);
float
y_original
=
static_cast
<
float
>
(
y
[
m
]);
y
[
m
]
=
hip_bfloat16
(
alpha
*
sum
+
beta
*
y_original
);
}
return
;
}
/** y = alpha * A^T * x +
0
* y
/** y = alpha * A^T * x +
beta
* y
* 单线程 vec + warp 处理多行:
* - JKI
* - 每个线程每次读 VEC_WIDTH 个 bf16 数据(矩阵 A 可用 non-temporal load)。
...
...
@@ -251,11 +255,10 @@ __global__ void gemv_bf16_TN_vec_warp(int M, int K, const float alpha,
* - 每个 lane 维护 ROWS_PER_WARP 个累加器。
*/
template
<
bool
USE_NTL
=
false
,
int
ROWS_PER_WARP
=
2
>
__global__
void
gemv_bf16_TN_vec_warp_mr
(
int
M
,
int
K
,
const
float
alpha
,
const
hip_bfloat16
*
__restrict__
A
,
int
lda
,
const
hip_bfloat16
*
__restrict__
x
,
const
float
beta
,
// 0
__global__
void
gemv_bf16_TN_vec_warp_mr
(
int
M
,
int
K
,
const
float
alpha
,
const
hip_bfloat16
*
__restrict__
A
,
int
lda
,
const
hip_bfloat16
*
__restrict__
x
,
const
float
beta
,
hip_bfloat16
*
__restrict__
y
)
{
int
warp_id
=
threadIdx
.
x
/
WARP_SIZE
;
int
lane_id
=
threadIdx
.
x
%
WARP_SIZE
;
...
...
@@ -264,16 +267,20 @@ __global__ void gemv_bf16_TN_vec_warp_mr(int M, int K, const float alpha,
int
m_base
=
blockIdx
.
x
*
(
blockDim
.
x
/
WARP_SIZE
)
*
ROWS_PER_WARP
+
warp_id
*
ROWS_PER_WARP
;
// 预先计算每一行的指针和原始 y 值
const
hip_bfloat16
*
row_ptr
[
ROWS_PER_WARP
];
float
y_original
[
ROWS_PER_WARP
];
// 每个 lane 维护 ROWS_PER_WARP 个累加器
float
sum
[
ROWS_PER_WARP
]
=
{
0.0
f
};
// 预先计算每一行的指针
const
hip_bfloat16
*
row_ptr
[
ROWS_PER_WARP
];
#pragma unroll
for
(
int
r
=
0
;
r
<
ROWS_PER_WARP
;
++
r
)
{
int
m
=
m_base
+
r
;
// 越界时指向 A,确保地址有效,消除后续分支
row_ptr
[
r
]
=
(
m
<
M
)
?
(
A
+
m
*
lda
)
:
A
;
// 读取有效的原始 y 值
y_original
[
r
]
=
(
m
<
M
)
?
static_cast
<
float
>
(
y
[
m
])
:
0.0
f
;
}
const
int
stride
=
WARP_SIZE
*
VEC_WIDTH
;
...
...
@@ -311,7 +318,7 @@ __global__ void gemv_bf16_TN_vec_warp_mr(int M, int K, const float alpha,
if
(
lane_id
==
0
)
{
int
m
=
m_base
+
r
;
if
(
m
<
M
)
{
y
[
m
]
=
hip_bfloat16
(
alpha
*
sum
[
r
]);
y
[
m
]
=
hip_bfloat16
(
alpha
*
sum
[
r
]
+
beta
*
y_original
[
r
]
);
}
}
}
...
...
@@ -319,7 +326,7 @@ __global__ void gemv_bf16_TN_vec_warp_mr(int M, int K, const float alpha,
return
;
}
/** y = alpha * A^T * x +
0
* y
/** y = alpha * A^T * x +
beta
* y
* 单线程 vec + warp + 主循环 unroll:
* - JKI
* - 每个线程每次读 VEC_WIDTH 个 bf16 数据(矩阵 A 可用 non-temporal load)。
...
...
@@ -327,12 +334,11 @@ __global__ void gemv_bf16_TN_vec_warp_mr(int M, int K, const float alpha,
* - 主循环 unrolling。
*/
template
<
bool
USE_NTL
=
false
,
int
UNROLL
=
4
>
__global__
void
gemv_bf16_TN_vec_warp_unroll
(
int
M
,
int
K
,
const
float
alpha
,
const
hip_bfloat16
*
__restrict__
A
,
int
lda
,
__global__
void
gemv_bf16_TN_vec_warp_unroll
(
int
M
,
int
K
,
const
float
alpha
,
const
hip_bfloat16
*
__restrict__
A
,
int
lda
,
const
hip_bfloat16
*
__restrict__
x
,
const
float
beta
,
// 0
hip_bfloat16
*
__restrict__
y
)
{
const
float
beta
,
hip_bfloat16
*
__restrict__
y
)
{
int
warp_id
=
threadIdx
.
x
/
WARP_SIZE
;
int
lane_id
=
threadIdx
.
x
%
WARP_SIZE
;
int
m
=
blockIdx
.
x
*
(
blockDim
.
x
/
WARP_SIZE
)
+
warp_id
;
...
...
@@ -340,7 +346,6 @@ __global__ void gemv_bf16_TN_vec_warp_unroll(int M, int K, const float alpha,
if
(
m
>=
M
)
return
;
const
int
stride
=
WARP_SIZE
*
VEC_WIDTH
*
UNROLL
;
const
hip_bfloat16
*
row_ptr
=
A
+
m
*
lda
;
float
sum
=
0.0
f
;
...
...
@@ -352,6 +357,7 @@ __global__ void gemv_bf16_TN_vec_warp_unroll(int M, int K, const float alpha,
int
k
=
0
;
// 主循环
const
int
stride
=
WARP_SIZE
*
VEC_WIDTH
*
UNROLL
;
for
(;
k
<=
K
-
stride
;
k
+=
stride
)
{
#pragma unroll
for
(
int
u
=
0
;
u
<
UNROLL
;
++
u
)
{
...
...
@@ -392,13 +398,14 @@ __global__ void gemv_bf16_TN_vec_warp_unroll(int M, int K, const float alpha,
// Lane 0 负责写回
if
(
lane_id
==
0
)
{
y
[
m
]
=
hip_bfloat16
(
alpha
*
sum
);
float
y_original
=
static_cast
<
float
>
(
y
[
m
]);
y
[
m
]
=
hip_bfloat16
(
alpha
*
sum
+
beta
*
y_original
);
}
return
;
}
/** y = alpha * A^T * x +
0
* y
/** y = alpha * A^T * x +
beta
* y
* 单线程 vec + warp + shmem 缓存 x:
* - JKI
* - 每个线程每次读 VEC_WIDTH 个 bf16 数据(矩阵 A 可用 non-temporal load)。
...
...
@@ -406,11 +413,10 @@ __global__ void gemv_bf16_TN_vec_warp_unroll(int M, int K, const float alpha,
* - shmem 缓存 x,分块加载。
*/
template
<
bool
USE_NTL
=
false
,
int
TILE_K
=
4096
>
__global__
void
gemv_bf16_TN_vec_warp_shm
(
int
M
,
int
K
,
const
float
alpha
,
const
hip_bfloat16
*
__restrict__
A
,
int
lda
,
const
hip_bfloat16
*
__restrict__
x
,
const
float
beta
,
// 0
__global__
void
gemv_bf16_TN_vec_warp_shm
(
int
M
,
int
K
,
const
float
alpha
,
const
hip_bfloat16
*
__restrict__
A
,
int
lda
,
const
hip_bfloat16
*
__restrict__
x
,
const
float
beta
,
hip_bfloat16
*
__restrict__
y
)
{
int
warp_id
=
threadIdx
.
x
/
WARP_SIZE
;
int
lane_id
=
threadIdx
.
x
%
WARP_SIZE
;
...
...
@@ -419,8 +425,9 @@ __global__ void gemv_bf16_TN_vec_warp_shm(int M, int K, const float alpha,
// 缓存 x 的一个 tile
__shared__
hip_bfloat16
x_tile
[
TILE_K
];
// 不会在 m>=M 时访问 A,因此不需要分支
const
hip_bfloat16
*
row_ptr
=
A
+
m
*
lda
;
// 预先计算每一行的指针和原始 y 值
const
hip_bfloat16
*
row_ptr
=
A
+
m
*
lda
;
// 不需要分支
float
y_original
=
(
m
<
M
)
?
static_cast
<
float
>
(
y
[
m
])
:
0.0
f
;
float
sum
=
0.0
f
;
// 外层循环遍历 K 维度的所有 tile
...
...
@@ -485,13 +492,13 @@ __global__ void gemv_bf16_TN_vec_warp_shm(int M, int K, const float alpha,
// Lane 0 写回结果
if
(
lane_id
==
0
)
{
y
[
m
]
=
hip_bfloat16
(
alpha
*
sum
);
y
[
m
]
=
hip_bfloat16
(
alpha
*
sum
+
beta
*
y_original
);
}
return
;
}
/** y = alpha * A^T * x +
0
* y
/** y = alpha * A^T * x +
beta
* y
* 单线程 vec + warp + 主循环 unroll + shmem 缓存 x:
* - JKI
* - 每个线程每次读 VEC_WIDTH 个 bf16 数据(矩阵 A 可用 non-temporal load)。
...
...
@@ -502,7 +509,7 @@ __global__ void gemv_bf16_TN_vec_warp_shm(int M, int K, const float alpha,
template
<
bool
USE_NTL
=
false
,
int
UNROLL
=
4
,
int
TILE_K
=
4096
>
__global__
void
gemv_bf16_TN_vec_warp_unroll_shm
(
int
M
,
int
K
,
const
float
alpha
,
const
hip_bfloat16
*
__restrict__
A
,
int
lda
,
const
hip_bfloat16
*
__restrict__
x
,
const
float
beta
,
// 0
int
lda
,
const
hip_bfloat16
*
__restrict__
x
,
const
float
beta
,
hip_bfloat16
*
__restrict__
y
)
{
int
warp_id
=
threadIdx
.
x
/
WARP_SIZE
;
int
lane_id
=
threadIdx
.
x
%
WARP_SIZE
;
...
...
@@ -511,8 +518,9 @@ __global__ void gemv_bf16_TN_vec_warp_unroll_shm(
// 缓存 x 的一个 tile
__shared__
hip_bfloat16
x_tile
[
TILE_K
];
// 不会在 m>=M 时访问 A,因此不需要分支
const
hip_bfloat16
*
row_ptr
=
A
+
m
*
lda
;
// 预先计算每一行的指针和原始 y 值
const
hip_bfloat16
*
row_ptr
=
A
+
m
*
lda
;
// 不需要分支
float
y_original
=
(
m
<
M
)
?
static_cast
<
float
>
(
y
[
m
])
:
0.0
f
;
float
sum
=
0.0
f
;
// 外层循环遍历 K 维度的所有 tile
...
...
@@ -603,13 +611,13 @@ __global__ void gemv_bf16_TN_vec_warp_unroll_shm(
// Lane 0 写回结果
if
(
lane_id
==
0
)
{
y
[
m
]
=
hip_bfloat16
(
alpha
*
sum
);
y
[
m
]
=
hip_bfloat16
(
alpha
*
sum
+
beta
*
y_original
);
}
return
;
}
/** y = alpha * A^T * x +
0
* y
/** y = alpha * A^T * x +
beta
* y
* 单线程 vec + warp 处理多行 + shmem 缓存 x:
* - JKI
* - 每个线程每次读 VEC_WIDTH 个 bf16 数据(矩阵 A 可用 non-temporal load)。
...
...
@@ -618,12 +626,11 @@ __global__ void gemv_bf16_TN_vec_warp_unroll_shm(
* - shmem 缓存 x,分块加载。
*/
template
<
bool
USE_NTL
=
false
,
int
TILE_K
=
4096
,
int
ROWS_PER_WARP
=
2
>
__global__
void
gemv_bf16_TN_vec_warp_mr_shm
(
int
M
,
int
K
,
const
float
alpha
,
const
hip_bfloat16
*
__restrict__
A
,
int
lda
,
__global__
void
gemv_bf16_TN_vec_warp_mr_shm
(
int
M
,
int
K
,
const
float
alpha
,
const
hip_bfloat16
*
__restrict__
A
,
int
lda
,
const
hip_bfloat16
*
__restrict__
x
,
const
float
beta
,
// 0
hip_bfloat16
*
__restrict__
y
)
{
const
float
beta
,
hip_bfloat16
*
__restrict__
y
)
{
int
warp_id
=
threadIdx
.
x
/
WARP_SIZE
;
int
lane_id
=
threadIdx
.
x
%
WARP_SIZE
;
...
...
@@ -637,13 +644,16 @@ __global__ void gemv_bf16_TN_vec_warp_mr_shm(int M, int K, const float alpha,
// 每个 lane 维护 ROWS_PER_WARP 个累加器
float
sum
[
ROWS_PER_WARP
]
=
{
0.0
f
};
// 预先计算每一行的指针
// 预先计算每一行的指针
和原始 y 值
const
hip_bfloat16
*
row_ptr
[
ROWS_PER_WARP
];
float
y_original
[
ROWS_PER_WARP
];
#pragma unroll
for
(
int
r
=
0
;
r
<
ROWS_PER_WARP
;
++
r
)
{
int
m
=
m_base
+
r
;
// 越界时指向 A,确保地址有效,消除后续分支
row_ptr
[
r
]
=
(
m
<
M
)
?
(
A
+
m
*
lda
)
:
A
;
// 读取有效的原始 y 值
y_original
[
r
]
=
(
m
<
M
)
?
static_cast
<
float
>
(
y
[
m
])
:
0.0
f
;
}
// 外层循环遍历 K 维度的所有 tile
...
...
@@ -720,7 +730,7 @@ __global__ void gemv_bf16_TN_vec_warp_mr_shm(int M, int K, const float alpha,
if
(
lane_id
==
0
)
{
int
m
=
m_base
+
r
;
if
(
m
<
M
)
{
y
[
m
]
=
hip_bfloat16
(
alpha
*
sum
[
r
]);
y
[
m
]
=
hip_bfloat16
(
alpha
*
sum
[
r
]
+
beta
*
y_original
[
r
]
);
}
}
}
...
...
gemv/gemv_utils.h
View file @
cc91f72b
...
...
@@ -111,9 +111,6 @@ inline void run_benchmark(int warmups, int loops,
float
alpha
,
const
hip_bfloat16
*
A
,
int
lda
,
const
hip_bfloat16
*
x
,
float
beta
,
hip_bfloat16
*
y
,
bool
do_verify
)
{
std
::
cout
<<
"GEMV Benchmarks"
<<
std
::
endl
;
hipEvent_t
start
,
stop
;
checkHipErrors
(
hipEventCreate
(
&
start
));
checkHipErrors
(
hipEventCreate
(
&
stop
));
...
...
@@ -141,12 +138,6 @@ inline void run_benchmark(int warmups, int loops,
// 表头
printf
(
"%s
\n
"
,
std
::
string
(
w_table
,
'-'
).
c_str
());
printf
(
"M=%d, K=%d, N=1
\n
"
,
M
,
K
);
printf
(
"lda=%d
\n
"
,
lda
);
printf
(
"sizeof(A)=%lu MB
\n
"
,
M
*
lda
*
sizeof
(
hip_bfloat16
)
/
1024
/
1024
);
printf
(
"L2 cache=%d MB
\n
"
,
get_l2_cache_size
());
printf
(
"Warmups=%d, Loops=%d
\n
"
,
warmups
,
loops
);
printf
(
"%s
\n
"
,
std
::
string
(
w_table
,
'-'
).
c_str
());
printf
(
"%-38s %10s %10s %10s %8s
\n
"
,
"Kernel Name"
,
"Time (us)"
,
"GFLOPS"
,
"BW (GB/s)"
,
"Result"
);
...
...
@@ -195,9 +186,9 @@ inline void run_benchmark(int warmups, int loops,
// 4. Metrics
double
gflops
=
(
2.0
*
M
*
K
)
/
(
avg_ms
*
1e-3
)
/
1e9
;
// Bandwidth = Read A + Read x + Write y
// Bandwidth = Read A + Read x +
Read y +
Write y
// A: M*K, x: K, y: M
double
bytes_moved
=
(
double
)(
M
*
K
+
K
+
M
)
*
sizeof
(
hip_bfloat16
);
double
bytes_moved
=
(
double
)(
M
*
K
+
K
+
M
+
M
)
*
sizeof
(
hip_bfloat16
);
double
bw
=
bytes_moved
/
(
avg_ms
*
1e-3
)
/
1e9
;
printf
(
"%-38s %10.1f %10.2f %10.2f %8s
\n
"
,
k
.
name
.
c_str
(),
avg_ms
*
1e3
,
...
...
gemv/main.cpp
View file @
cc91f72b
...
...
@@ -7,10 +7,10 @@ int main(int argc, char **argv) {
float
alpha
=
1.0
f
;
float
beta
=
0.0
f
;
int
M
=
11264
;
int
K
=
4096
;
// int N = 1; // Unused
int
K
=
4096
;
int
lda
=
K
;
int
block_size
=
256
;
int
block_size
=
128
;
if
(
char
*
value
=
getCmdOption
(
argv
,
argv
+
argc
,
"--warmups"
))
{
warmups
=
std
::
stoi
(
value
);
...
...
@@ -28,6 +28,10 @@ int main(int argc, char **argv) {
alpha
=
std
::
stof
(
value
);
}
if
(
char
*
value
=
getCmdOption
(
argv
,
argv
+
argc
,
"--beta"
))
{
beta
=
std
::
stof
(
value
);
}
if
(
char
*
value
=
getCmdOption
(
argv
,
argv
+
argc
,
"-M"
))
{
M
=
std
::
stoi
(
value
);
}
...
...
@@ -82,7 +86,7 @@ int main(int argc, char **argv) {
constexpr
int
UNROLL
=
4
;
constexpr
int
ROWS_PER_WARP
=
2
;
#if defined(__HIP_PLATFORM_AMD__)
constexpr
int
TILE_K
=
calculate_tile_k
<
8
>
(
4
);
constexpr
int
TILE_K
=
calculate_tile_k
<
8
>
(
8
);
#else
constexpr
int
TILE_K
=
calculate_tile_k
<
8
>
(
1
);
#endif
...
...
@@ -253,6 +257,15 @@ int main(int argc, char **argv) {
<<<
grid
,
block_size
>>>
(
M
,
K
,
alpha
,
A
,
lda
,
x
,
beta
,
y
);
}});
// 打印信息
printf
(
"GEMV Benchmarks: y = alpha * A^T * x + beta * y
\n
"
);
printf
(
"Block size=%d
\n
"
,
block_size
);
printf
(
"alpha=%.2f, beta=%.2f
\n
"
,
alpha
,
beta
);
printf
(
"M=%d, N=1, K=%d, lda=%d
\n
"
,
M
,
K
,
lda
);
printf
(
"sizeof(A)=%lu MB
\n
"
,
M
*
lda
*
sizeof
(
hip_bfloat16
)
/
1024
/
1024
);
printf
(
"L2 cache=%d MB
\n
"
,
get_l2_cache_size
());
printf
(
"Warmups=%d, Loops=%d
\n
"
,
warmups
,
loops
);
// 运行所有测试
run_benchmark
(
warmups
,
loops
,
kernels
,
M
,
K
,
alpha
,
d_A
,
lda
,
d_x
,
beta
,
d_y
,
do_verify
);
...
...
gemv/run-all.sh
View file @
cc91f72b
...
...
@@ -7,21 +7,27 @@ BIND_CMD="numactl -N 0 -m 0"
make clean
CXX
=
hipcc make
GPU_ARCH
=
gfx936
# CXX=nvcc make GPU_ARCH=sm_80
W1
=
"--verify 1 -M 11264 -K 4096 --alpha 1 --beta 0 -B 128"
W2
=
"--verify 1 -M 4096 -K 11264 --alpha 1 --beta 0 -B 128"
W3
=
"--verify 1 -M 12288 -K 4096 --alpha 1 --beta 0 -B 128"
W4
=
"--verify 1 -M 4096 -K 4096 --alpha 1 --beta 1 -B 128"
if
[[
"
$*
"
==
*
"--pmc"
*
]]
;
then
PROF_CMD
=
"hipprof --trace-off --pmc"
${
PROF_CMD
}
-o
log/pmc-w1
${
BIND_CMD
}
./gemv_bench
--warmups
10
--loops
20
--verify
1
-M
11264
-K
4096
${
PROF_CMD
}
-o
log/pmc-w2
${
BIND_CMD
}
./gemv_bench
--warmups
10
--loops
20
--verify
1
-M
4096
-K
11264
${
PROF_CMD
}
-o
log/pmc-w3
${
BIND_CMD
}
./gemv_bench
--warmups
10
--loops
20
--verify
1
-M
12288
-K
4096
${
PROF_CMD
}
-o
log/pmc-w4
${
BIND_CMD
}
./gemv_bench
--warmups
10
--loops
20
--verify
1
-M
4096
-K
4096
${
PROF_CMD
}
-o
log/pmc-w1
${
BIND_CMD
}
./gemv_bench
--warmups
10
--loops
20
${
W1
}
${
PROF_CMD
}
-o
log/pmc-w2
${
BIND_CMD
}
./gemv_bench
--warmups
10
--loops
20
${
W2
}
${
PROF_CMD
}
-o
log/pmc-w3
${
BIND_CMD
}
./gemv_bench
--warmups
10
--loops
20
${
W3
}
${
PROF_CMD
}
-o
log/pmc-w4
${
BIND_CMD
}
./gemv_bench
--warmups
10
--loops
20
${
W4
}
elif
[[
"
$*
"
==
*
"--trace"
*
]]
;
then
PROF_CMD
=
"hipprof --hip-trace"
${
PROF_CMD
}
-o
log/trace-w1
${
BIND_CMD
}
./gemv_bench
--warmups
100
--loops
1000
--verify
1
-M
11264
-K
4096
${
PROF_CMD
}
-o
log/trace-w2
${
BIND_CMD
}
./gemv_bench
--warmups
100
--loops
1000
--verify
1
-M
4096
-K
11264
${
PROF_CMD
}
-o
log/trace-w3
${
BIND_CMD
}
./gemv_bench
--warmups
100
--loops
1000
--verify
1
-M
12288
-K
4096
${
PROF_CMD
}
-o
log/trace-w4
${
BIND_CMD
}
./gemv_bench
--warmups
100
--loops
1000
--verify
1
-M
4096
-K
4096
${
PROF_CMD
}
-o
log/trace-w1
${
BIND_CMD
}
./gemv_bench
--warmups
100
--loops
1000
${
W1
}
${
PROF_CMD
}
-o
log/trace-w2
${
BIND_CMD
}
./gemv_bench
--warmups
100
--loops
1000
${
W2
}
${
PROF_CMD
}
-o
log/trace-w3
${
BIND_CMD
}
./gemv_bench
--warmups
100
--loops
1000
${
W3
}
${
PROF_CMD
}
-o
log/trace-w4
${
BIND_CMD
}
./gemv_bench
--warmups
100
--loops
1000
${
W4
}
else
${
BIND_CMD
}
./gemv_bench
--warmups
100
--loops
2000
--verify
1
-M
11264
-K
4096
${
BIND_CMD
}
./gemv_bench
--warmups
100
--loops
2000
--verify
1
-M
4096
-K
11264
${
BIND_CMD
}
./gemv_bench
--warmups
100
--loops
2000
--verify
1
-M
12288
-K
4096
${
BIND_CMD
}
./gemv_bench
--warmups
100
--loops
2000
--verify
1
-M
4096
-K
4096
${
BIND_CMD
}
./gemv_bench
--warmups
100
--loops
2000
${
W1
}
${
BIND_CMD
}
./gemv_bench
--warmups
100
--loops
2000
${
W2
}
${
BIND_CMD
}
./gemv_bench
--warmups
100
--loops
2000
${
W3
}
${
BIND_CMD
}
./gemv_bench
--warmups
100
--loops
2000
${
W4
}
fi
gemv/run-rocblas.sh
0 → 100644
View file @
cc91f72b
#!/bin/bash
#
# Runs rocblas-bench (gemm_ex, bf16 inputs/outputs, f32 compute) for four
# problem shapes W1..W4 with n = BATCH_SIZE.
#
# Usage:
#   ./run-rocblas.sh            plain benchmark runs
#   ./run-rocblas.sh --pmc      wrap each run in "hipprof --trace-off --pmc"
#   ./run-rocblas.sh --trace    wrap each run in "hipprof --hip-trace"
#
# The mode is detected by substring match on the full argument list, so the
# flag may appear anywhere on the command line.

# Make the bundled benchmark tools executable and reachable through PATH.
chmod u+x /opt/dtk/lib/rocblas/benchmark_tool/*
export PATH=/opt/dtk/lib/rocblas/benchmark_tool/:${PATH}

# Pin memory allocation and execution to NUMA node 0.
BIND_CMD="numactl -m 0 -N 0"
# Value passed as the GEMM "n" dimension and embedded in profiler output names.
BATCH_SIZE=1

export HIP_VISIBLE_DEVICES=1
# export ROCBLAS_TENSILE_GEMM_OVERRIDE_PATH=${PWD}/tensil_gemms.csv

# Workload argument strings. They are expanded unquoted further down on
# purpose: word splitting turns each string into separate arguments.
W1="-f gemm_ex --transposeA T --transposeB N -m 11264 -n ${BATCH_SIZE} -k 4096 --alpha 1 --a_type bf16_r --lda 4096 --b_type bf16_r --ldb 4096 --beta 0 --c_type bf16_r --ldc 11264 --d_type bf16_r --ldd 11264 --compute_type f32_r --algo 0 --solution_index 0 --flags 0"
W2="-f gemm_ex --transposeA T --transposeB N -m 4096 -n ${BATCH_SIZE} -k 11264 --alpha 1 --a_type bf16_r --lda 11264 --b_type bf16_r --ldb 11264 --beta 0 --c_type bf16_r --ldc 4096 --d_type bf16_r --ldd 4096 --compute_type f32_r --algo 0 --solution_index 0 --flags 0"
W3="-f gemm_ex --transposeA T --transposeB N -m 12288 -n ${BATCH_SIZE} -k 4096 --alpha 1 --a_type bf16_r --lda 4096 --b_type bf16_r --ldb 4096 --beta 0 --c_type bf16_r --ldc 12288 --d_type bf16_r --ldd 12288 --compute_type f32_r --algo 0 --solution_index 0 --flags 0"
W4="-f gemm_ex --transposeA T --transposeB N -m 4096 -n ${BATCH_SIZE} -k 4096 --alpha 1 --a_type bf16_r --lda 4096 --b_type bf16_r --ldb 4096 --beta 1 --c_type bf16_r --ldc 4096 --d_type bf16_r --ldd 4096 --compute_type f32_r --algo 0 --solution_index 0 --flags 0"

if [[ "$*" == *"--pmc"* ]]; then
    PROF_CMD="hipprof --trace-off --pmc"
    # Profiler output is written under log/; make sure the directory exists.
    mkdir -p log
    ${PROF_CMD} -o log/pmc-blas-w1-bs${BATCH_SIZE} ${BIND_CMD} rocblas-bench ${W1}
    ${PROF_CMD} -o log/pmc-blas-w2-bs${BATCH_SIZE} ${BIND_CMD} rocblas-bench ${W2}
    ${PROF_CMD} -o log/pmc-blas-w3-bs${BATCH_SIZE} ${BIND_CMD} rocblas-bench ${W3}
    ${PROF_CMD} -o log/pmc-blas-w4-bs${BATCH_SIZE} ${BIND_CMD} rocblas-bench ${W4}
elif [[ "$*" == *"--trace"* ]]; then
    PROF_CMD="hipprof --hip-trace"
    # Profiler output is written under log/; make sure the directory exists.
    mkdir -p log
    ${PROF_CMD} -o log/trace-blas-w1-bs${BATCH_SIZE} ${BIND_CMD} rocblas-bench ${W1}
    ${PROF_CMD} -o log/trace-blas-w2-bs${BATCH_SIZE} ${BIND_CMD} rocblas-bench ${W2}
    ${PROF_CMD} -o log/trace-blas-w3-bs${BATCH_SIZE} ${BIND_CMD} rocblas-bench ${W3}
    ${PROF_CMD} -o log/trace-blas-w4-bs${BATCH_SIZE} ${BIND_CMD} rocblas-bench ${W4}
else
    # No profiler requested: run the four workloads directly.
    ${BIND_CMD} rocblas-bench ${W1}
    ${BIND_CMD} rocblas-bench ${W2}
    ${BIND_CMD} rocblas-bench ${W3}
    ${BIND_CMD} rocblas-bench ${W4}
fi
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment