Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tsoc
hg-misc-tools
Commits
845b2d24
Commit
845b2d24
authored
Jan 29, 2026
by
one
Browse files
Update GEMV benchmarks and evo2 scripts
parent
8ac49790
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
44 additions
and
18 deletions
+44
-18
evo2/test_evo2_generation_batched.py
evo2/test_evo2_generation_batched.py
+6
-4
gemv/gemv_bf16.h
gemv/gemv_bf16.h
+32
-13
gemv/main.cpp
gemv/main.cpp
+5
-1
gemv/run-all.sh
gemv/run-all.sh
+1
-0
No files found.
evo2/test_evo2_generation_batched.py
View file @
845b2d24
...
...
@@ -123,10 +123,12 @@ def main():
# Read and process sequences
sequences
=
read_prompts
(
'prompts.csv'
)
# DEBUG: replace all prompts with the longest prompt to enable uniform lengths
# Debugging: replace all prompts with the longest prompt
if
args
.
batch_size
>
1
:
longest_prompt
=
max
(
sequences
,
key
=
len
)
sequences
=
[
longest_prompt
]
*
len
(
sequences
)
print
(
f
"[debug] Using longest prompt len=
{
len
(
longest_prompt
)
}
for all sequences"
)
scores
=
generate_and_score
(
sequences
=
sequences
,
model
=
model
,
...
...
gemv/gemv_bf16.h
View file @
845b2d24
...
...
@@ -4,31 +4,45 @@
// Warp Size 根据架构自动选择
#if defined(__HIP_PLATFORM_AMD__)
#define WARP_SIZE 64 //
DCU
#define WARP_SIZE 64 //
Hygon/AMD
#else
#define WARP_SIZE 32 // N
VIDIA
#define WARP_SIZE 32 // N
vidia
#endif
#define VEC_WIDTH 8
#define OFFSET(i, j, lda) ((i) + (j) * (lda))
#define OFFSET_T(i, j, lda) ((i) * (lda) + (j))
/**
* 平台相关的 Shared Memory / LDS
*/
#if defined(__HIP_PLATFORM_AMD__)
// Hygon/AMD: 64KB LDS per CU
constexpr
int
MAX_SHMEM_BYTES_PER_BLOCK
=
65536
;
#else
// Nvidia: 48KB
constexpr
int
MAX_SHMEM_BYTES_PER_BLOCK
=
49152
;
#endif
/**
* 根据需求的并发 block 数量计算 shmem 用量(即 TILE_K 指定的 BF16 元素个数)
*
* AlignElements 为对齐粒度,即元素个数,默认 128-bit 对齐。
* - 8: 对齐到 128-bit (可能有利于 load128b)
* - 16: 对齐到 256-bit (某些 MFMA 指令需求)
*
* concurrent_blocks: 期望的并发 block 数(用于计算可用 shmem)
* - Hygon/AMD: 表示每个 CU 上的并发 block 数
* - Nvidia: 设置为 1 即可(每个 block 独立使用 shmem)
*/
template
<
int
AlignElements
=
8
>
constexpr
int
calculate_tile_k
(
int
concurrent_blocks
)
{
constexpr
int
calculate_tile_k
(
int
concurrent_blocks
=
1
)
{
// 安全检查
if
(
concurrent_blocks
<
1
)
concurrent_blocks
=
1
;
// 直接切分 LDS
constexpr
int
MAX_LDS_BYTES_PER_CU
=
65536
;
int
bytes_per_block
=
MAX_LDS_BYTES_PER_CU
/
concurrent_blocks
;
// 计算每个 block 可用的 shmem
int
bytes_per_block
=
MAX_SHMEM_BYTES_PER_BLOCK
/
concurrent_blocks
;
// 转为元素个数
int
max_elements
=
bytes_per_block
/
sizeof
(
hip_bfloat16
);
...
...
@@ -50,14 +64,19 @@ typedef float __attribute__((ext_vector_type(4))) float4_native;
/// 128-bit non-temporal load 或者 cached load
template
<
bool
USE_NTL
=
false
>
__device__
__forceinline__
bf16_x8
load_128b
(
const
hip_bfloat16
*
src
)
{
if
constexpr
(
USE_NTL
)
{
#if defined(__NVCC__) || defined(__CUDACC__)
// NVIDIA 平台:直接使用普通加载
// NVCC 的优化器通常会自动选择合适的加载指令(如 LDG)
// 如果需要显式控制,可以使用 __ldg() 或 PTX 内联汇编
return
*
reinterpret_cast
<
const
bf16_x8
*>
(
src
);
// Nvidia 平台:PTX 内联汇编实现 cache streaming (ld.global.cs)
uint4
tmp
;
// 128-bit = 4 x 32-bit
asm
volatile
(
"ld.global.cs.v4.u32 {%0, %1, %2, %3}, [%4];"
:
"=r"
(
tmp
.
x
),
"=r"
(
tmp
.
y
),
"=r"
(
tmp
.
z
),
"=r"
(
tmp
.
w
)
:
"l"
(
src
)
:
"memory"
);
return
*
reinterpret_cast
<
bf16_x8
*>
(
&
tmp
);
#else
if
constexpr
(
USE_NTL
)
{
// DCU:使用 Clang 内置 non-temporal load 函数
// Hygon/AMD 平台:使用 Clang 内置 non-temporal load 函数
// 把地址转换为 float4_native 指针
const
float4_native
*
ptr
=
reinterpret_cast
<
const
float4_native
*>
(
src
);
...
...
@@ -66,10 +85,10 @@ __device__ __forceinline__ bf16_x8 load_128b(const hip_bfloat16 *src) {
// 把加载到的 128 位数据重新解释为 bf16_x8
return
*
reinterpret_cast
<
bf16_x8
*>
(
&
tmp
);
#endif
}
else
{
return
*
reinterpret_cast
<
const
bf16_x8
*>
(
src
);
}
#endif
}
/** y = alpha * A^T * x + 0 * y
...
...
gemv/main.cpp
View file @
845b2d24
...
...
@@ -70,8 +70,12 @@ int main(int argc, char **argv) {
constexpr
bool
NTL
=
true
;
constexpr
int
UNROLL
=
4
;
constexpr
int
TILE_K
=
calculate_tile_k
<
8
>
(
4
);
constexpr
int
ROWS_PER_WARP
=
2
;
#if defined(__HIP_PLATFORM_AMD__)
constexpr
int
TILE_K
=
calculate_tile_k
<
8
>
(
4
);
#else
constexpr
int
TILE_K
=
calculate_tile_k
<
8
>
(
1
);
#endif
kernels
.
push_back
(
{
"naive"
,
[
&
](
int
M
,
int
K
,
float
alpha
,
const
hip_bfloat16
*
A
,
int
lda
,
...
...
gemv/run-all.sh
View file @
845b2d24
...
...
@@ -5,6 +5,7 @@ export HIP_VISIBLE_DEVICES=1
BIND_CMD
=
"numactl -N 0 -m 0"
CXX
=
hipcc make
# CXX=nvcc make GPU_ARCH=sm_80
if
[[
"
$*
"
==
*
"--pmc"
*
]]
;
then
PROF_CMD
=
"hipprof --trace-off --pmc --pmc-type 3"
${
PROF_CMD
}
-o
log/pmc-k1
${
BIND_CMD
}
./gemv_bench
--verify
1
-M
11264
-K
4096
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment