Commit 57774b4a authored by one's avatar one
Browse files

Add GEMV lib and update Makefile and README

parent cc91f72b
# Target GPU architecture; substituted into the architecture flags below.
# Can be overridden from the command line or the environment.
GPU_ARCH ?= gfx936
# Compiler driver. GNU Make predefines CXX (built-in default: g++), so a plain
# `CXX ?= hipcc` would never take effect. Replace the value only when it still
# comes from Make's built-in default; a CXX given on the command line or in
# the environment (e.g. CXX=nvcc) is left untouched.
ifeq ($(origin CXX),default)
CXX := hipcc
endif
# Common compile flags passed on every compile line.
CXX_FLAGS ?= -std=c++17 -O3
# Architecture flags for the hipcc path.
HIPCC_FLAGS = --offload-arch=$(GPU_ARCH)
# Architecture flags for the non-hipcc path (labelled NVCC in this file).
NVCC_FLAGS = -arch=$(GPU_ARCH) -x cu
# Benchmark executable and its single source file.
TARGET := gemv_bench
SRC := main.cpp
# Shared library and its single source file.
LIB := libgemv_bf16.so
LIB_SRC := gemv_export.cpp
# Headers listed as prerequisites of both the executable and the library.
DEP := gemv_bf16.h gemv_utils.h hip_compat.h
# Phony targets; the same special target is declared again, with a longer
# list, at the end of this section.
.PHONY: all clean
# Non-empty exactly when the value of CXX contains the substring "hipcc".
IS_HIPCC := $(findstring hipcc,$(CXX))
# Adjust the flags according to the compiler
ifneq (,$(IS_HIPCC))
# HIPCC
ARCH_FLAGS := --offload-arch=$(GPU_ARCH)
else
# NVCC (taken for every CXX value that does not contain "hipcc")
ARCH_FLAGS := -arch=$(GPU_ARCH) -x cu
endif
# NOTE(review): this repeats "all clean" from the earlier .PHONY line and adds
# the library aliases; the two declarations look like the old and new versions
# of one line -- confirm only one is meant to remain.
.PHONY: all clean lib lib.so
# Default goal: build the benchmark executable.
# NOTE(review): the rules below appear to interleave two versions of this
# section (one using HIPCC_FLAGS/NVCC_FLAGS, one using ARCH_FLAGS), and the
# recipe lines carry no leading tab here -- confirm against the real Makefile
# before relying on this text.
all: $(TARGET)
# Determine the compiler type from the CXX variable
ifneq (,$(findstring hipcc,$(CXX)))
# HIPCC build
# Only the first prerequisite ($<, i.e. the source file) is passed to the
# compiler; the headers in $(DEP) only trigger rebuilds.
$(TARGET): $(SRC) $(DEP)
$(CXX) $(CXX_FLAGS) $(HIPCC_FLAGS) $< -o $@
else
# NVCC build
# NOTE(review): as written, the lib / lib.so aliases and the $(LIB) rule exist
# only in this non-hipcc branch -- presumably a diff artifact; verify.
lib: $(LIB)
lib.so: $(LIB)
# Shared library: position-independent code linked with -shared.
$(LIB): $(LIB_SRC) $(DEP)
$(CXX) $(CXX_FLAGS) $(ARCH_FLAGS) -shared -fPIC -o $@ $<
$(TARGET): $(SRC) $(DEP)
$(CXX) $(CXX_FLAGS) $(NVCC_FLAGS) $< -o $@
endif
# NOTE(review): this compile line follows the endif with no rule header of its
# own directly above it -- presumably the new, compiler-independent recipe for
# $(TARGET); confirm.
$(CXX) $(CXX_FLAGS) $(ARCH_FLAGS) -o $@ $<
# Remove build products: the benchmark executable, the shared library, any
# object files in the current directory, and files named "lib" / "lib.so".
# NOTE(review): the first rm is a subset of the second; they look like the old
# and new versions of the same line -- confirm only one is intended.
clean:
rm -f $(TARGET)
rm -f $(TARGET) $(LIB) *.o lib lib.so
......@@ -7,7 +7,7 @@ GEMV Benchmarks
M: 输出维度,例如 11264
K: 归约维度,例如 4096
N: 始终为 1
beta: 始终为 0
beta: 为 0 或 1
## Build
......@@ -23,8 +23,8 @@ CXX=nvcc make GPU_ARCH=sm_80
```bash
# BW系列:
HIP_VISIBLE_DEVICES=1 numactl -N 0 -m 0 ./gemv_bench -M 11264 -K 4096
HIP_VISIBLE_DEVICES=1 numactl -N 0 -m 0 ./gemv_bench --verify 1 -M 11264 -K 4096
# A800:
./gemv_bench -M 11264 -K 4096
./gemv_bench --verify 1 -M 11264 -K 4096
```
\ No newline at end of file
#include "gemv_bf16.h"
extern "C" {
/** y = alpha * A^T * x + beta * y
 *
 * Host-side launcher for the gemv_bf16_TN_vec_warp_unroll kernel, instantiated
 * with USE_NTL = true and UNROLL = 4. The kernel is launched with 128 threads
 * per block and ceil(M / (128 / WARP_SIZE)) blocks, with no explicit stream
 * argument. This function neither synchronizes nor checks the launch status;
 * callers that need either must do so themselves.
 *
 * If M <= 0 the function returns without launching anything, because a grid
 * dimension of zero is not a valid launch configuration.
 *
 * NOTE(review): the grid is sized from M, i.e. presumably one warp per index
 * in [0, M) -- confirm the row/column wording below against the kernel.
 *
 * @param d_A: input matrix A
 * @param M: number of rows of A
 * @param K: number of columns of A
 * @param lda: leading dimension of A
 * @param d_x: input vector x
 * @param d_y: output vector y
 * @param alpha: scaling factor for A^T * x
 * @param beta: scaling factor for y
 */
void gemv_bf16_TN_vec_warp_unroll_ntl(hip_bfloat16 *d_A, int M, int K, int lda,
                                      hip_bfloat16 *d_x, hip_bfloat16 *d_y,
                                      float alpha, float beta) {
  // Nothing to compute, and a zero-sized grid would be rejected at launch.
  if (M <= 0) {
    return;
  }

  constexpr bool USE_NTL = true;
  constexpr int UNROLL = 4;

  const int block_size = 128;
  const int warps_per_block = block_size / WARP_SIZE;
  // Ceil-division written so the intermediate cannot overflow int for large M.
  const int grid = M / warps_per_block + (M % warps_per_block != 0 ? 1 : 0);

  dim3 grid_dim(grid);
  dim3 block_dim(block_size);
  gemv_bf16_TN_vec_warp_unroll<USE_NTL, UNROLL>
      <<<grid_dim, block_dim>>>(M, K, alpha, d_A, lda, d_x, beta, d_y);
}
} // extern "C"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment