Commit 65bf476e authored by one's avatar one
Browse files

Add trace fix script and refactor evo2 launch scripts

- Add fix-pt-trace.sh for repairing non-UTF-8 traces.
- Remove deprecated run-rocblas.sh.
- Update trace handler (worker names) and tune GPU bindings in run-all.sh.
parent 06fe2294
#!/bin/bash
# Repair a gzip-compressed PyTorch Profiler trace whose stack strings
# contain invalid (non-UTF-8) bytes: decompress, drop the offending
# bytes with `iconv -c`, recompress, and replace the file in place.
#
# Usage: fix-pt-trace.sh <trace-file.gz>
#
# Without pipefail, `$?` would only reflect the final gzip stage, so a
# failed decompression (corrupt archive) would be silently ignored and
# the original file overwritten with an empty archive.
set -u
set -o pipefail

INPUT_FILE="${1:?usage: $0 <trace-file.gz>}"
if [ ! -f "$INPUT_FILE" ]; then
    echo "Error: '$INPUT_FILE' is not a regular file." >&2
    exit 1
fi

# Write to a temp file first so a mid-pipeline failure never clobbers
# the original trace.
TEMP_FILE="${INPUT_FILE}.temp.gz"
if gzip -dc "$INPUT_FILE" | iconv -f utf-8 -t utf-8 -c | gzip > "$TEMP_FILE"; then
    echo "Overwriting original file..."
    mv "$TEMP_FILE" "$INPUT_FILE"
    echo "Done."
else
    echo "Error, original file not overwritten." >&2
    rm -f "$TEMP_FILE"
    exit 1
fi
\ No newline at end of file
#!/bin/bash
# Benchmark four evo2 GEMM shapes with rocblas-bench on GPU 1, with the
# process pinned to NUMA node 0. hipprof tracing is available but left
# disabled (RUN_PROF stays unset, so it expands to nothing).
export HIP_VISIBLE_DEVICES=1
chmod u+x /opt/dtk/lib/rocblas/benchmark_tool/rocblas-bench

PROF_CMD='hipprof --hip-trace'
BENCH_CMD='numactl -m 0 -N 0 /opt/dtk/lib/rocblas/benchmark_tool/rocblas-bench'
BATCH_SIZE=1
LOG_PREFIX=log/trace-rocblas

# Run one bf16 gemm_ex benchmark.
#   $1 = kernel index (label only), $2 = m, $3 = k, $4 = beta
# All four shapes use transposeA=T / transposeB=N with lda=ldb=k and
# ldc=ldd=m; n is always BATCH_SIZE.
run_kernel() {
    local idx="$1" m="$2" k="$3" beta="$4"
    echo
    echo "===== Kernel ${idx} ====="
    # Uncomment to capture a hipprof trace for this kernel:
    #RUN_PROF="${PROF_CMD} -o ${LOG_PREFIX}-k${idx}-bs${BATCH_SIZE}"
    ${RUN_PROF} ${BENCH_CMD} -f gemm_ex --transposeA T --transposeB N \
        -m "$m" -n "${BATCH_SIZE}" -k "$k" --alpha 1 \
        --a_type bf16_r --lda "$k" --b_type bf16_r --ldb "$k" \
        --beta "$beta" --c_type bf16_r --ldc "$m" --d_type bf16_r --ldd "$m" \
        --compute_type f32_r --algo 0 --solution_index 0 --flags 0
}

run_kernel 1 11264 4096  0
run_kernel 2 4096  11264 0
run_kernel 3 12288 4096  0
run_kernel 4 4096  4096  1
......@@ -107,9 +107,14 @@ def generate_and_score(
def custom_trace_handler(
dir_name="./log/pt-trace/", sort_by="self_device_time_total", top_n=20
dir_name="./log/pt-trace/",
worker_name=None,
sort_by="self_device_time_total",
top_n=20,
):
tb_handler = torch.profiler.tensorboard_trace_handler(dir_name=dir_name)
tb_handler = torch.profiler.tensorboard_trace_handler(
dir_name=dir_name, worker_name=worker_name
)
field_fallbacks = {
"self_device_time_total": "self_cuda_time_total",
......@@ -175,16 +180,13 @@ def generate_and_score_prof(
print("\n[TRACE] Start profiling...")
with torch.profiler.profile(
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
],
schedule=torch.profiler.schedule(wait=0, warmup=trace_step, active=1, repeat=1),
on_trace_ready=custom_trace_handler(dir_name="./log/pt-trace/"),
record_shapes=True,
profile_memory=True,
with_stack=True,
profile_memory=False, # 按需开启
with_stack=False, # 按需开启
with_flops=True,
with_modules=True,
) as prof:
for i in range(0, len(prompts), batch_size):
batch_prompts = prompts[i : i + batch_size]
......
#!/bin/bash
# Build the evo2 benchmark with hipcc for gfx936, with GPU selection and
# NUMA binding configured up front.
set -e
# BW150
# BW1000
export HIP_VISIBLE_DEVICES=1
BIND_CMD="numactl -N 0 -m 0"
# NOTE(review): this second assignment overrides the node-0 binding just
# above, so the effective BIND_CMD pins to NUMA node 1 — confirm which
# node is intended and delete the dead assignment.
BIND_CMD="numactl -N 1 -m 1"
# BW150
# export HIP_VISIBLE_DEVICES=1
# BIND_CMD="numactl -N 0 -m 0"
make clean
CXX=hipcc make GPU_ARCH=gfx936
......
......@@ -3,7 +3,8 @@
# Make the bundled rocblas benchmark tools executable and visible on PATH.
chmod u+x /opt/dtk/lib/rocblas/benchmark_tool/*
export PATH=/opt/dtk/lib/rocblas/benchmark_tool/:${PATH}
BIND_CMD="numactl -m 0 -N 0"
# BW1000
# NOTE(review): this overrides the node-0 binding above, so the effective
# BIND_CMD pins to NUMA node 1 — confirm that is intended.
BIND_CMD="numactl -m 1 -N 1"
BATCH_SIZE=1
export HIP_VISIBLE_DEVICES=1
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment