run.sh 497 Bytes
Newer Older
silencealiang's avatar
add  
silencealiang committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#!/usr/bin/env bash
#
# Runs ./Llama_pretraining.sh under mpirun after exporting a set of
# TORCHINDUCTOR_* environment variables for the launched processes.
#
# Usage: invoke from the directory that contains Llama_pretraining.sh. Both the
# launcher path and TORCHINDUCTOR_CACHE_DIR below are relative paths, so they
# resolve against the current working directory.
#
# Optional environment overrides (the defaults reproduce the original command):
#   LLAMA_NPROC  number of processes passed to `mpirun -np`      (default: 8)
#   LLAMA_HOST   argument forwarded to ./Llama_pretraining.sh    (default: localhost)

# TorchInductor tuning switches. NOTE(review): exact semantics live in
# torch._inductor.config of the PyTorch build in use -- confirm there.
export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
export TORCHINDUCTOR_BENCHMARK_FUSION=1
export TORCHINDUCTOR_BENCHMARK_MULTI_TEMPLATES=1

# export TORCHINDUCTOR_BENCHMARK_KERNEL=1
export TORCHINDUCTOR_MAX_AUTOTUNE=1

# export FLASH_ATTENTION_PRINT_PARAM=1
# Relative on purpose (kept from the original): the cache lands in ./cache of
# whatever directory the script is started from.
export TORCHINDUCTOR_CACHE_DIR=./cache

# export USE_AOTRITON_FA=1
# export USE_BSHD=1 # use the flash-attention BSHD layout
# For unique kernel names:
# export TORCHINDUCTOR_UNIQUE_KERNEL_NAMES=1

#######################################
# Start the pretraining launcher under mpirun.
# Globals:   LLAMA_NPROC (read, optional), LLAMA_HOST (read, optional)
# Arguments: none
# Outputs:   whatever mpirun and the launcher print; a diagnostic on stderr
#            when LLAMA_NPROC is not a positive integer
# Returns:   mpirun's exit status, or 2 when LLAMA_NPROC is invalid
#######################################
launch_training() {
  local num_procs="${LLAMA_NPROC:-8}"
  local host="${LLAMA_HOST:-localhost}"

  # Reject anything that is not a positive decimal integer before handing it
  # to mpirun, so a typo fails here with a clear message.
  case "${num_procs}" in
    '' | *[!0-9]* | 0)
      printf 'run.sh: LLAMA_NPROC must be a positive integer, got: %s\n' \
        "${num_procs}" >&2
      return 2
      ;;
  esac

  mpirun --allow-run-as-root -np "${num_procs}" ./Llama_pretraining.sh "${host}"
}

# Last command of the script: its status becomes the script's exit status.
launch_training