#!/usr/bin/env bash
#
# Launch Llama pretraining under mpirun with TorchInductor tuning enabled.
#
# Exports a set of TORCHINDUCTOR_* environment variables and then starts
# 8 MPI processes of ./Llama_pretraining.sh, passing "localhost" as its
# only argument.
#
# Each statement must stay on its own line: a '#' begins a comment that
# runs to the end of the physical line, so joining these lines silently
# disables every export after the first '#' as well as the mpirun launch.

# Inductor autotuning / benchmarking switches.
# NOTE(review): exact semantics are defined by the installed PyTorch build;
# confirm against its torch/_inductor/config.py.
export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
export TORCHINDUCTOR_BENCHMARK_FUSION=1
export TORCHINDUCTOR_BENCHMARK_MULTI_TEMPLATES=1
# export TORCHINDUCTOR_BENCHMARK_KERNEL=1
export TORCHINDUCTOR_MAX_AUTOTUNE=1

# Optional debugging aid (disabled).
# export FLASH_ATTENTION_PRINT_PARAM=1

# Cache directory, relative to the directory the script is launched from.
export TORCHINDUCTOR_CACHE_DIR=./cache

# Optional attention backend / layout toggles (disabled).
# export USE_AOTRITON_FA=1
# export USE_BSHD=1  # use FA BSHD layout

# For unique kernel names (disabled).
# export TORCHINDUCTOR_UNIQUE_KERNEL_NAMES=1

# Start 8 ranks; --allow-run-as-root lets mpirun run when the caller is root
# (presumably inside a container -- confirm). The meaning of the "localhost"
# argument is defined by Llama_pretraining.sh.
mpirun --allow-run-as-root -np 8 ./Llama_pretraining.sh localhost