# Minimum ninja version needed to load this manifest.
ninja_required_version = 1.3
# Host C++ compiler driver; referenced by the `compile` and `link` rules.
cxx = c++
# Compiler used by the `cuda_compile` rule. The variable keeps the name
# "nvcc", but in this manifest it points at the ROCm/DTK hipcc driver.
nvcc = /public/software/compiler/rocm/dtk-22.10/bin/hipcc

# Flags for the host-side C++ compile (`compile` rule), one flag per line.
# A trailing `$` continues the value onto the next line; ninja drops the
# newline and the next line's leading spaces, so the expanded value is a
# single space-separated flag string.
# Groups, in order: extension/pybind11 defines, torch include roots,
# ROCm/DTK include roots, Python headers, ABI / codegen / optimisation flags.
cflags = -DTORCH_EXTENSION_NAME=fused_mix_prec_layer_norm_cuda $
    -DTORCH_API_INCLUDE_EXTENSION_H $
    -DPYBIND11_COMPILER_TYPE=\"_gcc\" $
    -DPYBIND11_STDLIB=\"_libstdcpp\" $
    -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" $
    -isystem /public/home/zhuwenwen/miniconda3/envs/megatron-lm/lib/python3.7/site-packages/torch/include $
    -isystem /public/home/zhuwenwen/miniconda3/envs/megatron-lm/lib/python3.7/site-packages/torch/include/torch/csrc/api/include $
    -isystem /public/home/zhuwenwen/miniconda3/envs/megatron-lm/lib/python3.7/site-packages/torch/include/TH $
    -isystem /public/home/zhuwenwen/miniconda3/envs/megatron-lm/lib/python3.7/site-packages/torch/include/THC $
    -isystem /public/home/zhuwenwen/miniconda3/envs/megatron-lm/lib/python3.7/site-packages/torch/include/THH $
    -isystem /public/software/compiler/rocm/dtk-22.10/include $
    -isystem /public/software/compiler/rocm/dtk-22.10/miopen/include $
    -isystem /public/home/zhuwenwen/miniconda3/envs/megatron-lm/include/python3.7m $
    -D_GLIBCXX_USE_CXX11_ABI=0 $
    -fPIC $
    -std=c++14 $
    -O3
# Extra host flags appended after the output in the `compile` command; empty here.
post_cflags =
# Flags for the device-side compile (`cuda_compile` rule), one flag per line.
# A trailing `$` continues the value onto the next line; ninja drops the
# newline and the next line's leading spaces, so the expanded value is a
# single space-separated flag string.
# The list is -DWITH_HIP, then the same defines/includes/codegen flags as the
# host compile, then HIP-specific options. -fPIC and -O3 each appear twice;
# the repetition is kept so the expanded command line stays unchanged.
cuda_cflags = -DWITH_HIP $
    -DTORCH_EXTENSION_NAME=fused_mix_prec_layer_norm_cuda $
    -DTORCH_API_INCLUDE_EXTENSION_H $
    -DPYBIND11_COMPILER_TYPE=\"_gcc\" $
    -DPYBIND11_STDLIB=\"_libstdcpp\" $
    -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" $
    -isystem /public/home/zhuwenwen/miniconda3/envs/megatron-lm/lib/python3.7/site-packages/torch/include $
    -isystem /public/home/zhuwenwen/miniconda3/envs/megatron-lm/lib/python3.7/site-packages/torch/include/torch/csrc/api/include $
    -isystem /public/home/zhuwenwen/miniconda3/envs/megatron-lm/lib/python3.7/site-packages/torch/include/TH $
    -isystem /public/home/zhuwenwen/miniconda3/envs/megatron-lm/lib/python3.7/site-packages/torch/include/THC $
    -isystem /public/home/zhuwenwen/miniconda3/envs/megatron-lm/lib/python3.7/site-packages/torch/include/THH $
    -isystem /public/software/compiler/rocm/dtk-22.10/include $
    -isystem /public/software/compiler/rocm/dtk-22.10/miopen/include $
    -isystem /public/home/zhuwenwen/miniconda3/envs/megatron-lm/include/python3.7m $
    -D_GLIBCXX_USE_CXX11_ABI=0 $
    -fPIC $
    -std=c++14 $
    -O3 $
    -fPIC $
    -D__HIP_PLATFORM_HCC__=1 $
    -DCUDA_HAS_FP16=1 $
    -D__HIP_NO_HALF_OPERATORS__=1 $
    -D__HIP_NO_HALF_CONVERSIONS__=1 $
    -O3 $
    --amdgpu-target=gfx906 $
    -U__CUDA_NO_HALF_OPERATORS__ $
    -U__CUDA_NO_HALF_CONVERSIONS__ $
    -fno-gpu-rdc
# Extra device flags appended after the output in the `cuda_compile` command; empty here.
cuda_post_cflags =
# Linker flags for the `link` rule: build a shared object and link it against
# the torch libraries and the HIP runtime library. A trailing `$` continues
# the value; ninja joins the lines into one space-separated string.
ldflags = -shared $
    -L/public/home/zhuwenwen/miniconda3/envs/megatron-lm/lib/python3.7/site-packages/torch/lib $
    -lc10 $
    -lc10_hip $
    -ltorch_cpu $
    -ltorch_hip $
    -ltorch $
    -ltorch_python $
    -L/public/software/compiler/rocm/dtk-22.10/lib $
    -lamdhip64

# Host C++ compile step: $in -> $out using $cxx and $cflags.
# -MMD -MF $out.d makes the compiler emit a Makefile-style dependency file
# beside the object; `depfile` + `deps = gcc` tell ninja to read it so that
# changes to included headers cause the object to be rebuilt.
rule compile
  depfile = $out.d
  deps = gcc
  command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags

# Device-side compile step: runs $nvcc with $cuda_cflags to compile $in to $out.
# NOTE(review): unlike `compile`, this rule declares no depfile/deps, so ninja
# only knows about the explicit input file here; edits to headers it includes
# would not by themselves retrigger this step. Confirm that is intended.
rule cuda_compile
  command = $nvcc  $cuda_cflags -c $in -o $out $cuda_post_cflags

# Link step: $cxx combines every input object ($in) with $ldflags into $out.
rule link
  command = $cxx $in $ldflags -o $out

# Object files: the C++ source goes through the `compile` rule, and the .hip
# kernel source goes through the `cuda_compile` rule.
build layer_norm_cuda.o: compile /public/home/zhuwenwen/Megatron-LM-3.0.2/megatron/fused_kernels/layer_norm_cuda.cpp
build layer_norm_hip_kernel.cuda.o: cuda_compile /public/home/zhuwenwen/Megatron-LM-3.0.2/megatron/fused_kernels/layer_norm_hip_kernel.hip

# Final extension module: link both objects with the `link` rule.
build fused_mix_prec_layer_norm_cuda.so: link layer_norm_cuda.o layer_norm_hip_kernel.cuda.o

# Target built when ninja is invoked without arguments.
default fused_mix_prec_layer_norm_cuda.so