#
# This is the makefile for the eddy project. It can be used
# to compile the following variants of eddy:
#
#   - make cuda=1 (compiles a CUDA-capable version)
#   - make cpu=1  (compiles a multi-threaded CPU version)
#   - make        (equivalent to make cpu=1)
#
# The resulting eddy binary executables will be named like so:
#   - cpu:   eddy_cpu
#   - cuda:  eddy_cudaX.Y (where X.Y is the CUDA version that
#            the binary was compiled against)
#
# Both variants can be compiled in one invocation, e.g.:
#
#     make cpu=1 cuda=1
#
# Notes regarding the cuda variant:
#
#  - By default a fat binary with code for all the compute
#    capabilities allowed for by a given CUDA version will
#    be compiled. If one only needs 3.0--3.5 (for testing)
#    one can build with the fastbuild=1 option which saves
#    a lot of compilation time, e.g.:
#
#        make cuda=1 fastbuild=1
#
#  - Alternately, to compile for a specific compute
#    capability (or range thereof), you can specify the
#    GENCODEFLAGS variable, e.g.:
#
#        make cuda=1 GENCODEFLAGS="-gencode arch=compute_30,code=sm_30"
#
#  - Or you can specify a specific compute capability with
#    the COMPUTE_CAPABILITY variable, e.g.:
#
#        make cuda=1 COMPUTE_CAPABILITY="30"
#
#  - By default, all CUDA libraries (libcuda, libcudart, and those
#    specified in the CUDALIBS variable below) will be dynamically
#    linked in the resulting executable. The CUDA_STATIC variable
#    can be specified to statically link these libraries instead,
#    e.g.:
#
#        make cuda=1 CUDA_STATIC=1
#
#  - To compile against a specific version of the CUDA toolkit,
#    just make sure that the relevant nvcc command is on your $PATH
#    variable. For example:
#
#        PATH=/usr/local/cuda10.2/bin:$PATH make cuda=1
#
#    Alternately, you can set the $NVCC variable to refer to the
#    specific nvcc executable, e.g.:
#
#        NVCC=/usr/local/cuda11.3/bin/nvcc make cuda=1
#
# The CUDA_HOME, GENCODEFLAGS, and CUDA_STATIC variables are all
# handled in $FSLCONFDIR/buildSettings.mk.

include ${FSLCONFDIR}/default.mk

# Project name, plus the lists of scripts and executables
# to build. Both lists start out empty and are filled in
# below according to the cpu/cuda flags that were given.
PROJNAME = eddy
SCRIPTS  =
XFILES   =

# cpu=1: build eddy_cpu and include the eddy script
ifdef cpu
  XFILES  += eddy_cpu
  SCRIPTS += eddy
endif

# cuda=1: build the GPU executable, named after CUDA_VER
ifdef cuda
  XFILES += eddy_cuda${CUDA_VER}
endif

# Default to cpu variant if no flags were provided.
# A bare "make" is documented above as equivalent to
# "make cpu=1", so the eddy script is added here too.
ifeq (${XFILES},)
  XFILES   = eddy_cpu
  SCRIPTS += eddy
endif

# Directory searched for the FSL libraries at link time.
# It can be overridden from the command line or from the
# environment, e.g.:
#
#     make EDDY_FSL_LIBDIR=/path/to/fsl/lib
EDDY_FSL_LIBDIR ?= /workspace/FSL-install/lib

# -rdynamic allows meaningful backtraces to
# be emitted on segmentation faults and
# other crashes (see fsl/utils/stack_dump.h).
# Understood by both clang++ and g++
USRLDFLAGS   = -rdynamic
USRNVCCFLAGS = -DCOMPILE_GPU -Icuda

# Libraries passed when linking the GPU executable.
# NOTE(review): the cudabasisfield library name hard-codes
# "cuda11.8" while the executable name uses ${CUDA_VER} -
# confirm that the two are meant to be independent.
CUDALIBS     = -L${EDDY_FSL_LIBDIR} -lfsl-cudabasisfield_cuda11.8 -lhipblas \
               -lfsl-topup -lfsl-warpfns -lfsl-meshclass -lfsl-basisfield \
               -lfsl-newimage -lfsl-miscmaths -lfsl-cprob -lfsl-NewNifti \
               -lfsl-znz -lfsl-utils \
               -llapack -lblas -lz -lm -fvisibility=default -fPIC

# Libraries passed when linking the CPU executable
LIBS         = -L${EDDY_FSL_LIBDIR} -lfsl-topup -lfsl-warpfns -lfsl-meshclass \
               -lfsl-basisfield -lfsl-newimage -lfsl-miscmaths -lfsl-cprob \
               -lfsl-NewNifti -lfsl-znz -lfsl-utils

# fastbuild=1 skips creation of the CUDA fat binary and
# targets a single architecture instead, replacing the
# GENCODEFLAGS defined in FSLDIR/config/buildSettings.mk.
ifeq "$(fastbuild)" "1"
  GENCODEFLAGS := -gencode arch=compute_30,code=sm_30
endif

# An explicitly requested compute capability is applied
# last, so it replaces both the fastbuild setting above
# and any GENCODEFLAGS value assigned earlier.
ifdef COMPUTE_CAPABILITY
  GENCODEFLAGS := -gencode arch=compute_${COMPUTE_CAPABILITY},code=sm_${COMPUTE_CAPABILITY}
endif

# Object files shared by every variant
OBJS     := eddy.o \
            b0Predictor.o \
            BiasFieldEstimatorImpl.o \
            CPUStackResampler.o \
            DiffusionGP.o \
            fmriPredictor.o \
            ECModels.o \
            LongECModels.o \
            ECScanClasses.o \
            EddyCommandLineOptions.o \
            EddyHelperClasses.o \
            EddyUtils.o \
            HyParEstimator.o \
            KMatrix.o \
            MoveBySuscCF.o \
            PostEddyAlignShellsFunctions.o

# Object files built only for the CPU variant
CPUOBJS  := LSResampler.o \
            PostEddyCF.o

# Object files built only for the cuda variant. Some names
# also appear in OBJS or CPUOBJS; the entries in this list
# are compiled from the sources under cuda/ (see the
# cudabuild/cuda${CUDA_VER}/%.o rule).
CUDAOBJS := CudaVolume.o \
            DerivativeCalculator.o \
            DiffusionGP.o \
            fmriPredictor.o \
            EddyCudaHelperFunctions.o \
            EddyGpuUtils.o \
            EddyInternalGpuUtils.o \
            EddyKernels.o \
            EddyMatrixKernels.o \
            GpuPredictorChunk.o \
            LSResampler.o \
            PostEddyCF.o \
            StackResampler.o

# Each variant is built in its own directory. BUILDDIR and
# CUDABUILDDIR are read by the depend.mk rule in
# $FSLDIR/config/rules.mk, which generates the dependencies
# of each object file automatically.
# NOTE(review): the original comment states that dependencies
# for CUDA object files are listed explicitly at the end of
# this Makefile - confirm, as none are visible here.
CUDABUILDDIR = cudabuild/
BUILDDIR     = cpubuild/

# Map the bare object names onto their build locations:
#   - shared and CPU-only objects  -> cpubuild/
#   - shared objects (cuda build)  -> cudabuild/
#   - cuda-only objects            -> cudabuild/cuda${CUDA_VER}/
CPUOBJS     := $(addprefix cpubuild/,${OBJS} ${CPUOBJS})
CUDAOBJS    := $(addprefix cudabuild/,${OBJS}) \
               $(addprefix cudabuild/cuda${CUDA_VER}/,${CUDAOBJS})

# all and clean name actions rather than files. Declaring
# them phony keeps a stray file called "all" or "clean"
# from making the targets look up to date.
.PHONY: all clean

# Build every requested executable
all: ${XFILES}

# Remove the generated dependency file, the executables
# and both build directories
clean:
	@rm -f depend.mk eddy_cpu eddy_cuda*
	@rm -rf cudabuild cpubuild

# Compile flags referencing the headers under /opt/dtk,
# the FSL include directory and the current directory.
# NOTE(review): HIPCXXFLAGS is not referenced by any rule
# visible in this file - confirm whether it is still needed.
HIPCXXFLAGS = -DARMA_ALLOW_FAKE_GCC -std=c++17 -fPIC -g -O0 \
              -I /opt/dtk/include \
              -I ${FSLCONFDIR}/../include \
              -I .

# Flags used when linking the GPU executable against the
# libraries under /opt/dtk/lib
HIPLDFLAGS  = -L /opt/dtk/lib \
              -l hipblas -l hipblaslt -l galaxyhip -l amdhip64 \
              -DARMA_ALLOW_FAKE_GCC -std=c++17 -fPIC -g -O0

# -DARMA_ALLOW_FAKE_GCC -std=c++17 -fPIC -g -O0
#################################
# CPU executable and object files
#################################

# Link the CPU executable from the shared and CPU-only objects
eddy_cpu: $(CPUOBJS)
	@mkdir -p cpubuild
	hipcc $(CXXFLAGS) -o $@ $^ $(LDFLAGS) -fvisibility=default -fPIC $(LIBS)

# Compile one top-level source file into the CPU build
# directory, creating that directory on first use
cpubuild/%.o: %.cpp
	@mkdir -p $(@D)
	hipcc ${CXXFLAGS} -c -o $@ $< -g -O0 -fvisibility=default -fPIC

#################################
# GPU executable and object files
#################################

# Link the GPU executable with hipcc. This replaces an
# earlier link rule that invoked ${NVCC} with ${NVCCLDFLAGS}.
eddy_cuda$(CUDA_VER): $(CUDAOBJS)
	hipcc $(NVCCFLAGS) -o $@ $^ $(HIPLDFLAGS) $(CUDALIBS)


# Compile one source file from cuda/ into the object
# directory named after CUDA_VER, creating it on first use
cudabuild/cuda$(CUDA_VER)/%.o: cuda/%.cpp
	@mkdir -p $(@D)
	hipcc ${NVCCFLAGS} -c -o $@ $< -g -O0 -fvisibility=default -fPIC --gpu-max-threads-per-block=1024

# Compile one top-level source file for the GPU variant.
# The output directory is derived from the target, so it
# always exists before the compiler writes to it.
cudabuild/%.o: %.cpp
	@mkdir -p $(@D)
	hipcc $(CUDACXXFLAGS) -c -o $@ $< -g -O0 -fvisibility=default -fPIC