Makefile 15.9 KB
Newer Older
1
2
3
4
# Copyright (c) Microsoft Corporation - All rights reserved
# Licensed under the MIT License


5
6
7
8
# Install prefix and toolchain locations. Every value can be overridden from
# the environment or the make command line.
SB_MICRO_PATH ?= /usr/local
MPI_HOME ?= /usr/local/mpi
HIP_HOME ?= /opt/rocm/hip
RCCL_HOME ?= /opt/rocm/rccl
HPCX_HOME ?= /opt/hpcx
ROCM_PATH ?= /opt/rocm

# CUDA_VER is tested by several parse-time conditionals in this file. Assign it
# once with ':=' so nvcc is not re-run on every expansion; the $(origin) guard
# keeps the "only if not already set" behaviour of '?='.
ifeq ($(origin CUDA_VER),undefined)
CUDA_VER := $(shell nvcc --version | grep 'release' | awk '{print $$6}' | cut -c2- | cut -d '.' -f1-2)
endif
# The ROCm lookups stay lazily expanded so dpkg/hipconfig only run when a
# recipe actually references them.
ROCBLAS_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
HIPBLASLT_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
ROCM_VER ?= $(shell hipconfig -R | grep -oP '\d+\.\d+\.\d+' || echo "0.0.0")

# Job count for parallel sub-builds: the value printed by `nproc --ignore=2`.
ifeq ($(origin NUM_MAKE_JOBS),undefined)
NUM_MAKE_JOBS := $(shell nproc --ignore=2)
endif

one's avatar
one committed
19
# Every target in this file is a command, not a file, so all of them are phony.
# cpu, cpu_perftest and sb_micro_path are listed too, so a stray file with one
# of those names cannot silently disable the target.
.PHONY: all cuda_with_msccl cuda rocm dtk cpu common sb_micro_path \
	cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl \
	rocm_perftest cpu_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest \
	gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd \
	rocm_hipblaslt rocm_babelstream_hip megatron_lm megatron_deepspeed apex_rocm \
	nvbandwidth rocm_megatron_lm rocm_hpcg
20

21
# Aggregate build targets: each one only lists the component targets it pulls in.
all: cuda rocm

# msccl: api change in cudaStreamUpdateCaptureDependencies
cuda_with_msccl: cuda cuda_msccl

cuda: common \
	cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest \
	gpcnet cuda_gpuburn megatron_lm megatron_deepspeed nvbandwidth

rocm: common \
	rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest \
	rocm_hipblaslt megatron_deepspeed apex_rocm rocm_megatron_lm

dtk: common \
	rocm_perftest rocm_rccl_tests rocm_babelstream_hip \
	megatron_deepspeed apex_rocm rocm_megatron_lm rocm_hpcg

cpu: common cpu_perftest

common: fio cpu_stream
30
31
32

# Extra prerequisites that are only wired in when `uname -m` is not aarch64.
ifneq (aarch64,$(shell uname -m))
common: cpu_hpl
directx_amd: directx_amf_encoding_latency
endif
36

37
38
39
40
# Ensure the install directories exist; `mkdir -p` creates missing parents and
# is a no-op for directories that are already there.
sb_micro_path:
	mkdir -p $(SB_MICRO_PATH)/bin $(SB_MICRO_PATH)/lib
41

42
# Build cutlass.
# CUDA >= 12.9 re-clones cutlass v4.1.0 and CUDA 12.8 re-clones v3.9.2; older
# CUDA versions build whatever is already checked out in ./cutlass.
# ARCHS selects the -DCUTLASS_NVCC_ARCHS list for the detected CUDA version.
cuda_cutlass:
ifeq ($(shell echo $(CUDA_VER)">=12.9" | bc -l), 1)
	$(eval ARCHS := "100;103")
	if [ -d cutlass ]; then rm -rf cutlass; fi
	git clone --branch v4.1.0 --depth 1 https://github.com/NVIDIA/cutlass.git
else ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
	$(eval ARCHS := "90;100")
	if [ -d cutlass ]; then rm -rf cutlass; fi
	git clone --branch v3.9.2 --depth 1 https://github.com/NVIDIA/cutlass.git
else ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1)
	$(eval ARCHS := "70;75;80;86;89;90")
else
	$(eval ARCHS := "70;75;80;86")
endif
# The CMakeLists.txt check is done by the shell at recipe time, not by
# $(wildcard) at parse time, so a tree cloned by the lines above is also built.
# The kernel list is one shell string; backslash-newline inside the double
# quotes joins the pieces without inserting spaces.
	if [ -f cutlass/CMakeLists.txt ]; then \
		cmake -DCMAKE_INSTALL_BINDIR=$(SB_MICRO_PATH)/bin \
			-DCMAKE_INSTALL_LIBDIR=$(SB_MICRO_PATH)/lib \
			-DCMAKE_BUILD_TYPE=Release \
			-DCUTLASS_NVCC_ARCHS=$(ARCHS) \
			-DCUTLASS_ENABLE_EXAMPLES=OFF \
			-DCUTLASS_ENABLE_TESTS=OFF \
			-S ./cutlass \
			-B ./cutlass/build \
			-DCUTLASS_LIBRARY_KERNELS="cutlass_simt_dgemm_128x128_8x2_*,\
cutlass_simt_sgemm_128x128_8x2_*,\
cutlass_simt_hgemm_256x128_8x2_*,\
cutlass_tensorop_h884gemm_256x128_32x2_*,\
cutlass_tensorop_d884gemm_128x128_16x3_*,\
cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_*,\
cutlass_tensorop_bf16_s16816gemm_bf16_256x128_32x3_*,\
cutlass_tensorop_h16816gemm_256x128_32x3_*,\
cutlass_tensorop_s8_i16832gemm_s8_256x128_64x3_*,\
cutlass_tensorop_s4_i16864gemm_s4_256x128_128x3_*" && \
		cmake --build ./cutlass/build -j $(NUM_MAKE_JOBS) --target install && \
		rm -rf ./cutlass/build; \
	fi
81

82
83
84
85
# Build cuda-samples/Samples/bandwidthTest.
# cuda-samples is released together with CUDA, they have the exact same version. Like v10.0, v11.1 and so on.
# The version we use is the released tag of cuda-samples which is consistent with the cuda version in the environment or docker.
# The Makefile of bandwidthTest does not have 'install' target, so need to copy bin to $(SB_MICRO_PATH)/bin/ and create $(SB_MICRO_PATH)/bin/ if not existing.
# The bandwidthTest sample was out-of-date and has been removed as of the CUDA Samples 12.9 release (see the change log). For up-to-date bandwidth measurements, refer instead to the nvbandwidth utility.
# For CUDA >= 12.9 nothing is cloned or built; the repository is only fetched
# on the branches that actually build the sample.
cuda_bandwidthTest: sb_micro_path
ifeq ($(shell echo $(CUDA_VER)">=12.9" | bc -l), 1)
	@echo "Skip cuda-samples build for CUDA>=12.9"
else
	if [ -d cuda-samples ]; then rm -rf cuda-samples; fi
	git clone --depth 1 -b v$(CUDA_VER) https://github.com/NVIDIA/cuda-samples.git
ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
	$(eval TEST_PATH := "./cuda-samples/Samples/1_Utilities/bandwidthTest")
	cd ./$(TEST_PATH) && mkdir build && cd build && cmake .. && make
	cp -v ./$(TEST_PATH)/build/bandwidthTest $(SB_MICRO_PATH)/bin/
else ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1)
	$(eval TEST_PATH := "./cuda-samples/Samples/1_Utilities/bandwidthTest")
	$(eval ARCHS := "70 75 80 86 90")
	cd ./$(TEST_PATH) && make clean && make SMS=$(ARCHS)
	cp -v ./$(TEST_PATH)/bandwidthTest $(SB_MICRO_PATH)/bin/
else
	$(eval TEST_PATH := "./cuda-samples/Samples/bandwidthTest")
	$(eval ARCHS := "70 75 80 86")
	cd ./$(TEST_PATH) && make clean && make SMS=$(ARCHS)
	cp -v ./$(TEST_PATH)/bandwidthTest $(SB_MICRO_PATH)/bin/
endif
endif
107

108
109
# Build nccl-tests.
# The version we use is the tag v2.13.13
# Skipped when ./nccl-tests/Makefile is absent at parse time. Parallelism is
# capped at NUM_MAKE_JOBS instead of an unbounded `-j`.
cuda_nccl_tests: sb_micro_path
ifneq (,$(wildcard nccl-tests/Makefile))
	cd ./nccl-tests && make MPI=1 MPI_HOME=$(MPI_HOME) -j $(NUM_MAKE_JOBS)
	cp -v -r ./nccl-tests/build/* $(SB_MICRO_PATH)/bin/
endif
115
116

# Build perftest.
# The commit we use is ea1c778782df3ec09b5f8101017fc0140b51a63d.
# CUDA flavour: configure is pointed at /usr/local/cuda/include/cuda.h.
# Skipped when ./perftest/autogen.sh is absent at parse time; the build is
# capped at NUM_MAKE_JOBS jobs instead of an unbounded `-j`.
cuda_perftest:
ifneq (,$(wildcard perftest/autogen.sh))
	cd perftest && ./autogen.sh && ./configure CUDA_H_PATH=/usr/local/cuda/include/cuda.h --prefix=$(SB_MICRO_PATH) && make -j $(NUM_MAKE_JOBS) && make install
endif
122
123
# ROCm flavour of perftest: configured with --enable-rocm against $(ROCM_PATH).
# Skipped when ./perftest/autogen.sh is absent at parse time; the build is
# capped at NUM_MAKE_JOBS jobs instead of an unbounded `-j`.
rocm_perftest:
ifneq (,$(wildcard perftest/autogen.sh))
	cd perftest && ./autogen.sh && ./configure --enable-rocm --with-rocm=$(ROCM_PATH) --prefix=$(SB_MICRO_PATH) && make -j $(NUM_MAKE_JOBS) && make install
endif
126
127
128
129
# CPU-only flavour of perftest: plain configure with no GPU options.
# Skipped when ./perftest/autogen.sh is absent at parse time; the build is
# capped at NUM_MAKE_JOBS jobs instead of an unbounded `-j`.
cpu_perftest:
ifneq (,$(wildcard perftest/autogen.sh))
	cd perftest && ./autogen.sh && ./configure --prefix=$(SB_MICRO_PATH) && make -j $(NUM_MAKE_JOBS) && make install
endif
130

one's avatar
one committed
131
# Build FIO from commit ed675d347 (fio-3.41 tag).
# Skipped when ./fio/Makefile is absent at parse time; the build is capped at
# NUM_MAKE_JOBS jobs instead of an unbounded `-j`.
fio:
ifneq (,$(wildcard fio/Makefile))
	cd ./fio && ./configure --prefix=$(SB_MICRO_PATH) --disable-native && make -j $(NUM_MAKE_JOBS) && make install
endif
136

one's avatar
one committed
137
# Build rccl-tests from commit 66e513c of default branch.
# Skipped when ./rccl-tests/install.sh is absent at parse time.
rocm_rccl_tests: sb_micro_path
ifneq (, $(wildcard rccl-tests/install.sh))
# The hipify-perl symlink is best-effort. It is grouped in braces so that its
# `|| true` cannot also mask a failed `cd` and run install.sh in the wrong
# directory.
	cd ./rccl-tests && \
	{ ln -sf $$(which hipify-perl) $(ROCM_PATH)/bin/hipify-perl || true; } && \
	./install.sh --mpi --mpi_home $(MPI_HOME) \
	--rocm_home $(ROCM_PATH) \
	--rccl_home $(RCCL_HOME) \
	--hip_compiler hipcc \
	--gpu_targets $$(paste -sd ',' $(ROCM_PATH)/bin/target.lst)
	cp -v -r ./rccl-tests/build/* $(SB_MICRO_PATH)/bin/
endif
149
150
151
152
153
154

# Build rocblas-bench.
# RocBLAS is released with rocm, like rocm-4.2.0 and so on.
# The version we use is the released tag which is consistent with the rocm version in the environment or docker.
# Since it takes several hours to build, avoid building again if rocblas-bench exists.
# The binary is copied from $(CURDIR)/rocBLAS, i.e. the directory the clone was
# made in, rather than assuming this Makefile lives in $(SB_MICRO_PATH)/third_party.
rocm_rocblas: sb_micro_path
	@if [ ! -e $(SB_MICRO_PATH)/bin/rocblas-bench ] && [ -z `which rocblas-bench` ]; then \
		if [ -d rocBLAS ]; then rm -rf rocBLAS; fi; \
		git clone -b ${ROCBLAS_BRANCH} https://github.com/ROCmSoftwarePlatform/rocBLAS.git ./rocBLAS; \
		sed -i 's|#include "gemm.hpp"|#include "Tensile/gemm.hpp"|' rocBLAS/clients/benchmarks/../../library/src/blas3/rocblas_trtri.hpp; \
		cd ./rocBLAS && ./install.sh --dependencies --clients-only; \
		cp -v $(CURDIR)/rocBLAS/build/release/clients/staging/rocblas-bench $(SB_MICRO_PATH)/bin/; \
	fi
162

163
164
165
166
167
168
169
# Build hipblaslt-bench.
# hipBLASLt is released with rocm, like rocm-4.2.0 and so on.
# The version we use is the released tag which is consistent with the rocm version in the environment or docker.
# Since it takes several hours to build, avoid building again if hipblaslt-bench exists.
# The binary is copied from $(CURDIR)/hipBLASLt, i.e. the directory the clone was
# made in, rather than assuming this Makefile lives in $(SB_MICRO_PATH)/third_party.
rocm_hipblaslt: sb_micro_path
	@if [ ! -e $(SB_MICRO_PATH)/bin/hipblaslt-bench ] && [ -z `which hipblaslt-bench` ]; then \
		if [ -d hipBLASLt ]; then rm -rf hipBLASLt; fi; \
		git clone -b ${HIPBLASLT_BRANCH} https://github.com/ROCmSoftwarePlatform/hipBLASLt.git ./hipBLASLt; \
		cd ./hipBLASLt && ./install.sh -dc; \
		cp -v $(CURDIR)/hipBLASLt/build/release/clients/staging/hipblaslt-bench $(SB_MICRO_PATH)/bin/;  \
	fi

175
176
177
# Build hipBusBandwidth.
# HIP is released with rocm, like rocm-4.2.0 and so on.
# The version we use is the released tag which is consistent with the rocm version in the environment or docker.
# Any previous ./HIP checkout is removed first (same pattern as
# cuda_bandwidthTest) so that re-running the target does not fail in git clone.
rocm_bandwidthTest: sb_micro_path
	if [ -d HIP ]; then rm -rf HIP; fi
	git clone -b ${ROCM_VER} https://github.com/ROCm-Developer-Tools/HIP.git
	cd ./HIP/samples/1_Utils/hipBusBandwidth/ && mkdir -p build && cd build && cmake .. && make
	cp -v ./HIP/samples/1_Utils/hipBusBandwidth/build/hipBusBandwidth $(SB_MICRO_PATH)/bin/
182

183
184
185
186
187
188
189
190
191
192
193
194
# Build BabelStream hip-stream from submodule tag v5.0.
# Configures the HIP model with hipcc into ./BabelStream/build and copies the
# resulting hip-stream binary to the install bin directory.
# Skipped when ./BabelStream/CMakeLists.txt is absent at parse time.
rocm_babelstream_hip: sb_micro_path
ifneq (,$(wildcard BabelStream/CMakeLists.txt))
	cmake -S ./BabelStream -B ./BabelStream/build \
		-DMODEL=hip \
		-DCMAKE_CXX_COMPILER=hipcc \
		-DCXX_EXTRA_FLAGS="--gpu-max-threads-per-block=1024" && \
	cmake --build ./BabelStream/build -j $(NUM_MAKE_JOBS)
	cp -v ./BabelStream/build/hip-stream $(SB_MICRO_PATH)/bin/
endif

one's avatar
one committed
195
196
197
198
199
200
201
202
203
204
205
206
207
# Build rocHPCG and stage helper scripts for SuperBench DTK image.
# rochpcg-scripts-bw.patch is applied only while ./rocHPCG/run_rochpcg does not
# exist yet. Skipped when ./rocHPCG/install.sh is absent at parse time.
rocm_hpcg: sb_micro_path
ifneq (,$(wildcard rocHPCG/install.sh))
	cd ./rocHPCG && \
	if [ ! -f ./run_rochpcg ]; then git apply ../rochpcg-scripts-bw.patch; fi && \
	./install.sh --with-rocm=$(ROCM_PATH) --with-mpi=$(MPI_HOME) --gpu-aware-mpi=on
	cp -v ./rocHPCG/build/release/rochpcg-install/bin/rochpcg $(SB_MICRO_PATH)/bin/
	cp -v ./rocHPCG/run_rochpcg $(SB_MICRO_PATH)/bin/
	chmod +x $(SB_MICRO_PATH)/bin/rochpcg $(SB_MICRO_PATH)/bin/run_rochpcg
endif

208
209
210
211
212
# Build GPCNET from commit c56fd9.
# The build runs inside one bash process so the HPC-X environment loaded by
# hpcx_load is visible to the mpicc build and is unloaded again afterwards.
gpcnet: sb_micro_path
	bash -c "source ${HPCX_HOME}/hpcx-init.sh && hpcx_load && make CC=mpicc -C GPCNET all && hpcx_unload"
	cp -v ./GPCNET/network_test ./GPCNET/network_load_test $(SB_MICRO_PATH)/bin/
213

rafsalas19's avatar
rafsalas19 committed
214
# Build GPU burn from main branch (only branch that exists)
# Copies both the gpu_burn binary and compare.ptx to the install bin directory.
# Skipped when ./gpu-burn/Makefile is absent at parse time.
cuda_gpuburn: sb_micro_path
ifneq (,$(wildcard gpu-burn/Makefile))
	cd ./gpu-burn && make
	cp -v ./gpu-burn/gpu_burn ./gpu-burn/compare.ptx $(SB_MICRO_PATH)/bin/
endif
rafsalas19's avatar
rafsalas19 committed
221

222
# Build HPL from main branch
# Downloads hpl-2.3, drops the zen3/zen4 Make.* files into it and builds both.
# `wget -O` overwrites the tarball on re-runs instead of saving a
# hpl-2.3.tar.gz.1 next to a stale copy.
# Skipped when ./hpl-tests/Makefile is absent at parse time.
cpu_hpl: sb_micro_path
ifneq (,$(wildcard hpl-tests/Makefile))
	cd ./hpl-tests && \
	wget -O hpl-2.3.tar.gz https://netlib.org/benchmark/hpl/hpl-2.3.tar.gz && \
	tar xzf hpl-2.3.tar.gz && \
	cp Make.Linux_zen3 hpl-2.3 && \
	cp Make.Linux_zen4 hpl-2.3 && \
	make all
	cp -v ./hpl-tests/hpl-2.3/bin/Linux_zen3/xhpl $(SB_MICRO_PATH)/bin/xhpl_z3
	cp -v ./hpl-tests/hpl-2.3/bin/Linux_zen4/xhpl $(SB_MICRO_PATH)/bin/xhpl_z4
	cp -v ./hpl-tests/hpl_run.sh $(SB_MICRO_PATH)/bin/
	cp -v ./hpl-tests/bindmem.sh $(SB_MICRO_PATH)/bin/
	cp -v ./hpl-tests/template_hpl.dat $(SB_MICRO_PATH)/bin/
endif

238
# Build STREAM
# Downloads stream.c into ./stream-tests and builds it there. `wget -O`
# overwrites stream.c on re-runs instead of leaving a stale copy next to a new
# stream.c.1 (which the stream* glob below would also copy).
# Skipped when ./stream-tests/Makefile is absent at parse time.
cpu_stream: sb_micro_path
ifneq (,$(wildcard stream-tests/Makefile))
	cd ./stream-tests && \
	wget -O stream.c https://www.cs.virginia.edu/stream/FTP/Code/stream.c && \
	make all
	cp -v ./stream-tests/stream* $(SB_MICRO_PATH)/bin/
endif
246
247
248
249
250
251
252
253

# Build AMD Encoder Latency Test
# The recipe uses Windows cmd syntax (`if exist`, `start /wait`, `del`).
# It clones AMF v1.4.29 if the AMF directory is missing. If the VS2019 sample
# solution is present, it downloads and installs VS Build Tools into
# C:/temp/BuildTools and builds the EncoderLatency project with MSBuild.
# The parenthesised command is one continued line; nothing may sit between the
# backslash-terminated pieces.
directx_amf_encoding_latency:
	@if not exist "AMF" (git clone -b v1.4.29 https://github.com/GPUOpen-LibrariesAndSDKs/AMF.git)
	@if exist "AMF\amf\public\samples\CPPSamples_vs2019.sln" ( \
		curl -L -o vs_buildtools.exe https://aka.ms/vs/16/release/vs_buildtools.exe && echo "Downloaded vs_buildtools.exe" && \
		start /wait vs_buildtools.exe --quiet --wait --norestart --nocache --installPath C:/temp/BuildTools --add Microsoft.VisualStudio.Workload.VCTools --add Microsoft.VisualStudio.Component.VC.ATLMFC --includeRecommended  && echo "Installed VS Build Tools" && \
		del vs_buildtools.exe && echo "Deleted vs_buildtools.exe" && \
		"C:\temp\BuildTools\MSBuild\Current\Bin\MSBuild.exe" "AMF\amf\public\samples\CPPSamples_vs2019.sln" /t:EncoderLatency /p:Platform=x64 /p:Configuration=Release /p:OutDir="%SB_MICRO_PATH%\bin" \
	)
256

257
# Install requirements for Megatron-LM
# Runs pip from inside ./Megatron against its requirements.txt.
megatron_lm:
	cd Megatron && python -m pip install --no-cache-dir -r requirements.txt
261

262
# Install requirements for Megatron-DeepSpeed
# Same requirements.txt install as megatron_lm, followed by DeepSpeed itself.
megatron_deepspeed:
	cd Megatron && python -m pip install --no-cache-dir -r requirements.txt && python -m pip install DeepSpeed
267

268
269
270
271
272
273
274
275
276
277
278
279
# Fetch the ROCm fork of Megatron-LM (branch rocm_dev) into Megatron/rocm,
# copy the deepseek_v2 pretrain script to the top of that checkout, then
# install grouped_gemm pinned at commit 8a9b438.
# Both clones are guarded so the target can be re-run on an existing tree.
rocm_megatron_lm:
	cd Megatron && mkdir -p rocm && cd rocm && \
	if [ ! -d "Megatron-LM" ]; then \
		git clone -b rocm_dev https://github.com/ROCm/Megatron-LM.git ; \
	fi
	cp Megatron/rocm/Megatron-LM/examples/deepseek_v2/pretrain_deepseek.py Megatron/rocm/Megatron-LM/
	if [ ! -d "grouped_gemm" ]; then \
		git clone https://github.com/caaatch22/grouped_gemm.git ; \
	fi
	cd grouped_gemm && \
	git checkout 8a9b438 && \
	git submodule update --init --recursive && \
	pip install .

280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
# Install apex of ROCm due to dependency of Megatron
# The apex branch is chosen from the installed torch version:
#   torch newer than 2.1 -> master
#   torch 2.1            -> release/1.1.0
#   torch 2.0 or 1.x     -> release/1.0.0
# "Newer than 2.1" is major > 2, or major == 2 with minor > 1, so 2.2+ and 3.0
# both select master.
# If the torch version cannot be determined the numeric tests fail and the
# checkout is left as it is.
apex_rocm:
	$(eval TORCH_VERSION ?= $(shell python -c "import torch; print(torch.__version__)"))
	$(eval TORCH_MAJOR_VERSION ?= $(word 1,$(subst ., ,$(TORCH_VERSION))))
	$(eval TORCH_MINOR_VERSION ?= $(word 2,$(subst ., ,$(TORCH_VERSION))))
	if [ ! -d "apex" ]; then \
		git clone https://github.com/ROCmSoftwarePlatform/apex.git ; \
	fi
	cd apex && \
	if [ "$(TORCH_MAJOR_VERSION)" -gt 2 ] || { [ "$(TORCH_MAJOR_VERSION)" -eq 2 ] && [ "$(TORCH_MINOR_VERSION)" -gt 1 ]; }; then \
		git checkout master ; \
	elif [ "$(TORCH_MAJOR_VERSION)" -eq 2 ] && [ "$(TORCH_MINOR_VERSION)" -eq 1 ]; then \
		git checkout release/1.1.0 ; \
	elif [ "$(TORCH_MAJOR_VERSION)" -eq 2 ] && [ "$(TORCH_MINOR_VERSION)" -eq 0 ]; then \
		git checkout release/1.0.0 ; \
	elif [ "$(TORCH_MAJOR_VERSION)" -eq 1 ]; then \
		git checkout release/1.0.0 ; \
	fi
	pip install -v --disable-pip-version-check --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./apex

300
301
# Build MSCCL for CUDA
# ARCHS is the list of SM versions for the detected CUDA version. It is turned
# into one double-quoted string of -gencode flags (NVCC_GENCODE) that is passed
# to the executor and test builds.
# For CUDA >= 12.8 msccl is re-cloned at commit 87048bd.
# Each of the three sub-builds is skipped when its Makefile is absent at parse time.
cuda_msccl: sb_micro_path
ifeq ($(shell echo $(CUDA_VER)">=12.9" | bc -l), 1)
	$(eval ARCHS := 100 103)
	if [ -d msccl ]; then rm -rf msccl; fi; \
	git clone --single-branch --branch main https://github.com/Azure/msccl.git \
	&& git -C msccl checkout 87048bd && git -C msccl submodule update --recursive --init
else ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
# Get commit 87048bd from msccl to support updated nccl and sm_100
	$(eval ARCHS := 75 80 86 89 90 100)
	if [ -d msccl ]; then rm -rf msccl; fi; \
	git clone --single-branch --branch main https://github.com/Azure/msccl.git \
	&& git -C msccl checkout 87048bd && git -C msccl submodule update --recursive --init
else ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1)
	$(eval ARCHS := 70 75 80 86 89 90)
else
# No trailing quote here: a stray '"' would end up inside the quoted
# NVCC_GENCODE string and unbalance the shell quoting.
	$(eval ARCHS := 70 75 80 86)
endif
	$(eval NVCC_GENCODE := "$(foreach arch, $(ARCHS), $(NVCC_GENCODE) -gencode=arch=compute_$(arch),code=sm_$(arch))")
ifneq (,$(wildcard msccl/executor/msccl-executor-nccl/Makefile))
	cd ./msccl/executor/msccl-executor-nccl && \
	make -j ${NUM_MAKE_JOBS} src.build NVCC_GENCODE=$(NVCC_GENCODE)
	mkdir -p $(SB_MICRO_PATH)/lib/msccl-executor-nccl && \
	cp -r -v ./msccl/executor/msccl-executor-nccl/build/* $(SB_MICRO_PATH)/lib/msccl-executor-nccl/
endif
ifneq (,$(wildcard msccl/scheduler/msccl-scheduler/Makefile))
	cd ./msccl/scheduler/msccl-scheduler && \
	CXX=nvcc BIN_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl SRC_HOME=../../../msccl/executor/msccl-executor-nccl make -j ${NUM_MAKE_JOBS}
	mkdir -p $(SB_MICRO_PATH)/lib/msccl-scheduler && \
	cp -r -v ./msccl/scheduler/msccl-scheduler/build/* $(SB_MICRO_PATH)/lib/msccl-scheduler/
endif
ifneq (,$(wildcard msccl/tests/msccl-tests-nccl/Makefile))
	cd ./msccl/tests/msccl-tests-nccl && \
	make MPI=1 MPI_HOME=$(MPI_HOME) NCCL_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl NVCC_GENCODE=$(NVCC_GENCODE) -j ${NUM_MAKE_JOBS}
	mkdir -p $(SB_MICRO_PATH)/bin/msccl-tests-nccl && \
	cp -r -v ./msccl/tests/msccl-tests-nccl/build/* $(SB_MICRO_PATH)/bin/msccl-tests-nccl/
endif
339
340
341

# Build nvbandwidth.
# nvbandwidth.patch is applied only when it is not already applied: if
# `git apply --reverse --check` succeeds the patch is present and is skipped,
# so re-running the target does not fail in `git apply`.
nvbandwidth: sb_micro_path
	cd ./nvbandwidth && \
	{ git apply --reverse --check ../nvbandwidth.patch 2>/dev/null || git apply ../nvbandwidth.patch; } && \
	cp ../nvbandwidth_testcases_patched.h ./testcases_patched.h && \
	cmake . && make
	cp -v ./nvbandwidth/nvbandwidth $(SB_MICRO_PATH)/bin