#!/bin/bash
# This script is intended to run inside `go generate`;
# the working directory must be llm/generate/

# First we build one or more CPU based LLM libraries
#
# Then if we detect CUDA, we build a CUDA dynamic library, and carry the required
# library dependencies
#
# Then if we detect ROCm, we build a dynamically loaded ROCm lib.  The ROCm
# libraries are quite large, and also dynamically load data files at runtime
# which in turn are large, so we don't attempt to carry them as payload

# Shell options for the whole script:
#   errexit  - stop at the first command that fails
#   xtrace   - print each command before it is executed
#   pipefail - a pipeline fails when any of its stages fails
set -o errexit -o xtrace -o pipefail

# See https://llvm.org/docs/AMDGPUUsage.html#processors for reference
#
# Print the AMD GPU targets to build for.
# Globals:
#   AMDGPU_TARGETS (read) - when non-empty it is printed unchanged and the
#                           built-in list is ignored
# Outputs:
#   stdout: either the override as given, or the built-in list joined with ';'
#           and wrapped in literal single quotes
amdGPUs() {
    if [ -n "${AMDGPU_TARGETS}" ]; then
        echo "${AMDGPU_TARGETS}"
        return
    fi
    # Keep the list and the IFS change local so nothing leaks into the caller.
    local -a gpu_list=(
        "gfx900"
        "gfx906:xnack-"
        "gfx908:xnack-"
        "gfx90a:xnack+"
        "gfx90a:xnack-"
        "gfx940"
        "gfx941"
        "gfx942"
        "gfx1010"
        "gfx1012"
        "gfx1030"
        "gfx1100"
        "gfx1101"
        "gfx1102"
    )
    # "${array[*]}" joins the elements with the first character of IFS.
    local IFS=';'
    echo "'${gpu_list[*]}'"
}

echo "Starting linux generate script"

# Locate the CUDA compiler unless the caller already provided CUDACXX.
if [ -z "${CUDACXX}" ]; then
    if [ -x /usr/local/cuda/bin/nvcc ]; then
        export CUDACXX=/usr/local/cuda/bin/nvcc
    else
        # Try the default location in case it exists
        # A missing nvcc is not an error here: CUDACXX is exported empty, and
        # the explicit "|| true" keeps errexit from aborting the script.
        CUDACXX=$(command -v nvcc || true)
        export CUDACXX
    fi
fi
COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
# Quoted so that a checkout path containing spaces still resolves.
# NOTE(review): init_vars, git_module_setup and apply_patches are not defined in
# this file; presumably they come from gen_common.sh - confirm.
source "$(dirname "$0")/gen_common.sh"
init_vars
git_module_setup
apply_patches
# Static library build. Skipped when OLLAMA_SKIP_STATIC_GENERATE is non-empty,
# or when OLLAMA_CPU_TARGET selects a target other than "static".
# NOTE(review): init_vars and build are not defined in this file; presumably
# they come from gen_common.sh - confirm.
init_vars
if [[ -z "${OLLAMA_SKIP_STATIC_GENERATE}" ]]; then
    if [[ -z "${OLLAMA_CPU_TARGET}" || "${OLLAMA_CPU_TARGET}" == "static" ]]; then
        # Static build for linking into the Go binary
        init_vars
        CMAKE_TARGETS="--target llama --target ggml"
        CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
        BUILD_DIR="../build/linux/${ARCH}_static"
        echo "Building static library"
        build
    fi
fi

# CPU runner builds. Skipped entirely when OLLAMA_SKIP_CPU_GENERATE is non-empty.
# An empty OLLAMA_CPU_TARGET builds every variant; otherwise only the named one
# ("cpu", "cpu_avx" or "cpu_avx2") is built.
# NOTE(review): init_vars, build and compress are not defined in this file;
# presumably they come from gen_common.sh - confirm.
init_vars
if [[ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]]; then

    # Users building from source can tune the exact flags we pass to cmake for configuring
    # llama.cpp, and we'll build only 1 CPU variant in that case as the default.
    if [[ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]]; then
        init_vars
        echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
        BUILD_DIR="../build/linux/${ARCH}/cpu"
        echo "Building custom CPU"
        build
        compress
    else
        # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
        # -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
        # -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
        # -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
        # -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
        # Note: the following seem to yield slower results than AVX2 - ymmv
        # -DLLAMA_AVX512 -- 2017 Intel Skylake and High End DeskTop (HEDT)
        # -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake
        # -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake

        COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off"
        if [[ -z "${OLLAMA_CPU_TARGET}" || "${OLLAMA_CPU_TARGET}" == "cpu" ]]; then
            #
            # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
            #
            init_vars
            CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
            BUILD_DIR="../build/linux/${ARCH}/cpu"
            echo "Building LCD CPU"
            build
            compress
        fi

        if [[ "${ARCH}" == "x86_64" ]]; then
            #
            # ARM chips in M1/M2/M3-based MACs and NVidia Tegra devices do not currently support avx extensions.
            #
            if [[ -z "${OLLAMA_CPU_TARGET}" || "${OLLAMA_CPU_TARGET}" == "cpu_avx" ]]; then
                #
                # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
                # Approximately 400% faster than LCD on same CPU
                #
                init_vars
                CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
                BUILD_DIR="../build/linux/${ARCH}/cpu_avx"
                echo "Building AVX CPU"
                build
                compress
            fi

            if [[ -z "${OLLAMA_CPU_TARGET}" || "${OLLAMA_CPU_TARGET}" == "cpu_avx2" ]]; then
                #
                # ~2013 CPU Dynamic library
                # Approximately 10% faster than AVX on same CPU
                #
                init_vars
                CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
                BUILD_DIR="../build/linux/${ARCH}/cpu_avx2"
                echo "Building AVX2 CPU"
                build
                compress
            fi
        fi
    fi
else
    echo "Skipping CPU generation step as requested"
fi

# Resolve CUDA_LIB_DIR when the caller has not set it: prefer the default CUDA
# toolkit location, then fall back to the Arch Linux layout.
if [[ -z "${CUDA_LIB_DIR}" ]]; then
    if [[ -d /usr/local/cuda/lib64 ]]; then
        CUDA_LIB_DIR=/usr/local/cuda/lib64
    elif [[ -d /opt/cuda/targets/x86_64-linux/lib ]]; then
        CUDA_LIB_DIR=/opt/cuda/targets/x86_64-linux/lib
    fi
fi

# CUDART_LIB_DIR can be overridden separately in case libcudart lives somewhere
# else; when empty it follows CUDA_LIB_DIR.
CUDART_LIB_DIR="${CUDART_LIB_DIR:-${CUDA_LIB_DIR}}"

# CUDA runner build; runs only when CUDA_LIB_DIR names an existing directory.
# NOTE(review): init_vars, build and compress are not defined in this file;
# presumably they come from gen_common.sh - confirm.
if [[ -d "${CUDA_LIB_DIR}" ]]; then
    echo "CUDA libraries detected - building dynamic CUDA library"
    init_vars
    # Take the major version from the first libcudart.so.<major>[.<...>] file.
    # Only the file name is inspected, so dots elsewhere in CUDA_LIB_DIR
    # (e.g. /usr/local/cuda-12.2/lib64) cannot skew the result.
    CUDA_MAJOR=""
    for cudart in "${CUDA_LIB_DIR}"/libcudart.so.*; do
        # An unmatched glob stays literal; -e filters that case out.
        if [[ -e "${cudart}" ]]; then
            CUDA_MAJOR="${cudart##*/libcudart.so.}"
            CUDA_MAJOR="${CUDA_MAJOR%%.*}"
            break
        fi
    done
    if [[ -n "${CUDA_MAJOR}" ]]; then
        CUDA_VARIANT=_v${CUDA_MAJOR}
    fi
    if [[ "${ARCH}" == "arm64" ]]; then
        echo "ARM CPU detected - disabling unsupported AVX instructions"

        # ARM-based CPUs such as M1 and Tegra do not support AVX extensions.
        #
        # CUDA compute < 6.0 lacks proper FP16 support on ARM.
        # Disabling has minimal performance effect while maintaining compatibility.
        ARM64_DEFS="-DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_CUDA_F16=off"
    fi
    CMAKE_DEFS="-DLLAMA_CUDA=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS}"
    BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
    EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
    build

    # Carry the CUDA libs as payloads to help reduce dependency burden on users
    #
    # TODO - in the future we may shift to packaging these separately and conditionally
    #        downloading them in the install script.
    DEPS="$(ldd "${BUILD_DIR}/bin/ollama_llama_server")"
    for lib in libcudart.so libcublas.so libcublasLt.so; do
        # First column of the matching ldd line; xargs trims the leading tab.
        # "|| true" keeps pipefail/errexit quiet when the library is not linked.
        DEP=$(printf '%s\n' "${DEPS}" | grep -F -- "${lib}" | cut -f1 -d' ' | xargs || true)
        if [[ -n "${DEP}" && -e "${CUDA_LIB_DIR}/${DEP}" ]]; then
            cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/bin/"
        elif [[ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]]; then
            cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/bin/"
        elif [[ -e "${CUDART_LIB_DIR}/${lib}" ]]; then
            # The glob must sit outside the quotes to be expanded.
            cp -d "${CUDART_LIB_DIR}/${lib}"* "${BUILD_DIR}/bin/"
        else
            cp -d "${CUDA_LIB_DIR}/${lib}"* "${BUILD_DIR}/bin/"
        fi
    done
    compress
fi

# Fall back to the default ROCm location when ROCM_PATH is not provided.
ROCM_PATH="${ROCM_PATH:-/opt/rocm}"

# Export the default CLBlast cmake directory, but only when the caller has not
# chosen one and the default location actually exists.
if [[ -z "${CLBlast_DIR}" && -d /usr/lib/cmake/CLBlast ]]; then
    export CLBlast_DIR=/usr/lib/cmake/CLBlast
fi

# ROCm runner build; runs only when ROCM_PATH names an existing directory.
# NOTE(review): init_vars, build and compress are not defined in this file;
# presumably they come from gen_common.sh - confirm.
if [[ -d "${ROCM_PATH}" ]]; then
    echo "ROCm libraries detected - building dynamic ROCm library"
    # Use the last dot-separated component of the first
    # librocblas.so.<a>.<b>.<5 chars> file as the variant suffix. Only the file
    # name is inspected, so a dotted ROCM_PATH (e.g. /opt/rocm-6.0.2) and
    # multiple matching files are both handled.
    for rocblas in "${ROCM_PATH}"/lib/librocblas.so.*.*.?????; do
        if [[ -f "${rocblas}" ]]; then
            ROCM_VARIANT="_v${rocblas##*.}"
            break
        fi
    done
    init_vars
    # Resolve the target list once and reuse it for both cmake variables.
    ROCM_GPU_TARGETS="$(amdGPUs)"
    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=${ROCM_GPU_TARGETS} -DGPU_TARGETS=${ROCM_GPU_TARGETS}"
    BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}"
    EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
    build

    # Record the ROCM dependencies
    # One resolved library path per line, limited to rocm/amdgpu/libtinfo.
    # "|| true": an empty result is reported by the line-count check below
    # rather than by errexit/pipefail.
    rm -f "${BUILD_DIR}/bin/deps.txt"
    ldd "${BUILD_DIR}/bin/ollama_llama_server" \
        | grep "=>" \
        | cut -f2 -d= \
        | cut -f2 -d' ' \
        | grep -e rocm -e amdgpu -e libtinfo \
        > "${BUILD_DIR}/bin/deps.txt" || true
    # bomb out if for some reason we didn't get a few deps
    if (( $(wc -l < "${BUILD_DIR}/bin/deps.txt") < 8 )); then
        cat "${BUILD_DIR}/bin/deps.txt"
        echo "ERROR: deps file short"
        exit 1
    fi
    compress
fi

# NOTE(review): cleanup is not defined in this file; presumably it comes from
# gen_common.sh - confirm.
cleanup
# List the entries next to the last BUILD_DIR. "&&" keeps the listing empty
# instead of showing the current directory when the cd fails.
echo "go generate completed.  LLM runners: $(cd "${BUILD_DIR}/.." && echo *)"