Optimize container images for startup (#6547)

* Optimize container images for startup

  This change adjusts how runner payloads are handled to support container builds where we keep them extracted in the filesystem. This makes it easier to optimize the cpu/cuda vs cpu/rocm images for size, and should result in faster startup times for container images.

* Refactor payload logic and add buildx support for faster builds
* Move payloads around
* Review comments
* Converge to buildx based helper scripts
* Use docker buildx action for release

Optimize container images for startup (#6547)
* Optimize container images for startup

  This change adjusts how runner payloads are handled to support container builds where we keep them extracted in the filesystem. This makes it easier to optimize the cpu/cuda vs cpu/rocm images for size, and should result in faster startup times for container images.

* Refactor payload logic and add buildx support for faster builds
* Move payloads around
* Review comments
* Converge to buildx based helper scripts
* Use docker buildx action for release
cd5c8f64 · Daniel Hiltgen · GitHub · fef257c5 · cd5c8f64 · cd5c8f64
Unverified Commit cd5c8f64 authored Sep 12, 2024 by Daniel Hiltgen Committed by GitHub Sep 12, 2024
20 changed files
--- a/.dockerignore
+++ b/.dockerignore
@@ -7,3 +7,5 @@ llm/llama.cpp
 .env
 .cache
 test_data
+llm/build
+llama/build
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -102,8 +102,8 @@ jobs:
        with:
          name: generate-windows-cpu
          path: |
-            llm/build/**/bin/*
-            llm/build/**/*.a
+            build/**/*
+            build/**/*.a
            dist/windows-amd64/**

  # ROCm generation step
@@ -176,7 +176,7 @@ jobs:
        with:
          name: generate-windows-rocm
          path: |
-            llm/build/**/bin/*
+            build/**/*
            dist/windows-amd64/**
      - uses: actions/upload-artifact@v4
        with:
@@ -265,7 +265,7 @@ jobs:
        with:
          name: generate-windows-cuda-${{ matrix.cuda.version }}
          path: |
-            llm/build/**/bin/*
+            build/**/*
            dist/windows-amd64/**
      - uses: actions/upload-artifact@v4
        with:
@@ -338,7 +338,7 @@ jobs:
      - uses: actions/download-artifact@v4
        with:
          name: generate-windows-rocm
-      - run: dir llm/build
+      - run: dir build
      - run: |
          $gopath=(get-command go).source | split-path -parent
          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
@@ -359,9 +359,7 @@ jobs:
    environment: release
    runs-on: linux
    env:
-      OLLAMA_SKIP_MANIFEST_CREATE: '1'
      BUILD_ARCH: amd64
-      PUSH: '1'
    steps:
      - uses: actions/checkout@v4
        with:
@@ -369,14 +367,8 @@ jobs:
      - name: Set Version
        shell: bash
        run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ vars.DOCKER_USER }}
-          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
      - run: |
          ./scripts/build_linux.sh
-          ./scripts/build_docker.sh
      - uses: actions/upload-artifact@v4
        with:
          name: dist-linux-amd64
@@ -390,9 +382,7 @@ jobs:
    environment: release
    runs-on: linux-arm64
    env:
-      OLLAMA_SKIP_MANIFEST_CREATE: '1'
      BUILD_ARCH: arm64
-      PUSH: '1'
    steps:
      - uses: actions/checkout@v4
        with:
@@ -421,14 +411,8 @@ jobs:
          sudo usermod -aG docker $USER
          sudo apt-get install acl
          sudo setfacl --modify user:$USER:rw /var/run/docker.sock
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ vars.DOCKER_USER }}
-          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
      - run: |
          ./scripts/build_linux.sh
-          ./scripts/build_docker.sh
      - uses: actions/upload-artifact@v4
        with:
          name: dist-linux-arm64
@@ -436,6 +420,181 @@ jobs:
            dist/*linux*
            !dist/*-cov

+  # Container image build
+  build-linux:
+    environment: release
+    strategy:
+      matrix:
+        runner:
+          - linux
+          - linux-arm64
+    runs-on: ${{ matrix.runner }}
+    env:
+      FINAL_IMAGE_REPO: ollama/ollama
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+      - name: 'Install Docker'
+        if: ${{ startsWith(matrix.runner, 'linux-arm64') }}
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y ca-certificates curl
+          sudo install -m 0755 -d /etc/apt/keyrings
+          sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
+          sudo chmod a+r /etc/apt/keyrings/docker.asc
+          echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
+            $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
+            sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
+          sudo apt-get update
+          sudo apt-get install -y docker-ce docker-ce-cli containerd.io
+          sudo usermod -aG docker $USER
+          sudo apt-get install acl
+          sudo setfacl --modify user:$USER:rw /var/run/docker.sock
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.FINAL_IMAGE_REPO }}
+          flavor: |
+            latest=false
+          tags: |
+            type=ref,event=tag
+            type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
+            type=semver,pattern={{version}}
+      - name: Set Version
+        shell: bash
+        run: |
+          machine=$(uname -m)
+          case ${machine} in
+            x86_64) echo ARCH=amd64; echo PLATFORM_PAIR=linux-amd64 ;;
+            aarch64) echo ARCH=arm64; echo PLATFORM_PAIR=linux-arm64 ;;
+          esac >>$GITHUB_ENV
+          echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ vars.DOCKER_USER }}
+          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
+      - name: Build and push by digest
+        id: build
+        uses: docker/build-push-action@v6
+        with:
+          context: "."
+          platforms: linux/${{ env.ARCH }}
+          build-args: |
+            GOFLAGS
+          outputs: type=image,name=${{ env.FINAL_IMAGE_REPO }},push-by-digest=true,name-canonical=true,push=true
+      - name: Export digest
+        run: |
+          mkdir -p /tmp/digests
+          digest="${{ steps.build.outputs.digest }}"
+          touch "/tmp/digests/${digest#sha256:}"
+      - name: Upload digest
+        uses: actions/upload-artifact@v4
+        with:
+          name: digests-${{ env.PLATFORM_PAIR }}
+          path: /tmp/digests/*
+          if-no-files-found: error
+          retention-days: 1
+  merge:
+    environment: release
+    runs-on: linux
+    needs:
+      - build-linux
+    env:
+      FINAL_IMAGE_REPO: ollama/ollama
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+      - name: Download digests
+        uses: actions/download-artifact@v4
+        with:
+          path: /tmp/digests
+          pattern: digests-*
+          merge-multiple: true
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.FINAL_IMAGE_REPO }}
+          flavor: |
+            latest=false
+          tags: |
+            type=ref,event=tag
+            type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
+            type=semver,pattern={{version}}
+      - name: Set Version
+        shell: bash
+        run: |
+          machine=$(uname -m)
+          case ${machine} in
+            x86_64) echo ARCH=amd64; echo PLATFORM_PAIR=linux-amd64 ;;
+            aarch64) echo ARCH=arm64; echo PLATFORM_PAIR=linux-arm64 ;;
+          esac >>$GITHUB_ENV
+          echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ vars.DOCKER_USER }}
+          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
+      - name: Create manifest list and push
+        working-directory: /tmp/digests
+        run: |
+          docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
+            $(printf '${{ env.FINAL_IMAGE_REPO }}@sha256:%s ' *)
+      - name: Inspect image
+        run: |
+          docker buildx imagetools inspect ${{ env.FINAL_IMAGE_REPO }}:${{ steps.meta.outputs.version }}          
+  build-linux-rocm:
+    environment: release
+    runs-on: linux
+    env:
+      FINAL_IMAGE_REPO: ollama/ollama
+      ARCH: amd64
+      PLATFORM_PAIR: linux-amd64
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.FINAL_IMAGE_REPO }}
+          flavor: |
+            latest=false
+          tags: |
+            type=ref,event=tag
+            type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
+            type=semver,pattern={{version}}
+      - name: Set Version
+        shell: bash
+        run: |
+          echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ vars.DOCKER_USER }}
+          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
+      - name: Build and push by digest
+        id: build
+        uses: docker/build-push-action@v6
+        with:
+          context: "."
+          target: runtime-rocm
+          build-args: |
+            GOFLAGS
+          tags: ${{ env.FINAL_IMAGE_REPO }}:${{ env.DOCKER_METADATA_OUTPUT_VERSION}}-rocm,${{ env.FINAL_IMAGE_REPO }}:rocm
+          push: true
+
  # Aggregate all the assets and ship a release
  release:
    needs:
@@ -448,8 +607,6 @@ jobs:
    permissions:
      contents: write
    env:
-      OLLAMA_SKIP_IMAGE_BUILD: '1'
-      PUSH: '1'
      GH_TOKEN: ${{ github.token }}
    steps:
      - uses: actions/checkout@v4
@@ -458,12 +615,6 @@ jobs:
        run: |
          echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
          echo "RELEASE_VERSION=$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)" >> $GITHUB_ENV
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ vars.DOCKER_USER }}
-          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
-      - run: ./scripts/build_docker.sh
      - name: Retrieve built artifact
        uses: actions/download-artifact@v4
        with:

--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -81,12 +81,6 @@ jobs:
        if: ${{ ! startsWith(matrix.os, 'windows-') }}
        name: 'Unix Go Generate'
      - run: go build .
-      - uses: actions/upload-artifact@v4
-        with:
-          name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
-          path: |
-            llm/build/**/bin/*
-            llm/build/**/*.a
  generate-cuda:
    needs: [changes]
    if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }}
@@ -114,12 +108,6 @@ jobs:
          go generate -x ./...
        env:
          OLLAMA_SKIP_CPU_GENERATE: '1'
-      - uses: actions/upload-artifact@v4
-        with:
-          name: cuda-${{ matrix.cuda-version }}-libraries
-          path: |
-            llm/build/**/bin/*
-            dist/windows-amd64/**
  generate-rocm:
    needs: [changes]
    if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}
@@ -147,12 +135,6 @@ jobs:
          go generate -x ./...
        env:
          OLLAMA_SKIP_CPU_GENERATE: '1'
-      - uses: actions/upload-artifact@v4
-        with:
-          name: rocm-${{ matrix.rocm-version }}-libraries
-          path: |
-            llm/build/**/bin/*
-            dist/windows-amd64/**

  # ROCm generation step
  generate-windows-rocm:
@@ -189,7 +171,6 @@ jobs:
        name: go generate
        env:
          OLLAMA_SKIP_CPU_GENERATE: '1'
-      # TODO - do we need any artifacts?

  # CUDA generation step
  generate-windows-cuda:
@@ -231,7 +212,6 @@ jobs:
          go generate -x ./...
        env:
          OLLAMA_SKIP_CPU_GENERATE: '1'
-      # TODO - do we need any artifacts?

  lint:
    strategy:
@@ -263,14 +243,6 @@ jobs:
            arm64) echo ARCH=arm64 ;;
          esac >>$GITHUB_ENV
        shell: bash
-      - run: |
-          mkdir -p llm/build/linux/$ARCH/stub/bin
-          touch llm/build/linux/$ARCH/stub/bin/ollama_llama_server
-        if: ${{ startsWith(matrix.os, 'ubuntu-') }}
-      - run: |
-          mkdir -p llm/build/darwin/$ARCH/stub/bin
-          touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
-        if: ${{ startsWith(matrix.os, 'macos-') }}
      - uses: golangci/golangci-lint-action@v6
        with:
          args: --timeout 8m0s -v
@@ -301,23 +273,10 @@ jobs:
          cache: true
      - run: |
          case ${{ matrix.arch }} in
-            amd64) echo ARCH=x86_64 ;;
+            amd64) echo ARCH=amd64 ;;
            arm64) echo ARCH=arm64 ;;
          esac >>$GITHUB_ENV
        shell: bash
-      - run: |
-          mkdir -p llm/build/linux/$ARCH/stub/bin
-          touch llm/build/linux/$ARCH/stub/bin/ollama_llama_server
-        if: ${{ startsWith(matrix.os, 'ubuntu-') }}
-      - run: |
-          mkdir -p llm/build/darwin/$ARCH/stub/bin
-          touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
-        if: ${{ startsWith(matrix.os, 'macos-') }}
-        shell: bash
      - run: go generate ./...
      - run: go build
      - run: go test -v ./...
-      - uses: actions/upload-artifact@v4
-        with:
-          name: ${{ matrix.os }}-binaries
-          path: ollama
--- a/.gitignore
+++ b/.gitignore
@@ -12,4 +12,7 @@ ggml-metal.metal
 test_data
 *.crt
 llm/build
+build/*/*/*
+!build/**/placeholder
+llama/build
 __debug_bin*
\ No newline at end of file
--- a/Dockerfile
+++ b/Dockerfile
@@ -47,7 +47,7 @@ RUN --mount=type=cache,target=/root/.ccache \
    OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \
    bash gen_linux.sh

-FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-server-arm64
+FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-runner-arm64
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
@@ -63,7 +63,7 @@ RUN OLLAMA_SKIP_STATIC_GENERATE=1 \
    CUDA_VARIANT="_v11" \
    bash gen_linux.sh

-FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_12-devel-rockylinux8 AS cuda-12-build-server-arm64
+FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_12-devel-rockylinux8 AS cuda-12-build-runner-arm64
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
@@ -143,64 +143,103 @@ RUN --mount=type=cache,target=/root/.ccache \
    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh


-# Intermediate stage used for ./scripts/build_linux.sh
+# Intermediate stages used for ./scripts/build_linux.sh
 FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64
 ENV CGO_ENABLED=1
 WORKDIR /go/src/github.com/ollama/ollama
 COPY . .
-COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/ llm/build/
+COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
+COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
 COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
 COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
 COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
 ARG GOFLAGS
 ARG CGO_CFLAGS
 RUN --mount=type=cache,target=/root/.ccache \
    go build -trimpath -o dist/linux-amd64/bin/ollama .
+RUN cd dist/linux-$GOARCH && \
+    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
+RUN cd dist/linux-$GOARCH-rocm && \
+    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-rocm.tgz

-# Intermediate stage used for ./scripts/build_linux.sh
 FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
 ENV CGO_ENABLED=1
 ARG GOLANG_VERSION
 WORKDIR /go/src/github.com/ollama/ollama
 COPY . .
-COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/ llm/build/
+COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/build/ build/
+COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/build/ build/
 ARG GOFLAGS
 ARG CGO_CFLAGS
 RUN --mount=type=cache,target=/root/.ccache \
    go build -trimpath -o dist/linux-arm64/bin/ollama .
+RUN cd dist/linux-$GOARCH && \
+    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz

-# Strip out ROCm dependencies to keep the primary image lean
-FROM --platform=linux/amd64 ubuntu:22.04 AS amd64-libs-without-rocm
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /scratch/
-RUN cd /scratch/ollama/ && rm -rf rocblas libamd* libdrm* libroc* libhip* libhsa*
+FROM --platform=linux/amd64 scratch AS dist-amd64
+COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
+FROM --platform=linux/arm64 scratch AS dist-arm64
+COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
+FROM dist-$TARGETARCH as dist
+
+
+# Optimized container images do not carry nested payloads
+FROM --platform=linux/amd64 static-build-amd64 AS container-build-amd64
+WORKDIR /go/src/github.com/ollama/ollama
+COPY . .
+ARG GOFLAGS
+ARG CGO_CFLAGS
+RUN --mount=type=cache,target=/root/.ccache \
+    go build -trimpath -o dist/linux-amd64/bin/ollama .
+
+FROM --platform=linux/arm64 static-build-arm64 AS container-build-arm64
+WORKDIR /go/src/github.com/ollama/ollama
+COPY . .
+ARG GOFLAGS
+ARG CGO_CFLAGS
+RUN --mount=type=cache,target=/root/.ccache \
+    go build -trimpath -o dist/linux-arm64/bin/ollama .

-# Runtime stages
 FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64
-COPY --from=amd64-libs-without-rocm /scratch/ /lib/
-RUN apt-get update && apt-get install -y ca-certificates && \
+RUN apt-get update && \
+    apt-get install -y ca-certificates && \
    apt-get clean && rm -rf /var/lib/apt/lists/*
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
+COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
+COPY --from=cpu-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/

 FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64
-COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
-RUN apt-get update && apt-get install -y ca-certificates && \
+RUN apt-get update && \
+    apt-get install -y ca-certificates && \
    apt-get clean && rm -rf /var/lib/apt/lists/*
-COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
+COPY --from=container-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
+COPY --from=cpu-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/

-# Radeon images are much larger so we keep it distinct from the CPU/CUDA image
-FROM  rocm/dev-centos-7:${ROCM_VERSION}-complete AS runtime-rocm
-RUN update-pciids
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
-RUN ln -s /opt/rocm/lib /lib/ollama
+# ROCm libraries are larger so we keep this image distinct from the CPU/CUDA image
+FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm
+# Frontload the ROCm libraries, which are large and rarely change, to increase the chance of a common layer
+# across releases
+COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/lib/ /lib/
+RUN apt-get update && \
+    apt-get install -y ca-certificates && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
+COPY --from=cpu-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
 EXPOSE 11434
 ENV OLLAMA_HOST=0.0.0.0


--- a/build/darwin/amd64/placeholder
+++ b/build/darwin/amd64/placeholder
+This is here to make sure the build/ directory exists for the go:embed command
--- a/build/darwin/arm64/placeholder
+++ b/build/darwin/arm64/placeholder
+This is here to make sure the build/ directory exists for the go:embed command
--- a/build/embed_darwin_amd64.go
+++ b/build/embed_darwin_amd64.go
+package build
+
+import "embed"
+
+// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling
+
+//go:embed darwin/amd64/*
+var EmbedFS embed.FS
--- a/build/embed_darwin_arm64.go
+++ b/build/embed_darwin_arm64.go
+package build
+
+import "embed"
+
+// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling
+
+//go:embed darwin/arm64/*
+var EmbedFS embed.FS
--- a/build/embed_linux.go
+++ b/build/embed_linux.go
+package build
+
+import "embed"
+
+//go:embed linux/*
+var EmbedFS embed.FS
--- a/build/embed_unused.go
+++ b/build/embed_unused.go
+//go:build !linux && !darwin
+
+package build
+
+import "embed"
+
+// unused on windows
+var EmbedFS embed.FS
--- a/build/linux/amd64/placeholder
+++ b/build/linux/amd64/placeholder
+This is here to make sure the build/ directory exists for the go:embed command
--- a/build/linux/arm64/placeholder
+++ b/build/linux/arm64/placeholder
+This is here to make sure the build/ directory exists for the go:embed command
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -179,53 +179,6 @@ var (
 	HsaOverrideGfxVersion = String("HSA_OVERRIDE_GFX_VERSION")
 )

-func RunnersDir() (p string) {
-	if p := Var("OLLAMA_RUNNERS_DIR"); p != "" {
-		return p
-	}
-
-	if runtime.GOOS != "windows" {
-		return
-	}
-
-	defer func() {
-		if p == "" {
-			slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama/runners'")
-		}
-	}()
-
-	// On Windows we do not carry the payloads inside the main executable
-	exe, err := os.Executable()
-	if err != nil {
-		return
-	}
-
-	cwd, err := os.Getwd()
-	if err != nil {
-		return
-	}
-
-	var paths []string
-	for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), LibRelativeToExe()), cwd} {
-		paths = append(paths,
-			root,
-			filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
-			filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH),
-		)
-	}
-
-	// Try a few variations to improve developer experience when building from source in the local tree
-	for _, path := range paths {
-		candidate := filepath.Join(path, "lib", "ollama", "runners")
-		if _, err := os.Stat(candidate); err == nil {
-			p = candidate
-			break
-		}
-	}
-
-	return p
-}
-
 func Uint(key string, defaultValue uint) func() uint {
 	return func() uint {
 		if s := Var(key); s != "" {
@@ -290,7 +243,6 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_NOPRUNE":           {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
 		"OLLAMA_NUM_PARALLEL":      {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"},
 		"OLLAMA_ORIGINS":           {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
-		"OLLAMA_RUNNERS_DIR":       {"OLLAMA_RUNNERS_DIR", RunnersDir(), "Location for runners"},
 		"OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
 		"OLLAMA_TMPDIR":            {"OLLAMA_TMPDIR", TmpDir(), "Location for temporary files"},


--- a/gpu/assets.go
+++ b/gpu/assets.go
-package gpu
-
-import (
-	"errors"
-	"fmt"
-	"log/slog"
-	"os"
-	"path/filepath"
-	"runtime"
-	"strconv"
-	"strings"
-	"sync"
-	"syscall"
-	"time"
-
-	"github.com/ollama/ollama/envconfig"
-)
-
-var (
-	lock        sync.Mutex
-	payloadsDir = ""
-)
-
-func PayloadsDir() (string, error) {
-	lock.Lock()
-	defer lock.Unlock()
-	var err error
-	if payloadsDir == "" {
-		runnersDir := envconfig.RunnersDir()
-
-		if runnersDir != "" {
-			payloadsDir = runnersDir
-			return payloadsDir, nil
-		}
-
-		// The remainder only applies on non-windows where we still carry payloads in the main executable
-		cleanupTmpDirs()
-		tmpDir := envconfig.TmpDir()
-		if tmpDir == "" {
-			tmpDir, err = os.MkdirTemp("", "ollama")
-			if err != nil {
-				return "", fmt.Errorf("failed to generate tmp dir: %w", err)
-			}
-		} else {
-			err = os.MkdirAll(tmpDir, 0o755)
-			if err != nil {
-				return "", fmt.Errorf("failed to generate tmp dir %s: %w", tmpDir, err)
-			}
-		}
-
-		// Track our pid so we can clean up orphaned tmpdirs
-		n := filepath.Join(tmpDir, "ollama.pid")
-		if err := os.WriteFile(n, []byte(strconv.Itoa(os.Getpid())), 0o644); err != nil {
-			return "", fmt.Errorf("failed to write pid file %s: %w", n, err)
-		}
-
-		// We create a distinct subdirectory for payloads within the tmpdir
-		// This will typically look like /tmp/ollama3208993108/runners on linux
-		payloadsDir = filepath.Join(tmpDir, "runners")
-	}
-	return payloadsDir, nil
-}
-
-// Best effort to clean up prior tmpdirs
-func cleanupTmpDirs() {
-	matches, err := filepath.Glob(filepath.Join(os.TempDir(), "ollama*", "ollama.pid"))
-	if err != nil {
-		return
-	}
-
-	for _, match := range matches {
-		raw, err := os.ReadFile(match)
-		if errors.Is(err, os.ErrNotExist) {
-			slog.Debug("not a ollama runtime directory, skipping", "path", match)
-			continue
-		} else if err != nil {
-			slog.Warn("could not read ollama.pid, skipping", "path", match, "error", err)
-			continue
-		}
-
-		pid, err := strconv.Atoi(string(raw))
-		if err != nil {
-			slog.Warn("invalid pid, skipping", "path", match, "error", err)
-			continue
-		}
-
-		p, err := os.FindProcess(pid)
-		if err == nil && !errors.Is(p.Signal(syscall.Signal(0)), os.ErrProcessDone) {
-			slog.Warn("process still running, skipping", "pid", pid, "path", match)
-			continue
-		}
-
-		if err := os.Remove(match); err != nil {
-			slog.Warn("could not cleanup stale pidfile", "path", match, "error", err)
-		}
-
-		runners := filepath.Join(filepath.Dir(match), "runners")
-		if err := os.RemoveAll(runners); err != nil {
-			slog.Warn("could not cleanup stale runners", "path", runners, "error", err)
-		}
-
-		if err := os.Remove(filepath.Dir(match)); err != nil {
-			slog.Warn("could not cleanup stale tmpdir", "path", filepath.Dir(match), "error", err)
-		}
-	}
-}
-
-func Cleanup() {
-	lock.Lock()
-	defer lock.Unlock()
-	runnersDir := envconfig.RunnersDir()
-	if payloadsDir != "" && runnersDir == "" && runtime.GOOS != "windows" {
-		// We want to fully clean up the tmpdir parent of the payloads dir
-		tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))
-		slog.Debug("cleaning up", "dir", tmpDir)
-		err := os.RemoveAll(tmpDir)
-		if err != nil {
-			// On windows, if we remove too quickly the llama.dll may still be in-use and fail to remove
-			time.Sleep(1000 * time.Millisecond)
-			err = os.RemoveAll(tmpDir)
-			if err != nil {
-				slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
-			}
-		}
-	}
-}
-
-func UpdatePath(dir string) {
-	if runtime.GOOS == "windows" {
-		tmpDir := filepath.Dir(dir)
-		pathComponents := strings.Split(os.Getenv("PATH"), ";")
-		i := 0
-		for _, comp := range pathComponents {
-			if strings.EqualFold(comp, dir) {
-				return
-			}
-			// Remove any other prior paths to our temp dir
-			if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) {
-				pathComponents[i] = comp
-				i++
-			}
-		}
-		newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
-		slog.Info("updating", "PATH", newPath)
-		os.Setenv("PATH", newPath)
-	}
-	// linux and darwin rely on rpath
-}
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -93,10 +93,9 @@ func initCudaHandles() *cudaHandles {
 		localAppData := os.Getenv("LOCALAPPDATA")
 		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)}
 	}
-	tmpDir, _ := PayloadsDir()
-	if tmpDir != "" {
-		// TODO - add "payloads" for subprocess
-		cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", CudartMgmtName)}
+	libDir := LibraryDir()
+	if libDir != "" {
+		cudartMgmtPatterns = []string{filepath.Join(libDir, CudartMgmtName)}
 	}
 	cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...)


--- a/llm/generate/gen_common.sh
+++ b/llm/generate/gen_common.sh
@@ -31,6 +31,7 @@ init_vars() {
        NO_WHOLE_ARCHIVE=""
        GCC_ARCH="-arch ${ARCH}"
        DIST_BASE=../../dist/darwin-${GOARCH}/
+        PAYLOAD_BASE=../../build/darwin/${GOARCH}
        ;;
    "Linux")
        LIB_EXT="so"
@@ -40,6 +41,7 @@ init_vars() {
        # Cross compiling not supported on linux - Use docker
        GCC_ARCH=""
        DIST_BASE=../../dist/linux-${GOARCH}/
+        PAYLOAD_BASE=../../build/linux/${GOARCH}
        ;;
    *)
        ;;
@@ -47,7 +49,8 @@ init_vars() {
    if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
        CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
    fi
-    GZIP=$(which pigz 2>/dev/null || echo "gzip")
+    GZIP=$(command -v pigz 2>/dev/null || echo "gzip")
+    RUNNER_BASE="${DIST_BASE}/lib/ollama/runners"
 }

 git_module_setup() {
@@ -91,17 +94,34 @@ build() {
    rm -f ${BUILD_DIR}/bin/ggml-common.h ${BUILD_DIR}/bin/ggml-metal.metal
 }

+# Copy the files built under $BUILD_DIR/bin (and $BUILD_DIR/lib, if present) into $RUNNER_BASE/$RUNNER
+dist() {
+    if [ -z "${RUNNER}" ]; then
+        echo "dist: RUNNER must be set before calling dist" >&2
+        exit 1
+    fi
+    mkdir -p "${RUNNER_BASE}/${RUNNER}/"
+    cp "${BUILD_DIR}"/bin/* "${RUNNER_BASE}/${RUNNER}/"
+    # the lib directory may not exist for every build, so only copy it when it is there
+    if [ -d "${BUILD_DIR}/lib" ]; then
+        cp "${BUILD_DIR}"/lib/* "${RUNNER_BASE}/${RUNNER}/"
+    fi
+}
+
+# Compress from the build $BUILD_DIR into the $PAYLOAD_BASE/$RUNNER dir
 compress() {
-    echo "Compressing payloads to reduce overall binary size..."
-    rm -rf ${BUILD_DIR}/bin/*.gz
+    [ -z "${RUNNER}" ] && exit 1
+    echo "Compressing payloads with ${GZIP} to reduce overall binary size..."
+    rm -rf "${PAYLOAD_BASE}/${RUNNER}/"
+    mkdir -p "${PAYLOAD_BASE}/${RUNNER}/"
    for f in ${BUILD_DIR}/bin/* ; do
-        ${GZIP} -n --best -f ${f} &
+        ${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" &
        compress_pids+=" $!"
    done
    # check for lib directory
    if [ -d ${BUILD_DIR}/lib ]; then
        for f in ${BUILD_DIR}/lib/* ; do
-            ${GZIP} -n --best -f ${f} &
+            ${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" &
            compress_pids+=" $!"
        done
    fi

--- a/llm/generate/gen_darwin.sh
+++ b/llm/generate/gen_darwin.sh
@@ -39,7 +39,8 @@ case "${GOARCH}" in
        #
        init_vars
        CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-        BUILD_DIR="../build/darwin/${ARCH}/cpu"
+        RUNNER=cpu
+        BUILD_DIR="../build/darwin/${ARCH}/${RUNNER}"
        echo "Building LCD CPU"
        build
        sign ${BUILD_DIR}/bin/ollama_llama_server
@@ -51,7 +52,8 @@ case "${GOARCH}" in
        #
        init_vars
        CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-        BUILD_DIR="../build/darwin/${ARCH}/cpu_avx"
+        RUNNER=cpu_avx
+        BUILD_DIR="../build/darwin/${ARCH}/${RUNNER}"
        echo "Building AVX CPU"
        build
        sign ${BUILD_DIR}/bin/ollama_llama_server
@@ -63,7 +65,8 @@ case "${GOARCH}" in
        #
        init_vars
        CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=on -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
-        BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2"
+        RUNNER=cpu_avx2
+        BUILD_DIR="../build/darwin/${ARCH}/${RUNNER}"
        echo "Building AVX2 CPU"
        EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
        build
@@ -84,7 +87,8 @@ case "${GOARCH}" in
    if [ -z "$OLLAMA_SKIP_METAL_GENERATE" ]; then
        init_vars
        CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}"
-        BUILD_DIR="../build/darwin/${ARCH}/metal"
+        RUNNER="metal"
+        BUILD_DIR="../build/darwin/${ARCH}/${RUNNER}"
        EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
        build
        sign ${BUILD_DIR}/bin/ollama_llama_server

--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -79,10 +79,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
        init_vars
        echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
-        BUILD_DIR="../build/linux/${ARCH}/cpu"
+        RUNNER="cpu"
+        BUILD_DIR="../build/linux/${ARCH}/${RUNNER}"
        echo "Building custom CPU"
        build
        install
+        dist
        compress
    else
        # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
@@ -102,10 +104,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
            #
            init_vars
            CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-            BUILD_DIR="../build/linux/${ARCH}/cpu"
+            RUNNER=cpu
+            BUILD_DIR="../build/linux/${ARCH}/${RUNNER}"
            echo "Building LCD CPU"
            build
            install
+            dist
            compress
        fi

@@ -120,10 +124,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
                #
                init_vars
                CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-                BUILD_DIR="../build/linux/${ARCH}/cpu_avx"
+                RUNNER=cpu_avx
+                BUILD_DIR="../build/linux/${ARCH}/${RUNNER}"
                echo "Building AVX CPU"
                build
                install
+                dist
                compress
            fi

@@ -134,10 +140,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
                #
                init_vars
                CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
-                BUILD_DIR="../build/linux/${ARCH}/cpu_avx2"
+                RUNNER=cpu_avx2
+                BUILD_DIR="../build/linux/${ARCH}/${RUNNER}"
                echo "Building AVX2 CPU"
                build
                install
+                dist
                compress
            fi
        fi
@@ -187,11 +195,13 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
    fi
    export CUDAFLAGS="-t8"
    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off"
-    BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
+    RUNNER=cuda${CUDA_VARIANT}
+    BUILD_DIR="../build/linux/${ARCH}/${RUNNER}"
    export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
    CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}"
    build
    install
+    dist
    echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}"
    mkdir -p "${CUDA_DIST_DIR}"
    for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do
@@ -212,7 +222,8 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
    source ${ONEAPI_ROOT}/setvars.sh --force # set up environment variables for oneAPI
    CC=icx
    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF"
-    BUILD_DIR="../build/linux/${ARCH}/oneapi"
+    RUNNER=oneapi
+    BUILD_DIR="../build/linux/${ARCH}/${RUNNER}"
    ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama"
    export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
    DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it
@@ -231,6 +242,7 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
    cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}"
    cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}"
    install
+    dist
    compress
 fi

@@ -259,7 +271,8 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
        CMAKE_DEFS="${CMAKE_DEFS} ${OLLAMA_CUSTOM_ROCM_DEFS}"
        echo "Building custom ROCM GPU"
    fi
-    BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}"
+    RUNNER=rocm${ROCM_VARIANT}
+    BUILD_DIR="../build/linux/${ARCH}/${RUNNER}"
    # ROCm dependencies are too large to fit into a unified bundle
    ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama"
    # TODO figure out how to disable runpath (rpath)
@@ -269,13 +282,17 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then

    # copy the ROCM dependencies
    mkdir -p "${ROCM_DIST_DIR}"
-    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${ARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo ); do
+    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${ARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf ); do
        cp -a "${dep}"* "${ROCM_DIST_DIR}"
+        if [ "$(readlink -f "${dep}")" != "${dep}" ] ; then
+            cp "$(readlink -f "${dep}")" "${ROCM_DIST_DIR}"
+        fi
    done
    install
+    dist
    compress
 fi

 cleanup
 wait_for_compress
-echo "go generate completed.  LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
+echo "go generate completed.  LLM runners: $(cd "${PAYLOAD_BASE}" && echo *)"
--- a/llm/llm_darwin_arm64.go
+++ b/llm/llm_darwin_arm64.go
 package llm

 import (
-	"embed"
 	"syscall"
 )

-//go:embed build/darwin/arm64/*/bin/*
-var libEmbed embed.FS
-
 var LlamaServerSysProcAttr = &syscall.SysProcAttr{}