Unverified Commit b754f5a6 authored by Daniel Hiltgen, committed by GitHub

Remove submodule and shift to Go server - 0.4.0 (#7157)

* Remove llama.cpp submodule and shift new build to top

* CI: install msys and clang gcc on win

Needed for DeepSeek to work properly on Windows
parent a805e594
...@@ -3,9 +3,7 @@ ollama ...@@ -3,9 +3,7 @@ ollama
app app
macapp macapp
dist dist
llm/llama.cpp
.env .env
.cache .cache
test_data test_data
llm/build
llama/build llama/build
llm/ext_server/* linguist-vendored
llama/**/*.cpp linguist-vendored llama/**/*.cpp linguist-vendored
llama/**/*.hpp linguist-vendored llama/**/*.hpp linguist-vendored
llama/**/*.h linguist-vendored llama/**/*.h linguist-vendored
......
...@@ -48,8 +48,8 @@ jobs: ...@@ -48,8 +48,8 @@ jobs:
with: with:
name: dist-darwin name: dist-darwin
path: | path: |
dist/*arwin* dist/Ollama-darwin.zip
!dist/*-cov dist/ollama-darwin
# Windows builds take a long time to both install the dependencies and build, so parallelize # Windows builds take a long time to both install the dependencies and build, so parallelize
# CPU generation step # CPU generation step
...@@ -85,6 +85,24 @@ jobs: ...@@ -85,6 +85,24 @@ jobs:
write-host "Installing plugin" write-host "Installing plugin"
& "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
write-host "plugin installed" write-host "plugin installed"
- name: Install msys2
run: |
$msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe"
write-host "Downloading msys2"
Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe"
write-host "Installing msys2"
Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait
echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: Install msys2 tools
run: |
Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait
echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: verify tools
run: |
get-command gcc
gcc --version
get-command make
make --version
- uses: actions/setup-go@v5 - uses: actions/setup-go@v5
with: with:
go-version-file: go.mod go-version-file: go.mod
...@@ -92,19 +110,19 @@ jobs: ...@@ -92,19 +110,19 @@ jobs:
- run: go get ./... - run: go get ./...
- run: | - run: |
$gopath=(get-command go).source | split-path -parent $gopath=(get-command go).source | split-path -parent
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1" import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
cd $env:GITHUB_WORKSPACE Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0" $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
$env:PATH="$gopath;$env:PATH" $env:PATH="$gopath;$env:PATH"
go generate -x ./... $cores = (Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores
name: go generate make -j $cores
name: make
- uses: actions/upload-artifact@v4 - uses: actions/upload-artifact@v4
with: with:
name: generate-windows-cpu name: generate-windows-cpu
path: | path: |
build/**/* build/**/*
build/**/*.a build/**/*.a
llm/build/**/*.a
dist/windows-amd64/** dist/windows-amd64/**
# ROCm generation step # ROCm generation step
...@@ -140,6 +158,24 @@ jobs: ...@@ -140,6 +158,24 @@ jobs:
write-host "Installing plugin" write-host "Installing plugin"
& "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
write-host "plugin installed" write-host "plugin installed"
- name: Install msys2
run: |
$msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe"
write-host "Downloading msys2"
Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe"
write-host "Installing msys2"
Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait
echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: Install msys2 tools
run: |
Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait
echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: verify tools
run: |
get-command gcc
gcc --version
get-command make
make --version
- uses: actions/setup-go@v5 - uses: actions/setup-go@v5
with: with:
go-version-file: go.mod go-version-file: go.mod
...@@ -158,31 +194,21 @@ jobs: ...@@ -158,31 +194,21 @@ jobs:
- run: go get ./... - run: go get ./...
- run: | - run: |
$gopath=(get-command go).source | split-path -parent $gopath=(get-command go).source | split-path -parent
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1" import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
cd $env:GITHUB_WORKSPACE Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0" $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
$env:PATH="$gopath;$env:PATH" $env:PATH="$gopath;$env:PATH"
$env:OLLAMA_SKIP_CPU_GENERATE="1" $env:OLLAMA_SKIP_CPU_GENERATE="1"
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
go generate -x ./... $cores = (Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores
name: go generate make -j $cores
- name: 'gather rocm dependencies' name: make
run: |
$HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
md "dist\deps\bin\rocblas\library"
cp "${HIP_PATH}\bin\hipblas.dll" "dist\deps\bin\"
cp "${HIP_PATH}\bin\rocblas.dll" "dist\deps\bin\"
cp "${HIP_PATH}\bin\rocblas\library\*" "dist\deps\bin\rocblas\library\"
- uses: actions/upload-artifact@v4 - uses: actions/upload-artifact@v4
with: with:
name: generate-windows-rocm name: generate-windows-rocm
path: | path: |
build/**/* build/**/*
dist/windows-amd64/** dist/windows-amd64/**
- uses: actions/upload-artifact@v4
with:
name: windows-rocm-deps
path: dist/deps/*
# CUDA generation step # CUDA generation step
generate-windows-cuda: generate-windows-cuda:
...@@ -224,6 +250,24 @@ jobs: ...@@ -224,6 +250,24 @@ jobs:
write-host "Installing plugin" write-host "Installing plugin"
& "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
write-host "plugin installed" write-host "plugin installed"
- name: Install msys2
run: |
$msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe"
write-host "Downloading msys2"
Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe"
write-host "Installing msys2"
Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait
echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: Install msys2 tools
run: |
Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait
echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: verify tools
run: |
get-command gcc
gcc --version
get-command make
make --version
- uses: actions/setup-go@v5 - uses: actions/setup-go@v5
with: with:
go-version-file: go.mod go-version-file: go.mod
...@@ -245,34 +289,23 @@ jobs: ...@@ -245,34 +289,23 @@ jobs:
- name: 'Verify CUDA' - name: 'Verify CUDA'
run: nvcc -V run: nvcc -V
- run: go get ./... - run: go get ./...
- name: go generate - name: make
run: | run: |
$gopath=(get-command go).source | split-path -parent $gopath=(get-command go).source | split-path -parent
$cudabin=(get-command nvcc).source | split-path $cudabin=(get-command nvcc).source | split-path
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1" import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
cd $env:GITHUB_WORKSPACE Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0" $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
$env:PATH="$gopath;$cudabin;$env:PATH" $env:PATH="$gopath;$cudabin;$env:PATH"
$env:OLLAMA_SKIP_CPU_GENERATE="1" $env:OLLAMA_SKIP_CPU_GENERATE="1"
go generate -x ./... $cores = (Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores
- name: 'gather cuda dependencies' make -j $cores
run: |
$NVIDIA_DIR=(resolve-path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*\bin\')[0]
md "dist\deps"
cp "${NVIDIA_DIR}\cudart64_*.dll" "dist\deps\"
cp "${NVIDIA_DIR}\cublas64_*.dll" "dist\deps\"
cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\"
- uses: actions/upload-artifact@v4 - uses: actions/upload-artifact@v4
with: with:
name: generate-windows-cuda-${{ matrix.cuda.version }} name: generate-windows-cuda-${{ matrix.cuda.version }}
path: | path: |
build/**/* build/**/*
dist/windows-amd64/** dist/windows-amd64/**
- uses: actions/upload-artifact@v4
with:
name: windows-cuda-deps-${{ matrix.cuda.version }}
path: dist/deps/*
# windows arm64 generate, go build, and zip file (no installer) # windows arm64 generate, go build, and zip file (no installer)
# Output of this build is aggregated into the final x86 build # Output of this build is aggregated into the final x86 build
...@@ -292,6 +325,30 @@ jobs: ...@@ -292,6 +325,30 @@ jobs:
choco install -y --no-progress git gzip choco install -y --no-progress git gzip
echo "C:\Program Files\Git\cmd" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append echo "C:\Program Files\Git\cmd" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "C:\ProgramData\chocolatey\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append echo "C:\ProgramData\chocolatey\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
# pacman is buggy on win arm64, so we avoid using it and rely on the binary artifacts instead
# we download the sfx (7-Zip self-extracting bundle), which isn't a fully set-up environment, but the binaries we need for the build work
- name: Install msys2 x64
run: |
$url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-base-x86_64-20240727.sfx.exe"
write-host "Downloading MSYS2"
Invoke-WebRequest -Uri "$url" -outfile "${env:RUNNER_TEMP}\msys2.exe"
write-host "Installing msys2"
Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @(
'-y', '-oC:\'
) -NoNewWindow -Wait
echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
# since pacman isn't reliable, we just download the tar file and extract directly
- name: Downloading and extracting msys2 make tar file
run: |
$url="https://mirror.msys2.org/msys/x86_64/make-4.4.1-2-x86_64.pkg.tar.zst"
write-host "Downloading make"
Invoke-WebRequest -Uri "$url" -outfile c:\msys64\make.tar.zst
cd c:\msys64; tar -xf make.tar.zst
rm c:\msys64\make.tar.zst
- name: Verify Make works properly
run: |
echo $env:PATH
make --version
- name: Install Visual Studio 2022 - name: Install Visual Studio 2022
run: | run: |
$components = @( $components = @(
...@@ -385,10 +442,9 @@ jobs: ...@@ -385,10 +442,9 @@ jobs:
- run: | - run: |
$gopath=(get-command go).source | split-path -parent $gopath=(get-command go).source | split-path -parent
$gccpath=(get-command gcc).source | split-path -parent $gccpath=(get-command gcc).source | split-path -parent
& "C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\Tools\Launch-VsDevShell.ps1" import-module 'C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
cd $env:GITHUB_WORKSPACE Enter-VsDevShell -Arch arm64 -vsinstallpath 'C:\Program Files\Microsoft Visual Studio\2022\Community' -skipautomaticlocation
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0" $env:PATH="$gopath;$gccpath;$env:PATH"
$env:PATH="$gopath;$gccpath;$env:PATH;C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\CMake\bin"
echo $env:PATH echo $env:PATH
$env:ARCH="arm64" $env:ARCH="arm64"
.\scripts\build_windows.ps1 buildOllama buildApp gatherDependencies distZip .\scripts\build_windows.ps1 buildOllama buildApp gatherDependencies distZip
...@@ -441,6 +497,24 @@ jobs: ...@@ -441,6 +497,24 @@ jobs:
write-host "Installing plugin" write-host "Installing plugin"
& "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
write-host "plugin installed" write-host "plugin installed"
- name: Install msys2
run: |
$msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe"
write-host "Downloading msys2"
Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe"
write-host "Installing msys2"
Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait
echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: Install msys2 tools
run: |
Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait
echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: verify tools
run: |
get-command gcc
gcc --version
get-command make
make --version
- uses: actions/setup-go@v5 - uses: actions/setup-go@v5
with: with:
go-version-file: go.mod go-version-file: go.mod
...@@ -455,15 +529,6 @@ jobs: ...@@ -455,15 +529,6 @@ jobs:
- uses: actions/download-artifact@v4 - uses: actions/download-artifact@v4
with: with:
name: generate-windows-cuda-12 name: generate-windows-cuda-12
- uses: actions/download-artifact@v4
with:
name: windows-cuda-deps-11
- uses: actions/download-artifact@v4
with:
name: windows-cuda-deps-12
- uses: actions/download-artifact@v4
with:
name: windows-rocm-deps
- uses: actions/download-artifact@v4 - uses: actions/download-artifact@v4
with: with:
name: generate-windows-rocm name: generate-windows-rocm
...@@ -474,11 +539,12 @@ jobs: ...@@ -474,11 +539,12 @@ jobs:
- run: dir build - run: dir build
- run: | - run: |
$gopath=(get-command go).source | split-path -parent $gopath=(get-command go).source | split-path -parent
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1" import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
cd $env:GITHUB_WORKSPACE Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0" $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
$env:PATH="$gopath;$env:PATH" $env:PATH="$gopath;$env:PATH"
$env:OLLAMA_SKIP_GENERATE="1" $env:OLLAMA_SKIP_GENERATE="1"
$env:ARCH="amd64"
& .\scripts\build_windows.ps1 & .\scripts\build_windows.ps1
- uses: actions/upload-artifact@v4 - uses: actions/upload-artifact@v4
with: with:
......
...@@ -21,9 +21,6 @@ jobs: ...@@ -21,9 +21,6 @@ jobs:
changes: changes:
runs-on: ubuntu-latest runs-on: ubuntu-latest
outputs: outputs:
GENERATE: ${{ steps.changes.outputs.GENERATE }}
GENERATE_CUDA: ${{ steps.changes.outputs.GENERATE_CUDA }}
GENERATE_ROCM: ${{ steps.changes.outputs.GENERATE_ROCM }}
RUNNERS: ${{ steps.changes.outputs.RUNNERS }} RUNNERS: ${{ steps.changes.outputs.RUNNERS }}
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
...@@ -39,53 +36,12 @@ jobs: ...@@ -39,53 +36,12 @@ jobs:
} }
{ {
echo GENERATE=$(changed 'llm/llama.cpp' 'llm/patches/**' 'llm/ext_server/**' 'llm/generate/**')
echo GENERATE_CUDA=$(changed 'llm/llama.cpp' 'llm/patches/**' 'llm/ext_server/**' 'llm/generate/**')
echo GENERATE_ROCM=$(changed 'llm/llama.cpp' 'llm/patches/**' 'llm/ext_server/**' 'llm/generate/**')
echo RUNNERS=$(changed 'llama/**') echo RUNNERS=$(changed 'llama/**')
} >>$GITHUB_OUTPUT } >>$GITHUB_OUTPUT
generate: runners-linux-cuda:
needs: [changes] needs: [changes]
if: ${{ needs.changes.outputs.GENERATE == 'True' }} if: ${{ needs.changes.outputs.RUNNERS == 'True' }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-2019]
arch: [amd64, arm64]
exclude:
- os: ubuntu-latest
arch: arm64
- os: windows-2019
arch: arm64
runs-on: ${{ matrix.os }}
env:
GOARCH: ${{ matrix.arch }}
CGO_ENABLED: '1'
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v5
with:
go-version-file: go.mod
cache: true
- run: go get ./...
- run: |
$gopath=(get-command go).source | split-path -parent
$gccpath=(get-command gcc).source | split-path -parent
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
cd $env:GITHUB_WORKSPACE
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
$env:PATH="$gopath;$gccpath;$env:PATH"
echo $env:PATH
go generate -x ./...
if: ${{ startsWith(matrix.os, 'windows-') }}
name: 'Windows Go Generate'
- run: go generate -x ./...
if: ${{ ! startsWith(matrix.os, 'windows-') }}
name: 'Unix Go Generate'
- run: go build .
generate-cuda:
needs: [changes]
if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }}
strategy: strategy:
matrix: matrix:
cuda-version: cuda-version:
...@@ -95,8 +51,6 @@ jobs: ...@@ -95,8 +51,6 @@ jobs:
steps: steps:
- run: | - run: |
apt-get update && apt-get install -y git build-essential curl apt-get update && apt-get install -y git build-essential curl
curl -fsSL https://github.com/Kitware/CMake/releases/download/v3.28.1/cmake-3.28.1-linux-x86_64.tar.gz \
| tar -zx -C /usr --strip-components 1
env: env:
DEBIAN_FRONTEND: noninteractive DEBIAN_FRONTEND: noninteractive
- uses: actions/checkout@v4 - uses: actions/checkout@v4
...@@ -107,12 +61,11 @@ jobs: ...@@ -107,12 +61,11 @@ jobs:
- run: go get ./... - run: go get ./...
- run: | - run: |
git config --global --add safe.directory /__w/ollama/ollama git config --global --add safe.directory /__w/ollama/ollama
go generate -x ./... cores=$(grep '^core id' /proc/cpuinfo |sort -u|wc -l)
env: make -j $cores cuda_v11
OLLAMA_SKIP_CPU_GENERATE: '1' runners-linux-rocm:
generate-rocm:
needs: [changes] needs: [changes]
if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }} if: ${{ needs.changes.outputs.RUNNERS == 'True' }}
strategy: strategy:
matrix: matrix:
rocm-version: rocm-version:
...@@ -122,8 +75,6 @@ jobs: ...@@ -122,8 +75,6 @@ jobs:
steps: steps:
- run: | - run: |
apt-get update && apt-get install -y git build-essential curl rocm-libs apt-get update && apt-get install -y git build-essential curl rocm-libs
curl -fsSL https://github.com/Kitware/CMake/releases/download/v3.28.1/cmake-3.28.1-linux-x86_64.tar.gz \
| tar -zx -C /usr --strip-components 1
env: env:
DEBIAN_FRONTEND: noninteractive DEBIAN_FRONTEND: noninteractive
- uses: actions/checkout@v4 - uses: actions/checkout@v4
...@@ -134,14 +85,13 @@ jobs: ...@@ -134,14 +85,13 @@ jobs:
- run: go get ./... - run: go get ./...
- run: | - run: |
git config --global --add safe.directory /__w/ollama/ollama git config --global --add safe.directory /__w/ollama/ollama
go generate -x ./... cores=$(grep '^core id' /proc/cpuinfo |sort -u|wc -l)
env: make -j $cores rocm
OLLAMA_SKIP_CPU_GENERATE: '1'
# ROCm generation step # ROCm generation step
generate-windows-rocm: runners-windows-rocm:
needs: [changes] needs: [changes]
if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }} if: ${{ needs.changes.outputs.RUNNERS == 'True' }}
runs-on: windows runs-on: windows
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
...@@ -160,24 +110,42 @@ jobs: ...@@ -160,24 +110,42 @@ jobs:
- name: 'Verify ROCm' - name: 'Verify ROCm'
run: | run: |
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
- name: Install msys2
run: |
$msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe"
write-host "Downloading msys2"
Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe"
write-host "Installing msys2"
Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait
echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: Install msys2 tools
run: |
Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait
echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: verify tools
run: |
get-command gcc
gcc --version
get-command make
make --version
- run: go get ./... - run: go get ./...
- run: | - run: |
$gopath=(get-command go).source | split-path -parent $gopath=(get-command go).source | split-path -parent
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1" import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
cd $env:GITHUB_WORKSPACE Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
$env:PATH="$gopath;$env:PATH" $env:PATH="$gopath;$env:PATH"
$env:OLLAMA_SKIP_CPU_GENERATE="1" $env:OLLAMA_SKIP_CPU_GENERATE="1"
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
go generate -x ./... $cores = (Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores
name: go generate write-host $env:HIP_PATH
env: make -C llama print-HIP_PATH print-HIP_LIB_DIR
OLLAMA_SKIP_CPU_GENERATE: '1' make -j $cores rocm
name: make
# CUDA generation step # CUDA generation step
generate-windows-cuda: runners-windows-cuda:
needs: [changes] needs: [changes]
if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }} if: ${{ needs.changes.outputs.RUNNERS == 'True' }}
runs-on: windows runs-on: windows
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
...@@ -201,21 +169,40 @@ jobs: ...@@ -201,21 +169,40 @@ jobs:
echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" >> $env:GITHUB_ENV echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" >> $env:GITHUB_ENV
- name: 'Verify CUDA' - name: 'Verify CUDA'
run: nvcc -V run: nvcc -V
- name: Install msys2
run: |
$msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe"
write-host "Downloading msys2"
Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe"
write-host "Installing msys2"
Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait
echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: Install msys2 tools
run: |
Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait
echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: verify tools
run: |
get-command gcc
gcc --version
get-command make
make --version
- run: go get ./... - run: go get ./...
- name: go generate - name: make
run: | run: |
$gopath=(get-command go).source | split-path -parent $gopath=(get-command go).source | split-path -parent
$cudabin=(get-command nvcc).source | split-path $cudabin=(get-command nvcc).source | split-path
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1" import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
cd $env:GITHUB_WORKSPACE Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0" $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
$env:PATH="$gopath;$cudabin;$env:PATH" $env:PATH="$gopath;$cudabin;$env:PATH"
$env:OLLAMA_SKIP_CPU_GENERATE="1" $env:OLLAMA_SKIP_CPU_GENERATE="1"
go generate -x ./... $cores = (Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores
make -j $cores cuda_v11
env: env:
OLLAMA_SKIP_CPU_GENERATE: '1' OLLAMA_SKIP_CPU_GENERATE: '1'
runners: runners-cpu:
needs: [changes] needs: [changes]
if: ${{ needs.changes.outputs.RUNNERS == 'True' }} if: ${{ needs.changes.outputs.RUNNERS == 'True' }}
strategy: strategy:
...@@ -239,20 +226,41 @@ jobs: ...@@ -239,20 +226,41 @@ jobs:
go-version-file: go.mod go-version-file: go.mod
cache: true cache: true
- run: go get ./... - run: go get ./...
- name: Install msys2
if: ${{ startsWith(matrix.os, 'windows-') }}
run: |
$msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe"
write-host "Downloading msys2"
Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe"
write-host "Installing msys2"
Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait
echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: Install msys2 tools
if: ${{ startsWith(matrix.os, 'windows-') }}
run: |
Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait
echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: verify tools
if: ${{ startsWith(matrix.os, 'windows-') }}
run: |
get-command gcc
gcc --version
get-command make
make --version
- name: 'Build Windows Go Runners' - name: 'Build Windows Go Runners'
if: ${{ startsWith(matrix.os, 'windows-') }} if: ${{ startsWith(matrix.os, 'windows-') }}
run: | run: |
$gopath=(get-command go).source | split-path -parent $gopath=(get-command go).source | split-path -parent
$gccpath=(get-command gcc).source | split-path -parent $gccpath=(get-command gcc).source | split-path -parent
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1" import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
cd $env:GITHUB_WORKSPACE Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0" $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
$env:PATH="$gopath;$gccpath;$env:PATH" $env:PATH="$gopath;$gccpath;$env:PATH"
echo $env:PATH echo $env:PATH
make -C llama -j 4 make -j 4
- name: 'Build Unix Go Runners' - name: 'Build Unix Go Runners'
if: ${{ ! startsWith(matrix.os, 'windows-') }} if: ${{ ! startsWith(matrix.os, 'windows-') }}
run: make -C llama -j 4 run: make -j 4
- run: go build . - run: go build .
lint: lint:
...@@ -302,9 +310,6 @@ jobs: ...@@ -302,9 +310,6 @@ jobs:
env: env:
GOARCH: ${{ matrix.arch }} GOARCH: ${{ matrix.arch }}
CGO_ENABLED: '1' CGO_ENABLED: '1'
OLLAMA_CPU_TARGET: 'static'
OLLAMA_SKIP_CPU_GENERATE: '1'
OLLAMA_SKIP_METAL_GENERATE: '1'
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
with: with:
...@@ -319,7 +324,6 @@ jobs: ...@@ -319,7 +324,6 @@ jobs:
arm64) echo ARCH=arm64 ;; arm64) echo ARCH=arm64 ;;
esac >>$GITHUB_ENV esac >>$GITHUB_ENV
shell: bash shell: bash
- run: go generate ./...
- run: go build - run: go build
- run: go test -v ./... - run: go test -v ./...
...@@ -333,4 +337,4 @@ jobs: ...@@ -333,4 +337,4 @@ jobs:
submodules: recursive submodules: recursive
- name: Verify patches carry all the changes - name: Verify patches carry all the changes
run: | run: |
cd llama && make apply-patches sync && git diff --compact-summary --exit-code . make apply-patches sync && git diff --compact-summary --exit-code llama
\ No newline at end of file \ No newline at end of file
[submodule "llama.cpp"]
path = llm/llama.cpp
url = https://github.com/ggerganov/llama.cpp.git
shallow = true
\ No newline at end of file
...@@ -6,168 +6,134 @@ ARG CUDA_VERSION_12=12.4.0 ...@@ -6,168 +6,134 @@ ARG CUDA_VERSION_12=12.4.0
ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
ARG ROCM_VERSION=6.1.2 ARG ROCM_VERSION=6.1.2
# Copy the minimal context we need to run the generate scripts ### To create a local image for building linux binaries on mac or windows with efficient incremental builds
FROM scratch AS llm-code #
COPY .git .git # docker build --platform linux/amd64 -t builder-amd64 -f Dockerfile --target unified-builder-amd64 .
COPY .gitmodules .gitmodules # docker run --platform linux/amd64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-amd64
COPY llm llm #
### Then incremental builds will be much faster in this container
FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_11-devel-centos7 AS cuda-11-build-amd64 #
ARG CMAKE_VERSION # make -C llama -j 10 && go build -trimpath -o dist/linux-amd64/ollama .
COPY ./scripts/rh_linux_deps.sh / #
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS unified-builder-amd64
ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
ARG CGO_CFLAGS
ARG CUDA_V11_ARCHITECTURES
ENV GOARCH=amd64
RUN --mount=type=cache,target=/root/.ccache \
OLLAMA_SKIP_STATIC_GENERATE=1 \
OLLAMA_SKIP_CPU_GENERATE=1 \
CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \
CUDA_VARIANT="_v11" \
bash gen_linux.sh
FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_12-devel-centos7 AS cuda-12-build-amd64
ARG CMAKE_VERSION ARG CMAKE_VERSION
ARG GOLANG_VERSION
ARG CUDA_VERSION_11
ARG CUDA_VERSION_12
COPY ./scripts/rh_linux_deps.sh / COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh ENV PATH /opt/rh/devtoolset-10/root/usr/bin:/usr/local/cuda/bin:$PATH
ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
COPY --from=llm-code / /go/src/github.com/ollama/ollama/ ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/opt/amdgpu/lib64
WORKDIR /go/src/github.com/ollama/ollama/llm/generate RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
ARG CGO_CFLAGS RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \
ARG CUDA_V12_ARCHITECTURES dnf clean all && \
ENV GOARCH=amd64 dnf install -y \
RUN --mount=type=cache,target=/root/.ccache \ zsh \
OLLAMA_SKIP_STATIC_GENERATE=1 \ cuda-$(echo ${CUDA_VERSION_11} | cut -f1-2 -d. | sed -e "s/\./-/g") \
OLLAMA_SKIP_CPU_GENERATE=1 \ cuda-$(echo ${CUDA_VERSION_12} | cut -f1-2 -d. | sed -e "s/\./-/g")
CMAKE_CUDA_ARCHITECTURES="${CUDA_V12_ARCHITECTURES}" \ # TODO intel oneapi goes here...
CUDA_VARIANT="_v12" \ ENV GOARCH amd64
OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \ ENV CGO_ENABLED 1
bash gen_linux.sh WORKDIR /go/src/github.com/ollama/ollama/
ENTRYPOINT [ "zsh" ]
FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-runner-arm64
### To create a local image for building linux binaries on mac or linux/arm64 with efficient incremental builds
# Note: this does not contain jetson variants
#
# docker build --platform linux/arm64 -t builder-arm64 -f Dockerfile --target unified-builder-arm64 .
# docker run --platform linux/arm64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-arm64
#
FROM --platform=linux/arm64 rockylinux:8 AS unified-builder-arm64
ARG CMAKE_VERSION ARG CMAKE_VERSION
ARG GOLANG_VERSION
ARG CUDA_VERSION_11
ARG CUDA_VERSION_12
COPY ./scripts/rh_linux_deps.sh / COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo && \
COPY --from=llm-code / /go/src/github.com/ollama/ollama/ dnf config-manager --set-enabled appstream && \
WORKDIR /go/src/github.com/ollama/ollama/llm/generate dnf clean all && \
ARG CGO_CFLAGS dnf install -y \
zsh \
cuda-toolkit-$(echo ${CUDA_VERSION_11} | cut -f1-2 -d. | sed -e "s/\./-/g") \
cuda-toolkit-$(echo ${CUDA_VERSION_12} | cut -f1-2 -d. | sed -e "s/\./-/g")
ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH:/usr/local/cuda/bin
ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/opt/amdgpu/lib64
ENV GOARCH amd64
ENV CGO_ENABLED 1
WORKDIR /go/src/github.com/ollama/ollama/
ENTRYPOINT [ "zsh" ]
FROM --platform=linux/amd64 unified-builder-amd64 AS runners-amd64
COPY . .
ARG OLLAMA_SKIP_CUDA_GENERATE
ARG OLLAMA_SKIP_CUDA_11_GENERATE
ARG OLLAMA_SKIP_CUDA_12_GENERATE
ARG OLLAMA_SKIP_ROCM_GENERATE
ARG CUDA_V11_ARCHITECTURES ARG CUDA_V11_ARCHITECTURES
ENV GOARCH=arm64
RUN OLLAMA_SKIP_STATIC_GENERATE=1 \
OLLAMA_SKIP_CPU_GENERATE=1 \
CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \
CUDA_VARIANT="_v11" \
bash gen_linux.sh
FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_12-devel-rockylinux8 AS cuda-12-build-runner-arm64
ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
ARG CGO_CFLAGS
ARG CUDA_V12_ARCHITECTURES ARG CUDA_V12_ARCHITECTURES
ENV GOARCH=arm64 ARG OLLAMA_FAST_BUILD
RUN --mount=type=cache,target=/root/.ccache \ RUN --mount=type=cache,target=/root/.ccache \
OLLAMA_SKIP_STATIC_GENERATE=1 \ if grep "^flags" /proc/cpuinfo|grep avx>/dev/null; then \
OLLAMA_SKIP_CPU_GENERATE=1 \ make -C llama -j $(expr $(nproc) / 2 ) ; \
CMAKE_CUDA_ARCHITECTURES="${CUDA_V12_ARCHITECTURES}" \ else \
CUDA_VARIANT="_v12" \ make -C llama -j 5 ; \
OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \ fi
bash gen_linux.sh
FROM --platform=linux/arm64 unified-builder-arm64 AS runners-arm64
FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64 COPY . .
ARG CMAKE_VERSION ARG OLLAMA_SKIP_CUDA_GENERATE
COPY ./scripts/rh_linux_deps.sh / ARG OLLAMA_SKIP_CUDA_11_GENERATE
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh ARG OLLAMA_SKIP_CUDA_12_GENERATE
ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH ARG CUDA_V11_ARCHITECTURES
ENV LIBRARY_PATH=/opt/amdgpu/lib64 ARG CUDA_V12_ARCHITECTURES
COPY --from=llm-code / /go/src/github.com/ollama/ollama/ ARG OLLAMA_FAST_BUILD
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
ARG CGO_CFLAGS
ARG AMDGPU_TARGETS
ENV GOARCH=amd64
RUN --mount=type=cache,target=/root/.ccache \ RUN --mount=type=cache,target=/root/.ccache \
OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh make -C llama -j 8
RUN mkdir -p ../../dist/linux-amd64-rocm/lib/ollama && \
(cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64-rocm/lib/ollama && tar xf - )
FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64
ARG CMAKE_VERSION
ARG GOLANG_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
ARG OLLAMA_CUSTOM_CPU_DEFS
ARG CGO_CFLAGS
ENV GOARCH=amd64
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64 # Intermediate stages used for ./scripts/build_linux.sh
RUN --mount=type=cache,target=/root/.ccache \ FROM --platform=linux/amd64 centos:7 AS builder-amd64
OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh
FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx-build-amd64
RUN --mount=type=cache,target=/root/.ccache \
OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx" bash gen_linux.sh
FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64
RUN --mount=type=cache,target=/root/.ccache \
OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx2" bash gen_linux.sh
FROM --platform=linux/arm64 rockylinux:8 AS cpu-builder-arm64
ARG CMAKE_VERSION ARG CMAKE_VERSION
ARG GOLANG_VERSION ARG GOLANG_VERSION
COPY ./scripts/rh_linux_deps.sh / COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/ollama/ollama/ ENV CGO_ENABLED 1
ARG OLLAMA_CUSTOM_CPU_DEFS ENV GOARCH amd64
ARG CGO_CFLAGS
ENV GOARCH=arm64
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
FROM --platform=linux/arm64 cpu-builder-arm64 AS cpu-build-arm64
RUN --mount=type=cache,target=/root/.ccache \
OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh
# Intermediate stages used for ./scripts/build_linux.sh
FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64
ENV CGO_ENABLED=1
WORKDIR /go/src/github.com/ollama/ollama WORKDIR /go/src/github.com/ollama/ollama
FROM --platform=linux/amd64 builder-amd64 AS build-amd64
COPY . . COPY . .
COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/build/ build/ COPY --from=runners-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/build/ build/ COPY --from=runners-amd64 /go/src/github.com/ollama/ollama/build/ build/
COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
ARG GOFLAGS ARG GOFLAGS
ARG CGO_CFLAGS ARG CGO_CFLAGS
ARG OLLAMA_SKIP_ROCM_GENERATE
RUN --mount=type=cache,target=/root/.ccache \ RUN --mount=type=cache,target=/root/.ccache \
go build -trimpath -o dist/linux-amd64/bin/ollama . go build -trimpath -o dist/linux-amd64/bin/ollama .
RUN cd dist/linux-$GOARCH && \ RUN cd dist/linux-$GOARCH && \
tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
RUN cd dist/linux-$GOARCH-rocm && \ RUN if [ -z ${OLLAMA_SKIP_ROCM_GENERATE} ] ; then \
tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-rocm.tgz cd dist/linux-$GOARCH-rocm && \
tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-rocm.tgz ;\
fi
FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64 FROM --platform=linux/arm64 rockylinux:8 AS builder-arm64
ENV CGO_ENABLED=1 ARG CMAKE_VERSION
ARG GOLANG_VERSION ARG GOLANG_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
ENV CGO_ENABLED 1
ENV GOARCH arm64
WORKDIR /go/src/github.com/ollama/ollama WORKDIR /go/src/github.com/ollama/ollama
FROM --platform=linux/arm64 builder-arm64 AS build-arm64
COPY . . COPY . .
COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/build/ build/ COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/build/ build/
COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/build/ build/
ARG GOFLAGS ARG GOFLAGS
ARG CGO_CFLAGS ARG CGO_CFLAGS
RUN --mount=type=cache,target=/root/.ccache \ RUN --mount=type=cache,target=/root/.ccache \
...@@ -179,11 +145,11 @@ FROM --platform=linux/amd64 scratch AS dist-amd64 ...@@ -179,11 +145,11 @@ FROM --platform=linux/amd64 scratch AS dist-amd64
COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz / COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
FROM --platform=linux/arm64 scratch AS dist-arm64 FROM --platform=linux/arm64 scratch AS dist-arm64
COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz / COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
FROM dist-$TARGETARCH as dist FROM dist-$TARGETARCH AS dist
# Optimized container images do not carry nested payloads # Optimized container images do not carry nested payloads
FROM --platform=linux/amd64 cpu-builder-amd64 AS container-build-amd64 FROM --platform=linux/amd64 builder-amd64 AS container-build-amd64
WORKDIR /go/src/github.com/ollama/ollama WORKDIR /go/src/github.com/ollama/ollama
COPY . . COPY . .
ARG GOFLAGS ARG GOFLAGS
...@@ -191,7 +157,7 @@ ARG CGO_CFLAGS ...@@ -191,7 +157,7 @@ ARG CGO_CFLAGS
RUN --mount=type=cache,target=/root/.ccache \ RUN --mount=type=cache,target=/root/.ccache \
go build -trimpath -o dist/linux-amd64/bin/ollama . go build -trimpath -o dist/linux-amd64/bin/ollama .
FROM --platform=linux/arm64 cpu-builder-arm64 AS container-build-arm64 FROM --platform=linux/arm64 builder-arm64 AS container-build-arm64
WORKDIR /go/src/github.com/ollama/ollama WORKDIR /go/src/github.com/ollama/ollama
COPY . . COPY . .
ARG GOFLAGS ARG GOFLAGS
...@@ -199,48 +165,52 @@ ARG CGO_CFLAGS ...@@ -199,48 +165,52 @@ ARG CGO_CFLAGS
RUN --mount=type=cache,target=/root/.ccache \ RUN --mount=type=cache,target=/root/.ccache \
go build -trimpath -o dist/linux-arm64/bin/ollama . go build -trimpath -o dist/linux-arm64/bin/ollama .
# For amd64 container images, filter out cuda/rocm to minimize size
FROM runners-amd64 AS runners-cuda-amd64
RUN rm -rf \
./dist/linux-amd64/lib/ollama/libggml_hipblas.so \
./dist/linux-amd64/lib/ollama/runners/rocm*
FROM runners-amd64 AS runners-rocm-amd64
RUN rm -rf \
./dist/linux-amd64/lib/ollama/libggml_cuda*.so \
./dist/linux-amd64/lib/ollama/libcu*.so* \
./dist/linux-amd64/lib/ollama/runners/cuda*
FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64 FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y ca-certificates && \ apt-get install -y ca-certificates && \
apt-get clean && rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/*
COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/ COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
COPY --from=cpu-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/ COPY --from=runners-cuda-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64 FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y ca-certificates && \ apt-get install -y ca-certificates && \
apt-get clean && rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/*
COPY --from=container-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/ COPY --from=container-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
COPY --from=cpu-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/ COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
# ROCm libraries are larger so we keep them distinct from the CPU/CUDA image # ROCm libraries are larger so we keep them distinct from the CPU/CUDA image
FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm
# Frontload the rocm libraries which are large, and rarely change to increase chance of a common layer # Frontload the rocm libraries which are large, and rarely change to increase chance of a common layer
# across releases # across releases
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/lib/ /lib/ COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/lib/ /lib/
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y ca-certificates && \ apt-get install -y ca-certificates && \
apt-get clean && rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/*
COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/ COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
COPY --from=cpu-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/ COPY --from=runners-rocm-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
EXPOSE 11434 EXPOSE 11434
ENV OLLAMA_HOST=0.0.0.0 ENV OLLAMA_HOST 0.0.0.0
ENTRYPOINT ["/bin/ollama"] ENTRYPOINT ["/bin/ollama"]
CMD ["serve"] CMD ["serve"]
FROM runtime-$TARGETARCH FROM runtime-$TARGETARCH
EXPOSE 11434 EXPOSE 11434
ENV OLLAMA_HOST=0.0.0.0 ENV OLLAMA_HOST 0.0.0.0
ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64 ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
......
GOALS := $(or $(MAKECMDGOALS),all)
.PHONY: $(GOALS)
$(GOALS):
$(MAKE) -C llama $@
\ No newline at end of file
# Development # Development
> [!IMPORTANT]
> The `llm` package that loads and runs models is being updated to use a new [Go runner](#transition-to-go-runner): this should only impact a small set of PRs; however, it does change how the project is built.
Install required tools:
- cmake version 3.24 or higher
- go version 1.22 or higher
- gcc version 11.4.0 or higher
### MacOS
```bash
brew install go cmake gcc
```
Optionally enable debugging and more verbose logging:
```bash
# At build time
export CGO_CFLAGS="-g"
# At runtime
export OLLAMA_DEBUG=1
```
Get the required libraries and build the native LLM code:
```bash
go generate ./...
```
Then build ollama:
```bash
go build .
```
Now you can run `ollama`:
```bash
./ollama
```
### Linux
#### Linux CUDA (NVIDIA)
_Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_
Install `cmake` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads)
development and runtime packages.
Typically the build scripts will auto-detect CUDA; however, if your Linux distro
or installation approach uses unusual paths, you can point the build at the right
locations by setting the environment variable `CUDA_LIB_DIR` to the directory
containing the shared libraries and `CUDACXX` to the location of the nvcc compiler.
You can also customize the set of target CUDA architectures by setting
`CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70"), as in the sketch below.
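For example, a hedged sketch assuming CUDA was installed under a non-standard prefix such as `/opt/cuda` (adjust the paths for your system):

```
export CUDA_LIB_DIR=/opt/cuda/lib64        # directory containing the CUDA shared libraries
export CUDACXX=/opt/cuda/bin/nvcc          # path to the nvcc compiler
export CMAKE_CUDA_ARCHITECTURES="50;60;70" # optional: restrict the target architectures
```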
Then generate dependencies:
```
go generate ./...
```
Then build the binary:
```
go build .
```
#### Linux ROCm (AMD)
_Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_
Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/) development packages first, as well as `cmake` and `golang`.
Typically the build scripts will auto-detect ROCm; however, if your Linux distro
or installation approach uses unusual paths, you can point the build at the right
locations by setting the environment variable `ROCM_PATH` to the ROCm
install (typically `/opt/rocm`) and `CLBlast_DIR` to the location of the
CLBlast install (typically `/usr/lib/cmake/CLBlast`). You can also customize
the AMD GPU targets by setting `AMDGPU_TARGETS` (e.g. `AMDGPU_TARGETS="gfx1101;gfx1102"`), as in the sketch below.
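For example, a sketch assuming ROCm lives in the default `/opt/rocm` location and CLBlast's CMake config is under `/usr/lib/cmake/CLBlast` (adjust as needed):

```
export ROCM_PATH=/opt/rocm                 # ROCm install root
export CLBlast_DIR=/usr/lib/cmake/CLBlast  # directory containing the CLBlast CMake config
export AMDGPU_TARGETS="gfx1101;gfx1102"    # optional: restrict the GPU targets
```

Then generate dependencies: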
```
go generate ./...
```
Then build the binary:
```
go build .
```
ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.
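For example, on most distros that use the `render` group (a sketch; the group name can vary, and you will need to log out and back in for it to take effect):

```
sudo usermod -aG render $USER
```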
#### Advanced CPU Settings
By default, running `go generate ./...` will compile a few different variations
of the LLM library based on common CPU families and vector math capabilities,
including a lowest-common-denominator build which should run on almost any 64-bit
CPU, albeit somewhat slowly. At runtime, Ollama will auto-detect the optimal
variation to load. If you would like a CPU-based build customized for your
processor, you can set `OLLAMA_CUSTOM_CPU_DEFS` to the llama.cpp flags you would
like to use. For example, to compile an optimized binary for an Intel i9-9880H,
you might use:
```
OLLAMA_CUSTOM_CPU_DEFS="-DGGML_AVX=on -DGGML_AVX2=on -DGGML_F16C=on -DGGML_FMA=on" go generate ./...
go build .
```
#### Containerized Linux Build
If you have Docker available, you can build Linux binaries with `./scripts/build_linux.sh`, which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`.
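For faster incremental rebuilds you can also use the unified builder image described in the Dockerfile comments (a sketch assuming an amd64 host):

```
docker build --platform linux/amd64 -t builder-amd64 -f Dockerfile --target unified-builder-amd64 .
docker run --platform linux/amd64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-amd64
```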
### Windows
Note: The Windows build for Ollama is still under development.
First, install required tools:
- MSVC toolchain - C/C++ and cmake as minimal requirements
- Go version 1.22 or higher
- MinGW (pick one variant) with GCC.
- [MinGW-w64](https://www.mingw-w64.org/)
- [MSYS2](https://www.msys2.org/)
- The `ThreadJob` PowerShell module: `Install-Module -Name ThreadJob -Scope CurrentUser`
Then, build the `ollama` binary:
```powershell
$env:CGO_ENABLED="1"
go generate ./...
go build .
```
#### Windows CUDA (NVIDIA)
In addition to the common Windows development tools described above, install CUDA after installing MSVC.
- [NVIDIA CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html)
#### Windows ROCm (AMD Radeon)
In addition to the common Windows development tools described above, install AMD's HIP package after installing MSVC.
- [AMD HIP](https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html)
- [Strawberry Perl](https://strawberryperl.com/)
Lastly, add the `ninja.exe` included with MSVC to the system path (e.g. `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\Ninja`).
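For example, to add it to the current PowerShell session (a sketch; adjust for your Visual Studio edition and install path):

```powershell
$env:PATH += ";C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\Ninja"
```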
#### Windows arm64
The default `Developer PowerShell for VS 2022` may default to x86, which is not what you want. To ensure you get an arm64 development environment, start a plain PowerShell terminal and run:
```powershell
import-module 'C:\\Program Files\\Microsoft Visual Studio\\2022\\Community\\Common7\\Tools\\Microsoft.VisualStudio.DevShell.dll'
Enter-VsDevShell -Arch arm64 -vsinstallpath 'C:\\Program Files\\Microsoft Visual Studio\\2022\\Community' -skipautomaticlocation
```
You can confirm with `write-host $env:VSCMD_ARG_TGT_ARCH`
Follow the instructions at https://www.msys2.org/wiki/arm64/ to set up an arm64 msys2 environment. Ollama requires gcc and mingw32-make to compile, which are not currently available on Windows arm64, but a gcc compatibility adapter is available via `mingw-w64-clang-aarch64-gcc-compat`. At a minimum you will need to install the following:
```
pacman -S mingw-w64-clang-aarch64-clang mingw-w64-clang-aarch64-gcc-compat mingw-w64-clang-aarch64-make make
```
You will need to ensure your PATH includes go, cmake, gcc, and clang's mingw32-make (typically `C:\msys64\clangarm64\bin\`) to build ollama from source, as in the sketch below.
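For example, to prepend the msys2 clangarm64 tools for the current session (a sketch; adjust if you installed msys2 elsewhere):

```powershell
$env:PATH = "C:\msys64\clangarm64\bin;$env:PATH"
```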
## Transition to Go runner
The Ollama team is working on moving to a new Go-based runner that loads and runs models in a subprocess, replacing the previous code under `ext_server`. During this transition period, the new Go runner is "opt in" at build time and requires a different build approach.
After the transition to use the Go server exclusively, both `make` and `go generate` will build the Go runner.
Install required tools: Install required tools:
- go version 1.22 or higher - go version 1.22 or higher
...@@ -201,7 +23,7 @@ export OLLAMA_DEBUG=1 ...@@ -201,7 +23,7 @@ export OLLAMA_DEBUG=1
Get the required libraries and build the native LLM code: (Adjust the job count based on your number of processors for a faster build) Get the required libraries and build the native LLM code: (Adjust the job count based on your number of processors for a faster build)
```bash ```bash
make -C llama -j 5 make -j 5
``` ```
Then build ollama: Then build ollama:
...@@ -238,7 +60,7 @@ a set of target CUDA architectures by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. " ...@@ -238,7 +60,7 @@ a set of target CUDA architectures by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "
Then generate dependencies: (Adjust the job count based on your number of processors for a faster build) Then generate dependencies: (Adjust the job count based on your number of processors for a faster build)
``` ```
make -C llama -j 5 make -j 5
``` ```
Then build the binary: Then build the binary:
...@@ -263,7 +85,7 @@ the AMD GPU targets by setting AMDGPU_TARGETS (e.g. `AMDGPU_TARGETS="gfx1101;gfx ...@@ -263,7 +85,7 @@ the AMD GPU targets by setting AMDGPU_TARGETS (e.g. `AMDGPU_TARGETS="gfx1101;gfx
Then generate dependencies: (Adjust the job count based on your number of processors for a faster build) Then generate dependencies: (Adjust the job count based on your number of processors for a faster build)
``` ```
make -C llama -j 5 make -j 5
``` ```
Then build the binary: Then build the binary:
...@@ -308,7 +130,7 @@ Then, build the `ollama` binary: ...@@ -308,7 +130,7 @@ Then, build the `ollama` binary:
```powershell ```powershell
$env:CGO_ENABLED="1" $env:CGO_ENABLED="1"
make -C llama -j 8 make -j 8
go build . go build .
``` ```
......
# Note: once we have fully transitioned to the Go server, this will replace the old Dockerfile at the top of the tree
ARG GOLANG_VERSION=1.22.8
ARG CMAKE_VERSION=3.22.1
ARG CUDA_VERSION_11=11.3.1
ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
ARG CUDA_VERSION_12=12.4.0
ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
ARG ROCM_VERSION=6.1.2
### To create a local image for building linux binaries on mac or windows with efficient incremental builds
#
# docker build --platform linux/amd64 -t builder-amd64 -f llama/Dockerfile --target unified-builder-amd64 .
# docker run --platform linux/amd64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-amd64
#
### Then incremental builds will be much faster in this container
#
# make -C llama -j 10 && go build -trimpath -o dist/linux-amd64/ollama .
#
FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS unified-builder-amd64
ARG CMAKE_VERSION
ARG GOLANG_VERSION
ARG CUDA_VERSION_11
ARG CUDA_VERSION_12
COPY ./scripts/rh_linux_deps.sh /
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:/usr/local/cuda/bin:$PATH
ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/opt/amdgpu/lib64
RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \
dnf clean all && \
dnf install -y \
zsh \
cuda-$(echo ${CUDA_VERSION_11} | cut -f1-2 -d. | sed -e "s/\./-/g") \
cuda-$(echo ${CUDA_VERSION_12} | cut -f1-2 -d. | sed -e "s/\./-/g")
# TODO intel oneapi goes here...
ENV GOARCH amd64
ENV CGO_ENABLED 1
WORKDIR /go/src/github.com/ollama/ollama/
ENTRYPOINT [ "zsh" ]
### To create a local image for building linux binaries on mac or linux/arm64 with efficient incremental builds
# Note: this does not contain jetson variants
#
# docker build --platform linux/arm64 -t builder-arm64 -f llama/Dockerfile --target unified-builder-arm64 .
# docker run --platform linux/arm64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-arm64
#
FROM --platform=linux/arm64 rockylinux:8 AS unified-builder-arm64
ARG CMAKE_VERSION
ARG GOLANG_VERSION
ARG CUDA_VERSION_11
ARG CUDA_VERSION_12
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo && \
dnf config-manager --set-enabled appstream && \
dnf clean all && \
dnf install -y \
zsh \
cuda-toolkit-$(echo ${CUDA_VERSION_11} | cut -f1-2 -d. | sed -e "s/\./-/g") \
cuda-toolkit-$(echo ${CUDA_VERSION_12} | cut -f1-2 -d. | sed -e "s/\./-/g")
ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH:/usr/local/cuda/bin
ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/opt/amdgpu/lib64
ENV GOARCH arm64
ENV CGO_ENABLED 1
WORKDIR /go/src/github.com/ollama/ollama/
ENTRYPOINT [ "zsh" ]
FROM --platform=linux/amd64 unified-builder-amd64 AS runners-amd64
COPY . .
ARG OLLAMA_SKIP_CUDA_GENERATE
ARG OLLAMA_SKIP_CUDA_11_GENERATE
ARG OLLAMA_SKIP_CUDA_12_GENERATE
ARG OLLAMA_SKIP_ROCM_GENERATE
ARG CUDA_V11_ARCHITECTURES
ARG CUDA_V12_ARCHITECTURES
ARG OLLAMA_FAST_BUILD
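# Build the native runners: use half the available cores on AVX-capable hosts, otherwise a fixed job count of 5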
RUN --mount=type=cache,target=/root/.ccache \
if grep "^flags" /proc/cpuinfo|grep avx>/dev/null; then \
make -C llama -j $(expr $(nproc) / 2 ) ; \
else \
make -C llama -j 5 ; \
fi
FROM --platform=linux/arm64 unified-builder-arm64 AS runners-arm64
COPY . .
ARG OLLAMA_SKIP_CUDA_GENERATE
ARG OLLAMA_SKIP_CUDA_11_GENERATE
ARG OLLAMA_SKIP_CUDA_12_GENERATE
ARG CUDA_V11_ARCHITECTURES
ARG CUDA_V12_ARCHITECTURES
ARG OLLAMA_FAST_BUILD
RUN --mount=type=cache,target=/root/.ccache \
make -C llama -j 8
# Intermediate stages used for ./scripts/build_linux.sh
FROM --platform=linux/amd64 centos:7 AS builder-amd64
ARG CMAKE_VERSION
ARG GOLANG_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
ENV CGO_ENABLED 1
ENV GOARCH amd64
WORKDIR /go/src/github.com/ollama/ollama
FROM --platform=linux/amd64 builder-amd64 AS build-amd64
COPY . .
COPY --from=runners-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
COPY --from=runners-amd64 /go/src/github.com/ollama/ollama/build/ build/
ARG GOFLAGS
ARG CGO_CFLAGS
ARG OLLAMA_SKIP_ROCM_GENERATE
RUN --mount=type=cache,target=/root/.ccache \
go build -trimpath -o dist/linux-amd64/bin/ollama .
RUN cd dist/linux-$GOARCH && \
tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
RUN if [ -z ${OLLAMA_SKIP_ROCM_GENERATE} ] ; then \
cd dist/linux-$GOARCH-rocm && \
tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-rocm.tgz ;\
fi
FROM --platform=linux/arm64 rockylinux:8 AS builder-arm64
ARG CMAKE_VERSION
ARG GOLANG_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
ENV CGO_ENABLED 1
ENV GOARCH arm64
WORKDIR /go/src/github.com/ollama/ollama
FROM --platform=linux/arm64 builder-arm64 AS build-arm64
COPY . .
COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/build/ build/
ARG GOFLAGS
ARG CGO_CFLAGS
RUN --mount=type=cache,target=/root/.ccache \
go build -trimpath -o dist/linux-arm64/bin/ollama .
RUN cd dist/linux-$GOARCH && \
tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
FROM --platform=linux/amd64 scratch AS dist-amd64
COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
FROM --platform=linux/arm64 scratch AS dist-arm64
COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
FROM dist-$TARGETARCH AS dist
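# The dist stage above is a scratch image holding only the release tarballs. One way to pull them out
# locally is BuildKit's local exporter (a sketch; see ./scripts/build_linux.sh for the project's own flow):
#   docker buildx build --platform linux/amd64 -f llama/Dockerfile --target dist --output type=local,dest=dist .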
# Optimized container images do not carry nested payloads
FROM --platform=linux/amd64 builder-amd64 AS container-build-amd64
WORKDIR /go/src/github.com/ollama/ollama
COPY . .
ARG GOFLAGS
ARG CGO_CFLAGS
RUN --mount=type=cache,target=/root/.ccache \
go build -trimpath -o dist/linux-amd64/bin/ollama .
FROM --platform=linux/arm64 builder-arm64 AS container-build-arm64
WORKDIR /go/src/github.com/ollama/ollama
COPY . .
ARG GOFLAGS
ARG CGO_CFLAGS
RUN --mount=type=cache,target=/root/.ccache \
go build -trimpath -o dist/linux-arm64/bin/ollama .
# For amd64 container images, filter out cuda/rocm to minimize size
FROM runners-amd64 AS runners-cuda-amd64
RUN rm -rf \
./dist/linux-amd64/lib/ollama/libggml_hipblas.so \
./dist/linux-amd64/lib/ollama/runners/rocm*
FROM runners-amd64 AS runners-rocm-amd64
RUN rm -rf \
./dist/linux-amd64/lib/ollama/libggml_cuda*.so \
./dist/linux-amd64/lib/ollama/libcu*.so* \
./dist/linux-amd64/lib/ollama/runners/cuda*
FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64
RUN apt-get update && \
apt-get install -y ca-certificates && \
rm -rf /var/lib/apt/lists/*
COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
COPY --from=runners-cuda-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64
RUN apt-get update && \
apt-get install -y ca-certificates && \
rm -rf /var/lib/apt/lists/*
COPY --from=container-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
# ROCm libraries are larger, so we keep them distinct from the CPU/CUDA image
FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm
# Frontload the ROCm libraries, which are large and rarely change, to increase the chance of a common layer
# across releases
COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/lib/ /lib/
RUN apt-get update && \
apt-get install -y ca-certificates && \
rm -rf /var/lib/apt/lists/*
COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
COPY --from=runners-rocm-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
EXPOSE 11434
ENV OLLAMA_HOST 0.0.0.0
ENTRYPOINT ["/bin/ollama"]
CMD ["serve"]
FROM runtime-$TARGETARCH
EXPOSE 11434
ENV OLLAMA_HOST 0.0.0.0
ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
ENV NVIDIA_VISIBLE_DEVICES=all
ENTRYPOINT ["/bin/ollama"]
CMD ["serve"]
...@@ -95,31 +95,17 @@ make -j
Ollama currently vendors [llama.cpp](https://github.com/ggerganov/llama.cpp/) and [ggml](https://github.com/ggerganov/ggml) through a vendoring model. While we generally strive to contribute changes back upstream to avoid drift, we carry a small set of patches which are applied to the tracking commit. A set of make targets is available to aid developers in updating to a newer tracking commit, or to work on changes.
> [!IMPORTANT]
> Prior to merging #7157 we continue to leverage a submodule for llama.cpp which establishes the tracking commit. After merging that PR a new manifest file will be utilized
If you update the vendoring code, start by running the following command to establish the tracking llama.cpp repo in the `./vendor/` directory.
```
make apply-patches
```
### Updating Base Commit
**Pin to new base commit**
To update to a newer base commit, select the upstream git tag or commit and update `llama/vendoring.env`
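The pin itself is a single assignment; the Makefile changes later in this diff include it as `llama/vendoring`. For example (placeholder SHA):
```
LLAMACPP_BASE_COMMIT=<new upstream commit sha>
```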
> [!IMPORTANT]
> After merging #7157 a manifest will be used instead of the submodule
```
cd llm/llama.cpp
git fetch
git checkout NEW_BASE_COMMIT
cd ..
git add llama.cpp
```
#### Applying patches
...@@ -128,13 +114,13 @@ When updating to a newer base commit, the existing patches may not apply cleanly
Start by applying the patches. If any of the patches have conflicts, the `git am` will stop at the first failure.
```
make apply-patches
```
If you see an error message about a conflict, go into the `./vendor/` directory and perform merge resolution with your preferred tool on the patch commit which failed. Save the file(s) and continue the patch series with `git am --continue`. If any additional patches fail, follow the same pattern until the full patch series is applied. Once finished, run a final `create-patches` and `sync` target to ensure everything is updated.
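For the resolution loop itself, a minimal sketch (paths are illustrative), followed by the final targets shown below:
```
cd vendor
# fix the conflicted files, then stage them and continue the series
git add <resolved files>
git am --continue
cd ..
```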
```
make create-patches sync
```
Build and test Ollama, and make any necessary changes to the Go code based on the new base commit. Submit your PR to the Ollama repo.
...@@ -144,14 +130,14 @@ Build and test Ollama, and make any necessary changes to the Go code based on th
When working on new fixes or features that impact vendored code, use the following model. First get a clean tracking repo with all current patches applied:
```
make apply-patches
```
Now edit the upstream native code in the `./vendor/` directory. You do not need to commit every change in order to build; a dirty working tree in the tracking repo is OK while developing. Simply save in your editor, and run the following to refresh the vendored code with your changes, build the backend(s), and build ollama:
```
make sync
make -j 8
go build .
```
...@@ -161,7 +147,7 @@ go build . ...@@ -161,7 +147,7 @@ go build .
Iterate until you're ready to submit PRs. Once your code is ready, commit a change in the `./vendor/` directory, then generate the patches for ollama with Iterate until you're ready to submit PRs. Once your code is ready, commit a change in the `./vendor/` directory, then generate the patches for ollama with
``` ```
make -C llama create-patches make create-patches
``` ```
> [!IMPORTANT] > [!IMPORTANT]
......
package llama
//go:generate make -j 8
/*
#cgo CFLAGS: -O2 -std=c11 -DGGML_BUILD=1 -DNDEBUG -DLOG_DISABLE_LOGS -DGGML_USE_LLAMAFILE
#cgo CXXFLAGS: -O2 -std=c++11 -DGGML_BUILD=1 -DNDEBUG -DLOG_DISABLE_LOGS -DGGML_USE_LLAMAFILE
......
# Helpers for managing our vendored llama.cpp repo and patch set
REPO_ROOT:=$(dir $(patsubst %/,%,$(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))))
DST_DIR:=$(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))
include $(REPO_ROOT)llama/vendoring
LLAMACPP_REPO := $(REPO_ROOT)llama/vendor/
LLAMACPP_PATCH_DIR := $(DST_DIR)patches/
......
LLAMACPP_BASE_COMMIT=3f1ae2e32cde00c39b96be6d01c2997c29bae555
\ No newline at end of file
set(TARGET ollama_llama_server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
set(LLAMA_SERVER_LDFLAGS $ENV{LLAMA_SERVER_LDFLAGS})
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_executable(${TARGET} server.cpp utils.hpp httplib.h)
install(TARGETS ${TARGET} RUNTIME)
target_compile_definitions(${TARGET} PRIVATE
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
)
target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_SERVER_LDFLAGS})
if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
target_link_options(${TARGET} PRIVATE -municode -Wl,/subsystem:console)
endif()
target_compile_features(${TARGET} PRIVATE cxx_std_11)
\ No newline at end of file
# common logic across linux and darwin
init_vars() {
case "${GOARCH}" in
"amd64")
ARCH="x86_64"
;;
"arm64")
ARCH="arm64"
;;
*)
echo "GOARCH must be set"
echo "this script is meant to be run from within go generate"
exit 1
;;
esac
LLAMACPP_DIR=../llama.cpp
CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on"
CMAKE_TARGETS="--target ollama_llama_server"
if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
else
# TODO - add additional optimization flags...
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off ${CMAKE_DEFS}"
fi
case $(uname -s) in
"Darwin")
LIB_EXT="dylib"
WHOLE_ARCHIVE="-Wl,-force_load"
NO_WHOLE_ARCHIVE=""
GCC_ARCH="-arch ${ARCH}"
DIST_BASE=../../dist/darwin-${GOARCH}/
PAYLOAD_BASE=../../build/darwin/${GOARCH}
;;
"Linux")
LIB_EXT="so"
WHOLE_ARCHIVE="-Wl,--whole-archive"
NO_WHOLE_ARCHIVE="-Wl,--no-whole-archive"
# Cross compiling not supported on linux - Use docker
GCC_ARCH=""
DIST_BASE=../../dist/linux-${GOARCH}/
PAYLOAD_BASE=../../build/linux/${GOARCH}
;;
*)
;;
esac
if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
fi
GZIP=$(command -v pigz 2>/dev/null || echo "gzip")
RUNNER_BASE="${DIST_BASE}/lib/ollama/runners"
}
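# Initialize or refresh the llama.cpp submodule; skipped entirely when OLLAMA_SKIP_PATCHING is set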
git_module_setup() {
if [ -n "${OLLAMA_SKIP_PATCHING}" ]; then
echo "Skipping submodule initialization"
return
fi
# Make sure the tree is clean after the directory moves
if [ -d "${LLAMACPP_DIR}/gguf" ]; then
echo "Cleaning up old submodule"
rm -rf ${LLAMACPP_DIR}
fi
git submodule init
git submodule update --force ${LLAMACPP_DIR}
}
apply_patches() {
# apply temporary patches until fix is upstream
for patch in ../patches/*.patch; do
git -c 'user.name=nobody' -c 'user.email=<>' -C ${LLAMACPP_DIR} am ${patch}
done
}
build() {
cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
# remove unnecessary build artifacts
rm -f ${BUILD_DIR}/bin/ggml-common.h ${BUILD_DIR}/bin/ggml-metal.metal
}
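# Copy the built runner binaries (and any shared libraries) from $BUILD_DIR into the runners dist dir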
dist() {
[ -z "${RUNNER}" ] && exit 1
mkdir -p ${RUNNER_BASE}/${RUNNER}/
for f in ${BUILD_DIR}/bin/* ; do
cp ${f} ${RUNNER_BASE}/${RUNNER}/
done
# check for lib directory
if [ -d ${BUILD_DIR}/lib ]; then
for f in ${BUILD_DIR}/lib/* ; do
cp ${f} ${RUNNER_BASE}/${RUNNER}/
done
fi
}
# Compress from the build $BUILD_DIR into the $PAYLOAD_BASE/$RUNNER dir
compress() {
[ -z "${RUNNER}" ] && exit 1
echo "Compressing payloads with ${GZIP} to reduce overall binary size..."
rm -rf "${PAYLOAD_BASE}/${RUNNER}/"
mkdir -p "${PAYLOAD_BASE}/${RUNNER}/"
for f in ${BUILD_DIR}/bin/* ; do
${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" &
compress_pids+=" $!"
done
# check for lib directory
if [ -d ${BUILD_DIR}/lib ]; then
for f in ${BUILD_DIR}/lib/* ; do
${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" &
compress_pids+=" $!"
done
fi
echo
}
wait_for_compress() {
for pid in ${compress_pids}; do
wait $pid
done
echo "Finished compression"
}
install() {
echo "Installing libraries to bin dir ${BUILD_DIR}/bin/"
for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT} | grep -v "${BUILD_DIR}/bin/" ); do
rm -f "${BUILD_DIR}/bin/$(basename ${lib})"
cp -af "${lib}" "${BUILD_DIR}/bin/"
done
}
# Keep the local tree clean after we're done with the build
cleanup() {
git submodule update --force ${LLAMACPP_DIR}
}
#!/bin/bash
# This script is intended to run inside the go generate
# working directory must be ./llm/generate/
# TODO - add hardening to detect missing tools (cmake, etc.)
set -ex
set -o pipefail
compress_pids=""
echo "Starting darwin generate script"
source $(dirname $0)/gen_common.sh
init_vars
git_module_setup
apply_patches
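# Codesign the runner binary when an APPLE_IDENTITY is provided in the environment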
sign() {
if [ -n "$APPLE_IDENTITY" ]; then
codesign -f --timestamp --deep --options=runtime --sign "$APPLE_IDENTITY" --identifier ai.ollama.ollama $1
fi
}
COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DGGML_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"
case "${GOARCH}" in
"amd64")
COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DGGML_METAL=off -DGGML_NATIVE=off"
if [ -z "$OLLAMA_SKIP_CPU_GENERATE" ]; then
#
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
RUNNER=cpu
BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
echo "Building LCD CPU"
build
sign ${BUILD_DIR}/bin/ollama_llama_server
compress
#
# ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
# Approximately 400% faster than LCD on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
RUNNER=cpu_avx
BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
echo "Building AVX CPU"
build
sign ${BUILD_DIR}/bin/ollama_llama_server
compress
#
# ~2013 CPU Dynamic library
# Approximately 10% faster than AVX on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=on -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
RUNNER=cpu_avx2
BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
echo "Building AVX2 CPU"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
build
sign ${BUILD_DIR}/bin/ollama_llama_server
compress
fi
;;
"arm64")
if [ -z "$OLLAMA_SKIP_METAL_GENERATE" ]; then
init_vars
CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}"
RUNNER="metal"
BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
build
sign ${BUILD_DIR}/bin/ollama_llama_server
compress
fi
;;
*)
echo "GOARCH must be set"
echo "this script is meant to be run from within go generate"
exit 1
;;
esac
cleanup
wait_for_compress
echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"