OpenDAS / ollama, commit ee4fd16f (unverified)
Authored Sep 20, 2023 by Michael Yang; committed via GitHub, Sep 20, 2023
Parents: 084e4c78, a9ed7cc6

Merge pull request #556 from jmorganca/pack-cuda

pack in cuda libs

Showing 14 changed files with 52 additions and 115 deletions:
docs/development.md (+1, -1)
llm/llama.cpp/generate_darwin_amd64.go (+4, -5)
llm/llama.cpp/generate_darwin_arm64.go (+4, -5)
llm/llama.cpp/generate_linux.go (+9, -9)
llm/llama.cpp/generate_windows.go (+2, -6)
llm/llama.cpp/ggml_patch/0003-metal-add-missing-barriers-for-mul-mat-2699.patch (+0, -32)
llm/llama.cpp/patches/0001-add-detokenize-endpoint.patch (+0, -0)
llm/llama.cpp/patches/0001-copy-cuda-runtime-libraries.patch (+27, -0)
llm/llama.cpp/patches/0002-34B-model-support.patch (+0, -0)
llm/llama.cpp/patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch (+0, -0)
llm/llama.cpp/patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch (+0, -0)
llm/llama.cpp/patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch (+0, -0)
llm/llama.go (+4, -56)
server/routes.go (+1, -1)
docs/development.md

@@ -35,5 +35,5 @@ Now you can run `ollama`:
 ## Building on Linux with GPU support
 
 - Install cmake and nvidia-cuda-toolkit
-- run `CUDA_VERSION=$(nvcc --version | sed -n 's/^.*release \([0-9]\+\)\.\([0-9]\+\).*$/\1/p') go generate ./...`
+- run `go generate ./...`
 - run `go build .`
llm/llama.cpp/generate_darwin_amd64.go

@@ -3,11 +3,10 @@ package llm
 //go:generate git submodule init
 //go:generate git submodule update --force ggml
-//go:generate -command git-apply git -C ggml apply
-//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
-//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
-//go:generate git-apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
-//go:generate git-apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
+//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
+//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
+//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
+//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
 //go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
 //go:generate cmake --build ggml/build/cpu --target server --config Release
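For reference, the dropped first line used go generate's `-command` directive, which defines a file-local alias that later `//go:generate` lines in the same file can invoke. A minimal sketch of the old pattern (directives taken from this file's previous revision):

```go
package llm

// Define "git-apply" as shorthand for the rest of this file...
//go:generate -command git-apply git -C ggml apply
// ...so the next directive expands to: git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch
//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
```

The change spells out the full `git -C ggml apply` invocation on every line instead, which is more verbose but makes each directive self-contained.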
llm/llama.cpp/generate_darwin_arm64.go

@@ -3,11 +3,10 @@ package llm
 //go:generate git submodule init
 //go:generate git submodule update --force ggml
-//go:generate -command git-apply git -C ggml apply
-//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
-//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
-//go:generate git-apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
-//go:generate git-apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
+//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
+//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
+//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
+//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
 //go:generate cmake -S ggml -B ggml/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
 //go:generate cmake --build ggml/build/metal --target server --config Release
llm/llama.cpp/generate_linux.go

@@ -3,19 +3,19 @@ package llm
 //go:generate git submodule init
 //go:generate git submodule update --force ggml
-//go:generate -command git-apply git -C ggml apply
-//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
-//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
-//go:generate git-apply ../ggml_patch/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
+//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
+//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
+//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
+//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch
 //go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
 //go:generate cmake --build ggml/build/cpu --target server --config Release
 //go:generate git submodule update --force gguf
+//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch
 //go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on
 //go:generate cmake --build gguf/build/cpu --target server --config Release
-//go:generate cmake -S ggml -B ggml/build/cuda-${CUDA_VERSION} -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
-//go:generate cmake --build ggml/build/cuda-${CUDA_VERSION} --target server --config Release
-//go:generate cmake -S gguf -B gguf/build/cuda-${CUDA_VERSION} -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
-//go:generate cmake --build gguf/build/cuda-${CUDA_VERSION} --target server --config Release
+//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
+//go:generate cmake --build ggml/build/cuda --target server --config Release
+//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
+//go:generate cmake --build gguf/build/cuda --target server --config Release
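Dropping `-${CUDA_VERSION}` from the build directory names means the CUDA runners now land at fixed paths (ggml/build/cuda/bin, gguf/build/cuda/bin) that the `//go:embed llama.cpp/*/build/*/bin/*` glob in llm/llama.go picks up without knowing the installed toolkit version. A hypothetical debug helper (not part of this change) to inspect what the glob packed:

```go
package llm

import (
	"fmt"
	"io/fs"
)

// debugListEmbedded prints every file matched by the //go:embed glob;
// after this change the list should include the cuda/bin/server runners
// together with the CUDA shared libraries copied next to them.
func debugListEmbedded() {
	entries, err := fs.Glob(llamaCppEmbed, "llama.cpp/*/build/*/bin/*")
	if err != nil {
		return
	}
	for _, entry := range entries {
		fmt.Println(entry)
	}
}
```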
llm/llama.cpp/generate.go → llm/llama.cpp/generate_windows.go

-//go:build !darwin
-// +build !darwin
-
 package llm
 //go:generate git submodule init
 //go:generate git submodule update --force ggml
-//go:generate -command git-apply git -C ggml apply
-//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
-//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
+//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
+//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
 //go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
 //go:generate cmake --build ggml/build/cpu --target server --config Release
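Renaming generate.go to generate_windows.go lets the Go toolchain's filename convention scope the file: a `_windows.go` suffix implicitly constrains compilation to GOOS=windows, so the explicit `//go:build !darwin` / `// +build !darwin` lines become redundant and are removed. For illustration:

```go
// generate_windows.go: no build-constraint comments needed; the _windows
// suffix already restricts this file to GOOS=windows builds.
package llm
```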
llm/llama.cpp/ggml_patch/0003-metal-add-missing-barriers-for-mul-mat-2699.patch
deleted (100644 → 0)

From 8c0ea847ac1460bca534d92266e3471cb31471be Mon Sep 17 00:00:00 2001
From: Bruce MacDonald <brucewmacdonald@gmail.com>
Date: Tue, 5 Sep 2023 16:05:08 -0400
Subject: [PATCH] metal: add missing barriers for mul-mat #2699

---
 ggml-metal.metal | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ggml-metal.metal b/ggml-metal.metal
index 3f31252..ce3541f 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -1850,6 +1850,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
         //load data and store to threadgroup memory
         half4x4 temp_a;
         dequantize_func(x, il, temp_a);
+        threadgroup_barrier(mem_flags::mem_threadgroup);
 #pragma unroll(16)
         for (int i = 0; i < 16; i++) {
             *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
@@ -1895,6 +1896,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
     }
     } else {
         // block is smaller than 64x32, we should avoid writing data outside of the matrix
+        threadgroup_barrier(mem_flags::mem_threadgroup);
         threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
                                       + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
         for (int i = 0; i < 8; i++) {
--
2.39.2 (Apple Git-143)
llm/llama.cpp/ggml_patch/0001-add-detokenize-endpoint.patch → llm/llama.cpp/patches/0001-add-detokenize-endpoint.patch (file moved)
llm/llama.cpp/patches/0001-copy-cuda-runtime-libraries.patch
new file (0 → 100644)

From 5dd02993e8cc2ce309157736b95bb572f274a3fd Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Wed, 20 Sep 2023 14:19:52 -0700
Subject: [PATCH] copy cuda runtime libraries

---
 CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 824d9f2..dd24137 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -274,6 +274,10 @@ if (LLAMA_CUBLAS)
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
     endif()
+    configure_file(${CUDAToolkit_LIBRARY_DIR}/libcudart.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcudart.so.${CUDAToolkit_VERSION_MAJOR}.0 COPYONLY)
+    configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublas.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublas.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY)
+    configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublasLt.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublasLt.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY)
+
     if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
         # 52 == lowest CUDA 12 standard
         # 60 == f16 CUDA intrinsics
--
2.42.0
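The patch uses CMake's `configure_file(... COPYONLY)` to copy the CUDA runtime, cuBLAS, and cuBLASLt shared objects from the toolkit's library directory into the build's runtime output directory, renamed to the versioned sonames the server binary links against (for a CUDA 12 toolkit: libcudart.so.12.0, libcublas.so.12, libcublasLt.so.12). A hypothetical post-build sanity check in Go, assuming a CUDA 12 toolkit and the bin layout produced by the generate directives:

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
)

func main() {
	// Assumed layout: the patched CMakeLists copies the libraries into the
	// same bin/ directory that holds the built server binary.
	binDir := "llm/llama.cpp/gguf/build/cuda/bin"
	for _, lib := range []string{"libcudart.so.12.0", "libcublas.so.12", "libcublasLt.so.12"} {
		if _, err := os.Stat(filepath.Join(binDir, lib)); err != nil {
			fmt.Println("missing:", filepath.Join(binDir, lib))
		}
	}
}
```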
llm/llama.cpp/ggml_patch/0002-34B-model-support.patch → llm/llama.cpp/patches/0002-34B-model-support.patch (file moved)
llm/llama.cpp/ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch → llm/llama.cpp/patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch (file moved)
llm/llama.cpp/ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch → llm/llama.cpp/patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch (file moved)
llm/llama.cpp/ggml_patch/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch → llm/llama.cpp/patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch (file moved)
llm/llama.go

@@ -17,7 +17,6 @@ import (
 	"os/exec"
 	"path"
 	"path/filepath"
-	"regexp"
 	"runtime"
 	"strconv"
 	"strings"
@@ -29,46 +28,6 @@ import (
 //go:embed llama.cpp/*/build/*/bin/*
 var llamaCppEmbed embed.FS
 
-func cudaVersion() int {
-	// first try nvcc, it gives the most accurate version if available
-	cmd := exec.Command("nvcc", "--version")
-	output, err := cmd.CombinedOutput()
-	if err == nil {
-		// regex to match the CUDA version line in nvcc --version output
-		re := regexp.MustCompile(`release (\d+\.\d+),`)
-		matches := re.FindStringSubmatch(string(output))
-		if len(matches) >= 2 {
-			cudaVersion := matches[1]
-			cudaVersionParts := strings.Split(cudaVersion, ".")
-			cudaMajorVersion, err := strconv.Atoi(cudaVersionParts[0])
-			if err == nil {
-				return cudaMajorVersion
-			}
-		}
-	}
-
-	// fallback to nvidia-smi
-	cmd = exec.Command("nvidia-smi")
-	output, err = cmd.CombinedOutput()
-	if err != nil {
-		return -1
-	}
-
-	re := regexp.MustCompile(`CUDA Version: (\d+\.\d+)`)
-	matches := re.FindStringSubmatch(string(output))
-	if len(matches) < 2 {
-		return -1
-	}
-
-	cudaVersion := matches[1]
-	cudaVersionParts := strings.Split(cudaVersion, ".")
-	cudaMajorVersion, err := strconv.Atoi(cudaVersionParts[0])
-	if err != nil {
-		return -1
-	}
-
-	return cudaMajorVersion
-}
-
 type ModelRunner struct {
 	Path string // path to the model runner executable
 }
@@ -86,21 +45,10 @@ func chooseRunners(runnerType string) []ModelRunner {
 			path.Join(buildPath, "cpu", "bin", "server"),
 		}
 	case "linux":
-		cuda := cudaVersion()
-		if cuda == 11 {
-			// prioritize CUDA 11 runner
-			runners = []string{
-				path.Join(buildPath, "cuda-11", "bin", "server"),
-				path.Join(buildPath, "cuda-12", "bin", "server"),
-				path.Join(buildPath, "cpu", "bin", "server"),
-			}
-		} else {
-			runners = []string{
-				path.Join(buildPath, "cuda-12", "bin", "server"),
-				path.Join(buildPath, "cuda-11", "bin", "server"),
-				path.Join(buildPath, "cpu", "bin", "server"),
-			}
-		}
+		runners = []string{
+			path.Join(buildPath, "cuda", "bin", "server"),
+			path.Join(buildPath, "cpu", "bin", "server"),
+		}
 	case "windows":
 		// TODO: select windows GPU runner here when available
 		runners = []string{
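With the version probing gone, chooseRunners on linux returns a fixed preference list: the single CUDA runner first, the CPU runner as fallback. A sketch of that try-in-order pattern, with a hypothetical helper name (not ollama's actual selection code, which works against the embedded filesystem):

```go
package llm

import "os"

// pickFirstRunner returns the first candidate whose executable exists on
// disk; callers pass the ordered preference list built by chooseRunners.
func pickFirstRunner(candidates []ModelRunner) (ModelRunner, bool) {
	for _, r := range candidates {
		if _, err := os.Stat(r.Path); err == nil {
			return r, true
		}
	}
	return ModelRunner{}, false
}
```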
@@ -353,7 +301,7 @@ func newLlama(model string, adapters []string, runners []ModelRunner, opts api.O
 			runner.Path,
 			append(params, "--port", strconv.Itoa(port))...,
 		)
+		cmd.Env = append(os.Environ(), fmt.Sprintf("LD_LIBRARY_PATH=%s", filepath.Dir(runner.Path)))
 		cmd.Stdout = os.Stderr
 		cmd.Stderr = os.Stderr
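The one added line is the point of the whole PR: the runner subprocess gets LD_LIBRARY_PATH set to its own directory, so the dynamic linker resolves the packed libcudart/libcublas copies sitting next to the extracted server binary instead of requiring a system-wide CUDA install. A self-contained sketch of that extract-then-exec pattern, under assumed names (an `assets` embed directory holding a `server` binary and its libraries):

```go
package main

import (
	"embed"
	"fmt"
	"io/fs"
	"os"
	"os/exec"
	"path/filepath"
)

//go:embed assets
var assets embed.FS

func main() {
	dir, err := os.MkdirTemp("", "runner")
	if err != nil {
		panic(err)
	}
	// Extract the runner binary and its bundled .so files side by side.
	err = fs.WalkDir(assets, "assets", func(p string, d fs.DirEntry, err error) error {
		if err != nil || d.IsDir() {
			return err
		}
		data, err := assets.ReadFile(p)
		if err != nil {
			return err
		}
		return os.WriteFile(filepath.Join(dir, filepath.Base(p)), data, 0o755)
	})
	if err != nil {
		panic(err)
	}
	// The dynamic linker searches LD_LIBRARY_PATH before the system default
	// paths, so the bundled CUDA libraries are found even without a system
	// CUDA installation.
	cmd := exec.Command(filepath.Join(dir, "server"), "--port", "11434")
	cmd.Env = append(os.Environ(), fmt.Sprintf("LD_LIBRARY_PATH=%s", dir))
	cmd.Stdout, cmd.Stderr = os.Stderr, os.Stderr
	if err := cmd.Run(); err != nil {
		fmt.Fprintln(os.Stderr, err)
	}
}
```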
server/routes.go

@@ -556,7 +556,7 @@ func Serve(ln net.Listener, origins []string) error {
 	if runtime.GOOS == "linux" {
 		// check compatibility to log warnings
 		if _, err := llm.CheckVRAM(); err != nil {
-			log.Printf("Warning: GPU support not enabled, you may need to install GPU drivers: %v", err)
+			log.Printf("Warning: GPU support may not be enabled, check you have installed GPU drivers: %v", err)
 		}
 	}