OpenDAS / ollama · Commit 6c6a31a1

embed libraries using cmake

Authored Sep 20, 2023 by Michael Yang
Parent: fc6ec356
Showing 12 changed files with 43 additions and 58 deletions.
docs/development.md (+1, −1)
llm/llama.cpp/generate.go (+2, −3)
llm/llama.cpp/generate_darwin_amd64.go (+4, −5)
llm/llama.cpp/generate_darwin_arm64.go (+4, −5)
llm/llama.cpp/generate_linux.go (+5, −12)
llm/llama.cpp/ggml_patch/0003-metal-add-missing-barriers-for-mul-mat-2699.patch (+0, −32)
llm/llama.cpp/patches/0001-add-detokenize-endpoint.patch (+0, −0)
llm/llama.cpp/patches/0001-copy-cuda-runtime-libraries.patch (+27, −0)
llm/llama.cpp/patches/0002-34B-model-support.patch (+0, −0)
llm/llama.cpp/patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch (+0, −0)
llm/llama.cpp/patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch (+0, −0)
llm/llama.cpp/patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch (+0, −0)
docs/development.md

@@ -35,5 +35,5 @@ Now you can run `ollama`:

 ## Building on Linux with GPU support

 - Install cmake and nvidia-cuda-toolkit
-- run `CUBLAS_PATH=/path/to/libcublas.so CUDART_PATH=/path/to/libcudart.so CUBLASLT_PATH=/path/to/libcublasLt.so go generate ./...`
+- run `go generate ./...`
 - run `go build .`
llm/llama.cpp/generate.go

@@ -6,9 +6,8 @@ package llm

 //go:generate git submodule init
 //go:generate git submodule update --force ggml
-//go:generate -command git-apply git -C ggml apply
-//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
-//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
+//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
+//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
 //go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
 //go:generate cmake --build ggml/build/cpu --target server --config Release
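For context, the removed `-command` line used a `go generate` feature: `//go:generate -command NAME args...` defines NAME as an alias that later directives in the same source file can invoke. A minimal sketch of the old pattern, with a hypothetical patch name:

package llm

// Define "git-apply" as shorthand for "git -C ggml apply"; the alias is
// visible only to later //go:generate lines in this file.
//go:generate -command git-apply git -C ggml apply

// Expands to: git -C ggml apply ../patches/0001-example.patch
//go:generate git-apply ../patches/0001-example.patch

The commit drops the alias and spells out the full command in each directive, so every line reads the same across the per-platform generate files.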
llm/llama.cpp/generate_darwin_amd64.go

@@ -3,11 +3,10 @@ package llm

 //go:generate git submodule init
 //go:generate git submodule update --force ggml
-//go:generate -command git-apply git -C ggml apply
-//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
-//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
-//go:generate git-apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
-//go:generate git-apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
+//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
+//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
+//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
+//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
 //go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
 //go:generate cmake --build ggml/build/cpu --target server --config Release
llm/llama.cpp/generate_darwin_arm64.go

@@ -3,11 +3,10 @@ package llm

 //go:generate git submodule init
 //go:generate git submodule update --force ggml
-//go:generate -command git-apply git -C ggml apply
-//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
-//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
-//go:generate git-apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
-//go:generate git-apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
+//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
+//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
+//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
+//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
 //go:generate cmake -S ggml -B ggml/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
 //go:generate cmake --build ggml/build/metal --target server --config Release
llm/llama.cpp/generate_linux.go

@@ -3,15 +3,15 @@ package llm

 //go:generate git submodule init
 //go:generate git submodule update --force ggml
-//go:generate -command git-apply git -C ggml apply
-//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
-//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
-//go:generate git-apply ../ggml_patch/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
+//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
+//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
+//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
+//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch
 //go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
 //go:generate cmake --build ggml/build/cpu --target server --config Release
 //go:generate git submodule update --force gguf
+//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch
 //go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on
 //go:generate cmake --build gguf/build/cpu --target server --config Release

@@ -19,10 +19,3 @@ package llm

 //go:generate cmake --build ggml/build/cuda --target server --config Release
 //go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
 //go:generate cmake --build gguf/build/cuda --target server --config Release
-//go:generate cp --dereference ${CUBLAS_PATH} ggml/build/cuda/bin/libcublas.so.11
-//go:generate cp --dereference ${CUBLAS_PATH} gguf/build/cuda/bin/libcublas.so.11
-//go:generate cp --dereference ${CUDART_PATH} ggml/build/cuda/bin/libcudart.so.11.0
-//go:generate cp --dereference ${CUDART_PATH} gguf/build/cuda/bin/libcudart.so.11.0
-//go:generate cp --dereference ${CUBLASLT_PATH} ggml/build/cuda/bin/libcublasLt.so.11
-//go:generate cp --dereference ${CUBLASLT_PATH} gguf/build/cuda/bin/libcublasLt.so.11
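The removed cp steps copied the CUDA runtime by hand and required CUBLAS_PATH, CUDART_PATH, and CUBLASLT_PATH to be set; the new 0001-copy-cuda-runtime-libraries.patch (shown below) has cmake produce the same versioned copies at configure time, which is why docs/development.md no longer mentions those variables. A minimal sketch of checking that the copies landed, assuming the build has run; this helper is illustrative, not part of the commit, and the file names assume CUDA 11 per the removed cp lines:

package main

import (
	"fmt"
	"os"
	"path/filepath"
)

func main() {
	// Versioned runtime copies the cmake patch writes next to each server binary.
	libs := []string{"libcudart.so.11.0", "libcublas.so.11", "libcublasLt.so.11"}
	for _, dir := range []string{"ggml/build/cuda/bin", "gguf/build/cuda/bin"} {
		for _, lib := range libs {
			if _, err := os.Stat(filepath.Join(dir, lib)); err != nil {
				fmt.Println("missing:", filepath.Join(dir, lib))
			} else {
				fmt.Println("found:", filepath.Join(dir, lib))
			}
		}
	}
}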
llm/llama.cpp/ggml_patch/0003-metal-add-missing-barriers-for-mul-mat-2699.patch (deleted, 100644 → 0)

This appears to remove a duplicate: a 0004 patch carrying the same mul-mat barrier fix is kept and moved to llm/llama.cpp/patches/ below.
From 8c0ea847ac1460bca534d92266e3471cb31471be Mon Sep 17 00:00:00 2001
From: Bruce MacDonald <brucewmacdonald@gmail.com>
Date: Tue, 5 Sep 2023 16:05:08 -0400
Subject: [PATCH] metal: add missing barriers for mul-mat #2699
---
ggml-metal.metal | 2 ++
1 file changed, 2 insertions(+)
diff --git a/ggml-metal.metal b/ggml-metal.metal
index 3f31252..ce3541f 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -1850,6 +1850,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
//load data and store to threadgroup memory
half4x4 temp_a;
dequantize_func(x, il, temp_a);
+ threadgroup_barrier(mem_flags::mem_threadgroup);
#pragma unroll(16)
for (int i = 0; i < 16; i++) {
*(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
@@ -1895,6 +1896,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
}
} else {
// block is smaller than 64x32, we should avoid writing data outside of the matrix
+ threadgroup_barrier(mem_flags::mem_threadgroup);
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
for (int i = 0; i < 8; i++) {
--
2.39.2 (Apple Git-143)
llm/llama.cpp/ggml_patch/0001-add-detokenize-endpoint.patch → llm/llama.cpp/patches/0001-add-detokenize-endpoint.patch (file moved)
llm/llama.cpp/patches/0001-copy-cuda-runtime-libraries.patch (new file, 0 → 100644)
From 5dd02993e8cc2ce309157736b95bb572f274a3fd Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Wed, 20 Sep 2023 14:19:52 -0700
Subject: [PATCH] copy cuda runtime libraries
---
CMakeLists.txt | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 824d9f2..dd24137 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -274,6 +274,10 @@ if (LLAMA_CUBLAS)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcudart.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcudart.so.${CUDAToolkit_VERSION_MAJOR}.0 COPYONLY)
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublas.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublas.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY)
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublasLt.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublasLt.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY)
+
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# 52 == lowest CUDA 12 standard
# 60 == f16 CUDA intrinsics
--
2.42.0
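This patch is the mechanism behind the commit title: configure_file(... COPYONLY) reads each toolkit library at configure time (following symlinks, much like the removed cp --dereference steps) and writes a versioned copy into CMAKE_RUNTIME_OUTPUT_DIRECTORY, so the CUDA runtime sits next to the built server binary. Those bin/ directories can then be bundled into the Go binary; a hedged sketch of what that could look like, where the variable name and embed patterns are assumptions rather than code from this commit:

package llm

import "embed"

// Hypothetical sketch: with the CUDA runtime copied into each build's bin/
// directory by the patch above, the server binaries and the libraries next
// to them can be embedded into the ollama executable in one pass.
//go:embed llama.cpp/ggml/build/*/bin/* llama.cpp/gguf/build/*/bin/*
var libEmbed embed.FS

At run time the embedded shared libraries would still need to be written out to disk before the server process can load them; go:embed only carries the bytes.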
llm/llama.cpp/
ggml_
patch/0002-34B-model-support.patch
→
llm/llama.cpp/patch
es
/0002-34B-model-support.patch
View file @
6c6a31a1
File moved
llm/llama.cpp/
ggml_
patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
→
llm/llama.cpp/patch
es
/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
View file @
6c6a31a1
File moved
llm/llama.cpp/
ggml_
patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
→
llm/llama.cpp/patch
es
/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
View file @
6c6a31a1
File moved
llm/llama.cpp/
ggml_
patch/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
→
llm/llama.cpp/patch
es
/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
View file @
6c6a31a1
File moved