Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
6c6a31a1
Commit
6c6a31a1
authored
Sep 20, 2023
by
Michael Yang
Browse files
embed libraries using cmake
parent
fc6ec356
Changes
12
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
43 additions
and
58 deletions
+43
-58
docs/development.md
docs/development.md
+1
-1
llm/llama.cpp/generate.go
llm/llama.cpp/generate.go
+2
-3
llm/llama.cpp/generate_darwin_amd64.go
llm/llama.cpp/generate_darwin_amd64.go
+4
-5
llm/llama.cpp/generate_darwin_arm64.go
llm/llama.cpp/generate_darwin_arm64.go
+4
-5
llm/llama.cpp/generate_linux.go
llm/llama.cpp/generate_linux.go
+5
-12
llm/llama.cpp/ggml_patch/0003-metal-add-missing-barriers-for-mul-mat-2699.patch
...ch/0003-metal-add-missing-barriers-for-mul-mat-2699.patch
+0
-32
llm/llama.cpp/patches/0001-add-detokenize-endpoint.patch
llm/llama.cpp/patches/0001-add-detokenize-endpoint.patch
+0
-0
llm/llama.cpp/patches/0001-copy-cuda-runtime-libraries.patch
llm/llama.cpp/patches/0001-copy-cuda-runtime-libraries.patch
+27
-0
llm/llama.cpp/patches/0002-34B-model-support.patch
llm/llama.cpp/patches/0002-34B-model-support.patch
+0
-0
llm/llama.cpp/patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
...etal-fix-synchronization-in-new-matrix-multiplicati.patch
+0
-0
llm/llama.cpp/patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
...es/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
+0
-0
llm/llama.cpp/patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
...ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
+0
-0
No files found.
docs/development.md
View file @
6c6a31a1
...
...
@@ -35,5 +35,5 @@ Now you can run `ollama`:
## Building on Linux with GPU support
-
Install cmake and nvidia-cuda-toolkit
-
run
`
CUBLAS_PATH=/path/to/libcublas.so CUDART_PATH=/path/to/libcudart.so CUBLASLT_PATH=/path/to/libcublasLt.so
go generate ./...`
-
run
`go generate ./...`
-
run
`go build .`
llm/llama.cpp/generate.go
View file @
6c6a31a1
...
...
@@ -6,9 +6,8 @@ package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate -command git-apply git -C ggml apply
//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/cpu --target server --config Release
...
...
llm/llama.cpp/generate_darwin_amd64.go
View file @
6c6a31a1
...
...
@@ -3,11 +3,10 @@ package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate -command git-apply git -C ggml apply
//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
//go:generate git-apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
//go:generate git-apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
//go:generate cmake --build ggml/build/cpu --target server --config Release
...
...
llm/llama.cpp/generate_darwin_arm64.go
View file @
6c6a31a1
...
...
@@ -3,11 +3,10 @@ package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate -command git-apply git -C ggml apply
//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
//go:generate git-apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
//go:generate git-apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
//go:generate cmake -S ggml -B ggml/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
//go:generate cmake --build ggml/build/metal --target server --config Release
...
...
llm/llama.cpp/generate_linux.go
View file @
6c6a31a1
...
...
@@ -3,15 +3,15 @@ package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate -command git-apply git -C ggml apply
//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
//go:generate git-apply ../ggml_patch/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/cpu --target server --config Release
//go:generate git submodule update --force gguf
//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on
//go:generate cmake --build gguf/build/cpu --target server --config Release
...
...
@@ -19,10 +19,3 @@ package llm
//go:generate cmake --build ggml/build/cuda --target server --config Release
//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
//go:generate cmake --build gguf/build/cuda --target server --config Release
//go:generate cp --dereference ${CUBLAS_PATH} ggml/build/cuda/bin/libcublas.so.11
//go:generate cp --dereference ${CUBLAS_PATH} gguf/build/cuda/bin/libcublas.so.11
//go:generate cp --dereference ${CUDART_PATH} ggml/build/cuda/bin/libcudart.so.11.0
//go:generate cp --dereference ${CUDART_PATH} gguf/build/cuda/bin/libcudart.so.11.0
//go:generate cp --dereference ${CUBLASLT_PATH} ggml/build/cuda/bin/libcublasLt.so.11
//go:generate cp --dereference ${CUBLASLT_PATH} gguf/build/cuda/bin/libcublasLt.so.11
llm/llama.cpp/ggml_patch/0003-metal-add-missing-barriers-for-mul-mat-2699.patch
deleted
100644 → 0
View file @
fc6ec356
From 8c0ea847ac1460bca534d92266e3471cb31471be Mon Sep 17 00:00:00 2001
From: Bruce MacDonald <brucewmacdonald@gmail.com>
Date: Tue, 5 Sep 2023 16:05:08 -0400
Subject: [PATCH] metal: add missing barriers for mul-mat #2699
---
ggml-metal.metal | 2 ++
1 file changed, 2 insertions(+)
diff --git a/ggml-metal.metal b/ggml-metal.metal
index 3f31252..ce3541f 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -1850,6 +1850,7 @@
kernel void kernel_mul_mm(device const uchar * src0,
//load data and store to threadgroup memory
half4x4 temp_a;
dequantize_func(x, il, temp_a);
+ threadgroup_barrier(mem_flags::mem_threadgroup);
#pragma unroll(16)
for (int i = 0; i < 16; i++) {
*(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
@@ -1895,6 +1896,7 @@
kernel void kernel_mul_mm(device const uchar * src0,
}
} else {
// block is smaller than 64x32, we should avoid writing data outside of the matrix
+ threadgroup_barrier(mem_flags::mem_threadgroup);
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
for (int i = 0; i < 8; i++) {
--
2.39.2 (Apple Git-143)
llm/llama.cpp/
ggml_
patch/0001-add-detokenize-endpoint.patch
→
llm/llama.cpp/patch
es
/0001-add-detokenize-endpoint.patch
View file @
6c6a31a1
File moved
llm/llama.cpp/patches/0001-copy-cuda-runtime-libraries.patch
0 → 100644
View file @
6c6a31a1
From 5dd02993e8cc2ce309157736b95bb572f274a3fd Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Wed, 20 Sep 2023 14:19:52 -0700
Subject: [PATCH] copy cuda runtime libraries
---
CMakeLists.txt | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 824d9f2..dd24137 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -274,6 +274,10 @@
if (LLAMA_CUBLAS)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcudart.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcudart.so.${CUDAToolkit_VERSION_MAJOR}.0 COPYONLY)
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublas.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublas.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY)
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublasLt.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublasLt.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY)
+
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# 52 == lowest CUDA 12 standard
# 60 == f16 CUDA intrinsics
--
2.42.0
llm/llama.cpp/
ggml_
patch/0002-34B-model-support.patch
→
llm/llama.cpp/patch
es
/0002-34B-model-support.patch
View file @
6c6a31a1
File moved
llm/llama.cpp/
ggml_
patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
→
llm/llama.cpp/patch
es
/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
View file @
6c6a31a1
File moved
llm/llama.cpp/
ggml_
patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
→
llm/llama.cpp/patch
es
/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
View file @
6c6a31a1
File moved
llm/llama.cpp/
ggml_
patch/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
→
llm/llama.cpp/patch
es
/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
View file @
6c6a31a1
File moved
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment