OpenDAS / ollama · Commit 8f8e736b

Unverified commit 8f8e736b, authored Jul 05, 2024 by Jeffrey Morgan; committed by GitHub, Jul 05, 2024
Parent: d89454de

update llama.cpp submodule to `d7fd29f` (#5475)
Showing 15 changed files with 150 additions and 422 deletions (+150 -422)
docs/development.md                          +1    -1
llm/ext_server/CMakeLists.txt               +13   -13
llm/generate/gen_darwin.sh                   +8    -8
llm/generate/gen_linux.sh                   +18   -18
llm/generate/gen_windows.ps1                +22   -22
llm/llama.cpp                                +1    -1
llm/llm.go                                   +7    -7
llm/patches/01-load-progress.diff            +7    -7
llm/patches/03-load_exception.diff           +6   -18
llm/patches/04-metal.diff                    +3    -3
llm/patches/05-default-pretokenizer.diff     +9    -9
llm/patches/06-qwen2.diff                    +3    -3
llm/patches/07-embeddings.diff              +45    -0
llm/patches/07-gemma.diff                    +0  -305
llm/patches/09-pooling.diff                  +7    -7
docs/development.md (view file @ 8f8e736b)

@@ -104,7 +104,7 @@ like to use. For example, to compile an optimized binary for an Intel i9-9880H,
 you might use:
 ```
-OLLAMA_CUSTOM_CPU_DEFS="-DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_F16C=on -DLLAMA_FMA=on" go generate ./...
+OLLAMA_CUSTOM_CPU_DEFS="-DGGML_AVX=on -DGGML_AVX2=on -DGGML_F16C=on -DGGML_FMA=on" go generate ./...
 go build .
 ```
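The submodule bump renames llama.cpp's CMake toggles from the LLAMA_ prefix to the GGML_ prefix, which is why the documentation, the generate scripts, the cgo link flags, and the vendored patches below all change in lockstep. As a rough illustration (not taken from this diff), a custom CPU build on an AVX-512-capable machine might now be invoked as follows; the exact flag set is the builder's choice.

```
# illustrative only: custom CPU flags now use the GGML_ prefix
OLLAMA_CUSTOM_CPU_DEFS="-DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" go generate ./...
go build .
```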
llm/ext_server/CMakeLists.txt (view file @ 8f8e736b)

 set(TARGET ollama_llama_server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
     SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
-target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT})
 if(WIN32)
     TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 \ No newline at end of file
llm/generate/gen_darwin.sh (view file @ 8f8e736b)

@@ -18,16 +18,16 @@ sign() {
     fi
 }
-COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_OPENMP=off"
+COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"
 case "${GOARCH}" in
 "amd64")
-    COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_NATIVE=off"
+    COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DGGML_METAL=off -DGGML_NATIVE=off"
     # Static build for linking into the Go binary
     init_vars
     CMAKE_TARGETS="--target llama --target ggml"
-    CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_BLAS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+    CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DGGML_BLAS=off -DGGML_ACCELERATE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
     BUILD_DIR="../build/darwin/${ARCH}_static"
     echo "Building static library"
     build
@@ -37,7 +37,7 @@ case "${GOARCH}" in
     # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
     #
     init_vars
-    CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_BLAS=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+    CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
     BUILD_DIR="../build/darwin/${ARCH}/cpu"
     echo "Building LCD CPU"
     build
@@ -49,7 +49,7 @@ case "${GOARCH}" in
     # Approximately 400% faster than LCD on same CPU
     #
     init_vars
-    CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+    CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
     BUILD_DIR="../build/darwin/${ARCH}/cpu_avx"
     echo "Building AVX CPU"
     build
@@ -61,7 +61,7 @@ case "${GOARCH}" in
     # Approximately 10% faster than AVX on same CPU
     #
     init_vars
-    CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
+    CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=on -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
     BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2"
     echo "Building AVX2 CPU"
     EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
@@ -75,14 +75,14 @@ case "${GOARCH}" in
     # Static build for linking into the Go binary
     init_vars
     CMAKE_TARGETS="--target llama --target ggml"
-    CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_BLAS=off -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+    CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}"
     BUILD_DIR="../build/darwin/${ARCH}_static"
     echo "Building static library"
     build
     if [ -z "$OLLAMA_SKIP_METAL_GENERATE" ]; then
         init_vars
-        CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
+        CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}"
         BUILD_DIR="../build/darwin/${ARCH}/metal"
         EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
         build
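gen_darwin.sh keeps its control flow: the static archives and CPU variants are always produced, while the Metal variant is gated on OLLAMA_SKIP_METAL_GENERATE. A plausible way to exercise that switch when iterating on the CPU paths only (an assumed usage, not shown in the diff) is:

```
# skip regenerating the Metal variant on macOS (illustrative)
OLLAMA_SKIP_METAL_GENERATE=1 go generate ./...
```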
llm/generate/gen_linux.sh (view file @ 8f8e736b)

@@ -51,7 +51,7 @@ if [ -z "${CUDACXX}" ]; then
         export CUDACXX=$(command -v nvcc)
     fi
 fi
-COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off"
+COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
 source $(dirname $0)/gen_common.sh
 init_vars
 git_module_setup
@@ -64,7 +64,7 @@ if [ -z "${OLLAMA_SKIP_STATIC_GENERATE}" -o "${OLLAMA_CPU_TARGET}" = "static" ];
     # Static build for linking into the Go binary
     init_vars
     CMAKE_TARGETS="--target llama --target ggml"
-    CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off ${CMAKE_DEFS}"
+    CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off ${CMAKE_DEFS}"
     BUILD_DIR="../build/linux/${ARCH}_static"
     echo "Building static library"
     build
@@ -84,22 +84,22 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
         compress
     else
         # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
-        # -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
-        # -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
-        # -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
-        # -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
+        # -DGGML_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
+        # -DGGML_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
+        # -DGGML_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
+        # -DGGML_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
         # Note: the following seem to yield slower results than AVX2 - ymmv
-        # -DLLAMA_AVX512 -- 2017 Intel Skylake and High End DeskTop (HEDT)
-        # -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake
-        # -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake
-        COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_OPENMP=off"
+        # -DGGML_AVX512 -- 2017 Intel Skylake and High End DeskTop (HEDT)
+        # -DGGML_AVX512_VBMI -- 2018 Intel Cannon Lake
+        # -DGGML_AVX512_VNNI -- 2021 Intel Alder Lake
+        COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
         if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
             #
             # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
             #
             init_vars
-            CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+            CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
             BUILD_DIR="../build/linux/${ARCH}/cpu"
             echo "Building LCD CPU"
             build
@@ -116,7 +116,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
             # Approximately 400% faster than LCD on same CPU
             #
             init_vars
-            CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+            CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
             BUILD_DIR="../build/linux/${ARCH}/cpu_avx"
             echo "Building AVX CPU"
             build
@@ -129,7 +129,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
             # Approximately 10% faster than AVX on same CPU
             #
             init_vars
-            CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
+            CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
             BUILD_DIR="../build/linux/${ARCH}/cpu_avx2"
             echo "Building AVX2 CPU"
             build
@@ -170,15 +170,15 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
         #
         # CUDA compute < 6.0 lacks proper FP16 support on ARM.
         # Disabling has minimal performance effect while maintaining compatibility.
-        ARM64_DEFS="-DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_CUDA_F16=off"
+        ARM64_DEFS="-DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_CUDA_F16=off"
     fi
     # Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
     if [ -n "${OLLAMA_CUSTOM_CUDA_DEFS}" ]; then
         echo "OLLAMA_CUSTOM_CUDA_DEFS=\"${OLLAMA_CUSTOM_CUDA_DEFS}\""
-        CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
+        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
         echo "Building custom CUDA GPU"
     else
-        CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
+        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DGGML_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} -DCMAKE_LIBRARY_PATH=/usr/local/cuda/compat"
     fi
     CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}"
     BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
@@ -216,7 +216,7 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
     init_vars
     source ${ONEAPI_ROOT}/setvars.sh --force # set up environment variables for oneAPI
     CC=icx
-    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL=ON -DLLAMA_SYCL_F16=OFF"
+    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF"
     BUILD_DIR="../build/linux/${ARCH}/oneapi"
     EXTRA_LIBS="-fsycl -Wl,-rpath,${ONEAPI_ROOT}/compiler/latest/lib,-rpath,${ONEAPI_ROOT}/mkl/latest/lib,-rpath,${ONEAPI_ROOT}/tbb/latest/lib,-rpath,${ONEAPI_ROOT}/compiler/latest/opt/oclfpga/linux64/lib -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
     DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it
@@ -254,7 +254,7 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
         ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true)
     fi
     init_vars
-    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
+    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
     # Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
     if [ -n "${OLLAMA_CUSTOM_ROCM_DEFS}" ]; then
         echo "OLLAMA_CUSTOM_ROCM_DEFS=\"${OLLAMA_CUSTOM_ROCM_DEFS}\""
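The comments in gen_linux.sh map each instruction set to the CPU generation that introduced it, and the script reads OLLAMA_CPU_TARGET and OLLAMA_CUSTOM_CUDA_DEFS so builders can narrow what gets generated. A sketch of combining them after this change (the CUDA architecture value 86 is only an example):

```
# check which vector extensions the host CPU reports (Linux)
grep -o -m1 -w -e avx2 -e avx512f /proc/cpuinfo

# build only the AVX2 CPU variant and pass custom CUDA flags with the new GGML_ names (illustrative)
OLLAMA_CPU_TARGET=cpu_avx2 \
OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=86" \
go generate ./...
```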
llm/generate/gen_windows.ps1 (view file @ 8f8e736b)

@@ -39,8 +39,8 @@ function init_vars {
     }
     $script:cmakeDefs = @(
         "-DBUILD_SHARED_LIBS=on",
-        "-DLLAMA_NATIVE=off",
-        "-DLLAMA_OPENMP=off"
+        "-DGGML_NATIVE=off",
+        "-DGGML_OPENMP=off"
         )
     $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
     $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
@@ -182,9 +182,9 @@ function cleanup {
 }
-# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
-# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
-# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
+# -DGGML_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
+# -DGGML_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
+# -DGGML_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
 function build_static() {
@@ -204,13 +204,13 @@ function build_static() {
         "-DCMAKE_C_COMPILER=gcc.exe",
         "-DCMAKE_CXX_COMPILER=g++.exe",
         "-DBUILD_SHARED_LIBS=off",
-        "-DLLAMA_NATIVE=off",
-        "-DLLAMA_AVX=off",
-        "-DLLAMA_AVX2=off",
-        "-DLLAMA_AVX512=off",
-        "-DLLAMA_F16C=off",
-        "-DLLAMA_FMA=off",
-        "-DLLAMA_OPENMP=off")
+        "-DGGML_NATIVE=off",
+        "-DGGML_AVX=off",
+        "-DGGML_AVX2=off",
+        "-DGGML_AVX512=off",
+        "-DGGML_F16C=off",
+        "-DGGML_FMA=off",
+        "-DGGML_OPENMP=off")
     $script:buildDir="../build/windows/${script:ARCH}_static"
     write-host "Building static library"
     build
@@ -224,7 +224,7 @@ function build_cpu($gen_arch) {
     if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
         # remaining llama.cpp builds use MSVC
         init_vars
-        $script:cmakeDefs = $script:commonCpuDefs + @("-A", $gen_arch, "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
+        $script:cmakeDefs = $script:commonCpuDefs + @("-A", $gen_arch, "-DGGML_AVX=off", "-DGGML_AVX2=off", "-DGGML_AVX512=off", "-DGGML_FMA=off", "-DGGML_F16C=off") + $script:cmakeDefs
        $script:buildDir="../build/windows/${script:ARCH}/cpu"
        $script:distDir="$script:DIST_BASE\cpu"
        write-host "Building LCD CPU"
@@ -239,7 +239,7 @@ function build_cpu_avx() {
     if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx"))) {
         init_vars
-        $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
+        $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DGGML_AVX=on", "-DGGML_AVX2=off", "-DGGML_AVX512=off", "-DGGML_FMA=off", "-DGGML_F16C=off") + $script:cmakeDefs
        $script:buildDir="../build/windows/${script:ARCH}/cpu_avx"
        $script:distDir="$script:DIST_BASE\cpu_avx"
        write-host "Building AVX CPU"
@@ -254,7 +254,7 @@ function build_cpu_avx2() {
     if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx2"))) {
         init_vars
-        $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
+        $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DGGML_AVX=on", "-DGGML_AVX2=on", "-DGGML_AVX512=off", "-DGGML_FMA=on", "-DGGML_F16C=on") + $script:cmakeDefs
        $script:buildDir="../build/windows/${script:ARCH}/cpu_avx2"
        $script:distDir="$script:DIST_BASE\cpu_avx2"
        write-host "Building AVX2 CPU"
@@ -279,9 +279,9 @@ function build_cuda() {
         $script:distDir="$script:DIST_BASE\cuda$script:CUDA_VARIANT"
         $script:cmakeDefs += @(
             "-A", "x64",
-            "-DLLAMA_CUDA=ON",
-            "-DLLAMA_AVX=on",
-            "-DLLAMA_AVX2=off",
+            "-DGGML_CUDA=ON",
+            "-DGGML_AVX=on",
+            "-DGGML_AVX2=off",
             "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR",
             "-DCMAKE_CUDA_FLAGS=-t8",
             "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}"
@@ -319,7 +319,7 @@ function build_oneapi() {
     $script:distDir ="$script:DIST_BASE\oneapi$script:ONEAPI_VARIANT"
     $script:cmakeDefs += @(
         "-G", "MinGW Makefiles",
-        "-DLLAMA_SYCL=ON",
+        "-DGGML_SYCL=ON",
         "-DCMAKE_C_COMPILER=icx",
         "-DCMAKE_CXX_COMPILER=icx",
         "-DCMAKE_BUILD_TYPE=Release"
@@ -365,10 +365,10 @@ function build_rocm() {
             "-G", "Ninja",
             "-DCMAKE_C_COMPILER=clang.exe",
             "-DCMAKE_CXX_COMPILER=clang++.exe",
-            "-DLLAMA_HIPBLAS=on",
+            "-DGGML_HIPBLAS=on",
             "-DHIP_PLATFORM=amd",
-            "-DLLAMA_AVX=on",
-            "-DLLAMA_AVX2=off",
+            "-DGGML_AVX=on",
+            "-DGGML_AVX2=off",
             "-DCMAKE_POSITION_INDEPENDENT_CODE=on",
             "-DAMDGPU_TARGETS=$(amdGPUs)",
             "-DGPU_TARGETS=$(amdGPUs)"
llm/llama.cpp @ d7fd29ff (compare 7c26775a...d7fd29ff)

-Subproject commit 7c26775adb579e92b59c82e8084c07a1d0f75e9c
+Subproject commit d7fd29fff16456ce9c3a23fd2d09a66256b05aff
llm/llm.go (view file @ 8f8e736b)

 package llm
-// #cgo CFLAGS: -Illama.cpp
-// #cgo darwin,arm64 LDFLAGS: ${SRCDIR}/build/darwin/arm64_static/libllama.a -lstdc++
-// #cgo darwin,amd64 LDFLAGS: ${SRCDIR}/build/darwin/x86_64_static/libllama.a -lstdc++
-// #cgo windows,amd64 LDFLAGS: ${SRCDIR}/build/windows/amd64_static/libllama.a -static -lstdc++
-// #cgo windows,arm64 LDFLAGS: ${SRCDIR}/build/windows/arm64_static/libllama.a -static -lstdc++
-// #cgo linux,amd64 LDFLAGS: ${SRCDIR}/build/linux/x86_64_static/libllama.a -lstdc++
-// #cgo linux,arm64 LDFLAGS: ${SRCDIR}/build/linux/arm64_static/libllama.a -lstdc++
+// #cgo CFLAGS: -Illama.cpp/include -Illama.cpp/ggml/include
+// #cgo darwin,arm64 LDFLAGS: ${SRCDIR}/build/darwin/arm64_static/src/libllama.a ${SRCDIR}/build/darwin/arm64_static/ggml/src/libggml.a -lstdc++ -framework Accelerate -framework Metal
+// #cgo darwin,amd64 LDFLAGS: ${SRCDIR}/build/darwin/x86_64_static/src/libllama.a ${SRCDIR}/build/darwin/x86_64_static/ggml/src/libggml.a -lstdc++ -framework Accelerate -framework Metal
+// #cgo windows,amd64 LDFLAGS: ${SRCDIR}/build/windows/amd64_static/src/libllama.a ${SRCDIR}/build/windows/amd64_static/ggml/src/libggml.a -static -lstdc++
+// #cgo windows,arm64 LDFLAGS: ${SRCDIR}/build/windows/arm64_static/src/libllama.a ${SRCDIR}/build/windows/arm64_static/ggml/src/libggml.a -static -lstdc++
+// #cgo linux,amd64 LDFLAGS: ${SRCDIR}/build/linux/x86_64_static/src/libllama.a ${SRCDIR}/build/linux/x86_64_static/ggml/src/libggml.a -lstdc++
+// #cgo linux,arm64 LDFLAGS: ${SRCDIR}/build/linux/arm64_static/src/libllama.a ${SRCDIR}/build/linux/arm64_static/ggml/src/libggml.a -lstdc++
 // #include <stdlib.h>
 // #include "llama.h"
 import "C"
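The cgo directives now reference the split archives that the updated llama.cpp build tree produces, libllama.a under src/ and libggml.a under ggml/src/. Under that assumption, a Linux build would lay the archives out as follows before `go build` links them (paths follow the LDFLAGS above; the tree itself is created by the generate step):

```
# regenerate the static archives, then confirm the paths the cgo LDFLAGS expect (illustrative)
go generate ./...
ls llm/build/linux/x86_64_static/src/libllama.a \
   llm/build/linux/x86_64_static/ggml/src/libggml.a
go build .
```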
llm/patches/01-load-progress.diff (view file @ 8f8e736b)

 diff --git a/common/common.cpp b/common/common.cpp
-index 73ff0e85..6adb1a92 100644
+index 2c05a4d4..927f0e3d 100644
 --- a/common/common.cpp
 +++ b/common/common.cpp
-@@ -2447,6 +2447,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
+@@ -2093,6 +2093,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
      mparams.use_mmap = params.use_mmap;
      mparams.use_mlock = params.use_mlock;
      mparams.check_tensors = params.check_tensors;
@@ -12,10 +12,10 @@ index 73ff0e85..6adb1a92 100644
      mparams.kv_overrides = NULL;
  } else {
 diff --git a/common/common.h b/common/common.h
-index 58ed72f4..0bb2605e 100644
+index 65c0ef81..ebca2c77 100644
 --- a/common/common.h
 +++ b/common/common.h
-@@ -180,6 +180,13 @@ struct gpt_params {
+@@ -184,6 +184,13 @@ struct gpt_params {
      std::string mmproj = "";        // path to multimodal projector
      std::vector<std::string> image; // path to image file(s)
@@ -26,6 +26,6 @@ index 58ed72f4..0bb2605e 100644
 +    // context pointer passed to the progress callback
 +    void * progress_callback_user_data;
 +
-     // server params
-     int32_t port = 8080;         // server listens on this network port
-     int32_t timeout_read = 600;  // http read timeout in seconds
+     // embedding
+     bool embedding = false;      // get only sentence embedding
+     int32_t embd_normalize = 2;  // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
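The files under llm/patches/ are applied to the submodule checkout before it is built, so the hunks in this commit are simply rebased against the new llama.cpp layout, where the sources now live under src/ and ggml/src/. Applying one of them by hand for inspection might look like this (a sketch, not part of the scripts shown here):

```
# dry-run and then apply a rebased patch to the vendored checkout (illustrative)
cd llm/llama.cpp
git apply --check ../patches/01-load-progress.diff
git apply ../patches/01-load-progress.diff
```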
llm/patches/03-load_exception.diff (view file @ 8f8e736b)

-From 544a2d2e646d39e878d87dfbb3398a356bc560ab Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Thu, 23 May 2024 11:18:45 -0700
-Subject: [PATCH] throw exception on load errors
----
- llama.cpp | 25 ++++++++++++++++---------
- 1 file changed, 16 insertions(+), 9 deletions(-)
-diff --git a/llama.cpp b/llama.cpp
-index 15c66077..8ba90b6a 100644
---- a/llama.cpp
-+++ b/llama.cpp
-@@ -6346,7 +6346,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
+diff --git a/src/llama.cpp b/src/llama.cpp
+index 73f52435..58a00fb1 100644
+--- a/src/llama.cpp
++++ b/src/llama.cpp
+@@ -7241,7 +7241,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
          }
      } catch (const std::exception & err) {
          LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
@@ -20,7 +11,7 @@ index 15c66077..8ba90b6a 100644
      }
      return 0;
-@@ -15600,16 +15600,23 @@ struct llama_model * llama_load_model_from_file(
+@@ -17564,16 +17564,23 @@ struct llama_model * llama_load_model_from_file(
      }
      model->rpc_servers.push_back(servers);
  }
@@ -52,6 +43,3 @@ index 15c66077..8ba90b6a 100644
  }
      return model;
---
-2.45.1
llm/patches/04-metal.diff (view file @ 8f8e736b)

-diff --git a/ggml-metal.m b/ggml-metal.m
+diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
 index 0207b787..b5e9884b 100644
---- a/ggml-metal.m
-+++ b/ggml-metal.m
+--- a/ggml/src/ggml-metal.m
++++ b/ggml/src/ggml-metal.m
 @@ -1396,27 +1396,23 @@ static enum ggml_status ggml_metal_graph_compute(
      // to the matrix-vector kernel
      int ne11_mm_min = 1;
llm/patches/05-default-pretokenizer.diff (view file @ 8f8e736b)

-diff --git a/llama.cpp b/llama.cpp
-index 61948751..4b72a293 100644
---- a/llama.cpp
-+++ b/llama.cpp
-@@ -4824,16 +4824,7 @@ static void llm_load_vocab(
+diff --git a/src/llama.cpp b/src/llama.cpp
+index 73f52435..2b81b4bd 100644
+--- a/src/llama.cpp
++++ b/src/llama.cpp
+@@ -5092,16 +5092,7 @@ static void llm_load_vocab(
      // for now, only BPE models have pre-tokenizers
      if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
@@ -20,13 +20,13 @@ index 61948751..4b72a293 100644
          vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
      } else if (
          tokenizer_pre == "llama3" ||
-@@ -4888,7 +4879,8 @@ static void llm_load_vocab(
-             tokenizer_pre == "poro-chat") {
-             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
+@@ -5164,7 +5155,8 @@ static void llm_load_vocab(
+             tokenizer_pre == "jais") {
+             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
          } else {
 -            throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
 +            LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
 +            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
          }
-     } else {
+     } else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
          vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
llm/patches/06-qwen2.diff (view file @ 8f8e736b)

-diff --git a/llama.cpp b/llama.cpp
+diff --git a/src/llama.cpp b/src/llama.cpp
 index 40d2ec2c..f34eb79a 100644
---- a/llama.cpp
-+++ b/llama.cpp
+--- a/src/llama.cpp
++++ b/src/llama.cpp
 @@ -6943,7 +6943,7 @@ static struct ggml_tensor * llm_build_kqv(
      struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
      cb(kq, "kq", il);
llm/patches/07-embeddings.diff (new file, 0 → 100644, view file @ 8f8e736b)
diff --git a/src/llama.cpp b/src/llama.cpp
index 1fe2b9f7..a43312a7 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -13689,7 +13689,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
- const bool has_logits = !cparams.embeddings;
+ const bool has_logits = cparams.causal_attn;
const bool has_embd = lctx.is_encoding || (cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE));
const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
@@ -13959,17 +13959,25 @@ static int llama_decode_internal(
// no output
res = nullptr;
embd = nullptr;
- } else if (cparams.embeddings) {
- res = nullptr; // do not extract logits for embedding case
- embd = gf->nodes[gf->n_nodes - 1];
- if (strcmp(embd->name, "result_embd_pooled") != 0) {
- embd = gf->nodes[gf->n_nodes - 2];
+ }
+
+ if (cparams.embeddings) {
+ for (int i = gf->n_nodes - 1; i >= 0; --i) {
+ embd = gf->nodes[i];
+ if (strcmp(embd->name, "result_embd_pooled") == 0) {
+ break;
+ }
}
GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
- } else {
+ } else {
embd = nullptr; // do not extract embeddings when not needed
GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
}
+
+ if (!cparams.causal_attn) {
+ res = nullptr; // do not extract logits when not needed
+ }
+
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
ggml_backend_sched_alloc_graph(lctx.sched, gf);
llm/patches/07-gemma.diff (deleted, 100644 → 0, view file @ d89454de)
From 5cadb45f39d001ffbad95b690d6cf0abcb4a6d96 Mon Sep 17 00:00:00 2001
From: Ollama maintainers <hello@ollama.com>
Date: Wed, 26 Jun 2024 16:18:09 -0700
Subject: [PATCH] Architecture support
---
llama.cpp | 194 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 193 insertions(+), 1 deletion(-)
diff --git a/llama.cpp b/llama.cpp
index 61948751..3b4196f5 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -217,6 +217,7 @@ enum llm_arch {
LLM_ARCH_INTERNLM2,
LLM_ARCH_MINICPM,
LLM_ARCH_GEMMA,
+ LLM_ARCH_GEMMA2,
LLM_ARCH_STARCODER2,
LLM_ARCH_MAMBA,
LLM_ARCH_XVERSE,
@@ -255,6 +256,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_INTERNLM2, "internlm2" },
{ LLM_ARCH_MINICPM, "minicpm" },
{ LLM_ARCH_GEMMA, "gemma" },
+ { LLM_ARCH_GEMMA2, "gemma2" },
{ LLM_ARCH_STARCODER2, "starcoder2" },
{ LLM_ARCH_MAMBA, "mamba" },
{ LLM_ARCH_XVERSE, "xverse" },
@@ -464,10 +466,12 @@ enum llm_tensor {
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_NORM_2,
LLM_TENSOR_ATTN_OUT_NORM,
+ LLM_TENSOR_ATTN_POST_NORM,
LLM_TENSOR_ATTN_ROT_EMBD,
LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_GATE_INP_SHEXP,
LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_POST_NORM,
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
@@ -960,6 +964,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
+ {
+ LLM_ARCH_GEMMA2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+ },
+ },
{
LLM_ARCH_STARCODER2,
{
@@ -1941,6 +1963,8 @@ enum e_model {
MODEL_8x22B,
MODEL_16x12B,
MODEL_10B_128x3_66B,
+ MODEL_9B,
+ MODEL_27B,
};
static const size_t kiB = 1024;
@@ -2114,6 +2138,7 @@ struct llama_layer {
struct ggml_tensor * attn_out_norm_b;
struct ggml_tensor * attn_q_a_norm;
struct ggml_tensor * attn_kv_a_norm;
+ struct ggml_tensor * attn_post_norm;
// attention
struct ggml_tensor * wq;
@@ -2136,6 +2161,7 @@ struct llama_layer {
// normalization
struct ggml_tensor * ffn_norm;
struct ggml_tensor * ffn_norm_b;
+ struct ggml_tensor * ffn_post_norm;
struct ggml_tensor * layer_out_norm;
struct ggml_tensor * layer_out_norm_b;
struct ggml_tensor * ffn_norm_exps;
@@ -4529,6 +4555,16 @@ static void llm_load_hparams(
}
} break;
case LLM_ARCH_GEMMA:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 18: model.type = e_model::MODEL_9B; break;
+ case 28: model.type = e_model::MODEL_27B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_GEMMA2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -6305,6 +6341,40 @@ static bool llm_load_tensors(
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
}
} break;
+ case LLM_ARCH_GEMMA2:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
+
+ const int64_t n_ff = hparams.n_ff;
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+
+ for (uint32_t i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
+ layer.attn_post_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd});
+ }
+ } break;
case LLM_ARCH_STARCODER2:
{
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -10614,6 +10684,123 @@ struct llm_build_context {
return gf;
}
+ struct ggml_cgraph * build_gemma2() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+ inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+ cb(inpL, "inp_scaled", -1);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+ for (int il = 0; il < n_layer; ++il) {
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
+ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ cb(Qcur, "Qcur", il);
+
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
+ cb(Qcur, "Qcur_scaled", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
+ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+ model.layers[il].wo, NULL,
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.layers[il].attn_post_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_post_norm", il);
+
+ struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+ cb(sa_out, "sa_out", il);
+
+ cur = llm_build_norm(ctx0, sa_out, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ // feed-forward network
+ {
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.layers[il].ffn_post_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "ffn_post_norm", -1);
+
+ cur = ggml_add(ctx0, cur, sa_out);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
struct ggml_cgraph * build_starcoder2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
@@ -11847,6 +12034,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_gemma();
} break;
+ case LLM_ARCH_GEMMA2:
+ {
+ result = llm.build_gemma2();
+ } break;
case LLM_ARCH_STARCODER2:
{
result = llm.build_starcoder2();
@@ -16671,6 +16862,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_PHI2:
case LLM_ARCH_PHI3:
case LLM_ARCH_GEMMA:
+ case LLM_ARCH_GEMMA2:
case LLM_ARCH_STARCODER2:
case LLM_ARCH_GPTNEOX:
return LLAMA_ROPE_TYPE_NEOX;
@@ -18551,7 +18743,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "<s>assistant\n";
}
- } else if (tmpl == "gemma" || tmpl.find("<start_of_turn>") != std::string::npos) {
+ } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl.find("<start_of_turn>") != std::string::npos) {
// google/gemma-7b-it
std::string system_prompt = "";
for (auto message : chat) {
--
2.45.2
llm/patches/09-pooling.diff (view file @ 8f8e736b)

-diff --git a/llama.cpp b/llama.cpp
-index 61948751..61fe7b57 100644
---- a/llama.cpp
-+++ b/llama.cpp
-@@ -7591,14 +7591,14 @@ struct llm_build_context {
+diff --git a/src/llama.cpp b/src/llama.cpp
+index 721b8f4e..cfe7ac40 100644
+--- a/src/llama.cpp
++++ b/src/llama.cpp
+@@ -8420,14 +8420,14 @@ struct llm_build_context {
      }
      struct ggml_tensor * build_inp_mean() {
@@ -19,7 +19,7 @@ index 61948751..61fe7b57 100644
          cb(lctx.inp_cls, "inp_cls", -1);
          ggml_set_input(lctx.inp_cls);
          return lctx.inp_cls;
-@@ -12062,19 +12062,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
+@@ -13847,19 +13847,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
          GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
          float * data = (float *) lctx.inp_mean->data;
@@ -42,7 +42,7 @@ index 61948751..61fe7b57 100644
              const uint64_t s = sum[i];
              if (s > 0) {
                  div[i] = 1.0f/float(s);
-@@ -12094,14 +12091,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
+@@ -13879,14 +13876,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
          GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
          uint32_t * data = (uint32_t *) lctx.inp_cls->data;