Commit 8f8e736b (unverified)
Authored Jul 05, 2024 by Jeffrey Morgan · Committed by GitHub, Jul 05, 2024
update llama.cpp submodule to `d7fd29f` (#5475)
Parent: d89454de

Showing 15 changed files with 150 additions and 422 deletions (+150 −422)
docs/development.md                        +1   −1
llm/ext_server/CMakeLists.txt              +13  −13
llm/generate/gen_darwin.sh                 +8   −8
llm/generate/gen_linux.sh                  +18  −18
llm/generate/gen_windows.ps1               +22  −22
llm/llama.cpp                              +1   −1
llm/llm.go                                 +7   −7
llm/patches/01-load-progress.diff          +7   −7
llm/patches/03-load_exception.diff         +6   −18
llm/patches/04-metal.diff                  +3   −3
llm/patches/05-default-pretokenizer.diff   +9   −9
llm/patches/06-qwen2.diff                  +3   −3
llm/patches/07-embeddings.diff             +45  −0
llm/patches/07-gemma.diff                  +0   −305
llm/patches/09-pooling.diff                +7   −7
docs/development.md
...
...
@@ -104,7 +104,7 @@ like to use. For example, to compile an optimized binary for an Intel i9-9880H,
 you might use:
 ```
-OLLAMA_CUSTOM_CPU_DEFS="-DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_F16C=on -DLLAMA_FMA=on" go generate ./...
+OLLAMA_CUSTOM_CPU_DEFS="-DGGML_AVX=on -DGGML_AVX2=on -DGGML_F16C=on -DGGML_FMA=on" go generate ./...
 go build .
 ```
...
...
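The documented build command above switches the custom CPU flags from the LLAMA_ prefix to the GGML_ prefix. A minimal sketch of a build using the renamed options (illustrative only; which flags you enable depends on your CPU):

```
# Illustrative: enable AVX/AVX2/F16C/FMA via the renamed GGML_* cmake options,
# then build the ollama binary as described in docs/development.md.
OLLAMA_CUSTOM_CPU_DEFS="-DGGML_AVX=on -DGGML_AVX2=on -DGGML_F16C=on -DGGML_FMA=on" go generate ./...
go build .
```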
llm/ext_server/CMakeLists.txt
...
...
@@ -7,7 +7,7 @@ install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
     SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
-target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT})
 if (WIN32)
     TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
...
...
llm/generate/gen_darwin.sh
...
...
@@ -18,16 +18,16 @@ sign() {
     fi
 }

-COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_OPENMP=off"
+COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"

 case "${GOARCH}" in
 "amd64")
-    COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_NATIVE=off"
+    COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DGGML_METAL=off -DGGML_NATIVE=off"

     # Static build for linking into the Go binary
     init_vars
     CMAKE_TARGETS="--target llama --target ggml"
-    CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_BLAS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+    CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DGGML_BLAS=off -DGGML_ACCELERATE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
     BUILD_DIR="../build/darwin/${ARCH}_static"
     echo "Building static library"
     build
...
...
@@ -37,7 +37,7 @@ case "${GOARCH}" in
         # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
         #
         init_vars
-        CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_BLAS=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+        CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
         BUILD_DIR="../build/darwin/${ARCH}/cpu"
         echo "Building LCD CPU"
         build
...
...
@@ -49,7 +49,7 @@ case "${GOARCH}" in
         # Approximately 400% faster than LCD on same CPU
         #
         init_vars
-        CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+        CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
         BUILD_DIR="../build/darwin/${ARCH}/cpu_avx"
         echo "Building AVX CPU"
         build
...
...
@@ -61,7 +61,7 @@ case "${GOARCH}" in
         # Approximately 10% faster than AVX on same CPU
         #
         init_vars
-        CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
+        CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=on -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
         BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2"
         echo "Building AVX2 CPU"
         EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
...
...
@@ -75,14 +75,14 @@ case "${GOARCH}" in
     # Static build for linking into the Go binary
     init_vars
     CMAKE_TARGETS="--target llama --target ggml"
-    CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_BLAS=off -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+    CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}"
     BUILD_DIR="../build/darwin/${ARCH}_static"
     echo "Building static library"
     build

     if [ -z "$OLLAMA_SKIP_METAL_GENERATE" ]; then
         init_vars
-        CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
+        CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}"
         BUILD_DIR="../build/darwin/${ARCH}/metal"
         EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
         build
...
...
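The Darwin script keeps its OLLAMA_SKIP_* escape hatches; only the cmake option prefix changes. A hedged example of using one of them during regeneration (illustrative invocation, not part of this commit):

```
# Illustrative: on macOS, skip generating the Metal variant and build only the CPU libraries.
OLLAMA_SKIP_METAL_GENERATE=1 go generate ./...
```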
llm/generate/gen_linux.sh
...
...
@@ -51,7 +51,7 @@ if [ -z "${CUDACXX}" ]; then
         export CUDACXX=$(command -v nvcc)
     fi
 fi
-COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off"
+COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
 source $(dirname $0)/gen_common.sh
 init_vars
 git_module_setup
...
...
@@ -64,7 +64,7 @@ if [ -z "${OLLAMA_SKIP_STATIC_GENERATE}" -o "${OLLAMA_CPU_TARGET}" = "static" ];
     # Static build for linking into the Go binary
     init_vars
     CMAKE_TARGETS="--target llama --target ggml"
-    CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off ${CMAKE_DEFS}"
+    CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off ${CMAKE_DEFS}"
     BUILD_DIR="../build/linux/${ARCH}_static"
     echo "Building static library"
     build
...
...
@@ -84,22 +84,22 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
         compress
     else
         # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
-        # -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
-        # -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
-        # -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
-        # -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
+        # -DGGML_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
+        # -DGGML_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
+        # -DGGML_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
+        # -DGGML_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
         # Note: the following seem to yield slower results than AVX2 - ymmv
-        # -DLLAMA_AVX512 -- 2017 Intel Skylake and High End DeskTop (HEDT)
-        # -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake
-        # -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake
+        # -DGGML_AVX512 -- 2017 Intel Skylake and High End DeskTop (HEDT)
+        # -DGGML_AVX512_VBMI -- 2018 Intel Cannon Lake
+        # -DGGML_AVX512_VNNI -- 2021 Intel Alder Lake

-        COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_OPENMP=off"
+        COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
         if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
             #
             # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
             #
             init_vars
-            CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+            CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
             BUILD_DIR="../build/linux/${ARCH}/cpu"
             echo "Building LCD CPU"
             build
...
...
@@ -116,7 +116,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
             # Approximately 400% faster than LCD on same CPU
             #
             init_vars
-            CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+            CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
             BUILD_DIR="../build/linux/${ARCH}/cpu_avx"
             echo "Building AVX CPU"
             build
...
...
@@ -129,7 +129,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
             # Approximately 10% faster than AVX on same CPU
             #
             init_vars
-            CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
+            CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
             BUILD_DIR="../build/linux/${ARCH}/cpu_avx2"
             echo "Building AVX2 CPU"
             build
...
...
@@ -170,15 +170,15 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
         #
         # CUDA compute < 6.0 lacks proper FP16 support on ARM.
        # Disabling has minimal performance effect while maintaining compatibility.
-        ARM64_DEFS="-DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_CUDA_F16=off"
+        ARM64_DEFS="-DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_CUDA_F16=off"
     fi
     # Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
     if [ -n "${OLLAMA_CUSTOM_CUDA_DEFS}" ]; then
         echo "OLLAMA_CUSTOM_CUDA_DEFS=\"${OLLAMA_CUSTOM_CUDA_DEFS}\""
-        CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
+        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
         echo "Building custom CUDA GPU"
     else
-        CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
+        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DGGML_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} -DCMAKE_LIBRARY_PATH=/usr/local/cuda/compat"
     fi
     CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}"
     BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
...
...
@@ -216,7 +216,7 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
     init_vars
     source ${ONEAPI_ROOT}/setvars.sh --force # set up environment variables for oneAPI
     CC=icx
-    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL=ON -DLLAMA_SYCL_F16=OFF"
+    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF"
     BUILD_DIR="../build/linux/${ARCH}/oneapi"
     EXTRA_LIBS="-fsycl -Wl,-rpath,${ONEAPI_ROOT}/compiler/latest/lib,-rpath,${ONEAPI_ROOT}/mkl/latest/lib,-rpath,${ONEAPI_ROOT}/tbb/latest/lib,-rpath,${ONEAPI_ROOT}/compiler/latest/opt/oclfpga/linux64/lib -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
     DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it
...
...
@@ -254,7 +254,7 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
         ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true)
     fi
     init_vars
-    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
+    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
     # Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
     if [ -n "${OLLAMA_CUSTOM_ROCM_DEFS}" ]; then
         echo "OLLAMA_CUSTOM_ROCM_DEFS=\"${OLLAMA_CUSTOM_ROCM_DEFS}\""
...
...
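gen_linux.sh also forwards user-supplied cmake flags through OLLAMA_CUSTOM_CUDA_DEFS and OLLAMA_CUSTOM_ROCM_DEFS, so any local overrides now need the GGML_ prefix as well. A hedged sketch (the architecture values and flag choices are illustrative, not part of this commit):

```
# Illustrative: custom CUDA configuration using the renamed GGML_* options.
OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA=on -DGGML_CUDA_F16=on -DCMAKE_CUDA_ARCHITECTURES=86" go generate ./...

# Illustrative: custom ROCm configuration with the renamed hipBLAS switch.
OLLAMA_CUSTOM_ROCM_DEFS="-DGGML_HIPBLAS=on -DAMDGPU_TARGETS=gfx1030" go generate ./...
```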
llm/generate/gen_windows.ps1
...
...
@@ -39,8 +39,8 @@ function init_vars {
     }
     $script:cmakeDefs = @(
         "-DBUILD_SHARED_LIBS=on",
-        "-DLLAMA_NATIVE=off",
-        "-DLLAMA_OPENMP=off"
+        "-DGGML_NATIVE=off",
+        "-DGGML_OPENMP=off"
         )
     $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
     $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
...
...
@@ -182,9 +182,9 @@ function cleanup {
 }

-# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
-# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
-# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
+# -DGGML_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
+# -DGGML_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
+# -DGGML_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver

 function build_static() {
...
...
@@ -204,13 +204,13 @@ function build_static() {
             "-DCMAKE_C_COMPILER=gcc.exe",
             "-DCMAKE_CXX_COMPILER=g++.exe",
             "-DBUILD_SHARED_LIBS=off",
-            "-DLLAMA_NATIVE=off",
-            "-DLLAMA_AVX=off",
-            "-DLLAMA_AVX2=off",
-            "-DLLAMA_AVX512=off",
-            "-DLLAMA_F16C=off",
-            "-DLLAMA_FMA=off",
-            "-DLLAMA_OPENMP=off")
+            "-DGGML_NATIVE=off",
+            "-DGGML_AVX=off",
+            "-DGGML_AVX2=off",
+            "-DGGML_AVX512=off",
+            "-DGGML_F16C=off",
+            "-DGGML_FMA=off",
+            "-DGGML_OPENMP=off")
         $script:buildDir="../build/windows/${script:ARCH}_static"
         write-host "Building static library"
         build
...
...
@@ -224,7 +224,7 @@ function build_cpu($gen_arch) {
     if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
         # remaining llama.cpp builds use MSVC
         init_vars
-        $script:cmakeDefs = $script:commonCpuDefs + @("-A", $gen_arch, "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
+        $script:cmakeDefs = $script:commonCpuDefs + @("-A", $gen_arch, "-DGGML_AVX=off", "-DGGML_AVX2=off", "-DGGML_AVX512=off", "-DGGML_FMA=off", "-DGGML_F16C=off") + $script:cmakeDefs
        $script:buildDir="../build/windows/${script:ARCH}/cpu"
        $script:distDir="$script:DIST_BASE\cpu"
        write-host "Building LCD CPU"
...
...
@@ -239,7 +239,7 @@ function build_cpu($gen_arch) {
 function build_cpu_avx() {
     if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx"))) {
         init_vars
-        $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
+        $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DGGML_AVX=on", "-DGGML_AVX2=off", "-DGGML_AVX512=off", "-DGGML_FMA=off", "-DGGML_F16C=off") + $script:cmakeDefs
        $script:buildDir="../build/windows/${script:ARCH}/cpu_avx"
        $script:distDir="$script:DIST_BASE\cpu_avx"
        write-host "Building AVX CPU"
...
...
@@ -254,7 +254,7 @@ function build_cpu_avx() {
 function build_cpu_avx2() {
     if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx2"))) {
         init_vars
-        $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
+        $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DGGML_AVX=on", "-DGGML_AVX2=on", "-DGGML_AVX512=off", "-DGGML_FMA=on", "-DGGML_F16C=on") + $script:cmakeDefs
        $script:buildDir="../build/windows/${script:ARCH}/cpu_avx2"
        $script:distDir="$script:DIST_BASE\cpu_avx2"
        write-host "Building AVX2 CPU"
...
...
@@ -279,9 +279,9 @@ function build_cuda() {
         $script:distDir="$script:DIST_BASE\cuda$script:CUDA_VARIANT"
         $script:cmakeDefs += @(
             "-A", "x64",
-            "-DLLAMA_CUDA=ON",
-            "-DLLAMA_AVX=on",
-            "-DLLAMA_AVX2=off",
+            "-DGGML_CUDA=ON",
+            "-DGGML_AVX=on",
+            "-DGGML_AVX2=off",
             "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR",
             "-DCMAKE_CUDA_FLAGS=-t8",
             "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}"
...
...
@@ -319,7 +319,7 @@ function build_oneapi() {
     $script:distDir="$script:DIST_BASE\oneapi$script:ONEAPI_VARIANT"
     $script:cmakeDefs += @(
         "-G", "MinGW Makefiles",
-        "-DLLAMA_SYCL=ON",
+        "-DGGML_SYCL=ON",
         "-DCMAKE_C_COMPILER=icx",
         "-DCMAKE_CXX_COMPILER=icx",
         "-DCMAKE_BUILD_TYPE=Release"
...
...
@@ -365,10 +365,10 @@ function build_rocm() {
             "-G", "Ninja",
             "-DCMAKE_C_COMPILER=clang.exe",
             "-DCMAKE_CXX_COMPILER=clang++.exe",
-            "-DLLAMA_HIPBLAS=on",
+            "-DGGML_HIPBLAS=on",
             "-DHIP_PLATFORM=amd",
-            "-DLLAMA_AVX=on",
-            "-DLLAMA_AVX2=off",
+            "-DGGML_AVX=on",
+            "-DGGML_AVX2=off",
             "-DCMAKE_POSITION_INDEPENDENT_CODE=on",
             "-DAMDGPU_TARGETS=$(amdGPUs)",
             "-DGPU_TARGETS=$(amdGPUs)"
...
...
llm/llama.cpp @ d7fd29ff (compare: 7c26775a...d7fd29ff)
-Subproject commit 7c26775adb579e92b59c82e8084c07a1d0f75e9c
+Subproject commit d7fd29fff16456ce9c3a23fd2d09a66256b05aff
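Since this bumps the vendored submodule, local checkouts need the submodule synced before regenerating. A minimal sketch, assuming standard git tooling and a checkout at the repository root:

```
# Illustrative: sync the llama.cpp submodule and confirm it is pinned to the new revision.
git submodule update --init --recursive llm/llama.cpp
git -C llm/llama.cpp rev-parse HEAD   # expect d7fd29fff16456ce9c3a23fd2d09a66256b05aff
```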
llm/llm.go
 package llm

-// #cgo CFLAGS: -Illama.cpp
-// #cgo darwin,arm64 LDFLAGS: ${SRCDIR}/build/darwin/arm64_static/libllama.a -lstdc++
-// #cgo darwin,amd64 LDFLAGS: ${SRCDIR}/build/darwin/x86_64_static/libllama.a -lstdc++
-// #cgo windows,amd64 LDFLAGS: ${SRCDIR}/build/windows/amd64_static/libllama.a -static -lstdc++
-// #cgo windows,arm64 LDFLAGS: ${SRCDIR}/build/windows/arm64_static/libllama.a -static -lstdc++
-// #cgo linux,amd64 LDFLAGS: ${SRCDIR}/build/linux/x86_64_static/libllama.a -lstdc++
-// #cgo linux,arm64 LDFLAGS: ${SRCDIR}/build/linux/arm64_static/libllama.a -lstdc++
+// #cgo CFLAGS: -Illama.cpp/include -Illama.cpp/ggml/include
+// #cgo darwin,arm64 LDFLAGS: ${SRCDIR}/build/darwin/arm64_static/src/libllama.a ${SRCDIR}/build/darwin/arm64_static/ggml/src/libggml.a -lstdc++ -framework Accelerate -framework Metal
+// #cgo darwin,amd64 LDFLAGS: ${SRCDIR}/build/darwin/x86_64_static/src/libllama.a ${SRCDIR}/build/darwin/x86_64_static/ggml/src/libggml.a -lstdc++ -framework Accelerate -framework Metal
+// #cgo windows,amd64 LDFLAGS: ${SRCDIR}/build/windows/amd64_static/src/libllama.a ${SRCDIR}/build/windows/amd64_static/ggml/src/libggml.a -static -lstdc++
+// #cgo windows,arm64 LDFLAGS: ${SRCDIR}/build/windows/arm64_static/src/libllama.a ${SRCDIR}/build/windows/arm64_static/ggml/src/libggml.a -static -lstdc++
+// #cgo linux,amd64 LDFLAGS: ${SRCDIR}/build/linux/x86_64_static/src/libllama.a ${SRCDIR}/build/linux/x86_64_static/ggml/src/libggml.a -lstdc++
+// #cgo linux,arm64 LDFLAGS: ${SRCDIR}/build/linux/arm64_static/src/libllama.a ${SRCDIR}/build/linux/arm64_static/ggml/src/libggml.a -lstdc++
 // #include <stdlib.h>
 // #include "llama.h"
 import "C"
...
...
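The cgo directives above now link two static archives per platform, built from the submodule's new src/ and ggml/src/ layout. An illustrative sanity check (paths assume linux/amd64 and a checkout at the repository root; adjust for your platform):

```
# Illustrative: after `go generate ./...`, both archives referenced by the cgo
# LDFLAGS for linux/amd64 should exist before running `go build .`.
ls llm/build/linux/x86_64_static/src/libllama.a \
   llm/build/linux/x86_64_static/ggml/src/libggml.a
```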
llm/patches/01-load-progress.diff
 diff --git a/common/common.cpp b/common/common.cpp
-index 73ff0e85..6adb1a92 100644
+index 2c05a4d4..927f0e3d 100644
 --- a/common/common.cpp
 +++ b/common/common.cpp
-@@ -2447,6 +2447,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
+@@ -2093,6 +2093,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
      mparams.use_mmap = params.use_mmap;
      mparams.use_mlock = params.use_mlock;
      mparams.check_tensors = params.check_tensors;
...
...
@@ -12,10 +12,10 @@ index 73ff0e85..6adb1a92 100644
      mparams.kv_overrides = NULL;
  } else {
  diff --git a/common/common.h b/common/common.h
-index 58ed72f4..0bb2605e 100644
+index 65c0ef81..ebca2c77 100644
 --- a/common/common.h
 +++ b/common/common.h
-@@ -180,6 +180,13 @@ struct gpt_params {
+@@ -184,6 +184,13 @@ struct gpt_params {
      std::string mmproj = "";        // path to multimodal projector
      std::vector<std::string> image; // path to image file(s)
...
...
@@ -26,6 +26,6 @@ index 58ed72f4..0bb2605e 100644
 +    // context pointer passed to the progress callback
 +    void * progress_callback_user_data;
 +
-     // server params
-     int32_t port           = 8080;  // server listens on this network port
-     int32_t timeout_read   = 600;   // http read timeout in seconds
+     // embedding
+     bool embedding         = false; // get only sentence embedding
+     int32_t embd_normalize = 2;     // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
llm/patches/03-load_exception.diff
-From 544a2d2e646d39e878d87dfbb3398a356bc560ab Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Thu, 23 May 2024 11:18:45 -0700
-Subject: [PATCH] throw exception on load errors
-
----
- llama.cpp | 25 ++++++++++++++++---------
- 1 file changed, 16 insertions(+), 9 deletions(-)
-
-diff --git a/llama.cpp b/llama.cpp
-index 15c66077..8ba90b6a 100644
---- a/llama.cpp
-+++ b/llama.cpp
-@@ -6346,7 +6346,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
+diff --git a/src/llama.cpp b/src/llama.cpp
+index 73f52435..58a00fb1 100644
+--- a/src/llama.cpp
++++ b/src/llama.cpp
+@@ -7241,7 +7241,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
          }
      } catch (const std::exception & err) {
          LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
...
...
@@ -20,7 +11,7 @@ index 15c66077..8ba90b6a 100644
      }
      return 0;
-@@ -15600,16 +15600,23 @@ struct llama_model * llama_load_model_from_file(
+@@ -17564,16 +17564,23 @@ struct llama_model * llama_load_model_from_file(
          }
          model->rpc_servers.push_back(servers);
      }
...
...
@@ -52,6 +43,3 @@ index 15c66077..8ba90b6a 100644
      }
      return model;
---
-2.45.1
llm/patches/04-metal.diff
-diff --git a/ggml-metal.m b/ggml-metal.m
+diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
 index 0207b787..b5e9884b 100644
---- a/ggml-metal.m
-+++ b/ggml-metal.m
+--- a/ggml/src/ggml-metal.m
++++ b/ggml/src/ggml-metal.m
 @@ -1396,27 +1396,23 @@ static enum ggml_status ggml_metal_graph_compute(
                  // to the matrix-vector kernel
                  int ne11_mm_min = 1;
...
...
llm/patches/05-default-pretokenizer.diff
-diff --git a/llama.cpp b/llama.cpp
-index 61948751..4b72a293 100644
---- a/llama.cpp
-+++ b/llama.cpp
-@@ -4824,16 +4824,7 @@ static void llm_load_vocab(
+diff --git a/src/llama.cpp b/src/llama.cpp
+index 73f52435..2b81b4bd 100644
+--- a/src/llama.cpp
++++ b/src/llama.cpp
+@@ -5092,16 +5092,7 @@ static void llm_load_vocab(
      // for now, only BPE models have pre-tokenizers
      if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
...
...
@@ -20,13 +20,13 @@ index 61948751..4b72a293 100644
              vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
          } else if (
              tokenizer_pre == "llama3" ||
-@@ -4888,7 +4879,8 @@ static void llm_load_vocab(
-             tokenizer_pre == "poro-chat") {
-             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
+@@ -5164,7 +5155,8 @@ static void llm_load_vocab(
+             tokenizer_pre == "jais") {
+             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
          } else {
 -            throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
 +            LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
 +            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
          }
-     } else {
+     } else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
          vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
llm/patches/06-qwen2.diff
-diff --git a/llama.cpp b/llama.cpp
+diff --git a/src/llama.cpp b/src/llama.cpp
 index 40d2ec2c..f34eb79a 100644
---- a/llama.cpp
-+++ b/llama.cpp
+--- a/src/llama.cpp
++++ b/src/llama.cpp
 @@ -6943,7 +6943,7 @@ static struct ggml_tensor * llm_build_kqv(
          struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
          cb(kq, "kq", il);
...
...
llm/patches/07-embeddings.diff (new file, 0 → 100644)
diff --git a/src/llama.cpp b/src/llama.cpp
index 1fe2b9f7..a43312a7 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -13689,7 +13689,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
- const bool has_logits = !cparams.embeddings;
+ const bool has_logits = cparams.causal_attn;
const bool has_embd = lctx.is_encoding || (cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE));
const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
@@ -13959,17 +13959,25 @@ static int llama_decode_internal(
// no output
res = nullptr;
embd = nullptr;
- } else if (cparams.embeddings) {
- res = nullptr; // do not extract logits for embedding case
- embd = gf->nodes[gf->n_nodes - 1];
- if (strcmp(embd->name, "result_embd_pooled") != 0) {
- embd = gf->nodes[gf->n_nodes - 2];
+ }
+
+ if (cparams.embeddings) {
+ for (int i = gf->n_nodes - 1; i >= 0; --i) {
+ embd = gf->nodes[i];
+ if (strcmp(embd->name, "result_embd_pooled") == 0) {
+ break;
+ }
}
GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
- } else {
+ } else {
embd = nullptr; // do not extract embeddings when not needed
GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
}
+
+ if (!cparams.causal_attn) {
+ res = nullptr; // do not extract logits when not needed
+ }
+
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
ggml_backend_sched_alloc_graph(lctx.sched, gf);
llm/patches/07-gemma.diff (deleted, 100644 → 0)
From 5cadb45f39d001ffbad95b690d6cf0abcb4a6d96 Mon Sep 17 00:00:00 2001
From: Ollama maintainers <hello@ollama.com>
Date: Wed, 26 Jun 2024 16:18:09 -0700
Subject: [PATCH] Architecture support
---
llama.cpp | 194 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 193 insertions(+), 1 deletion(-)
diff --git a/llama.cpp b/llama.cpp
index 61948751..3b4196f5 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -217,6 +217,7 @@ enum llm_arch {
LLM_ARCH_INTERNLM2,
LLM_ARCH_MINICPM,
LLM_ARCH_GEMMA,
+ LLM_ARCH_GEMMA2,
LLM_ARCH_STARCODER2,
LLM_ARCH_MAMBA,
LLM_ARCH_XVERSE,
@@ -255,6 +256,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_INTERNLM2, "internlm2" },
{ LLM_ARCH_MINICPM, "minicpm" },
{ LLM_ARCH_GEMMA, "gemma" },
+ { LLM_ARCH_GEMMA2, "gemma2" },
{ LLM_ARCH_STARCODER2, "starcoder2" },
{ LLM_ARCH_MAMBA, "mamba" },
{ LLM_ARCH_XVERSE, "xverse" },
@@ -464,10 +466,12 @@ enum llm_tensor {
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_NORM_2,
LLM_TENSOR_ATTN_OUT_NORM,
+ LLM_TENSOR_ATTN_POST_NORM,
LLM_TENSOR_ATTN_ROT_EMBD,
LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_GATE_INP_SHEXP,
LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_POST_NORM,
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
@@ -960,6 +964,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
+ {
+ LLM_ARCH_GEMMA2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+ },
+ },
{
LLM_ARCH_STARCODER2,
{
@@ -1941,6 +1963,8 @@ enum e_model {
MODEL_8x22B,
MODEL_16x12B,
MODEL_10B_128x3_66B,
+ MODEL_9B,
+ MODEL_27B,
};
static const size_t kiB = 1024;
@@ -2114,6 +2138,7 @@ struct llama_layer {
struct ggml_tensor * attn_out_norm_b;
struct ggml_tensor * attn_q_a_norm;
struct ggml_tensor * attn_kv_a_norm;
+ struct ggml_tensor * attn_post_norm;
// attention
struct ggml_tensor * wq;
@@ -2136,6 +2161,7 @@ struct llama_layer {
// normalization
struct ggml_tensor * ffn_norm;
struct ggml_tensor * ffn_norm_b;
+ struct ggml_tensor * ffn_post_norm;
struct ggml_tensor * layer_out_norm;
struct ggml_tensor * layer_out_norm_b;
struct ggml_tensor * ffn_norm_exps;
@@ -4529,6 +4555,16 @@ static void llm_load_hparams(
}
} break;
case LLM_ARCH_GEMMA:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 18: model.type = e_model::MODEL_9B; break;
+ case 28: model.type = e_model::MODEL_27B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_GEMMA2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -6305,6 +6341,40 @@ static bool llm_load_tensors(
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
}
} break;
+ case LLM_ARCH_GEMMA2:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
+
+ const int64_t n_ff = hparams.n_ff;
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+
+ for (uint32_t i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
+ layer.attn_post_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd});
+ }
+ } break;
case LLM_ARCH_STARCODER2:
{
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -10614,6 +10684,123 @@ struct llm_build_context {
return gf;
}
+ struct ggml_cgraph * build_gemma2() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+ inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+ cb(inpL, "inp_scaled", -1);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+ for (int il = 0; il < n_layer; ++il) {
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
+ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ cb(Qcur, "Qcur", il);
+
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
+ cb(Qcur, "Qcur_scaled", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
+ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+ model.layers[il].wo, NULL,
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.layers[il].attn_post_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_post_norm", il);
+
+ struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+ cb(sa_out, "sa_out", il);
+
+ cur = llm_build_norm(ctx0, sa_out, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ // feed-forward network
+ {
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.layers[il].ffn_post_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "ffn_post_norm", -1);
+
+ cur = ggml_add(ctx0, cur, sa_out);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
struct ggml_cgraph * build_starcoder2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
@@ -11847,6 +12034,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_gemma();
} break;
+ case LLM_ARCH_GEMMA2:
+ {
+ result = llm.build_gemma2();
+ } break;
case LLM_ARCH_STARCODER2:
{
result = llm.build_starcoder2();
@@ -16671,6 +16862,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_PHI2:
case LLM_ARCH_PHI3:
case LLM_ARCH_GEMMA:
+ case LLM_ARCH_GEMMA2:
case LLM_ARCH_STARCODER2:
case LLM_ARCH_GPTNEOX:
return LLAMA_ROPE_TYPE_NEOX;
@@ -18551,7 +18743,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "<s>assistant\n";
}
- } else if (tmpl == "gemma" || tmpl.find("<start_of_turn>") != std::string::npos) {
+ } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl.find("<start_of_turn>") != std::string::npos) {
// google/gemma-7b-it
std::string system_prompt = "";
for (auto message : chat) {
--
2.45.2
llm/patches/09-pooling.diff
-diff --git a/llama.cpp b/llama.cpp
-index 61948751..61fe7b57 100644
---- a/llama.cpp
-+++ b/llama.cpp
-@@ -7591,14 +7591,14 @@ struct llm_build_context {
+diff --git a/src/llama.cpp b/src/llama.cpp
+index 721b8f4e..cfe7ac40 100644
+--- a/src/llama.cpp
++++ b/src/llama.cpp
+@@ -8420,14 +8420,14 @@ struct llm_build_context {
      }
      struct ggml_tensor * build_inp_mean() {
...
...
@@ -19,7 +19,7 @@ index 61948751..61fe7b57 100644
          cb(lctx.inp_cls, "inp_cls", -1);
          ggml_set_input(lctx.inp_cls);
          return lctx.inp_cls;
-@@ -12062,19 +12062,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
+@@ -13847,19 +13847,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
          GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
          float * data = (float *) lctx.inp_mean->data;
...
...
@@ -42,7 +42,7 @@ index 61948751..61fe7b57 100644
              const uint64_t s = sum[i];
              if (s > 0) {
                  div[i] = 1.0f/float(s);
-@@ -12094,14 +12091,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
+@@ -13879,14 +13876,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
          GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
          uint32_t * data = (uint32_t *) lctx.inp_cls->data;
...
...