"vscode:/vscode.git/clone" did not exist on "5782e0393d1643383243c31629790491c5e69db8"
Commit 97ef6ff8 authored by xuxzh1

update

parent 4cc1a614
Pipeline #2023 canceled with stages
@@ -12,9 +12,9 @@ This example program provides the tools for llama.cpp for SYCL on Intel GPU.
List all SYCL devices with ID, compute capability, max work group size, ect.
-1. Build the llama.cpp for SYCL for all targets.
+1. Build the llama.cpp for SYCL for the specified target *(using GGML_SYCL_TARGET)*.
-2. Enable oneAPI running environment
+2. Enable oneAPI running environment *(if GGML_SYCL_TARGET is set to INTEL -default-)*
```
source /opt/intel/oneapi/setvars.sh
@@ -29,19 +29,13 @@ source /opt/intel/oneapi/setvars.sh
Check the ID in startup log, like:
```
-found 4 SYCL devices:
-  Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
-    max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
-  Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
-    max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
-  Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
-    max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
-  Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
-    max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
+found 2 SYCL devices:
+| | | | |Max | |Max |Global | |
+| | | | |compute|Max work|sub |mem | |
+|ID| Device Type| Name|Version|units |group |group|size | Driver version|
+|--|-------------------|---------------------------------------|-------|-------|--------|-----|-------|---------------------|
+| 0| [level_zero:gpu:0]| Intel Arc A770 Graphics| 1.3| 512| 1024| 32| 16225M| 1.3.29138|
+| 1| [level_zero:gpu:1]| Intel UHD Graphics 750| 1.3| 32| 512| 32| 62631M| 1.3.29138|
```
-|Attribute|Note|
-|-|-|
-|compute capability 1.3|Level-zero running time, recommended |
-|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
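For reference, a device table like the one above is produced by the SYCL device-listing tool built alongside the examples; a typical invocation looks like the sketch below (the binary name `llama-ls-sycl-device` is an assumption here and may differ depending on the build).

```sh
# assumed workflow: SYCL build finished and the oneAPI environment is active
source /opt/intel/oneapi/setvars.sh
./build/bin/llama-ls-sycl-device
```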
@@ -4,33 +4,24 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT
-INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
source /opt/intel/oneapi/setvars.sh
-if [ $# -gt 0 ]; then
-    GGML_SYCL_DEVICE=$1
-    GGML_SYCL_SINGLE_GPU=1
-else
-    GGML_SYCL_DEVICE=0
-    GGML_SYCL_SINGLE_GPU=0
-fi
#export GGML_SYCL_DEBUG=1
#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
-if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then
+INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
+MODEL_FILE=models/llama-2-7b.Q4_0.gguf
+NGL=33
+CONEXT=8192
+if [ $# -gt 0 ]; then
+    GGML_SYCL_DEVICE=$1
    echo "use $GGML_SYCL_DEVICE as main GPU"
    #use signle GPU only
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONEXT} -mg $GGML_SYCL_DEVICE -sm none
else
    #use multiple GPUs with same max compute units
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONEXT}
fi
-#use main GPU only
-#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
-#use multiple GPUs with same max compute units
-#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
#include "common.h" #include "common.h"
//#include "log.h" // TODO: start using log.h
#include "llama.h" #include "llama.h"
#include <cmath>
#include <cstdio> #include <cstdio>
#include <cstring>
#include <fstream> #include <fstream>
#include <string> #include <string>
#include <vector> #include <vector>
#include <iostream> // TODO: remove me
#if defined(_WIN32) #if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN
...@@ -13,25 +15,25 @@ ...@@ -13,25 +15,25 @@
#include <shellapi.h> // For CommandLineToArgvW #include <shellapi.h> // For CommandLineToArgvW
#endif #endif
static void print_usage_information(const char * argv0, FILE * stream) { static void print_usage_information(const char * argv0) {
fprintf(stream, "usage: %s [options]\n\n", argv0); printf("usage: %s [options]\n\n", argv0);
fprintf(stream, "The tokenize program tokenizes a prompt using a given model,\n"); printf("The tokenize program tokenizes a prompt using a given model,\n");
fprintf(stream, "and prints the resulting tokens to standard output.\n\n"); printf("and prints the resulting tokens to standard output.\n\n");
fprintf(stream, "It needs a model file, a prompt, and optionally other flags\n"); printf("It needs a model file, a prompt, and optionally other flags\n");
fprintf(stream, "to control the behavior of the tokenizer.\n\n"); printf("to control the behavior of the tokenizer.\n\n");
fprintf(stream, " The possible options are:\n"); printf(" The possible options are:\n");
fprintf(stream, "\n"); printf("\n");
fprintf(stream, " -h, --help print this help and exit\n"); printf(" -h, --help print this help and exit\n");
fprintf(stream, " -m MODEL_PATH, --model MODEL_PATH path to model.\n"); printf(" -m MODEL_PATH, --model MODEL_PATH path to model.\n");
fprintf(stream, " --ids if given, only print numerical token IDs, and not token strings.\n"); printf(" --ids if given, only print numerical token IDs, and not token strings.\n");
fprintf(stream, " The output format looks like [1, 2, 3], i.e. parseable by Python.\n"); printf(" The output format looks like [1, 2, 3], i.e. parseable by Python.\n");
fprintf(stream, " -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n"); printf(" -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n"); printf(" -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
fprintf(stream, " --stdin read prompt from standard input.\n"); printf(" --stdin read prompt from standard input.\n");
fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n"); printf(" --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
fprintf(stream, " --no-parse-special do not parse control tokens.\n"); printf(" --no-parse-special do not parse control tokens.\n");
fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n"); printf(" --log-disable disable logs. Makes stderr quiet when loading the model.\n");
fprintf(stream, " --show-count print the total number of tokens.\n"); printf(" --show-count print the total number of tokens.\n");
} }
static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) { static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
@@ -185,7 +187,7 @@ int main(int raw_argc, char ** raw_argv) {
    const int argc = argv.size();
    if (argc <= 1) {
-        print_usage_information(argv[0].c_str(), stderr);
+        print_usage_information(argv[0].c_str());
        return 1;
    }
@@ -214,7 +216,7 @@ int main(int raw_argc, char ** raw_argv) {
    for (; iarg < argc; ++iarg) {
        std::string arg{argv[iarg]};
        if (arg == "-h" || arg == "--help") {
-            print_usage_information(argv[0].c_str(), stdout);
+            print_usage_information(argv[0].c_str());
            return 0;
        }
        else if (arg == "--ids") {
@@ -323,10 +325,6 @@ int main(int raw_argc, char ** raw_argv) {
    // Start actually doing the tokenizing stuff.
    //////
-#ifdef LOG_DISABLE_LOGS
-    disable_logging = true;
-#endif
    if (disable_logging) {
        llama_log_set(llama_log_callback_null, NULL);
    }
@@ -362,12 +360,12 @@ int main(int raw_argc, char ** raw_argv) {
        prompt = stdin_buffer.str();
    }
-    const bool model_wants_add_bos = llama_should_add_bos_token(model);
+    const bool model_wants_add_bos = llama_add_bos_token(model);
    const bool add_bos = model_wants_add_bos && !no_bos;
    const bool parse_special = !no_parse_special;
    std::vector<llama_token> tokens;
-    tokens = ::llama_tokenize(model, prompt, add_bos, parse_special);
+    tokens = common_tokenize(model, prompt, add_bos, parse_special);
    if (printing_ids) {
        printf("[");
@@ -382,7 +380,7 @@ int main(int raw_argc, char ** raw_argv) {
    } else {
        bool invalid_utf8 = false;
        printf("%6d -> '", tokens[i]);
-        write_utf8_cstr_to_stdout(llama_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8);
+        write_utf8_cstr_to_stdout(common_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8);
        if (invalid_utf8) {
            printf("' (utf-8 decode failure)\n");
        } else {
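A hedged invocation sketch for the tokenize example after this change (the binary name `llama-tokenize` and the model path are assumptions, not part of the diff):

```sh
# print only numerical token IDs plus the total count
./build/bin/llama-tokenize -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps" --ids --show-count
```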
...
@@ -5,11 +5,11 @@
        "nixpkgs-lib": "nixpkgs-lib"
      },
      "locked": {
-        "lastModified": 1722555600,
+        "lastModified": 1730504689,
-        "narHash": "sha256-XOQkdLafnb/p9ij77byFQjDf5m5QYl9b2REiVClC+x4=",
+        "narHash": "sha256-hgmguH29K2fvs9szpq2r3pz2/8cJd2LPS+b4tfNFCwE=",
        "owner": "hercules-ci",
        "repo": "flake-parts",
-        "rev": "8471fe90ad337a8074e957b69ca4d0089218391d",
+        "rev": "506278e768c2a08bec68eb62932193e341f55c90",
        "type": "github"
      },
      "original": {
@@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1722421184,
+        "lastModified": 1732014248,
-        "narHash": "sha256-/DJBI6trCeVnasdjUo9pbnodCLZcFqnVZiLUfqLH4jA=",
+        "narHash": "sha256-y/MEyuJ5oBWrWAic/14LaIr/u5E0wRVzyYsouYY3W6w=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "9f918d616c5321ad374ae6cb5ea89c9e04bf3e58",
+        "rev": "23e89b7da85c3640bbc2173fe04f4bd114342367",
        "type": "github"
      },
      "original": {
@@ -36,14 +36,14 @@
    },
    "nixpkgs-lib": {
      "locked": {
-        "lastModified": 1722555339,
+        "lastModified": 1730504152,
-        "narHash": "sha256-uFf2QeW7eAHlYXuDktm9c25OxOyCoUOQmh5SZ9amE5Q=",
+        "narHash": "sha256-lXvH/vOfb4aGYyvFmZK/HlsNsr/0CVWlwYvo2rxJk3s=",
        "type": "tarball",
-        "url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
+        "url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
      },
      "original": {
        "type": "tarball",
-        "url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
+        "url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
      }
    },
    "root": {
...
@@ -145,7 +145,9 @@
        # the same path you would with an overlay.
        legacyPackages = {
          llamaPackages = pkgs.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
-          llamaPackagesWindows = pkgs.pkgsCross.mingwW64.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
+          llamaPackagesWindows = pkgs.pkgsCross.mingwW64.callPackage .devops/nix/scope.nix {
+            inherit llamaVersion;
+          };
          llamaPackagesCuda = pkgsCuda.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
          llamaPackagesRocm = pkgsRocm.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
        };
@@ -157,6 +159,7 @@
            default = config.legacyPackages.llamaPackages.llama-cpp;
            vulkan = config.packages.default.override { useVulkan = true; };
            windows = config.legacyPackages.llamaPackagesWindows.llama-cpp;
+            python-scripts = config.legacyPackages.llamaPackages.python-scripts;
          }
          // lib.optionalAttrs pkgs.stdenv.isLinux {
            cuda = config.legacyPackages.llamaPackagesCuda.llama-cpp;
...
@@ -56,6 +56,15 @@ else()
    set(GGML_NATIVE_DEFAULT ON)
endif()
+# defaults
+if (NOT GGML_LLAMAFILE_DEFAULT)
+    set(GGML_LLAMAFILE_DEFAULT OFF)
+endif()
+if (NOT GGML_CUDA_GRAPHS_DEFAULT)
+    set(GGML_CUDA_GRAPHS_DEFAULT OFF)
+endif()
# general
option(GGML_STATIC "ggml: static link libraries" OFF)
option(GGML_NATIVE "ggml: enable -march=native flag" ${GGML_NATIVE_DEFAULT})
@@ -83,6 +92,7 @@ else()
endif()
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
+option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
@@ -90,12 +100,16 @@ option(GGML_AVX512 "ggml: enable AVX512" OFF)
option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF)
+option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF)
+option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF)
+option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF)
option(GGML_FMA "ggml: enable FMA" ${INS_ENB})
if (NOT MSVC)
    option(GGML_F16C "ggml: enable F16C" ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512
endif()
option(GGML_LASX "ggml: enable lasx" ON)
option(GGML_LSX "ggml: enable lsx" ON)
+option(GGML_RVV "ggml: enable rvv" ON)
option(GGML_SVE "ggml: enable SVE" OFF)
if (WIN32)
@@ -104,42 +118,40 @@ endif()
# ggml core
set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
+option(GGML_CPU "ggml: enable CPU backend" ON)
# 3rd party libs / backends
option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)
option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT})
set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
    "ggml: BLAS library vendor")
-option(GGML_LLAMAFILE "ggml: use LLAMAFILE" OFF)
+option(GGML_LLAMAFILE "ggml: use LLAMAFILE" ${GGML_LLAMAFILE_DEFAULT})
option(GGML_CUDA "ggml: use CUDA" OFF)
option(GGML_MUSA "ggml: use MUSA" OFF)
-option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
-set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
-set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
-set (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING
-    "ggml: iters./thread per block for Q2_K/Q6_K")
set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
    "ggml: max. batch size for using peer access")
option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF)
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
-option(GGML_CUDA_USE_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" OFF)
+option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
-option(GGML_CURL "ggml: use libcurl to download model from an URL" OFF)
-option(GGML_HIPBLAS "ggml: use hipBLAS" OFF)
+option(GGML_HIP "ggml: use HIP" OFF)
option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
option(GGML_VULKAN "ggml: use Vulkan" OFF)
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF)
+option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
+option(GGML_VULKAN_PERF "ggml: enable Vulkan perf output" OFF)
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
option(GGML_KOMPUTE "ggml: use Kompute" OFF)
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
+option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL})
@@ -148,10 +160,13 @@ set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
option(GGML_OPENMP "ggml: use OpenMP" ON)
option(GGML_RPC "ggml: use RPC" OFF)
+option(GGML_AMX "ggml: use AMX" OFF)
option(GGML_SYCL "ggml: use SYCL" OFF)
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
    "ggml: sycl target device")
+set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
+    "ggml: sycl device architecture")
# extra artifacts
option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
@@ -204,13 +219,14 @@ include(CMakePackageConfigHelpers)
# all public headers
set(GGML_PUBLIC_HEADERS
    include/ggml.h
+    include/ggml-cpu.h
    include/ggml-alloc.h
    include/ggml-backend.h
    include/ggml-blas.h
    include/ggml-cann.h
    include/ggml-cuda.h
-    include/ggml.h
    include/ggml-kompute.h
+    include/ggml-opt.h
    include/ggml-metal.h
    include/ggml-rpc.h
    include/ggml-sycl.h
@@ -220,15 +236,14 @@ set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
#if (GGML_METAL)
#    set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal")
#endif()
-install(TARGETS ggml PUBLIC_HEADER)
+install(TARGETS ggml LIBRARY PUBLIC_HEADER)
+install(TARGETS ggml-base LIBRARY)
-if (BUILD_SHARED_LIBS)
-    install(TARGETS ggml LIBRARY)
-endif()
+# FIXME: this should be done in the backend cmake files
if (GGML_METAL)
+    # FIXME: does this need to be installed with GGML_METAL_EMBED_LIBRARY?
    install(
-        FILES src/ggml-metal.metal
+        FILES src/ggml-metal/ggml-metal.metal
        PERMISSIONS
            OWNER_READ
            OWNER_WRITE
...
...@@ -7,8 +7,8 @@ extern "C" { ...@@ -7,8 +7,8 @@ extern "C" {
#endif #endif
typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t; typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t; typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend * ggml_backend_t; typedef struct ggml_backend * ggml_backend_t;
// Tensor allocator // Tensor allocator
struct ggml_tallocr { struct ggml_tallocr {
...@@ -24,7 +24,7 @@ GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, st ...@@ -24,7 +24,7 @@ GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, st
// Graph allocator // Graph allocator
/* /*
Example usage: Example usage:
ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type()); ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
// optional: create a worst-case graph and reserve the buffers to avoid reallocations // optional: create a worst-case graph and reserve the buffers to avoid reallocations
ggml_gallocr_reserve(galloc, build_graph(max_batch)); ggml_gallocr_reserve(galloc, build_graph(max_batch));
......
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
// buffer_type API
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
GGML_BACKEND_API bool ggml_backend_is_amx(ggml_backend_t backend);
// backend API
GGML_BACKEND_API ggml_backend_t ggml_backend_amx_init(void);
GGML_BACKEND_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_amx_reg(void);
#ifdef __cplusplus
}
#endif
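A minimal sketch of how this new AMX backend header could be used (my illustration, assuming the build enabled GGML_AMX; only functions declared above plus ggml_backend_free are used):

```c
#include "ggml-amx.h"   // also pulls in ggml-backend.h
#include <stdio.h>

int main(void) {
    ggml_backend_t backend = ggml_backend_amx_init(); // assumption: returns NULL if AMX is unavailable
    if (backend == NULL || !ggml_backend_is_amx(backend)) {
        fprintf(stderr, "AMX backend not available\n");
        return 1;
    }
    ggml_backend_amx_set_n_threads(backend, 4); // worker threads for the AMX kernels
    // ... build a graph and run it with ggml_backend_graph_compute(backend, graph) ...
    ggml_backend_free(backend);
    return 0;
}
```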
@@ -3,6 +3,20 @@
#include "ggml.h"
#include "ggml-alloc.h"
+#ifdef GGML_BACKEND_SHARED
+#    if defined(_WIN32) && !defined(__MINGW32__)
+#        ifdef GGML_BACKEND_BUILD
+#            define GGML_BACKEND_API __declspec(dllexport) extern
+#        else
+#            define GGML_BACKEND_API __declspec(dllimport) extern
+#        endif
+#    else
+#        define GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern
+#    endif
+#else
+#    define GGML_BACKEND_API extern
+#endif
#ifdef __cplusplus
extern "C" {
#endif
...@@ -12,43 +26,52 @@ extern "C" { ...@@ -12,43 +26,52 @@ extern "C" {
typedef struct ggml_backend_event * ggml_backend_event_t; typedef struct ggml_backend_event * ggml_backend_event_t;
typedef struct ggml_backend * ggml_backend_t; typedef struct ggml_backend * ggml_backend_t;
typedef void * ggml_backend_graph_plan_t; typedef void * ggml_backend_graph_plan_t;
typedef struct ggml_backend_reg * ggml_backend_reg_t;
typedef struct ggml_backend_device * ggml_backend_dev_t;
// //
// Backend buffer // Backend buffer type
// //
// buffer type GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft); GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size); GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft); GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft);
//
// Backend buffer
//
// buffer
enum ggml_backend_buffer_usage { enum ggml_backend_buffer_usage {
GGML_BACKEND_BUFFER_USAGE_ANY = 0, GGML_BACKEND_BUFFER_USAGE_ANY = 0,
GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1, GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2, GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2,
}; };
GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer); GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer); GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer); GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer); GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer); GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage (ggml_backend_buffer_t buffer); GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage (ggml_backend_buffer_t buffer);
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer); GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer); GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
// //
// Backend // Backend (stream)
// //
GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend); GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend);
...@@ -63,8 +86,10 @@ extern "C" { ...@@ -63,8 +86,10 @@ extern "C" {
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); // "offset" refers to the offset in tensor->data for setting/getting data
GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
GGML_API void ggml_backend_synchronize(ggml_backend_t backend); GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
...@@ -74,64 +99,126 @@ extern "C" { ...@@ -74,64 +99,126 @@ extern "C" {
GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan); GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph); GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
// NOTE: will be removed, use device version instead
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op); GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft); GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op); GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
// asynchronous copy // asynchronous copy
// the copy is performed after all the currently queued operations in backend_src // the copy is performed after all the currently queued operations in backend_src
// backend_dst will wait for the copy to complete before performing other operations // backend_dst will wait for the copy to complete before performing other operations
// automatic fallback to sync copy if async is not supported // automatic fallback to sync copy if async is not supported
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst); GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
// events GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);
GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend);
GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event);
// //
// CPU backend // Events
// //
GGML_API ggml_backend_t ggml_backend_cpu_init(void); GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device);
GGML_API void ggml_backend_event_free(ggml_backend_event_t event);
GGML_API void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend);
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
GGML_API void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event);
GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend); //
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads); // Backend device
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data); //
// Create a backend buffer from an existing pointer enum ggml_backend_dev_type {
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); // CPU device using system memory
GGML_BACKEND_DEVICE_TYPE_CPU,
// GPU device using dedicated memory
GGML_BACKEND_DEVICE_TYPE_GPU,
// accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
GGML_BACKEND_DEVICE_TYPE_ACCEL
};
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void); // functionality supported by the device
struct ggml_backend_dev_caps {
// asynchronous operations
bool async;
// pinned host buffer
bool host_buffer;
// creating buffers from host ptr
bool buffer_from_host_ptr;
// event synchronization
bool events;
};
#ifdef GGML_USE_CPU_HBM // all the device properties
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void); struct ggml_backend_dev_props {
#endif const char * name;
const char * description;
size_t memory_free;
size_t memory_total;
enum ggml_backend_dev_type type;
struct ggml_backend_dev_caps caps;
};
GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device);
GGML_API const char * ggml_backend_dev_description(ggml_backend_dev_t device);
GGML_API void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total);
GGML_API enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device);
GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
GGML_API bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
GGML_API bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft);
GGML_API bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
// //
// Backend registry // Backend (reg)
// //
// The backend registry is a registry of all the available backends, and allows initializing backends in a generic way GGML_API const char * ggml_backend_reg_name(ggml_backend_reg_t reg);
GGML_API size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg);
GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
// Common functions that may be obtained using ggml_backend_reg_get_proc_address
// Split buffer type for tensor parallelism
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
// Set the number of threads for the backend
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
// Get additional buffer types provided by the device (returns a NULL-terminated array)
typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
//
// Backend registry
//
GGML_API size_t ggml_backend_reg_get_count(void); // Backend (reg) enumeration
GGML_API size_t ggml_backend_reg_find_by_name(const char * name); GGML_API size_t ggml_backend_reg_count(void);
GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional) GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index);
GGML_API const char * ggml_backend_reg_get_name(size_t i); GGML_API ggml_backend_reg_t ggml_backend_reg_by_name(const char * name);
GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i); // Device enumeration
GGML_API ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size); GGML_API size_t ggml_backend_dev_count(void);
GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index);
GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name);
GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type);
// Direct backend (stream) initialization
// = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params)
GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
// = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
GGML_API ggml_backend_t ggml_backend_init_best(void);
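To make the new device/registry API concrete, here is a short sketch of my own (not part of the commit) that enumerates the registered devices and then initializes a backend via the convenience initializer declared above:

```c
#include "ggml-backend.h"
#include <stdio.h>

int main(void) {
    // walk every device registered at build time (CPU, GPU, ACCEL)
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        size_t free_mem, total_mem;
        ggml_backend_dev_memory(dev, &free_mem, &total_mem);
        printf("device %zu: %s (%s), %zu/%zu MiB free\n",
               i, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
               free_mem / (1024 * 1024), total_mem / (1024 * 1024));
    }
    // "best" backend: a GPU device if one is available, otherwise the CPU backend
    ggml_backend_t backend = ggml_backend_init_best();
    if (backend == NULL) {
        fprintf(stderr, "no usable backend\n");
        return 1;
    }
    // ... allocate buffers and compute graphs with this backend ...
    ggml_backend_free(backend);
    return 0;
}
```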
    //
    // Backend scheduler
    //
-    // The backend scheduler allows for multiple backends to be used together
+    // The backend scheduler allows for multiple backend devices to be used together
    // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
    // The backends are selected based on:
    // - the backend that supports the operation
@@ -155,20 +242,26 @@
        ggml_backend_sched_reserve(sched, reserve_graph);
        // compute
-        graph = build_graph(sched);
-        ggml_backend_sched_graph_compute(sched, graph);
+        graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation
+        for (int i = 0; i < 10; ++i) {
+            ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is allocated automatically
+        }
        // if there are graph inputs:
-        ggml_backend_sched_reset(sched);
-        ggml_backend_sched_alloc_graph(sched, graph);
-        ggml_backend_tensor_set(input_tensor, ...);
-        ggml_backend_sched_graph_compute(sched, graph);
+        graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once ggml_free is called)
+        ggml_backend_sched_reset(sched); // clear the allocation of the previous graph
+        ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it
+        ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors
+        ggml_backend_sched_graph_compute(sched, graph); // execute the graph
+        // as an alternative to the above it is also possible to assign the inputs to a dedicated context and
+        // allocate them statically via ggml_backend_alloc_ctx_tensors
    }
    */
-    struct ggml_backend_sched;
    typedef struct ggml_backend_sched * ggml_backend_sched_t;
+    // Evaluation callback for each node in the graph (set with ggml_backend_sched_set_eval_callback)
    // when ask == true, the scheduler wants to know if the user wants to observe this node
    // this allows the scheduler to batch nodes together in order to evaluate them in a single call
    //
@@ -177,12 +270,12 @@
    //
    typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
-    // Initialize a backend scheduler
+    // Initialize a backend scheduler, backends with low index are given priority over backends with high index
    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
    GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
    // Initialize backend buffers from a measure graph
-    GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
+    GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success
    GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
    GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
@@ -197,12 +290,14 @@
    GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
    // Allocate and compute graph on the backend scheduler
-    GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+    GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
    GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
    GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
    GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
-    // Reset all assignments and allocators - must be called before changing the node backends
+    // Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
+    // This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
+    // The correct way to use this API is to discard the deallocated tensors and create new ones.
    GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
    // Set a callback to be called for each resulting node during graph compute
@@ -223,7 +318,7 @@ extern "C" {
    GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
    GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
-    typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
+    typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
    // Compare the output of two backends
    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
@@ -232,6 +327,9 @@ extern "C" {
    GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
    GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
+    // CPU buffer types are always available
+    GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
+    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
#ifdef __cplusplus
}
...
...@@ -9,13 +9,15 @@ extern "C" { ...@@ -9,13 +9,15 @@ extern "C" {
#endif #endif
// backend API // backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void); GGML_BACKEND_API ggml_backend_t ggml_backend_blas_init(void);
GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend); GGML_BACKEND_API bool ggml_backend_is_blas(ggml_backend_t backend);
// number of threads used for conversion to float // number of threads used for conversion to float
// for openblas and blis, this will also set the number of threads used for blas operations // for openblas and blis, this will also set the number of threads used for blas operations
GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads); GGML_BACKEND_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_blas_reg(void);
#ifdef __cplusplus #ifdef __cplusplus
......
...@@ -34,6 +34,8 @@ extern "C" { ...@@ -34,6 +34,8 @@ extern "C" {
*/ */
#define GGML_CANN_MAX_DEVICES 16 #define GGML_CANN_MAX_DEVICES 16
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cann_reg(void);
/** /**
* @brief Initializes the CANN backend for a specified device. * @brief Initializes the CANN backend for a specified device.
* *
...@@ -44,7 +46,7 @@ extern "C" { ...@@ -44,7 +46,7 @@ extern "C" {
* @param device The index of the device to initialize. * @param device The index of the device to initialize.
* @return A pointer to the initialized backend instance, or nullptr on failure. * @return A pointer to the initialized backend instance, or nullptr on failure.
*/ */
GGML_API GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device); GGML_BACKEND_API ggml_backend_t ggml_backend_cann_init(int32_t device);
/** /**
* @brief Checks if a given backend is a CANN backend. * @brief Checks if a given backend is a CANN backend.
...@@ -55,7 +57,7 @@ GGML_API GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device); ...@@ -55,7 +57,7 @@ GGML_API GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device);
* @param backend The backend instance to check. * @param backend The backend instance to check.
* @return True if the backend is a CANN backend, false otherwise. * @return True if the backend is a CANN backend, false otherwise.
*/ */
GGML_API GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend); GGML_BACKEND_API bool ggml_backend_is_cann(ggml_backend_t backend);
/** /**
* @brief Retrieves the CANN buffer type for a specified device. * @brief Retrieves the CANN buffer type for a specified device.
...@@ -67,7 +69,7 @@ GGML_API GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend); ...@@ -67,7 +69,7 @@ GGML_API GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend);
* @return A pointer to the buffer type interface for the specified device, or * @return A pointer to the buffer type interface for the specified device, or
* nullptr if the device index is out of range. * nullptr if the device index is out of range.
*/ */
GGML_API GGML_CALL ggml_backend_buffer_type_t GGML_BACKEND_API ggml_backend_buffer_type_t
ggml_backend_cann_buffer_type(int32_t device); ggml_backend_cann_buffer_type(int32_t device);
/** /**
...@@ -78,7 +80,14 @@ ggml_backend_cann_buffer_type(int32_t device); ...@@ -78,7 +80,14 @@ ggml_backend_cann_buffer_type(int32_t device);
* *
* @return The number of CANN devices available. * @return The number of CANN devices available.
*/ */
GGML_API GGML_CALL int32_t ggml_backend_cann_get_device_count(void); GGML_BACKEND_API int32_t ggml_backend_cann_get_device_count(void);
/**
* @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
*
* @return A pointer to the host buffer type interface.
*/
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
/** /**
* @brief Retrieves the description of a specific CANN device. * @brief Retrieves the description of a specific CANN device.
...@@ -90,7 +99,7 @@ GGML_API GGML_CALL int32_t ggml_backend_cann_get_device_count(void); ...@@ -90,7 +99,7 @@ GGML_API GGML_CALL int32_t ggml_backend_cann_get_device_count(void);
* @param description Pointer to a buffer where the description will be written. * @param description Pointer to a buffer where the description will be written.
* @param description_size Size of the description buffer. * @param description_size Size of the description buffer.
*/ */
GGML_API GGML_CALL void ggml_backend_cann_get_device_description( GGML_BACKEND_API void ggml_backend_cann_get_device_description(
int32_t device, char* description, size_t description_size); int32_t device, char* description, size_t description_size);
/** /**
...@@ -105,20 +114,9 @@ GGML_API GGML_CALL void ggml_backend_cann_get_device_description( ...@@ -105,20 +114,9 @@ GGML_API GGML_CALL void ggml_backend_cann_get_device_description(
* @param total Pointer to a variable where the total memory size will be * @param total Pointer to a variable where the total memory size will be
* stored. * stored.
*/ */
GGML_API GGML_CALL void ggml_backend_cann_get_device_memory(int32_t device, GGML_BACKEND_API void ggml_backend_cann_get_device_memory(int32_t device,
size_t* free, size_t* free,
size_t* total); size_t* total);
/**
* @brief Set the logging callback for GGML.
*
* This function sets the logging callback and user data for logging.
*
* @param log_callback The logging callback to set.
* @param user_data User data to pass to the logging callback.
*/
GGML_API void ggml_backend_cann_log_set_callback(ggml_log_callback log_callback,
void* user_data);
#ifdef __cplusplus #ifdef __cplusplus
} }
......
#pragma once
#ifndef __cplusplus
#error "This header is for C++ only"
#endif
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include <memory>
// Smart pointers for ggml types
// ggml
struct ggml_context_deleter { void operator()(ggml_context * ctx) { ggml_free(ctx); } };
struct gguf_context_deleter { void operator()(gguf_context * ctx) { gguf_free(ctx); } };
typedef std::unique_ptr<ggml_context, ggml_context_deleter> ggml_context_ptr;
typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr;
// ggml-alloc
struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };
typedef std::unique_ptr<ggml_gallocr_t, ggml_gallocr_deleter> ggml_gallocr_ptr;
// ggml-backend
struct ggml_backend_deleter { void operator()(ggml_backend_t backend) { ggml_backend_free(backend); } };
struct ggml_backend_buffer_deleter { void operator()(ggml_backend_buffer_t buffer) { ggml_backend_buffer_free(buffer); } };
struct ggml_backend_event_deleter { void operator()(ggml_backend_event_t event) { ggml_backend_event_free(event); } };
struct ggml_backend_sched_deleter { void operator()(ggml_backend_sched_t sched) { ggml_backend_sched_free(sched); } };
typedef std::unique_ptr<ggml_backend, ggml_backend_deleter> ggml_backend_ptr;
typedef std::unique_ptr<ggml_backend_buffer, ggml_backend_buffer_deleter> ggml_backend_buffer_ptr;
typedef std::unique_ptr<ggml_backend_event, ggml_backend_event_deleter> ggml_backend_event_ptr;
typedef std::unique_ptr<ggml_backend_sched, ggml_backend_sched_deleter> ggml_backend_sched_ptr;
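A brief usage sketch of these smart pointers (my addition, not part of the header): the unique_ptr deleters only tie an object's lifetime to C++ scope, while the underlying C API is used unchanged.

```cpp
#include "ggml-cpp.h"

int main() {
    ggml_init_params params = {
        /*.mem_size   =*/ 16u * 1024 * 1024,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ false,
    };
    // ggml_free(ctx) runs automatically when ctx leaves scope
    ggml_context_ptr ctx { ggml_init(params) };

    ggml_tensor * a = ggml_new_tensor_1d(ctx.get(), GGML_TYPE_F32, 8);
    (void) a; // build tensors and graphs through ctx.get() as usual
    return 0;
}
```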
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
// Scheduling priorities
enum ggml_sched_priority {
GGML_SCHED_PRIO_NORMAL,
GGML_SCHED_PRIO_MEDIUM,
GGML_SCHED_PRIO_HIGH,
GGML_SCHED_PRIO_REALTIME
};
// Threadpool params
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
struct ggml_threadpool_params {
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
int n_threads; // number of threads
enum ggml_sched_priority prio; // thread priority
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
bool strict_cpu; // strict cpu placement
bool paused; // start in paused state
};
struct ggml_threadpool; // forward declaration, see ggml.c
typedef struct ggml_threadpool * ggml_threadpool_t;
// the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggerganov/ggml/issues/287
struct ggml_cplan {
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
int n_threads;
struct ggml_threadpool * threadpool;
// abort ggml_graph_compute when true
ggml_abort_callback abort_callback;
void * abort_callback_data;
};
// numa strategies
enum ggml_numa_strategy {
GGML_NUMA_STRATEGY_DISABLED = 0,
GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
GGML_NUMA_STRATEGY_ISOLATE = 2,
GGML_NUMA_STRATEGY_NUMACTL = 3,
GGML_NUMA_STRATEGY_MIRROR = 4,
GGML_NUMA_STRATEGY_COUNT
};
GGML_BACKEND_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
GGML_BACKEND_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
GGML_BACKEND_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
GGML_BACKEND_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
GGML_BACKEND_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
GGML_BACKEND_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
GGML_BACKEND_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
GGML_BACKEND_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
GGML_BACKEND_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
GGML_BACKEND_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
GGML_BACKEND_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
GGML_BACKEND_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
GGML_BACKEND_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
GGML_BACKEND_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
GGML_BACKEND_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
GGML_BACKEND_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_BACKEND_API struct ggml_cplan ggml_graph_plan(
const struct ggml_cgraph * cgraph,
int n_threads, /* = GGML_DEFAULT_N_THREADS */
struct ggml_threadpool * threadpool /* = NULL */ );
GGML_BACKEND_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
// same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
GGML_BACKEND_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
//
// system info
//
// x86
GGML_BACKEND_API int ggml_cpu_has_sse3 (void);
GGML_BACKEND_API int ggml_cpu_has_ssse3 (void);
GGML_BACKEND_API int ggml_cpu_has_avx (void);
GGML_BACKEND_API int ggml_cpu_has_avx2 (void);
GGML_BACKEND_API int ggml_cpu_has_f16c (void);
GGML_BACKEND_API int ggml_cpu_has_fma (void);
GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
GGML_BACKEND_API int ggml_cpu_has_avx512 (void);
GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);
GGML_BACKEND_API int ggml_cpu_has_avx512_bf16(void);
GGML_BACKEND_API int ggml_cpu_has_amx_int8 (void);
// ARM
GGML_BACKEND_API int ggml_cpu_has_neon (void);
GGML_BACKEND_API int ggml_cpu_has_arm_fma (void);
GGML_BACKEND_API int ggml_cpu_has_fp16_va (void);
GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
GGML_BACKEND_API int ggml_cpu_has_sve (void);
GGML_BACKEND_API int ggml_cpu_get_sve_cnt (void); // sve vector length in bytes
// other
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
// Internal types and functions exposed for tests and benchmarks
typedef void (*ggml_from_float_to_mat_t)
(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
const void * GGML_RESTRICT y, size_t by, int nrc);
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
struct ggml_type_traits_cpu {
ggml_from_float_t from_float;
ggml_from_float_to_mat_t from_float_to_mat;
ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously
int64_t ncols; // number of columns to process simultaneously
ggml_gemv_t gemv;
ggml_gemm_t gemm;
};
GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
GGML_BACKEND_API void ggml_cpu_init(void);
//
// CPU backend
//
GGML_BACKEND_API ggml_backend_t ggml_backend_cpu_init(void);
GGML_BACKEND_API bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_BACKEND_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_BACKEND_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
GGML_BACKEND_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
#ifdef GGML_USE_CPU_HBM
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
GGML_BACKEND_API bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft);
#ifdef __cplusplus
}
#endif
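As the comments above state, `ggml_graph_plan()` has to be called before `ggml_graph_compute()`, and when `plan.work_size > 0` the caller must allocate `plan.work_data`. A small sketch of that contract, assuming the compute graph `gf` has already been built elsewhere:

```
// Sketch of the plan/compute contract from ggml-cpu.h (graph construction omitted).
#include "ggml.h"
#include "ggml-cpu.h"

#include <cstdint>
#include <vector>

static enum ggml_status compute_graph(struct ggml_cgraph * gf, int n_threads) {
    // 1. plan: fills in work_size for this graph and thread count (no threadpool here)
    struct ggml_cplan plan = ggml_graph_plan(gf, n_threads, /*threadpool =*/ nullptr);

    // 2. the caller owns the work buffer
    std::vector<uint8_t> work(plan.work_size);
    if (plan.work_size > 0) {
        plan.work_data = work.data();
    }

    // 3. compute
    return ggml_graph_compute(gf, &plan);
}
```

Passing a `ggml_threadpool` created with `ggml_threadpool_new()` instead of `nullptr` lets several graphs reuse the same worker threads, priorities and CPU mask.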
@@ -3,7 +3,11 @@
#include "ggml.h"
#include "ggml-backend.h"

-#ifdef GGML_USE_HIPBLAS
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef GGML_USE_HIP
#define GGML_CUDA_NAME "ROCm"
#define GGML_CUBLAS_NAME "hipBLAS"
#elif defined(GGML_USE_MUSA)
@@ -13,35 +17,31 @@
#define GGML_CUDA_NAME "CUDA"
#define GGML_CUBLAS_NAME "cuBLAS"
#endif

-#ifdef __cplusplus
-extern "C" {
-#endif

#define GGML_CUDA_MAX_DEVICES 16

// backend API
-GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
+GGML_BACKEND_API ggml_backend_t ggml_backend_cuda_init(int device);

-GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
+GGML_BACKEND_API bool ggml_backend_is_cuda(ggml_backend_t backend);

// device buffer
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);

// split tensor buffer that splits matrices by rows across multiple devices
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);

// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);

-GGML_API GGML_CALL int  ggml_backend_cuda_get_device_count(void);
-GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
-GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
+GGML_BACKEND_API int  ggml_backend_cuda_get_device_count(void);
+GGML_BACKEND_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
+GGML_BACKEND_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);

-GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
-GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
+GGML_BACKEND_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
+GGML_BACKEND_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);

-GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cuda_reg(void);

#ifdef __cplusplus
}
#endif
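For illustration only (not part of the patch): the device-query functions declared above can be combined into a small enumeration loop. This assumes a build with the CUDA backend enabled.

```
// Sketch: list CUDA devices via the query API above (CUDA backend assumed present).
#include "ggml-cuda.h"

#include <cstdio>

int main() {
    const int n = ggml_backend_cuda_get_device_count();
    for (int i = 0; i < n; ++i) {
        char desc[128];
        size_t free_mem = 0, total_mem = 0;
        ggml_backend_cuda_get_device_description(i, desc, sizeof(desc));
        ggml_backend_cuda_get_device_memory(i, &free_mem, &total_mem);
        std::printf("CUDA device %d: %s (%zu MiB free / %zu MiB total)\n",
                    i, desc, free_mem >> 20, total_mem >> 20);
    }
    return 0;
}
```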
@@ -11,6 +11,8 @@
extern "C" {
#endif

+#define GGML_KOMPUTE_MAX_DEVICES 16
+
struct ggml_vk_device {
    int index;
    int type; // same as VkPhysicalDeviceType
@@ -35,11 +37,13 @@ struct ggml_vk_device ggml_vk_current_device(void);
// forward declaration
typedef struct ggml_backend * ggml_backend_t;

-GGML_API ggml_backend_t ggml_backend_kompute_init(int device);
-GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);
-GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
+GGML_BACKEND_API ggml_backend_t ggml_backend_kompute_init(int device);
+GGML_BACKEND_API bool ggml_backend_is_kompute(ggml_backend_t backend);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_kompute_reg(void);

#ifdef __cplusplus
}
...
+// Note: this description is outdated
+//
// An interface allowing to compute ggml_cgraph with Metal
//
// This is a fully functional interface that extends ggml with GPU support for Apple devices.
@@ -25,9 +27,6 @@
#include <stddef.h>
#include <stdbool.h>

-// max memory buffers that can be mapped to the device
-#define GGML_METAL_MAX_BUFFERS 64

struct ggml_tensor;
struct ggml_cgraph;
@@ -40,25 +39,27 @@ extern "C" {
// user-code should use only these functions
//

-GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
-GGML_API ggml_backend_t ggml_backend_metal_init(void);
-GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
-GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
-GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
+GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);
+GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);
+
+GGML_DEPRECATED(
+    GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
+    "obsoleted by the new device interface - https://github.com/ggerganov/llama.cpp/pull/9713");
+
+GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);

// helper to check if the device supports a specific family
// ideally, the user code should be doing these checks
// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
-GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
+GGML_BACKEND_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);

// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
-GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
+GGML_BACKEND_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_metal_reg(void);

#ifdef __cplusplus
}
...
// This file contains functionality for training models using GGML.
// It is not strictly needed vs. just vanilla GGML but it provides a more high-level interface for common needs such as datasets.
// At the bottom of this file especially there are relatively high-level functions that are suitable for use or adaptation in user code.
//
// Module maintainer: Johannes Gäßler (@JohannesGaessler, johannesg@5d6.de)
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
struct ggml_opt_dataset;
struct ggml_opt_context;
struct ggml_opt_result;
typedef struct ggml_opt_dataset * ggml_opt_dataset_t;
typedef struct ggml_opt_context * ggml_opt_context_t;
typedef struct ggml_opt_result * ggml_opt_result_t;
// ====== Loss ======
// built-in loss types, i.e. the built-in quantities minimized by the optimizer
// custom loss types can be defined via mean or sum which simply reduce the outputs for all datapoints to a single value
enum ggml_opt_loss_type {
GGML_OPT_LOSS_TYPE_MEAN,
GGML_OPT_LOSS_TYPE_SUM,
GGML_OPT_LOSS_TYPE_CROSS_ENTROPY,
GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR,
};
// ====== Dataset ======
GGML_API ggml_opt_dataset_t ggml_opt_dataset_init(
int64_t ne_datapoint, // number of elements per datapoint
int64_t ne_label, // number of elements per label
int64_t ndata, // total number of datapoints/labels
int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset);
// get underlying tensors that store the data
GGML_API struct ggml_tensor * ggml_opt_dataset_data (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata]
GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [ne_label, ndata]
// shuffle idata first datapoints from dataset with RNG from opt_ctx, shuffle all datapoints if idata is negative
GGML_API void ggml_opt_dataset_shuffle(ggml_opt_context_t opt_ctx, ggml_opt_dataset_t dataset, int64_t idata);
// get batch at position ibatch from dataset and copy the data to data_batch and labels_batch
GGML_API void ggml_opt_dataset_get_batch(
ggml_opt_dataset_t dataset,
struct ggml_tensor * data_batch, // shape = [ne_datapoint, ndata_batch]
struct ggml_tensor * labels_batch, // shape = [ne_label, ndata_batch]
int64_t ibatch);
// ====== Model / Context ======
enum ggml_opt_build_type {
GGML_OPT_BUILD_TYPE_FORWARD,
GGML_OPT_BUILD_TYPE_GRAD,
GGML_OPT_BUILD_TYPE_OPT,
};
// parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
struct ggml_opt_optimizer_params {
// AdamW optimizer parameters
struct {
float alpha; // learning rate
float beta1;
float beta2;
float eps; // epsilon for numerical stability
float wd; // weight decay for AdamW, use 0.0f to disable
} adamw;
};
// callback to calculate optimizer parameters prior to a backward pass
// userdata can be used to pass arbitrary data
typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata);
// returns the default optimizer params (constant)
// userdata is not used
GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata);
// parameters for initializing a new optimization context
struct ggml_opt_params {
ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs
struct ggml_context * ctx_compute; // created in user code, holds non-static tensors
// the forward graph is defined by inputs and outputs
// those tensors and all tensors in between are not intended to be reusable between multiple optimization contexts
struct ggml_tensor * inputs;
struct ggml_tensor * outputs;
enum ggml_opt_loss_type loss_type;
enum ggml_opt_build_type build_type;
int32_t opt_period; // after how many gradient accumulation steps an optimizer step should be done
ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
void * get_opt_pars_ud; // userdata for calculating optimizer parameters
};
// get parameters for an optimization context with defaults set where possible
// parameters for which no sensible defaults exist are supplied as arguments to this function
GGML_API ggml_opt_params ggml_opt_default_params(
ggml_backend_sched_t backend_sched,
struct ggml_context * ctx_compute,
struct ggml_tensor * inputs,
struct ggml_tensor * outputs,
enum ggml_opt_loss_type loss_type);
GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx);
// set gradients to zero, initialize loss, and optionally reset the optimizer
GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);
// get underlying tensors that store data
GGML_API struct ggml_tensor * ggml_opt_inputs( ggml_opt_context_t opt_ctx); // forward graph input tensor
GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor
GGML_API struct ggml_tensor * ggml_opt_labels( ggml_opt_context_t opt_ctx); // labels to compare outputs against
GGML_API struct ggml_tensor * ggml_opt_loss( ggml_opt_context_t opt_ctx); // scalar tensor that contains the loss
GGML_API struct ggml_tensor * ggml_opt_pred( ggml_opt_context_t opt_ctx); // predictions made by outputs
GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels
GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);
// ====== Optimization Result ======
GGML_API ggml_opt_result_t ggml_opt_result_init();
GGML_API void ggml_opt_result_free(ggml_opt_result_t result);
GGML_API void ggml_opt_result_reset(ggml_opt_result_t result);
// get data from result, uncertainties are optional and can be ignored by passing NULL
GGML_API void ggml_opt_result_ndata( ggml_opt_result_t result, int64_t * ndata); // writes 1 value, number of datapoints
GGML_API void ggml_opt_result_loss( ggml_opt_result_t result, double * loss, double * unc); // writes 1 value
GGML_API void ggml_opt_result_pred( ggml_opt_result_t result, int32_t * pred); // writes ndata values
GGML_API void ggml_opt_result_accuracy(ggml_opt_result_t result, double * accuracy, double * unc); // writes 1 value
// ====== Computation ======
// do forward pass, increment result if not NULL
GGML_API void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
// do forward pass, increment result if not NULL, do backward pass
GGML_API void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
// ############################################################################
// ## The high-level functions start here. They do not depend on any private ##
// ## functions or structs and can be copied to and adapted for user code. ##
// ############################################################################
// ====== Intended Usage ======
//
// 1. Select the appropriate loss for your problem.
// 2. Create a dataset and set the data for the "data" tensor. Also set the "labels" tensor if your loss needs them.
// Setting the shard size to 1 will be fine; it is the granularity with which data is shuffled/loaded (bigger values are faster).
// 3. Create a GGML graph for your model with no_alloc == true. Use two separate contexts for the tensors.
// The first context should contain the model parameters and inputs and be allocated statically in user code.
// The second context should contain all other tensors and will be (re)allocated automatically.
// Due to this automated allocation the data of the second context is not defined when accessed in user code.
// Note that the second dimension of the inputs/outputs are interpreted as the number of datapoints in those tensors.
// 4. Call ggml_opt_fit. If you need more control you can use ggml_opt_epoch instead.
// signature for a callback while evaluating opt_ctx on dataset, called after an evaluation
typedef void (*ggml_opt_epoch_callback)(
bool train, // true after training evaluation, false after validation evaluation
ggml_opt_context_t opt_ctx,
ggml_opt_dataset_t dataset,
ggml_opt_result_t result, // result associated with the dataset subsection
int64_t ibatch, // number of batches that have been evaluated so far
int64_t ibatch_max, // total number of batches in this dataset subsection
int64_t t_start_us); // time at which the evaluation on the dataset subsection was started
// do training on front of dataset, do evaluation only on back of dataset
GGML_API void ggml_opt_epoch(
ggml_opt_context_t opt_ctx,
ggml_opt_dataset_t dataset,
ggml_opt_result_t result_train, // result to increment during training, ignored if NULL
ggml_opt_result_t result_eval, // result to increment during evaluation, ignored if NULL
int64_t idata_split, // data index at which to split training and evaluation
ggml_opt_epoch_callback callback_train,
ggml_opt_epoch_callback callback_eval);
// callback that prints a progress bar on stderr
GGML_API void ggml_opt_epoch_callback_progress_bar(
bool train,
ggml_opt_context_t opt_ctx,
ggml_opt_dataset_t dataset,
ggml_opt_result_t result,
int64_t ibatch,
int64_t ibatch_max,
int64_t t_start_us);
// fit model defined by inputs and outputs to dataset
GGML_API void ggml_opt_fit(
ggml_backend_sched_t backend_sched, // backend scheduler for constructing the compute graphs
ggml_context * ctx_compute, // context with temporarily allocated tensors to calculate the outputs
ggml_tensor * inputs, // input tensor with shape [ne_datapoint, ndata_batch]
ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used
ggml_opt_dataset_t dataset, // dataset with data and optionally also labels
enum ggml_opt_loss_type loss_type, // loss to minimize
ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
int64_t nepoch, // how many times the dataset should be iterated over
int64_t nbatch_logical, // datapoints per optimizer step, must be a multiple of ndata_batch in inputs/outputs
float val_split, // fraction of the dataset to use for validation, must be in [0.0f, 1.0f)
bool silent); // whether or not info prints to stderr should be suppressed
#ifdef __cplusplus
}
#endif
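The "Intended Usage" notes above describe the workflow in prose. The sketch below covers only the dataset part (steps 1-2), with made-up shapes and zero-filled data; the model graph, backend scheduler, and the final `ggml_opt_fit()` call are left out.

```
// Sketch: create and fill a ggml_opt dataset (shapes and values are illustrative only).
#include "ggml.h"
#include "ggml-opt.h"

#include <cstring>
#include <vector>

int main() {
    const int64_t ne_datapoint = 4;   // elements per datapoint (hypothetical)
    const int64_t ne_label     = 2;   // elements per label     (hypothetical)
    const int64_t ndata        = 128; // total number of datapoints

    ggml_opt_dataset_t dataset = ggml_opt_dataset_init(
        ne_datapoint, ne_label, ndata, /*ndata_shard =*/ 1);

    // the dataset owns plain host tensors; copy the training data into them
    // (assumes one float per element, i.e. F32 tensors)
    ggml_tensor * data   = ggml_opt_dataset_data(dataset);   // [ne_datapoint, ndata]
    ggml_tensor * labels = ggml_opt_dataset_labels(dataset); // [ne_label, ndata]

    std::vector<float> x(ne_datapoint * ndata, 0.0f);
    std::vector<float> y(ne_label     * ndata, 0.0f);
    std::memcpy(data->data,   x.data(), x.size() * sizeof(float));
    std::memcpy(labels->data, y.data(), y.size() * sizeof(float));

    // ... build the model graph, then call ggml_opt_fit(...) as described above ...

    ggml_opt_dataset_free(dataset);
    return 0;
}
```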
@@ -10,14 +10,18 @@ extern "C" {
#define GGML_RPC_MAX_SERVERS 16

// backend API
-GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
-GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend);
+GGML_BACKEND_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
+GGML_BACKEND_API bool ggml_backend_is_rpc(ggml_backend_t backend);

-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);

-GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
+GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);

-GGML_API GGML_CALL void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
+GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
+GGML_BACKEND_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);

#ifdef __cplusplus
}
...
@@ -17,26 +17,33 @@ extern "C" {
#endif

// backend API
-GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
+GGML_BACKEND_API ggml_backend_t ggml_backend_sycl_init(int device);
+GGML_BACKEND_API bool ggml_backend_is_sycl(ggml_backend_t backend);

// devide buffer
-GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);

// split tensor buffer that splits matrices by rows across multiple devices
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);

// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);

-GGML_API void ggml_backend_sycl_print_sycl_devices(void);
-GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
-GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
-GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
-GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
+GGML_BACKEND_API void ggml_backend_sycl_print_sycl_devices(void);
+GGML_BACKEND_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len);
+GGML_BACKEND_API void ggml_backend_sycl_get_device_description(int device,
                                                                char *description,
                                                                size_t description_size);
+GGML_BACKEND_API int ggml_backend_sycl_get_device_count();
+GGML_BACKEND_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);

// SYCL doesn't support registering host memory, keep here for reference
-// GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
-// GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer);
+// GGML_BACKEND_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
+// GGML_BACKEND_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_sycl_reg(void);

#ifdef __cplusplus
}
#endif
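A minimal sketch (assuming a SYCL-enabled build) of enumerating devices through the renamed `ggml_backend_sycl_*` query functions declared above:

```
// Sketch: list SYCL devices via the renamed query API (SYCL backend assumed present).
#include "ggml-sycl.h"

#include <cstdio>

int main() {
    const int n = ggml_backend_sycl_get_device_count();
    std::printf("found %d SYCL device(s)\n", n);
    for (int i = 0; i < n; ++i) {
        char desc[256];
        size_t free_mem = 0, total_mem = 0;
        ggml_backend_sycl_get_device_description(i, desc, sizeof(desc));
        ggml_backend_sycl_get_device_memory(i, &free_mem, &total_mem);
        std::printf("  device %d: %s, free/total mem: %zu/%zu MiB\n",
                    i, desc, free_mem >> 20, total_mem >> 20);
    }
    return 0;
}
```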
@@ -10,19 +10,21 @@ extern "C" {
#define GGML_VK_NAME "Vulkan"
#define GGML_VK_MAX_DEVICES 16

-GGML_API void ggml_vk_instance_init(void);
+GGML_BACKEND_API void ggml_vk_instance_init(void);

// backend API
-GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
+GGML_BACKEND_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);

-GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend);
-GGML_API GGML_CALL int  ggml_backend_vk_get_device_count(void);
-GGML_API GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
-GGML_API GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
+GGML_BACKEND_API bool ggml_backend_is_vk(ggml_backend_t backend);
+GGML_BACKEND_API int  ggml_backend_vk_get_device_count(void);
+GGML_BACKEND_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
+GGML_BACKEND_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);

-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);

// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);

+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_vk_reg(void);

#ifdef __cplusplus
}
...