Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
1deafd82
Unverified
Commit
1deafd82
authored
Jan 08, 2025
by
Jeffrey Morgan
Committed by
GitHub
Jan 08, 2025
Browse files
llama: update vendored code to commit 46e3556 (#8308)
parent
57f038ec
Changes
305
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
267 additions
and
201 deletions
+267
-201
api/types.go
api/types.go
+0
-2
llama/amx.cpp
llama/amx.cpp
+1
-1
llama/amx.h
llama/amx.h
+1
-1
llama/clip.cpp
llama/clip.cpp
+14
-14
llama/clip.h
llama/clip.h
+1
-1
llama/common.cpp
llama/common.cpp
+59
-23
llama/common.h
llama/common.h
+42
-20
llama/ggml-alloc.c
llama/ggml-alloc.c
+1
-2
llama/ggml-alloc.h
llama/ggml-alloc.h
+1
-1
llama/ggml-backend-impl.h
llama/ggml-backend-impl.h
+1
-1
llama/ggml-backend-reg.cpp
llama/ggml-backend-reg.cpp
+78
-53
llama/ggml-backend.cpp
llama/ggml-backend.cpp
+5
-2
llama/ggml-backend.h
llama/ggml-backend.h
+1
-1
llama/ggml-blas.cpp
llama/ggml-blas.cpp
+1
-1
llama/ggml-blas.h
llama/ggml-blas.h
+1
-1
llama/ggml-common.h
llama/ggml-common.h
+1
-1
llama/ggml-cpp.h
llama/ggml-cpp.h
+1
-1
llama/ggml-cpu-aarch64.cpp
llama/ggml-cpu-aarch64.cpp
+56
-73
llama/ggml-cpu-aarch64.h
llama/ggml-cpu-aarch64.h
+1
-1
llama/ggml-cpu-impl.h
llama/ggml-cpu-impl.h
+1
-1
No files found.
api/types.go
View file @
1deafd82
...
@@ -225,7 +225,6 @@ type Options struct {
...
@@ -225,7 +225,6 @@ type Options struct {
Mirostat
int
`json:"mirostat,omitempty"`
Mirostat
int
`json:"mirostat,omitempty"`
MirostatTau
float32
`json:"mirostat_tau,omitempty"`
MirostatTau
float32
`json:"mirostat_tau,omitempty"`
MirostatEta
float32
`json:"mirostat_eta,omitempty"`
MirostatEta
float32
`json:"mirostat_eta,omitempty"`
PenalizeNewline
bool
`json:"penalize_newline,omitempty"`
Stop
[]
string
`json:"stop,omitempty"`
Stop
[]
string
`json:"stop,omitempty"`
}
}
...
@@ -606,7 +605,6 @@ func DefaultOptions() Options {
...
@@ -606,7 +605,6 @@ func DefaultOptions() Options {
Mirostat
:
0
,
Mirostat
:
0
,
MirostatTau
:
5.0
,
MirostatTau
:
5.0
,
MirostatEta
:
0.1
,
MirostatEta
:
0.1
,
PenalizeNewline
:
true
,
Seed
:
-
1
,
Seed
:
-
1
,
Runner
:
Runner
{
Runner
:
Runner
{
...
...
llama/amx.cpp
View file @
1deafd82
/**
/**
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
* llama.cpp - commit
46e3556e01b824e52395fb050b29804b6cff2a7c
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
...
llama/amx.h
View file @
1deafd82
/**
/**
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
* llama.cpp - commit
46e3556e01b824e52395fb050b29804b6cff2a7c
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
...
llama/clip.cpp
View file @
1deafd82
/**
/**
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
* llama.cpp - commit
46e3556e01b824e52395fb050b29804b6cff2a7c
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
@@ -935,7 +935,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
...
@@ -935,7 +935,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
mlp_3
=
ggml_cont
(
ctx0
,
ggml_permute
(
ctx0
,
mlp_3
,
1
,
0
,
2
,
3
));
mlp_3
=
ggml_cont
(
ctx0
,
ggml_permute
(
ctx0
,
mlp_3
,
1
,
0
,
2
,
3
));
mlp_3
=
ggml_reshape_4d
(
ctx0
,
mlp_3
,
n_patch
,
n_patch
,
mlp_3
->
ne
[
1
],
mlp_3
->
ne
[
2
]);
mlp_3
=
ggml_reshape_4d
(
ctx0
,
mlp_3
,
n_patch
,
n_patch
,
mlp_3
->
ne
[
1
],
mlp_3
->
ne
[
2
]);
// stride = 1, padding = 1, bias is nullptr
// stride = 1, padding = 1, bias is nullptr
block_1
=
ggml_conv_
depthwise_2d
(
ctx0
,
model
.
mm_model_block_1_block_0_0_w
,
mlp_3
,
1
,
1
,
1
,
1
,
1
,
1
);
block_1
=
ggml_conv_
2d_dw
(
ctx0
,
model
.
mm_model_block_1_block_0_0_w
,
mlp_3
,
1
,
1
,
1
,
1
,
1
,
1
);
// layer norm
// layer norm
// // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
// // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
...
@@ -983,7 +983,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
...
@@ -983,7 +983,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
// block_2
// block_2
{
{
// stride = 2
// stride = 2
block_1
=
ggml_conv_
depthwise_2d
(
ctx0
,
model
.
mm_model_block_2_block_0_0_w
,
block_1
,
2
,
2
,
1
,
1
,
1
,
1
);
block_1
=
ggml_conv_
2d_dw
(
ctx0
,
model
.
mm_model_block_2_block_0_0_w
,
block_1
,
2
,
2
,
1
,
1
,
1
,
1
);
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
// layer norm
// layer norm
...
@@ -1044,7 +1044,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
...
@@ -1044,7 +1044,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
// mlp_2 ne [24, 24, 2048, 1]
// mlp_2 ne [24, 24, 2048, 1]
mlp_2
=
ggml_pool_2d
(
ctx0
,
mlp_2
,
GGML_OP_POOL_AVG
,
2
,
2
,
2
,
2
,
0
,
0
);
mlp_2
=
ggml_pool_2d
(
ctx0
,
mlp_2
,
GGML_OP_POOL_AVG
,
2
,
2
,
2
,
2
,
0
,
0
);
// weight ne = [3, 3, 2048, 1]
// weight ne = [3, 3, 2048, 1]
struct
ggml_tensor
*
peg_0
=
ggml_conv_
depthwise_2d
(
ctx0
,
model
.
mm_model_peg_0_w
,
mlp_2
,
1
,
1
,
1
,
1
,
1
,
1
);
struct
ggml_tensor
*
peg_0
=
ggml_conv_
2d_dw
(
ctx0
,
model
.
mm_model_peg_0_w
,
mlp_2
,
1
,
1
,
1
,
1
,
1
,
1
);
peg_0
=
ggml_cont
(
ctx0
,
ggml_permute
(
ctx0
,
peg_0
,
1
,
2
,
0
,
3
));
peg_0
=
ggml_cont
(
ctx0
,
ggml_permute
(
ctx0
,
peg_0
,
1
,
2
,
0
,
3
));
peg_0
=
ggml_add
(
ctx0
,
peg_0
,
model
.
mm_model_peg_0_b
);
peg_0
=
ggml_add
(
ctx0
,
peg_0
,
model
.
mm_model_peg_0_b
);
mlp_2
=
ggml_cont
(
ctx0
,
ggml_permute
(
ctx0
,
mlp_2
,
1
,
2
,
0
,
3
));
mlp_2
=
ggml_cont
(
ctx0
,
ggml_permute
(
ctx0
,
mlp_2
,
1
,
2
,
0
,
3
));
...
@@ -1262,28 +1262,28 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
...
@@ -1262,28 +1262,28 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
}
}
#ifdef GGML_USE_CUDA
#ifdef GGML_USE_CUDA
new_clip
->
backend
=
ggml_backend_cuda_init
(
0
);
new_clip
->
backend
=
ggml_backend_cuda_init
(
0
);
LOG_INF
(
"%s: CLIP using CUDA backend
\n
"
,
__func__
);
LOG_INF
(
"%s: CLIP using CUDA backend
\n
"
,
__func__
);
#endif
#endif
#ifdef GGML_USE_METAL
#ifdef GGML_USE_METAL
new_clip
->
backend
=
ggml_backend_metal_init
();
new_clip
->
backend
=
ggml_backend_metal_init
();
LOG_INF
(
"%s: CLIP using Metal backend
\n
"
,
__func__
);
LOG_INF
(
"%s: CLIP using Metal backend
\n
"
,
__func__
);
#endif
#endif
#ifdef GGML_USE_CANN
#ifdef GGML_USE_CANN
new_clip
->
backend
=
ggml_backend_cann_init
(
0
);
new_clip
->
backend
=
ggml_backend_cann_init
(
0
);
LOG_INF
(
"%s: CLIP using CANN backend
\n
"
,
__func__
);
LOG_INF
(
"%s: CLIP using CANN backend
\n
"
,
__func__
);
#endif
#endif
#ifdef GGML_USE_VULKAN
#ifdef GGML_USE_VULKAN
new_clip
->
backend
=
ggml_backend_vk_init
(
0
);
new_clip
->
backend
=
ggml_backend_vk_init
(
0
);
LOG_INF
(
"%s: CLIP using Vulkan backend
\n
"
,
__func__
);
LOG_INF
(
"%s: CLIP using Vulkan backend
\n
"
,
__func__
);
#endif
#endif
#ifdef GGML_USE_SYCL
#ifdef GGML_USE_SYCL
new_clip
->
backend
=
ggml_backend_sycl_init
(
0
);
new_clip
->
backend
=
ggml_backend_sycl_init
(
0
);
LOG_INF
(
"%s: CLIP using SYCL backend
\n
"
,
__func__
);
LOG_INF
(
"%s: CLIP using SYCL backend
\n
"
,
__func__
);
#endif
#endif
if
(
!
new_clip
->
backend
)
{
if
(
!
new_clip
->
backend
)
{
...
...
llama/clip.h
View file @
1deafd82
/**
/**
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
* llama.cpp - commit
46e3556e01b824e52395fb050b29804b6cff2a7c
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
...
llama/common.cpp
View file @
1deafd82
/**
/**
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
* llama.cpp - commit
46e3556e01b824e52395fb050b29804b6cff2a7c
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
@@ -44,6 +44,7 @@
...
@@ -44,6 +44,7 @@
#include <cstdarg>
#include <cstdarg>
#include <cstring>
#include <cstring>
#include <ctime>
#include <ctime>
#include <filesystem>
#include <fstream>
#include <fstream>
#include <iostream>
#include <iostream>
#include <iterator>
#include <iterator>
...
@@ -88,7 +89,9 @@
...
@@ -88,7 +89,9 @@
#ifdef __linux__
#ifdef __linux__
#include <linux/limits.h>
#include <linux/limits.h>
#elif defined(_WIN32)
#elif defined(_WIN32)
#define PATH_MAX MAX_PATH
# if !defined(PATH_MAX)
# define PATH_MAX MAX_PATH
# endif
#else
#else
#include <sys/syslimits.h>
#include <sys/syslimits.h>
#endif
#endif
...
@@ -912,9 +915,8 @@ struct common_init_result common_init_from_params(common_params & params) {
...
@@ -912,9 +915,8 @@ struct common_init_result common_init_from_params(common_params & params) {
}
}
if
(
params
.
ctx_shift
&&
!
llama_kv_cache_can_shift
(
lctx
))
{
if
(
params
.
ctx_shift
&&
!
llama_kv_cache_can_shift
(
lctx
))
{
LOG_ERR
(
"%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'
\n
"
,
__func__
);
LOG_WRN
(
"%s: KV cache shifting is not supported for this model, disabling KV cache shifting
\n
"
,
__func__
);
llama_free_model
(
model
);
params
.
ctx_shift
=
false
;
return
iparams
;
}
}
if
(
!
params
.
control_vectors
.
empty
())
{
if
(
!
params
.
control_vectors
.
empty
())
{
...
@@ -945,20 +947,21 @@ struct common_init_result common_init_from_params(common_params & params) {
...
@@ -945,20 +947,21 @@ struct common_init_result common_init_from_params(common_params & params) {
// load and optionally apply lora adapters
// load and optionally apply lora adapters
for
(
auto
&
la
:
params
.
lora_adapters
)
{
for
(
auto
&
la
:
params
.
lora_adapters
)
{
common_lora_adapter_container
loaded_la
;
llama_lora_adapter_ptr
lora
;
loaded_la
.
path
=
la
.
path
;
lora
.
reset
(
llama_lora_adapter_init
(
model
,
la
.
path
.
c_str
()));
loaded_la
.
scale
=
la
.
scale
;
if
(
lora
==
nullptr
)
{
loaded_la
.
adapter
=
llama_lora_adapter_init
(
model
,
la
.
path
.
c_str
());
if
(
loaded_la
.
adapter
==
nullptr
)
{
LOG_ERR
(
"%s: failed to apply lora adapter '%s'
\n
"
,
__func__
,
la
.
path
.
c_str
());
LOG_ERR
(
"%s: failed to apply lora adapter '%s'
\n
"
,
__func__
,
la
.
path
.
c_str
());
llama_free
(
lctx
);
llama_free
(
lctx
);
llama_free_model
(
model
);
llama_free_model
(
model
);
return
iparams
;
return
iparams
;
}
}
iparams
.
lora_adapters
.
push_back
(
loaded_la
);
// copy to list of loaded adapters
la
.
ptr
=
lora
.
get
();
iparams
.
lora
.
emplace_back
(
std
::
move
(
lora
));
// copy to list of loaded adapters
}
}
if
(
!
params
.
lora_init_without_apply
)
{
if
(
!
params
.
lora_init_without_apply
)
{
common_lora_adapters_apply
(
lctx
,
i
params
.
lora_adapters
);
common_lora_adapters_apply
(
lctx
,
params
.
lora_adapters
);
}
}
if
(
params
.
sampling
.
ignore_eos
&&
llama_token_eos
(
model
)
==
LLAMA_TOKEN_NULL
)
{
if
(
params
.
sampling
.
ignore_eos
&&
llama_token_eos
(
model
)
==
LLAMA_TOKEN_NULL
)
{
...
@@ -966,6 +969,25 @@ struct common_init_result common_init_from_params(common_params & params) {
...
@@ -966,6 +969,25 @@ struct common_init_result common_init_from_params(common_params & params) {
params
.
sampling
.
ignore_eos
=
false
;
params
.
sampling
.
ignore_eos
=
false
;
}
}
if
(
params
.
sampling
.
ignore_eos
)
{
for
(
llama_token
i
=
0
;
i
<
llama_n_vocab
(
model
);
i
++
)
{
if
(
llama_token_is_eog
(
model
,
i
))
{
LOG_INF
(
"%s: added %s logit bias = %f
\n
"
,
__func__
,
common_token_to_piece
(
lctx
,
i
).
c_str
(),
-
INFINITY
);
params
.
sampling
.
logit_bias
.
push_back
({
i
,
-
INFINITY
});
}
}
}
if
(
params
.
sampling
.
penalty_last_n
==
-
1
)
{
LOG_INF
(
"%s: setting penalty_last_n to ctx_size = %d
\n
"
,
__func__
,
llama_n_ctx
(
lctx
));
params
.
sampling
.
penalty_last_n
=
llama_n_ctx
(
lctx
);
}
if
(
params
.
sampling
.
dry_penalty_last_n
==
-
1
)
{
LOG_INF
(
"%s: setting dry_penalty_last_n to ctx_size = %d
\n
"
,
__func__
,
llama_n_ctx
(
lctx
));
params
.
sampling
.
dry_penalty_last_n
=
llama_n_ctx
(
lctx
);
}
if
(
params
.
warmup
)
{
if
(
params
.
warmup
)
{
LOG_WRN
(
"%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)
\n
"
,
__func__
);
LOG_WRN
(
"%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)
\n
"
,
__func__
);
...
@@ -1000,17 +1022,17 @@ struct common_init_result common_init_from_params(common_params & params) {
...
@@ -1000,17 +1022,17 @@ struct common_init_result common_init_from_params(common_params & params) {
llama_perf_context_reset
(
lctx
);
llama_perf_context_reset
(
lctx
);
}
}
iparams
.
model
=
model
;
iparams
.
model
.
reset
(
model
)
;
iparams
.
context
=
lctx
;
iparams
.
context
.
reset
(
lctx
)
;
return
iparams
;
return
iparams
;
}
}
void
common_lora_adapters_apply
(
struct
llama_context
*
ctx
,
std
::
vector
<
common_lora_adapter_
container
>
&
lora_adapters
)
{
void
common_lora_adapters_apply
(
struct
llama_context
*
ctx
,
std
::
vector
<
common_lora_adapter_
info
>
&
lora
)
{
llama_lora_adapter_clear
(
ctx
);
llama_lora_adapter_clear
(
ctx
);
for
(
auto
&
la
:
lora
_adapters
)
{
for
(
auto
&
la
:
lora
)
{
if
(
la
.
scale
!=
0.0
f
)
{
if
(
la
.
scale
!=
0.0
f
)
{
llama_lora_adapter_set
(
ctx
,
la
.
ada
pt
e
r
,
la
.
scale
);
llama_lora_adapter_set
(
ctx
,
la
.
ptr
,
la
.
scale
);
}
}
}
}
}
}
...
@@ -1102,7 +1124,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
...
@@ -1102,7 +1124,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
#define CURL_MAX_RETRY 3
#define CURL_MAX_RETRY 3
#define CURL_RETRY_DELAY_SECONDS 2
#define CURL_RETRY_DELAY_SECONDS 2
static
bool
curl_perform_with_retry
(
const
std
::
string
&
url
,
CURL
*
curl
,
int
max_attempts
,
int
retry_delay_seconds
)
{
static
bool
curl_perform_with_retry
(
const
std
::
string
&
url
,
CURL
*
curl
,
int
max_attempts
,
int
retry_delay_seconds
)
{
int
remaining_attempts
=
max_attempts
;
int
remaining_attempts
=
max_attempts
;
while
(
remaining_attempts
>
0
)
{
while
(
remaining_attempts
>
0
)
{
...
@@ -1126,7 +1148,6 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
...
@@ -1126,7 +1148,6 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
}
}
static
bool
common_download_file
(
const
std
::
string
&
url
,
const
std
::
string
&
path
,
const
std
::
string
&
hf_token
)
{
static
bool
common_download_file
(
const
std
::
string
&
url
,
const
std
::
string
&
path
,
const
std
::
string
&
hf_token
)
{
// Initialize libcurl
// Initialize libcurl
std
::
unique_ptr
<
CURL
,
decltype
(
&
curl_easy_cleanup
)
>
curl
(
curl_easy_init
(),
&
curl_easy_cleanup
);
std
::
unique_ptr
<
CURL
,
decltype
(
&
curl_easy_cleanup
)
>
curl
(
curl_easy_init
(),
&
curl_easy_cleanup
);
if
(
!
curl
)
{
if
(
!
curl
)
{
...
@@ -1156,8 +1177,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
...
@@ -1156,8 +1177,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
#endif
#endif
// Check if the file already exists locally
// Check if the file already exists locally
struct
stat
model_file_info
;
auto
file_exists
=
std
::
filesystem
::
exists
(
path
);
auto
file_exists
=
(
stat
(
path
.
c_str
(),
&
model_file_info
)
==
0
);
// If the file exists, check its JSON metadata companion file.
// If the file exists, check its JSON metadata companion file.
std
::
string
metadata_path
=
path
+
".json"
;
std
::
string
metadata_path
=
path
+
".json"
;
...
@@ -1199,11 +1219,13 @@ static bool common_download_file(const std::string & url, const std::string & pa
...
@@ -1199,11 +1219,13 @@ static bool common_download_file(const std::string & url, const std::string & pa
std
::
string
etag
;
std
::
string
etag
;
std
::
string
last_modified
;
std
::
string
last_modified
;
};
};
common_load_model_from_url_headers
headers
;
common_load_model_from_url_headers
headers
;
{
{
typedef
size_t
(
*
CURLOPT_HEADERFUNCTION_PTR
)(
char
*
,
size_t
,
size_t
,
void
*
);
typedef
size_t
(
*
CURLOPT_HEADERFUNCTION_PTR
)(
char
*
,
size_t
,
size_t
,
void
*
);
auto
header_callback
=
[](
char
*
buffer
,
size_t
/*size*/
,
size_t
n_items
,
void
*
userdata
)
->
size_t
{
auto
header_callback
=
[](
char
*
buffer
,
size_t
/*size*/
,
size_t
n_items
,
void
*
userdata
)
->
size_t
{
common_load_model_from_url_headers
*
headers
=
(
common_load_model_from_url_headers
*
)
userdata
;
common_load_model_from_url_headers
*
headers
=
(
common_load_model_from_url_headers
*
)
userdata
;
static
std
::
regex
header_regex
(
"([^:]+): (.*)
\r\n
"
);
static
std
::
regex
header_regex
(
"([^:]+): (.*)
\r\n
"
);
static
std
::
regex
etag_regex
(
"ETag"
,
std
::
regex_constants
::
icase
);
static
std
::
regex
etag_regex
(
"ETag"
,
std
::
regex_constants
::
icase
);
...
@@ -1618,6 +1640,18 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
...
@@ -1618,6 +1640,18 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
// Chat template utils
// Chat template utils
//
//
std
::
string
common_get_builtin_chat_template
(
const
struct
llama_model
*
model
)
{
static
const
char
*
template_key
=
"tokenizer.chat_template"
;
// call with NULL buffer to get the total size of the string
int32_t
res
=
llama_model_meta_val_str
(
model
,
template_key
,
NULL
,
0
);
if
(
res
>
0
)
{
std
::
vector
<
char
>
model_template
(
res
+
1
,
0
);
llama_model_meta_val_str
(
model
,
template_key
,
model_template
.
data
(),
model_template
.
size
());
return
std
::
string
(
model_template
.
data
(),
model_template
.
size
()
-
1
);
}
return
""
;
}
bool
common_chat_verify_template
(
const
std
::
string
&
tmpl
)
{
bool
common_chat_verify_template
(
const
std
::
string
&
tmpl
)
{
llama_chat_message
chat
[]
=
{{
"user"
,
"test"
}};
llama_chat_message
chat
[]
=
{{
"user"
,
"test"
}};
int
res
=
llama_chat_apply_template
(
nullptr
,
tmpl
.
c_str
(),
chat
,
1
,
true
,
nullptr
,
0
);
int
res
=
llama_chat_apply_template
(
nullptr
,
tmpl
.
c_str
(),
chat
,
1
,
true
,
nullptr
,
0
);
...
@@ -1787,7 +1821,9 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm)
...
@@ -1787,7 +1821,9 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm)
break
;
break
;
case
0
:
// max absolute
case
0
:
// max absolute
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
if
(
sum
<
std
::
abs
(
inp
[
i
]))
sum
=
std
::
abs
(
inp
[
i
]);
if
(
sum
<
std
::
abs
(
inp
[
i
]))
{
sum
=
std
::
abs
(
inp
[
i
]);
}
}
}
sum
/=
32760.0
;
// make an int16 range
sum
/=
32760.0
;
// make an int16 range
break
;
break
;
...
...
llama/common.h
View file @
1deafd82
/**
/**
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
* llama.cpp - commit
46e3556e01b824e52395fb050b29804b6cff2a7c
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
@@ -28,7 +28,7 @@
...
@@ -28,7 +28,7 @@
#pragma once
#pragma once
#include "llama.h"
#include "llama
-cpp
.h"
#include <string>
#include <string>
#include <vector>
#include <vector>
...
@@ -53,10 +53,8 @@
...
@@ -53,10 +53,8 @@
struct
common_lora_adapter_info
{
struct
common_lora_adapter_info
{
std
::
string
path
;
std
::
string
path
;
float
scale
;
float
scale
;
};
struct
common_lora_adapter_container
:
common_lora_adapter_info
{
struct
llama_lora_adapter
*
ptr
;
struct
llama_lora_adapter
*
adapter
;
};
};
using
llama_tokens
=
std
::
vector
<
llama_token
>
;
using
llama_tokens
=
std
::
vector
<
llama_token
>
;
...
@@ -106,6 +104,7 @@ enum llama_example {
...
@@ -106,6 +104,7 @@ enum llama_example {
LLAMA_EXAMPLE_LLAVA
,
LLAMA_EXAMPLE_LLAVA
,
LLAMA_EXAMPLE_LOOKUP
,
LLAMA_EXAMPLE_LOOKUP
,
LLAMA_EXAMPLE_PARALLEL
,
LLAMA_EXAMPLE_PARALLEL
,
LLAMA_EXAMPLE_TTS
,
LLAMA_EXAMPLE_COUNT
,
LLAMA_EXAMPLE_COUNT
,
};
};
...
@@ -121,6 +120,7 @@ enum common_sampler_type {
...
@@ -121,6 +120,7 @@ enum common_sampler_type {
COMMON_SAMPLER_TYPE_TEMPERATURE
=
7
,
COMMON_SAMPLER_TYPE_TEMPERATURE
=
7
,
COMMON_SAMPLER_TYPE_XTC
=
8
,
COMMON_SAMPLER_TYPE_XTC
=
8
,
COMMON_SAMPLER_TYPE_INFILL
=
9
,
COMMON_SAMPLER_TYPE_INFILL
=
9
,
COMMON_SAMPLER_TYPE_PENALTIES
=
10
,
};
};
// dimensionality reduction methods, used by cvector-generator
// dimensionality reduction methods, used by cvector-generator
...
@@ -156,7 +156,6 @@ struct common_params_sampling {
...
@@ -156,7 +156,6 @@ struct common_params_sampling {
int32_t
mirostat
=
0
;
// 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
int32_t
mirostat
=
0
;
// 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float
mirostat_tau
=
5.00
f
;
// target entropy
float
mirostat_tau
=
5.00
f
;
// target entropy
float
mirostat_eta
=
0.10
f
;
// learning rate
float
mirostat_eta
=
0.10
f
;
// learning rate
bool
penalize_nl
=
false
;
// consider newlines as a repeatable token
bool
ignore_eos
=
false
;
bool
ignore_eos
=
false
;
bool
no_perf
=
false
;
// disable performance metrics
bool
no_perf
=
false
;
// disable performance metrics
bool
timing_per_token
=
false
;
bool
timing_per_token
=
false
;
...
@@ -165,6 +164,7 @@ struct common_params_sampling {
...
@@ -165,6 +164,7 @@ struct common_params_sampling {
std
::
vector
<
enum
common_sampler_type
>
samplers
=
{
std
::
vector
<
enum
common_sampler_type
>
samplers
=
{
COMMON_SAMPLER_TYPE_PENALTIES
,
COMMON_SAMPLER_TYPE_DRY
,
COMMON_SAMPLER_TYPE_DRY
,
COMMON_SAMPLER_TYPE_TOP_K
,
COMMON_SAMPLER_TYPE_TOP_K
,
COMMON_SAMPLER_TYPE_TYPICAL_P
,
COMMON_SAMPLER_TYPE_TYPICAL_P
,
...
@@ -184,6 +184,7 @@ struct common_params_sampling {
...
@@ -184,6 +184,7 @@ struct common_params_sampling {
struct
common_params_speculative
{
struct
common_params_speculative
{
std
::
vector
<
ggml_backend_dev_t
>
devices
;
// devices to use for offloading
std
::
vector
<
ggml_backend_dev_t
>
devices
;
// devices to use for offloading
int32_t
n_ctx
=
0
;
// draft context size
int32_t
n_ctx
=
0
;
// draft context size
int32_t
n_max
=
16
;
// maximum number of tokens to draft during speculative decoding
int32_t
n_max
=
16
;
// maximum number of tokens to draft during speculative decoding
int32_t
n_min
=
5
;
// minimum number of draft tokens to use for speculative decoding
int32_t
n_min
=
5
;
// minimum number of draft tokens to use for speculative decoding
...
@@ -197,6 +198,14 @@ struct common_params_speculative {
...
@@ -197,6 +198,14 @@ struct common_params_speculative {
std
::
string
model
=
""
;
// draft model for speculative decoding // NOLINT
std
::
string
model
=
""
;
// draft model for speculative decoding // NOLINT
};
};
struct
common_params_vocoder
{
std
::
string
hf_repo
=
""
;
// HF repo // NOLINT
std
::
string
hf_file
=
""
;
// HF file // NOLINT
std
::
string
model
=
""
;
// model path // NOLINT
std
::
string
model_url
=
""
;
// model url to download // NOLINT
};
struct
common_params
{
struct
common_params
{
int32_t
n_predict
=
-
1
;
// new tokens to predict
int32_t
n_predict
=
-
1
;
// new tokens to predict
int32_t
n_ctx
=
4096
;
// context size
int32_t
n_ctx
=
4096
;
// context size
...
@@ -219,11 +228,13 @@ struct common_params {
...
@@ -219,11 +228,13 @@ struct common_params {
float
defrag_thold
=
0.1
f
;
// KV cache defragmentation threshold
float
defrag_thold
=
0.1
f
;
// KV cache defragmentation threshold
// offload params
// offload params
std
::
vector
<
ggml_backend_dev_t
>
devices
;
// devices to use for offloading
std
::
vector
<
ggml_backend_dev_t
>
devices
;
// devices to use for offloading
int32_t
n_gpu_layers
=
-
1
;
// number of layers to store in VRAM (-1 - use default)
int32_t
main_gpu
=
0
;
// the GPU that is used for scratch and small tensors
int32_t
n_gpu_layers
=
-
1
;
// number of layers to store in VRAM (-1 - use default)
float
tensor_split
[
128
]
=
{
0
};
// how split tensors should be distributed across GPUs
int32_t
main_gpu
=
0
;
// the GPU that is used for scratch and small tensors
enum
llama_split_mode
split_mode
=
LLAMA_SPLIT_MODE_LAYER
;
// how to split the model across GPUs
float
tensor_split
[
128
]
=
{
0
};
// how split tensors should be distributed across GPUs
enum
llama_split_mode
split_mode
=
LLAMA_SPLIT_MODE_LAYER
;
// how to split the model across GPUs
struct
cpu_params
cpuparams
;
struct
cpu_params
cpuparams
;
struct
cpu_params
cpuparams_batch
;
struct
cpu_params
cpuparams_batch
;
...
@@ -237,8 +248,9 @@ struct common_params {
...
@@ -237,8 +248,9 @@ struct common_params {
enum
llama_pooling_type
pooling_type
=
LLAMA_POOLING_TYPE_UNSPECIFIED
;
// pooling type for embeddings
enum
llama_pooling_type
pooling_type
=
LLAMA_POOLING_TYPE_UNSPECIFIED
;
// pooling type for embeddings
enum
llama_attention_type
attention_type
=
LLAMA_ATTENTION_TYPE_UNSPECIFIED
;
// attention type for embeddings
enum
llama_attention_type
attention_type
=
LLAMA_ATTENTION_TYPE_UNSPECIFIED
;
// attention type for embeddings
struct
common_params_sampling
sampling
;
struct
common_params_sampling
sampling
;
struct
common_params_speculative
speculative
;
struct
common_params_speculative
speculative
;
struct
common_params_vocoder
vocoder
;
std
::
string
model
=
""
;
// model path // NOLINT
std
::
string
model
=
""
;
// model path // NOLINT
std
::
string
model_alias
=
""
;
// model alias // NOLINT
std
::
string
model_alias
=
""
;
// model alias // NOLINT
...
@@ -490,10 +502,12 @@ std::string fs_get_cache_file(const std::string & filename);
...
@@ -490,10 +502,12 @@ std::string fs_get_cache_file(const std::string & filename);
// Model utils
// Model utils
//
//
// note: defines object's lifetime
struct
common_init_result
{
struct
common_init_result
{
struct
llama_model
*
model
=
nullptr
;
llama_model_ptr
model
;
struct
llama_context
*
context
=
nullptr
;
llama_context_ptr
context
;
std
::
vector
<
common_lora_adapter_container
>
lora_adapters
;
std
::
vector
<
llama_lora_adapter_ptr
>
lora
;
};
};
struct
common_init_result
common_init_from_params
(
common_params
&
params
);
struct
common_init_result
common_init_from_params
(
common_params
&
params
);
...
@@ -515,7 +529,7 @@ struct llama_model * common_load_model_from_hf(
...
@@ -515,7 +529,7 @@ struct llama_model * common_load_model_from_hf(
const
struct
llama_model_params
&
params
);
const
struct
llama_model_params
&
params
);
// clear LoRA adapters from context, then apply new list of adapters
// clear LoRA adapters from context, then apply new list of adapters
void
common_lora_adapters_apply
(
struct
llama_context
*
ctx
,
std
::
vector
<
common_lora_adapter_
container
>
&
lora_adapters
);
void
common_lora_adapters_apply
(
struct
llama_context
*
ctx
,
std
::
vector
<
common_lora_adapter_
info
>
&
lora
);
//
//
// Batch utils
// Batch utils
...
@@ -583,6 +597,9 @@ struct common_chat_msg {
...
@@ -583,6 +597,9 @@ struct common_chat_msg {
std
::
string
content
;
std
::
string
content
;
};
};
// Get the built-in chat template for the model. Return empty string if not present.
std
::
string
common_get_builtin_chat_template
(
const
struct
llama_model
*
model
);
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
bool
common_chat_verify_template
(
const
std
::
string
&
tmpl
);
bool
common_chat_verify_template
(
const
std
::
string
&
tmpl
);
...
@@ -619,7 +636,8 @@ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_si
...
@@ -619,7 +636,8 @@ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_si
// Embedding utils
// Embedding utils
//
//
void
common_embd_normalize
(
const
float
*
inp
,
float
*
out
,
int
n
,
int
embd_norm
=
2
);
// TODO: repace embd_norm with an enum
void
common_embd_normalize
(
const
float
*
inp
,
float
*
out
,
int
n
,
int
embd_norm
);
float
common_embd_similarity_cos
(
const
float
*
embd1
,
const
float
*
embd2
,
int
n
);
float
common_embd_similarity_cos
(
const
float
*
embd1
,
const
float
*
embd2
,
int
n
);
...
@@ -648,6 +666,10 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
...
@@ -648,6 +666,10 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
// Split utils
// Split utils
//
//
static
const
char
*
const
LLM_KV_SPLIT_NO
=
"split.no"
;
namespace
{
static
const
char
*
const
LLM_KV_SPLIT_COUNT
=
"split.count"
;
static
const
char
*
const
LLM_KV_SPLIT_TENSORS_COUNT
=
"split.tensors.count"
;
const
char
*
const
LLM_KV_SPLIT_NO
=
"split.no"
;
const
char
*
const
LLM_KV_SPLIT_COUNT
=
"split.count"
;
const
char
*
const
LLM_KV_SPLIT_TENSORS_COUNT
=
"split.tensors.count"
;
}
llama/ggml-alloc.c
View file @
1deafd82
/**
/**
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
* llama.cpp - commit
46e3556e01b824e52395fb050b29804b6cff2a7c
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
@@ -560,7 +560,6 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
...
@@ -560,7 +560,6 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
size_t
offset
=
ggml_dyn_tallocr_alloc
(
alloc
,
size
,
node
);
size_t
offset
=
ggml_dyn_tallocr_alloc
(
alloc
,
size
,
node
);
hn
->
buffer_id
=
buffer_id
;
hn
->
buffer_id
=
buffer_id
;
hn
->
offset
=
offset
;
hn
->
offset
=
offset
;
return
;
}
}
}
}
...
...
llama/ggml-alloc.h
View file @
1deafd82
/**
/**
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
* llama.cpp - commit
46e3556e01b824e52395fb050b29804b6cff2a7c
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
...
llama/ggml-backend-impl.h
View file @
1deafd82
/**
/**
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
* llama.cpp - commit
46e3556e01b824e52395fb050b29804b6cff2a7c
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
...
llama/ggml-backend-reg.cpp
View file @
1deafd82
/**
/**
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
* llama.cpp - commit
46e3556e01b824e52395fb050b29804b6cff2a7c
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
@@ -92,6 +92,26 @@
...
@@ -92,6 +92,26 @@
#include "ggml-kompute.h"
#include "ggml-kompute.h"
#endif
#endif
// disable C++17 deprecation warning for std::codecvt_utf8
#if defined(__clang__)
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
#endif
static
std
::
wstring
utf8_to_utf16
(
const
std
::
string
&
str
)
{
std
::
wstring_convert
<
std
::
codecvt_utf8_utf16
<
wchar_t
>>
converter
;
return
converter
.
from_bytes
(
str
);
}
static
std
::
string
utf16_to_utf8
(
const
std
::
wstring
&
str
)
{
std
::
wstring_convert
<
std
::
codecvt_utf8_utf16
<
wchar_t
>>
converter
;
return
converter
.
to_bytes
(
str
);
}
#if defined(__clang__)
# pragma clang diagnostic pop
#endif
#ifdef _WIN32
#ifdef _WIN32
using
dl_handle
=
std
::
remove_pointer_t
<
HMODULE
>
;
using
dl_handle
=
std
::
remove_pointer_t
<
HMODULE
>
;
...
@@ -114,11 +134,6 @@ static dl_handle * dl_load_library(const std::wstring & path) {
...
@@ -114,11 +134,6 @@ static dl_handle * dl_load_library(const std::wstring & path) {
return
handle
;
return
handle
;
}
}
static
dl_handle
*
dl_load_library
(
const
std
::
string
&
path
)
{
std
::
wstring_convert
<
std
::
codecvt_utf8_utf16
<
wchar_t
>>
converter
;
return
dl_load_library
(
converter
.
from_bytes
(
path
));
}
static
void
*
dl_get_sym
(
dl_handle
*
handle
,
const
char
*
name
)
{
static
void
*
dl_get_sym
(
dl_handle
*
handle
,
const
char
*
name
)
{
DWORD
old_mode
=
SetErrorMode
(
SEM_FAILCRITICALERRORS
);
DWORD
old_mode
=
SetErrorMode
(
SEM_FAILCRITICALERRORS
);
SetErrorMode
(
old_mode
|
SEM_FAILCRITICALERRORS
);
SetErrorMode
(
old_mode
|
SEM_FAILCRITICALERRORS
);
...
@@ -140,8 +155,8 @@ struct dl_handle_deleter {
...
@@ -140,8 +155,8 @@ struct dl_handle_deleter {
}
}
};
};
static
void
*
dl_load_library
(
const
std
::
string
&
path
)
{
static
void
*
dl_load_library
(
const
std
::
w
string
&
path
)
{
dl_handle
*
handle
=
dlopen
(
path
.
c_str
(),
RTLD_NOW
|
RTLD_LOCAL
);
dl_handle
*
handle
=
dlopen
(
utf16_to_utf8
(
path
)
.
c_str
(),
RTLD_NOW
|
RTLD_LOCAL
);
return
handle
;
return
handle
;
}
}
...
@@ -182,9 +197,9 @@ struct ggml_backend_registry {
...
@@ -182,9 +197,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_CANN
#ifdef GGML_USE_CANN
register_backend
(
ggml_backend_cann_reg
());
register_backend
(
ggml_backend_cann_reg
());
#endif
#endif
#ifdef GGML_USE_BLAS
//
#ifdef GGML_USE_BLAS
register_backend
(
ggml_backend_blas_reg
());
//
register_backend(ggml_backend_blas_reg());
#endif
//
#endif
#ifdef GGML_USE_RPC
#ifdef GGML_USE_RPC
register_backend
(
ggml_backend_rpc_reg
());
register_backend
(
ggml_backend_rpc_reg
());
#endif
#endif
...
@@ -228,11 +243,11 @@ struct ggml_backend_registry {
...
@@ -228,11 +243,11 @@ struct ggml_backend_registry {
devices
.
push_back
(
device
);
devices
.
push_back
(
device
);
}
}
ggml_backend_reg_t
load_backend
(
const
char
*
path
,
bool
silent
)
{
ggml_backend_reg_t
load_backend
(
const
std
::
wstring
&
path
,
bool
silent
)
{
dl_handle_ptr
handle
{
dl_load_library
(
path
)
};
dl_handle_ptr
handle
{
dl_load_library
(
path
)
};
if
(
!
handle
)
{
if
(
!
handle
)
{
if
(
!
silent
)
{
if
(
!
silent
)
{
GGML_LOG_ERROR
(
"%s: failed to load %s
\n
"
,
__func__
,
path
);
GGML_LOG_ERROR
(
"%s: failed to load %s
\n
"
,
__func__
,
utf16_to_utf8
(
path
).
c_str
()
);
}
}
return
nullptr
;
return
nullptr
;
}
}
...
@@ -240,7 +255,7 @@ struct ggml_backend_registry {
...
@@ -240,7 +255,7 @@ struct ggml_backend_registry {
auto
score_fn
=
(
ggml_backend_score_t
)
dl_get_sym
(
handle
.
get
(),
"ggml_backend_score"
);
auto
score_fn
=
(
ggml_backend_score_t
)
dl_get_sym
(
handle
.
get
(),
"ggml_backend_score"
);
if
(
score_fn
&&
score_fn
()
==
0
)
{
if
(
score_fn
&&
score_fn
()
==
0
)
{
if
(
!
silent
)
{
if
(
!
silent
)
{
GGML_LOG_INFO
(
"%s: backend %s is not supported on this system
\n
"
,
__func__
,
path
);
GGML_LOG_INFO
(
"%s: backend %s is not supported on this system
\n
"
,
__func__
,
utf16_to_utf8
(
path
).
c_str
()
);
}
}
return
nullptr
;
return
nullptr
;
}
}
...
@@ -248,7 +263,7 @@ struct ggml_backend_registry {
...
@@ -248,7 +263,7 @@ struct ggml_backend_registry {
auto
backend_init_fn
=
(
ggml_backend_init_t
)
dl_get_sym
(
handle
.
get
(),
"ggml_backend_init"
);
auto
backend_init_fn
=
(
ggml_backend_init_t
)
dl_get_sym
(
handle
.
get
(),
"ggml_backend_init"
);
if
(
!
backend_init_fn
)
{
if
(
!
backend_init_fn
)
{
if
(
!
silent
)
{
if
(
!
silent
)
{
GGML_LOG_ERROR
(
"%s: failed to find ggml_backend_init in %s
\n
"
,
__func__
,
path
);
GGML_LOG_ERROR
(
"%s: failed to find ggml_backend_init in %s
\n
"
,
__func__
,
utf16_to_utf8
(
path
).
c_str
()
);
}
}
return
nullptr
;
return
nullptr
;
}
}
...
@@ -257,16 +272,16 @@ struct ggml_backend_registry {
...
@@ -257,16 +272,16 @@ struct ggml_backend_registry {
if
(
!
reg
||
reg
->
api_version
!=
GGML_BACKEND_API_VERSION
)
{
if
(
!
reg
||
reg
->
api_version
!=
GGML_BACKEND_API_VERSION
)
{
if
(
!
silent
)
{
if
(
!
silent
)
{
if
(
!
reg
)
{
if
(
!
reg
)
{
GGML_LOG_ERROR
(
"%s: failed to initialize backend from %s: ggml_backend_init returned NULL
\n
"
,
__func__
,
path
);
GGML_LOG_ERROR
(
"%s: failed to initialize backend from %s: ggml_backend_init returned NULL
\n
"
,
__func__
,
utf16_to_utf8
(
path
).
c_str
()
);
}
else
{
}
else
{
GGML_LOG_ERROR
(
"%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)
\n
"
,
GGML_LOG_ERROR
(
"%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)
\n
"
,
__func__
,
path
,
reg
->
api_version
,
GGML_BACKEND_API_VERSION
);
__func__
,
utf16_to_utf8
(
path
).
c_str
()
,
reg
->
api_version
,
GGML_BACKEND_API_VERSION
);
}
}
}
}
return
nullptr
;
return
nullptr
;
}
}
GGML_LOG_INFO
(
"%s: loaded %s backend from %s
\n
"
,
__func__
,
ggml_backend_reg_name
(
reg
),
path
);
GGML_LOG_INFO
(
"%s: loaded %s backend from %s
\n
"
,
__func__
,
ggml_backend_reg_name
(
reg
),
utf16_to_utf8
(
path
).
c_str
()
);
register_backend
(
reg
,
std
::
move
(
handle
));
register_backend
(
reg
,
std
::
move
(
handle
));
...
@@ -402,14 +417,14 @@ ggml_backend_t ggml_backend_init_best(void) {
...
@@ -402,14 +417,14 @@ ggml_backend_t ggml_backend_init_best(void) {
// Dynamic loading
// Dynamic loading
ggml_backend_reg_t
ggml_backend_load
(
const
char
*
path
)
{
ggml_backend_reg_t
ggml_backend_load
(
const
char
*
path
)
{
return
get_reg
().
load_backend
(
path
,
false
);
return
get_reg
().
load_backend
(
utf8_to_utf16
(
path
)
,
false
);
}
}
void
ggml_backend_unload
(
ggml_backend_reg_t
reg
)
{
void
ggml_backend_unload
(
ggml_backend_reg_t
reg
)
{
get_reg
().
unload_backend
(
reg
,
true
);
get_reg
().
unload_backend
(
reg
,
true
);
}
}
static
std
::
string
get_executable_path
()
{
static
std
::
w
string
get_executable_path
()
{
#if defined(__APPLE__)
#if defined(__APPLE__)
// get executable path
// get executable path
std
::
vector
<
char
>
path
;
std
::
vector
<
char
>
path
;
...
@@ -427,13 +442,17 @@ static std::string get_executable_path() {
...
@@ -427,13 +442,17 @@ static std::string get_executable_path() {
if
(
last_slash
!=
std
::
string
::
npos
)
{
if
(
last_slash
!=
std
::
string
::
npos
)
{
base_path
=
base_path
.
substr
(
0
,
last_slash
);
base_path
=
base_path
.
substr
(
0
,
last_slash
);
}
}
return
base_path
+
"/"
;
return
utf8_to_utf16
(
base_path
+
"/"
)
;
#elif defined(__linux__)
#elif defined(__linux__)
|| defined(__FreeBSD__)
std
::
string
base_path
=
"."
;
std
::
string
base_path
=
"."
;
std
::
vector
<
char
>
path
(
1024
);
std
::
vector
<
char
>
path
(
1024
);
while
(
true
)
{
while
(
true
)
{
// get executable path
// get executable path
# if defined(__linux__)
ssize_t
len
=
readlink
(
"/proc/self/exe"
,
path
.
data
(),
path
.
size
());
ssize_t
len
=
readlink
(
"/proc/self/exe"
,
path
.
data
(),
path
.
size
());
# elif defined(__FreeBSD__)
ssize_t
len
=
readlink
(
"/proc/curproc/file"
,
path
.
data
(),
path
.
size
());
# endif
if
(
len
==
-
1
)
{
if
(
len
==
-
1
)
{
break
;
break
;
}
}
...
@@ -449,57 +468,63 @@ static std::string get_executable_path() {
...
@@ -449,57 +468,63 @@ static std::string get_executable_path() {
path
.
resize
(
path
.
size
()
*
2
);
path
.
resize
(
path
.
size
()
*
2
);
}
}
return
base_path
+
"/"
;
return
utf8_to_utf16
(
base_path
+
"/"
)
;
#elif defined(_WIN32)
#elif defined(_WIN32)
std
::
vector
<
char
>
path
(
MAX_PATH
);
std
::
vector
<
w
char
_t
>
path
(
MAX_PATH
);
DWORD
len
=
GetModuleFileName
A
(
NULL
,
path
.
data
(),
path
.
size
());
DWORD
len
=
GetModuleFileName
W
(
NULL
,
path
.
data
(),
path
.
size
());
if
(
len
==
0
)
{
if
(
len
==
0
)
{
return
""
;
return
{}
;
}
}
std
::
string
base_path
(
path
.
data
(),
len
);
std
::
w
string
base_path
(
path
.
data
(),
len
);
// remove executable name
// remove executable name
auto
last_slash
=
base_path
.
find_last_of
(
'\\'
);
auto
last_slash
=
base_path
.
find_last_of
(
'\\'
);
if
(
last_slash
!=
std
::
string
::
npos
)
{
if
(
last_slash
!=
std
::
string
::
npos
)
{
base_path
=
base_path
.
substr
(
0
,
last_slash
);
base_path
=
base_path
.
substr
(
0
,
last_slash
);
}
}
return
base_path
+
"
\\
"
;
return
base_path
+
L"
\\
"
;
#else
return
{};
#endif
}
static
std
::
wstring
backend_filename_prefix
()
{
#ifdef _WIN32
return
L"ggml-"
;
#else
return
L"libggml-"
;
#endif
#endif
}
}
static
std
::
string
backend_filename_
pre
fix
()
{
static
std
::
w
string
backend_filename_
suf
fix
()
{
#ifdef _WIN32
#ifdef _WIN32
return
"ggml-
"
;
return
L".dll
"
;
#else
#else
return
"libggml-
"
;
return
L".so
"
;
#endif
#endif
}
}
static
std
::
string
backend_filename_suffix
()
{
static
std
::
w
string
path_separator
()
{
#ifdef _WIN32
#ifdef _WIN32
return
".dll
"
;
return
L"
\\
"
;
#else
#else
return
".so
"
;
return
L"/
"
;
#endif
#endif
}
}
static
ggml_backend_reg_t
ggml_backend_load_best
(
const
char
*
name
,
bool
silent
,
const
char
*
user_search_path
)
{
static
ggml_backend_reg_t
ggml_backend_load_best
(
const
char
*
name
,
bool
silent
,
const
char
*
user_search_path
)
{
// enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
// enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
// TODO: search system paths
// TODO: search system paths
std
::
string
file_prefix
=
backend_filename_prefix
()
+
name
+
"-"
;
std
::
w
string
file_prefix
=
backend_filename_prefix
()
+
utf8_to_utf16
(
name
)
+
L
"-"
;
std
::
vector
<
std
::
string
>
search_paths
;
std
::
vector
<
std
::
w
string
>
search_paths
;
if
(
user_search_path
==
nullptr
)
{
if
(
user_search_path
==
nullptr
)
{
search_paths
.
push_back
(
".
/
"
);
search_paths
.
push_back
(
L
"."
+
path_separator
()
);
search_paths
.
push_back
(
get_executable_path
());
search_paths
.
push_back
(
get_executable_path
());
}
else
{
}
else
{
#if defined(_WIN32)
search_paths
.
push_back
(
utf8_to_utf16
(
user_search_path
)
+
path_separator
());
search_paths
.
push_back
(
std
::
string
(
user_search_path
)
+
"
\\
"
);
#else
search_paths
.
push_back
(
std
::
string
(
user_search_path
)
+
"/"
);
#endif
}
}
int
best_score
=
0
;
int
best_score
=
0
;
std
::
string
best_path
;
std
::
w
string
best_path
;
namespace
fs
=
std
::
filesystem
;
namespace
fs
=
std
::
filesystem
;
for
(
const
auto
&
search_path
:
search_paths
)
{
for
(
const
auto
&
search_path
:
search_paths
)
{
...
@@ -509,27 +534,27 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
...
@@ -509,27 +534,27 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
fs
::
directory_iterator
dir_it
(
search_path
,
fs
::
directory_options
::
skip_permission_denied
);
fs
::
directory_iterator
dir_it
(
search_path
,
fs
::
directory_options
::
skip_permission_denied
);
for
(
const
auto
&
entry
:
dir_it
)
{
for
(
const
auto
&
entry
:
dir_it
)
{
if
(
entry
.
is_regular_file
())
{
if
(
entry
.
is_regular_file
())
{
std
::
string
filename
=
entry
.
path
().
filename
().
string
();
std
::
w
string
filename
=
entry
.
path
().
filename
().
w
string
();
std
::
string
ext
=
entry
.
path
().
extension
().
string
();
std
::
w
string
ext
=
entry
.
path
().
extension
().
w
string
();
if
(
filename
.
find
(
file_prefix
)
==
0
&&
ext
==
backend_filename_suffix
())
{
if
(
filename
.
find
(
file_prefix
)
==
0
&&
ext
==
backend_filename_suffix
())
{
dl_handle_ptr
handle
{
dl_load_library
(
entry
.
path
().
c_
str
())
};
dl_handle_ptr
handle
{
dl_load_library
(
entry
.
path
().
w
str
ing
())
};
if
(
!
handle
&&
!
silent
)
{
if
(
!
handle
&&
!
silent
)
{
GGML_LOG_ERROR
(
"%s: failed to load %s
\n
"
,
__func__
,
entry
.
path
().
string
().
c_str
());
GGML_LOG_ERROR
(
"%s: failed to load %s
\n
"
,
__func__
,
utf16_to_utf8
(
entry
.
path
().
w
string
()
)
.
c_str
());
}
}
if
(
handle
)
{
if
(
handle
)
{
auto
score_fn
=
(
ggml_backend_score_t
)
dl_get_sym
(
handle
.
get
(),
"ggml_backend_score"
);
auto
score_fn
=
(
ggml_backend_score_t
)
dl_get_sym
(
handle
.
get
(),
"ggml_backend_score"
);
if
(
score_fn
)
{
if
(
score_fn
)
{
int
s
=
score_fn
();
int
s
=
score_fn
();
#ifndef NDEBUG
#ifndef NDEBUG
GGML_LOG_DEBUG
(
"%s: %s score: %d
\n
"
,
__func__
,
entry
.
path
().
string
().
c_str
(),
s
);
GGML_LOG_DEBUG
(
"%s: %s score: %d
\n
"
,
__func__
,
utf16_to_utf8
(
entry
.
path
().
w
string
()
)
.
c_str
(),
s
);
#endif
#endif
if
(
s
>
best_score
)
{
if
(
s
>
best_score
)
{
best_score
=
s
;
best_score
=
s
;
best_path
=
entry
.
path
().
string
();
best_path
=
entry
.
path
().
w
string
();
}
}
}
else
{
}
else
{
if
(
!
silent
)
{
if
(
!
silent
)
{
GGML_LOG_INFO
(
"%s: failed to find ggml_backend_score in %s
\n
"
,
__func__
,
entry
.
path
().
string
().
c_str
());
GGML_LOG_INFO
(
"%s: failed to find ggml_backend_score in %s
\n
"
,
__func__
,
utf16_to_utf8
(
entry
.
path
().
w
string
()
)
.
c_str
());
}
}
}
}
}
}
...
@@ -541,15 +566,15 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
...
@@ -541,15 +566,15 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
if
(
best_score
==
0
)
{
if
(
best_score
==
0
)
{
// try to load the base backend
// try to load the base backend
for
(
const
auto
&
search_path
:
search_paths
)
{
for
(
const
auto
&
search_path
:
search_paths
)
{
std
::
string
path
=
search_path
+
backend_filename_prefix
()
+
name
+
backend_filename_suffix
();
std
::
w
string
path
=
search_path
+
backend_filename_prefix
()
+
utf8_to_utf16
(
name
)
+
backend_filename_suffix
();
if
(
fs
::
exists
(
path
))
{
if
(
fs
::
exists
(
path
))
{
return
get_reg
().
load_backend
(
path
.
c_str
()
,
silent
);
return
get_reg
().
load_backend
(
path
,
silent
);
}
}
}
}
return
nullptr
;
return
nullptr
;
}
}
return
get_reg
().
load_backend
(
best_path
.
c_str
()
,
silent
);
return
get_reg
().
load_backend
(
best_path
,
silent
);
}
}
void
ggml_backend_load_all
()
{
void
ggml_backend_load_all
()
{
...
...
llama/ggml-backend.cpp
View file @
1deafd82
/**
/**
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
* llama.cpp - commit
46e3556e01b824e52395fb050b29804b6cff2a7c
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
@@ -826,9 +826,12 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
...
@@ -826,9 +826,12 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
for
(
int
i
=
0
;
i
<
graph
->
n_nodes
;
i
++
)
{
for
(
int
i
=
0
;
i
<
graph
->
n_nodes
;
i
++
)
{
if
(
cur_split
<
sched
->
n_splits
&&
i
==
sched
->
splits
[
cur_split
].
i_start
)
{
if
(
cur_split
<
sched
->
n_splits
&&
i
==
sched
->
splits
[
cur_split
].
i_start
)
{
ggml_backend_t
split_backend
=
sched
->
backends
[
sched
->
splits
[
cur_split
].
backend_id
];
ggml_backend_t
split_backend
=
sched
->
backends
[
sched
->
splits
[
cur_split
].
backend_id
];
GGML_LOG_DEBUG
(
"
\n
## SPLIT #%d: %s # %d inputs
:
"
,
cur_split
,
ggml_backend_name
(
split_backend
),
GGML_LOG_DEBUG
(
"
\n
## SPLIT #%d: %s # %d inputs"
,
cur_split
,
ggml_backend_name
(
split_backend
),
sched
->
splits
[
cur_split
].
n_inputs
);
sched
->
splits
[
cur_split
].
n_inputs
);
for
(
int
j
=
0
;
j
<
sched
->
splits
[
cur_split
].
n_inputs
;
j
++
)
{
for
(
int
j
=
0
;
j
<
sched
->
splits
[
cur_split
].
n_inputs
;
j
++
)
{
if
(
j
==
0
)
{
GGML_LOG_DEBUG
(
": "
);
}
GGML_LOG_DEBUG
(
"[%s (%5.5s)] "
,
sched
->
splits
[
cur_split
].
inputs
[
j
]
->
name
,
GGML_LOG_DEBUG
(
"[%s (%5.5s)] "
,
sched
->
splits
[
cur_split
].
inputs
[
j
]
->
name
,
fmt_size
(
ggml_nbytes
(
sched
->
splits
[
cur_split
].
inputs
[
j
])));
fmt_size
(
ggml_nbytes
(
sched
->
splits
[
cur_split
].
inputs
[
j
])));
}
}
...
...
llama/ggml-backend.h
View file @
1deafd82
/**
/**
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
* llama.cpp - commit
46e3556e01b824e52395fb050b29804b6cff2a7c
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
...
llama/ggml-blas.cpp
View file @
1deafd82
/**
/**
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
* llama.cpp - commit
46e3556e01b824e52395fb050b29804b6cff2a7c
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
...
llama/ggml-blas.h
View file @
1deafd82
/**
/**
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
* llama.cpp - commit
46e3556e01b824e52395fb050b29804b6cff2a7c
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
...
llama/ggml-common.h
View file @
1deafd82
/**
/**
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
* llama.cpp - commit
46e3556e01b824e52395fb050b29804b6cff2a7c
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
...
llama/ggml-cpp.h
View file @
1deafd82
/**
/**
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
* llama.cpp - commit
46e3556e01b824e52395fb050b29804b6cff2a7c
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
...
llama/ggml-cpu-aarch64.cpp
View file @
1deafd82
/**
/**
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
* llama.cpp - commit
46e3556e01b824e52395fb050b29804b6cff2a7c
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
@@ -220,9 +220,12 @@ static inline __m256i sum_i16_pairs_int32x8(const __m256i x) {
...
@@ -220,9 +220,12 @@ static inline __m256i sum_i16_pairs_int32x8(const __m256i x) {
}
}
static
inline
__m256i
mul_sum_us8_pairs_int32x8
(
const
__m256i
ax
,
const
__m256i
sy
)
{
static
inline
__m256i
mul_sum_us8_pairs_int32x8
(
const
__m256i
ax
,
const
__m256i
sy
)
{
#if
defined(__AVXVNNI__) || (
defined(__AVX512VNNI__) && defined(__AVX512VL__)
)
#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
const
__m256i
zero
=
_mm256_setzero_si256
();
const
__m256i
zero
=
_mm256_setzero_si256
();
return
_mm256_dpbusd_epi32
(
zero
,
ax
,
sy
);
return
_mm256_dpbusd_epi32
(
zero
,
ax
,
sy
);
#elif defined(__AVXVNNI__)
const
__m256i
zero
=
_mm256_setzero_si256
();
return
_mm256_dpbusd_avx_epi32
(
zero
,
ax
,
sy
);
#else
#else
// Perform multiplication and create 16-bit values
// Perform multiplication and create 16-bit values
const
__m256i
dot
=
_mm256_maddubs_epi16
(
ax
,
sy
);
const
__m256i
dot
=
_mm256_maddubs_epi16
(
ax
,
sy
);
...
@@ -590,21 +593,21 @@ static void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
...
@@ -590,21 +593,21 @@ static void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
if
(
ggml_cpu_has_neon
()
&&
ggml_cpu_has_dotprod
())
{
if
(
ggml_cpu_has_neon
()
&&
ggml_cpu_has_dotprod
())
{
const
block_q4_0x4
*
b_ptr
=
(
const
block_q4_0x4
*
)
vx
;
const
block_q4_0x4
*
b_ptr
=
(
const
block_q4_0x4
*
)
vx
;
for
(
int
c
=
0
;
c
<
nc
;
c
+=
ncols_interleaved
)
{
for
(
int
c
=
0
;
c
<
nc
;
c
+=
ncols_interleaved
)
{
const
block_q8_0
*
a_ptr
=
(
const
block_q8_0
*
)
vy
;
const
block_q8_0
*
a_ptr
=
(
const
block_q8_0
*
)
vy
;
float32x4_t
acc
=
vdupq_n_f32
(
0
);
float32x4_t
acc
=
vdupq_n_f32
(
0
);
for
(
int
b
=
0
;
b
<
nb
;
b
++
)
{
for
(
int
b
=
0
;
b
<
nb
;
b
++
)
{
int8x16_t
b0
=
vld1q_s8
((
const
int8_t
*
)
b_ptr
->
qs
);
int8x16_t
b0
=
vld1q_s8
((
const
int8_t
*
)
b_ptr
->
qs
);
int8x16_t
b1
=
vld1q_s8
((
const
int8_t
*
)
b_ptr
->
qs
+
16
);
int8x16_t
b1
=
vld1q_s8
((
const
int8_t
*
)
b_ptr
->
qs
+
16
);
int8x16_t
b2
=
vld1q_s8
((
const
int8_t
*
)
b_ptr
->
qs
+
32
);
int8x16_t
b2
=
vld1q_s8
((
const
int8_t
*
)
b_ptr
->
qs
+
32
);
int8x16_t
b3
=
vld1q_s8
((
const
int8_t
*
)
b_ptr
->
qs
+
48
);
int8x16_t
b3
=
vld1q_s8
((
const
int8_t
*
)
b_ptr
->
qs
+
48
);
float16x4_t
bd
=
vld1_f16
((
const
__fp16
*
)
b_ptr
->
d
);
float16x4_t
bd
=
vld1_f16
((
const
__fp16
*
)
b_ptr
->
d
);
int8x16_t
a0
=
vld1q_s8
(
a_ptr
->
qs
);
int8x16_t
a0
=
vld1q_s8
(
a_ptr
->
qs
);
int8x16_t
a1
=
vld1q_s8
(
a_ptr
->
qs
+
qk
/
2
);
int8x16_t
a1
=
vld1q_s8
(
a_ptr
->
qs
+
qk
/
2
);
float16x4_t
ad
=
vld1_dup_f16
((
const
__fp16
*
)
&
a_ptr
->
d
);
float16x4_t
ad
=
vld1_dup_f16
((
const
__fp16
*
)
&
a_ptr
->
d
);
int32x4_t
ret
=
vdupq_n_s32
(
0
);
int32x4_t
ret
=
vdupq_n_s32
(
0
);
...
@@ -673,72 +676,52 @@ static void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
...
@@ -673,72 +676,52 @@ static void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
UNUSED
(
ncols_interleaved
);
UNUSED
(
ncols_interleaved
);
UNUSED
(
blocklen
);
UNUSED
(
blocklen
);
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
if
(
ggml_cpu_has_neon
()
&&
ggml_cpu_has_matmul_int8
())
{
if
(
ggml_cpu_has_neon
()
&&
ggml_cpu_has_dotprod
())
{
const
void
*
b_ptr
=
vx
;
const
block_q4_0x4
*
b_ptr
=
(
const
block_q4_0x4
*
)
vx
;
const
void
*
a_ptr
=
vy
;
float
*
res_ptr
=
s
;
__asm__
__volatile__
(
for
(
int
c
=
0
;
c
<
nc
;
c
+=
ncols_interleaved
)
{
"movi v2.16b, #0x4
\n
"
const
block_q8_0
*
a_ptr
=
(
const
block_q8_0
*
)
vy
;
"movi v1.16b, #0xf0
\n
"
float32x4_t
acc
=
vdupq_n_f32
(
0
);
"add %x[b_ptr], %x[b_ptr], #0x8
\n
"
for
(
int
b
=
0
;
b
<
nb
;
b
++
)
{
"1:"
// Column loop
int8x16_t
b0
=
vld1q_s8
((
const
int8_t
*
)
b_ptr
->
qs
);
"add x23, %x[a_ptr], #0x2
\n
"
int8x16_t
b1
=
vld1q_s8
((
const
int8_t
*
)
b_ptr
->
qs
+
16
);
"movi v0.16b, #0x0
\n
"
int8x16_t
b2
=
vld1q_s8
((
const
int8_t
*
)
b_ptr
->
qs
+
32
);
"mov x22, %x[nb]
\n
"
int8x16_t
b3
=
vld1q_s8
((
const
int8_t
*
)
b_ptr
->
qs
+
48
);
"2:"
// Block loop
float16x4_t
bd
=
vld1_f16
((
const
__fp16
*
)
b_ptr
->
d
);
"ldr q31, [%x[b_ptr], #0x0]
\n
"
"ldr q30, [%x[b_ptr], #0x10]
\n
"
int8x16_t
a0
=
(
int8x16_t
)
vld1q_dup_s64
((
const
int64_t
*
)
a_ptr
->
qs
);
"mov x21, x23
\n
"
int8x16_t
a1
=
(
int8x16_t
)
vld1q_dup_s64
((
const
int64_t
*
)
a_ptr
->
qs
+
1
);
"movi v29.4s, #0x0
\n
"
int8x16_t
a2
=
(
int8x16_t
)
vld1q_dup_s64
((
const
int64_t
*
)
a_ptr
->
qs
+
2
);
"ldr q28, [%x[b_ptr], #0x20]
\n
"
int8x16_t
a3
=
(
int8x16_t
)
vld1q_dup_s64
((
const
int64_t
*
)
a_ptr
->
qs
+
3
);
"ldr q27, [%x[b_ptr], #0x30]
\n
"
float16x4_t
ad
=
vld1_dup_f16
((
const
__fp16
*
)
&
a_ptr
->
d
);
"movi v26.4s, #0x0
\n
"
"sub x20, x23, #0x2
\n
"
int32x4_t
ret0
=
vdupq_n_s32
(
0
);
"ld1r { v25.8h }, [x20]
\n
"
int32x4_t
ret1
=
vdupq_n_s32
(
0
);
"ldr q24, [%x[b_ptr], #-0x8]
\n
"
"sub x22, x22, #0x1
\n
"
ret0
=
vdotq_s32
(
ret0
,
b0
<<
4
,
a0
);
"add x23, x23, #0x22
\n
"
ret1
=
vdotq_s32
(
ret1
,
b1
<<
4
,
a0
);
"ld1r { v23.2d }, [x21], #0x8
\n
"
ret0
=
vdotq_s32
(
ret0
,
b2
<<
4
,
a1
);
"sshl v22.16b, v31.16b, v2.16b
\n
"
ret1
=
vdotq_s32
(
ret1
,
b3
<<
4
,
a1
);
"sshl v16.16b, v30.16b, v2.16b
\n
"
"add %x[b_ptr], %x[b_ptr], #0x48
\n
"
ret0
=
vdotq_s32
(
ret0
,
b0
&
0xf0U
,
a2
);
"ld1r { v21.2d }, [x21], #0x8
\n
"
ret1
=
vdotq_s32
(
ret1
,
b1
&
0xf0U
,
a2
);
"sshl v20.16b, v28.16b, v2.16b
\n
"
ret0
=
vdotq_s32
(
ret0
,
b2
&
0xf0U
,
a3
);
"sshl v19.16b, v27.16b, v2.16b
\n
"
ret1
=
vdotq_s32
(
ret1
,
b3
&
0xf0U
,
a3
);
"ld1r { v18.2d }, [x21], #0x8
\n
"
"ld1r { v17.2d }, [x21], #0x8
\n
"
int32x4_t
ret
=
vpaddq_s32
(
ret0
,
ret1
);
"and v31.16b, v31.16b, v1.16b
\n
"
"and v30.16b, v30.16b, v1.16b
\n
"
acc
=
vfmaq_f32
(
acc
,
vcvtq_n_f32_s32
(
ret
,
4
),
".inst 0x4e9796dd // sdot v29.4s, v22.16b, v23.16b
\n
"
vmulq_f32
(
vcvt_f32_f16
(
ad
),
vcvt_f32_f16
(
bd
)));
".inst 0x4e97961a // sdot v26.4s, v16.16b, v23.16b
\n
"
a_ptr
++
;
"and v28.16b, v28.16b, v1.16b
\n
"
b_ptr
++
;
"and v27.16b, v27.16b, v1.16b
\n
"
}
"fcvtl v25.4s, v25.4h
\n
"
vst1q_f32
(
s
,
acc
);
"fcvtl v16.4s, v24.4h
\n
"
s
+=
ncols_interleaved
;
".inst 0x4e95969d // sdot v29.4s, v20.16b, v21.16b
\n
"
}
".inst 0x4e95967a // sdot v26.4s, v19.16b, v21.16b
\n
"
"fmul v16.4s, v16.4s, v25.4s
\n
"
".inst 0x4e9297fd // sdot v29.4s, v31.16b, v18.16b
\n
"
".inst 0x4e9297da // sdot v26.4s, v30.16b, v18.16b
\n
"
".inst 0x4e91979d // sdot v29.4s, v28.16b, v17.16b
\n
"
".inst 0x4e91977a // sdot v26.4s, v27.16b, v17.16b
\n
"
"addp v29.4s, v29.4s, v26.4s
\n
"
"scvtf v29.4s, v29.4s, #0x4
\n
"
"fmla v0.4s, v29.4s, v16.4s
\n
"
"cbnz x22, 2b
\n
"
"sub %x[nc], %x[nc], #0x4
\n
"
"str q0, [%x[res_ptr], #0x0]
\n
"
"add %x[res_ptr], %x[res_ptr], #0x10
\n
"
"cbnz %x[nc], 1b
\n
"
:
[
b_ptr
]
"+&r"
(
b_ptr
),
[
res_ptr
]
"+&r"
(
res_ptr
),
[
nc
]
"+&r"
(
nc
)
:
[
a_ptr
]
"r"
(
a_ptr
),
[
nb
]
"r"
(
nb
)
:
"memory"
,
"v0"
,
"v1"
,
"v2"
,
"v16"
,
"v17"
,
"v18"
,
"v19"
,
"v20"
,
"v21"
,
"v22"
,
"v23"
,
"v24"
,
"v25"
,
"v26"
,
"v27"
,
"v28"
,
"v29"
,
"v30"
,
"v31"
,
"x20"
,
"x21"
,
"x22"
,
"x23"
);
return
;
return
;
}
}
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_
MATMUL_INT8
)
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_
DOTPROD
)
float
sumf
[
4
];
float
sumf
[
4
];
int
sumi
;
int
sumi
;
...
...
llama/ggml-cpu-aarch64.h
View file @
1deafd82
/**
/**
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
* llama.cpp - commit
46e3556e01b824e52395fb050b29804b6cff2a7c
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
...
llama/ggml-cpu-impl.h
View file @
1deafd82
/**
/**
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
* llama.cpp - commit
46e3556e01b824e52395fb050b29804b6cff2a7c
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
...
Prev
1
2
3
4
5
…
16
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment