Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
7a81daf0
Unverified
Commit
7a81daf0
authored
Dec 14, 2024
by
Jeffrey Morgan
Committed by
GitHub
Dec 14, 2024
Browse files
llama: update vendor code to commit ba1cb19c (#8101)
parent
60f75560
Changes
273
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1021 additions
and
568 deletions
+1021
-568
llama/amx.cpp
llama/amx.cpp
+94
-70
llama/amx.h
llama/amx.h
+2
-14
llama/build-info.cpp
llama/build-info.cpp
+1
-1
llama/clip.cpp
llama/clip.cpp
+206
-26
llama/clip.h
llama/clip.h
+9
-3
llama/common.cpp
llama/common.cpp
+3
-41
llama/common.h
llama/common.h
+12
-7
llama/ggml-aarch64.c
llama/ggml-aarch64.c
+0
-155
llama/ggml-alloc.c
llama/ggml-alloc.c
+1
-1
llama/ggml-alloc.h
llama/ggml-alloc.h
+1
-1
llama/ggml-backend-impl.h
llama/ggml-backend-impl.h
+1
-1
llama/ggml-backend-reg.cpp
llama/ggml-backend-reg.cpp
+48
-15
llama/ggml-backend.cpp
llama/ggml-backend.cpp
+1
-1
llama/ggml-backend.h
llama/ggml-backend.h
+2
-1
llama/ggml-blas.cpp
llama/ggml-blas.cpp
+1
-1
llama/ggml-blas.h
llama/ggml-blas.h
+1
-1
llama/ggml-common.h
llama/ggml-common.h
+43
-49
llama/ggml-cpp.h
llama/ggml-cpp.h
+1
-1
llama/ggml-cpu-aarch64.cpp
llama/ggml-cpu-aarch64.cpp
+591
-152
llama/ggml-cpu-aarch64.h
llama/ggml-cpu-aarch64.h
+3
-27
No files found.
llama/amx.cpp
View file @
7a81daf0
/**
/**
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
@@ -31,6 +31,7 @@
...
@@ -31,6 +31,7 @@
#include "ggml-backend.h"
#include "ggml-backend.h"
#include "ggml-impl.h"
#include "ggml-impl.h"
#include "ggml-cpu.h"
#include "ggml-cpu.h"
#include "ggml-cpu-traits.h"
#if defined(__gnu_linux__)
#if defined(__gnu_linux__)
#include <sys/syscall.h>
#include <sys/syscall.h>
...
@@ -43,31 +44,65 @@
...
@@ -43,31 +44,65 @@
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
// AMX type_traits
namespace ggml::cpu::amx {

// Per-tensor traits object for the AMX path: reports the work-buffer size and
// runs the AMX matrix multiplication.
class tensor_traits : public ggml::cpu::tensor_traits {
    // Stores the size returned by ggml_backend_amx_desired_wsize(op) in `size`.
    // The thread count is ignored. Always returns true.
    bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
        size = ggml_backend_amx_desired_wsize(op);
        return true;
    }

    // Runs `op` through ggml_backend_amx_mul_mat when it is a GGML_OP_MUL_MAT.
    // Returns true if the op was handled here, false for any other op.
    bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
        if (op->op != GGML_OP_MUL_MAT) {
            return false;
        }
        ggml_backend_amx_mul_mat(params, op);
        return true;
    }
};

// Returns the address of a single function-local static traits instance;
// both arguments are ignored.
static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struct ggml_tensor *) {
    static tensor_traits shared_traits;
    return &shared_traits;
}

}
// namespace ggml::cpu::amx
// AMX buffer interface
// AMX buffer interface
static
void
ggml_backend_amx_buffer_free_buffer
(
ggml_backend_buffer_t
buffer
)
{
static
void
ggml_backend_amx_buffer_free_buffer
(
ggml_backend_buffer_t
buffer
)
{
free
(
buffer
->
context
);
free
(
buffer
->
context
);
}
}
static
void
*
ggml_backend_amx_buffer_get_base
(
ggml_backend_buffer_t
buffer
)
{
static
void
*
ggml_backend_amx_buffer_get_base
(
ggml_backend_buffer_t
buffer
)
{
return
(
void
*
)(
buffer
->
context
);
return
(
void
*
)
(
buffer
->
context
);
}
static
void
ggml_backend_amx_buffer_init_tensor
(
ggml_backend_buffer_t
buffer
,
struct
ggml_tensor
*
tensor
)
{
tensor
->
extra
=
(
void
*
)
ggml
::
cpu
::
amx
::
get_tensor_traits
(
buffer
,
tensor
);
GGML_UNUSED
(
buffer
);
}
}
static
void
ggml_backend_amx_buffer_memset_tensor
(
ggml_backend_buffer_t
buffer
,
struct
ggml_tensor
*
tensor
,
uint8_t
value
,
size_t
offset
,
size_t
size
)
{
static
void
ggml_backend_amx_buffer_memset_tensor
(
ggml_backend_buffer_t
buffer
,
struct
ggml_tensor
*
tensor
,
memset
((
char
*
)
tensor
->
data
+
offset
,
value
,
size
);
uint8_t
value
,
size_t
offset
,
size_t
size
)
{
memset
((
char
*
)
tensor
->
data
+
offset
,
value
,
size
);
GGML_UNUSED
(
buffer
);
GGML_UNUSED
(
buffer
);
}
}
static
void
ggml_backend_amx_buffer_set_tensor
(
ggml_backend_buffer_t
buffer
,
struct
ggml_tensor
*
tensor
,
const
void
*
data
,
size_t
offset
,
size_t
size
)
{
static
void
ggml_backend_amx_buffer_set_tensor
(
ggml_backend_buffer_t
buffer
,
struct
ggml_tensor
*
tensor
,
const
void
*
data
,
size_t
offset
,
size_t
size
)
{
if
(
qtype_has_amx_kernels
(
tensor
->
type
))
{
if
(
qtype_has_amx_kernels
(
tensor
->
type
))
{
GGML_LOG_DEBUG
(
"%s: amx repack tensor %s of type %s
\n
"
,
__func__
,
tensor
->
name
,
ggml_type_name
(
tensor
->
type
));
ggml_backend_amx_convert_weight
(
tensor
,
data
,
offset
,
size
);
ggml_backend_amx_convert_weight
(
tensor
,
data
,
offset
,
size
);
}
else
{
}
else
{
memcpy
((
char
*
)
tensor
->
data
+
offset
,
data
,
size
);
memcpy
((
char
*
)
tensor
->
data
+
offset
,
data
,
size
);
}
}
GGML_UNUSED
(
buffer
);
GGML_UNUSED
(
buffer
);
}
}
/*
// need to figure what we need to do with buffer->extra.
static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
memcpy(data, (const char *)tensor->data + offset, size);
memcpy(data, (const char *)tensor->data + offset, size);
...
@@ -88,6 +123,7 @@ static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
...
@@ -88,6 +123,7 @@ static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
GGML_UNUSED(buffer);
GGML_UNUSED(buffer);
}
}
*/
static
void
ggml_backend_amx_buffer_clear
(
ggml_backend_buffer_t
buffer
,
uint8_t
value
)
{
static
void
ggml_backend_amx_buffer_clear
(
ggml_backend_buffer_t
buffer
,
uint8_t
value
)
{
memset
(
buffer
->
context
,
value
,
buffer
->
size
);
memset
(
buffer
->
context
,
value
,
buffer
->
size
);
...
@@ -96,13 +132,13 @@ static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
...
@@ -96,13 +132,13 @@ static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
static
ggml_backend_buffer_i
ggml_backend_amx_buffer_interface
=
{
static
ggml_backend_buffer_i
ggml_backend_amx_buffer_interface
=
{
/* .free_buffer = */
ggml_backend_amx_buffer_free_buffer
,
/* .free_buffer = */
ggml_backend_amx_buffer_free_buffer
,
/* .get_base = */
ggml_backend_amx_buffer_get_base
,
/* .get_base = */
ggml_backend_amx_buffer_get_base
,
/* .init_tensor = */
NULL
,
// no initialization required
/* .init_tensor = */
ggml_backend_amx_buffer_init_tensor
,
/* .memset_tensor = */
ggml_backend_amx_buffer_memset_tensor
,
/* .memset_tensor = */
ggml_backend_amx_buffer_memset_tensor
,
/* .set_tensor = */
ggml_backend_amx_buffer_set_tensor
,
/* .set_tensor = */
ggml_backend_amx_buffer_set_tensor
,
/* .get_tensor = */
ggml_backend_amx_buffer_get_tenso
r
,
/* .get_tensor = */
nullpt
r
,
/* .cpy_tensor = */
ggml_backend_amx_buffer_cpy_tenso
r
,
/* .cpy_tensor = */
nullpt
r
,
/* .clear = */
ggml_backend_amx_buffer_clear
,
/* .clear = */
ggml_backend_amx_buffer_clear
,
/* .reset = */
NULL
,
/* .reset = */
nullptr
,
};
};
static
const
char
*
ggml_backend_amx_buffer_type_get_name
(
ggml_backend_buffer_type_t
buft
)
{
static
const
char
*
ggml_backend_amx_buffer_type_get_name
(
ggml_backend_buffer_type_t
buft
)
{
...
@@ -112,7 +148,7 @@ static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_ty
...
@@ -112,7 +148,7 @@ static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_ty
}
}
static
ggml_backend_buffer_t
ggml_backend_amx_buffer_type_alloc_buffer
(
ggml_backend_buffer_type_t
buft
,
size_t
size
)
{
static
ggml_backend_buffer_t
ggml_backend_amx_buffer_type_alloc_buffer
(
ggml_backend_buffer_type_t
buft
,
size_t
size
)
{
void
*
data
=
aligned_alloc
(
TENSOR_ALIGNMENT
,
size
);
void
*
data
=
ggml_
aligned_
m
alloc
(
size
);
if
(
data
==
NULL
)
{
if
(
data
==
NULL
)
{
fprintf
(
stderr
,
"%s: failed to allocate buffer of size %zu
\n
"
,
__func__
,
size
);
fprintf
(
stderr
,
"%s: failed to allocate buffer of size %zu
\n
"
,
__func__
,
size
);
return
NULL
;
return
NULL
;
...
@@ -127,14 +163,44 @@ static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_typ
...
@@ -127,14 +163,44 @@ static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_typ
GGML_UNUSED
(
buft
);
GGML_UNUSED
(
buft
);
}
}
static
size_t
ggml_backend_amx_buffer_type_get_alloc_size
(
ggml_backend_buffer_type_t
buft
,
const
ggml_tensor
*
tensor
)
{
namespace
ggml
::
cpu
::
amx
{
return
ggml_backend_amx_get_alloc_size
(
tensor
);
class
extra_buffer_type
:
ggml
::
cpu
::
extra_buffer_type
{
bool
supports_op
(
ggml_backend_dev_t
,
const
struct
ggml_tensor
*
op
)
override
{
GGML_UNUSED
(
buft
);
// handle only 2d gemm for now
}
auto
is_contiguous_2d
=
[](
const
struct
ggml_tensor
*
t
)
{
return
ggml_is_contiguous
(
t
)
&&
t
->
ne
[
3
]
==
1
&&
t
->
ne
[
2
]
==
1
;
};
static
bool
ggml_backend_amx_buffer_type_is_host
(
ggml_backend_buffer_type_t
buft
)
{
if
(
op
->
op
==
GGML_OP_MUL_MAT
&&
is_contiguous_2d
(
op
->
src
[
0
])
&&
// src0 must be contiguous
is_contiguous_2d
(
op
->
src
[
1
])
&&
// src1 must be contiguous
op
->
src
[
0
]
->
buffer
&&
op
->
src
[
0
]
->
buffer
->
buft
==
ggml_backend_amx_buffer_type
()
&&
op
->
ne
[
0
]
%
(
TILE_N
*
2
)
==
0
&&
// out_features is 32x
(
qtype_has_amx_kernels
(
op
->
src
[
0
]
->
type
)
||
(
op
->
src
[
0
]
->
type
==
GGML_TYPE_F16
)))
{
// src1 must be host buffer
if
(
op
->
src
[
1
]
->
buffer
&&
!
ggml_backend_buft_is_host
(
op
->
src
[
1
]
->
buffer
->
buft
))
{
return
false
;
}
// src1 must be float32
if
(
op
->
src
[
1
]
->
type
==
GGML_TYPE_F32
)
{
return
true
;
}
}
return
false
;
return
false
;
}
ggml
::
cpu
::
tensor_traits
*
get_tensor_traits
(
const
struct
ggml_tensor
*
op
)
override
{
if
(
op
->
op
==
GGML_OP_MUL_MAT
&&
op
->
src
[
0
]
->
buffer
&&
op
->
src
[
0
]
->
buffer
->
buft
==
ggml_backend_amx_buffer_type
())
{
return
(
ggml
::
cpu
::
tensor_traits
*
)
op
->
src
[
0
]
->
extra
;
}
return
nullptr
;
}
};
}
// namespace ggml::cpu::amx
static
size_t
ggml_backend_amx_buffer_type_get_alloc_size
(
ggml_backend_buffer_type_t
buft
,
const
ggml_tensor
*
tensor
)
{
return
ggml_backend_amx_get_alloc_size
(
tensor
);
GGML_UNUSED
(
buft
);
GGML_UNUSED
(
buft
);
}
}
...
@@ -155,68 +221,26 @@ static bool ggml_amx_init() {
...
@@ -155,68 +221,26 @@ static bool ggml_amx_init() {
return
true
;
return
true
;
#endif
#endif
}
}
ggml_backend_buffer_type_t
ggml_backend_amx_buffer_type
()
{
ggml_backend_buffer_type_t
ggml_backend_amx_buffer_type
()
{
static
struct
ggml_backend_buffer_type
ggml_backend_buffer_type_amx
=
{
static
struct
ggml_backend_buffer_type
ggml_backend_buffer_type_amx
=
{
/* .iface = */
{
/* .iface = */
{
/* .get_name = */
ggml_backend_amx_buffer_type_get_name
,
/* .get_name = */
ggml_backend_amx_buffer_type_get_name
,
/* .alloc_buffer = */
ggml_backend_amx_buffer_type_alloc_buffer
,
/* .alloc_buffer = */
ggml_backend_amx_buffer_type_alloc_buffer
,
/* .get_alignment = */
ggml_backend_amx_buffer_type_get_alignment
,
/* .get_alignment = */
ggml_backend_amx_buffer_type_get_alignment
,
/* .get_max_size = */
NULL
,
// defaults to SIZE_MAX
/* .get_max_size = */
nullptr
,
// defaults to SIZE_MAX
/* .get_alloc_size = */
ggml_backend_amx_buffer_type_get_alloc_size
,
/* .get_alloc_size = */
ggml_backend_amx_buffer_type_get_alloc_size
,
/* .is_host = */
ggml_backend_amx_buffer_type_is_host
,
/* .is_host = */
nullptr
,
},
},
/* .device = */
ggml_backend_reg_dev_get
(
ggml_backend_cpu_reg
(),
0
),
/* .device = */
ggml_backend_reg_dev_get
(
ggml_backend_cpu_reg
(),
0
),
/* .context = */
NULL
,
/* .context = */
new
ggml
::
cpu
::
amx
::
extra_buffer_type
()
,
};
};
if
(
!
ggml_amx_init
())
{
if
(
!
ggml_amx_init
())
{
return
NULL
;
return
nullptr
;
}
}
return
&
ggml_backend_buffer_type_amx
;
return
&
ggml_backend_buffer_type_amx
;
}
}
// Returns true when `buft` uses the AMX buffer type's get_name callback,
// i.e. the check is done by comparing function pointers, not name strings.
bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft) {
    const auto name_callback = buft->iface.get_name;
    return name_callback == ggml_backend_amx_buffer_type_get_name;
}
bool
ggml_backend_amx_device_supports_op
(
const
struct
ggml_tensor
*
op
)
{
// handle only 2d gemm for now
auto
is_contiguous_2d
=
[](
const
struct
ggml_tensor
*
t
)
{
return
ggml_is_contiguous
(
t
)
&&
t
->
ne
[
3
]
==
1
&&
t
->
ne
[
2
]
==
1
;
};
switch
(
op
->
op
)
{
case
GGML_OP_NONE
:
case
GGML_OP_RESHAPE
:
case
GGML_OP_VIEW
:
case
GGML_OP_PERMUTE
:
case
GGML_OP_TRANSPOSE
:
return
true
;
case
GGML_OP_MUL_MAT
:
{
const
struct
ggml_tensor
*
src0
=
op
->
src
[
0
];
const
struct
ggml_tensor
*
src1
=
op
->
src
[
1
];
const
enum
ggml_type
type
=
src0
->
type
;
const
int64_t
ne0
=
op
->
ne
[
0
];
// amx kernels enables for Q4_0, Q4_1, Q8_0, F16
// Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256
bool
has_amx_kernels
=
qtype_has_amx_kernels
(
type
)
||
(
type
==
GGML_TYPE_F16
);
bool
can_use_amx
=
is_contiguous_2d
(
src0
)
&&
// src0 must be contiguous
is_contiguous_2d
(
src1
)
&&
// src1 must be contiguous
src1
->
type
==
GGML_TYPE_F32
&&
// src1 must be float32
has_amx_kernels
&&
// with amx kernel impls
ne0
%
(
TILE_N
*
2
)
==
0
;
// out_features is 32x
return
can_use_amx
;
}
default:
return
false
;
}
}
#endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__)
#endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__)
llama/amx.h
View file @
7a81daf0
/**
/**
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
@@ -27,20 +27,8 @@
...
@@ -27,20 +27,8 @@
#include "ggml-backend.h"
#include "ggml-backend.h"
#include "ggml-cpu-impl.h"
#include "ggml-cpu-impl.h"
#ifdef __cplusplus
// GGML internal header
extern
"C"
{
#endif
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
ggml_backend_buffer_type_t
ggml_backend_amx_buffer_type
(
void
);
ggml_backend_buffer_type_t
ggml_backend_amx_buffer_type
(
void
);
bool
ggml_backend_amx_buft_is_amx
(
ggml_backend_buffer_type_t
buft
);
bool
ggml_backend_amx_device_supports_op
(
const
struct
ggml_tensor
*
op
);
void
ggml_backend_amx_mul_mat
(
const
struct
ggml_compute_params
*
params
,
struct
ggml_tensor
*
dst
);
size_t
ggml_backend_amx_desired_wsize
(
const
struct
ggml_tensor
*
dst
);
#endif
#ifdef __cplusplus
}
#endif
#endif
llama/build-info.cpp
View file @
7a81daf0
int
LLAMA_BUILD_NUMBER
=
0
;
int
LLAMA_BUILD_NUMBER
=
0
;
char
const
*
LLAMA_COMMIT
=
"
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
"
;
char
const
*
LLAMA_COMMIT
=
"
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
"
;
char
const
*
LLAMA_COMPILER
=
""
;
char
const
*
LLAMA_COMPILER
=
""
;
char
const
*
LLAMA_BUILD_TARGET
=
""
;
char
const
*
LLAMA_BUILD_TARGET
=
""
;
llama/clip.cpp
View file @
7a81daf0
This diff is collapsed.
Click to expand it.
llama/clip.h
View file @
7a81daf0
/**
/**
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
@@ -71,6 +71,7 @@ CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity
...
@@ -71,6 +71,7 @@ CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity
CLIP_API
void
clip_free
(
struct
clip_ctx
*
ctx
);
CLIP_API
void
clip_free
(
struct
clip_ctx
*
ctx
);
CLIP_API
size_t
clip_embd_nbytes
(
const
struct
clip_ctx
*
ctx
);
CLIP_API
size_t
clip_embd_nbytes
(
const
struct
clip_ctx
*
ctx
);
CLIP_API
size_t
clip_embd_nbytes_by_img
(
const
struct
clip_ctx
*
ctx
,
int
img_h
,
int
img_w
);
CLIP_API
int32_t
clip_image_size
(
const
struct
clip_ctx
*
ctx
);
CLIP_API
int32_t
clip_image_size
(
const
struct
clip_ctx
*
ctx
);
CLIP_API
int32_t
clip_patch_size
(
const
struct
clip_ctx
*
ctx
);
CLIP_API
int32_t
clip_patch_size
(
const
struct
clip_ctx
*
ctx
);
...
@@ -82,10 +83,12 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
...
@@ -82,10 +83,12 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
CLIP_API
const
int32_t
*
clip_image_grid
(
const
struct
clip_ctx
*
ctx
);
CLIP_API
const
int32_t
*
clip_image_grid
(
const
struct
clip_ctx
*
ctx
);
CLIP_API
int
clip_n_patches
(
const
struct
clip_ctx
*
ctx
);
CLIP_API
int
clip_n_patches
(
const
struct
clip_ctx
*
ctx
);
CLIP_API
int
clip_n_mmproj_embd
(
const
struct
clip_ctx
*
ctx
);
CLIP_API
int
clip_n_patches_by_img
(
const
struct
clip_ctx
*
ctx
,
struct
clip_image_f32
*
img
);
CLIP_API
int
clip_n_mmproj_embd
(
const
struct
clip_ctx
*
ctx
);
CLIP_API
int
clip_uhd_num_image_embeds_col
(
struct
clip_ctx
*
ctx_clip
);
CLIP_API
int
clip_uhd_num_image_embeds_col
(
struct
clip_ctx
*
ctx_clip
);
CLIP_API
void
clip_add_load_image_size
(
struct
clip_ctx
*
ctx_clip
,
struct
clip_image_size
*
load_image_size
);
CLIP_API
void
clip_add_load_image_size
(
struct
clip_ctx
*
ctx_clip
,
struct
clip_image_size
*
load_image_size
);
CLIP_API
struct
clip_image_size
*
clip_get_load_image_size
(
struct
clip_ctx
*
ctx_clip
);
CLIP_API
struct
clip_image_size
*
clip_image_size_init
();
CLIP_API
struct
clip_image_size
*
clip_image_size_init
();
CLIP_API
struct
clip_image_u8
*
clip_image_u8_init
();
CLIP_API
struct
clip_image_u8
*
clip_image_u8_init
();
...
@@ -112,6 +115,9 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
...
@@ -112,6 +115,9 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
CLIP_API
bool
clip_model_quantize
(
const
char
*
fname_inp
,
const
char
*
fname_out
,
int
itype
);
CLIP_API
bool
clip_model_quantize
(
const
char
*
fname_inp
,
const
char
*
fname_out
,
int
itype
);
CLIP_API
int
clip_is_minicpmv
(
const
struct
clip_ctx
*
ctx
);
CLIP_API
int
clip_is_minicpmv
(
const
struct
clip_ctx
*
ctx
);
CLIP_API
bool
clip_is_qwen2vl
(
const
struct
clip_ctx
*
ctx
);
CLIP_API
bool
clip_encode_float_image
(
struct
clip_ctx
*
ctx
,
int
n_threads
,
float
*
img
,
int
h
,
int
w
,
float
*
vec
);
#ifdef __cplusplus
#ifdef __cplusplus
}
}
...
...
llama/common.cpp
View file @
7a81daf0
/**
/**
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
@@ -1041,38 +1041,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
...
@@ -1041,38 +1041,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
return
mparams
;
return
mparams
;
}
}
// Maps a KV-cache type name ("f32", "f16", "bf16", "q8_0", "q4_0", "q4_1",
// "iq4_nl", "q5_0", "q5_1") to the matching ggml_type.
// Throws std::runtime_error for any other string.
static ggml_type kv_cache_type_from_str(const std::string & s) {
    static const struct {
        const char * name;
        ggml_type    type;
    } known_types[] = {
        { "f32",    GGML_TYPE_F32    },
        { "f16",    GGML_TYPE_F16    },
        { "bf16",   GGML_TYPE_BF16   },
        { "q8_0",   GGML_TYPE_Q8_0   },
        { "q4_0",   GGML_TYPE_Q4_0   },
        { "q4_1",   GGML_TYPE_Q4_1   },
        { "iq4_nl", GGML_TYPE_IQ4_NL },
        { "q5_0",   GGML_TYPE_Q5_0   },
        { "q5_1",   GGML_TYPE_Q5_1   },
    };

    for (const auto & candidate : known_types) {
        if (s == candidate.name) {
            return candidate.type;
        }
    }

    throw std::runtime_error("Unsupported cache type: " + s);
}
struct
llama_context_params
common_context_params_to_llama
(
const
common_params
&
params
)
{
struct
llama_context_params
common_context_params_to_llama
(
const
common_params
&
params
)
{
auto
cparams
=
llama_context_default_params
();
auto
cparams
=
llama_context_default_params
();
...
@@ -1107,8 +1075,8 @@ struct llama_context_params common_context_params_to_llama(const common_params &
...
@@ -1107,8 +1075,8 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams
.
pooling_type
=
LLAMA_POOLING_TYPE_RANK
;
cparams
.
pooling_type
=
LLAMA_POOLING_TYPE_RANK
;
}
}
cparams
.
type_k
=
kv_cache_type_from_str
(
params
.
cache_type_k
)
;
cparams
.
type_k
=
params
.
cache_type_k
;
cparams
.
type_v
=
kv_cache_type_from_str
(
params
.
cache_type_v
)
;
cparams
.
type_v
=
params
.
cache_type_v
;
return
cparams
;
return
cparams
;
}
}
...
@@ -1134,12 +1102,6 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
...
@@ -1134,12 +1102,6 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
#define CURL_MAX_RETRY 3
#define CURL_MAX_RETRY 3
#define CURL_RETRY_DELAY_SECONDS 2
#define CURL_RETRY_DELAY_SECONDS 2
// Returns true when `str` begins with `prefix` (an empty prefix always matches).
// Stand-in until C++20's std::string::starts_with can be used.
static bool starts_with(const std::string & str, const std::string & prefix) {
    // Compare only the leading prefix.size() characters of str against prefix;
    // a str shorter than prefix compares unequal.
    const bool has_prefix = str.compare(0, prefix.size(), prefix) == 0;
    return has_prefix;
}
static
bool
curl_perform_with_retry
(
const
std
::
string
&
url
,
CURL
*
curl
,
int
max_attempts
,
int
retry_delay_seconds
)
{
static
bool
curl_perform_with_retry
(
const
std
::
string
&
url
,
CURL
*
curl
,
int
max_attempts
,
int
retry_delay_seconds
)
{
int
remaining_attempts
=
max_attempts
;
int
remaining_attempts
=
max_attempts
;
...
...
llama/common.h
View file @
7a81daf0
/**
/**
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
@@ -63,9 +63,9 @@ using llama_tokens = std::vector<llama_token>;
...
@@ -63,9 +63,9 @@ using llama_tokens = std::vector<llama_token>;
// build info
// build info
extern
int
LLAMA_BUILD_NUMBER
;
extern
int
LLAMA_BUILD_NUMBER
;
extern
c
har
const
*
LLAMA_COMMIT
;
extern
c
onst
char
*
LLAMA_COMMIT
;
extern
c
har
const
*
LLAMA_COMPILER
;
extern
c
onst
char
*
LLAMA_COMPILER
;
extern
c
har
const
*
LLAMA_BUILD_TARGET
;
extern
c
onst
char
*
LLAMA_BUILD_TARGET
;
struct
common_control_vector_load_info
;
struct
common_control_vector_load_info
;
...
@@ -241,7 +241,7 @@ struct common_params {
...
@@ -241,7 +241,7 @@ struct common_params {
struct
common_params_speculative
speculative
;
struct
common_params_speculative
speculative
;
std
::
string
model
=
""
;
// model path // NOLINT
std
::
string
model
=
""
;
// model path // NOLINT
std
::
string
model_alias
=
"
unknown
"
;
// model alias // NOLINT
std
::
string
model_alias
=
""
;
// model alias
// NOLINT
std
::
string
model_url
=
""
;
// model url to download // NOLINT
std
::
string
model_url
=
""
;
// model url to download // NOLINT
std
::
string
hf_token
=
""
;
// HF token // NOLINT
std
::
string
hf_token
=
""
;
// HF token // NOLINT
std
::
string
hf_repo
=
""
;
// HF repo // NOLINT
std
::
string
hf_repo
=
""
;
// HF repo // NOLINT
...
@@ -312,8 +312,8 @@ struct common_params {
...
@@ -312,8 +312,8 @@ struct common_params {
bool
warmup
=
true
;
// warmup run
bool
warmup
=
true
;
// warmup run
bool
check_tensors
=
false
;
// validate tensor data
bool
check_tensors
=
false
;
// validate tensor data
std
::
string
cache_type_k
=
"f
16
"
;
// KV cache data type for the K
ggml_type
cache_type_k
=
GGML_TYPE_F
16
;
// KV cache data type for the K
std
::
string
cache_type_v
=
"f
16
"
;
// KV cache data type for the V
ggml_type
cache_type_v
=
GGML_TYPE_F
16
;
// KV cache data type for the V
// multimodal models (see examples/llava)
// multimodal models (see examples/llava)
std
::
string
mmproj
=
""
;
// path to multimodal projector // NOLINT
std
::
string
mmproj
=
""
;
// path to multimodal projector // NOLINT
...
@@ -463,6 +463,11 @@ std::vector<std::string> string_split<std::string>(const std::string & input, ch
...
@@ -463,6 +463,11 @@ std::vector<std::string> string_split<std::string>(const std::string & input, ch
return
parts
;
return
parts
;
}
}
// Prefix test for std::string: true when the first prefix.size() characters of
// `str` equal `prefix`. An empty prefix matches every string.
// Kept as a helper while C++20's std::string::starts_with is not available.
static bool string_starts_with(const std::string & str, const std::string & prefix) {
    if (prefix.size() > str.size()) {
        return false; // str is too short to contain prefix
    }
    return str.compare(0, prefix.size(), prefix) == 0;
}
bool
string_parse_kv_override
(
const
char
*
data
,
std
::
vector
<
llama_model_kv_override
>
&
overrides
);
bool
string_parse_kv_override
(
const
char
*
data
,
std
::
vector
<
llama_model_kv_override
>
&
overrides
);
void
string_process_escapes
(
std
::
string
&
input
);
void
string_process_escapes
(
std
::
string
&
input
);
...
...
llama/ggml-aarch64.c
deleted
100644 → 0
View file @
60f75560
/**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "ggml-aarch64.h"
#include "ggml-impl.h"
#include "ggml-quants.h"
#include <assert.h>
#define UNUSED GGML_UNUSED
// Builds one block_q4_0x4 from the four block_q4_0 entries in[0..3].
// The four deltas are copied first. The quants are then copied in chunks of
// blck_size_interleave bytes, taking chunks from the four inputs in round-robin
// order; every copied chunk is XORed with 0x88 in each byte, which flips the
// high bit of each 4-bit nibble.
// Only blck_size_interleave values of 4 and 8 are handled; anything else hits
// GGML_ASSERT(false).
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
    block_q4_0x4 out;

    for (int row = 0; row < 4; row++) {
        out.d[row] = in[row].d;
    }

    const int n_chunks = QK4_0 * 2 / blck_size_interleave;

    switch (blck_size_interleave) {
        case 8: {
            for (int chunk = 0; chunk < n_chunks; ++chunk) {
                const int row  = chunk % 4;
                const int from = (chunk / 4) * blck_size_interleave;
                const int to   = chunk * blck_size_interleave;

                // memcpy keeps the 8-byte load/store free of alignment requirements
                uint64_t word;
                memcpy(&word, &in[row].qs[from], sizeof(uint64_t));
                word ^= 0x8888888888888888ULL;
                memcpy(&out.qs[to], &word, sizeof(uint64_t));
            }
        } break;

        case 4: {
            for (int chunk = 0; chunk < n_chunks; ++chunk) {
                const int row  = chunk % 4;
                const int from = (chunk / 4) * blck_size_interleave;
                const int to   = chunk * blck_size_interleave;

                uint32_t word;
                memcpy(&word, &in[row].qs[from], sizeof(uint32_t));
                word ^= 0x88888888;
                memcpy(&out.qs[to], &word, sizeof(uint32_t));
            }
        } break;

        default:
            GGML_ASSERT(false);
    }

    return out;
}
// interleave 8 block_q4_0s in blocks of blck_size_interleave
// returns an interleaved block_q4_0x8
// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
// Builds one block_q4_0x8 from the eight block_q4_0 entries in[0..7]:
// deltas first, then the quants copied in 8-byte chunks taken round-robin from
// the eight inputs. Each chunk is XORed with 0x88 in every byte (flips the high
// bit of each nibble). The destination offset advances by blck_size_interleave
// per chunk while the copy width is fixed at sizeof(uint64_t).
static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
    block_q4_0x8 out;

    for (int row = 0; row < 8; row++) {
        out.d[row] = in[row].d;
    }

    const int      n_chunks  = QK4_0 * 4 / blck_size_interleave;
    const uint64_t flip_mask = 0x8888888888888888ULL;

    for (int chunk = 0; chunk < n_chunks; ++chunk) {
        const int row  = chunk % 8;
        const int from = (chunk / 8) * blck_size_interleave;
        const int to   = chunk * blck_size_interleave;

        // go through memcpy so the 8-byte access has no alignment requirement
        uint64_t word;
        memcpy(&word, &in[row].qs[from], sizeof(uint64_t));
        word ^= flip_mask;
        memcpy(&out.qs[to], &word, sizeof(uint64_t));
    }

    return out;
}
// Quantizes `nrow` rows of `n_per_row` floats from `src` to q4_0 and writes
// them to `dst` as interleaved blocks.
//
// Rows are processed in groups of `nrows_interleaved`. For each group and each
// QK4_0-wide column block, one block_q4_0 per row is produced with
// quantize_row_q4_0_ref and the group is packed with make_block_q4_0x8
// (nrows_interleaved == 8) or make_block_q4_0x4 (nrows_interleaved == 4),
// using `blck_size_interleave` as the chunk size.
//
// n_per_row must be a multiple of QK4_0 and nrows_interleaved must be <= 8
// (both checked with assert). For any nrows_interleaved other than 4 or 8
// nothing is written to dst.
//
// Returns (nrow * n_per_row) / QK4_0 * sizeof(block_q4_0).
static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, int nrows_interleaved, int blck_size_interleave) {
    assert(n_per_row % QK4_0 == 0);
    const int nb = n_per_row / QK4_0;

    void * out_ptr = NULL;
    if (nrows_interleaved == 8) {
        out_ptr = (block_q4_0x8 *) dst;
    } else if (nrows_interleaved == 4) {
        out_ptr = (block_q4_0x4 *) dst;
    }

    assert(nrows_interleaved <= 8);
    block_q4_0 dst_tmp[8];

    // The element offset is 64-bit: nrow * n_per_row and the per-group stride
    // are int64_t products, and a plain int counter overflows (undefined
    // behavior) once the tensor holds more than INT_MAX elements.
    const int64_t n_elements   = nrow * n_per_row;
    const int64_t group_stride = nrows_interleaved * n_per_row;

    for (int64_t b = 0; b < n_elements; b += group_stride) {
        for (int64_t x = 0; x < nb; x++) {
            // quantize the x-th column block of every row in this group
            for (int i = 0; i < nrows_interleaved; i++) {
                quantize_row_q4_0_ref(src + b + i * n_per_row + x * QK4_0, (block_q4_0 *) dst_tmp + i, QK4_0);
            }

            // pack the group into one interleaved block and advance the output
            if (nrows_interleaved == 8) {
                *(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blck_size_interleave);
                out_ptr = (block_q4_0x8 *) out_ptr + 1;
            } else if (nrows_interleaved == 4) {
                *(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave);
                out_ptr = (block_q4_0x4 *) out_ptr + 1;
            }
        }
    }

    return ((nrow * n_per_row) / QK4_0 * sizeof(block_q4_0));
}
// q4_0 quantization with 4 rows interleaved in 4-byte chunks.
// `quant_weights` is accepted but not used; the return value is whatever
// quantize_q4_0_nr_bl returns.
size_t quantize_q4_0_4x4(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
    UNUSED(quant_weights);

    const int rows_interleaved = 4;
    const int chunk_bytes      = 4;
    return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, rows_interleaved, chunk_bytes);
}
// q4_0 quantization with 4 rows interleaved in 8-byte chunks.
// `quant_weights` is accepted but not used; the return value is whatever
// quantize_q4_0_nr_bl returns.
size_t quantize_q4_0_4x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
    UNUSED(quant_weights);

    const int rows_interleaved = 4;
    const int chunk_bytes      = 8;
    return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, rows_interleaved, chunk_bytes);
}
// q4_0 quantization with 8 rows interleaved in 8-byte chunks.
// `quant_weights` is accepted but not used; the return value is whatever
// quantize_q4_0_nr_bl returns.
size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
    UNUSED(quant_weights);

    const int rows_interleaved = 8;
    const int chunk_bytes      = 8;
    return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, rows_interleaved, chunk_bytes);
}
llama/ggml-alloc.c
View file @
7a81daf0
/**
/**
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
...
llama/ggml-alloc.h
View file @
7a81daf0
/**
/**
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
...
llama/ggml-backend-impl.h
View file @
7a81daf0
/**
/**
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
...
llama/ggml-backend-reg.cpp
View file @
7a81daf0
/**
/**
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
@@ -72,6 +72,10 @@
...
@@ -72,6 +72,10 @@
#include "ggml-vulkan.h"
#include "ggml-vulkan.h"
#endif
#endif
#ifdef GGML_USE_OPENCL
#include "ggml-opencl.h"
#endif
#ifdef GGML_USE_BLAS
#ifdef GGML_USE_BLAS
#include "ggml-blas.h"
#include "ggml-blas.h"
#endif
#endif
...
@@ -172,6 +176,9 @@ struct ggml_backend_registry {
...
@@ -172,6 +176,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_VULKAN
#ifdef GGML_USE_VULKAN
register_backend
(
ggml_backend_vk_reg
());
register_backend
(
ggml_backend_vk_reg
());
#endif
#endif
#ifdef GGML_USE_OPENCL
register_backend
(
ggml_backend_opencl_reg
());
#endif
#ifdef GGML_USE_CANN
#ifdef GGML_USE_CANN
register_backend
(
ggml_backend_cann_reg
());
register_backend
(
ggml_backend_cann_reg
());
#endif
#endif
...
@@ -475,11 +482,21 @@ static std::string backend_filename_suffix() {
...
@@ -475,11 +482,21 @@ static std::string backend_filename_suffix() {
#endif
#endif
}
}
static
ggml_backend_reg_t
ggml_backend_load_best
(
const
char
*
name
,
bool
silent
)
{
static
ggml_backend_reg_t
ggml_backend_load_best
(
const
char
*
name
,
bool
silent
,
const
char
*
user_search_path
)
{
// enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
// enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
// TODO: search system paths
// TODO: search system paths
std
::
vector
<
std
::
string
>
search_paths
=
{
"./"
,
get_executable_path
()
};
std
::
string
file_prefix
=
backend_filename_prefix
()
+
name
+
"-"
;
std
::
string
file_prefix
=
backend_filename_prefix
()
+
name
+
"-"
;
std
::
vector
<
std
::
string
>
search_paths
;
if
(
user_search_path
==
nullptr
)
{
search_paths
.
push_back
(
"./"
);
search_paths
.
push_back
(
get_executable_path
());
}
else
{
#if defined(_WIN32)
search_paths
.
push_back
(
std
::
string
(
user_search_path
)
+
"
\\
"
);
#else
search_paths
.
push_back
(
std
::
string
(
user_search_path
)
+
"/"
);
#endif
}
int
best_score
=
0
;
int
best_score
=
0
;
std
::
string
best_path
;
std
::
string
best_path
;
...
@@ -489,7 +506,8 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent)
...
@@ -489,7 +506,8 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent)
if
(
!
fs
::
exists
(
search_path
))
{
if
(
!
fs
::
exists
(
search_path
))
{
continue
;
continue
;
}
}
for
(
const
auto
&
entry
:
fs
::
directory_iterator
(
search_path
))
{
fs
::
directory_iterator
dir_it
(
search_path
,
fs
::
directory_options
::
skip_permission_denied
);
for
(
const
auto
&
entry
:
dir_it
)
{
if
(
entry
.
is_regular_file
())
{
if
(
entry
.
is_regular_file
())
{
std
::
string
filename
=
entry
.
path
().
filename
().
string
();
std
::
string
filename
=
entry
.
path
().
filename
().
string
();
std
::
string
ext
=
entry
.
path
().
extension
().
string
();
std
::
string
ext
=
entry
.
path
().
extension
().
string
();
...
@@ -509,6 +527,10 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent)
...
@@ -509,6 +527,10 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent)
best_score
=
s
;
best_score
=
s
;
best_path
=
entry
.
path
().
string
();
best_path
=
entry
.
path
().
string
();
}
}
}
else
{
if
(
!
silent
)
{
GGML_LOG_INFO
(
"%s: failed to find ggml_backend_score in %s
\n
"
,
__func__
,
entry
.
path
().
string
().
c_str
());
}
}
}
}
}
}
}
...
@@ -531,15 +553,26 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent)
...
@@ -531,15 +553,26 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent)
}
}
void
ggml_backend_load_all
()
{
void
ggml_backend_load_all
()
{
ggml_backend_load_best
(
"blas"
,
true
);
ggml_backend_load_all_from_path
(
nullptr
);
ggml_backend_load_best
(
"cann"
,
true
);
}
ggml_backend_load_best
(
"cuda"
,
true
);
ggml_backend_load_best
(
"hip"
,
true
);
void
ggml_backend_load_all_from_path
(
const
char
*
dir_path
)
{
ggml_backend_load_best
(
"kompute"
,
true
);
#ifdef NDEBUG
ggml_backend_load_best
(
"metal"
,
true
);
bool
silent
=
true
;
ggml_backend_load_best
(
"rpc"
,
true
);
#else
ggml_backend_load_best
(
"sycl"
,
true
);
bool
silent
=
false
;
ggml_backend_load_best
(
"vulkan"
,
true
);
#endif
ggml_backend_load_best
(
"musa"
,
true
);
ggml_backend_load_best
(
"cpu"
,
true
);
ggml_backend_load_best
(
"blas"
,
silent
,
dir_path
);
ggml_backend_load_best
(
"cann"
,
silent
,
dir_path
);
ggml_backend_load_best
(
"cuda"
,
silent
,
dir_path
);
ggml_backend_load_best
(
"hip"
,
silent
,
dir_path
);
ggml_backend_load_best
(
"kompute"
,
silent
,
dir_path
);
ggml_backend_load_best
(
"metal"
,
silent
,
dir_path
);
ggml_backend_load_best
(
"rpc"
,
silent
,
dir_path
);
ggml_backend_load_best
(
"sycl"
,
silent
,
dir_path
);
ggml_backend_load_best
(
"vulkan"
,
silent
,
dir_path
);
ggml_backend_load_best
(
"opencl"
,
silent
,
dir_path
);
ggml_backend_load_best
(
"musa"
,
silent
,
dir_path
);
ggml_backend_load_best
(
"cpu"
,
silent
,
dir_path
);
}
}
llama/ggml-backend.cpp
View file @
7a81daf0
/**
/**
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
...
llama/ggml-backend.h
View file @
7a81daf0
/**
/**
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
@@ -254,6 +254,7 @@ extern "C" {
...
@@ -254,6 +254,7 @@ extern "C" {
GGML_API
void
ggml_backend_unload
(
ggml_backend_reg_t
reg
);
GGML_API
void
ggml_backend_unload
(
ggml_backend_reg_t
reg
);
// Load all known backends from dynamic libraries
// Load all known backends from dynamic libraries
GGML_API
void
ggml_backend_load_all
(
void
);
GGML_API
void
ggml_backend_load_all
(
void
);
GGML_API
void
ggml_backend_load_all_from_path
(
const
char
*
dir_path
);
//
//
// Backend scheduler
// Backend scheduler
...
...
llama/ggml-blas.cpp
View file @
7a81daf0
/**
/**
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
...
llama/ggml-blas.h
View file @
7a81daf0
/**
/**
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
...
llama/ggml-common.h
View file @
7a81daf0
/**
/**
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
@@ -32,7 +32,20 @@
...
@@ -32,7 +32,20 @@
typedef
uint16_t
ggml_half
;
typedef
uint16_t
ggml_half
;
typedef
uint32_t
ggml_half2
;
typedef
uint32_t
ggml_half2
;
#define GGML_COMMON_AGGR
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S
#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_CPP)
#include <cstdint>
typedef
uint16_t
ggml_half
;
typedef
uint32_t
ggml_half2
;
// std-c++ allow anonymous unions but some compiler warn on it
#define GGML_COMMON_AGGR_U data
// std-c++ do not allow it.
#define GGML_COMMON_AGGR_S data
#define GGML_COMMON_DECL
#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_METAL)
#elif defined(GGML_COMMON_DECL_METAL)
...
@@ -41,7 +54,8 @@ typedef uint32_t ggml_half2;
...
@@ -41,7 +54,8 @@ typedef uint32_t ggml_half2;
typedef
half
ggml_half
;
typedef
half
ggml_half
;
typedef
half2
ggml_half2
;
typedef
half2
ggml_half2
;
#define GGML_COMMON_AGGR
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S
#define GGML_COMMON_DECL
#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_CUDA)
#elif defined(GGML_COMMON_DECL_CUDA)
...
@@ -55,7 +69,8 @@ typedef half2 ggml_half2;
...
@@ -55,7 +69,8 @@ typedef half2 ggml_half2;
typedef
half
ggml_half
;
typedef
half
ggml_half
;
typedef
half2
ggml_half2
;
typedef
half2
ggml_half2
;
#define GGML_COMMON_AGGR data
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S data
#define GGML_COMMON_DECL
#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_HIP)
#elif defined(GGML_COMMON_DECL_HIP)
...
@@ -65,7 +80,8 @@ typedef half2 ggml_half2;
...
@@ -65,7 +80,8 @@ typedef half2 ggml_half2;
typedef
half
ggml_half
;
typedef
half
ggml_half
;
typedef
half2
ggml_half2
;
typedef
half2
ggml_half2
;
#define GGML_COMMON_AGGR data
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S data
#define GGML_COMMON_DECL
#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_SYCL)
#elif defined(GGML_COMMON_DECL_SYCL)
...
@@ -75,7 +91,8 @@ typedef half2 ggml_half2;
...
@@ -75,7 +91,8 @@ typedef half2 ggml_half2;
typedef
sycl
::
half
ggml_half
;
typedef
sycl
::
half
ggml_half
;
typedef
sycl
::
half2
ggml_half2
;
typedef
sycl
::
half2
ggml_half2
;
#define GGML_COMMON_AGGR data
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S data
#define GGML_COMMON_DECL
#define GGML_COMMON_DECL
#endif
#endif
...
@@ -180,9 +197,9 @@ typedef struct {
...
@@ -180,9 +197,9 @@ typedef struct {
struct
{
struct
{
ggml_half
d
;
// delta
ggml_half
d
;
// delta
ggml_half
m
;
// min
ggml_half
m
;
// min
}
GGML_COMMON_AGGR
;
}
GGML_COMMON_AGGR
_S
;
ggml_half2
dm
;
ggml_half2
dm
;
};
}
GGML_COMMON_AGGR_U
;
uint8_t
qs
[
QK4_1
/
2
];
// nibbles / quants
uint8_t
qs
[
QK4_1
/
2
];
// nibbles / quants
}
block_q4_1
;
}
block_q4_1
;
static_assert
(
sizeof
(
block_q4_1
)
==
2
*
sizeof
(
ggml_half
)
+
QK4_1
/
2
,
"wrong q4_1 block size/padding"
);
static_assert
(
sizeof
(
block_q4_1
)
==
2
*
sizeof
(
ggml_half
)
+
QK4_1
/
2
,
"wrong q4_1 block size/padding"
);
...
@@ -201,9 +218,9 @@ typedef struct {
...
@@ -201,9 +218,9 @@ typedef struct {
struct
{
struct
{
ggml_half
d
;
// delta
ggml_half
d
;
// delta
ggml_half
m
;
// min
ggml_half
m
;
// min
}
GGML_COMMON_AGGR
;
}
GGML_COMMON_AGGR
_S
;
ggml_half2
dm
;
ggml_half2
dm
;
};
}
GGML_COMMON_AGGR_U
;
uint8_t
qh
[
4
];
// 5-th bit of quants
uint8_t
qh
[
4
];
// 5-th bit of quants
uint8_t
qs
[
QK5_1
/
2
];
// nibbles / quants
uint8_t
qs
[
QK5_1
/
2
];
// nibbles / quants
}
block_q5_1
;
}
block_q5_1
;
...
@@ -222,37 +239,13 @@ typedef struct {
...
@@ -222,37 +239,13 @@ typedef struct {
struct
{
struct
{
ggml_half
d
;
// delta
ggml_half
d
;
// delta
ggml_half
s
;
// d * sum(qs[i])
ggml_half
s
;
// d * sum(qs[i])
}
GGML_COMMON_AGGR
;
}
GGML_COMMON_AGGR
_S
;
ggml_half2
ds
;
ggml_half2
ds
;
};
}
GGML_COMMON_AGGR_U
;
int8_t
qs
[
QK8_1
];
// quants
int8_t
qs
[
QK8_1
];
// quants
}
block_q8_1
;
}
block_q8_1
;
static_assert
(
sizeof
(
block_q8_1
)
==
2
*
sizeof
(
ggml_half
)
+
QK8_1
,
"wrong q8_1 block size/padding"
);
static_assert
(
sizeof
(
block_q8_1
)
==
2
*
sizeof
(
ggml_half
)
+
QK8_1
,
"wrong q8_1 block size/padding"
);
typedef
struct
{
ggml_half
d
[
4
];
// deltas for 4 q4_0 blocks
uint8_t
qs
[
QK4_0
*
2
];
// nibbles / quants for 4 q4_0 blocks
}
block_q4_0x4
;
static_assert
(
sizeof
(
block_q4_0x4
)
==
4
*
sizeof
(
ggml_half
)
+
QK4_0
*
2
,
"wrong q4_0x4 block size/padding"
);
typedef
struct
{
ggml_half
d
[
8
];
// deltas for 8 q4_0 blocks
uint8_t
qs
[
QK4_0
*
4
];
// nibbles / quants for 8 q4_0 blocks
}
block_q4_0x8
;
static_assert
(
sizeof
(
block_q4_0x8
)
==
8
*
sizeof
(
ggml_half
)
+
QK4_0
*
4
,
"wrong q4_0x8 block size/padding"
);
typedef
struct
{
ggml_half
d
[
4
];
// deltas for 4 q8_0 blocks
int8_t
qs
[
QK8_0
*
4
];
// quants for 4 q8_0 blocks
}
block_q8_0x4
;
static_assert
(
sizeof
(
block_q8_0x4
)
==
4
*
sizeof
(
ggml_half
)
+
QK8_0
*
4
,
"wrong q8_0x4 block size/padding"
);
typedef
struct
{
ggml_half
d
[
8
];
// deltas for 8 q8_0 blocks
int8_t
qs
[
QK8_0
*
8
];
// quants for 8 q8_0 blocks
}
block_q8_0x8
;
static_assert
(
sizeof
(
block_q8_0x8
)
==
8
*
sizeof
(
ggml_half
)
+
QK8_0
*
8
,
"wrong q8_0x8 block size/padding"
);
//
//
// Ternary quantization
// Ternary quantization
//
//
...
@@ -287,9 +280,9 @@ typedef struct {
...
@@ -287,9 +280,9 @@ typedef struct {
struct
{
struct
{
ggml_half
d
;
// super-block scale for quantized scales
ggml_half
d
;
// super-block scale for quantized scales
ggml_half
dmin
;
// super-block scale for quantized mins
ggml_half
dmin
;
// super-block scale for quantized mins
}
GGML_COMMON_AGGR
;
}
GGML_COMMON_AGGR
_S
;
ggml_half2
dm
;
ggml_half2
dm
;
};
}
GGML_COMMON_AGGR_U
;
}
block_q2_K
;
}
block_q2_K
;
static_assert
(
sizeof
(
block_q2_K
)
==
2
*
sizeof
(
ggml_half
)
+
QK_K
/
16
+
QK_K
/
4
,
"wrong q2_K block size/padding"
);
static_assert
(
sizeof
(
block_q2_K
)
==
2
*
sizeof
(
ggml_half
)
+
QK_K
/
16
+
QK_K
/
4
,
"wrong q2_K block size/padding"
);
...
@@ -314,9 +307,9 @@ typedef struct {
...
@@ -314,9 +307,9 @@ typedef struct {
struct
{
struct
{
ggml_half
d
;
// super-block scale for quantized scales
ggml_half
d
;
// super-block scale for quantized scales
ggml_half
dmin
;
// super-block scale for quantized mins
ggml_half
dmin
;
// super-block scale for quantized mins
}
GGML_COMMON_AGGR
;
}
GGML_COMMON_AGGR
_S
;
ggml_half2
dm
;
ggml_half2
dm
;
};
}
GGML_COMMON_AGGR_U
;
uint8_t
scales
[
K_SCALE_SIZE
];
// scales and mins, quantized with 6 bits
uint8_t
scales
[
K_SCALE_SIZE
];
// scales and mins, quantized with 6 bits
uint8_t
qs
[
QK_K
/
2
];
// 4--bit quants
uint8_t
qs
[
QK_K
/
2
];
// 4--bit quants
}
block_q4_K
;
}
block_q4_K
;
...
@@ -331,9 +324,9 @@ typedef struct {
...
@@ -331,9 +324,9 @@ typedef struct {
struct
{
struct
{
ggml_half
d
;
// super-block scale for quantized scales
ggml_half
d
;
// super-block scale for quantized scales
ggml_half
dmin
;
// super-block scale for quantized mins
ggml_half
dmin
;
// super-block scale for quantized mins
}
GGML_COMMON_AGGR
;
}
GGML_COMMON_AGGR
_S
;
ggml_half2
dm
;
ggml_half2
dm
;
};
}
GGML_COMMON_AGGR_U
;
uint8_t
scales
[
K_SCALE_SIZE
];
// scales and mins, quantized with 6 bits
uint8_t
scales
[
K_SCALE_SIZE
];
// scales and mins, quantized with 6 bits
uint8_t
qh
[
QK_K
/
8
];
// quants, high bit
uint8_t
qh
[
QK_K
/
8
];
// quants, high bit
uint8_t
qs
[
QK_K
/
2
];
// quants, low 4 bits
uint8_t
qs
[
QK_K
/
2
];
// quants, low 4 bits
...
@@ -444,12 +437,6 @@ typedef struct {
...
@@ -444,12 +437,6 @@ typedef struct {
}
block_iq4_xs
;
}
block_iq4_xs
;
static_assert
(
sizeof
(
block_iq4_xs
)
==
sizeof
(
ggml_half
)
+
sizeof
(
uint16_t
)
+
QK_K
/
64
+
QK_K
/
2
,
"wrong iq4_xs block size/padding"
);
static_assert
(
sizeof
(
block_iq4_xs
)
==
sizeof
(
ggml_half
)
+
sizeof
(
uint16_t
)
+
QK_K
/
64
+
QK_K
/
2
,
"wrong iq4_xs block size/padding"
);
typedef
struct
{
ggml_half
d
[
4
];
// deltas for 4 iq4_nl blocks
uint8_t
qs
[
QK4_NL
*
2
];
// nibbles / quants for 4 iq4_nl blocks
}
block_iq4_nlx4
;
static_assert
(
sizeof
(
block_iq4_nlx4
)
==
4
*
sizeof
(
ggml_half
)
+
QK4_NL
*
2
,
"wrong iq4_nlx4 block size/padding"
);
#endif // GGML_COMMON_DECL
#endif // GGML_COMMON_DECL
#endif // GGML_COMMON_DECL
#endif // GGML_COMMON_DECL
...
@@ -463,6 +450,13 @@ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wro
...
@@ -463,6 +450,13 @@ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wro
#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
#define GGML_TABLE_END() };
#define GGML_TABLE_END() };
#define GGML_COMMON_IMPL
#elif defined(GGML_COMMON_IMPL_CPP)
#include <cstdint>
#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
#define GGML_TABLE_END() };
#define GGML_COMMON_IMPL
#define GGML_COMMON_IMPL
#elif defined(GGML_COMMON_IMPL_METAL)
#elif defined(GGML_COMMON_IMPL_METAL)
#include <metal_stdlib>
#include <metal_stdlib>
...
@@ -505,7 +499,7 @@ GGML_TABLE_BEGIN(uint8_t, ksigns_iq2xs, 128)
...
@@ -505,7 +499,7 @@ GGML_TABLE_BEGIN(uint8_t, ksigns_iq2xs, 128)
240
,
113
,
114
,
243
,
116
,
245
,
246
,
119
,
120
,
249
,
250
,
123
,
252
,
125
,
126
,
255
,
240
,
113
,
114
,
243
,
116
,
245
,
246
,
119
,
120
,
249
,
250
,
123
,
252
,
125
,
126
,
255
,
GGML_TABLE_END
()
GGML_TABLE_END
()
//#if __CUDA_ARCH__ >=
MIN
_CC_DP4A // lowest compute capability for integer intrinsics
//#if __CUDA_ARCH__ >=
GGML_CUDA
_CC_DP4A // lowest compute capability for integer intrinsics
GGML_TABLE_BEGIN
(
uint64_t
,
ksigns64
,
128
)
GGML_TABLE_BEGIN
(
uint64_t
,
ksigns64
,
128
)
0x0000000000000000
,
0xff000000000000ff
,
0xff0000000000ff00
,
0x000000000000ffff
,
0x0000000000000000
,
0xff000000000000ff
,
0xff0000000000ff00
,
0x000000000000ffff
,
0xff00000000ff0000
,
0x0000000000ff00ff
,
0x0000000000ffff00
,
0xff00000000ffffff
,
0xff00000000ff0000
,
0x0000000000ff00ff
,
0x0000000000ffff00
,
0xff00000000ffffff
,
...
...
llama/ggml-cpp.h
View file @
7a81daf0
/**
/**
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
...
llama/ggml-cpu-aarch64.c
→
llama/ggml-cpu-aarch64.c
pp
View file @
7a81daf0
This diff is collapsed.
Click to expand it.
llama/ggml-cpu-aarch64.h
View file @
7a81daf0
/**
/**
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
* llama.cpp - commit
ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
- do not edit this file
*
*
* MIT License
* MIT License
*
*
...
@@ -26,33 +26,9 @@
...
@@ -26,33 +26,9 @@
#pragma once
#pragma once
#include "ggml-cpu-traits.h"
#include "ggml.h"
#include "ggml.h"
// GGML internal header
// GGML internal header
#ifdef __cplusplus
ggml_backend_buffer_type_t
ggml_backend_cpu_aarch64_buffer_type
(
void
);
extern
"C"
{
#endif
// Quantization
void
quantize_mat_q8_0
(
const
float
*
GGML_RESTRICT
x
,
void
*
GGML_RESTRICT
y
,
int64_t
nrows
,
int64_t
n_per_row
,
int64_t
blck_size_interleave
);
// GEMV
void
ggml_gemv_q4_0_4x4_q8_0
(
int
n
,
float
*
GGML_RESTRICT
s
,
size_t
bs
,
const
void
*
GGML_RESTRICT
vx
,
const
void
*
GGML_RESTRICT
vy
,
int
nr
,
int
nc
);
void
ggml_gemv_q4_0_4x8_q8_0
(
int
n
,
float
*
GGML_RESTRICT
s
,
size_t
bs
,
const
void
*
GGML_RESTRICT
vx
,
const
void
*
GGML_RESTRICT
vy
,
int
nr
,
int
nc
);
void
ggml_gemv_q4_0_8x8_q8_0
(
int
n
,
float
*
GGML_RESTRICT
s
,
size_t
bs
,
const
void
*
GGML_RESTRICT
vx
,
const
void
*
GGML_RESTRICT
vy
,
int
nr
,
int
nc
);
void
ggml_gemv_iq4_nl_4x4_q8_0
(
int
n
,
float
*
GGML_RESTRICT
s
,
size_t
bs
,
const
void
*
GGML_RESTRICT
vx
,
const
void
*
GGML_RESTRICT
vy
,
int
nr
,
int
nc
);
// GEMM
void
ggml_gemm_q4_0_4x4_q8_0
(
int
n
,
float
*
GGML_RESTRICT
s
,
size_t
bs
,
const
void
*
GGML_RESTRICT
vx
,
const
void
*
GGML_RESTRICT
vy
,
int
nr
,
int
nc
);
void
ggml_gemm_q4_0_4x8_q8_0
(
int
n
,
float
*
GGML_RESTRICT
s
,
size_t
bs
,
const
void
*
GGML_RESTRICT
vx
,
const
void
*
GGML_RESTRICT
vy
,
int
nr
,
int
nc
);
void
ggml_gemm_q4_0_8x8_q8_0
(
int
n
,
float
*
GGML_RESTRICT
s
,
size_t
bs
,
const
void
*
GGML_RESTRICT
vx
,
const
void
*
GGML_RESTRICT
vy
,
int
nr
,
int
nc
);
void
ggml_gemm_iq4_nl_4x4_q8_0
(
int
n
,
float
*
GGML_RESTRICT
s
,
size_t
bs
,
const
void
*
GGML_RESTRICT
vx
,
const
void
*
GGML_RESTRICT
vy
,
int
nr
,
int
nc
);
void
ggml_aarch64_repack_tensor
(
struct
ggml_tensor
*
cur
,
enum
ggml_type
repack_type
,
const
void
*
data
,
size_t
data_size
);
enum
ggml_type
ggml_aarch64_get_optimal_repack_type
(
const
struct
ggml_tensor
*
cur
);
#ifdef __cplusplus
}
#endif
Prev
1
2
3
4
5
…
14
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment