OpenDAS / ollama — Commit a83eaa7a

Commit a83eaa7a, authored Jul 19, 2023 by Michael Yang

    update llama.cpp to e782c9e735f93ab4767ffc37462c523b73a17ddc

parent 5156e48c
Showing 12 changed files with 1708 additions and 650 deletions (+1708 −650)
    llama/ggml-cuda.cu      +563  −62
    llama/ggml-cuda.h         +1   −1
    llama/ggml-metal.h        +1   −1
    llama/ggml-metal.m       +45  −35
    llama/ggml-metal.metal  +390 −329
    llama/ggml.c            +493 −159
    llama/ggml.h             +49   −2
    llama/k_quants.c          +1   −1
    llama/k_quants.h          +9   −1
    llama/llama-util.h        +4   −4
    llama/llama.cpp         +120  −53
    llama/llama.h            +32   −2
llama/ggml-cuda.cu
(diff collapsed; not expanded on this page)
llama/ggml-cuda.h

 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
...
llama/ggml-metal.h

 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
...
llama/ggml-metal.m

 // +build darwin

 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
...
@@ -722,8 +722,8 @@ void ggml_metal_graph_compute(
                         GGML_ASSERT(ne02 == 1);
                         GGML_ASSERT(ne12 == 1);

-                        nth0 = 4;
-                        nth1 = 16;
+                        nth0 = 2;
+                        nth1 = 32;
                         [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
                     } break;
                 case GGML_TYPE_Q5_K:
...
@@ -731,8 +731,8 @@ void ggml_metal_graph_compute(
                         GGML_ASSERT(ne02 == 1);
                         GGML_ASSERT(ne12 == 1);

-                        nth0 = 4;
-                        nth1 = 16;
+                        nth0 = 2;
+                        nth1 = 32;
                         [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
                     } break;
                 case GGML_TYPE_Q6_K:
...
@@ -740,8 +740,8 @@ void ggml_metal_graph_compute(
                         GGML_ASSERT(ne02 == 1);
                         GGML_ASSERT(ne12 == 1);

-                        nth0 = 4;
-                        nth1 = 16;
+                        nth0 = 2;
+                        nth1 = 32;
                         [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
                     } break;
                 default:
...
@@ -767,15 +767,18 @@ void ggml_metal_graph_compute(
                        [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
                        [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];

-                       if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
-                           [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
-                           [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                       }
-                       else if (src0t == GGML_TYPE_Q2_K ||
-                                src0t == GGML_TYPE_Q3_K ||
-                                src0t == GGML_TYPE_Q4_K ||
-                                src0t == GGML_TYPE_Q5_K ||
-                                src0t == GGML_TYPE_Q6_K) {
+                       if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
+                           src0t == GGML_TYPE_Q4_K) {
+                           [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                       }
+                       else if (src0t == GGML_TYPE_Q5_K) {
+                           [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                       }
+                       else if (src0t == GGML_TYPE_Q6_K) {
+                           [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1) / 2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                       }
+                       else if (src0t == GGML_TYPE_Q2_K ||
+                                src0t == GGML_TYPE_Q3_K) {
                            [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                        } else {
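Note (added commentary, not part of the diff): the new dispatch sizes are plain ceiling divisions. Each threadgroup now covers a small block of matrix rows instead of one row, with the block size depending on the quantization type (8 rows for the Q4 types, 4 for Q5_K, 2 for Q6_K). A minimal C++ sketch of the arithmetic, with an assumed 4096-row example:

    // ceiling division: how many threadgroups are needed if each covers rows_per_tg rows
    #include <cstdio>

    static int n_threadgroups(int ne01, int rows_per_tg) {
        return (ne01 + rows_per_tg - 1) / rows_per_tg;   // same as (ne01 + 7)/8 when rows_per_tg == 8
    }

    int main() {
        printf("Q4_K: %d\n", n_threadgroups(4096, 8)); // 512 threadgroups
        printf("Q5_K: %d\n", n_threadgroups(4096, 4)); // 1024 threadgroups
        printf("Q6_K: %d\n", n_threadgroups(4096, 2)); // 2048 threadgroups
        return 0;
    }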
...
@@ -821,7 +824,7 @@ void ggml_metal_graph_compute(
                        const float eps = 1e-6f;

-                       const int nth = 256;
+                       const int nth = 512;

                        [encoder setComputePipelineState:ctx->pipeline_rms_norm];
                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
...
@@ -829,7 +832,7 @@ void ggml_metal_graph_compute(
                        [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
                        [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
                        [encoder setBytes:&eps  length:sizeof(   float) atIndex:4];
-                       [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
+                       [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];

                        const int64_t nrows = ggml_nrows(src0);
...
@@ -910,28 +913,35 @@ void ggml_metal_graph_compute(
                        const int n_past = ((int32_t *)(src1->data))[0];

+                       float freq_base;
+                       float freq_scale;
+                       memcpy(&freq_base,  (int32_t *) src1->data + 4, sizeof(float));
+                       memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
+
                        [encoder setComputePipelineState:ctx->pipeline_rope];
                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
                        [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
                        [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
                        [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
                        [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
                        [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
                        [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
                        [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
                        [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
                        [encoder setBytes:&ne0  length:sizeof( int64_t) atIndex:10];
                        [encoder setBytes:&ne1  length:sizeof( int64_t) atIndex:11];
                        [encoder setBytes:&ne2  length:sizeof( int64_t) atIndex:12];
                        [encoder setBytes:&ne3  length:sizeof( int64_t) atIndex:13];
                        [encoder setBytes:&nb0  length:sizeof(uint64_t) atIndex:14];
                        [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:15];
                        [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:16];
                        [encoder setBytes:&nb3  length:sizeof(uint64_t) atIndex:17];
                        [encoder setBytes:&n_past length:sizeof(   int) atIndex:18];
                        [encoder setBytes:&n_dims length:sizeof(   int) atIndex:19];
                        [encoder setBytes:&mode   length:sizeof(   int) atIndex:20];
+                       [encoder setBytes:&freq_base  length:sizeof(float) atIndex:21];
+                       [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];

                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                    } break;
...
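Note (added commentary, not part of the diff): the Metal RoPE kernel now receives freq_base and freq_scale as kernel arguments 21 and 22, and reads them out of the op's int32 parameter tensor with memcpy, as the hunk above shows. A small C++ sketch of that read; the slots for n_dims and mode are an assumed layout, not taken from the diff:

    #include <cstring>
    #include <cstdint>

    struct rope_params { int n_past, n_dims, mode; float freq_base, freq_scale; };

    // floats are stored in int32 slots, so they are round-tripped with memcpy rather than a cast
    static rope_params read_rope_params(const int32_t * src1_data) {
        rope_params p{};
        p.n_past = src1_data[0];               // as in the hunk: ((int32_t *)(src1->data))[0]
        p.n_dims = src1_data[1];               // assumed slot
        p.mode   = src1_data[2];               // assumed slot
        std::memcpy(&p.freq_base,  src1_data + 4, sizeof(float));
        std::memcpy(&p.freq_scale, src1_data + 5, sizeof(float));
        return p;
    }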
llama/ggml-metal.metal
(diff collapsed; not expanded on this page)
llama/ggml.c
(diff collapsed; not expanded on this page)
llama/ggml.h

 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
...
@@ -227,8 +227,13 @@
 #define GGML_MAX_NAME 48
 #define GGML_DEFAULT_N_THREADS 4

+#define GGML_EXIT_SUCCESS 0
+#define GGML_EXIT_ABORTED 1
+
 #define GGML_UNUSED(x) (void)(x)

 #define GGML_ASSERT(x) \
     do { \
         if (!(x)) { \
...
@@ -389,6 +394,8 @@ extern "C" {
         GGML_OP_CLAMP,
         GGML_OP_CONV_1D,
         GGML_OP_CONV_2D,
+        GGML_OP_POOL_1D,
+        GGML_OP_POOL_2D,

         GGML_OP_FLASH_ATTN,
         GGML_OP_FLASH_FF,
...
@@ -468,6 +475,10 @@ extern "C" {
         // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
         int n_tasks[GGML_MAX_NODES];

+        // abort ggml_graph_compute when true
+        bool (*abort_callback)(void * data);
+        void * abort_callback_data;
+
     };

     // computation graph
...
@@ -1136,6 +1147,17 @@ extern "C" {
             int                   mode,
             int                   n_ctx);

+    // custom RoPE, in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode,
+            float                 freq_base,
+            float                 freq_scale,
+            int                   n_ctx);
+
     // rotary position embedding backward, i.e compute dx from dy
     // a - dy
     GGML_API struct ggml_tensor * ggml_rope_back(
...
@@ -1190,6 +1212,31 @@ extern "C" {
             int                   s,
             int                   d);

+    enum ggml_op_pool {
+        GGML_OP_POOL_MAX,
+        GGML_OP_POOL_AVG,
+        GGML_OP_POOL_COUNT,
+    };
+
+    GGML_API struct ggml_tensor * ggml_pool_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            enum ggml_op_pool     op,
+            int                   k0, // kernel size
+            int                   s0, // stride
+            int                   p0); // padding
+
+    GGML_API struct ggml_tensor * ggml_pool_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            enum ggml_op_pool     op,
+            int                   k0,
+            int                   k1,
+            int                   s0,
+            int                   s1,
+            int                   p0,
+            int                   p1);
+
     GGML_API struct ggml_tensor * ggml_flash_attn(
             struct ggml_context * ctx,
             struct ggml_tensor  * q,
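Note (added commentary, not part of the diff): the header only declares the new pooling operators. The sketch below is a scalar C++ reference of what the ggml_pool_1d parameters conventionally mean (k0 = kernel size, s0 = stride, p0 = padding), assuming standard average-pooling semantics rather than quoting the ggml implementation:

    #include <vector>

    static std::vector<float> avg_pool_1d(const std::vector<float> & x, int k0, int s0, int p0) {
        const int n_in  = (int) x.size();
        const int n_out = (n_in + 2 * p0 - k0) / s0 + 1;   // standard output-length formula
        std::vector<float> y(n_out, 0.0f);
        for (int o = 0; o < n_out; ++o) {
            float sum = 0.0f;
            for (int j = 0; j < k0; ++j) {
                const int i = o * s0 - p0 + j;             // padded positions contribute 0
                if (i >= 0 && i < n_in) sum += x[i];
            }
            y[o] = sum / k0;
        }
        return y;
    }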
...
@@ -1329,7 +1376,7 @@ extern "C" {
     // ggml_graph_plan() has to be called before ggml_graph_compute()
     // when plan.work_size > 0, caller must allocate memory for plan.work_data
     GGML_API struct ggml_cplan ggml_graph_plan   (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
-    GGML_API void              ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    GGML_API int               ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
     GGML_API void              ggml_graph_reset  (struct ggml_cgraph * cgraph);

     // same as ggml_graph_compute() but the work data is allocated as a part of the context
...
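Note (added commentary, not part of the diff): taken together, the new ggml_cplan fields and the int return of ggml_graph_compute let a caller cancel a long graph evaluation. A hypothetical wiring, using only the fields and return codes declared above and assuming a cgraph `gf` has already been built:

    #include <atomic>
    #include <cstdint>
    #include <vector>
    #include "ggml.h"

    static std::atomic<bool> g_stop{false};

    int compute_with_abort(struct ggml_cgraph * gf, int n_threads) {
        struct ggml_cplan plan = ggml_graph_plan(gf, n_threads);

        std::vector<uint8_t> work;
        if (plan.work_size > 0) {           // caller must allocate memory for plan.work_data
            work.resize(plan.work_size);
            plan.work_data = work.data();
        }

        // abort ggml_graph_compute when the callback returns true
        plan.abort_callback      = [](void * /*data*/) { return g_stop.load(); };
        plan.abort_callback_data = nullptr;

        return ggml_graph_compute(gf, &plan);   // GGML_EXIT_SUCCESS or GGML_EXIT_ABORTED
    }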
llama/k_quants.c

 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
...
llama/k_quants.h

 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
...
@@ -41,6 +41,14 @@
 #define K_SCALE_SIZE 12
 #endif

+#ifndef static_assert
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+#define static_assert(cond, msg) _Static_assert(cond, msg)
+#else
+#define static_assert(cond, msg) struct global_scope_noop_trick
+#endif
+#endif
+
 //
 // Super-block quantization structures
 //
...
llama/llama-util.h

 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
...
@@ -201,13 +201,13 @@ struct llama_mmap {
     llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
         size = file->size;
         int fd = fileno(file->fp);
-        int flags = MAP_PRIVATE;
+        int flags = MAP_SHARED;
         // prefetch/readahead impairs performance on NUMA systems
         if (numa) { prefetch = 0; }
 #ifdef __linux__
         if (prefetch) { flags |= MAP_POPULATE; }
 #endif
-        addr = mmap(NULL, file->size, PROT_READ | PROT_WRITE, flags, fd, 0);
+        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
         if (addr == MAP_FAILED) {
             throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
         }
...
@@ -249,7 +249,7 @@ struct llama_mmap {
             throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
         }

-        addr = MapViewOfFile(hMapping, FILE_MAP_COPY, 0, 0, 0);
+        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
         error = GetLastError();
         CloseHandle(hMapping);
...
llama/llama.cpp

 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
...
@@ -127,14 +127,15 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 // memory sizes
 //

-static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
+static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,    256ull * MB },
-        { MODEL_7B,    512ull * MB },
-        { MODEL_13B,   512ull * MB },
-        { MODEL_30B,   512ull * MB },
-        { MODEL_65B,  1024ull * MB },
+        /* empirical scaling, still a guess */
+        { MODEL_3B,   ((size_t) n_ctx / 16ull + 128ull) * MB },
+        { MODEL_7B,   ((size_t) n_ctx / 16ull + 256ull) * MB },
+        { MODEL_13B,  ((size_t) n_ctx / 12ull + 256ull) * MB },
+        { MODEL_30B,  ((size_t) n_ctx / 10ull + 256ull) * MB },
+        { MODEL_65B,  ((size_t) n_ctx /  8ull + 512ull) * MB },
     };
     return k_sizes;
 }
...
@@ -166,14 +167,14 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
-static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx)
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,   512ull * MB },
-        { MODEL_7B,   768ull * MB },
-        { MODEL_13B, 1024ull * MB },
-        { MODEL_30B, 1280ull * MB },
-        { MODEL_65B, 1536ull * MB },
+        { MODEL_3B,  ((size_t) n_ctx / 256ull +  512ull) * MB },
+        { MODEL_7B,  ((size_t) n_ctx / 256ull +  768ull) * MB },
+        { MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB },
+        { MODEL_30B, ((size_t) n_ctx / 256ull + 1280ull) * MB },
+        { MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB },
     };
     return k_sizes;
 }
...
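Note (added commentary, not part of the diff): with these two hunks the scratch and eval buffers grow with the context length instead of being fixed per model size. For example, for MODEL_7B at n_ctx = 2048 the scratch0 budget becomes 2048/16 + 256 = 384 MB (previously a fixed 512 MB), and at n_ctx = 8192 it grows to 8192/16 + 256 = 768 MB; the eval buffer at n_ctx = 2048 works out to 2048/256 + 768 = 776 MB.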
@@ -215,6 +216,10 @@ struct llama_hparams {
     uint32_t n_head  = 32;
     uint32_t n_layer = 32;
     uint32_t n_rot   = 64;

+    float rope_freq_base  = 10000.0f;
+    float rope_freq_scale = 1.0f;
+
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;

     bool operator!=(const llama_hparams & other) const {
...
@@ -329,7 +334,7 @@ struct llama_model {
 };

 struct llama_context {
-    llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+    llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
 #ifdef GGML_USE_METAL
     ~llama_context() {
         if (ctx_metal) {
...
@@ -350,7 +355,6 @@ struct llama_context {
     int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)

     const llama_model & model;
-    const llama_vocab & vocab;

     bool model_owner = false;
...
@@ -577,7 +581,9 @@ struct llama_file_loader {
             }

             // skip to the next multiple of 32 bytes
-            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+            if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
+                file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+            }

             tensor.file_off = file.tell();
             tensor.name = name;
...
@@ -674,7 +680,7 @@ struct llama_model_loader {
         *ctx_size_p = *mmapped_size_p = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
-            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
+            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16;
         }
     }
...
@@ -870,6 +876,8 @@ struct llama_context_params llama_context_default_params() {
         /*.gpu_layers                  =*/ 0,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ {0},
+        /*.rope_freq_base              =*/ 10000.0f,
+        /*.rope_freq_scale             =*/ 1.0f,
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram                    =*/ false,
...
@@ -895,6 +903,10 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
     return result;
 }

+int llama_max_devices() {
+    return LLAMA_MAX_DEVICES;
+}
+
 bool llama_mmap_supported() {
     return llama_mmap::SUPPORTED;
 }
...
@@ -993,6 +1005,8 @@ static void llama_model_load_internal(
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
+        float rope_freq_base,
+        float rope_freq_scale,
         bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
...
@@ -1027,22 +1041,27 @@ static void llama_model_load_internal(
         }

         hparams.n_ctx = n_ctx;
+
+        hparams.rope_freq_base  = rope_freq_base;
+        hparams.rope_freq_scale = rope_freq_scale;
     }

     const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;

     {
         fprintf(stderr, "%s: format     = %s\n",  __func__, llama_file_version_name(file_version));
         fprintf(stderr, "%s: n_vocab    = %u\n",  __func__, hparams.n_vocab);
         fprintf(stderr, "%s: n_ctx      = %u\n",  __func__, hparams.n_ctx);
         fprintf(stderr, "%s: n_embd     = %u\n",  __func__, hparams.n_embd);
         fprintf(stderr, "%s: n_mult     = %u\n",  __func__, hparams.n_mult);
         fprintf(stderr, "%s: n_head     = %u\n",  __func__, hparams.n_head);
         fprintf(stderr, "%s: n_layer    = %u\n",  __func__, hparams.n_layer);
         fprintf(stderr, "%s: n_rot      = %u\n",  __func__, hparams.n_rot);
+        fprintf(stderr, "%s: freq_base  = %.1f\n", __func__, hparams.rope_freq_base);
+        fprintf(stderr, "%s: freq_scale = %g\n",   __func__, hparams.rope_freq_scale);
         fprintf(stderr, "%s: ftype      = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
         fprintf(stderr, "%s: n_ff       = %u\n",  __func__, n_ff);
         fprintf(stderr, "%s: model size = %s\n",  __func__, llama_model_type_name(model.type));
     }

     if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
...
@@ -1191,9 +1210,9 @@ static void llama_model_load_internal(
     const size_t mem_required =
         ctx_size +
         mmapped_size - vram_weights + // weights in VRAM not in memory
-        MEM_REQ_SCRATCH0().at(model.type) +
+        MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
         MEM_REQ_SCRATCH1().at(model.type) +
-        MEM_REQ_EVAL().at(model.type);
+        MEM_REQ_EVAL(hparams.n_ctx).at(model.type);

     // this is the memory required by one llama_state
     const size_t mem_required_state =
...
@@ -1297,6 +1316,8 @@ static bool llama_model_load(
         int n_gpu_layers,
         int main_gpu,
         float * tensor_split,
+        float rope_freq_base,
+        float rope_freq_scale,
         bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
...
@@ -1305,7 +1326,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
                                   use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
...
@@ -1357,6 +1378,9 @@ static bool llama_eval_internal(
     const int n_rot        = hparams.n_embd/hparams.n_head;
     const int n_gpu_layers = model.n_gpu_layers;

+    const float freq_base  = hparams.rope_freq_base;
+    const float freq_scale = hparams.rope_freq_scale;
+
     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute   = lctx.buf_compute;
...
@@ -1454,11 +1478,11 @@ static bool llama_eval_internal(
                 offload_func_kq(tmpq);
                 ggml_set_name(tmpq, "tmpq");

-                struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+                struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
                 offload_func_kq(Kcur);
                 ggml_set_name(Kcur, "Kcur");

-                struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+                struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
                 offload_func_kq(Qcur);
                 ggml_set_name(Qcur, "Qcur");
...
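Note (added commentary, not part of the diff): ggml_rope_custom_inplace threads the two new hyperparameters through to the rotation. In the conventional RoPE formula the angle for dimension pair i is pos * base^(-2i/n_dims); with the new knobs the position is multiplied by freq_scale and the base is rope_freq_base, so the defaults (10000.0, 1.0) reproduce plain RoPE. An illustrative C++ helper, not code from this commit:

    #include <cmath>

    // angle for dimension pair i of n_dims at position pos
    static float rope_angle(int pos, int i, int n_dims,
                            float freq_base /*= 10000.0f*/, float freq_scale /*= 1.0f*/) {
        const float freq = std::pow(freq_base, -2.0f * i / n_dims); // per-pair frequency
        return (pos * freq_scale) * freq;                           // scaled position times frequency
    }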
@@ -2032,9 +2056,18 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
     }

     // Normalize the second derivatives
-    float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
-    for (float & value : second_derivatives) {
-        value /= second_derivatives_sum;
+    {
+        const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
+
+        if (second_derivatives_sum > 1e-6f) {
+            for (float & value : second_derivatives) {
+                value /= second_derivatives_sum;
+            }
+        } else {
+            for (float & value : second_derivatives) {
+                value = 1.0f / second_derivatives.size();
+            }
+        }
     }

     float cum_sum = 0.0f;
...
...
@@ -2213,7 +2246,7 @@ void llama_sample_classifier_free_guidance(
struct
llama_context
*
guidance_ctx
,
struct
llama_context
*
guidance_ctx
,
float
scale
,
float
scale
,
float
smooth_factor
)
{
float
smooth_factor
)
{
int64_t
t_start_sample_us
=
t_start_sample_us
=
ggml_time_us
();
int64_t
t_start_sample_us
=
ggml_time_us
();
assert
(
ctx
);
assert
(
ctx
);
auto
n_vocab
=
llama_n_vocab
(
ctx
);
auto
n_vocab
=
llama_n_vocab
(
ctx
);
...
@@ -2701,8 +2734,9 @@ struct llama_model * llama_load_model_from_file(
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

     if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
-                params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
-                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+                params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale, params.low_vram,
+                memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
+                params.progress_callback_user_data)) {
         delete model;
         fprintf(stderr, "%s: failed to load model\n", __func__);
         return nullptr;
...
@@ -2723,7 +2757,7 @@ struct llama_context * llama_new_context_with_model(
         return nullptr;
     }

-    llama_context * ctx = new llama_context(*model, model->vocab);
+    llama_context * ctx = new llama_context(*model);

     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
...
@@ -2777,9 +2811,9 @@ struct llama_context * llama_new_context_with_model(
             ctx->embedding.resize(hparams.n_embd);
         }

-        ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
+        ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type));

-        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
+        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
         ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
     }
...
@@ -3561,13 +3595,13 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
     return 0;
 }

-int llama_tokenize(
-        struct llama_context * ctx,
+int llama_tokenize_with_model(
+    const struct llama_model * model,
                   const char * text,
                  llama_token * tokens,
                          int   n_max_tokens,
                         bool   add_bos) {
-    auto res = llama_tokenize(ctx->vocab, text, add_bos);
+    auto res = llama_tokenize(model->vocab, text, add_bos);

     if (n_max_tokens < (int) res.size()) {
         fprintf(stderr, "%s: too many tokens\n", __func__);
...
@@ -3581,8 +3615,29 @@ int llama_tokenize(
     return res.size();
 }

+int llama_tokenize(
+        struct llama_context * ctx,
+                  const char * text,
+                 llama_token * tokens,
+                         int   n_max_tokens,
+                        bool   add_bos) {
+    return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
+}
+
+int llama_n_vocab_from_model(const struct llama_model * model) {
+    return model->vocab.id_to_token.size();
+}
+
+int llama_n_ctx_from_model(const struct llama_model * model) {
+    return model->hparams.n_ctx;
+}
+
+int llama_n_embd_from_model(const struct llama_model * model) {
+    return model->hparams.n_embd;
+}
+
 int llama_n_vocab(const struct llama_context * ctx) {
-    return ctx->vocab.id_to_token.size();
+    return ctx->model.vocab.id_to_token.size();
 }

 int llama_n_ctx(const struct llama_context * ctx) {
...
@@ -3593,19 +3648,27 @@ int llama_n_embd(const struct llama_context * ctx) {
     return ctx->model.hparams.n_embd;
 }

-int llama_get_vocab(
-        const struct llama_context * ctx,
+int llama_get_vocab_from_model(
+          const struct llama_model * model,
                         const char * * strings,
                              float * scores,
                                int   capacity) {
-    int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
+    int n = std::min(capacity, (int) model->vocab.id_to_token.size());
     for (int i = 0; i < n; ++i) {
-        strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
-        scores[i]  = ctx->vocab.id_to_token[i].score;
+        strings[i] = model->vocab.id_to_token[i].tok.c_str();
+        scores[i]  = model->vocab.id_to_token[i].score;
     }
     return n;
 }

+int llama_get_vocab(
+        const struct llama_context * ctx,
+                        const char * * strings,
+                             float * scores,
+                               int   capacity) {
+    return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity);
+}
+
 float * llama_get_logits(struct llama_context * ctx) {
     return ctx->logits.data();
 }
...
@@ -3614,12 +3677,16 @@ float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
 }

-const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
-    if (token >= llama_n_vocab(ctx)) {
+const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) {
+    if (token >= llama_n_vocab_from_model(model)) {
         return nullptr;
     }

-    return ctx->vocab.id_to_token[token].tok.c_str();
+    return model->vocab.id_to_token[token].tok.c_str();
+}
+
+const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+    return llama_token_to_str_with_model(&ctx->model, token);
 }

 llama_token llama_token_bos() {
...
llama/llama.h

 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
...
@@ -115,6 +115,11 @@ extern "C" {
         int32_t n_gpu_layers;                    // number of layers to store in VRAM
         int32_t main_gpu;                        // the GPU that is used for scratch and small tensors
         float   tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs

+        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+        float   rope_freq_base;  // RoPE base frequency
+        float   rope_freq_scale; // RoPE frequency scaling factor
+
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
         // context pointer passed to the progress callback
...
@@ -174,6 +179,8 @@ extern "C" {
         int32_t n_eval;
     };

+    LLAMA_API int llama_max_devices();
+
     LLAMA_API struct llama_context_params llama_context_default_params();
     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
...
@@ -296,10 +303,21 @@ extern "C" {
                                   int   n_max_tokens,
                                  bool   add_bos);

+    LLAMA_API int llama_tokenize_with_model(
+        const struct llama_model * model,
+                      const char * text,
+                     llama_token * tokens,
+                             int   n_max_tokens,
+                            bool   add_bos);
+
     LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
     LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
     LLAMA_API int llama_n_embd (const struct llama_context * ctx);

+    LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
+    LLAMA_API int llama_n_ctx_from_model  (const struct llama_model * model);
+    LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
+
     // Get the vocabulary as output parameters.
     // Returns number of results.
     LLAMA_API int llama_get_vocab(
...
@@ -308,6 +326,12 @@ extern "C" {
                              float * scores,
                                int   capacity);

+    LLAMA_API int llama_get_vocab_from_model(
+            const struct llama_model * model,
+                          const char * * strings,
+                               float * scores,
+                                 int   capacity);
+
     // Token logits obtained from the last call to llama_eval()
     // The logits for the last token are stored in the last row
     // Can be mutated in order to change the probabilities of the next token
...
@@ -320,7 +344,13 @@ extern "C" {
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

     // Token Id -> String. Uses the vocabulary in the provided context
     LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);

+    LLAMA_API const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token);
+
     // Special tokens
     LLAMA_API llama_token llama_token_bos();  // beginning-of-sentence
...
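Note (added commentary, not part of the diff): the new *_with_model / *_from_model entry points make tokenization and vocabulary lookups possible from a llama_model alone, without building a llama_context. A hypothetical usage sketch in C++ using only functions visible in this commit; the model path and error handling are placeholders, and cleanup is omitted:

    #include <cstdio>
    #include <vector>
    #include "llama.h"

    int main() {
        // llama_backend_init(...) may be required first, depending on the build of this vintage
        llama_context_params params = llama_context_default_params();
        llama_model * model = llama_load_model_from_file("model.ggml", params); // placeholder path
        if (!model) return 1;

        std::vector<llama_token> tokens(64);
        const int n = llama_tokenize_with_model(model, "Hello llama", tokens.data(),
                                                (int) tokens.size(), /*add_bos=*/true);
        if (n < 0) return 1; // buffer too small

        for (int i = 0; i < n; i++) {
            printf("%d -> %s\n", tokens[i], llama_token_to_str_with_model(model, tokens[i]));
        }
        printf("vocab size: %d\n", llama_n_vocab_from_model(model));
        return 0; // (model cleanup omitted for brevity)
    }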