Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
orangecat
ollama
Commits
d1b2f532
"test/vscode:/vscode.git/clone" did not exist on "53d04acec98331d75fb17ce7cc3210dd157f1e27"
Unverified
Commit
d1b2f532
authored
Aug 14, 2023
by
Michael Yang
Committed by
GitHub
Aug 14, 2023
Browse files
Merge pull request #350 from jmorganca/update-llama-cpp
update llama.cpp
parents
2ab20095
f7b61333
Changes
18
Hide whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
115 additions
and
58 deletions
+115
-58
llm/ggml-alloc.c
llm/ggml-alloc.c
+1
-1
llm/ggml-alloc.h
llm/ggml-alloc.h
+1
-1
llm/ggml-cuda.cu
llm/ggml-cuda.cu
+69
-27
llm/ggml-cuda.h
llm/ggml-cuda.h
+1
-1
llm/ggml-metal.h
llm/ggml-metal.h
+1
-1
llm/ggml-metal.m
llm/ggml-metal.m
+4
-4
llm/ggml-metal.metal
llm/ggml-metal.metal
+1
-1
llm/ggml-mpi.c
llm/ggml-mpi.c
+1
-1
llm/ggml-mpi.h
llm/ggml-mpi.h
+1
-1
llm/ggml-opencl.cpp
llm/ggml-opencl.cpp
+1
-1
llm/ggml-opencl.h
llm/ggml-opencl.h
+1
-1
llm/ggml.c
llm/ggml.c
+1
-1
llm/ggml.h
llm/ggml.h
+1
-1
llm/k_quants.c
llm/k_quants.c
+1
-1
llm/k_quants.h
llm/k_quants.h
+1
-1
llm/llama-util.h
llm/llama-util.h
+21
-12
llm/llama.cpp
llm/llama.cpp
+7
-1
llm/llama.h
llm/llama.h
+1
-1
No files found.
llm/ggml-alloc.c
View file @
d1b2f532
/**
/**
* llama.cpp - git
f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
* llama.cpp - git
3ebb00935f3f0522b75df49c2769ab1774b91380
*
*
* MIT License
* MIT License
*
*
...
...
llm/ggml-alloc.h
View file @
d1b2f532
/**
/**
* llama.cpp - git
f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
* llama.cpp - git
3ebb00935f3f0522b75df49c2769ab1774b91380
*
*
* MIT License
* MIT License
*
*
...
...
llm/ggml-cuda.cu
View file @
d1b2f532
/**
/**
* llama.cpp - git
f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
* llama.cpp - git
3ebb00935f3f0522b75df49c2769ab1774b91380
*
*
* MIT License
* MIT License
*
*
...
@@ -1779,7 +1779,6 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
...
@@ -1779,7 +1779,6 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
}
}
// contiguous u/y values
// contiguous u/y values
// also used for q5_K
static
__device__
__forceinline__
float
vec_dot_q4_K_q8_1_impl_mmq
(
static
__device__
__forceinline__
float
vec_dot_q4_K_q8_1_impl_mmq
(
const
int
*
__restrict__
v
,
const
int
*
__restrict__
u
,
const
uint8_t
*
__restrict__
sc
,
const
int
*
__restrict__
v
,
const
int
*
__restrict__
u
,
const
uint8_t
*
__restrict__
sc
,
const
uint8_t
*
__restrict__
m
,
const
half2
&
dm4
,
const
half2
*
__restrict__
ds8
)
{
const
uint8_t
*
__restrict__
m
,
const
half2
&
dm4
,
const
half2
*
__restrict__
ds8
)
{
...
@@ -1789,19 +1788,18 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
...
@@ -1789,19 +1788,18 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
float
sumf_m
=
0.0
f
;
float
sumf_m
=
0.0
f
;
#pragma unroll
#pragma unroll
for
(
int
i
0
=
0
;
i
0
<
VDR_Q4_K_Q8_1_MMQ
;
i0
+=
(
QI8_1
/
QR4_K
)
)
{
for
(
int
i
=
0
;
i
<
QR4_K
*
VDR_Q4_K_Q8_1_MMQ
/
QI8_1
;
++
i
)
{
int
sumi_d
=
0
;
int
sumi_d
=
0
;
#pragma unroll
#pragma unroll
for
(
int
i
=
i0
;
i
<
i0
+
(
QI8_1
/
QR4_K
);
++
i
)
{
for
(
int
j
=
0
;
j
<
QI8_1
;
++
j
)
{
sumi_d
=
__dp4a
(
v
[
2
*
i
+
0
],
u
[
2
*
i
+
0
],
sumi_d
);
// SIMD dot product
sumi_d
=
__dp4a
((
v
[
j
]
>>
(
4
*
i
))
&
0x0F0F0F0F
,
u
[
i
*
QI8_1
+
j
],
sumi_d
);
// SIMD dot product
sumi_d
=
__dp4a
(
v
[
2
*
i
+
1
],
u
[
2
*
i
+
1
],
sumi_d
);
// SIMD dot product
}
}
const
float2
ds8f
=
__half22float2
(
ds8
[
i
0
/
4
]);
const
float2
ds8f
=
__half22float2
(
ds8
[
i
]);
sumf_d
+=
ds8f
.
x
*
(
sc
[
i
0
/
4
]
*
sumi_d
);
sumf_d
+=
ds8f
.
x
*
(
sc
[
i
]
*
sumi_d
);
sumf_m
+=
ds8f
.
y
*
m
[
i
0
/
4
];
// sum of q8_1 block * q4_K min val
sumf_m
+=
ds8f
.
y
*
m
[
i
];
// sum of q8_1 block * q4_K min val
}
}
const
float2
dm4f
=
__half22float2
(
dm4
);
const
float2
dm4f
=
__half22float2
(
dm4
);
...
@@ -1818,7 +1816,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
...
@@ -1818,7 +1816,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
#define VDR_Q5_K_Q8_1_MMQ 8
#define VDR_Q5_K_Q8_1_MMQ 8
// contiguous v/x values
// contiguous v/x values
static
__device__
__forceinline__
float
vec_dot_q5_K_q8_1_impl
(
static
__device__
__forceinline__
float
vec_dot_q5_K_q8_1_impl
_vmmq
(
const
int
*
__restrict__
vl
,
const
int
*
__restrict__
vh
,
const
int
*
__restrict__
u
,
const
uint8_t
*
__restrict__
sc
,
const
int
*
__restrict__
vl
,
const
int
*
__restrict__
vh
,
const
int
*
__restrict__
u
,
const
uint8_t
*
__restrict__
sc
,
const
uint8_t
*
__restrict__
m
,
const
half2
&
dm5
,
const
float
*
__restrict__
d8
)
{
const
uint8_t
*
__restrict__
m
,
const
half2
&
dm5
,
const
float
*
__restrict__
d8
)
{
...
@@ -1855,6 +1853,40 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
...
@@ -1855,6 +1853,40 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
}
// contiguous u/y values
static
__device__
__forceinline__
float
vec_dot_q5_K_q8_1_impl_mmq
(
const
int
*
__restrict__
v
,
const
int
*
__restrict__
u
,
const
uint8_t
*
__restrict__
sc
,
const
uint8_t
*
__restrict__
m
,
const
half2
&
dm4
,
const
half2
*
__restrict__
ds8
)
{
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
float
sumf_d
=
0.0
f
;
float
sumf_m
=
0.0
f
;
#pragma unroll
for
(
int
i
=
0
;
i
<
QR5_K
*
VDR_Q5_K_Q8_1_MMQ
/
QI8_1
;
++
i
)
{
int
sumi_d
=
0
;
#pragma unroll
for
(
int
j
=
0
;
j
<
QI8_1
;
++
j
)
{
sumi_d
=
__dp4a
(
v
[
i
*
QI8_1
+
j
],
u
[
i
*
QI8_1
+
j
],
sumi_d
);
// SIMD dot product
}
const
float2
ds8f
=
__half22float2
(
ds8
[
i
]);
sumf_d
+=
ds8f
.
x
*
(
sc
[
i
]
*
sumi_d
);
sumf_m
+=
ds8f
.
y
*
m
[
i
];
// sum of q8_1 block * q4_K min val
}
const
float2
dm4f
=
__half22float2
(
dm4
);
return
dm4f
.
x
*
sumf_d
-
dm4f
.
y
*
sumf_m
;
#else
assert
(
false
);
return
0.0
f
;
// only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
#define VDR_Q6_K_Q8_1_MMVQ 1
#define VDR_Q6_K_Q8_1_MMVQ 1
#define VDR_Q6_K_Q8_1_MMQ 8
#define VDR_Q6_K_Q8_1_MMQ 8
...
@@ -2850,18 +2882,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
...
@@ -2850,18 +2882,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
const
int
*
__restrict__
x_ql
,
const
half2
*
__restrict__
x_dm
,
const
int
*
__restrict__
x_qh
,
const
int
*
__restrict__
x_sc
,
const
int
*
__restrict__
x_ql
,
const
half2
*
__restrict__
x_dm
,
const
int
*
__restrict__
x_qh
,
const
int
*
__restrict__
x_sc
,
const
int
*
__restrict__
y_qs
,
const
half2
*
__restrict__
y_ds
,
const
int
&
i
,
const
int
&
j
,
const
int
&
k
)
{
const
int
*
__restrict__
y_qs
,
const
half2
*
__restrict__
y_ds
,
const
int
&
i
,
const
int
&
j
,
const
int
&
k
)
{
int
v
[
QR4_K
*
VDR_Q4_K_Q8_1_MMQ
];
#pragma unroll
for
(
int
l
=
0
;
l
<
VDR_Q4_K_Q8_1_MMQ
;
++
l
)
{
v
[
l
+
0
]
=
(
x_ql
[
i
*
(
WARP_SIZE
+
1
)
+
k
+
l
]
>>
0
)
&
0x0F0F0F0F
;
v
[
l
+
(
QI4_K
/
4
)]
=
(
x_ql
[
i
*
(
WARP_SIZE
+
1
)
+
k
+
l
]
>>
4
)
&
0x0F0F0F0F
;
}
const
uint8_t
*
sc
=
((
const
uint8_t
*
)
&
x_sc
[
i
*
(
WARP_SIZE
/
8
)
+
i
/
8
+
k
/
16
])
+
2
*
((
k
%
16
)
/
8
);
const
uint8_t
*
sc
=
((
const
uint8_t
*
)
&
x_sc
[
i
*
(
WARP_SIZE
/
8
)
+
i
/
8
+
k
/
16
])
+
2
*
((
k
%
16
)
/
8
);
const
int
index_y
=
j
*
WARP_SIZE
+
(
QR4_K
*
k
)
%
WARP_SIZE
;
const
int
index_y
=
j
*
WARP_SIZE
+
(
QR4_K
*
k
)
%
WARP_SIZE
;
return
vec_dot_q4_K_q8_1_impl_mmq
(
v
,
&
y_qs
[
index_y
],
sc
,
sc
+
8
,
x_dm
[
i
*
(
WARP_SIZE
/
QI4_K
)
+
i
/
QI4_K
],
&
y_ds
[
index_y
/
QI8_1
]);
return
vec_dot_q4_K_q8_1_impl_mmq
(
&
x_ql
[
i
*
(
WARP_SIZE
+
1
)
+
k
],
&
y_qs
[
index_y
],
sc
,
sc
+
8
,
x_dm
[
i
*
(
WARP_SIZE
/
QI4_K
)
+
i
/
QI4_K
],
&
y_ds
[
index_y
/
QI8_1
]);
}
}
static
__device__
__forceinline__
float
vec_dot_q5_K_q8_1
(
static
__device__
__forceinline__
float
vec_dot_q5_K_q8_1
(
...
@@ -2908,7 +2933,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
...
@@ -2908,7 +2933,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
u
[
2
*
i
+
1
]
=
q8
[
4
];
u
[
2
*
i
+
1
]
=
q8
[
4
];
}
}
return
vec_dot_q5_K_q8_1_impl
(
vl
,
vh
,
u
,
sc
,
m
,
bq5_K
->
dm
,
d8
);
return
vec_dot_q5_K_q8_1_impl
_vmmq
(
vl
,
vh
,
u
,
sc
,
m
,
bq5_K
->
dm
,
d8
);
#else
#else
...
@@ -3051,7 +3076,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
...
@@ -3051,7 +3076,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
const
int
index_x
=
i
*
(
QR5_K
*
WARP_SIZE
+
1
)
+
QR5_K
*
k
;
const
int
index_x
=
i
*
(
QR5_K
*
WARP_SIZE
+
1
)
+
QR5_K
*
k
;
const
int
index_y
=
j
*
WARP_SIZE
+
(
QR5_K
*
k
)
%
WARP_SIZE
;
const
int
index_y
=
j
*
WARP_SIZE
+
(
QR5_K
*
k
)
%
WARP_SIZE
;
return
vec_dot_q4_K_q8_1_impl_mmq
(
&
x_ql
[
index_x
],
&
y_qs
[
index_y
],
sc
,
sc
+
8
,
x_dm
[
i
*
(
WARP_SIZE
/
QI5_K
)
+
i
/
QI5_K
],
&
y_ds
[
index_y
/
QI8_1
]);
return
vec_dot_q5_K_q8_1_impl_mmq
(
&
x_ql
[
index_x
],
&
y_qs
[
index_y
],
sc
,
sc
+
8
,
x_dm
[
i
*
(
WARP_SIZE
/
QI5_K
)
+
i
/
QI5_K
],
&
y_ds
[
index_y
/
QI8_1
]);
}
}
static
__device__
__forceinline__
float
vec_dot_q6_K_q8_1
(
static
__device__
__forceinline__
float
vec_dot_q6_K_q8_1
(
...
@@ -3327,7 +3353,11 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
...
@@ -3327,7 +3353,11 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
#define MMQ_Y_Q4_1_PASCAL 64
#define MMQ_Y_Q4_1_PASCAL 64
#define NWARPS_Q4_1_PASCAL 8
#define NWARPS_Q4_1_PASCAL 8
template
<
bool
need_check
>
static
__global__
void
mul_mat_q4_1
(
template
<
bool
need_check
>
static
__global__
void
#if __CUDA_ARCH__ < CC_TURING
__launch_bounds__
(
WARP_SIZE
*
NWARPS_Q4_1_PASCAL
,
2
)
#endif // __CUDA_ARCH__ < CC_TURING
mul_mat_q4_1
(
const
void
*
__restrict__
vx
,
const
void
*
__restrict__
vy
,
float
*
__restrict__
dst
,
const
void
*
__restrict__
vx
,
const
void
*
__restrict__
vy
,
float
*
__restrict__
dst
,
const
int
ncols_x
,
const
int
nrows_x
,
const
int
ncols_y
,
const
int
nrows_y
,
const
int
nrows_dst
)
{
const
int
ncols_x
,
const
int
nrows_x
,
const
int
ncols_y
,
const
int
nrows_y
,
const
int
nrows_dst
)
{
...
@@ -3497,7 +3527,11 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
...
@@ -3497,7 +3527,11 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
#define MMQ_Y_Q3_K_PASCAL 64
#define MMQ_Y_Q3_K_PASCAL 64
#define NWARPS_Q3_K_PASCAL 8
#define NWARPS_Q3_K_PASCAL 8
template
<
bool
need_check
>
static
__global__
void
mul_mat_q3_K
(
template
<
bool
need_check
>
static
__global__
void
#if __CUDA_ARCH__ < CC_TURING
__launch_bounds__
(
WARP_SIZE
*
NWARPS_Q3_K_PASCAL
,
2
)
#endif // __CUDA_ARCH__ < CC_TURING
mul_mat_q3_K
(
const
void
*
__restrict__
vx
,
const
void
*
__restrict__
vy
,
float
*
__restrict__
dst
,
const
void
*
__restrict__
vx
,
const
void
*
__restrict__
vy
,
float
*
__restrict__
dst
,
const
int
ncols_x
,
const
int
nrows_x
,
const
int
ncols_y
,
const
int
nrows_y
,
const
int
nrows_dst
)
{
const
int
ncols_x
,
const
int
nrows_x
,
const
int
ncols_y
,
const
int
nrows_y
,
const
int
nrows_dst
)
{
...
@@ -3527,11 +3561,15 @@ template <bool need_check> static __global__ void mul_mat_q3_K(
...
@@ -3527,11 +3561,15 @@ template <bool need_check> static __global__ void mul_mat_q3_K(
#define MMQ_X_Q4_K_AMPERE 64
#define MMQ_X_Q4_K_AMPERE 64
#define MMQ_Y_Q4_K_AMPERE 128
#define MMQ_Y_Q4_K_AMPERE 128
#define NWARPS_Q4_K_AMPERE 4
#define NWARPS_Q4_K_AMPERE 4
#define MMQ_X_Q4_K_PASCAL
32
#define MMQ_X_Q4_K_PASCAL
64
#define MMQ_Y_Q4_K_PASCAL 64
#define MMQ_Y_Q4_K_PASCAL 64
#define NWARPS_Q4_K_PASCAL 8
#define NWARPS_Q4_K_PASCAL 8
template
<
bool
need_check
>
static
__global__
void
mul_mat_q4_K
(
template
<
bool
need_check
>
static
__global__
void
#if __CUDA_ARCH__ < CC_TURING
__launch_bounds__
(
WARP_SIZE
*
NWARPS_Q4_K_PASCAL
,
2
)
#endif // __CUDA_ARCH__ < CC_TURING
mul_mat_q4_K
(
const
void
*
__restrict__
vx
,
const
void
*
__restrict__
vy
,
float
*
__restrict__
dst
,
const
void
*
__restrict__
vx
,
const
void
*
__restrict__
vy
,
float
*
__restrict__
dst
,
const
int
ncols_x
,
const
int
nrows_x
,
const
int
ncols_y
,
const
int
nrows_y
,
const
int
nrows_dst
)
{
const
int
ncols_x
,
const
int
nrows_x
,
const
int
ncols_y
,
const
int
nrows_y
,
const
int
nrows_dst
)
{
...
@@ -3595,11 +3633,15 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
...
@@ -3595,11 +3633,15 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
#define MMQ_X_Q6_K_AMPERE 64
#define MMQ_X_Q6_K_AMPERE 64
#define MMQ_Y_Q6_K_AMPERE 64
#define MMQ_Y_Q6_K_AMPERE 64
#define NWARPS_Q6_K_AMPERE 4
#define NWARPS_Q6_K_AMPERE 4
#define MMQ_X_Q6_K_PASCAL
32
#define MMQ_X_Q6_K_PASCAL
64
#define MMQ_Y_Q6_K_PASCAL 64
#define MMQ_Y_Q6_K_PASCAL 64
#define NWARPS_Q6_K_PASCAL 8
#define NWARPS_Q6_K_PASCAL 8
template
<
bool
need_check
>
static
__global__
void
mul_mat_q6_K
(
template
<
bool
need_check
>
static
__global__
void
#if __CUDA_ARCH__ < CC_TURING
__launch_bounds__
(
WARP_SIZE
*
NWARPS_Q6_K_PASCAL
,
2
)
#endif // __CUDA_ARCH__ < CC_TURING
mul_mat_q6_K
(
const
void
*
__restrict__
vx
,
const
void
*
__restrict__
vy
,
float
*
__restrict__
dst
,
const
void
*
__restrict__
vx
,
const
void
*
__restrict__
vy
,
float
*
__restrict__
dst
,
const
int
ncols_x
,
const
int
nrows_x
,
const
int
ncols_y
,
const
int
nrows_y
,
const
int
nrows_dst
)
{
const
int
ncols_x
,
const
int
nrows_x
,
const
int
ncols_y
,
const
int
nrows_y
,
const
int
nrows_dst
)
{
...
...
llm/ggml-cuda.h
View file @
d1b2f532
/**
/**
* llama.cpp - git
f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
* llama.cpp - git
3ebb00935f3f0522b75df49c2769ab1774b91380
*
*
* MIT License
* MIT License
*
*
...
...
llm/ggml-metal.h
View file @
d1b2f532
//go:build darwin
//go:build darwin
/**
/**
* llama.cpp - git
f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
* llama.cpp - git
3ebb00935f3f0522b75df49c2769ab1774b91380
*
*
* MIT License
* MIT License
*
*
...
...
llm/ggml-metal.m
View file @
d1b2f532
//go:build darwin
//go:build darwin
/**
/**
* llama.cpp - git
f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
* llama.cpp - git
3ebb00935f3f0522b75df49c2769ab1774b91380
*
*
* MIT License
* MIT License
*
*
...
@@ -154,7 +154,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
...
@@ -154,7 +154,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
if (error) {
if (error) {
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
exit(1)
;
return NULL
;
}
}
}
}
#else
#else
...
@@ -172,7 +172,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
...
@@ -172,7 +172,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
NSString
*
src
=
[
NSString
stringWithContentsOfFile
:
path
encoding
:
NSUTF8StringEncoding
error
:&
error
];
NSString
*
src
=
[
NSString
stringWithContentsOfFile
:
path
encoding
:
NSUTF8StringEncoding
error
:&
error
];
if
(
error
)
{
if
(
error
)
{
fprintf
(
stderr
,
"%s: error: %s
\n
"
,
__func__
,
[[
error
description
]
UTF8String
]);
fprintf
(
stderr
,
"%s: error: %s
\n
"
,
__func__
,
[[
error
description
]
UTF8String
]);
exit
(
1
)
;
return
NULL
;
}
}
#ifdef GGML_QKK_64
#ifdef GGML_QKK_64
...
@@ -184,7 +184,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
...
@@ -184,7 +184,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
#endif
#endif
if
(
error
)
{
if
(
error
)
{
fprintf
(
stderr
,
"%s: error: %s
\n
"
,
__func__
,
[[
error
description
]
UTF8String
]);
fprintf
(
stderr
,
"%s: error: %s
\n
"
,
__func__
,
[[
error
description
]
UTF8String
]);
exit
(
1
)
;
return
NULL
;
}
}
}
}
#endif
#endif
...
...
llm/ggml-metal.metal
View file @
d1b2f532
//go:build darwin
//go:build darwin
/**
/**
* llama.cpp - git
f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
* llama.cpp - git
3ebb00935f3f0522b75df49c2769ab1774b91380
*
*
* MIT License
* MIT License
*
*
...
...
llm/ggml-mpi.c
View file @
d1b2f532
//go:build mpi
//go:build mpi
/**
/**
* llama.cpp - git
f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
* llama.cpp - git
3ebb00935f3f0522b75df49c2769ab1774b91380
*
*
* MIT License
* MIT License
*
*
...
...
llm/ggml-mpi.h
View file @
d1b2f532
//go:build mpi
//go:build mpi
/**
/**
* llama.cpp - git
f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
* llama.cpp - git
3ebb00935f3f0522b75df49c2769ab1774b91380
*
*
* MIT License
* MIT License
*
*
...
...
llm/ggml-opencl.cpp
View file @
d1b2f532
//go:build opencl
//go:build opencl
/**
/**
* llama.cpp - git
f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
* llama.cpp - git
3ebb00935f3f0522b75df49c2769ab1774b91380
*
*
* MIT License
* MIT License
*
*
...
...
llm/ggml-opencl.h
View file @
d1b2f532
//go:build opencl
//go:build opencl
/**
/**
* llama.cpp - git
f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
* llama.cpp - git
3ebb00935f3f0522b75df49c2769ab1774b91380
*
*
* MIT License
* MIT License
*
*
...
...
llm/ggml.c
View file @
d1b2f532
/**
/**
* llama.cpp - git
f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
* llama.cpp - git
3ebb00935f3f0522b75df49c2769ab1774b91380
*
*
* MIT License
* MIT License
*
*
...
...
llm/ggml.h
View file @
d1b2f532
/**
/**
* llama.cpp - git
f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
* llama.cpp - git
3ebb00935f3f0522b75df49c2769ab1774b91380
*
*
* MIT License
* MIT License
*
*
...
...
llm/k_quants.c
View file @
d1b2f532
/**
/**
* llama.cpp - git
f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
* llama.cpp - git
3ebb00935f3f0522b75df49c2769ab1774b91380
*
*
* MIT License
* MIT License
*
*
...
...
llm/k_quants.h
View file @
d1b2f532
/**
/**
* llama.cpp - git
f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
* llama.cpp - git
3ebb00935f3f0522b75df49c2769ab1774b91380
*
*
* MIT License
* MIT License
*
*
...
...
llm/llama-util.h
View file @
d1b2f532
/**
/**
* llama.cpp - git
f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
* llama.cpp - git
3ebb00935f3f0522b75df49c2769ab1774b91380
*
*
* MIT License
* MIT License
*
*
...
@@ -297,20 +297,29 @@ struct llama_mmap {
...
@@ -297,20 +297,29 @@ struct llama_mmap {
throw
std
::
runtime_error
(
format
(
"MapViewOfFile failed: %s"
,
llama_format_win_err
(
error
).
c_str
()));
throw
std
::
runtime_error
(
format
(
"MapViewOfFile failed: %s"
,
llama_format_win_err
(
error
).
c_str
()));
}
}
#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
if
(
prefetch
)
{
if
(
prefetch
)
{
// Advise the kernel to preload the mapped memory
// The PrefetchVirtualMemory API is only present on Windows 8 and above, so we
WIN32_MEMORY_RANGE_ENTRY
range
;
// will dynamically load it using GetProcAddress.
range
.
VirtualAddress
=
addr
;
BOOL
(
WINAPI
*
pPrefetchVirtualMemory
)
(
HANDLE
,
ULONG_PTR
,
PWIN32_MEMORY_RANGE_ENTRY
,
ULONG
);
range
.
NumberOfBytes
=
(
SIZE_T
)
size
;
HMODULE
hKernel32
;
if
(
!
PrefetchVirtualMemory
(
GetCurrentProcess
(),
1
,
&
range
,
0
))
{
fprintf
(
stderr
,
"warning: PrefetchVirtualMemory failed: %s
\n
"
,
// This call is guaranteed to succeed.
llama_format_win_err
(
GetLastError
()).
c_str
());
hKernel32
=
GetModuleHandleW
(
L"kernel32.dll"
);
// This call may fail if on a pre-Win8 system.
pPrefetchVirtualMemory
=
reinterpret_cast
<
decltype
(
pPrefetchVirtualMemory
)
>
(
GetProcAddress
(
hKernel32
,
"PrefetchVirtualMemory"
));
if
(
pPrefetchVirtualMemory
)
{
// Advise the kernel to preload the mapped memory.
WIN32_MEMORY_RANGE_ENTRY
range
;
range
.
VirtualAddress
=
addr
;
range
.
NumberOfBytes
=
(
SIZE_T
)
size
;
if
(
!
pPrefetchVirtualMemory
(
GetCurrentProcess
(),
1
,
&
range
,
0
))
{
fprintf
(
stderr
,
"warning: PrefetchVirtualMemory failed: %s
\n
"
,
llama_format_win_err
(
GetLastError
()).
c_str
());
}
}
}
}
}
#else
#pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
#endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
}
}
~
llama_mmap
()
{
~
llama_mmap
()
{
...
...
llm/llama.cpp
View file @
d1b2f532
/**
/**
* llama.cpp - git
f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
* llama.cpp - git
3ebb00935f3f0522b75df49c2769ab1774b91380
*
*
* MIT License
* MIT License
*
*
...
@@ -3363,6 +3363,12 @@ struct llama_context * llama_new_context_with_model(
...
@@ -3363,6 +3363,12 @@ struct llama_context * llama_new_context_with_model(
// this allocates all Metal resources and memory buffers
// this allocates all Metal resources and memory buffers
ctx
->
ctx_metal
=
ggml_metal_init
(
1
);
ctx
->
ctx_metal
=
ggml_metal_init
(
1
);
if
(
!
ctx
->
ctx_metal
)
{
LLAMA_LOG_ERROR
(
"%s: ggml_metal_init() failed
\n
"
,
__func__
);
llama_free
(
ctx
);
return
NULL
;
}
void
*
data_ptr
=
NULL
;
void
*
data_ptr
=
NULL
;
size_t
data_size
=
0
;
size_t
data_size
=
0
;
...
...
llm/llama.h
View file @
d1b2f532
/**
/**
* llama.cpp - git
f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
* llama.cpp - git
3ebb00935f3f0522b75df49c2769ab1774b91380
*
*
* MIT License
* MIT License
*
*
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment