Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
orangecat
ollama
Commits
7a1c3e62
Commit
7a1c3e62
authored
Aug 01, 2023
by
Michael Yang
Browse files
update llama.cpp
parent
da52f5bf
Changes
18
Hide whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
2683 additions
and
573 deletions
+2683
-573
llama/ggml-alloc.c
llama/ggml-alloc.c
+567
-0
llama/ggml-alloc.h
llama/ggml-alloc.h
+48
-0
llama/ggml-cuda.cu
llama/ggml-cuda.cu
+1740
-418
llama/ggml-cuda.h
llama/ggml-cuda.h
+2
-1
llama/ggml-metal.h
llama/ggml-metal.h
+1
-1
llama/ggml-metal.m
llama/ggml-metal.m
+18
-17
llama/ggml-metal.metal
llama/ggml-metal.metal
+5
-2
llama/ggml-mpi.c
llama/ggml-mpi.c
+1
-1
llama/ggml-mpi.h
llama/ggml-mpi.h
+1
-1
llama/ggml-opencl.cpp
llama/ggml-opencl.cpp
+1
-1
llama/ggml-opencl.h
llama/ggml-opencl.h
+1
-1
llama/ggml.c
llama/ggml.c
+50
-27
llama/ggml.h
llama/ggml.h
+13
-2
llama/k_quants.c
llama/k_quants.c
+33
-31
llama/k_quants.h
llama/k_quants.h
+1
-1
llama/llama-util.h
llama/llama-util.h
+1
-1
llama/llama.cpp
llama/llama.cpp
+198
-67
llama/llama.h
llama/llama.h
+2
-1
No files found.
llama/ggml-alloc.c
0 → 100644
View file @
7a1c3e62
/**
* llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
*
* MIT License
*
* Copyright (c) 2023 Georgi Gerganov
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "ggml-alloc.h"
#include "ggml.h"
#include <assert.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define UNUSED(x) (void)(x)
#define MAX(a, b) ((a) > (b) ? (a) : (b))
//#define GGML_ALLOCATOR_DEBUG
//#define AT_PRINTF printf
#define AT_PRINTF(...) ((void)0)
// Bookkeeping entry for one tensor during graph allocation:
// tracks how many consumers (children) and views still reference it.
struct hash_node {
    struct ggml_tensor * t;  // the tensor this entry describes (NULL = empty slot)
    int n_children;          // number of ops that read this tensor as a source
    int n_views;             // number of view tensors whose data aliases this tensor
};
// Map a tensor pointer to a slot index in the fixed-size hash table.
static size_t hash(void * p) {
    return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
}
// Find (or create) the hash table entry for tensor t, using open addressing
// with linear probing. Asserts if the table is completely full.
static struct hash_node * hash_get(struct hash_node hash_table[], struct ggml_tensor * t) {
    size_t h = hash(t);

    // linear probing: scan forward from the home slot
    size_t i = h;
    while (hash_table[i].t != NULL) {
        if (hash_table[i].t == t) {
            return &hash_table[i];  // already present
        }
        i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
        if (i == h) {
            // wrapped all the way around: hash table is full
            GGML_ASSERT(false);
        }
    }

    // claim the empty slot for t
    hash_table[i].t = t;
    return &hash_table[i];
}
// TODO: GGML_PAD ?
// Round `offset` up so that (buffer + result) is aligned to `alignment`
// (which must be a power of two). Returns the adjusted offset.
static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
    assert(alignment && !(alignment & (alignment - 1)));  // power of 2

    size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
    return offset + align;
}
// One contiguous run of unused bytes inside the allocator's buffer.
struct free_block {
    void * addr;   // start of the free region
    size_t size;   // length of the free region in bytes
};

#define MAX_FREE_BLOCKS 128

// Linear allocator over a single user-provided buffer.
// Free space is kept as a small, address-sorted array of blocks.
struct ggml_allocr {
    void * data;        // base of the managed buffer
    size_t size;        // total buffer size in bytes
    size_t alignment;   // alignment applied to every allocation
    int n_free_blocks;  // number of valid entries in free_blocks
    struct free_block free_blocks[MAX_FREE_BLOCKS];
    struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
    size_t max_size;    // high-water mark: largest extent ever allocated
    bool measure;       // true when only measuring sizes (no real buffer)

#ifdef GGML_ALLOCATOR_DEBUG
    struct ggml_tensor * allocated_tensors[1024];  // live tensors, for debug dumps
#endif
};
#ifdef GGML_ALLOCATOR_DEBUG
// Debug-only: record `tensor` in the first empty slot of alloc->allocated_tensors.
// FIX: parameter type was `struct ggml_allocator *`, but the allocator struct is
// declared as `struct ggml_allocr` — with GGML_ALLOCATOR_DEBUG enabled, member
// access on the undeclared type would fail to compile.
static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
    for (int i = 0; i < 1024; i++) {
        if (alloc->allocated_tensors[i] == NULL) {
            alloc->allocated_tensors[i] = tensor;
            return;
        }
    }
    GGML_ASSERT(!"out of allocated_tensors");
}

// Debug-only: remove `tensor` from alloc->allocated_tensors, matching either by
// identity or by data pointer (views share their source's data pointer).
static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
    for (int i = 0; i < 1024; i++) {
        if (alloc->allocated_tensors[i] == tensor ||
            (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
            alloc->allocated_tensors[i] = NULL;
            return;
        }
    }
    printf("tried to free tensor %s not found\n", tensor->name);
    GGML_ASSERT(!"tensor not found");
}
#endif
// Number of bytes needed to store `tensor`'s data. The allocator argument is
// currently unused (kept for a future per-allocator sizing policy).
static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
    return ggml_nbytes(tensor);

    UNUSED(alloc);
}
// Allocate space for `tensor` inside the allocator's buffer using a best-fit
// search over the free-block list, and point tensor->data at the result.
// Asserts (and returns early) when no free block is large enough.
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
    size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
    size = aligned_offset(NULL, size, alloc->alignment);

    AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);

    size_t max_avail = 0;

    // find the best fitting free block (smallest block that is still >= size)
    int    best_fit_block = -1;
    size_t best_fit_size  = SIZE_MAX;
    for (int i = 0; i < alloc->n_free_blocks; i++) {
        struct free_block * block = &alloc->free_blocks[i];
        max_avail = MAX(max_avail, block->size);
        if (block->size >= size && block->size <= best_fit_size) {
            best_fit_block = i;
            best_fit_size  = block->size;
        }
    }

    AT_PRINTF("block %d\n", best_fit_block);

    if (best_fit_block == -1) {
        fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
                __func__, size, max_avail);
        GGML_ASSERT(!"not enough space in the buffer");
        return;
    }

    // carve the allocation off the front of the chosen block
    struct free_block * block = &alloc->free_blocks[best_fit_block];
    void * addr = block->addr;
    block->addr  = (char *)block->addr + size;
    block->size -= size;
    if (block->size == 0) {
        // remove block if empty
        alloc->n_free_blocks--;
        for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
            alloc->free_blocks[j] = alloc->free_blocks[j + 1];
        }
    }

    tensor->data = addr;

#ifdef GGML_ALLOCATOR_DEBUG
    add_allocated_tensor(alloc, tensor);
    size_t cur_max = (char *)addr - (char *)alloc->data + size;
    if (cur_max > alloc->max_size) {
        printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
        for (int i = 0; i < 1024; i++) {
            if (alloc->allocated_tensors[i]) {
                printf("%s (%.2f MB) ", alloc->allocated_tensors[i]->name,
                       ggml_nbytes(alloc->allocated_tensors[i]) / 1024.0 / 1024.0);
            }
        }
        printf("\n");
    }
#endif

    // track the high-water mark of the buffer
    alloc->max_size = MAX(alloc->max_size, (char *)addr - (char *)alloc->data + size);
}
// Return `tensor`'s bytes to the free list.
// This is a very naive implementation, but for our case the number of free
// blocks should be very small. The region is merged with an adjacent free
// block when possible; otherwise it is inserted keeping the list sorted by
// address (which is what makes the merging checks cheap).
static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
    void * ptr = tensor->data;

    if (ptr < alloc->data || (char *)ptr >= (char *)alloc->data + alloc->max_size) {
        // the tensor was not allocated in this buffer
        // this can happen because the graph allocator will try to free weights
        // and other tensors from different buffers
        // the easiest way to deal with this is just to ignore it
        return;
    }

    size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
    size = aligned_offset(NULL, size, alloc->alignment);
    AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n",
              __func__, tensor->name, size, alloc->n_free_blocks);

#ifdef GGML_ALLOCATOR_DEBUG
    remove_allocated_tensor(alloc, tensor);
#endif

    // see if we can merge with an existing block
    for (int i = 0; i < alloc->n_free_blocks; i++) {
        struct free_block * block = &alloc->free_blocks[i];

        // check if ptr is at the end of the block
        if ((char *)block->addr + block->size == ptr) {
            block->size += size;
            // check if we can merge with the next block
            if (i < alloc->n_free_blocks - 1 &&
                (char *)block->addr + block->size == alloc->free_blocks[i + 1].addr) {
                block->size += alloc->free_blocks[i + 1].size;
                alloc->n_free_blocks--;
                for (int j = i + 1; j < alloc->n_free_blocks; j++) {
                    alloc->free_blocks[j] = alloc->free_blocks[j + 1];
                }
            }
            return;
        }

        // check if ptr is at the beginning of the block
        if ((char *)ptr + size == block->addr) {
            block->addr  = ptr;
            block->size += size;
            // check if we can merge with the previous block
            if (i > 0 &&
                (char *)alloc->free_blocks[i - 1].addr + alloc->free_blocks[i - 1].size == block->addr) {
                alloc->free_blocks[i - 1].size += block->size;
                alloc->n_free_blocks--;
                for (int j = i; j < alloc->n_free_blocks; j++) {
                    alloc->free_blocks[j] = alloc->free_blocks[j + 1];
                }
            }
            return;
        }
    }

    // otherwise, add a new block
    GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");

    // insert the new block in the correct position to keep the array sorted by
    // address (to make merging blocks faster)
    int insert_pos = 0;
    while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].addr < ptr) {
        insert_pos++;
    }
    // shift all blocks from insert_pos onward to make room for the new block
    for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
        alloc->free_blocks[i] = alloc->free_blocks[i - 1];
    }
    // insert the new block
    alloc->free_blocks[insert_pos].addr = ptr;
    alloc->free_blocks[insert_pos].size = size;
    alloc->n_free_blocks++;
}
// Discard all allocations: reset the free list to a single block covering the
// whole buffer (minus the bytes needed to align the buffer's base address).
void ggml_allocr_reset(struct ggml_allocr * alloc) {
    alloc->n_free_blocks = 1;
    size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
    alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
    alloc->free_blocks[0].size = alloc->size - align_offset;
}
// Create an allocator that manages the caller-owned buffer [data, data+size)
// with the given alignment. Caller frees the allocator with ggml_allocr_free;
// the buffer itself remains owned by the caller.
// FIX: the debug-gated initializer had a stray `=` (`/*.allocated_tensors = */ = {0}`),
// a syntax error whenever GGML_ALLOCATOR_DEBUG is defined.
struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);

    *alloc = (struct ggml_allocr){
        /*.data          = */ data,
        /*.size          = */ size,
        /*.alignment     = */ alignment,
        /*.n_free_blocks = */ 0,
        /*.free_blocks   = */ {{0}},
        /*.hash_table    = */ {{0}},
        /*.max_size      = */ 0,
        /*.measure       = */ false,
#ifdef GGML_ALLOCATOR_DEBUG
        /*.allocated_tensors = */ {0},
#endif
    };

    ggml_allocr_reset(alloc);

    return alloc;
}
// address and size of the buffer when measuring
// it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers
static void * const MEASURE_BASE_ADDR = (void *)0x1000;
static const size_t MEASURE_MAX_SIZE  = 1ULL << 40;  // 1 TB

// Create a "measure" allocator: it performs the same bookkeeping as a real
// allocator over a fake 1 TB address range, so that max_size after a dry run
// tells the caller how large the real buffer must be. No memory is touched.
// FIX: the debug-gated initializer had a stray `=` (`/*.allocated_tensors = */ = {0}`),
// a syntax error whenever GGML_ALLOCATOR_DEBUG is defined.
struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);

    *alloc = (struct ggml_allocr){
        /*.data          = */ MEASURE_BASE_ADDR,
        /*.size          = */ MEASURE_MAX_SIZE,
        /*.alignment     = */ alignment,
        /*.n_free_blocks = */ 0,
        /*.free_blocks   = */ {{0}},
        /*.hash_table    = */ {{0}},
        /*.max_size      = */ 0,
        /*.measure       = */ true,
#ifdef GGML_ALLOCATOR_DEBUG
        /*.allocated_tensors = */ {0},
#endif
    };

    ggml_allocr_reset(alloc);

    return alloc;
}
// Destroy an allocator created by ggml_allocr_new / ggml_allocr_new_measure.
// Does not touch the underlying buffer, which is owned by the caller.
void ggml_allocr_free(struct ggml_allocr * alloc) {
    free(alloc);
}
// True when this allocator only measures sizes (no real backing buffer).
bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
    return alloc->measure;
}
//////////// compute graph allocator
// True for ops whose result aliases another tensor's data instead of owning
// its own allocation (reshape/view/transpose/permute, and cpy which writes
// into its destination operand).
static bool ggml_is_view(struct ggml_tensor * t) {
    return t->op == GGML_OP_RESHAPE ||
           t->op == GGML_OP_VIEW    ||
           t->op == GGML_OP_TRANSPOSE ||
           t->op == GGML_OP_PERMUTE ||
           t->op == GGML_OP_CPY;
}
// True when a and b have identical type, shape (ne) and strides (nb) across
// all dimensions — i.e. one could reuse the other's buffer in place.
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
    if (a->type != b->type) {
        return false;
    }
    for (int i = 0; i < GGML_MAX_DIMS; i++) {
        if (a->ne[i] != b->ne[i]) {
            return false;
        }
        if (a->nb[i] != b->nb[i]) {
            return false;
        }
    }
    return true;
}
// For a view op, return the tensor whose data it aliases (src[0] for most
// view ops, src[1] — the destination — for cpy). NULL for non-view ops.
static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) {
    switch (t->op) {
        case GGML_OP_PERMUTE:
        case GGML_OP_RESHAPE:
        case GGML_OP_TRANSPOSE:
        case GGML_OP_VIEW:
            return t->src[0];
        case GGML_OP_CPY:
            return t->src[1];
        default:
            return NULL;
    }
}
// Walk the chain of view parents until reaching the tensor that actually owns
// the data (the first non-view ancestor).
static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
    struct ggml_tensor * parent = t;
    do {
        parent = get_view_parent(parent);
    } while (ggml_is_view(parent));
    return parent;
}
// Whitelist of ops that may write their result directly into a parent's
// buffer when layouts match (used by allocate_node for in-place reuse).
static bool ggml_op_can_inplace(enum ggml_op op) {
    switch (op) {
        case GGML_OP_SCALE:
        case GGML_OP_DIAG_MASK_ZERO:
        case GGML_OP_DIAG_MASK_INF:
        case GGML_OP_ADD:
        case GGML_OP_ADD1:
        case GGML_OP_ACC:
        case GGML_OP_SUB:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_SQR:
        case GGML_OP_SQRT:
        case GGML_OP_LOG:
        case GGML_OP_UNARY:
        case GGML_OP_ROPE:
        case GGML_OP_RMS_NORM:
        case GGML_OP_SET:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_CONT:
            return true;
        default:
            return false;
    }
}
static
void
allocate_node
(
struct
ggml_allocr
*
alloc
,
struct
ggml_tensor
*
node
)
{
struct
hash_node
*
ht
=
alloc
->
hash_table
;
if
(
node
->
data
==
NULL
)
{
if
(
ggml_is_view
(
node
))
{
size_t
offset
;
switch
(
node
->
op
)
{
case
GGML_OP_VIEW
:
memcpy
(
&
offset
,
node
->
op_params
,
sizeof
(
size_t
));
node
->
data
=
(
char
*
)
node
->
src
[
0
]
->
data
+
offset
;
break
;
case
GGML_OP_PERMUTE
:
case
GGML_OP_RESHAPE
:
case
GGML_OP_TRANSPOSE
:
node
->
data
=
node
->
src
[
0
]
->
data
;
break
;
case
GGML_OP_CPY
:
node
->
data
=
node
->
src
[
1
]
->
data
;
break
;
default:
GGML_ASSERT
(
!
"unknown view op"
);
break
;
}
}
else
{
// see if we can reuse a parent's buffer (inplace)
if
(
ggml_op_can_inplace
(
node
->
op
))
{
for
(
int
i
=
0
;
i
<
GGML_MAX_SRC
;
i
++
)
{
struct
ggml_tensor
*
parent
=
node
->
src
[
i
];
if
(
parent
==
NULL
)
{
break
;
}
struct
hash_node
*
p_hn
=
hash_get
(
ht
,
parent
);
if
(
parent
->
data
!=
NULL
&&
p_hn
->
n_children
==
1
&&
p_hn
->
n_views
==
0
&&
ggml_are_same_layout
(
node
,
parent
))
{
if
(
ggml_is_view
(
parent
))
{
struct
ggml_tensor
*
view_src
=
get_view_source
(
parent
);
struct
hash_node
*
view_src_hn
=
hash_get
(
ht
,
view_src
);
if
(
view_src_hn
->
n_views
==
1
&&
view_src_hn
->
n_children
==
0
&&
view_src
->
data
==
parent
->
data
)
{
// TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
// the parent's data that it will need later (same layout requirement). the problem is that then
// we cannot free the tensor because the original address of the allocation is lost.
// adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
// for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
AT_PRINTF
(
"reusing view parent %s (%s) for %s
\n
"
,
parent
->
name
,
view_src
->
name
,
node
->
name
);
node
->
data
=
parent
->
data
;
return
;
}
}
else
{
AT_PRINTF
(
"reusing parent %s for %s
\n
"
,
parent
->
name
,
node
->
name
);
node
->
data
=
parent
->
data
;
}
return
;
}
}
}
ggml_allocr_alloc
(
alloc
,
node
);
}
}
}
// Allocate data for every tensor in `n_graphs` compute graphs, reusing and
// freeing buffers as tensors die. `inputs`/`outputs` are optional NULL-terminated
// per-graph tensor lists: inputs are allocated first so they never overwrite
// each other; outputs (which have no children) are freed explicitly at the end
// of their graph. Returns the allocator's high-water mark in bytes.
// FIX: the debug trace for view sources read `view_src->n_children` /
// `view_src->n_views` — those counters live in struct hash_node, not
// ggml_tensor, so defining AT_PRINTF as printf broke the build. It now reads
// them from the hash-table entry (view_src_hn).
static size_t ggml_allocator_alloc_graph_tensors_n(
        struct ggml_allocr * alloc,
        struct ggml_cgraph ** graphs, int n_graphs,
        struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
    // reset hash table
    struct hash_node * ht = alloc->hash_table;
    memset(ht, 0, sizeof(struct hash_node) * GGML_GRAPH_HASHTABLE_SIZE);

    // count number of children and views
    for (int g = 0; g < n_graphs; g++) {
        struct ggml_cgraph * gf = graphs[g];
        for (int i = 0; i < gf->n_nodes; i++) {
            struct ggml_tensor * node = gf->nodes[i];

            if (ggml_is_view(node)) {
                struct ggml_tensor * view_src = get_view_source(node);
                hash_get(ht, view_src)->n_views += 1;
            }

            for (int j = 0; j < GGML_MAX_SRC; j++) {
                struct ggml_tensor * parent = node->src[j];
                if (parent == NULL) {
                    break;
                }
                hash_get(ht, parent)->n_children += 1;
            }
        }
    }

    // allocate tensors
    for (int g = 0; g < n_graphs; g++) {
        struct ggml_cgraph * gf = graphs[g];
        AT_PRINTF("####### graph %d/%d\n", g, n_graphs);
        // graph inputs are allocated first to ensure that they are not overwritten by each other
        if (inputs != NULL && inputs[g] != NULL) {
            for (int i = 0; inputs[g][i] != NULL; i++) {
                struct ggml_tensor * input = inputs[g][i];
                AT_PRINTF("input: %s\n", input->name);
                allocate_node(alloc, input);
            }
        }
        for (int i = 0; i < gf->n_nodes; i++) {
            struct ggml_tensor * node = gf->nodes[i];

            // allocate parents (leafs)
            for (int j = 0; j < GGML_MAX_SRC; j++) {
                struct ggml_tensor * parent = node->src[j];
                if (parent == NULL) {
                    break;
                }
                allocate_node(alloc, parent);
            }

            // allocate node
            allocate_node(alloc, node);

            AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
            for (int j = 0; j < GGML_MAX_SRC; j++) {
                struct ggml_tensor * parent = node->src[j];
                if (parent == NULL) {
                    break;
                }
                AT_PRINTF("%s", parent->name);
                if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
                    AT_PRINTF(", ");
                }
            }
            AT_PRINTF("\n");

            // update parents: decrement refcounts and free tensors that are no longer needed
            for (int j = 0; j < GGML_MAX_SRC; j++) {
                struct ggml_tensor * parent = node->src[j];
                if (parent == NULL) {
                    break;
                }
                struct hash_node * p_hn = hash_get(ht, parent);
                p_hn->n_children -= 1;

                //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);

                if (p_hn->n_children == 0 && p_hn->n_views == 0) {
                    if (ggml_is_view(parent)) {
                        struct ggml_tensor * view_src = get_view_source(parent);
                        struct hash_node * view_src_hn = hash_get(ht, view_src);
                        view_src_hn->n_views -= 1;
                        AT_PRINTF("view_src %s: %d children, %d views\n",
                                  view_src->name, view_src_hn->n_children, view_src_hn->n_views);
                        if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 &&
                            view_src->data != node->data) {
                            ggml_allocator_free_tensor(alloc, view_src);
                        }
                    } else {
                        if (parent->data != node->data) {
                            ggml_allocator_free_tensor(alloc, parent);
                        }
                    }
                }
            }
            AT_PRINTF("\n");
        }
        // free graph outputs here that wouldn't be freed otherwise because they have no children
        if (outputs != NULL && outputs[g] != NULL) {
            for (int i = 0; outputs[g][i] != NULL; i++) {
                struct ggml_tensor * output = outputs[g][i];
                AT_PRINTF("output: %s\n", output->name);
                ggml_allocator_free_tensor(alloc, output);
            }
        }
    }

    return alloc->max_size;
}
// Convenience wrapper: allocate a single graph with no explicit input/output
// lists. Returns the allocator's high-water mark in bytes.
size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
    return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
}
llama/ggml-alloc.h
0 → 100644
View file @
7a1c3e62
/**
* llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
*
* MIT License
*
* Copyright (c) 2023 Georgi Gerganov
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once

#include "ggml.h"

#ifdef __cplusplus
extern "C" {
#endif

// Create an allocator over the caller-owned buffer [data, data+size).
GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);

// Create a size-measuring allocator (no real buffer; see ggml_allocr_is_measure).
GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);

// Destroy an allocator (does not free the underlying buffer).
GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);

// True when the allocator only measures required sizes.
GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);

// Discard all allocations and reset the free list.
GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);

// Allocate space for a single tensor and set tensor->data.
GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);

// Allocate all tensors in a compute graph; returns the high-water mark in bytes.
GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);

#ifdef __cplusplus
}
#endif
llama/ggml-cuda.cu
View file @
7a1c3e62
/**
* llama.cpp - git
d91f3f0c55663719ea03b76311e8c36ed55eb0e2
* llama.cpp - git
c574bddb368424b5996cbee2ec45ec050967d404
*
* MIT License
*
...
...
@@ -78,13 +78,41 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
} while (0)
#endif // CUDART_VERSION >= 11
#ifdef GGML_CUDA_
DMMV_
F16
#ifdef GGML_CUDA_F16
typedef
half
dfloat
;
// dequantize float
typedef
half2
dfloat2
;
#else
typedef
float
dfloat
;
// dequantize float
typedef
float2
dfloat2
;
#endif //GGML_CUDA_DMMV_F16
#endif //GGML_CUDA_F16
static
__device__
__forceinline__
int
get_int_from_int8
(
const
int8_t
*
x8
,
const
int
&
i32
)
{
const
uint16_t
*
x16
=
(
uint16_t
*
)
(
x8
+
sizeof
(
int
)
*
i32
);
// assume at least 2 byte alignment
int
x32
=
0
;
x32
|=
x16
[
0
]
<<
0
;
x32
|=
x16
[
1
]
<<
16
;
return
x32
;
}
static
__device__
__forceinline__
int
get_int_from_uint8
(
const
uint8_t
*
x8
,
const
int
&
i32
)
{
const
uint16_t
*
x16
=
(
uint16_t
*
)
(
x8
+
sizeof
(
int
)
*
i32
);
// assume at least 2 byte alignment
int
x32
=
0
;
x32
|=
x16
[
0
]
<<
0
;
x32
|=
x16
[
1
]
<<
16
;
return
x32
;
}
static
__device__
__forceinline__
int
get_int_from_int8_aligned
(
const
int8_t
*
x8
,
const
int
&
i32
)
{
return
*
((
int
*
)
(
x8
+
sizeof
(
int
)
*
i32
));
// assume at least 4 byte alignment
}
static
__device__
__forceinline__
int
get_int_from_uint8_aligned
(
const
uint8_t
*
x8
,
const
int
&
i32
)
{
return
*
((
int
*
)
(
x8
+
sizeof
(
int
)
*
i32
));
// assume at least 4 byte alignment
}
typedef
void
(
*
dequantize_kernel_t
)(
const
void
*
vx
,
const
int
ib
,
const
int
iqs
,
dfloat2
&
v
);
typedef
void
(
*
to_fp32_cuda_t
)(
const
void
*
__restrict__
x
,
float
*
__restrict__
y
,
int
k
,
cudaStream_t
stream
);
...
...
@@ -113,8 +141,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
#define QR4_1 2
#define QI4_1 (QK4_1 / (4 * QR4_1))
typedef
struct
{
half
d
;
// delta
half
m
;
// min
half2
dm
;
// dm.x = delta, dm.y = min
uint8_t
qs
[
QK4_1
/
2
];
// nibbles / quants
}
block_q4_1
;
static_assert
(
sizeof
(
block_q4_1
)
==
sizeof
(
ggml_fp16_t
)
*
2
+
QK4_1
/
2
,
"wrong q4_1 block size/padding"
);
...
...
@@ -133,8 +160,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
#define QR5_1 2
#define QI5_1 (QK5_1 / (4 * QR5_1))
typedef
struct
{
half
d
;
// delta
half
m
;
// min
half2
dm
;
// dm.x = delta, dm.y = min
uint8_t
qh
[
4
];
// 5-th bit of quants
uint8_t
qs
[
QK5_1
/
2
];
// nibbles / quants
}
block_q5_1
;
...
...
@@ -153,13 +179,19 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo
#define QR8_1 1
#define QI8_1 (QK8_1 / (4 * QR8_1))
typedef
struct
{
half
d
;
// delta
half
s
;
// unquantized sum
half2
ds
;
// ds.x = delta, ds.y = sum
int8_t
qs
[
QK8_0
];
// quants
}
block_q8_1
;
static_assert
(
sizeof
(
block_q8_1
)
==
2
*
sizeof
(
ggml_fp16_t
)
+
QK8_0
,
"wrong q8_1 block size/padding"
);
typedef
float
(
*
vec_dot_q_cuda_t
)(
const
void
*
__restrict__
vbq
,
const
block_q8_1
*
__restrict__
bq8_1
,
const
int
iqs
);
typedef
float
(
*
vec_dot_q_cuda_t
)(
const
void
*
__restrict__
vbq
,
const
block_q8_1
*
__restrict__
bq8_1
,
const
int
&
iqs
);
typedef
void
(
*
allocate_tiles_cuda_t
)(
int
**
x_ql
,
half2
**
x_dm
,
int
**
x_qh
,
int
**
x_sc
);
typedef
void
(
*
load_tiles_cuda_t
)(
const
void
*
__restrict__
vx
,
int
*
__restrict__
x_ql
,
half2
*
__restrict__
x_dm
,
int
*
__restrict__
x_qh
,
int
*
__restrict__
x_sc
,
const
int
&
i_offset
,
const
int
&
k
,
const
int
&
blocks_per_row
);
typedef
float
(
*
vec_dot_q_mul_mat_cuda_t
)(
const
int
*
__restrict__
x_ql
,
const
half2
*
__restrict__
x_dm
,
const
int
*
__restrict__
x_qh
,
const
int
*
__restrict__
x_sc
,
const
int
*
__restrict__
y_qs
,
const
half2
*
__restrict__
y_ms
,
const
int
&
i
,
const
int
&
j
,
const
int
&
k
);
//================================= k-quants
...
...
@@ -176,8 +208,7 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_
typedef
struct
{
uint8_t
scales
[
QK_K
/
16
];
// scales and mins, quantized with 4 bits
uint8_t
qs
[
QK_K
/
4
];
// quants
half
d
;
// super-block scale for quantized scales
half
dmin
;
// super-block scale for quantized mins
half2
dm
;
// super-block scale for quantized scales/mins
}
block_q2_K
;
static_assert
(
sizeof
(
block_q2_K
)
==
2
*
sizeof
(
ggml_fp16_t
)
+
QK_K
/
16
+
QK_K
/
4
,
"wrong q2_K block size/padding"
);
...
...
@@ -206,8 +237,7 @@ typedef struct {
static_assert
(
sizeof
(
block_q4_K
)
==
2
*
sizeof
(
ggml_fp16_t
)
+
QK_K
/
2
+
2
,
"wrong q4_K block size/padding"
);
#else
typedef
struct
{
half
d
;
// super-block scale for quantized scales
half
dmin
;
// super-block scale for quantized mins
half2
dm
;
// super-block scale for quantized scales/mins
uint8_t
scales
[
3
*
QK_K
/
64
];
// scales, quantized with 6 bits
uint8_t
qs
[
QK_K
/
2
];
// 4--bit quants
}
block_q4_K
;
...
...
@@ -226,11 +256,10 @@ typedef struct {
static_assert
(
sizeof
(
block_q5_K
)
==
sizeof
(
ggml_fp16_t
)
+
QK_K
/
2
+
QK_K
/
8
+
QK_K
/
16
,
"wrong q5_K block size/padding"
);
#else
typedef
struct
{
half
d
;
// super-block scale for quantized scales
half
dmin
;
// super-block scale for quantized mins
uint8_t
scales
[
K_SCALE_SIZE
];
// scales and mins, quantized with 6 bits
uint8_t
qh
[
QK_K
/
8
];
// quants, high bit
uint8_t
qs
[
QK_K
/
2
];
// quants, low 4 bits
half2
dm
;
// super-block scale for quantized scales/mins
uint8_t
scales
[
K_SCALE_SIZE
];
// scales and mins, quantized with 6 bits
uint8_t
qh
[
QK_K
/
8
];
// quants, high bit
uint8_t
qs
[
QK_K
/
2
];
// quants, low 4 bits
}
block_q5_K
;
static_assert
(
sizeof
(
block_q5_K
)
==
2
*
sizeof
(
ggml_fp16_t
)
+
K_SCALE_SIZE
+
QK_K
/
2
+
QK_K
/
8
,
"wrong q5_K block size/padding"
);
#endif
...
...
@@ -259,6 +288,10 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
#define CUDA_QUANTIZE_BLOCK_SIZE 256
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
#ifndef GGML_CUDA_MMQ_Y
#define GGML_CUDA_MMQ_Y 64
#endif // GGML_CUDA_MMQ_Y
// dmmv = dequantize_mul_mat_vec
#ifndef GGML_CUDA_DMMV_X
#define GGML_CUDA_DMMV_X 32
...
...
@@ -393,33 +426,33 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
v
.
x
=
vui
&
0xF
;
v
.
y
=
vui
>>
4
;
#ifdef GGML_CUDA_
DMMV_
F16
#ifdef GGML_CUDA_F16
v
=
__hsub2
(
v
,
{
8.0
f
,
8.0
f
});
v
=
__hmul2
(
v
,
{
d
,
d
});
#else
v
.
x
=
(
v
.
x
-
8.0
f
)
*
d
;
v
.
y
=
(
v
.
y
-
8.0
f
)
*
d
;
#endif // GGML_CUDA_
DMMV_
F16
#endif // GGML_CUDA_F16
}
static
__device__
__forceinline__
void
dequantize_q4_1
(
const
void
*
vx
,
const
int
ib
,
const
int
iqs
,
dfloat2
&
v
){
const
block_q4_1
*
x
=
(
const
block_q4_1
*
)
vx
;
const
dfloat
d
=
x
[
ib
].
d
;
const
dfloat
m
=
x
[
ib
].
m
;
const
dfloat
d
=
x
[
ib
].
d
m
.
x
;
const
dfloat
m
=
x
[
ib
].
dm
.
y
;
const
int
vui
=
x
[
ib
].
qs
[
iqs
];
v
.
x
=
vui
&
0xF
;
v
.
y
=
vui
>>
4
;
#ifdef GGML_CUDA_
DMMV_
F16
#ifdef GGML_CUDA_F16
v
=
__hmul2
(
v
,
{
d
,
d
});
v
=
__hadd2
(
v
,
{
m
,
m
});
#else
v
.
x
=
(
v
.
x
*
d
)
+
m
;
v
.
y
=
(
v
.
y
*
d
)
+
m
;
#endif // GGML_CUDA_
DMMV_
F16
#endif // GGML_CUDA_F16
}
static
__device__
__forceinline__
void
dequantize_q5_0
(
const
void
*
vx
,
const
int
ib
,
const
int
iqs
,
dfloat2
&
v
){
...
...
@@ -436,20 +469,20 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
v
.
x
=
((
x
[
ib
].
qs
[
iqs
]
&
0xf
)
|
xh_0
);
v
.
y
=
((
x
[
ib
].
qs
[
iqs
]
>>
4
)
|
xh_1
);
#ifdef GGML_CUDA_
DMMV_
F16
#ifdef GGML_CUDA_F16
v
=
__hsub2
(
v
,
{
16.0
f
,
16.0
f
});
v
=
__hmul2
(
v
,
{
d
,
d
});
#else
v
.
x
=
(
v
.
x
-
16.0
f
)
*
d
;
v
.
y
=
(
v
.
y
-
16.0
f
)
*
d
;
#endif // GGML_CUDA_
DMMV_
F16
#endif // GGML_CUDA_F16
}
static
__device__
__forceinline__
void
dequantize_q5_1
(
const
void
*
vx
,
const
int
ib
,
const
int
iqs
,
dfloat2
&
v
){
const
block_q5_1
*
x
=
(
const
block_q5_1
*
)
vx
;
const
dfloat
d
=
x
[
ib
].
d
;
const
dfloat
m
=
x
[
ib
].
m
;
const
dfloat
d
=
x
[
ib
].
d
m
.
x
;
const
dfloat
m
=
x
[
ib
].
dm
.
y
;
uint32_t
qh
;
memcpy
(
&
qh
,
x
[
ib
].
qh
,
sizeof
(
qh
));
...
...
@@ -460,13 +493,13 @@ static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const in
v
.
x
=
((
x
[
ib
].
qs
[
iqs
]
&
0xf
)
|
xh_0
);
v
.
y
=
((
x
[
ib
].
qs
[
iqs
]
>>
4
)
|
xh_1
);
#ifdef GGML_CUDA_
DMMV_
F16
#ifdef GGML_CUDA_F16
v
=
__hmul2
(
v
,
{
d
,
d
});
v
=
__hadd2
(
v
,
{
m
,
m
});
#else
v
.
x
=
(
v
.
x
*
d
)
+
m
;
v
.
y
=
(
v
.
y
*
d
)
+
m
;
#endif // GGML_CUDA_
DMMV_
F16
#endif // GGML_CUDA_F16
}
static
__device__
__forceinline__
void
dequantize_q8_0
(
const
void
*
vx
,
const
int
ib
,
const
int
iqs
,
dfloat2
&
v
){
...
...
@@ -477,12 +510,12 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
v
.
x
=
x
[
ib
].
qs
[
iqs
+
0
];
v
.
y
=
x
[
ib
].
qs
[
iqs
+
1
];
#ifdef GGML_CUDA_
DMMV_
F16
#ifdef GGML_CUDA_F16
v
=
__hmul2
(
v
,
{
d
,
d
});
#else
v
.
x
*=
d
;
v
.
y
*=
d
;
#endif // GGML_CUDA_
DMMV_
F16
#endif // GGML_CUDA_F16
}
//================================== k-quants
...
...
@@ -501,8 +534,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
const
uint8_t
q
=
x
[
i
].
qs
[
32
*
n
+
l
];
float
*
y
=
yy
+
i
*
QK_K
+
128
*
n
;
float
dall
=
x
[
i
].
d
;
float
dmin
=
x
[
i
].
dm
in
;
float
dall
=
x
[
i
].
d
m
.
x
;
float
dmin
=
x
[
i
].
dm
.
y
;
y
[
l
+
0
]
=
dall
*
(
x
[
i
].
scales
[
is
+
0
]
&
0xF
)
*
((
q
>>
0
)
&
3
)
-
dmin
*
(
x
[
i
].
scales
[
is
+
0
]
>>
4
);
y
[
l
+
32
]
=
dall
*
(
x
[
i
].
scales
[
is
+
2
]
&
0xF
)
*
((
q
>>
2
)
&
3
)
-
dmin
*
(
x
[
i
].
scales
[
is
+
2
]
>>
4
);
y
[
l
+
64
]
=
dall
*
(
x
[
i
].
scales
[
is
+
4
]
&
0xF
)
*
((
q
>>
4
)
&
3
)
-
dmin
*
(
x
[
i
].
scales
[
is
+
4
]
>>
4
);
...
...
@@ -512,8 +545,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
const
int
il
=
tid
%
16
;
// 0...15
const
uint8_t
q
=
x
[
i
].
qs
[
il
]
>>
(
2
*
is
);
float
*
y
=
yy
+
i
*
QK_K
+
16
*
is
+
il
;
float
dall
=
x
[
i
].
d
;
float
dmin
=
x
[
i
].
dm
in
;
float
dall
=
x
[
i
].
d
m
.
x
;
float
dmin
=
x
[
i
].
dm
.
y
;
y
[
0
]
=
dall
*
(
x
[
i
].
scales
[
is
+
0
]
&
0xF
)
*
((
q
>>
0
)
&
3
)
-
dmin
*
(
x
[
i
].
scales
[
is
+
0
]
>>
4
);
y
[
32
]
=
dall
*
(
x
[
i
].
scales
[
is
+
2
]
&
0xF
)
*
((
q
>>
4
)
&
3
)
-
dmin
*
(
x
[
i
].
scales
[
is
+
2
]
>>
4
);
#endif
...
...
@@ -599,8 +632,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
float
*
y
=
yy
+
i
*
QK_K
+
64
*
il
+
n
*
ir
;
const
float
dall
=
x
[
i
].
d
;
const
float
dmin
=
x
[
i
].
dm
in
;
const
float
dall
=
x
[
i
].
d
m
.
x
;
const
float
dmin
=
x
[
i
].
dm
.
y
;
const
uint8_t
*
q
=
x
[
i
].
qs
+
32
*
il
+
n
*
ir
;
...
...
@@ -638,8 +671,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
float
*
y
=
yy
+
i
*
QK_K
+
64
*
il
+
2
*
ir
;
const
float
dall
=
x
[
i
].
d
;
const
float
dmin
=
x
[
i
].
dm
in
;
const
float
dall
=
x
[
i
].
d
m
.
x
;
const
float
dmin
=
x
[
i
].
dm
.
y
;
const
uint8_t
*
ql
=
x
[
i
].
qs
+
32
*
il
+
2
*
ir
;
const
uint8_t
*
qh
=
x
[
i
].
qh
+
2
*
ir
;
...
...
@@ -751,8 +784,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
const
float
*
y
=
yy
+
i
*
QK_K
+
y_offset
;
const
uint8_t
*
q
=
x
[
i
].
qs
+
q_offset
;
const
float
dall
=
x
[
i
].
d
;
const
float
dmin
=
x
[
i
].
dm
in
;
const
float
dall
=
x
[
i
].
d
m
.
x
;
const
float
dmin
=
x
[
i
].
dm
.
y
;
const
uint32_t
*
a
=
(
const
uint32_t
*
)(
x
[
i
].
scales
+
s_offset
);
aux
[
0
]
=
a
[
0
]
&
0x0f0f0f0f
;
...
...
@@ -794,9 +827,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
uaux
[
0
]
=
s
[
0
]
&
0x0f0f0f0f
;
uaux
[
1
]
=
(
s
[
0
]
>>
4
)
&
0x0f0f0f0f
;
const
half2
*
dh
=
(
const
half2
*
)
&
x
[
i
].
d
;
const
float2
dall
=
__half22float2
(
dh
[
0
]);
const
float2
dall
=
__half22float2
(
x
[
i
].
dm
);
float
sum1
=
0
,
sum2
=
0
;
for
(
int
l
=
0
;
l
<
K_QUANTS_PER_ITERATION
;
++
l
)
{
...
...
@@ -974,8 +1005,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
const
float
*
y1
=
yy
+
i
*
QK_K
+
y_offset
;
const
float
*
y2
=
y1
+
128
;
const
float
dall
=
x
[
i
].
d
;
const
float
dmin
=
x
[
i
].
dm
in
;
const
float
dall
=
x
[
i
].
d
m
.
x
;
const
float
dmin
=
x
[
i
].
dm
.
y
;
const
uint16_t
*
a
=
(
const
uint16_t
*
)
x
[
i
].
scales
;
aux
[
0
]
=
a
[
im
+
0
]
&
kmask1
;
...
...
@@ -1107,8 +1138,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
const
float
*
y1
=
yy
+
i
*
QK_K
+
y_offset
;
const
float
*
y2
=
y1
+
128
;
const
float
dall
=
x
[
i
].
d
;
const
float
dmin
=
x
[
i
].
dm
in
;
const
float
dall
=
x
[
i
].
d
m
.
x
;
const
float
dmin
=
x
[
i
].
dm
.
y
;
const
uint16_t
*
a
=
(
const
uint16_t
*
)
x
[
i
].
scales
;
aux
[
0
]
=
a
[
im
+
0
]
&
kmask1
;
...
...
@@ -1296,19 +1327,23 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
v
.
y
=
x
[
ib
+
iqs
+
1
];
}
static
__global__
void
quantize_q8_1
(
const
float
*
__restrict__
x
,
void
*
__restrict__
vy
,
const
int
ndata
,
const
int
k
)
{
const
int
i
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
static
__global__
void
quantize_q8_1
(
const
float
*
__restrict__
x
,
void
*
__restrict__
vy
,
const
int
kx
,
const
int
k
x_padded
)
{
const
int
i
x
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
if
(
i
>=
k
)
{
if
(
i
x
>=
k
x_padded
)
{
return
;
}
const
int
iy
=
blockDim
.
y
*
blockIdx
.
y
+
threadIdx
.
y
;
const
int
i_padded
=
iy
*
kx_padded
+
ix
;
block_q8_1
*
y
=
(
block_q8_1
*
)
vy
;
const
int
ib
=
i
/
QK8_1
;
// block index
const
int
iqs
=
i
%
QK8_1
;
// quant index
const
int
ib
=
i
_padded
/
QK8_1
;
// block index
const
int
iqs
=
i
_padded
%
QK8_1
;
// quant index
const
float
xi
=
i
<
ndata
?
x
[
i
]
:
0.0
f
;
const
float
xi
=
i
x
<
kx
?
x
[
i
y
*
kx
+
ix
]
:
0.0
f
;
float
amax
=
fabsf
(
xi
);
float
sum
=
xi
;
...
...
@@ -1327,8 +1362,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
return
;
}
y
[
ib
].
d
=
d
;
y
[
ib
].
s
=
sum
;
y
[
ib
].
d
s
.
x
=
d
;
y
[
ib
].
ds
.
y
=
sum
;
}
template
<
int
qk
,
int
qr
,
dequantize_kernel_t
dequantize_kernel
>
...
...
@@ -1352,45 +1387,114 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __
y
[
iybs
+
iqs
+
y_offset
]
=
v
.
y
;
}
static
__device__
__forceinline__
float
vec_dot_q4_0_q8_1
(
const
void
*
__restrict__
vbq
,
const
block_q8_1
*
__restrict__
bq8_1
,
const
int
iqs
)
{
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
const
block_q4_0
*
bq4_0
=
(
const
block_q4_0
*
)
vbq
;
// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
int
vi
;
memcpy
(
&
vi
,
&
bq4_0
->
qs
[
sizeof
(
int
)
*
(
iqs
+
0
)],
sizeof
(
int
));
const
int
ui0
=
*
((
int
*
)
&
bq8_1
->
qs
[
sizeof
(
int
)
*
(
iqs
+
0
)]);
const
int
ui1
=
*
((
int
*
)
&
bq8_1
->
qs
[
sizeof
(
int
)
*
(
iqs
+
QI4_0
)]);
#define VDR_q4_0_q8_1 1
const
float
d
=
__half2float
(
bq4_0
->
d
)
*
__half2float
(
bq8_1
->
d
);
static
__device__
__forceinline__
float
vec_dot_q4_0_q8_1_impl
(
const
int
&
vi
,
const
int
&
ui0
,
const
int
&
ui1
,
const
half
&
d4
,
const
half2
&
ds8
)
{
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
// subtract 8 from each quantized value
const
int
vi0
=
__vsub4
(
(
vi
>>
0
)
&
0x0F0F0F0F
,
0x08080808
)
;
const
int
vi1
=
__vsub4
(
(
vi
>>
4
)
&
0x0F0F0F0F
,
0x08080808
)
;
const
int
vi0
=
(
vi
>>
0
)
&
0x0F0F0F0F
;
const
int
vi1
=
(
vi
>>
4
)
&
0x0F0F0F0F
;
// SIMD dot product of quantized values
int
sumi
=
__dp4a
(
vi0
,
ui0
,
0
);
sumi
=
__dp4a
(
vi1
,
ui1
,
sumi
);
return
sumi
*
d
;
return
__half2float
(
d4
)
*
(
sumi
*
__half2float
(
ds8
.
x
)
-
(
8
/
QI4_0
)
*
__half2float
(
ds8
.
y
))
;
#else
return
0.0
f
;
// only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
static
__device__
__forceinline__
float
vec_dot_q4_1_q8_1
(
const
void
*
__restrict__
vbq
,
const
block_q8_1
*
__restrict__
bq8_1
,
const
int
iqs
)
{
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
const
block_q4_1
*
bq4_1
=
(
const
block_q4_1
*
)
vbq
;
static
__device__
__forceinline__
float
vec_dot_q4_0_q8_1
(
const
void
*
__restrict__
vbq
,
const
block_q8_1
*
__restrict__
bq8_1
,
const
int
&
iqs
)
{
const
block_q4_0
*
bq4_0
=
(
const
block_q4_0
*
)
vbq
;
const
int
vi
=
get_int_from_uint8
(
bq4_0
->
qs
,
iqs
);
const
int
ui0
=
get_int_from_int8_aligned
(
bq8_1
->
qs
,
iqs
);
const
int
ui1
=
get_int_from_int8_aligned
(
bq8_1
->
qs
,
iqs
+
QI4_0
);
return
vec_dot_q4_0_q8_1_impl
(
vi
,
ui0
,
ui1
,
bq4_0
->
d
,
bq8_1
->
ds
);
}
static
__device__
__forceinline__
void
allocate_tiles_q4_0
(
int
**
x_ql
,
half2
**
x_dm
,
int
**
x_qh
,
int
**
x_sc
)
{
__shared__
int
tile_x_qs
[
GGML_CUDA_MMQ_Y
*
(
WARP_SIZE
)
+
GGML_CUDA_MMQ_Y
];
__shared__
half2
tile_x_d
[
GGML_CUDA_MMQ_Y
*
(
WARP_SIZE
/
QI4_0
)
+
GGML_CUDA_MMQ_Y
/
QI4_0
];
*
x_ql
=
tile_x_qs
;
*
x_dm
=
tile_x_d
;
}
static
__device__
__forceinline__
void
load_tiles_q4_0
(
const
void
*
__restrict__
vx
,
int
*
__restrict__
x_ql
,
half2
*
__restrict__
x_dm
,
int
*
__restrict__
x_qh
,
int
*
__restrict__
x_sc
,
const
int
&
i_offset
,
const
int
&
k
,
const
int
&
blocks_per_row
)
{
__builtin_assume
(
i_offset
>=
0
);
__builtin_assume
(
i_offset
<
8
);
__builtin_assume
(
k
>=
0
);
__builtin_assume
(
k
<
WARP_SIZE
);
const
int
kbx
=
k
/
QI4_0
;
const
int
kqsx
=
k
%
QI4_0
;
const
block_q4_0
*
bx0
=
(
block_q4_0
*
)
vx
;
#pragma unroll
for
(
int
i0
=
0
;
i0
<
GGML_CUDA_MMQ_Y
;
i0
+=
8
)
{
const
int
i
=
i0
+
i_offset
;
const
int
vi
=
*
((
int
*
)
&
bq4_1
->
qs
[
sizeof
(
int
)
*
(
iqs
+
0
)]);
const
int
ui0
=
*
((
int
*
)
&
bq8_1
->
qs
[
sizeof
(
int
)
*
(
iqs
+
0
)]);
const
int
ui1
=
*
((
int
*
)
&
bq8_1
->
qs
[
sizeof
(
int
)
*
(
iqs
+
QI4_1
)]);
const
block_q4_0
*
bxi
=
bx0
+
i
*
blocks_per_row
+
kbx
;
const
float
d
=
__half2float
(
bq4_1
->
d
)
*
__half2float
(
bq8_1
->
d
);
const
float
m
=
bq4_1
->
m
;
const
float
s
=
bq8_1
->
s
;
x_ql
[
i
*
(
WARP_SIZE
+
1
)
+
k
]
=
get_int_from_uint8
(
bxi
->
qs
,
kqsx
);
x_dm
[
i
*
(
WARP_SIZE
/
QI4_0
)
+
i
/
QI4_0
+
kbx
].
x
=
bxi
->
d
;
}
// const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
// const int kbxd = k % blocks_per_tile_x_row;
// #pragma unroll
// for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_0) {
// const int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
// if (i >= GGML_CUDA_MMQ_Y) {
// return;
// }
// const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
// x_dm[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd].x = bxi->d;
// }
}
static
__device__
__forceinline__
float
vec_dot_q4_0_q8_1_mul_mat
(
const
int
*
__restrict__
x_ql
,
const
half2
*
__restrict__
x_dm
,
const
int
*
__restrict__
x_qh
,
const
int
*
__restrict__
x_sc
,
const
int
*
__restrict__
y_qs
,
const
half2
*
__restrict__
y_ds
,
const
int
&
i
,
const
int
&
j
,
const
int
&
k
)
{
__builtin_assume
(
i
>=
0
);
__builtin_assume
(
i
<
GGML_CUDA_MMQ_Y
);
__builtin_assume
(
j
>=
0
);
__builtin_assume
(
j
<
WARP_SIZE
);
__builtin_assume
(
k
>=
0
);
__builtin_assume
(
k
<
WARP_SIZE
);
const
int
kyqs
=
k
%
(
QI8_1
/
2
)
+
QI8_1
*
(
k
/
(
QI8_1
/
2
));
return
vec_dot_q4_0_q8_1_impl
(
x_ql
[
i
*
(
WARP_SIZE
+
1
)
+
k
],
y_qs
[
j
*
(
2
*
WARP_SIZE
)
+
kyqs
],
y_qs
[
j
*
(
2
*
WARP_SIZE
)
+
kyqs
+
(
QI8_1
/
2
)],
x_dm
[
i
*
(
WARP_SIZE
/
QI4_0
)
+
i
/
QI4_0
+
k
/
QI4_0
].
x
,
y_ds
[
j
*
(
2
*
WARP_SIZE
/
QI8_1
)
+
2
*
k
/
QI8_1
]);
}
#define VDR_q4_1_q8_1 1
static
__device__
__forceinline__
float
vec_dot_q4_1_q8_1_impl
(
const
int
&
vi
,
const
int
&
ui0
,
const
int
&
ui1
,
const
half2
&
dm4
,
const
half2
&
ds8
)
{
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
const
int
vi0
=
(
vi
>>
0
)
&
0x0F0F0F0F
;
const
int
vi1
=
(
vi
>>
4
)
&
0x0F0F0F0F
;
...
...
@@ -1398,439 +1502,1439 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
int
sumi
=
__dp4a
(
vi0
,
ui0
,
0
);
sumi
=
__dp4a
(
vi1
,
ui1
,
sumi
);
return
sumi
*
d
+
m
*
s
/
QI4_1
;
// scale sum by QI4_1 because there are QI4_1 threads working on this block
#ifdef GGML_CUDA_F16
const
half2
tmp
=
__hmul2
(
dm4
,
ds8
);
const
float
d4d8
=
__half2float
(
tmp
.
x
);
const
float
m4s8
=
__half2float
(
tmp
.
y
);
#else
const
float
d4d8
=
__half2float
(
dm4
.
x
)
*
__half2float
(
ds8
.
x
);
const
float
m4s8
=
__half2float
(
dm4
.
y
)
*
__half2float
(
ds8
.
y
);
#endif // GGML_CUDA_F16
// scale second part of sum by QI8_1/QR4_1 to compensate for multiple threads adding it
return
sumi
*
d4d8
+
m4s8
/
(
QI8_1
/
QR4_1
);
#else
return
0.0
f
;
// only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
static
__device__
__forceinline__
float
vec_dot_q5_0_q8_1
(
const
void
*
__restrict__
vbq
,
const
block_q8_1
*
__restrict__
bq8_1
,
const
int
iqs
)
{
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
const
block_q5_0
*
bq5_0
=
(
const
block_q5_0
*
)
vbq
;
static
__device__
__forceinline__
float
vec_dot_q4_1_q8_1
(
const
void
*
__restrict__
vbq
,
const
block_q8_1
*
__restrict__
bq8_1
,
const
int
&
iqs
)
{
const
block_q4_1
*
bq4_1
=
(
const
block_q4_1
*
)
vbq
;
const
int
vi
=
get_int_from_uint8_aligned
(
bq4_1
->
qs
,
iqs
);
const
int
ui0
=
get_int_from_int8_aligned
(
bq8_1
->
qs
,
iqs
);
const
int
ui1
=
get_int_from_int8_aligned
(
bq8_1
->
qs
,
iqs
+
QI4_1
);
return
vec_dot_q4_1_q8_1_impl
(
vi
,
ui0
,
ui1
,
bq4_1
->
dm
,
bq8_1
->
ds
);
}
static
__device__
__forceinline__
void
allocate_tiles_q4_1
(
int
**
x_ql
,
half2
**
x_dm
,
int
**
x_qh
,
int
**
x_sc
)
{
__shared__
int
tile_x_qs
[
GGML_CUDA_MMQ_Y
*
(
WARP_SIZE
)
+
+
GGML_CUDA_MMQ_Y
];
__shared__
half2
tile_x_dm
[
GGML_CUDA_MMQ_Y
*
(
WARP_SIZE
/
QI4_1
)
+
GGML_CUDA_MMQ_Y
/
QI4_1
];
*
x_ql
=
tile_x_qs
;
*
x_dm
=
tile_x_dm
;
}
static
__device__
__forceinline__
void
load_tiles_q4_1
(
const
void
*
__restrict__
vx
,
int
*
__restrict__
x_ql
,
half2
*
__restrict__
x_dm
,
int
*
__restrict__
x_qh
,
int
*
__restrict__
x_sc
,
const
int
&
i_offset
,
const
int
&
k
,
const
int
&
blocks_per_row
)
{
__builtin_assume
(
i_offset
>=
0
);
__builtin_assume
(
i_offset
<
8
);
__builtin_assume
(
k
>=
0
);
__builtin_assume
(
k
<
WARP_SIZE
);
const
int
kbx
=
k
/
QI4_1
;
const
int
kqsx
=
k
%
QI4_1
;
const
block_q4_1
*
bx0
=
(
block_q4_1
*
)
vx
;
#pragma unroll
for
(
int
i0
=
0
;
i0
<
GGML_CUDA_MMQ_Y
;
i0
+=
8
)
{
const
int
i
=
i0
+
i_offset
;
const
block_q4_1
*
bxi
=
bx0
+
i
*
blocks_per_row
+
kbx
;
x_ql
[
i
*
(
WARP_SIZE
+
1
)
+
k
]
=
get_int_from_uint8_aligned
(
bxi
->
qs
,
kqsx
);
}
const
int
blocks_per_tile_x_row
=
WARP_SIZE
/
QI4_1
;
const
int
kbxd
=
k
%
blocks_per_tile_x_row
;
#pragma unroll
for
(
int
i0
=
0
;
i0
<
GGML_CUDA_MMQ_Y
;
i0
+=
8
*
QI4_1
)
{
const
int
i
=
i0
+
i_offset
*
QI4_1
+
k
/
blocks_per_tile_x_row
;
const
block_q4_1
*
bxi
=
bx0
+
i
*
blocks_per_row
+
kbxd
;
x_dm
[
i
*
(
WARP_SIZE
/
QI4_1
)
+
i
/
QI4_1
+
kbxd
]
=
bxi
->
dm
;
}
}
static
__device__
__forceinline__
float
vec_dot_q4_1_q8_1_mul_mat
(
const
int
*
__restrict__
x_ql
,
const
half2
*
__restrict__
x_dm
,
const
int
*
__restrict__
x_qh
,
const
int
*
__restrict__
x_sc
,
const
int
*
__restrict__
y_qs
,
const
half2
*
__restrict__
y_ds
,
const
int
&
i
,
const
int
&
j
,
const
int
&
k
)
{
__builtin_assume
(
i
>=
0
);
__builtin_assume
(
i
<
GGML_CUDA_MMQ_Y
);
__builtin_assume
(
j
>=
0
);
__builtin_assume
(
j
<
WARP_SIZE
);
__builtin_assume
(
k
>=
0
);
__builtin_assume
(
k
<
WARP_SIZE
);
int
qs
;
memcpy
(
&
qs
,
&
bq5_0
->
qs
[
sizeof
(
int
)
*
(
iqs
+
0
)],
sizeof
(
int
));
const
int
qh0
=
bq5_0
->
qh
[
iqs
/
2
+
0
]
>>
4
*
(
iqs
%
2
);
const
int
qh1
=
bq5_0
->
qh
[
iqs
/
2
+
2
]
>>
4
*
(
iqs
%
2
);
const
int
ui0
=
*
((
int
*
)
&
bq8_1
->
qs
[
sizeof
(
int
)
*
(
iqs
+
0
)]);
const
int
ui1
=
*
((
int
*
)
&
bq8_1
->
qs
[
sizeof
(
int
)
*
(
iqs
+
QI5_0
)]);
const
float
d
=
__half2float
(
bq5_0
->
d
)
*
__half2float
(
bq8_1
->
d
);
int
vi0
=
(
qs
>>
0
)
&
0x0F0F0F0F
;
// lower 4 qs bits, still need qh0 as 5th bits
vi0
|=
(
qh0
<<
4
)
&
0x00000010
;
// 1 -> 5
vi0
|=
(
qh0
<<
11
)
&
0x00001000
;
// 2 -> 13
vi0
|=
(
qh0
<<
18
)
&
0x00100000
;
// 3 -> 21
vi0
|=
(
qh0
<<
25
)
&
0x10000000
;
// 4 -> 29
vi0
=
__vsub4
(
vi0
,
0x10101010
);
// subtract 16 from quantized values
const
int
kyqs
=
k
%
(
QI8_1
/
2
)
+
QI8_1
*
(
k
/
(
QI8_1
/
2
));
return
vec_dot_q4_1_q8_1_impl
(
x_ql
[
i
*
(
WARP_SIZE
+
1
)
+
k
],
y_qs
[
j
*
(
2
*
WARP_SIZE
)
+
kyqs
],
y_qs
[
j
*
(
2
*
WARP_SIZE
)
+
kyqs
+
(
QI8_1
/
2
)],
x_dm
[
i
*
(
WARP_SIZE
/
QI4_1
)
+
i
/
QI4_1
+
k
/
QI4_1
],
y_ds
[
j
*
(
2
*
WARP_SIZE
/
QI8_1
)
+
2
*
k
/
QI8_1
]);
}
#define VDR_q5_0_q8_1 1
static
__device__
__forceinline__
float
vec_dot_q5_0_q8_1_impl
(
const
int
&
qs
,
const
int
&
qh
,
const
int
&
ui0
,
const
int
&
ui1
,
const
half
&
d5
,
const
half2
&
ds8
)
{
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
int
vi0
=
(
qs
>>
0
)
&
0x0F0F0F0F
;
// lower 4 qs bits, still need qh as 5th bits
vi0
|=
(
qh
<<
4
)
&
0x00000010
;
// 0 -> 4
vi0
|=
(
qh
<<
11
)
&
0x00001000
;
// 1 -> 12
vi0
|=
(
qh
<<
18
)
&
0x00100000
;
// 2 -> 20
vi0
|=
(
qh
<<
25
)
&
0x10000000
;
// 3 -> 28
int
sumi
=
__dp4a
(
vi0
,
ui0
,
0
);
// SIMD dot product of quantized values
int
vi1
=
(
qs
>>
4
)
&
0x0F0F0F0F
;
// upper 4 qs bits, still need qh1 as 5th bits
vi1
|=
(
qh1
<<
4
)
&
0x00000010
;
// 1 -> 5
vi1
|=
(
qh1
<<
11
)
&
0x00001000
;
// 2 -> 13
vi1
|=
(
qh1
<<
18
)
&
0x00100000
;
// 3 -> 21
vi1
|=
(
qh1
<<
25
)
&
0x10000000
;
// 4 -> 29
vi1
=
__vsub4
(
vi1
,
0x10101010
);
// subtract 16 from quantized values
int
vi1
=
(
qs
>>
4
)
&
0x0F0F0F0F
;
// upper 4 qs bits, still need qh as 5th bits
vi1
|=
(
qh
>>
12
)
&
0x00000010
;
// 16 -> 4
vi1
|=
(
qh
>>
5
)
&
0x00001000
;
// 17 -> 12
vi1
|=
(
qh
<<
2
)
&
0x00100000
;
// 18 -> 20
vi1
|=
(
qh
<<
9
)
&
0x10000000
;
// 19 -> 28
sumi
=
__dp4a
(
vi1
,
ui1
,
sumi
);
// SIMD dot product of quantized values
return
sumi
*
d
;
return
__half2float
(
d5
)
*
(
sumi
*
__half2float
(
ds8
.
x
)
-
(
16
/
QI5_0
)
*
__half2float
(
ds8
.
y
))
;
#else
return
0.0
f
;
// only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
static
__device__
__forceinline__
float
vec_dot_q5_1_q8_1
(
const
void
*
__restrict__
vbq
,
const
block_q8_1
*
__restrict__
bq8_1
,
const
int
iqs
)
{
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
const
block_q5_1
*
bq5_1
=
(
const
block_q5_1
*
)
vbq
;
static
__device__
__forceinline__
float
vec_dot_q5_0_q8_1
(
const
void
*
__restrict__
vbq
,
const
block_q8_1
*
__restrict__
bq8_1
,
const
int
&
iqs
)
{
const
block_q5_0
*
bq5_0
=
(
const
block_q5_0
*
)
vbq
;
const
int
qs
=
get_int_from_uint8
(
bq5_0
->
qs
,
iqs
);
const
int
qh
=
get_int_from_uint8
(
bq5_0
->
qh
,
0
)
>>
(
4
*
iqs
);
const
int
ui0
=
get_int_from_int8_aligned
(
bq8_1
->
qs
,
iqs
);
const
int
ui1
=
get_int_from_int8_aligned
(
bq8_1
->
qs
,
iqs
+
QI5_0
);
return
vec_dot_q5_0_q8_1_impl
(
qs
,
qh
,
ui0
,
ui1
,
bq5_0
->
d
,
bq8_1
->
ds
);
}
static
__device__
__forceinline__
void
allocate_tiles_q5_0
(
int
**
x_ql
,
half2
**
x_dm
,
int
**
x_qh
,
int
**
x_sc
)
{
__shared__
int
tile_x_ql
[
GGML_CUDA_MMQ_Y
*
(
WARP_SIZE
)
+
GGML_CUDA_MMQ_Y
];
__shared__
int
tile_x_qh
[
GGML_CUDA_MMQ_Y
*
(
WARP_SIZE
/
QI5_0
)
+
GGML_CUDA_MMQ_Y
/
QI5_0
];
__shared__
half2
tile_x_d
[
GGML_CUDA_MMQ_Y
*
(
WARP_SIZE
/
QI5_0
)
+
GGML_CUDA_MMQ_Y
/
QI5_0
];
*
x_ql
=
tile_x_ql
;
*
x_qh
=
tile_x_qh
;
*
x_dm
=
tile_x_d
;
}
static
__device__
__forceinline__
void
load_tiles_q5_0
(
const
void
*
__restrict__
vx
,
int
*
__restrict__
x_ql
,
half2
*
__restrict__
x_dm
,
int
*
__restrict__
x_qh
,
int
*
__restrict__
x_sc
,
const
int
&
i_offset
,
const
int
&
k
,
const
int
&
blocks_per_row
)
{
__builtin_assume
(
i_offset
>=
0
);
__builtin_assume
(
i_offset
<
8
);
__builtin_assume
(
k
>=
0
);
__builtin_assume
(
k
<
WARP_SIZE
);
const
int
kbx
=
k
/
QI5_0
;
const
int
kqsx
=
k
%
QI5_0
;
const
block_q5_0
*
bx0
=
(
block_q5_0
*
)
vx
;
#pragma unroll
for
(
int
i0
=
0
;
i0
<
GGML_CUDA_MMQ_Y
;
i0
+=
8
)
{
const
int
i
=
i0
+
i_offset
;
const
block_q5_0
*
bxi
=
bx0
+
i
*
blocks_per_row
+
kbx
;
x_ql
[
i
*
(
WARP_SIZE
+
1
)
+
k
]
=
get_int_from_uint8
(
bxi
->
qs
,
kqsx
);
}
const
int
blocks_per_tile_x_row
=
WARP_SIZE
/
QI5_0
;
const
int
kbxd
=
k
%
blocks_per_tile_x_row
;
#pragma unroll
for
(
int
i0
=
0
;
i0
<
GGML_CUDA_MMQ_Y
;
i0
+=
8
*
QI5_0
)
{
const
int
i
=
i0
+
i_offset
*
QI5_0
+
k
/
blocks_per_tile_x_row
;
const
block_q5_0
*
bxi
=
bx0
+
i
*
blocks_per_row
+
kbxd
;
const
int
qs
=
*
((
int
*
)
&
bq5_1
->
qs
[
sizeof
(
int
)
*
(
iqs
+
0
)]);
const
int
qh0
=
bq5_1
->
qh
[
iqs
/
2
+
0
]
>>
4
*
(
iqs
%
2
);
const
int
qh1
=
bq5_1
->
qh
[
iqs
/
2
+
2
]
>>
4
*
(
iqs
%
2
);
const
int
ui0
=
*
((
int
*
)
&
bq8_1
->
qs
[
sizeof
(
int
)
*
(
iqs
+
0
)]);
const
int
ui1
=
*
((
int
*
)
&
bq8_1
->
qs
[
sizeof
(
int
)
*
(
iqs
+
QI5_1
)]);
const
float
d
=
__half2float
(
bq5_1
->
d
)
*
__half2float
(
bq8_1
->
d
);
const
float
m
=
bq5_1
->
m
;
const
float
s
=
bq8_1
->
s
;
int
vi0
=
(
qs
>>
0
)
&
0x0F0F0F0F
;
// lower 4 qs bits, still need qh0 as 5th bits
vi0
|=
(
qh0
<<
4
)
&
0x00000010
;
// 1 -> 5
vi0
|=
(
qh0
<<
11
)
&
0x00001000
;
// 2 -> 13
vi0
|=
(
qh0
<<
18
)
&
0x00100000
;
// 3 -> 21
vi0
|=
(
qh0
<<
25
)
&
0x10000000
;
// 4 -> 29
x_qh
[
i
*
(
WARP_SIZE
/
QI5_0
)
+
i
/
QI5_0
+
kbxd
]
=
get_int_from_uint8
(
bxi
->
qh
,
0
);
x_dm
[
i
*
(
WARP_SIZE
/
QI5_0
)
+
i
/
QI5_0
+
kbxd
].
x
=
bxi
->
d
;
}
}
static
__device__
__forceinline__
float
vec_dot_q5_0_q8_1_mul_mat
(
const
int
*
__restrict__
x_ql
,
const
half2
*
__restrict__
x_dm
,
const
int
*
__restrict__
x_qh
,
const
int
*
__restrict__
x_sc
,
const
int
*
__restrict__
y_qs
,
const
half2
*
__restrict__
y_ds
,
const
int
&
i
,
const
int
&
j
,
const
int
&
k
)
{
__builtin_assume
(
i
>=
0
);
__builtin_assume
(
i
<
GGML_CUDA_MMQ_Y
);
__builtin_assume
(
j
>=
0
);
__builtin_assume
(
j
<
WARP_SIZE
);
__builtin_assume
(
k
>=
0
);
__builtin_assume
(
k
<
WARP_SIZE
);
const
int
kyqs
=
k
%
(
QI8_1
/
2
)
+
QI8_1
*
(
k
/
(
QI8_1
/
2
));
const
int
index_bx
=
i
*
(
WARP_SIZE
/
QI5_0
)
+
i
/
QI5_0
+
k
/
QI5_0
;
return
vec_dot_q5_0_q8_1_impl
(
x_ql
[
i
*
(
WARP_SIZE
+
1
)
+
k
],
x_qh
[
index_bx
]
>>
(
4
*
(
k
%
QI5_0
)),
y_qs
[
j
*
(
2
*
WARP_SIZE
)
+
kyqs
],
y_qs
[
j
*
(
2
*
WARP_SIZE
)
+
kyqs
+
(
QI8_1
/
2
)],
x_dm
[
index_bx
].
x
,
y_ds
[
j
*
(
2
*
WARP_SIZE
/
QI8_1
)
+
2
*
k
/
QI8_1
]);
}
#define VDR_q5_1_q8_1 1
static
__device__
__forceinline__
float
vec_dot_q5_1_q8_1_impl
(
const
int
&
qs
,
const
int
&
qh
,
const
int
&
ui0
,
const
int
&
ui1
,
const
half2
&
dm5
,
const
half2
&
ds8
)
{
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
int
vi0
=
(
qs
>>
0
)
&
0x0F0F0F0F
;
// lower 4 qs bits, still need qh0 as 5th bits
vi0
|=
(
qh
<<
4
)
&
0x00000010
;
// 0 -> 4
vi0
|=
(
qh
<<
11
)
&
0x00001000
;
// 1 -> 12
vi0
|=
(
qh
<<
18
)
&
0x00100000
;
// 2 -> 20
vi0
|=
(
qh
<<
25
)
&
0x10000000
;
// 3 -> 28
int
sumi
=
__dp4a
(
vi0
,
ui0
,
0
);
// SIMD dot product of quantized values
int
vi1
=
(
qs
>>
4
)
&
0x0F0F0F0F
;
// upper 4 qs bits, still need qh1 as 5th bits
vi1
|=
(
qh
1
<<
4
)
&
0x00000010
;
// 1 ->
5
vi1
|=
(
qh
1
<<
11
)
&
0x00001000
;
//
2
-> 1
3
vi1
|=
(
qh
1
<<
18
)
&
0x00100000
;
//
3
-> 2
1
vi1
|=
(
qh
1
<<
25
)
&
0x10000000
;
//
4
-> 2
9
int
vi1
=
(
qs
>>
4
)
&
0x0F0F0F0F
;
// upper 4 qs bits, still need qh1 as 5th bits
vi1
|=
(
qh
>>
12
)
&
0x00000010
;
// 1
6
->
4
vi1
|=
(
qh
>>
5
)
&
0x00001000
;
//
17
-> 1
2
vi1
|=
(
qh
<<
2
)
&
0x00100000
;
//
18
-> 2
0
vi1
|=
(
qh
<<
9
)
&
0x10000000
;
//
19
-> 2
8
sumi
=
__dp4a
(
vi1
,
ui1
,
sumi
);
// SIMD dot product of quantized values
return
sumi
*
d
+
m
*
s
/
QI5_1
;
// scale sum by QI5_1 because there are QI5_1 threads working on this block
#ifdef GGML_CUDA_F16
const
half2
tmp
=
__hmul2
(
dm5
,
ds8
);
const
float
d5d8
=
__half2float
(
tmp
.
x
);
const
float
m5s8
=
__half2float
(
tmp
.
y
);
#else
const
float
d5d8
=
__half2float
(
dm5
.
x
)
*
__half2float
(
ds8
.
x
);
const
float
m5s8
=
__half2float
(
dm5
.
y
)
*
__half2float
(
ds8
.
y
);
#endif // GGML_CUDA_F16
return
sumi
*
d5d8
+
m5s8
/
QI5_1
;
// scale sum by QI5_1 because there are QI5_1 threads working on this block
#else
return
0.0
f
;
// only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
static
__device__
__forceinline__
float
vec_dot_q8_0_q8_1
(
const
void
*
__restrict__
vbq
,
const
block_q8_1
*
__restrict__
bq8_1
,
const
int
iqs
)
{
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
const
block_q8_0
*
bq8_0
=
(
const
block_q8_0
*
)
vbq
;
static
__device__
__forceinline__
float
vec_dot_q5_1_q8_1
(
const
void
*
__restrict__
vbq
,
const
block_q8_1
*
__restrict__
bq8_1
,
const
int
&
iqs
)
{
int
vi
;
memcpy
(
&
vi
,
&
bq8_0
->
qs
[
sizeof
(
int
)
*
(
iqs
+
0
)],
sizeof
(
int
));
const
int
ui
=
*
((
int
*
)
&
bq8_1
->
qs
[
sizeof
(
int
)
*
(
iqs
+
0
)]);
const
block_q5_1
*
bq5_1
=
(
const
block_q5_1
*
)
vbq
;
const
float
d
=
__half2float
(
bq8_0
->
d
)
*
__half2float
(
bq8_1
->
d
);
const
int
qs
=
get_int_from_uint8_aligned
(
bq5_1
->
qs
,
iqs
);
const
int
qh
=
get_int_from_uint8_aligned
(
bq5_1
->
qh
,
0
)
>>
(
4
*
iqs
);
const
int
ui0
=
get_int_from_int8_aligned
(
bq8_1
->
qs
,
iqs
);
const
int
ui1
=
get_int_from_int8_aligned
(
bq8_1
->
qs
,
iqs
+
QI5_1
);
// SIMD dot product of quantized values
int
sumi
=
__dp4a
(
vi
,
ui
,
0
);
return
vec_dot_q5_1_q8_1_impl
(
qs
,
qh
,
ui0
,
ui1
,
bq5_1
->
dm
,
bq8_1
->
ds
);
}
return
sumi
*
d
;
#else
return
0.0
f
;
// only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
static
__device__
__forceinline__
void
allocate_tiles_q5_1
(
int
**
x_ql
,
half2
**
x_dm
,
int
**
x_qh
,
int
**
x_sc
)
{
__shared__
int
tile_x_ql
[
GGML_CUDA_MMQ_Y
*
(
WARP_SIZE
)
+
GGML_CUDA_MMQ_Y
];
__shared__
int
tile_x_qh
[
GGML_CUDA_MMQ_Y
*
(
WARP_SIZE
/
QI5_1
)
+
GGML_CUDA_MMQ_Y
/
QI5_1
];
__shared__
half2
tile_x_dm
[
GGML_CUDA_MMQ_Y
*
(
WARP_SIZE
/
QI5_1
)
+
GGML_CUDA_MMQ_Y
/
QI5_1
];
*
x_ql
=
tile_x_ql
;
*
x_qh
=
tile_x_qh
;
*
x_dm
=
tile_x_dm
;
}
static
__device__
__forceinline__
float
vec_dot_q2_K_q8_1
(
const
void
*
__restrict__
vbq
,
const
block_q8_1
*
__restrict__
bq8_1
,
const
int
iqs
)
{
static
__device__
__forceinline__
void
load_tiles_q5_1
(
const
void
*
__restrict__
vx
,
int
*
__restrict__
x_ql
,
half2
*
__restrict__
x_dm
,
int
*
__restrict__
x_qh
,
int
*
__restrict__
x_sc
,
const
int
&
i_offset
,
const
int
&
k
,
const
int
&
blocks_per_row
)
{
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
const
block_q2_K
*
bq2_K
=
(
const
block_q2_K
*
)
vbq
;
__builtin_assume
(
i_offset
>=
0
);
__builtin_assume
(
i_offset
<
8
);
__builtin_assume
(
k
>=
0
);
__builtin_assume
(
k
<
WARP_SIZE
);
const
int
bq8_offset
=
QR2_K
*
(
iqs
/
QI
8
_1
)
;
const
int
scale_offset
=
iqs
-
iqs
%
QI8_1
+
(
iqs
%
QI
8
_1
)
/
(
QI8_1
/
2
)
;
const
int
kbx
=
k
/
QI
5
_1
;
const
int
kqsx
=
k
%
QI
5
_1
;
float
sumf_d
=
0.0
f
;
float
sumf_m
=
0.0
f
;
const
block_q5_1
*
bx0
=
(
block_q5_1
*
)
vx
;
const
float
d
=
bq2_K
->
d
;
const
float
dmin
=
bq2_K
->
dmin
;
#pragma unroll
for
(
int
i0
=
0
;
i0
<
GGML_CUDA_MMQ_Y
;
i0
+=
8
)
{
const
int
i
=
i0
+
i_offset
;
const
int
v
=
*
((
int
*
)
&
bq2_K
->
qs
[
sizeof
(
int
)
*
iqs
])
;
const
block_q5_1
*
bxi
=
bx0
+
i
*
blocks_per_row
+
kbx
;
for
(
int
i
=
0
;
i
<
QR2_K
;
++
i
)
{
const
int
sc
=
bq2_K
->
scales
[
scale_offset
+
2
*
i
];
x_ql
[
i
*
(
WARP_SIZE
+
1
)
+
k
]
=
get_int_from_uint8_aligned
(
bxi
->
qs
,
kqsx
);
}
const
block
_q8_1
*
bq8i
=
bq8_1
+
bq8_offset
+
i
;
const
float
d8i
=
bq8i
->
d
;
const
int
block
s_per_tile_x_row
=
WARP_SIZE
/
QI5_1
;
const
int
kbxd
=
k
%
blocks_per_tile_x_row
;
const
int
vi
=
(
v
>>
(
2
*
i
))
&
0x03030303
;
const
int
ui
=
*
((
int
*
)
&
bq8i
->
qs
[
sizeof
(
int
)
*
(
iqs
%
QI8_1
)]);
#pragma unroll
for
(
int
i0
=
0
;
i0
<
GGML_CUDA_MMQ_Y
;
i0
+=
8
*
QI5_1
)
{
const
int
i
=
i0
+
i_offset
*
QI5_1
+
k
/
blocks_per_tile_x_row
;
const
block_q5_1
*
bxi
=
bx0
+
i
*
blocks_per_row
+
kbxd
;
sumf_d
+=
d8i
*
(
__dp4a
(
vi
,
ui
,
0
)
*
(
sc
&
0xF
));
// SIMD dot product
sumf_m
+=
d8i
*
(
__dp4a
(
0x01010101
,
ui
,
0
)
*
(
sc
>>
4
));
// multiply constant q2_K part with sum of q8_1 values
x_qh
[
i
*
(
WARP_SIZE
/
QI5_1
)
+
i
/
QI5_1
+
kbxd
]
=
get_int_from_uint8_aligned
(
bxi
->
qh
,
0
);
x_dm
[
i
*
(
WARP_SIZE
/
QI5_1
)
+
i
/
QI5_1
+
kbxd
]
=
bxi
->
dm
;
}
}
static
__device__
__forceinline__
float
vec_dot_q5_1_q8_1_mul_mat
(
const
int
*
__restrict__
x_ql
,
const
half2
*
__restrict__
x_dm
,
const
int
*
__restrict__
x_qh
,
const
int
*
__restrict__
x_sc
,
const
int
*
__restrict__
y_qs
,
const
half2
*
__restrict__
y_ds
,
const
int
&
i
,
const
int
&
j
,
const
int
&
k
)
{
__builtin_assume
(
i
>=
0
);
__builtin_assume
(
i
<
GGML_CUDA_MMQ_Y
);
__builtin_assume
(
j
>=
0
);
__builtin_assume
(
j
<
WARP_SIZE
);
__builtin_assume
(
k
>=
0
);
__builtin_assume
(
k
<
WARP_SIZE
);
const
int
kyqs
=
k
%
(
QI8_1
/
2
)
+
QI8_1
*
(
k
/
(
QI8_1
/
2
));
const
int
index_bx
=
i
*
(
WARP_SIZE
/
QI5_1
)
+
+
i
/
QI5_1
+
k
/
QI5_1
;
return
vec_dot_q5_1_q8_1_impl
(
x_ql
[
i
*
(
WARP_SIZE
+
1
)
+
k
],
x_qh
[
index_bx
]
>>
(
4
*
(
k
%
QI5_1
)),
y_qs
[
j
*
(
2
*
WARP_SIZE
)
+
kyqs
],
y_qs
[
j
*
(
2
*
WARP_SIZE
)
+
kyqs
+
(
QI8_1
/
2
)],
x_dm
[
index_bx
],
y_ds
[
j
*
(
2
*
WARP_SIZE
/
QI8_1
)
+
2
*
k
/
QI8_1
]);
}
#define VDR_q8_0_q8_1 1
static
__device__
__forceinline__
float
vec_dot_q8_0_q8_1_impl
(
const
int
&
vi
,
const
int
&
ui
,
const
half
&
d8_0
,
const
half2
&
ds8_1
)
{
return
d
*
sumf_d
-
dmin
*
sumf_m
;
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
// SIMD dot product of quantized values
const
int
sumi
=
__dp4a
(
vi
,
ui
,
0
);
return
sumi
*
__half2float
(
d8_0
)
*
__half2float
(
ds8_1
.
x
);
#else
return
0.0
f
;
// only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
static
__device__
__forceinline__
float
vec_dot_q
3_K
_q8_1
(
const
void
*
__restrict__
vbq
,
const
block_q8_1
*
__restrict__
bq8_1
,
const
int
iqs
)
{
static
__device__
__forceinline__
float
vec_dot_q
8_0
_q8_1
(
const
void
*
__restrict__
vbq
,
const
block_q8_1
*
__restrict__
bq8_1
,
const
int
&
iqs
)
{
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
const
block_q3_K
*
bq3_K
=
(
const
block_q3_K
*
)
vbq
;
const
block_q8_0
*
bq8_0
=
(
const
block_q8_0
*
)
vbq
;
const
int
bq8_offset
=
QR3_K
*
(
iqs
/
(
QI3_K
/
2
)
);
const
int
scale_offset
=
iqs
-
iqs
%
QI8_1
+
(
iqs
%
QI8_1
)
/
(
QI8_1
/
2
);
const
int
vi
=
get_int_from_int8
(
bq8_0
->
qs
,
iqs
);
const
int
ui
=
get_int_from_int8_aligned
(
bq8_1
->
qs
,
iqs
);
float
sumf
=
0.0
f
;
return
vec_dot_q8_0_q8_1_impl
(
vi
,
ui
,
bq8_0
->
d
,
bq8_1
->
ds
);
}
const
float
d
=
bq3_K
->
d
;
static
__device__
__forceinline__
void
allocate_tiles_q8_0
(
int
**
x_ql
,
half2
**
x_dm
,
int
**
x_qh
,
int
**
x_sc
)
{
int
vl
;
memcpy
(
&
vl
,
&
bq3_K
->
qs
[
sizeof
(
int
)
*
iqs
],
sizeof
(
int
))
;
__shared__
int
tile_x_qs
[
GGML_CUDA_MMQ_Y
*
(
WARP_SIZE
)
+
GGML_CUDA_MMQ_Y
]
;
__shared__
half2
tile_x_d
[
GGML_CUDA_MMQ_Y
*
(
WARP_SIZE
/
QI8_0
)
+
GGML_CUDA_MMQ_Y
/
QI8_0
]
;
int
vh
;
memcpy
(
&
vh
,
&
bq3_K
->
hmask
[
sizeof
(
int
)
*
(
iqs
%
(
QI3_K
/
2
))],
sizeof
(
int
));
vh
=
~
vh
;
// invert the mask so that a 0/1 results in 4/0 being subtracted
vh
>>=
bq8_offset
;
*
x_ql
=
tile_x_qs
;
*
x_dm
=
tile_x_d
;
}
for
(
int
i
=
0
;
i
<
QR3_K
;
++
i
)
{
const
int
isc
=
scale_offset
+
2
*
i
;
static
__device__
__forceinline__
void
load_tiles_q8_0
(
const
void
*
__restrict__
vx
,
int
*
__restrict__
x_ql
,
half2
*
__restrict__
x_dm
,
int
*
__restrict__
x_qh
,
int
*
__restrict__
x_sc
,
const
int
&
i_offset
,
const
int
&
k
,
const
int
&
blocks_per_row
)
{
const
int
isc_low
=
isc
%
(
QK_K
/
32
);
const
int
sc_shift_low
=
4
*
(
isc
/
(
QK_K
/
32
));
const
int
sc_low
=
(
bq3_K
->
scales
[
isc_low
]
>>
sc_shift_low
)
&
0xF
;
__builtin_assume
(
i_offset
>=
0
);
__builtin_assume
(
i_offset
<
8
);
__builtin_assume
(
k
>=
0
);
__builtin_assume
(
k
<
WARP_SIZE
);
const
int
isc_high
=
isc
%
(
QK_K
/
64
);
const
int
sc_shift_high
=
2
*
(
isc
/
(
QK_K
/
64
));
const
int
sc_high
=
((
bq3_K
->
scales
[(
QK_K
/
32
)
+
isc_high
]
>>
sc_shift_high
)
&
3
)
<<
4
;
const
int
kbx
=
k
/
QI8_0
;
const
int
kqsx
=
k
%
QI8_0
;
const
int
sc
=
(
sc_low
|
sc_high
)
-
32
;
const
block_q8_0
*
bx0
=
(
block_q8_0
*
)
vx
;
const
block_q8_1
*
bq8i
=
bq8_1
+
bq8_offset
+
i
;
const
int
u
i
=
*
((
int
*
)
&
bq8i
->
qs
[
sizeof
(
int
)
*
(
iqs
%
QI8_1
)]);
const
float
d8i
=
bq8i
->
d
;
#pragma unroll
for
(
int
i
0
=
0
;
i0
<
GGML_CUDA_MMQ_Y
;
i0
+=
8
)
{
const
int
i
=
i0
+
i_offset
;
const
int
vil
=
(
vl
>>
(
2
*
i
))
&
0x03030303
;
const
block_q8_0
*
bxi
=
bx0
+
i
*
blocks_per_row
+
kbx
;
const
int
vih
=
((
vh
>>
i
)
<<
2
)
&
0x04040404
;
x_ql
[
i
*
(
WARP_SIZE
+
1
)
+
k
]
=
get_int_from_int8
(
bxi
->
qs
,
kqsx
);
x_dm
[
i
*
(
WARP_SIZE
/
QI8_0
)
+
i
/
QI8_0
+
kbx
].
x
=
bxi
->
d
;
}
const
int
vi
=
__vsubss4
(
vil
,
vih
);
// const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
// const int kbxd = k % blocks_per_tile_x_row;
// #pragma unroll
// for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI8_0) {
// const int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
// #if GGML_CUDA_MMQ_Y < 64
// if (i >= GGML_CUDA_MMQ_Y) {
// return;
// }
// #endif // GGML_CUDA_MMQ_Y < 64
// const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
// x_dm[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd].x = bxi->d;
// }
}
sumf
+=
d8i
*
(
__dp4a
(
vi
,
ui
,
0
)
*
sc
);
// SIMD dot product
static
__device__
__forceinline__
float
vec_dot_q8_0_q8_1_mul_mat
(
const
int
*
__restrict__
x_ql
,
const
half2
*
__restrict__
x_dm
,
const
int
*
__restrict__
x_qh
,
const
int
*
__restrict__
x_sc
,
const
int
*
__restrict__
y_qs
,
const
half2
*
__restrict__
y_ds
,
const
int
&
i
,
const
int
&
j
,
const
int
&
k
)
{
__builtin_assume
(
i
>=
0
);
__builtin_assume
(
i
<
GGML_CUDA_MMQ_Y
);
__builtin_assume
(
j
>=
0
);
__builtin_assume
(
j
<
WARP_SIZE
);
__builtin_assume
(
k
>=
0
);
__builtin_assume
(
k
<
WARP_SIZE
);
return
vec_dot_q8_0_q8_1_impl
(
x_ql
[
i
*
(
WARP_SIZE
+
1
)
+
k
],
y_qs
[
j
*
WARP_SIZE
+
k
],
x_dm
[
i
*
(
WARP_SIZE
/
QI8_0
)
+
i
/
QI8_0
+
k
/
QI8_0
].
x
,
y_ds
[
j
*
(
WARP_SIZE
/
QI8_1
)
+
k
/
QI8_1
]);
}
#define VDR_q2_K_q8_1 1

// Dot product of one q2_K value-int against QR2_K q8_1 ints.
// v: packed 2-bit quants; u: q8_1 quants; scales: q2_K scale/min bytes (low nibble =
// scale, high nibble = min); dm: q2_K (d, dmin) pair; d8: per-q8_1-block scales.
// Fix vs. original: removed a corrupt stray `return d * sumf;` that referenced
// undefined identifiers and made the real return unreachable.
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl(
    const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
    const half2 & dm, const float * __restrict__ d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

    for (int i = 0; i < QR2_K; ++i) {
        const int sc = scales[2*i];

        const int vi = (v >> (2*i)) & 0x03030303;

        sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product

        // broadcast the high nibble (the min) into all 4 bytes of an int
        int sc_high = sc >> 4;
        sc_high |= sc_high <<  8;
        sc_high |= sc_high << 16;

        sumf_m += d8[i] * __dp4a(sc_high, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
    }

    const float2 dmf = __half22float2(dm);

    return dmf.x*sumf_d - dmf.y*sumf_m;
#else
    return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
// Per-block q2_K x q8_1 dot product for the mat-vec kernels.
// Reconstructed: the original region interleaved this function with fragments of
// vec_dot_q4_K_q8_1 (which exists intact further down); the q4_K lines are dropped here.
static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

    const block_q2_K * bq2_K = (const block_q2_K *) vbq;

    const int bq8_offset   = QR2_K * (iqs / QI8_1);
    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);

    const uint8_t * scales = bq2_K->scales + scale_offset;

    const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
    int    u[QR2_K];
    float d8[QR2_K];

    for (int i = 0; i < QR2_K; ++i) {
        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
        d8[i] = bq8_1[bq8_offset + i].ds.x;
    }

    return vec_dot_q2_K_q8_1_impl(v, u, scales, bq2_K->dm, d8);
}
// Hands out the shared-memory tiles used by the q2_K mat-mul kernel.
// Reconstructed: the original region contained orphaned scale-unpacking statements
// from vec_dot_q4_K_q8_1 (duplicated in the intact q4_K function below); they are
// removed here so the allocator is a well-formed function again.
static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

    // +GGML_CUDA_MMQ_Y padding terms — presumably to avoid shared-memory bank
    // conflicts, mirroring the other allocate_tiles_* helpers.
    __shared__ int   tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE)       + GGML_CUDA_MMQ_Y];
    __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI2_K) + GGML_CUDA_MMQ_Y/QI2_K];
    __shared__ int   tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4)     + GGML_CUDA_MMQ_Y/4];

    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
    *x_sc = tile_x_sc;   // q2_K uses no x_qh tile
}
// Loads one column-slice of q2_K blocks into the shared-memory tiles:
// quants into x_ql, (d, dmin) pairs into x_dm, packed scales into x_sc.
static __device__ __forceinline__ void load_tiles_q2_K(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {

    __builtin_assume(i_offset >= 0);
    __builtin_assume(i_offset <  8);
    __builtin_assume(k >= 0);
    __builtin_assume(k <  WARP_SIZE);

    const int kbx  = k / QI2_K;
    const int kqsx = k % QI2_K;

    const block_q2_K * bx0 = (block_q2_K *) vx;

    // quants: 8 rows per iteration, one int of 2-bit quants per thread
#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
        const int i = i0 + i_offset;

        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;

        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
    const int kbxd = k % blocks_per_tile_x_row;

    // scale/min pairs, one half2 per block
#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI2_K) {
        const int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;

        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;

        x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
    }

    // packed 4+4 bit scales
#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
        const int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);

        const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);

        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
    }
}
// Computes one output element of a q2_K x q8_1 tiled mat-mul from shared-memory tiles.
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh,
    const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds,
    const int & i, const int & j, const int & k) {

    __builtin_assume(i >= 0);
    __builtin_assume(i <  GGML_CUDA_MMQ_Y);
    __builtin_assume(j >= 0);
    __builtin_assume(j <  WARP_SIZE);
    __builtin_assume(k >= 0);
    __builtin_assume(k <  WARP_SIZE);

    const int kbx  = k / QI2_K;
    const int kqsx = k % QI2_K;

    const int bq8_offset   = QR2_K * (kqsx / QI8_1);
    const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2);

    // view the int tile as bytes to address individual scale entries
    const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4) + i/4)) + kbx*16 + scale_offset;

    int    u[QR2_K];
    float d8[QR2_K];

    for (int l = 0; l < QR2_K; ++l) {
        const int y_qs_index = j * (QR2_K*WARP_SIZE) + kbx * (QR2_K*QI2_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1;
        u[l]  = y_qs[y_qs_index];
        d8[l] = y_ds[y_qs_index / QI8_1].x;
    }

    return vec_dot_q2_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], u, scales,
                                  x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], d8);
}
#define VDR_q3_K_q8_1 1

// Dot product of one q3_K value-int against QR3_K q8_1 ints.
// vl: low 2 bits of the quants; vh: pre-inverted high-bit mask; scales holds the
// packed 6-bit scales (low 4 bits + high 2 bits stored separately), offset by
// scale_offset; d is the block scale, d8 the per-q8_1-block scales.
static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl(
    const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
    const int & scale_offset, const float & d, const float * __restrict__ d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf = 0.0f;

    for (int i = 0; i < QR3_K; ++i) {
        const int isc = scale_offset + 2*i;

        // low 4 bits of the 6-bit scale
        const int isc_low      = isc % (QK_K/32);
        const int sc_shift_low = 4 * (isc / (QK_K/32));
        const int sc_low       = (scales[isc_low] >> sc_shift_low) & 0xF;

        // high 2 bits of the 6-bit scale
        const int isc_high      = isc % (QK_K/64);
        const int sc_shift_high = 2 * (isc / (QK_K/64));
        const int sc_high       = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;

        const int sc = (sc_low | sc_high) - 32;

        const int vil = (vl >> (2*i)) & 0x03030303;

        const int vih = ((vh >> i) << 2) & 0x04040404;

        const int vi = __vsubss4(vil, vih); // subtract 4 where the (inverted) high bit is set

        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
    }

    return d*sumf;
#else
    return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
// Per-block q3_K x q8_1 dot product for the mat-vec kernels.
static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

    const block_q3_K * bq3_K = (const block_q3_K *) vbq;

    const int bq8_offset   = QR3_K * (iqs / (QI3_K/2));
    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);

    const float d = bq3_K->d;

    const int vl = get_int_from_uint8(bq3_K->qs, iqs);

    // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
    const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;

    int    u[QR3_K];
    float d8[QR3_K];

    for (int i = 0; i < QR3_K; ++i) {
        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
        d8[i] = bq8_1[bq8_offset + i].ds.x;
    }

    return vec_dot_q3_K_q8_1_impl(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
}
// Hands out the shared-memory tiles used by the q3_K mat-mul kernel
// (quants, block scales, high-bit masks, and packed scales).
static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

    __shared__ int   tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE)       + GGML_CUDA_MMQ_Y];
    __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI3_K) + GGML_CUDA_MMQ_Y/QI3_K];
    __shared__ int   tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2)     + GGML_CUDA_MMQ_Y/2];
    __shared__ int   tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4)     + GGML_CUDA_MMQ_Y/4];

    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
    *x_qh = tile_x_qh;
    *x_sc = tile_x_sc;
}
// Loads one column-slice of q3_K blocks into the shared-memory tiles:
// low-bit quants into x_ql, block scale into x_dm.x, high-bit masks into x_qh,
// packed scales into x_sc.
static __device__ __forceinline__ void load_tiles_q3_K(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {

    __builtin_assume(i_offset >= 0);
    __builtin_assume(i_offset <  8);
    __builtin_assume(k >= 0);
    __builtin_assume(k <  WARP_SIZE);

    const int kbx  = k / QI3_K;
    const int kqsx = k % QI3_K;

    const block_q3_K * bx0 = (block_q3_K *) vx;

#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
        const int i = i0 + i_offset;

        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;

        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
    const int kbxd = k % blocks_per_tile_x_row;

    // only the scale (x component) is used for q3_K
#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI3_K) {
        const int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;

        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;

        x_dm[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd].x = bxi->d;
    }

#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) {
        const int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);

        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);

        x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
    }

#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
        const int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);

        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);

        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8(bxi->scales, k % (QI3_K/4));
    }
}
// Computes one output element of a q3_K x q8_1 tiled mat-mul from shared-memory tiles.
static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh,
    const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds,
    const int & i, const int & j, const int & k) {

    __builtin_assume(i >= 0);
    __builtin_assume(i <  GGML_CUDA_MMQ_Y);
    __builtin_assume(j >= 0);
    __builtin_assume(j <  WARP_SIZE);
    __builtin_assume(k >= 0);
    __builtin_assume(k <  WARP_SIZE);

    const int kbx  = k / QI3_K;
    const int kqsx = k % QI3_K;

    const int bq8_offset   = QR3_K * (kqsx / (QI3_K/2));
    const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2);

    const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4) + i/4)) + kbx*16;

    // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
    const int vh = ~x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + kqsx % (QI3_K/2)] >> bq8_offset;

    int    u[QR3_K];
    float d8[QR3_K];

    for (int l = 0; l < QR3_K; ++l) {
        const int y_qs_index = j * (QR3_K*WARP_SIZE) + kbx * (QR3_K*QI3_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1;
        u[l]  = y_qs[y_qs_index];
        d8[l] = y_ds[y_qs_index / QI8_1].x;
    }

    return vec_dot_q3_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales, scale_offset,
                                  x_dm[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx].x, d8);
}
#define VDR_q4_K_q8_1 2

// Dot product of two q4_K value-ints against 2*QR4_K q8_1 ints.
// sc/m: unpacked 6-bit scales and mins; dm4: q4_K (d, dmin) pair; d8: per-q8_1 scales.
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl(
    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
    const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

    for (int i = 0; i < QR4_K; ++i) {
        const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
        const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;

        const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
        const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u

        sumf_d += d8[i] * (dot1 * sc[i]);
        sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
    }

    return __half2float(dm4.x)*sumf_d - __half2float(dm4.y)*sumf_m;
#else
    return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
// Per-block q4_K x q8_1 dot product for the mat-vec kernels.
// Two compile-time variants: the regular QK_K == 256 layout and the GGML_QKK_64 layout.
static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

#ifndef GGML_QKK_64
    const block_q4_K * bq4_K = (const block_q4_K *) vbq;

    int    v[2];
    int    u[2*QR4_K];
    float d8[QR4_K];

    // iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6
    const int bq8_offset = QR4_K * (iqs / (QI8_1/2));

    // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
    // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
    // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
    // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * (iqs%4));
    v[0] = q4[0];
    v[1] = q4[4];

    // unpack the two 6-bit scale/min pairs relevant to this position
    const uint16_t * scales = (const uint16_t *) bq4_K->scales;
    uint16_t aux[2];
    const int j = bq8_offset/2;
    if (j < 2) {
        aux[0] = scales[j+0] & 0x3f3f;
        aux[1] = scales[j+2] & 0x3f3f;
    } else {
        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
    }
    const uint8_t * sc = (const uint8_t *) aux;
    const uint8_t * m  = sc + 2;

    for (int i = 0; i < QR4_K; ++i) {
        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
        d8[i] = bq8i->ds.x;

        const int * q8 = (const int *) bq8i->qs + (iqs%4);
        u[2*i+0] = q8[0];
        u[2*i+1] = q8[4];
    }

    return vec_dot_q4_K_q8_1_impl(v, u, sc, m, bq4_K->dm, d8);

#else

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    const block_q4_K * bq4_K = (const block_q4_K *) vbq;

    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

    uint16_t aux16[2];
    const uint8_t * s = (const uint8_t *) aux16;

    const uint16_t * a = (const uint16_t *) bq4_K->scales;
    aux16[0] = a[0] & 0x0f0f;
    aux16[1] = (a[0] >> 4) & 0x0f0f;

    const float dall = bq4_K->d[0];
    const float dmin = bq4_K->d[1];

    const float d8_1 = bq8_1[0].ds.x;
    const float d8_2 = bq8_1[1].ds.x;

    const int ui1 = *((const int *) bq8_1[0].qs + iqs);
    const int ui2 = *((const int *) bq8_1[0].qs + iqs + 4);
    const int ui3 = *((const int *) bq8_1[1].qs + iqs);
    const int ui4 = *((const int *) bq8_1[1].qs + iqs + 4);

    const int * q4 = (const int *) bq4_K->qs + iqs;
    const int v1 = q4[0];
    const int v2 = q4[4];

    const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
    const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
    const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
    const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));

    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);

    return dall * sumf_d - dmin * sumf_m;
#else
    return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A

#endif
}
// Hands out the shared-memory tiles used by the q4_K mat-mul kernel.
static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

    __shared__ int   tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE)       + GGML_CUDA_MMQ_Y];
    __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_K) + GGML_CUDA_MMQ_Y/QI4_K];
    __shared__ int   tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8)     + GGML_CUDA_MMQ_Y/8];

    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
    *x_sc = tile_x_sc;   // q4_K uses no x_qh tile
}
// Loads one column-slice of q4_K blocks into the shared-memory tiles:
// quants into x_ql, (d, dmin) pairs into x_dm, packed scales into x_sc.
static __device__ __forceinline__ void load_tiles_q4_K(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {

    __builtin_assume(i_offset >= 0);
    __builtin_assume(i_offset <  8);
    __builtin_assume(k >= 0);
    __builtin_assume(k <  WARP_SIZE);

    const int kbx  = k / QI4_K; // == 0 if QK_K == 256
    const int kqsx = k % QI4_K; // == k if QK_K == 256

    const block_q4_K * bx0 = (block_q4_K *) vx;

#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
        const int i = i0 + i_offset;

        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;

        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256

#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_K) {
        const int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;

        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;

        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
    }

#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
        const int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;

        const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);

        x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_uint8_aligned(bxi->scales, k % (QI4_K/8));
    }
}
// Computes one output element of a q4_K x q8_1 tiled mat-mul from shared-memory tiles.
// Fix vs. original: kbx/kqsx used QI6_K (copy-paste from the q6_K kernel). QI4_K and
// QI6_K are numerically equal (QK_K/(4*2) in both cases) so behavior is unchanged,
// but the correct constant is used for clarity and robustness against future changes.
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh,
    const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds,
    const int & i, const int & j, const int & k) {

    __builtin_assume(i >= 0);
    __builtin_assume(i <  GGML_CUDA_MMQ_Y);
    __builtin_assume(j >= 0);
    __builtin_assume(j <  WARP_SIZE);
    __builtin_assume(k >= 0);
    __builtin_assume(k <  WARP_SIZE);

    const int kbx  = k / QI4_K; // == 0 if QK_K == 256
    const int kqsx = k % QI4_K; // == k if QK_K == 256

    int    v[2];
    int    u[2*QR4_K];
    float d8[QR4_K];

    // kqsx is in 0...15. bq8_offset = 2 * (kqsx/4) -> bq8_offset = 0, 2, 4, 6
    const int bq8_offset = QR4_K * (kqsx / (QI8_1/2));

    v[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + kqsx % 4 + 0];
    v[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + kqsx % 4 + 4];

    // unpack the two 6-bit scale/min pairs for this position
    const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4];
    uint16_t aux[2];
    const int l = bq8_offset/2;
    if (l < 2) {
        aux[0] = scales[l+0] & 0x3f3f;
        aux[1] = scales[l+2] & 0x3f3f;
    } else {
        aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2);
        aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2);
    }
    const uint8_t * sc = (const uint8_t *) aux;
    const uint8_t * m  = sc + 2;

    for (int l = 0; l < QR4_K; ++l) {
        const int kqsy = j * (QR4_K*WARP_SIZE) + kbx * (QR4_K*QI4_K) + (bq8_offset + l) * QI8_1 + kqsx % (QI8_1/2);
        u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
        u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
        d8[l]    = y_ds[kqsy / QI8_1].x;
    }

    return vec_dot_q4_K_q8_1_impl(v, u, sc, m, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K + kbx], d8);
}
#define VDR_q5_K_q8_1 2

// Dot product of two q5_K value-ints against 2*QR5_K q8_1 ints.
// vl: low 4 bits; vh: high (5th) bits pre-shifted by the caller; sc/m: unpacked
// scales and mins; dm5: q5_K (d, dmin) pair; d8: per-q8_1-block scales.
static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
    const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u,
    const uint8_t * __restrict__ sc, const uint8_t * __restrict__ m, const half2 & dm5,
    const float * __restrict__ d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

    for (int i = 0; i < QR5_K; ++i) {
        const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
        const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;

        const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
        const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;

        const int v0i = vl0i | vh0i;
        const int v1i = vl1i | vh1i;

        const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
        const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u

        sumf_d += d8[i] * (dot1 * sc[i]);
        sumf_m += d8[i] * (dot2 * m[i]);
    }

    return __half2float(dm5.x)*sumf_d - __half2float(dm5.y)*sumf_m;
#else
    return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
// Per-block q5_K x q8_1 dot product for the mat-vec kernels.
// Two compile-time variants: the regular QK_K == 256 layout and the GGML_QKK_64 layout.
// Fix vs. original: a corrupt duplicated loop header `for (int i = 0; i < QR4_K; ++i) {`
// immediately preceded the real QR5_K loop, leaving an unclosed brace; it is removed.
static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

#ifndef GGML_QKK_64
    const block_q5_K * bq5_K = (const block_q5_K *) vbq;

    int   vl[2];
    int   vh[2];
    int    u[2*QR5_K];
    float d8[QR5_K];

    const int bq8_offset = QR5_K * (iqs / (QI8_1/2));
    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * (iqs%4));
    const int * qh = (const int *)(bq5_K->qh + 4 * (iqs%4));

    vl[0] = ql[0];
    vl[1] = ql[4];

    vh[0] = qh[0] >> bq8_offset;
    vh[1] = qh[4] >> bq8_offset;

    // unpack the two 6-bit scale/min pairs relevant to this position
    const uint16_t * scales = (const uint16_t *) bq5_K->scales;
    uint16_t aux[2];
    const int j = bq8_offset/2;
    if (j < 2) {
        aux[0] = scales[j+0] & 0x3f3f;
        aux[1] = scales[j+2] & 0x3f3f;
    } else {
        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
    }
    const uint8_t * sc = (const uint8_t *) aux;
    const uint8_t * m  = sc + 2;

    for (int i = 0; i < QR5_K; ++i) {
        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
        d8[i] = bq8i->ds.x;

        const int * q8 = (const int *) bq8i->qs + (iqs%4);
        u[2*i+0] = q8[0];
        u[2*i+1] = q8[4];
    }

    return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, bq5_K->dm, d8);

#else

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    const block_q5_K * bq5_K = (const block_q5_K *) vbq;

    const int8_t * s = bq5_K->scales;

    const float d = bq5_K->d;

    const float d8_1 = bq8_1[0].ds.x;
    const float d8_2 = bq8_1[1].ds.x;

    const int ui1 = *((const int *) bq8_1[0].qs + iqs);
    const int ui2 = *((const int *) bq8_1[0].qs + iqs + 4);
    const int ui3 = *((const int *) bq8_1[1].qs + iqs);
    const int ui4 = *((const int *) bq8_1[1].qs + iqs + 4);

    const int * ql = (const int *) bq5_K->qs + iqs;
    const int vl1 = ql[0];
    const int vl2 = ql[4];

    const int step = 4 * iqs; // 0, 4, 8, 12
    const int im   = step/8;  // = 0 for iqs = 0, 1, = 1 for iqs = 2, 3
    const int in   = step%8;  // 0, 4, 0, 4

    const int vh = (*((const int *)(bq5_K->qh + in))) >> im;

    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);

    const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
                       + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);

    return d * sumf_d;
#else
    return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A

#endif
}
// Hands out the shared-memory tiles used by the q5_K mat-mul kernel
// (low quants, block scales, high bits, packed scales).
static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

    __shared__ int   tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE)       + GGML_CUDA_MMQ_Y];
    __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_K) + GGML_CUDA_MMQ_Y/QI5_K];
    __shared__ int   tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/4)     + GGML_CUDA_MMQ_Y/4];
    __shared__ int   tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8)     + GGML_CUDA_MMQ_Y/8];

    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
    *x_qh = tile_x_qh;
    *x_sc = tile_x_sc;
}
// Loads one column-slice of q5_K blocks into the shared-memory tiles:
// low quants into x_ql, (d, dmin) pairs into x_dm, high bits into x_qh,
// packed scales into x_sc.
static __device__ __forceinline__ void load_tiles_q5_K(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {

    __builtin_assume(i_offset >= 0);
    __builtin_assume(i_offset <  8);
    __builtin_assume(k >= 0);
    __builtin_assume(k <  WARP_SIZE);

    const int kbx  = k / QI5_K; // == 0 if QK_K == 256
    const int kqsx = k % QI5_K; // == k if QK_K == 256

    const block_q5_K * bx0 = (block_q5_K *) vx;

#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
        const int i = i0 + i_offset;

        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;

        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256

#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_K) {
        const int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;

        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;

        x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
    }

#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
        const int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);

        const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI5_K/4);

        x_qh[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8(bxi->qh, k % (QI5_K/4));
    }

#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
        const int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;

        const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);

        x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_uint8_aligned(bxi->scales, k % (QI5_K/8));
    }
}
// Computes one output element of a q5_K x q8_1 tiled mat-mul from shared-memory tiles.
// Fixes vs. original: kbx/kqsx used QI6_K and the u/d8 arrays were sized with QR4_K
// (copy-paste from other kernels). The constants are numerically equal (QI5_K==QI6_K,
// QR4_K==QR5_K), so behavior is unchanged, but the q5_K constants are now used.
static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh,
    const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds,
    const int & i, const int & j, const int & k) {

    __builtin_assume(i >= 0);
    __builtin_assume(i <  GGML_CUDA_MMQ_Y);
    __builtin_assume(j >= 0);
    __builtin_assume(j <  WARP_SIZE);
    __builtin_assume(k >= 0);
    __builtin_assume(k <  WARP_SIZE);

    const int kbx  = k / QI5_K; // == 0 if QK_K == 256
    const int kqsx = k % QI5_K; // == k if QK_K == 256

    int   vl[2];
    int   vh[2];
    int    u[2*QR5_K];
    float d8[QR5_K];

    const int bq8_offset = QR5_K * (kqsx / (QI8_1/2));

    vl[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + kqsx % 4 + 0];
    vl[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + kqsx % 4 + 4];

    vh[0] = x_qh[i * (WARP_SIZE/4) + i/4 + kqsx % 4 + 0] >> bq8_offset;
    vh[1] = x_qh[i * (WARP_SIZE/4) + i/4 + kqsx % 4 + 4] >> bq8_offset;

    // unpack the two 6-bit scale/min pairs for this position
    const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4];
    uint16_t aux[2];
    const int l = bq8_offset/2;
    if (l < 2) {
        aux[0] = scales[l+0] & 0x3f3f;
        aux[1] = scales[l+2] & 0x3f3f;
    } else {
        aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2);
        aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2);
    }
    const uint8_t * sc = (const uint8_t *) aux;
    const uint8_t * m  = sc + 2;

    for (int l = 0; l < QR5_K; ++l) {
        const int kqsy = j * (QR5_K*WARP_SIZE) + kbx * (QR5_K*QI5_K) + (bq8_offset + l) * QI8_1 + kqsx % (QI8_1/2);
        u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
        u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
        d8[l]    = y_ds[kqsy / QI8_1].x;
    }

    return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K + kbx], d8);
}
#define VDR_q6_K_q8_1 1

// Dot product of one q6_K value-int against QR6_K q8_1 ints.
// vl: low 4 bits; vh: high 2 bits pre-shifted by the caller; scales: signed 8-bit
// scales (stride 4); d: block scale; d8: per-q8_1-block scales.
// Reconstructed: the original region interleaved this function with orphaned
// statements from older q4_K/q5_K code paths (referencing identifiers such as
// bq8i, d8i, vi1, sumf_d that were undefined here); those fragments — duplicated in
// the intact functions above — are removed.
static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl(
    const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
    const float & d, const float * __restrict__ d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf = 0.0f;

    for (int i = 0; i < QR6_K; ++i) {
        const int sc = scales[4*i];

        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;

        const int vih = ((vh >> (4*i)) << 4) & 0x30303030;

        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32

        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
    }

    return d*sumf;
#else
    return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
// Per-block q6_K x q8_1 dot product for the mat-vec kernels.
// Reconstructed: the original region interleaved this function with fragments of the
// GGML_QKK_64 q4_K/q5_K paths (ui1..ui4, dot1..dot4, dall/dmin, etc.), which are
// duplicated in the intact functions above and are dropped here.
static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

    const block_q6_K * bq6_K = (const block_q6_K *) vbq;

    const int bq8_offset   = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
    const int vh_shift     = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));

    const int vl = get_int_from_uint8(bq6_K->ql, iqs);
    const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;

    const int8_t * scales = bq6_K->scales + scale_offset;

    int    u[QR6_K];
    float d8[QR6_K];

    for (int i = 0; i < QR6_K; ++i) {
        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
        d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
    }

    return vec_dot_q6_K_q8_1_impl(vl, vh, u, scales, bq6_K->d, d8);
}
static
__device__
__forceinline__
float
vec_dot_q5_K_q8_1
(
const
void
*
__restrict__
vbq
,
const
block_q8_1
*
__restrict__
bq8_1
,
const
int
iqs
)
{
static
__device__
__forceinline__
void
allocate_tiles_q6_K
(
int
**
x_ql
,
half2
**
x_dm
,
int
**
x_qh
,
int
**
x_sc
)
{
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
const
block_q5_K
*
bq5_K
=
(
const
block_q5_K
*
)
vbq
;
__shared__
int
tile_x_ql
[
GGML_CUDA_MMQ_Y
*
(
WARP_SIZE
)
+
GGML_CUDA_MMQ_Y
];
__shared__
half2
tile_x_dm
[
GGML_CUDA_MMQ_Y
*
(
WARP_SIZE
/
QI6_K
)
+
GGML_CUDA_MMQ_Y
/
QI6_K
];
__shared__
int
tile_x_qh
[
GGML_CUDA_MMQ_Y
*
(
WARP_SIZE
/
2
)
+
GGML_CUDA_MMQ_Y
/
2
];
__shared__
int
tile_x_sc
[
GGML_CUDA_MMQ_Y
*
(
WARP_SIZE
/
8
)
+
GGML_CUDA_MMQ_Y
/
8
];
#ifndef GGML_QKK_64
*
x_ql
=
tile_x_ql
;
*
x_dm
=
tile_x_dm
;
*
x_qh
=
tile_x_qh
;
*
x_sc
=
tile_x_sc
;
}
const
int
bq8_offset
=
QR5_K
*
(
iqs
/
(
QI8_1
/
2
));
const
int
*
ql
=
(
const
int
*
)(
bq5_K
->
qs
+
16
*
bq8_offset
+
4
*
(
iqs
%
4
));
const
int
*
qh
=
(
const
int
*
)(
bq5_K
->
qh
+
4
*
(
iqs
%
4
));
static
__device__
__forceinline__
void
load_tiles_q6_K
(
const
void
*
__restrict__
vx
,
int
*
__restrict__
x_ql
,
half2
*
__restrict__
x_dm
,
int
*
__restrict__
x_qh
,
int
*
__restrict__
x_sc
,
const
int
&
i_offset
,
const
int
&
k
,
const
int
&
blocks_per_row
)
{
float
sumf_d
=
0.0
f
;
float
sumf_m
=
0.0
f
;
__builtin_assume
(
i_offset
>=
0
);
__builtin_assume
(
i_offset
<
8
);
__builtin_assume
(
k
>=
0
);
__builtin_assume
(
k
<
WARP_SIZE
);
const
float
d
=
bq5_K
->
d
;
const
float
dmin
=
bq5_K
->
dmin
;
const
int
kbx
=
k
/
QI6_K
;
// == 0 if QK_K == 256
const
int
kqsx
=
k
%
QI6_K
;
// == k if QK_K == 256
const
int
vl1
=
ql
[
0
];
const
int
vl2
=
ql
[
4
];
const
block_q6_K
*
bx0
=
(
block_q6_K
*
)
vx
;
const
int
vh1
=
qh
[
0
]
>>
bq8_offset
;
const
int
vh2
=
qh
[
4
]
>>
bq8_offset
;
#pragma unroll
for
(
int
i0
=
0
;
i0
<
GGML_CUDA_MMQ_Y
;
i0
+=
8
)
{
const
int
i
=
i0
+
i_offset
;
const
uint16_t
*
scales
=
(
const
uint16_t
*
)
bq5_K
->
scales
;
uint16_t
aux
[
2
];
const
int
j
=
bq8_offset
/
2
;
if
(
j
<
2
)
{
aux
[
0
]
=
scales
[
j
+
0
]
&
0x3f3f
;
aux
[
1
]
=
scales
[
j
+
2
]
&
0x3f3f
;
}
else
{
aux
[
0
]
=
((
scales
[
j
+
2
]
>>
0
)
&
0x0f0f
)
|
((
scales
[
j
-
2
]
&
0xc0c0
)
>>
2
);
aux
[
1
]
=
((
scales
[
j
+
2
]
>>
4
)
&
0x0f0f
)
|
((
scales
[
j
-
0
]
&
0xc0c0
)
>>
2
);
const
block_q6_K
*
bxi
=
bx0
+
i
*
blocks_per_row
+
kbx
;
x_ql
[
i
*
(
WARP_SIZE
+
1
)
+
k
]
=
get_int_from_uint8
(
bxi
->
ql
,
kqsx
);
}
const
uint8_t
*
sc
=
(
const
uint8_t
*
)
aux
;
const
uint8_t
*
m
=
sc
+
2
;
for
(
int
i
=
0
;
i
<
QR5_K
;
++
i
)
{
const
int
blocks_per_tile_x_row
=
WARP_SIZE
/
QI6_K
;
// == 1 if QK_K == 256
const
int
kbxd
=
k
%
blocks_per_tile_x_row
;
// == 0 if QK_K == 256
const
block_q8_1
*
bq8i
=
bq8_1
+
bq8_offset
+
i
;
const
float
d8i
=
bq8i
->
d
;
const
int
*
q8
=
(
const
int
*
)
bq8i
->
qs
+
(
iqs
%
4
)
;
const
int
ui1
=
q8
[
0
];
const
int
ui2
=
q8
[
4
]
;
#pragma unroll
for
(
int
i0
=
0
;
i0
<
GGML_CUDA_MMQ_Y
;
i0
+=
8
*
QI6_K
)
{
const
int
i
=
(
i0
+
i_offset
*
QI6_K
+
k
/
blocks_per_tile_x_row
)
%
GGML_CUDA_MMQ_Y
;
const
block_q6_K
*
bxi
=
bx0
+
i
*
blocks_per_row
+
kbxd
;
const
int
vil1
=
(
vl1
>>
(
4
*
i
))
&
0x0F0F0F0F
;
const
int
vil2
=
(
vl2
>>
(
4
*
i
))
&
0x0F0F0F0F
;
x_dm
[
i
*
(
WARP_SIZE
/
QI6_K
)
+
i
/
QI6_K
+
kbxd
].
x
=
bxi
->
d
;
}
#pragma unroll
for
(
int
i0
=
0
;
i0
<
GGML_CUDA_MMQ_Y
;
i0
+=
8
*
2
)
{
const
int
i
=
i0
+
i_offset
*
2
+
k
/
(
WARP_SIZE
/
2
);
const
int
vih1
=
((
vh1
>>
i
)
<<
4
)
&
0x10101010
;
const
int
vih2
=
((
vh2
>>
i
)
<<
4
)
&
0x10101010
;
const
block_q6_K
*
bxi
=
bx0
+
i
*
blocks_per_row
+
(
k
%
(
WARP_SIZE
/
2
))
/
(
QI6_K
/
2
);
const
int
vi1
=
vil1
|
vih1
;
const
int
vi2
=
vil2
|
vih2
;
x_qh
[
i
*
(
WARP_SIZE
/
2
)
+
i
/
2
+
k
%
(
WARP_SIZE
/
2
)]
=
get_int_from_uint8
(
bxi
->
qh
,
k
%
(
QI6_K
/
2
))
;
}
const
int
dot1
=
__dp4a
(
vi2
,
ui2
,
__dp4a
(
vi1
,
ui1
,
0
));
// SIMD dot product
const
int
dot2
=
__dp4a
(
0x01010101
,
ui2
,
__dp4a
(
0x01010101
,
ui1
,
0
));
#pragma unroll
for
(
int
i0
=
0
;
i0
<
GGML_CUDA_MMQ_Y
;
i0
+=
8
*
8
)
{
const
int
i
=
(
i0
+
i_offset
*
8
+
k
/
(
WARP_SIZE
/
8
))
%
GGML_CUDA_MMQ_Y
;
sumf_d
+=
d8i
*
(
dot1
*
sc
[
i
]);
sumf_m
+=
d8i
*
(
dot2
*
m
[
i
]);
const
block_q6_K
*
bxi
=
bx0
+
i
*
blocks_per_row
+
(
k
%
(
WARP_SIZE
/
8
))
/
4
;
x_sc
[
i
*
(
WARP_SIZE
/
8
)
+
i
/
8
+
k
%
(
WARP_SIZE
/
8
)]
=
get_int_from_int8
(
bxi
->
scales
,
k
%
(
QI6_K
/
8
));
}
}
return
d
*
sumf_d
-
dmin
*
sumf_m
;
static
__device__
__forceinline__
float
vec_dot_q6_K_q8_1_mul_mat
(
const
int
*
__restrict__
x_ql
,
const
half2
*
__restrict__
x_dm
,
const
int
*
__restrict__
x_qh
,
const
int
*
__restrict__
x_sc
,
const
int
*
__restrict__
y_qs
,
const
half2
*
__restrict__
y_ds
,
const
int
&
i
,
const
int
&
j
,
const
int
&
k
)
{
#else
__builtin_assume
(
i
>=
0
);
__builtin_assume
(
i
<
GGML_CUDA_MMQ_Y
);
__builtin_assume
(
j
>=
0
);
__builtin_assume
(
j
<
WARP_SIZE
);
__builtin_assume
(
k
>=
0
);
__builtin_assume
(
k
<
WARP_SIZE
);
const
int8_t
*
s
=
bq5_K
->
scales
;
const
int
kbx
=
k
/
QI6_K
;
// == 0 if QK_K == 256
const
int
kqsx
=
k
%
QI6_K
;
// == k if QK_K == 256
const
float
d
=
bq5_K
->
d
;
const
int
bq8_offset
=
2
*
QR6_K
*
(
kqsx
/
(
QI6_K
/
2
))
+
(
kqsx
%
(
QI6_K
/
2
))
/
(
QI6_K
/
4
);
const
int
scale_offset
=
(
QI6_K
/
4
)
*
(
kqsx
/
(
QI6_K
/
2
))
+
(
kqsx
%
(
QI6_K
/
2
))
/
(
QI6_K
/
8
);
const
int
vh_shift
=
2
*
((
kqsx
%
(
QI6_K
/
2
))
/
(
QI6_K
/
4
));
const
float
d8_1
=
bq8_1
[
0
].
d
;
const
float
d8_2
=
bq8_1
[
1
].
d
;
const
int
vh
=
x_qh
[
i
*
(
WARP_SIZE
/
2
)
+
i
/
2
+
kbx
*
(
QI6_K
/
2
)
+
(
QI6_K
/
4
)
*
(
kqsx
/
(
QI6_K
/
2
))
+
kqsx
%
(
QI6_K
/
4
)]
>>
vh_shift
;
const
int
ui1
=
*
((
const
int
*
)
bq8_1
[
0
].
qs
+
iqs
);
const
int
ui2
=
*
((
const
int
*
)
bq8_1
[
0
].
qs
+
iqs
+
4
);
const
int
ui3
=
*
((
const
int
*
)
bq8_1
[
1
].
qs
+
iqs
);
const
int
ui4
=
*
((
const
int
*
)
bq8_1
[
1
].
qs
+
iqs
+
4
);
const
int
x_sc_offset
=
i
*
(
WARP_SIZE
/
8
)
+
i
/
8
+
kbx
*
(
QI6_K
/
8
);
const
int8_t
*
scales
=
((
int8_t
*
)
(
x_sc
+
x_sc_offset
))
+
scale_offset
;
const
int
*
ql
=
(
const
int
*
)
bq5_K
->
qs
+
iqs
;
const
int
vl1
=
ql
[
0
];
const
int
vl2
=
ql
[
4
];
int
u
[
QR6_K
];
float
d8
[
QR6_K
];
const
int
step
=
4
*
iqs
;
// 0, 4, 8, 12
const
int
im
=
step
/
8
;
// = 0 for iqs = 0, 1, = 1 for iqs = 2, 3
const
int
in
=
step
%
8
;
// 0, 4, 0, 4
const
int
vh
=
(
*
((
const
int
*
)(
bq5_K
->
qh
+
in
)))
>>
im
;
for
(
int
l
=
0
;
l
<
QR6_K
;
++
l
)
{
const
int
kqsy
=
j
*
(
QR6_K
*
WARP_SIZE
)
+
kbx
*
(
QR6_K
*
QI6_K
)
+
(
bq8_offset
+
2
*
l
)
*
QI8_1
+
kqsx
%
QI8_1
;
u
[
l
]
=
y_qs
[
kqsy
];
d8
[
l
]
=
y_ds
[
kqsy
/
QI8_1
].
x
;
}
const
int
v1
=
(((
vh
<<
4
)
&
0x10101010
)
^
0x10101010
)
|
((
vl1
>>
0
)
&
0x0f0f0f0f
);
const
int
v2
=
(((
vh
<<
2
)
&
0x10101010
)
^
0x10101010
)
|
((
vl2
>>
0
)
&
0x0f0f0f0f
);
const
int
v3
=
(((
vh
>>
0
)
&
0x10101010
)
^
0x10101010
)
|
((
vl1
>>
4
)
&
0x0f0f0f0f
);
const
int
v4
=
(((
vh
>>
2
)
&
0x10101010
)
^
0x10101010
)
|
((
vl2
>>
4
)
&
0x0f0f0f0f
);
return
vec_dot_q6_K_q8_1_impl
(
x_ql
[
i
*
(
WARP_SIZE
+
1
)
+
k
],
vh
,
u
,
scales
,
x_dm
[
i
*
(
WARP_SIZE
/
QI6_K
)
+
i
/
QI6_K
+
kbx
].
x
,
d8
);
}
const
float
sumf_d
=
d8_1
*
(
__dp4a
(
ui1
,
v1
,
0
)
*
s
[
0
]
+
__dp4a
(
ui2
,
v2
,
0
)
*
s
[
1
])
+
d8_2
*
(
__dp4a
(
ui3
,
v3
,
0
)
*
s
[
2
]
+
__dp4a
(
ui4
,
v4
,
0
)
*
s
[
3
]);
template
<
int
qk
,
int
qr
,
int
qi
,
typename
block_q_t
,
allocate_tiles_cuda_t
allocate_tiles
,
load_tiles_cuda_t
load_tiles
,
int
vdr
,
vec_dot_q_mul_mat_cuda_t
vec_dot
>
static
__global__
void
mul_mat_q
(
const
void
*
__restrict__
vx
,
const
void
*
__restrict__
vy
,
float
*
__restrict__
dst
,
const
int
ncols_x
,
const
int
nrows_x
,
const
int
ncols_y
,
const
int
nrows_y
,
const
int
nrows_dst
)
{
return
d
*
sumf_d
;
const
block_q_t
*
x
=
(
const
block_q_t
*
)
vx
;
const
block_q8_1
*
y
=
(
const
block_q8_1
*
)
vy
;
#endif
const
int
blocks_per_row_x
=
ncols_x
/
qk
;
const
int
blocks_per_col_y
=
nrows_y
/
QK8_1
;
const
int
blocks_per_warp
=
WARP_SIZE
/
qi
;
#else
return
0.0
f
;
// only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
const
int
&
ncols_dst
=
ncols_y
;
static
__device__
__forceinline__
float
vec_dot_q6_K_q8_1
(
const
void
*
__restrict__
vbq
,
const
block_q8_1
*
__restrict__
bq8_1
,
const
int
iqs
)
{
const
int
tid_x
=
threadIdx
.
x
;
const
int
tid_y
=
threadIdx
.
y
;
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
const
block_q6_K
*
bq6_K
=
(
const
block_q6_K
*
)
vbq
;
const
int
row_dst_0
=
blockIdx
.
x
*
GGML_CUDA_MMQ_Y
;
const
int
&
row_x_0
=
row_dst_0
;
const
int
row_dst
=
row_dst_0
+
tid_x
;
const
int
bq8_offset
=
2
*
QR6_K
*
(
iqs
/
(
QI6_K
/
2
))
+
(
iqs
%
(
QI6_K
/
2
))
/
(
QI6_K
/
4
);
const
int
scale_offset
=
(
QI6_K
/
4
)
*
(
iqs
/
(
QI6_K
/
2
))
+
(
iqs
%
(
QI6_K
/
2
))
/
(
QI6_K
/
8
);
const
int
vh_shift
=
2
*
((
iqs
%
(
QI6_K
/
2
))
/
(
QI6_K
/
4
));
const
int
col_dst_0
=
blockIdx
.
y
*
WARP_SIZE
;
const
int
&
col_y_0
=
col_dst_0
;
float
sumf
=
0.0
f
;
int
*
tile_x_ql
=
nullptr
;
half2
*
tile_x_dm
=
nullptr
;
int
*
tile_x_qh
=
nullptr
;
int
*
tile_x_sc
=
nullptr
;
const
f
loat
d
=
bq6_K
->
d
;
al
lo
c
at
e_tiles
(
&
tile_x_ql
,
&
tile_x_dm
,
&
tile_x_qh
,
&
tile_x_sc
)
;
int
vl
;
memcpy
(
&
vl
,
&
bq6_K
->
ql
[
sizeof
(
int
)
*
iqs
],
sizeof
(
int
));
const
int
blocks_per_tile_y_col
=
qr
*
WARP_SIZE
/
QI8_1
;
int
vh
;
memcpy
(
&
vh
,
&
bq6_K
->
qh
[
sizeof
(
int
)
*
((
QI6_K
/
4
)
*
(
iqs
/
(
QI6_K
/
2
))
+
iqs
%
(
QI6_K
/
4
))],
sizeof
(
int
))
;
__shared__
int
tile_y_qs
[(
WARP_SIZE
)
*
(
qr
*
WARP_SIZE
)]
;
__shared__
half2
tile_y_ds
[(
WARP_SIZE
)
*
blocks_per_tile_y_col
]
;
for
(
int
i
=
0
;
i
<
QR6_K
;
++
i
)
{
const
int
sc
=
bq6_K
->
scales
[
scale_offset
+
4
*
i
];
float
sum
[
GGML_CUDA_MMQ_Y
/
WARP_SIZE
][
4
]
=
{
0.0
f
};
const
block_q8_1
*
bq8i
=
bq8_1
+
bq8_offset
+
2
*
i
;
const
int
ui
=
*
((
int
*
)
&
bq8i
->
qs
[
sizeof
(
int
)
*
(
iqs
%
(
QI8_1
))]);
const
float
d8i
=
bq8i
->
d
;
for
(
int
ib0
=
0
;
ib0
<
blocks_per_row_x
;
ib0
+=
blocks_per_warp
)
{
const
int
vil
=
(
vl
>>
(
4
*
i
))
&
0x0F0F0F0F
;
load_tiles
(
x
+
row_x_0
*
blocks_per_row_x
+
ib0
,
tile_x_ql
,
tile_x_dm
,
tile_x_qh
,
tile_x_sc
,
tid_y
,
tid_x
,
blocks_per_row_x
);
const
int
vih
=
((
vh
>>
(
vh_shift
+
4
*
i
))
<<
4
)
&
0x30303030
;
for
(
int
ir
=
0
;
ir
<
qr
;
++
ir
)
{
const
int
kqs
=
ir
*
WARP_SIZE
+
tid_x
;
const
int
kbxd
=
kqs
/
QI8_1
;
const
int
vi
=
__vsubss4
((
vil
|
vih
),
0x20202020
);
// vi = (vil | vih) - 32
for
(
int
i
=
0
;
i
<
WARP_SIZE
;
i
+=
8
)
{
const
int
col_y_eff
=
min
(
col_y_0
+
tid_y
+
i
,
ncols_y
-
1
);
// to prevent out-of-bounds memory accesses
const
block_q8_1
*
by0
=
&
y
[
col_y_eff
*
blocks_per_col_y
+
ib0
*
(
qk
/
QK8_1
)
+
kbxd
];
sumf
+=
d8i
*
(
__dp4a
(
vi
,
ui
,
0
)
*
sc
);
// SIMD dot product
tile_y_qs
[(
tid_y
+
i
)
*
(
qr
*
WARP_SIZE
)
+
kqs
]
=
get_int_from_int8_aligned
(
by0
->
qs
,
tid_x
%
QI8_1
);
}
}
for
(
int
ids0
=
0
;
ids0
<
WARP_SIZE
;
ids0
+=
8
*
(
WARP_SIZE
/
blocks_per_tile_y_col
))
{
const
int
ids
=
(
ids0
+
tid_y
*
(
WARP_SIZE
/
blocks_per_tile_y_col
)
+
tid_x
/
blocks_per_tile_y_col
)
%
WARP_SIZE
;
const
int
kby
=
tid_x
%
blocks_per_tile_y_col
;
const
int
col_y_eff
=
min
(
col_y_0
+
ids
,
ncols_y
-
1
);
tile_y_ds
[
ids
*
(
qr
*
WARP_SIZE
/
QI8_1
)
+
kby
]
=
y
[
col_y_eff
*
blocks_per_col_y
+
ib0
*
(
qk
/
QK8_1
)
+
kby
].
ds
;
}
__syncthreads
();
#if __CUDA_ARCH__ >= 700 // TODO: actually test this with compute capability 7.X cards
#pragma unroll
#endif // __CUDA_ARCH__ >= 700
for
(
int
k
=
0
;
k
<
WARP_SIZE
/
vdr
;
++
k
)
{
#pragma unroll
for
(
int
j
=
0
;
j
<
WARP_SIZE
;
j
+=
8
)
{
#pragma unroll
for
(
int
i
=
0
;
i
<
GGML_CUDA_MMQ_Y
;
i
+=
WARP_SIZE
)
{
sum
[
i
/
WARP_SIZE
][
j
/
8
]
+=
vec_dot
(
tile_x_ql
,
tile_x_dm
,
tile_x_qh
,
tile_x_sc
,
tile_y_qs
,
tile_y_ds
,
tid_x
+
i
,
tid_y
+
j
,
k
);
}
}
}
__syncthreads
();
}
return
d
*
sumf
;
#else
return
0.0
f
;
// only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
if
(
row_dst
>=
nrows_dst
)
{
return
;
}
for
(
int
j
=
0
;
j
<
WARP_SIZE
;
j
+=
8
)
{
const
int
col_dst
=
col_dst_0
+
j
+
tid_y
;
if
(
col_dst
>=
ncols_dst
)
{
return
;
}
for
(
int
i
=
0
;
i
<
GGML_CUDA_MMQ_Y
;
i
+=
WARP_SIZE
)
{
dst
[
col_dst
*
nrows_dst
+
row_dst
+
i
]
=
sum
[
i
/
WARP_SIZE
][
j
/
8
];
}
}
}
template
<
int
qk
,
int
qi
,
typename
block_q_t
,
vec_dot_q_cuda_t
vec_dot_q_cuda
>
template
<
int
qk
,
int
qi
,
typename
block_q_t
,
int
vdr
,
vec_dot_q_cuda_t
vec_dot_q_cuda
>
static
__global__
void
mul_mat_vec_q
(
const
void
*
__restrict__
vx
,
const
void
*
__restrict__
vy
,
float
*
__restrict__
dst
,
const
int
ncols
,
const
int
nrows
)
{
const
int
row
=
blockIdx
.
y
*
blockDim
.
y
+
threadIdx
.
y
;
...
...
@@ -1839,7 +2943,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
}
const
int
blocks_per_row
=
ncols
/
qk
;
const
int
blocks_per_warp
=
WARP_SIZE
/
qi
;
const
int
blocks_per_warp
=
vdr
*
WARP_SIZE
/
qi
;
// partial sum for each thread
float
tmp
=
0.0
f
;
...
...
@@ -1848,11 +2952,11 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
const
block_q8_1
*
y
=
(
const
block_q8_1
*
)
vy
;
for
(
int
i
=
0
;
i
<
blocks_per_row
;
i
+=
blocks_per_warp
)
{
const
int
ibx
=
row
*
blocks_per_row
+
i
+
threadIdx
.
x
/
qi
;
// x block index
const
int
ibx
=
row
*
blocks_per_row
+
i
+
threadIdx
.
x
/
(
qi
/
vdr
)
;
// x block index
const
int
iby
=
(
i
+
threadIdx
.
x
/
qi
)
*
qk
/
QK8_1
;
// y block index that aligns with ibx
const
int
iby
=
(
i
+
threadIdx
.
x
/
(
qi
/
vdr
)
)
*
qk
/
QK8_1
;
// y block index that aligns with ibx
const
int
iqs
=
threadIdx
.
x
%
qi
;
// x block quant index when casting the quants to int
const
int
iqs
=
threadIdx
.
x
%
(
qi
/
vdr
)
;
// x block quant index when casting the quants to int
tmp
+=
vec_dot_q_cuda
(
&
x
[
ibx
],
&
y
[
iby
],
iqs
);
}
...
...
@@ -1885,11 +2989,11 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
const
int
y_offset
=
qr
==
1
?
1
:
qk
/
2
;
// partial sum for each thread
#ifdef GGML_CUDA_
DMMV_
F16
#ifdef GGML_CUDA_F16
half2
tmp
=
{
0.0
f
,
0.0
f
};
// two sums for f16 to take advantage of half2 intrinsics
#else
float
tmp
=
0.0
f
;
#endif // GGML_CUDA_
DMMV_
F16
#endif // GGML_CUDA_F16
for
(
int
i
=
0
;
i
<
ncols
;
i
+=
iter_stride
)
{
const
int
col
=
i
+
vals_per_iter
*
tid
;
...
...
@@ -1909,7 +3013,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
// matrix multiplication
// for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
#ifdef GGML_CUDA_
DMMV_
F16
#ifdef GGML_CUDA_F16
tmp
+=
__hmul2
(
v
,
{
y
[
iybs
+
iqs
+
j
/
qr
+
0
],
y
[
iybs
+
iqs
+
j
/
qr
+
y_offset
]
...
...
@@ -1917,7 +3021,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
#else
tmp
+=
v
.
x
*
y
[
iybs
+
iqs
+
j
/
qr
+
0
];
tmp
+=
v
.
y
*
y
[
iybs
+
iqs
+
j
/
qr
+
y_offset
];
#endif // GGML_CUDA_
DMMV_
F16
#endif // GGML_CUDA_F16
}
}
...
...
@@ -1928,11 +3032,11 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
}
if
(
tid
==
0
)
{
#ifdef GGML_CUDA_
DMMV_
F16
#ifdef GGML_CUDA_F16
dst
[
row
]
=
tmp
.
x
+
tmp
.
y
;
#else
dst
[
row
]
=
tmp
;
#endif // GGML_CUDA_
DMMV_
F16
#endif // GGML_CUDA_F16
}
}
...
...
@@ -2072,7 +3176,8 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
}
// rope == RoPE == rotary positional embedding
static
__global__
void
rope_f32
(
const
float
*
x
,
float
*
dst
,
const
int
ncols
,
const
float
p
,
const
float
theta_scale
)
{
static
__global__
void
rope_f32
(
const
float
*
x
,
float
*
dst
,
const
int
ncols
,
const
float
p0
,
const
float
p_delta
,
const
int
p_delta_rows
,
const
float
theta_scale
)
{
const
int
col
=
2
*
(
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
);
if
(
col
>=
ncols
)
{
...
...
@@ -2082,7 +3187,7 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
const
int
row
=
blockDim
.
y
*
blockIdx
.
y
+
threadIdx
.
y
;
const
int
i
=
row
*
ncols
+
col
;
const
float
theta
=
p
*
powf
(
theta_scale
,
col
/
2
);
const
float
theta
=
(
p0
+
p_delta
*
(
row
/
p_delta_rows
))
*
powf
(
theta_scale
,
col
/
2
);
const
float
sin_theta
=
sinf
(
theta
);
const
float
cos_theta
=
cosf
(
theta
);
...
...
@@ -2229,9 +3334,11 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con
rms_norm_f32
<<<
nrows
,
block_dims
,
0
,
stream
>>>
(
x
,
dst
,
ncols
,
eps
);
}
static
void
quantize_row_q8_1_cuda
(
const
float
*
x
,
void
*
vy
,
const
int
ndata
,
const
int
k
,
cudaStream_t
stream
)
{
const
int
num_blocks
=
(
k
+
CUDA_QUANTIZE_BLOCK_SIZE
-
1
)
/
CUDA_QUANTIZE_BLOCK_SIZE
;
quantize_q8_1
<<<
num_blocks
,
CUDA_QUANTIZE_BLOCK_SIZE
,
0
,
stream
>>>
(
x
,
vy
,
ndata
,
k
);
static
void
quantize_row_q8_1_cuda
(
const
float
*
x
,
void
*
vy
,
const
int
kx
,
const
int
ky
,
const
int
kx_padded
,
cudaStream_t
stream
)
{
const
int
block_num_x
=
(
kx_padded
+
CUDA_QUANTIZE_BLOCK_SIZE
-
1
)
/
CUDA_QUANTIZE_BLOCK_SIZE
;
const
dim3
num_blocks
(
block_num_x
,
ky
,
1
);
const
dim3
block_size
(
CUDA_DEQUANTIZE_BLOCK_SIZE
,
1
,
1
);
quantize_q8_1
<<<
num_blocks
,
block_size
,
0
,
stream
>>>
(
x
,
vy
,
kx
,
kx_padded
);
}
static
void
dequantize_row_q4_0_cuda
(
const
void
*
vx
,
float
*
y
,
const
int
k
,
cudaStream_t
stream
)
{
...
...
@@ -2392,7 +3499,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
const
int
block_num_y
=
(
nrows
+
GGML_CUDA_MMV_Y
-
1
)
/
GGML_CUDA_MMV_Y
;
const
dim3
block_nums
(
1
,
block_num_y
,
1
);
const
dim3
block_dims
(
WARP_SIZE
,
GGML_CUDA_MMV_Y
,
1
);
mul_mat_vec_q
<
QK4_0
,
QI4_0
,
block_q4_0
,
vec_dot_q4_0_q8_1
>
mul_mat_vec_q
<
QK4_0
,
QI4_0
,
block_q4_0
,
VDR_q4_0_q8_1
,
vec_dot_q4_0_q8_1
>
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
vx
,
vy
,
dst
,
ncols
,
nrows
);
}
...
...
@@ -2401,7 +3508,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
const
int
block_num_y
=
(
nrows
+
GGML_CUDA_MMV_Y
-
1
)
/
GGML_CUDA_MMV_Y
;
const
dim3
block_nums
(
1
,
block_num_y
,
1
);
const
dim3
block_dims
(
WARP_SIZE
,
GGML_CUDA_MMV_Y
,
1
);
mul_mat_vec_q
<
QK4_0
,
QI4_1
,
block_q4_1
,
vec_dot_q4_1_q8_1
>
mul_mat_vec_q
<
QK4_0
,
QI4_1
,
block_q4_1
,
VDR_q4_1_q8_1
,
vec_dot_q4_1_q8_1
>
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
vx
,
vy
,
dst
,
ncols
,
nrows
);
}
...
...
@@ -2410,7 +3517,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
const
int
block_num_y
=
(
nrows
+
GGML_CUDA_MMV_Y
-
1
)
/
GGML_CUDA_MMV_Y
;
const
dim3
block_nums
(
1
,
block_num_y
,
1
);
const
dim3
block_dims
(
WARP_SIZE
,
GGML_CUDA_MMV_Y
,
1
);
mul_mat_vec_q
<
QK5_0
,
QI5_0
,
block_q5_0
,
vec_dot_q5_0_q8_1
>
mul_mat_vec_q
<
QK5_0
,
QI5_0
,
block_q5_0
,
VDR_q5_0_q8_1
,
vec_dot_q5_0_q8_1
>
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
vx
,
vy
,
dst
,
ncols
,
nrows
);
}
...
...
@@ -2419,7 +3526,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
const
int
block_num_y
=
(
nrows
+
GGML_CUDA_MMV_Y
-
1
)
/
GGML_CUDA_MMV_Y
;
const
dim3
block_nums
(
1
,
block_num_y
,
1
);
const
dim3
block_dims
(
WARP_SIZE
,
GGML_CUDA_MMV_Y
,
1
);
mul_mat_vec_q
<
QK5_1
,
QI5_1
,
block_q5_1
,
vec_dot_q5_1_q8_1
>
mul_mat_vec_q
<
QK5_1
,
QI5_1
,
block_q5_1
,
VDR_q5_1_q8_1
,
vec_dot_q5_1_q8_1
>
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
vx
,
vy
,
dst
,
ncols
,
nrows
);
}
...
...
@@ -2428,7 +3535,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
const
int
block_num_y
=
(
nrows
+
GGML_CUDA_MMV_Y
-
1
)
/
GGML_CUDA_MMV_Y
;
const
dim3
block_nums
(
1
,
block_num_y
,
1
);
const
dim3
block_dims
(
WARP_SIZE
,
GGML_CUDA_MMV_Y
,
1
);
mul_mat_vec_q
<
QK8_0
,
QI8_0
,
block_q8_0
,
vec_dot_q8_0_q8_1
>
mul_mat_vec_q
<
QK8_0
,
QI8_0
,
block_q8_0
,
VDR_q8_0_q8_1
,
vec_dot_q8_0_q8_1
>
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
vx
,
vy
,
dst
,
ncols
,
nrows
);
}
...
...
@@ -2437,7 +3544,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
const
int
block_num_y
=
(
nrows
+
GGML_CUDA_MMV_Y
-
1
)
/
GGML_CUDA_MMV_Y
;
const
dim3
block_nums
(
1
,
block_num_y
,
1
);
const
dim3
block_dims
(
WARP_SIZE
,
GGML_CUDA_MMV_Y
,
1
);
mul_mat_vec_q
<
QK_K
,
QI2_K
,
block_q2_K
,
vec_dot_q2_K_q8_1
>
mul_mat_vec_q
<
QK_K
,
QI2_K
,
block_q2_K
,
VDR_q2_K_q8_1
,
vec_dot_q2_K_q8_1
>
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
vx
,
vy
,
dst
,
ncols
,
nrows
);
}
...
...
@@ -2446,7 +3553,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
const
int
block_num_y
=
(
nrows
+
GGML_CUDA_MMV_Y
-
1
)
/
GGML_CUDA_MMV_Y
;
const
dim3
block_nums
(
1
,
block_num_y
,
1
);
const
dim3
block_dims
(
WARP_SIZE
,
GGML_CUDA_MMV_Y
,
1
);
mul_mat_vec_q
<
QK_K
,
QI3_K
,
block_q3_K
,
vec_dot_q3_K_q8_1
>
mul_mat_vec_q
<
QK_K
,
QI3_K
,
block_q3_K
,
VDR_q3_K_q8_1
,
vec_dot_q3_K_q8_1
>
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
vx
,
vy
,
dst
,
ncols
,
nrows
);
}
...
...
@@ -2455,10 +3562,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
const
int
block_num_y
=
(
nrows
+
GGML_CUDA_MMV_Y
-
1
)
/
GGML_CUDA_MMV_Y
;
const
dim3
block_nums
(
1
,
block_num_y
,
1
);
const
dim3
block_dims
(
WARP_SIZE
,
GGML_CUDA_MMV_Y
,
1
);
// Note: we use QI4_K/2 instead of QI4_K to make the dot product template require 4 groups of quants to be processed per
// kernel call instead of 2. This results in a better perfmance because the cost of computing the k-quant scales
// is better amortized.
mul_mat_vec_q
<
QK_K
,
QI4_K
/
2
,
block_q4_K
,
vec_dot_q4_K_q8_1
>
mul_mat_vec_q
<
QK_K
,
QI4_K
,
block_q4_K
,
VDR_q4_K_q8_1
,
vec_dot_q4_K_q8_1
>
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
vx
,
vy
,
dst
,
ncols
,
nrows
);
}
...
...
@@ -2467,10 +3571,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
const
int
block_num_y
=
(
nrows
+
GGML_CUDA_MMV_Y
-
1
)
/
GGML_CUDA_MMV_Y
;
const
dim3
block_nums
(
1
,
block_num_y
,
1
);
const
dim3
block_dims
(
WARP_SIZE
,
GGML_CUDA_MMV_Y
,
1
);
// Note: we use QI5_K/2 instead of QI5_K to make the dot product template require 4 groups of quants to be processed per
// kernel call instead of 2. This results in a better perfmance because the cost of computing the k-quant scales
// is better amortized.
mul_mat_vec_q
<
QK_K
,
QI5_K
/
2
,
block_q5_K
,
vec_dot_q5_K_q8_1
>
mul_mat_vec_q
<
QK_K
,
QI5_K
,
block_q5_K
,
VDR_q5_K_q8_1
,
vec_dot_q5_K_q8_1
>
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
vx
,
vy
,
dst
,
ncols
,
nrows
);
}
...
...
@@ -2479,7 +3580,7 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float *
const
int
block_num_y
=
(
nrows
+
GGML_CUDA_MMV_Y
-
1
)
/
GGML_CUDA_MMV_Y
;
const
dim3
block_nums
(
1
,
block_num_y
,
1
);
const
dim3
block_dims
(
WARP_SIZE
,
GGML_CUDA_MMV_Y
,
1
);
mul_mat_vec_q
<
QK_K
,
QI6_K
,
block_q6_K
,
vec_dot_q6_K_q8_1
>
mul_mat_vec_q
<
QK_K
,
QI6_K
,
block_q6_K
,
VDR_q6_K_q8_1
,
vec_dot_q6_K_q8_1
>
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
vx
,
vy
,
dst
,
ncols
,
nrows
);
}
...
...
@@ -2526,6 +3627,126 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
}
}
static
void
ggml_mul_mat_q4_0_q8_1_cuda
(
const
void
*
vx
,
const
void
*
vy
,
float
*
dst
,
const
int
ncols_x
,
const
int
nrows_x
,
const
int
ncols_y
,
const
int
nrows_y
,
const
int
nrows_dst
,
cudaStream_t
stream
)
{
const
int
block_num_x
=
(
nrows_x
+
GGML_CUDA_MMQ_Y
-
1
)
/
GGML_CUDA_MMQ_Y
;
const
int
block_num_y
=
(
ncols_y
+
WARP_SIZE
-
1
)
/
WARP_SIZE
;
const
dim3
block_nums
(
block_num_x
,
block_num_y
,
1
);
const
dim3
block_dims
(
WARP_SIZE
,
WARP_SIZE
/
4
,
1
);
mul_mat_q
<
QK4_0
,
QR4_0
,
QI4_0
,
block_q4_0
,
allocate_tiles_q4_0
,
load_tiles_q4_0
,
VDR_q4_0_q8_1
,
vec_dot_q4_0_q8_1_mul_mat
>
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
vx
,
vy
,
dst
,
ncols_x
,
nrows_x
,
ncols_y
,
nrows_y
,
nrows_dst
);
}
static
void
ggml_mul_mat_q4_1_q8_1_cuda
(
const
void
*
vx
,
const
void
*
vy
,
float
*
dst
,
const
int
ncols_x
,
const
int
nrows_x
,
const
int
ncols_y
,
const
int
nrows_y
,
const
int
nrows_dst
,
cudaStream_t
stream
)
{
const
int
block_num_x
=
(
nrows_x
+
GGML_CUDA_MMQ_Y
-
1
)
/
GGML_CUDA_MMQ_Y
;
const
int
block_num_y
=
(
ncols_y
+
WARP_SIZE
-
1
)
/
WARP_SIZE
;
const
dim3
block_nums
(
block_num_x
,
block_num_y
,
1
);
const
dim3
block_dims
(
WARP_SIZE
,
WARP_SIZE
/
4
,
1
);
mul_mat_q
<
QK4_1
,
QR4_1
,
QI4_1
,
block_q4_1
,
allocate_tiles_q4_1
,
load_tiles_q4_1
,
VDR_q4_1_q8_1
,
vec_dot_q4_1_q8_1_mul_mat
>
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
vx
,
vy
,
dst
,
ncols_x
,
nrows_x
,
ncols_y
,
nrows_y
,
nrows_dst
);
}
static
void
ggml_mul_mat_q5_0_q8_1_cuda
(
const
void
*
vx
,
const
void
*
vy
,
float
*
dst
,
const
int
ncols_x
,
const
int
nrows_x
,
const
int
ncols_y
,
const
int
nrows_y
,
const
int
nrows_dst
,
cudaStream_t
stream
)
{
const
int
block_num_x
=
(
nrows_x
+
GGML_CUDA_MMQ_Y
-
1
)
/
GGML_CUDA_MMQ_Y
;
const
int
block_num_y
=
(
ncols_y
+
WARP_SIZE
-
1
)
/
WARP_SIZE
;
const
dim3
block_nums
(
block_num_x
,
block_num_y
,
1
);
const
dim3
block_dims
(
WARP_SIZE
,
WARP_SIZE
/
4
,
1
);
mul_mat_q
<
QK5_0
,
QR5_0
,
QI5_0
,
block_q5_0
,
allocate_tiles_q5_0
,
load_tiles_q5_0
,
VDR_q5_0_q8_1
,
vec_dot_q5_0_q8_1_mul_mat
>
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
vx
,
vy
,
dst
,
ncols_x
,
nrows_x
,
ncols_y
,
nrows_y
,
nrows_dst
);
}
static
void
ggml_mul_mat_q5_1_q8_1_cuda
(
const
void
*
vx
,
const
void
*
vy
,
float
*
dst
,
const
int
ncols_x
,
const
int
nrows_x
,
const
int
ncols_y
,
const
int
nrows_y
,
const
int
nrows_dst
,
cudaStream_t
stream
)
{
const
int
block_num_x
=
(
nrows_x
+
GGML_CUDA_MMQ_Y
-
1
)
/
GGML_CUDA_MMQ_Y
;
const
int
block_num_y
=
(
ncols_y
+
WARP_SIZE
-
1
)
/
WARP_SIZE
;
const
dim3
block_nums
(
block_num_x
,
block_num_y
,
1
);
const
dim3
block_dims
(
WARP_SIZE
,
WARP_SIZE
/
4
,
1
);
mul_mat_q
<
QK5_1
,
QR5_1
,
QI5_1
,
block_q5_1
,
allocate_tiles_q5_1
,
load_tiles_q5_1
,
VDR_q5_1_q8_1
,
vec_dot_q5_1_q8_1_mul_mat
>
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
vx
,
vy
,
dst
,
ncols_x
,
nrows_x
,
ncols_y
,
nrows_y
,
nrows_dst
);
}
static
void
ggml_mul_mat_q8_0_q8_1_cuda
(
const
void
*
vx
,
const
void
*
vy
,
float
*
dst
,
const
int
ncols_x
,
const
int
nrows_x
,
const
int
ncols_y
,
const
int
nrows_y
,
const
int
nrows_dst
,
cudaStream_t
stream
)
{
const
int
block_num_x
=
(
nrows_x
+
GGML_CUDA_MMQ_Y
-
1
)
/
GGML_CUDA_MMQ_Y
;
const
int
block_num_y
=
(
ncols_y
+
WARP_SIZE
-
1
)
/
WARP_SIZE
;
const
dim3
block_nums
(
block_num_x
,
block_num_y
,
1
);
const
dim3
block_dims
(
WARP_SIZE
,
WARP_SIZE
/
4
,
1
);
mul_mat_q
<
QK8_0
,
QR8_0
,
QI8_0
,
block_q8_0
,
allocate_tiles_q8_0
,
load_tiles_q8_0
,
VDR_q8_0_q8_1
,
vec_dot_q8_0_q8_1_mul_mat
>
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
vx
,
vy
,
dst
,
ncols_x
,
nrows_x
,
ncols_y
,
nrows_y
,
nrows_dst
);
}
static
void
ggml_mul_mat_q2_K_q8_1_cuda
(
const
void
*
vx
,
const
void
*
vy
,
float
*
dst
,
const
int
ncols_x
,
const
int
nrows_x
,
const
int
ncols_y
,
const
int
nrows_y
,
const
int
nrows_dst
,
cudaStream_t
stream
)
{
const
int
block_num_x
=
(
nrows_x
+
GGML_CUDA_MMQ_Y
-
1
)
/
GGML_CUDA_MMQ_Y
;
const
int
block_num_y
=
(
ncols_y
+
WARP_SIZE
-
1
)
/
WARP_SIZE
;
const
dim3
block_nums
(
block_num_x
,
block_num_y
,
1
);
const
dim3
block_dims
(
WARP_SIZE
,
WARP_SIZE
/
4
,
1
);
mul_mat_q
<
QK_K
,
QR2_K
,
QI2_K
,
block_q2_K
,
allocate_tiles_q2_K
,
load_tiles_q2_K
,
VDR_q2_K_q8_1
,
vec_dot_q2_K_q8_1_mul_mat
>
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
vx
,
vy
,
dst
,
ncols_x
,
nrows_x
,
ncols_y
,
nrows_y
,
nrows_dst
);
}
static
void
ggml_mul_mat_q3_K_q8_1_cuda
(
const
void
*
vx
,
const
void
*
vy
,
float
*
dst
,
const
int
ncols_x
,
const
int
nrows_x
,
const
int
ncols_y
,
const
int
nrows_y
,
const
int
nrows_dst
,
cudaStream_t
stream
)
{
const
int
block_num_x
=
(
nrows_x
+
GGML_CUDA_MMQ_Y
-
1
)
/
GGML_CUDA_MMQ_Y
;
const
int
block_num_y
=
(
ncols_y
+
WARP_SIZE
-
1
)
/
WARP_SIZE
;
const
dim3
block_nums
(
block_num_x
,
block_num_y
,
1
);
const
dim3
block_dims
(
WARP_SIZE
,
WARP_SIZE
/
4
,
1
);
mul_mat_q
<
QK_K
,
QR3_K
,
QI3_K
,
block_q3_K
,
allocate_tiles_q3_K
,
load_tiles_q3_K
,
VDR_q3_K_q8_1
,
vec_dot_q3_K_q8_1_mul_mat
>
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
vx
,
vy
,
dst
,
ncols_x
,
nrows_x
,
ncols_y
,
nrows_y
,
nrows_dst
);
}
static
void
ggml_mul_mat_q4_K_q8_1_cuda
(
const
void
*
vx
,
const
void
*
vy
,
float
*
dst
,
const
int
ncols_x
,
const
int
nrows_x
,
const
int
ncols_y
,
const
int
nrows_y
,
const
int
nrows_dst
,
cudaStream_t
stream
)
{
const
int
block_num_x
=
(
nrows_x
+
GGML_CUDA_MMQ_Y
-
1
)
/
GGML_CUDA_MMQ_Y
;
const
int
block_num_y
=
(
ncols_y
+
WARP_SIZE
-
1
)
/
WARP_SIZE
;
const
dim3
block_nums
(
block_num_x
,
block_num_y
,
1
);
const
dim3
block_dims
(
WARP_SIZE
,
WARP_SIZE
/
4
,
1
);
mul_mat_q
<
QK_K
,
QR4_K
,
QI4_K
,
block_q4_K
,
allocate_tiles_q4_K
,
load_tiles_q4_K
,
VDR_q4_K_q8_1
,
vec_dot_q4_K_q8_1_mul_mat
>
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
vx
,
vy
,
dst
,
ncols_x
,
nrows_x
,
ncols_y
,
nrows_y
,
nrows_dst
);
}
static
void
ggml_mul_mat_q5_K_q8_1_cuda
(
const
void
*
vx
,
const
void
*
vy
,
float
*
dst
,
const
int
ncols_x
,
const
int
nrows_x
,
const
int
ncols_y
,
const
int
nrows_y
,
const
int
nrows_dst
,
cudaStream_t
stream
)
{
const
int
block_num_x
=
(
nrows_x
+
GGML_CUDA_MMQ_Y
-
1
)
/
GGML_CUDA_MMQ_Y
;
const
int
block_num_y
=
(
ncols_y
+
WARP_SIZE
-
1
)
/
WARP_SIZE
;
const
dim3
block_nums
(
block_num_x
,
block_num_y
,
1
);
const
dim3
block_dims
(
WARP_SIZE
,
WARP_SIZE
/
4
,
1
);
mul_mat_q
<
QK_K
,
QR5_K
,
QI5_K
,
block_q5_K
,
allocate_tiles_q5_K
,
load_tiles_q5_K
,
VDR_q5_K_q8_1
,
vec_dot_q5_K_q8_1_mul_mat
>
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
vx
,
vy
,
dst
,
ncols_x
,
nrows_x
,
ncols_y
,
nrows_y
,
nrows_dst
);
}
static
void
ggml_mul_mat_q6_K_q8_1_cuda
(
const
void
*
vx
,
const
void
*
vy
,
float
*
dst
,
const
int
ncols_x
,
const
int
nrows_x
,
const
int
ncols_y
,
const
int
nrows_y
,
const
int
nrows_dst
,
cudaStream_t
stream
)
{
const
int
block_num_x
=
(
nrows_x
+
GGML_CUDA_MMQ_Y
-
1
)
/
GGML_CUDA_MMQ_Y
;
const
int
block_num_y
=
(
ncols_y
+
WARP_SIZE
-
1
)
/
WARP_SIZE
;
const
dim3
block_nums
(
block_num_x
,
block_num_y
,
1
);
const
dim3
block_dims
(
WARP_SIZE
,
WARP_SIZE
/
4
,
1
);
mul_mat_q
<
QK_K
,
QR6_K
,
QI6_K
,
block_q6_K
,
allocate_tiles_q6_K
,
load_tiles_q6_K
,
VDR_q6_K_q8_1
,
vec_dot_q6_K_q8_1_mul_mat
>
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
vx
,
vy
,
dst
,
ncols_x
,
nrows_x
,
ncols_y
,
nrows_y
,
nrows_dst
);
}
static
void
ggml_mul_mat_p021_f16_f32_cuda
(
const
void
*
vx
,
const
float
*
y
,
float
*
dst
,
const
int
ncols_x
,
const
int
nrows_x
,
const
int
nchannels_x
,
const
int
nchannels_y
,
cudaStream_t
stream
)
{
...
...
@@ -2570,12 +3791,13 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
scale_f32
<<<
num_blocks
,
CUDA_SCALE_BLOCK_SIZE
,
0
,
stream
>>>
(
x
,
dst
,
scale
,
k
);
}
static
void
rope_f32_cuda
(
const
float
*
x
,
float
*
dst
,
const
int
ncols
,
const
int
nrows
,
const
float
p
,
const
float
theta_scale
,
cudaStream_t
stream
)
{
static
void
rope_f32_cuda
(
const
float
*
x
,
float
*
dst
,
const
int
ncols
,
const
int
nrows
,
const
float
p0
,
const
float
p_delta
,
const
int
p_delta_rows
,
const
float
theta_scale
,
cudaStream_t
stream
)
{
GGML_ASSERT
(
nrows
%
2
==
0
);
const
dim3
block_dims
(
2
*
CUDA_ROPE_BLOCK_SIZE
,
1
,
1
);
const
int
num_blocks_x
=
(
ncols
+
2
*
CUDA_ROPE_BLOCK_SIZE
-
1
)
/
(
2
*
CUDA_ROPE_BLOCK_SIZE
);
const
dim3
block_nums
(
num_blocks_x
,
nrows
,
1
);
rope_f32
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
x
,
dst
,
ncols
,
p
,
theta_scale
);
rope_f32
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
x
,
dst
,
ncols
,
p
0
,
p_delta
,
p_delta_rows
,
theta_scale
);
}
static
void
rope_glm_f32_cuda
(
const
float
*
x
,
float
*
dst
,
const
int
ncols
,
const
int
nrows
,
const
float
p
,
const
float
block_p
,
const
float
theta_scale
,
cudaStream_t
stream
)
{
...
...
@@ -2702,10 +3924,9 @@ static size_t g_scratch_offset = 0;
static
int
g_device_count
=
-
1
;
static
int
g_main_device
=
0
;
#ifndef GGML_CUDA_FORCE_DMMV
static
int
g_compute_capabilities
[
GGML_CUDA_MAX_DEVICES
];
#endif
static
float
g_tensor_split
[
GGML_CUDA_MAX_DEVICES
]
=
{
0
};
static
bool
g_mul_mat_q
=
false
;
static
cublasHandle_t
g_cublas_handles
[
GGML_CUDA_MAX_DEVICES
]
=
{
nullptr
};
...
...
@@ -2727,9 +3948,7 @@ void ggml_init_cublas() {
g_tensor_split
[
id
]
=
total_vram
;
total_vram
+=
prop
.
totalGlobalMem
;
#ifndef GGML_CUDA_FORCE_DMMV
g_compute_capabilities
[
id
]
=
100
*
prop
.
major
+
10
*
prop
.
minor
;
#endif
}
for
(
int
id
=
0
;
id
<
g_device_count
;
++
id
)
{
g_tensor_split
[
id
]
/=
total_vram
;
...
...
@@ -2991,6 +4210,83 @@ inline void ggml_cuda_op_rms_norm(
(
void
)
i1
;
}
inline
void
ggml_cuda_op_mul_mat_q
(
const
ggml_tensor
*
src0
,
const
ggml_tensor
*
src1
,
ggml_tensor
*
dst
,
char
*
src0_ddq_i
,
float
*
src0_ddf_i
,
float
*
src1_ddf_i
,
float
*
dst_ddf_i
,
int64_t
i02
,
int64_t
i01_low
,
int64_t
i01_high
,
int
i1
,
cudaStream_t
&
cudaStream_main
){
GGML_ASSERT
(
src0_ddq_i
!=
nullptr
);
GGML_ASSERT
(
src1_ddf_i
!=
nullptr
);
GGML_ASSERT
(
dst_ddf_i
!=
nullptr
);
const
int64_t
ne00
=
src0
->
ne
[
0
];
const
int64_t
ne10
=
src1
->
ne
[
0
];
const
int64_t
ne11
=
src1
->
ne
[
1
];
GGML_ASSERT
(
ne10
%
QK8_1
==
0
);
const
int64_t
ne0
=
dst
->
ne
[
0
];
const
int64_t
i01_diff
=
i01_high
-
i01_low
;
int
id
;
CUDA_CHECK
(
cudaGetDevice
(
&
id
));
// the main device has a larger memory buffer to hold the results from all GPUs
// nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
const
int64_t
nrows_dst
=
dst
->
backend
==
GGML_BACKEND_GPU
&&
id
==
g_main_device
?
ne0
:
i01_diff
;
const
int64_t
padded_row_size
=
ne10
%
MATRIX_ROW_PADDING
==
0
?
ne10
:
ne10
-
ne10
%
MATRIX_ROW_PADDING
+
MATRIX_ROW_PADDING
;
size_t
as
;
void
*
src1_q8_1
=
ggml_cuda_pool_malloc
(
padded_row_size
*
ne11
*
sizeof
(
block_q8_1
)
/
QK8_1
,
&
as
);
quantize_row_q8_1_cuda
(
src1_ddf_i
,
src1_q8_1
,
ne10
,
ne11
,
padded_row_size
,
cudaStream_main
);
switch
(
src0
->
type
)
{
case
GGML_TYPE_Q4_0
:
ggml_mul_mat_q4_0_q8_1_cuda
(
src0_ddq_i
,
src1_q8_1
,
dst_ddf_i
,
ne00
,
i01_diff
,
ne11
,
padded_row_size
,
nrows_dst
,
cudaStream_main
);
break
;
case
GGML_TYPE_Q4_1
:
ggml_mul_mat_q4_1_q8_1_cuda
(
src0_ddq_i
,
src1_q8_1
,
dst_ddf_i
,
ne00
,
i01_diff
,
ne11
,
padded_row_size
,
nrows_dst
,
cudaStream_main
);
break
;
case
GGML_TYPE_Q5_0
:
ggml_mul_mat_q5_0_q8_1_cuda
(
src0_ddq_i
,
src1_q8_1
,
dst_ddf_i
,
ne00
,
i01_diff
,
ne11
,
padded_row_size
,
nrows_dst
,
cudaStream_main
);
break
;
case
GGML_TYPE_Q5_1
:
ggml_mul_mat_q5_1_q8_1_cuda
(
src0_ddq_i
,
src1_q8_1
,
dst_ddf_i
,
ne00
,
i01_diff
,
ne11
,
padded_row_size
,
nrows_dst
,
cudaStream_main
);
break
;
case
GGML_TYPE_Q8_0
:
ggml_mul_mat_q8_0_q8_1_cuda
(
src0_ddq_i
,
src1_q8_1
,
dst_ddf_i
,
ne00
,
i01_diff
,
ne11
,
padded_row_size
,
nrows_dst
,
cudaStream_main
);
break
;
case
GGML_TYPE_Q2_K
:
ggml_mul_mat_q2_K_q8_1_cuda
(
src0_ddq_i
,
src1_q8_1
,
dst_ddf_i
,
ne00
,
i01_diff
,
ne11
,
padded_row_size
,
nrows_dst
,
cudaStream_main
);
break
;
case
GGML_TYPE_Q3_K
:
ggml_mul_mat_q3_K_q8_1_cuda
(
src0_ddq_i
,
src1_q8_1
,
dst_ddf_i
,
ne00
,
i01_diff
,
ne11
,
padded_row_size
,
nrows_dst
,
cudaStream_main
);
break
;
case
GGML_TYPE_Q4_K
:
ggml_mul_mat_q4_K_q8_1_cuda
(
src0_ddq_i
,
src1_q8_1
,
dst_ddf_i
,
ne00
,
i01_diff
,
ne11
,
padded_row_size
,
nrows_dst
,
cudaStream_main
);
break
;
case
GGML_TYPE_Q5_K
:
ggml_mul_mat_q5_K_q8_1_cuda
(
src0_ddq_i
,
src1_q8_1
,
dst_ddf_i
,
ne00
,
i01_diff
,
ne11
,
padded_row_size
,
nrows_dst
,
cudaStream_main
);
break
;
case
GGML_TYPE_Q6_K
:
ggml_mul_mat_q6_K_q8_1_cuda
(
src0_ddq_i
,
src1_q8_1
,
dst_ddf_i
,
ne00
,
i01_diff
,
ne11
,
padded_row_size
,
nrows_dst
,
cudaStream_main
);
break
;
default:
GGML_ASSERT
(
false
);
break
;
}
ggml_cuda_pool_free
(
src1_q8_1
,
as
);
(
void
)
src1
;
(
void
)
dst
;
(
void
)
src0_ddf_i
;
(
void
)
i02
;
(
void
)
i1
;
}
inline
void
ggml_cuda_op_mul_mat_vec
(
const
ggml_tensor
*
src0
,
const
ggml_tensor
*
src1
,
ggml_tensor
*
dst
,
char
*
src0_ddq_i
,
float
*
src0_ddf_i
,
float
*
src1_ddf_i
,
float
*
dst_ddf_i
,
int64_t
i02
,
int64_t
i01_low
,
int64_t
i01_high
,
int
i1
,
...
...
@@ -3005,6 +4301,7 @@ inline void ggml_cuda_op_mul_mat_vec(
#ifdef GGML_CUDA_FORCE_DMMV
const
bool
use_mul_mat_vec_q
=
false
;
(
void
)
g_compute_capabilities
[
0
];
#else
int
id
;
CUDA_CHECK
(
cudaGetDevice
(
&
id
));
...
...
@@ -3032,7 +4329,7 @@ inline void ggml_cuda_op_mul_mat_vec(
ne00
:
ne00
-
ne00
%
MATRIX_ROW_PADDING
+
MATRIX_ROW_PADDING
;
size_t
as
;
void
*
src1_q8_1
=
ggml_cuda_pool_malloc
(
padded_row_size
*
sizeof
(
block_q8_1
)
/
QK8_1
,
&
as
);
quantize_row_q8_1_cuda
(
src1_ddf_i
,
src1_q8_1
,
ne00
,
padded_row_size
,
cudaStream_main
);
quantize_row_q8_1_cuda
(
src1_ddf_i
,
src1_q8_1
,
ne00
,
1
,
padded_row_size
,
cudaStream_main
);
switch
(
src0
->
type
)
{
case
GGML_TYPE_Q4_0
:
...
...
@@ -3073,7 +4370,7 @@ inline void ggml_cuda_op_mul_mat_vec(
ggml_cuda_pool_free
(
src1_q8_1
,
as
);
}
else
{
// on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
#ifdef GGML_CUDA_
DMMV_
F16
#ifdef GGML_CUDA_F16
size_t
ash
;
dfloat
*
src1_dfloat
=
nullptr
;
// dfloat == half
...
...
@@ -3089,7 +4386,7 @@ inline void ggml_cuda_op_mul_mat_vec(
}
#else
dfloat
*
src1_dfloat
=
src1_ddf_i
;
// dfloat == float, no conversion
#endif // GGML_CUDA_
DMMV_
F16
#endif // GGML_CUDA_F16
switch
(
src0
->
type
)
{
case
GGML_TYPE_Q4_0
:
...
...
@@ -3130,11 +4427,11 @@ inline void ggml_cuda_op_mul_mat_vec(
break
;
}
#ifdef GGML_CUDA_
DMMV_
F16
#ifdef GGML_CUDA_F16
if
(
src1_convert_f16
)
{
ggml_cuda_pool_free
(
src1_dfloat
,
ash
);
}
#endif // GGML_CUDA_
DMMV_
F16
#endif // GGML_CUDA_F16
}
(
void
)
src1
;
...
...
@@ -3194,6 +4491,7 @@ inline void ggml_cuda_op_rope(
GGML_ASSERT
(
dst_ddf_i
!=
nullptr
);
const
int64_t
ne00
=
src0
->
ne
[
0
];
const
int64_t
ne01
=
src0
->
ne
[
1
];
const
int64_t
i01_diff
=
i01_high
-
i01_low
;
const
int
n_past
=
((
int32_t
*
)
dst
->
op_params
)[
0
];
...
...
@@ -3207,17 +4505,18 @@ inline void ggml_cuda_op_rope(
memcpy
(
&
freq_scale
,
(
int32_t
*
)
dst
->
op_params
+
5
,
sizeof
(
float
));
const
float
theta_scale
=
powf
(
freq_base
,
-
2.0
f
/
n_dims
);
const
float
p
=
(((
mode
&
1
)
==
0
?
n_past
+
i02
:
i02
))
*
freq_scale
;
bool
is_glm
=
mode
&
4
;
const
bool
is_glm
=
mode
&
4
;
// compute
if
(
is_glm
)
{
const
float
p
=
(((
mode
&
1
)
==
0
?
n_past
+
i02
:
i02
))
*
freq_scale
;
const
float
id_p
=
min
(
p
,
n_ctx
-
2.
f
);
const
float
block_p
=
max
(
p
-
(
n_ctx
-
2.
f
),
0.
f
);
rope_glm_f32_cuda
(
src0_ddf_i
,
dst_ddf_i
,
ne00
,
i01_diff
,
id_p
,
block_p
,
theta_scale
,
cudaStream_main
);
}
else
{
rope_f32_cuda
(
src0_ddf_i
,
dst_ddf_i
,
ne00
,
i01_diff
,
p
,
theta_scale
,
cudaStream_main
);
const
float
p0
=
(((
mode
&
1
)
==
0
?
n_past
:
0
))
*
freq_scale
;
rope_f32_cuda
(
src0_ddf_i
,
dst_ddf_i
,
ne00
,
i01_diff
,
p0
,
freq_scale
,
ne01
,
theta_scale
,
cudaStream_main
);
}
(
void
)
src1
;
...
...
@@ -3389,7 +4688,10 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
int64_t
row_low
,
row_high
;
if
(
split
)
{
row_low
=
id
==
0
?
0
:
nrows0
*
g_tensor_split
[
id
];
row_low
-=
row_low
%
GGML_CUDA_MMQ_Y
;
row_high
=
id
==
g_device_count
-
1
?
nrows0
:
nrows0
*
g_tensor_split
[
id
+
1
];
row_high
-=
row_high
%
GGML_CUDA_MMQ_Y
;
}
else
{
row_low
=
0
;
row_high
=
nrows0
*
i02_divisor
;
...
...
@@ -3555,13 +4857,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
if
(
split
)
{
// src0 = weight matrix is saved as a transposed matrix for better memory layout.
// dst is NOT transposed.
// The outputs of
cuBLAS
matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
// The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
// Instead they need to be copied to the correct slice in ne0 = dst row index.
// If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
for
(
int64_t
j
=
0
;
j
<
ne1
;
++
j
)
{
float
*
dhf_dst_i
=
(
float
*
)
((
char
*
)
dst_off_device
+
(
j
*
ne0
+
i01_low
)
*
sizeof
(
float
)
+
i02
*
nb2
+
i03
*
nb3
);
CUDA_CHECK
(
cudaMemcpyAsync
(
dhf_dst_i
,
dst_ddf_i
+
j
*
i01_diff
,
i01_diff
*
sizeof
(
float
),
kind
,
cudaStream_main
));
}
float
*
dhf_dst_i
=
(
float
*
)
((
char
*
)
dst_off_device
+
i01_low
*
sizeof
(
float
)
+
i02
*
nb2
+
i03
*
nb3
);
CUDA_CHECK
(
cudaMemcpy2DAsync
(
dhf_dst_i
,
ne0
*
sizeof
(
float
),
dst_ddf_i
,
i01_diff
*
sizeof
(
float
),
i01_diff
*
sizeof
(
float
),
ne1
,
kind
,
cudaStream_main
));
}
else
{
float
*
dhf_dst_i
=
(
float
*
)
((
char
*
)
dst_off_device
+
i02
*
nb2
+
i03
*
nb3
);
CUDA_CHECK
(
cudaMemcpyAsync
(
dhf_dst_i
,
dst_ddf_i
,
dst_stride
*
sizeof
(
float
),
kind
,
cudaStream_main
));
...
...
@@ -3744,7 +5045,18 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
if
(
src1
->
ne
[
1
]
==
1
&&
src0
->
ne
[
0
]
%
GGML_CUDA_DMMV_X
==
0
)
{
ggml_cuda_op
(
src0
,
src1
,
dst
,
ggml_cuda_op_mul_mat_vec
,
false
,
false
);
}
else
{
ggml_cuda_op
(
src0
,
src1
,
dst
,
ggml_cuda_op_mul_mat_cublas
,
true
,
false
);
int
min_compute_capability
=
INT_MAX
;
for
(
int
id
=
0
;
id
<
g_device_count
;
++
id
)
{
if
(
min_compute_capability
>
g_compute_capabilities
[
id
])
{
min_compute_capability
=
g_compute_capabilities
[
id
];
}
}
if
(
g_mul_mat_q
&&
ggml_is_quantized
(
src0
->
type
)
&&
min_compute_capability
>=
MIN_CC_DP4A
)
{
ggml_cuda_op
(
src0
,
src1
,
dst
,
ggml_cuda_op_mul_mat_q
,
false
,
false
);
}
else
{
ggml_cuda_op
(
src0
,
src1
,
dst
,
ggml_cuda_op_mul_mat_cublas
,
true
,
false
);
}
}
}
else
{
GGML_ASSERT
(
false
);
...
...
@@ -3821,7 +5133,10 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml
void
ggml_cuda_rope
(
const
ggml_tensor
*
src0
,
const
ggml_tensor
*
src1
,
ggml_tensor
*
dst
)
{
GGML_ASSERT
(
src0
->
type
==
GGML_TYPE_F32
&&
dst
->
type
==
GGML_TYPE_F32
);
ggml_cuda_op
(
src0
,
src1
,
dst
,
ggml_cuda_op_rope
,
true
,
false
);
// FIXME flatten changes results
const
int
mode
=
((
int32_t
*
)
dst
->
op_params
)[
2
];
const
bool
is_glm
=
mode
&
4
;
ggml_cuda_op
(
src0
,
src1
,
dst
,
ggml_cuda_op_rope
,
true
,
!
is_glm
);
// flatten support not implemented for glm
}
void
ggml_cuda_nop
(
const
ggml_tensor
*
src0
,
const
ggml_tensor
*
src1
,
ggml_tensor
*
dst
)
{
...
...
@@ -3854,7 +5169,10 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
row_high
=
nrows
;
}
else
if
(
backend
==
GGML_BACKEND_GPU_SPLIT
)
{
row_low
=
id
==
0
?
0
:
nrows
*
g_tensor_split
[
id
];
row_low
-=
row_low
%
GGML_CUDA_MMQ_Y
;
row_high
=
id
==
g_device_count
-
1
?
nrows
:
nrows
*
g_tensor_split
[
id
+
1
];
row_high
-=
row_high
%
GGML_CUDA_MMQ_Y
;
}
else
{
GGML_ASSERT
(
false
);
}
...
...
@@ -4028,6 +5346,10 @@ void ggml_cuda_set_main_device(int main_device) {
}
}
void
ggml_cuda_set_mul_mat_q
(
bool
mul_mat_q
)
{
g_mul_mat_q
=
mul_mat_q
;
}
void
ggml_cuda_set_scratch_size
(
size_t
scratch_size
)
{
g_scratch_size
=
scratch_size
;
}
...
...
llama/ggml-cuda.h
View file @
7a1c3e62
/**
* llama.cpp - git
d91f3f0c55663719ea03b76311e8c36ed55eb0e2
* llama.cpp - git
c574bddb368424b5996cbee2ec45ec050967d404
*
* MIT License
*
...
...
@@ -53,6 +53,7 @@ void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
void
ggml_cuda_assign_buffers_no_scratch
(
struct
ggml_tensor
*
tensor
);
void
ggml_cuda_assign_buffers_force_inplace
(
struct
ggml_tensor
*
tensor
);
void
ggml_cuda_set_main_device
(
int
main_device
);
void
ggml_cuda_set_mul_mat_q
(
bool
mul_mat_q
);
void
ggml_cuda_set_scratch_size
(
size_t
scratch_size
);
void
ggml_cuda_free_scratch
(
void
);
bool
ggml_cuda_compute_forward
(
struct
ggml_compute_params
*
params
,
struct
ggml_tensor
*
tensor
);
...
...
llama/ggml-metal.h
View file @
7a1c3e62
//go:build darwin
/**
* llama.cpp - git
d91f3f0c55663719ea03b76311e8c36ed55eb0e2
* llama.cpp - git
c574bddb368424b5996cbee2ec45ec050967d404
*
* MIT License
*
...
...
llama/ggml-metal.m
View file @
7a1c3e62
//go:build darwin
/**
* llama.cpp - git
d91f3f0c55663719ea03b76311e8c36ed55eb0e2
* llama.cpp - git
c574bddb368424b5996cbee2ec45ec050967d404
*
* MIT License
*
...
...
@@ -746,7 +746,8 @@ void ggml_metal_graph_compute(
// TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
GGML_ASSERT
(
ne00
==
ne10
);
GGML_ASSERT
(
ne02
==
ne12
);
// GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
GGML_ASSERT
(
ne03
==
ne13
);
if
(
ggml_is_contiguous
(
src0
)
&&
ggml_is_contiguous
(
src1
)
&&
...
...
@@ -774,11 +775,11 @@ void ggml_metal_graph_compute(
initWithDevice:
ctx
->
device
transposeLeft
:
false
transposeRight
:
true
resultRows:
ne11
resultColumns
:
ne01
interiorColumns
:
ne00
alpha
:
1
.
0
beta
:
0
.
0
];
// we need to do ne
0
2 multiplications
// we need to do ne
1
2 multiplications
// TODO: is there a way to do this in parallel - currently very slow ..
// TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
for
(
int64_t
i02
=
0
;
i02
<
ne
0
2
;
++
i02
)
{
size_t
offs_src0_cur
=
offs_src0
+
i02
*
nb02
;
for
(
int64_t
i02
=
0
;
i02
<
ne
1
2
;
++
i02
)
{
size_t
offs_src0_cur
=
offs_src0
+
i02
/
(
ne12
/
ne02
)
*
nb02
;
// gqa not used for now
size_t
offs_src1_cur
=
offs_src1
+
i02
*
nb12
;
size_t
offs_dst_cur
=
offs_dst
+
i02
*
nb2
;
...
...
@@ -800,8 +801,6 @@ void ggml_metal_graph_compute(
switch
(
src0t
)
{
case
GGML_TYPE_F16
:
{
GGML_ASSERT
(
ne02
==
ne12
);
nth0
=
64
;
nth1
=
1
;
[
encoder
setComputePipelineState
:
ctx
->
pipeline_mul_mat_f16_f32
];
...
...
@@ -881,16 +880,18 @@ void ggml_metal_graph_compute(
[
encoder
setBuffer
:
id_dst
offset
:
offs_dst
atIndex
:
2
];
[
encoder
setBytes
:
&
ne00
length
:
sizeof
(
ne00
)
atIndex
:
3
];
[
encoder
setBytes
:
&
ne01
length
:
sizeof
(
ne01
)
atIndex
:
4
];
[
encoder
setBytes
:
&
nb00
length
:
sizeof
(
nb00
)
atIndex
:
5
];
[
encoder
setBytes
:
&
nb01
length
:
sizeof
(
nb01
)
atIndex
:
6
];
[
encoder
setBytes
:
&
nb02
length
:
sizeof
(
nb02
)
atIndex
:
7
];
[
encoder
setBytes
:
&
ne10
length
:
sizeof
(
ne10
)
atIndex
:
8
];
[
encoder
setBytes
:
&
ne11
length
:
sizeof
(
ne11
)
atIndex
:
9
];
[
encoder
setBytes
:
&
nb10
length
:
sizeof
(
nb10
)
atIndex
:
10
];
[
encoder
setBytes
:
&
nb11
length
:
sizeof
(
nb11
)
atIndex
:
11
];
[
encoder
setBytes
:
&
nb12
length
:
sizeof
(
nb12
)
atIndex
:
12
];
[
encoder
setBytes
:
&
ne0
length
:
sizeof
(
ne0
)
atIndex
:
13
];
[
encoder
setBytes
:
&
ne1
length
:
sizeof
(
ne1
)
atIndex
:
14
];
[
encoder
setBytes
:
&
ne02
length
:
sizeof
(
ne02
)
atIndex
:
5
];
[
encoder
setBytes
:
&
nb00
length
:
sizeof
(
nb00
)
atIndex
:
6
];
[
encoder
setBytes
:
&
nb01
length
:
sizeof
(
nb01
)
atIndex
:
7
];
[
encoder
setBytes
:
&
nb02
length
:
sizeof
(
nb02
)
atIndex
:
8
];
[
encoder
setBytes
:
&
ne10
length
:
sizeof
(
ne10
)
atIndex
:
9
];
[
encoder
setBytes
:
&
ne11
length
:
sizeof
(
ne11
)
atIndex
:
10
];
[
encoder
setBytes
:
&
ne12
length
:
sizeof
(
ne12
)
atIndex
:
11
];
[
encoder
setBytes
:
&
nb10
length
:
sizeof
(
nb10
)
atIndex
:
12
];
[
encoder
setBytes
:
&
nb11
length
:
sizeof
(
nb11
)
atIndex
:
13
];
[
encoder
setBytes
:
&
nb12
length
:
sizeof
(
nb12
)
atIndex
:
14
];
[
encoder
setBytes
:
&
ne0
length
:
sizeof
(
ne0
)
atIndex
:
15
];
[
encoder
setBytes
:
&
ne1
length
:
sizeof
(
ne1
)
atIndex
:
16
];
if
(
src0t
==
GGML_TYPE_Q4_0
||
src0t
==
GGML_TYPE_Q4_1
||
src0t
==
GGML_TYPE_Q2_K
||
src0t
==
GGML_TYPE_Q4_K
)
{
...
...
llama/ggml-metal.metal
View file @
7a1c3e62
//go:build darwin
/**
* llama.cpp - git
d91f3f0c55663719ea03b76311e8c36ed55eb0e2
* llama.cpp - git
c574bddb368424b5996cbee2ec45ec050967d404
*
* MIT License
*
...
...
@@ -537,11 +537,13 @@ kernel void kernel_mul_mat_f16_f32(
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
...
...
@@ -557,7 +559,7 @@ kernel void kernel_mul_mat_f16_f32(
const int64_t r1 = tgpig.y;
const int64_t im = tgpig.z;
device const half * x = (device const half *) (src0 + r0*nb01 + im*nb02);
device const half * x = (device const half *) (src0 + r0*nb01 + im
/(ne12/ne02)
*nb02);
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
sum[tpitg.x] = 0.0f;
...
...
@@ -580,6 +582,7 @@ kernel void kernel_mul_mat_f16_f32(
}
}
kernel void kernel_alibi_f32(
device const float * src0,
device float * dst,
...
...
llama/ggml-mpi.c
View file @
7a1c3e62
//go:build mpi
/**
* llama.cpp - git
d91f3f0c55663719ea03b76311e8c36ed55eb0e2
* llama.cpp - git
c574bddb368424b5996cbee2ec45ec050967d404
*
* MIT License
*
...
...
llama/ggml-mpi.h
View file @
7a1c3e62
//go:build mpi
/**
* llama.cpp - git
d91f3f0c55663719ea03b76311e8c36ed55eb0e2
* llama.cpp - git
c574bddb368424b5996cbee2ec45ec050967d404
*
* MIT License
*
...
...
llama/ggml-opencl.cpp
View file @
7a1c3e62
//go:build opencl
/**
* llama.cpp - git
d91f3f0c55663719ea03b76311e8c36ed55eb0e2
* llama.cpp - git
c574bddb368424b5996cbee2ec45ec050967d404
*
* MIT License
*
...
...
llama/ggml-opencl.h
View file @
7a1c3e62
//go:build opencl
/**
* llama.cpp - git
d91f3f0c55663719ea03b76311e8c36ed55eb0e2
* llama.cpp - git
c574bddb368424b5996cbee2ec45ec050967d404
*
* MIT License
*
...
...
llama/ggml.c
View file @
7a1c3e62
/**
* llama.cpp - git
d91f3f0c55663719ea03b76311e8c36ed55eb0e2
* llama.cpp - git
c574bddb368424b5996cbee2ec45ec050967d404
*
* MIT License
*
...
...
@@ -4583,10 +4583,12 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
static struct ggml_tensor * ggml_new_tensor_impl(
struct ggml_context * ctx,
enum ggml_type type,
int n_dims,
const int64_t* ne,
void* data) {
enum ggml_type type,
int n_dims,
const int64_t * ne,
void * data) {
assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
size_t data_size = 0;
...
...
@@ -4674,22 +4676,22 @@ static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int3
struct ggml_tensor * ggml_new_tensor(
struct ggml_context * ctx,
enum ggml_type type,
int n_dims,
const int64_t * ne) {
enum ggml_type
type,
int
n_dims,
const int64_t
* ne) {
return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
}
struct ggml_tensor * ggml_new_tensor_1d(
struct ggml_context * ctx,
enum ggml_type type,
enum ggml_type
type,
int64_t ne0) {
return ggml_new_tensor(ctx, type, 1, &ne0);
}
struct ggml_tensor * ggml_new_tensor_2d(
struct ggml_context * ctx,
enum ggml_type type,
enum ggml_type
type,
int64_t ne0,
int64_t ne1) {
const int64_t ne[2] = { ne0, ne1 };
...
...
@@ -4698,7 +4700,7 @@ struct ggml_tensor * ggml_new_tensor_2d(
struct ggml_tensor * ggml_new_tensor_3d(
struct ggml_context * ctx,
enum ggml_type type,
enum ggml_type
type,
int64_t ne0,
int64_t ne1,
int64_t ne2) {
...
...
@@ -6264,6 +6266,27 @@ struct ggml_tensor * ggml_reshape_4d(
// ggml_view_1d
static struct ggml_tensor * ggml_view_tensor_offset(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_dims,
const int64_t * ne,
size_t offset) {
// don't calculate an offset from an unallocated tensor
void * data = NULL;
if (a->data != NULL) {
data = (char *) a->data + offset;
}
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, data);
ggml_format_name(result, "%s (view)", a->name);
ggml_set_op_params(result, &offset, sizeof(offset));
return result;
}
struct ggml_tensor * ggml_view_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
...
...
@@ -6276,10 +6299,7 @@ struct ggml_tensor * ggml_view_1d(
is_node = true;
}
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
ggml_format_name(result, "%s (view)", a->name);
ggml_set_op_params(result, &offset, sizeof(offset));
struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset);
result->op = GGML_OP_VIEW;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
...
...
@@ -6306,10 +6326,7 @@ struct ggml_tensor * ggml_view_2d(
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
ggml_format_name(result, "%s (view)", a->name);
ggml_set_op_params(result, &offset, sizeof(offset));
struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 2, ne, offset);
result->nb[1] = nb1;
result->nb[2] = result->nb[1]*ne1;
...
...
@@ -6342,10 +6359,7 @@ struct ggml_tensor * ggml_view_3d(
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
ggml_format_name(result, "%s (view)", a->name);
ggml_set_op_params(result, &offset, sizeof(offset));
struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 3, ne, offset);
result->nb[1] = nb1;
result->nb[2] = nb2;
...
...
@@ -6380,10 +6394,7 @@ struct ggml_tensor * ggml_view_4d(
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
ggml_format_name(result, "%s (view)", a->name);
ggml_set_op_params(result, &offset, sizeof(offset));
struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 4, ne, offset);
result->nb[1] = nb1;
result->nb[2] = nb2;
...
...
@@ -6767,6 +6778,18 @@ struct ggml_tensor * ggml_rope_inplace(
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
}
struct ggml_tensor * ggml_rope_custom(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_dims,
int mode,
int n_ctx,
float freq_base,
float freq_scale) {
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, false);
}
struct ggml_tensor * ggml_rope_custom_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
...
...
llama/ggml.h
View file @
7a1c3e62
/**
* llama.cpp - git
d91f3f0c55663719ea03b76311e8c36ed55eb0e2
* llama.cpp - git
c574bddb368424b5996cbee2ec45ec050967d404
*
* MIT License
*
...
...
@@ -1196,7 +1196,18 @@ extern "C" {
int
mode
,
int
n_ctx
);
// custom RoPE, in-place, returns view(a)
// custom RoPE
GGML_API
struct
ggml_tensor
*
ggml_rope_custom
(
struct
ggml_context
*
ctx
,
struct
ggml_tensor
*
a
,
int
n_past
,
int
n_dims
,
int
mode
,
int
n_ctx
,
float
freq_base
,
float
freq_scale
);
// in-place, returns view(a)
GGML_API
struct
ggml_tensor
*
ggml_rope_custom_inplace
(
struct
ggml_context
*
ctx
,
struct
ggml_tensor
*
a
,
...
...
llama/k_quants.c
View file @
7a1c3e62
/**
* llama.cpp - git
d91f3f0c55663719ea03b76311e8c36ed55eb0e2
* llama.cpp - git
c574bddb368424b5996cbee2ec45ec050967d404
*
* MIT License
*
...
...
@@ -65,6 +65,8 @@
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
//
// 2-6 bit quantization in super-blocks
//
...
...
@@ -1379,7 +1381,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
const
__m256i
all_scales
=
_mm256_cvtepi8_epi16
(
scales8
);
const
__m128i
l_scales
=
_mm256_extracti128_si256
(
all_scales
,
0
);
const
__m128i
h_scales
=
_mm256_extracti128_si256
(
all_scales
,
1
);
const
__m256i
scales
[
2
]
=
{
_mm
256_
set_m
128
i
(
l_scales
,
l_scales
),
_mm
256_
set_m
128
i
(
h_scales
,
h_scales
)};
const
__m256i
scales
[
2
]
=
{
MM
256_
SET_M
128
I
(
l_scales
,
l_scales
),
MM
256_
SET_M
128
I
(
h_scales
,
h_scales
)};
__m256i
sumi
=
_mm256_setzero_si256
();
...
...
@@ -1447,7 +1449,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
const
__m128i
summs_1
=
_mm_madd_epi16
(
mins_1
,
_mm_loadu_si128
((
const
__m128i
*
)
&
y
[
i
].
bsums
[
8
]));
// sumf += -dmin * summs in 32bits*8
acc
=
_mm256_add_ps
(
_mm256_mul_ps
(
_mm256_broadcast_ss
(
&
dmin
),
_mm256_cvtepi32_ps
(
_mm
256_
set_m
128
i
(
summs_1
,
summs_0
))),
acc
);
acc
=
_mm256_add_ps
(
_mm256_mul_ps
(
_mm256_broadcast_ss
(
&
dmin
),
_mm256_cvtepi32_ps
(
MM
256_
SET_M
128
I
(
summs_1
,
summs_0
))),
acc
);
const
__m128i
scales_0
=
_mm_cvtepi8_epi16
(
scales16
);
const
__m128i
scales_1
=
_mm_cvtepi8_epi16
(
_mm_unpackhi_epi64
(
scales16
,
scales16
));
...
...
@@ -1519,7 +1521,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
}
// sumf += dall * isum - dmin * summs in 32bits
__m256i
sumi
=
_mm
256_
set_m
128
i
(
sumi_1
,
sumi_0
);
__m256i
sumi
=
MM
256_
SET_M
128
I
(
sumi_1
,
sumi_0
);
acc
=
_mm256_add_ps
(
_mm256_mul_ps
(
_mm256_broadcast_ss
(
&
dall
),
_mm256_cvtepi32_ps
(
sumi
)),
acc
);
}
...
...
@@ -1670,8 +1672,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
summs
+=
dmin
*
smin
;
const
__m128i
q2bits
=
_mm_loadu_si128
((
const
__m128i
*
)
q2
);
const
__m256i
q2_0
=
_mm256_and_si256
(
_mm
256_
set_m
128
i
(
_mm_srli_epi16
(
q2bits
,
2
),
q2bits
),
m3
);
const
__m256i
q2_1
=
_mm256_and_si256
(
_mm
256_
set_m
128
i
(
_mm_srli_epi16
(
q2bits
,
6
),
_mm_srli_epi16
(
q2bits
,
4
)),
m3
);
const
__m256i
q2_0
=
_mm256_and_si256
(
MM
256_
SET_M
128
I
(
_mm_srli_epi16
(
q2bits
,
2
),
q2bits
),
m3
);
const
__m256i
q2_1
=
_mm256_and_si256
(
MM
256_
SET_M
128
I
(
_mm_srli_epi16
(
q2bits
,
6
),
_mm_srli_epi16
(
q2bits
,
4
)),
m3
);
const
__m256i
q8_0
=
_mm256_loadu_si256
((
const
__m256i
*
)(
q8
+
0
));
const
__m256i
q8_1
=
_mm256_loadu_si256
((
const
__m256i
*
)(
q8
+
32
));
...
...
@@ -1735,10 +1737,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
const
__m128i
p2
=
_mm_maddubs_epi16
(
q2_2
,
_mm256_extractf128_si256
(
q8_1
,
0
));
const
__m128i
p3
=
_mm_maddubs_epi16
(
q2_3
,
_mm256_extractf128_si256
(
q8_1
,
1
));
const
__m256i
p_0
=
_mm
256_
set_m
128
i
(
_mm_cvtepi16_epi32
(
_mm_unpackhi_epi64
(
p0
,
p0
)),
_mm_cvtepi16_epi32
(
p0
));
const
__m256i
p_1
=
_mm
256_
set_m
128
i
(
_mm_cvtepi16_epi32
(
_mm_unpackhi_epi64
(
p1
,
p1
)),
_mm_cvtepi16_epi32
(
p1
));
const
__m256i
p_2
=
_mm
256_
set_m
128
i
(
_mm_cvtepi16_epi32
(
_mm_unpackhi_epi64
(
p2
,
p2
)),
_mm_cvtepi16_epi32
(
p2
));
const
__m256i
p_3
=
_mm
256_
set_m
128
i
(
_mm_cvtepi16_epi32
(
_mm_unpackhi_epi64
(
p3
,
p3
)),
_mm_cvtepi16_epi32
(
p3
));
const
__m256i
p_0
=
MM
256_
SET_M
128
I
(
_mm_cvtepi16_epi32
(
_mm_unpackhi_epi64
(
p0
,
p0
)),
_mm_cvtepi16_epi32
(
p0
));
const
__m256i
p_1
=
MM
256_
SET_M
128
I
(
_mm_cvtepi16_epi32
(
_mm_unpackhi_epi64
(
p1
,
p1
)),
_mm_cvtepi16_epi32
(
p1
));
const
__m256i
p_2
=
MM
256_
SET_M
128
I
(
_mm_cvtepi16_epi32
(
_mm_unpackhi_epi64
(
p2
,
p2
)),
_mm_cvtepi16_epi32
(
p2
));
const
__m256i
p_3
=
MM
256_
SET_M
128
I
(
_mm_cvtepi16_epi32
(
_mm_unpackhi_epi64
(
p3
,
p3
)),
_mm_cvtepi16_epi32
(
p3
));
acc
=
_mm256_add_ps
(
_mm256_mul_ps
(
_mm256_set1_ps
(
d
*
db
[
0
]),
_mm256_cvtepi32_ps
(
p_0
)),
acc
);
acc
=
_mm256_add_ps
(
_mm256_mul_ps
(
_mm256_set1_ps
(
d
*
db
[
1
]),
_mm256_cvtepi32_ps
(
p_1
)),
acc
);
...
...
@@ -1943,7 +1945,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
const
__m256i
all_scales
=
_mm256_cvtepi8_epi16
(
scales128
);
const
__m128i
l_scales
=
_mm256_extracti128_si256
(
all_scales
,
0
);
const
__m128i
h_scales
=
_mm256_extracti128_si256
(
all_scales
,
1
);
const
__m256i
scales
[
2
]
=
{
_mm
256_
set_m
128
i
(
l_scales
,
l_scales
),
_mm
256_
set_m
128
i
(
h_scales
,
h_scales
)};
const
__m256i
scales
[
2
]
=
{
MM
256_
SET_M
128
I
(
l_scales
,
l_scales
),
MM
256_
SET_M
128
I
(
h_scales
,
h_scales
)};
// high bit
const
__m256i
hbits
=
_mm256_loadu_si256
((
const
__m256i
*
)
x
[
i
].
hmask
);
...
...
@@ -2154,7 +2156,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
}
// multiply with block scale and accumulate
__m256i
sumi
=
_mm
256_
set_m
128
i
(
sumi_1
,
sumi_0
);
__m256i
sumi
=
MM
256_
SET_M
128
I
(
sumi_1
,
sumi_0
);
acc
=
_mm256_add_ps
(
_mm256_mul_ps
(
_mm256_broadcast_ss
(
&
d
),
_mm256_cvtepi32_ps
(
sumi
)),
acc
);
}
...
...
@@ -2329,13 +2331,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
aux16
[
0
]
=
a
&
0x0f0f
;
aux16
[
1
]
=
(
a
>>
4
)
&
0x0f0f
;
const
__m256i
scale_0
=
_mm
256_
set_m
128
i
(
_mm_set1_epi16
(
aux8
[
2
]
-
8
),
_mm_set1_epi16
(
aux8
[
0
]
-
8
));
const
__m256i
scale_1
=
_mm
256_
set_m
128
i
(
_mm_set1_epi16
(
aux8
[
3
]
-
8
),
_mm_set1_epi16
(
aux8
[
1
]
-
8
));
const
__m256i
scale_0
=
MM
256_
SET_M
128
I
(
_mm_set1_epi16
(
aux8
[
2
]
-
8
),
_mm_set1_epi16
(
aux8
[
0
]
-
8
));
const
__m256i
scale_1
=
MM
256_
SET_M
128
I
(
_mm_set1_epi16
(
aux8
[
3
]
-
8
),
_mm_set1_epi16
(
aux8
[
1
]
-
8
));
memcpy
(
&
aux64
,
x
[
i
].
hmask
,
8
);
const
__m128i
haux
=
_mm_set_epi64x
(
aux64
>>
1
,
aux64
>>
0
);
__m256i
q3h_0
=
_mm
256_
set_m
128
i
(
_mm_srli_epi16
(
haux
,
2
),
haux
);
__m256i
q3h_0
=
MM
256_
SET_M
128
I
(
_mm_srli_epi16
(
haux
,
2
),
haux
);
__m256i
q3h_1
=
_mm256_srli_epi16
(
q3h_0
,
4
);
q3h_0
=
_mm256_slli_epi16
(
_mm256_andnot_si256
(
q3h_0
,
m1
),
2
);
q3h_1
=
_mm256_slli_epi16
(
_mm256_andnot_si256
(
q3h_1
,
m1
),
2
);
...
...
@@ -2344,7 +2346,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
const
__m128i
q3bits
=
_mm_loadu_si128
((
const
__m128i
*
)
q3
);
// prepare low and high bits
const
__m256i
q3aux
=
_mm
256_
set_m
128
i
(
_mm_srli_epi16
(
q3bits
,
2
),
q3bits
);
const
__m256i
q3aux
=
MM
256_
SET_M
128
I
(
_mm_srli_epi16
(
q3bits
,
2
),
q3bits
);
const
__m256i
q3l_0
=
_mm256_and_si256
(
q3aux
,
m3
);
const
__m256i
q3l_1
=
_mm256_and_si256
(
_mm256_srli_epi16
(
q3aux
,
4
),
m3
);
...
...
@@ -2455,7 +2457,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
p16_0
=
_mm_add_epi32
(
p16_0
,
p16_2
);
p16_1
=
_mm_add_epi32
(
p16_1
,
p16_3
);
__m256i
p16
=
_mm
256_
set_m
128
i
(
p16_1
,
p16_0
);
__m256i
p16
=
MM
256_
SET_M
128
I
(
p16_1
,
p16_0
);
// multiply with block scale and accumulate
acc
=
_mm256_add_ps
(
_mm256_mul_ps
(
_mm256_broadcast_ss
(
&
d
),
_mm256_cvtepi32_ps
(
p16
)),
acc
);
...
...
@@ -2646,7 +2648,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
acc_m
=
_mm_fmadd_ps
(
_mm_set1_ps
(
dmin
),
_mm_cvtepi32_ps
(
prod
),
acc_m
);
const
__m128i
sc128
=
_mm256_extracti128_si256
(
mins_and_scales
,
0
);
const
__m256i
scales
=
_mm
256_
set_m
128
i
(
sc128
,
sc128
);
const
__m256i
scales
=
MM
256_
SET_M
128
I
(
sc128
,
sc128
);
__m256i
sumi
=
_mm256_setzero_si256
();
...
...
@@ -2753,7 +2755,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
}
__m256
vd
=
_mm256_set1_ps
(
d
);
__m256i
sumi
=
_mm
256_
set_m
128
i
(
sumi_1
,
sumi_0
);
__m256i
sumi
=
MM
256_
SET_M
128
I
(
sumi_1
,
sumi_0
);
acc
=
_mm256_add_ps
(
_mm256_mul_ps
(
vd
,
_mm256_cvtepi32_ps
(
sumi
)),
acc
);
}
...
...
@@ -2994,11 +2996,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
const
__m128i
p32_0
=
_mm_madd_epi16
(
_mm_set1_epi16
(
scales
[
0
]),
p16_0
);
const
__m128i
p32_1
=
_mm_madd_epi16
(
_mm_set1_epi16
(
scales
[
0
]),
p16_1
);
acc
=
_mm256_add_ps
(
_mm256_mul_ps
(
vd
,
_mm256_cvtepi32_ps
(
_mm
256_
set_m
128
i
(
p32_1
,
p32_0
))),
acc
);
acc
=
_mm256_add_ps
(
_mm256_mul_ps
(
vd
,
_mm256_cvtepi32_ps
(
MM
256_
SET_M
128
I
(
p32_1
,
p32_0
))),
acc
);
const
__m128i
p32_2
=
_mm_madd_epi16
(
_mm_set1_epi16
(
scales
[
1
]),
p16_2
);
const
__m128i
p32_3
=
_mm_madd_epi16
(
_mm_set1_epi16
(
scales
[
1
]),
p16_3
);
acc
=
_mm256_add_ps
(
_mm256_mul_ps
(
vd
,
_mm256_cvtepi32_ps
(
_mm
256_
set_m
128
i
(
p32_3
,
p32_2
))),
acc
);
acc
=
_mm256_add_ps
(
_mm256_mul_ps
(
vd
,
_mm256_cvtepi32_ps
(
MM
256_
SET_M
128
I
(
p32_3
,
p32_2
))),
acc
);
}
...
...
@@ -3186,7 +3188,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
summs
+=
dmin
*
_mm_extract_epi32
(
hsum
,
0
);
const
__m128i
sc128
=
_mm256_extracti128_si256
(
mins_and_scales
,
0
);
const
__m256i
scales
=
_mm
256_
set_m
128
i
(
sc128
,
sc128
);
const
__m256i
scales
=
MM
256_
SET_M
128
I
(
sc128
,
sc128
);
const
__m256i
hbits
=
_mm256_loadu_si256
((
const
__m256i
*
)
x
[
i
].
qh
);
__m256i
hmask
=
mone
;
...
...
@@ -3325,7 +3327,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
}
__m256
vd
=
_mm256_set1_ps
(
d
);
__m256i
sumi
=
_mm
256_
set_m
128
i
(
sumi_1
,
sumi_0
);
__m256i
sumi
=
MM
256_
SET_M
128
I
(
sumi_1
,
sumi_0
);
acc
=
_mm256_add_ps
(
_mm256_mul_ps
(
vd
,
_mm256_cvtepi32_ps
(
sumi
)),
acc
);
}
...
...
@@ -3488,13 +3490,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
const
__m256i
q5bits
=
_mm256_loadu_si256
((
const
__m256i
*
)
q5
);
const
__m256i
scale_l
=
_mm
256_
set_m
128
i
(
_mm_set1_epi16
(
x
[
i
].
scales
[
1
]),
_mm_set1_epi16
(
x
[
i
].
scales
[
0
]));
const
__m256i
scale_h
=
_mm
256_
set_m
128
i
(
_mm_set1_epi16
(
x
[
i
].
scales
[
3
]),
_mm_set1_epi16
(
x
[
i
].
scales
[
2
]));
const
__m256i
scale_l
=
MM
256_
SET_M
128
I
(
_mm_set1_epi16
(
x
[
i
].
scales
[
1
]),
_mm_set1_epi16
(
x
[
i
].
scales
[
0
]));
const
__m256i
scale_h
=
MM
256_
SET_M
128
I
(
_mm_set1_epi16
(
x
[
i
].
scales
[
3
]),
_mm_set1_epi16
(
x
[
i
].
scales
[
2
]));
int64_t
aux64
;
memcpy
(
&
aux64
,
x
[
i
].
qh
,
8
);
const
__m128i
haux128
=
_mm_set_epi64x
(
aux64
>>
1
,
aux64
);
const
__m256i
haux256
=
_mm
256_
set_m
128
i
(
_mm_srli_epi16
(
haux128
,
2
),
haux128
);
const
__m256i
haux256
=
MM
256_
SET_M
128
I
(
_mm_srli_epi16
(
haux128
,
2
),
haux128
);
const
__m256i
q5h_0
=
_mm256_slli_epi16
(
_mm256_andnot_si256
(
haux256
,
mone
),
4
);
const
__m256i
q5h_1
=
_mm256_slli_epi16
(
_mm256_andnot_si256
(
_mm256_srli_epi16
(
haux256
,
4
),
mone
),
4
);
...
...
@@ -3569,7 +3571,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
const
__m128i
dot_0
=
_mm_sub_epi32
(
_mm_add_epi32
(
p16_0
,
p16_2
),
_mm_add_epi32
(
s16_0
,
s16_2
));
const
__m128i
dot_1
=
_mm_sub_epi32
(
_mm_add_epi32
(
p16_1
,
p16_3
),
_mm_add_epi32
(
s16_1
,
s16_3
));
acc
=
_mm256_add_ps
(
_mm256_mul_ps
(
_mm256_set1_ps
(
d
),
_mm256_cvtepi32_ps
(
_mm
256_
set_m
128
i
(
dot_1
,
dot_0
))),
acc
);
acc
=
_mm256_add_ps
(
_mm256_mul_ps
(
_mm256_set1_ps
(
d
),
_mm256_cvtepi32_ps
(
MM
256_
SET_M
128
I
(
dot_1
,
dot_0
))),
acc
);
}
...
...
@@ -3951,7 +3953,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
}
__m256i
sumi
=
_mm
256_
set_m
128
i
(
sumi_1
,
sumi_0
);
__m256i
sumi
=
MM
256_
SET_M
128
I
(
sumi_1
,
sumi_0
);
acc
=
_mm256_add_ps
(
_mm256_mul_ps
(
_mm256_broadcast_ss
(
&
d
),
_mm256_cvtepi32_ps
(
sumi
)),
acc
);
}
...
...
@@ -4109,8 +4111,8 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
const
__m256i
q4bits1
=
_mm256_loadu_si256
((
const
__m256i
*
)
q4
);
const
__m128i
q4bitsH
=
_mm_loadu_si128
((
const
__m128i
*
)
qh
);
const
__m256i
q4h_0
=
_mm256_slli_epi16
(
_mm256_and_si256
(
_mm
256_
set_m
128
i
(
_mm_srli_epi16
(
q4bitsH
,
2
),
q4bitsH
),
m2
),
4
);
const
__m256i
q4h_1
=
_mm256_slli_epi16
(
_mm256_and_si256
(
_mm
256_
set_m
128
i
(
_mm_srli_epi16
(
q4bitsH
,
6
),
_mm_srli_epi16
(
q4bitsH
,
4
)),
m2
),
4
);
const
__m256i
q4h_0
=
_mm256_slli_epi16
(
_mm256_and_si256
(
MM
256_
SET_M
128
I
(
_mm_srli_epi16
(
q4bitsH
,
2
),
q4bitsH
),
m2
),
4
);
const
__m256i
q4h_1
=
_mm256_slli_epi16
(
_mm256_and_si256
(
MM
256_
SET_M
128
I
(
_mm_srli_epi16
(
q4bitsH
,
6
),
_mm_srli_epi16
(
q4bitsH
,
4
)),
m2
),
4
);
const
__m256i
q4_0
=
_mm256_or_si256
(
_mm256_and_si256
(
q4bits1
,
m4
),
q4h_0
);
const
__m256i
q4_1
=
_mm256_or_si256
(
_mm256_and_si256
(
_mm256_srli_epi16
(
q4bits1
,
4
),
m4
),
q4h_1
);
...
...
@@ -4203,7 +4205,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
sumi_0
=
_mm_add_epi32
(
sumi_0
,
_mm_add_epi32
(
p16_0
,
p16_2
));
sumi_1
=
_mm_add_epi32
(
sumi_1
,
_mm_add_epi32
(
p16_1
,
p16_3
));
acc
=
_mm256_add_ps
(
_mm256_mul_ps
(
_mm256_broadcast_ss
(
&
d
),
_mm256_cvtepi32_ps
(
_mm
256_
set_m
128
i
(
sumi_1
,
sumi_0
))),
acc
);
acc
=
_mm256_add_ps
(
_mm256_mul_ps
(
_mm256_broadcast_ss
(
&
d
),
_mm256_cvtepi32_ps
(
MM
256_
SET_M
128
I
(
sumi_1
,
sumi_0
))),
acc
);
}
*
s
=
hsum_float_8
(
acc
);
...
...
llama/k_quants.h
View file @
7a1c3e62
/**
* llama.cpp - git
d91f3f0c55663719ea03b76311e8c36ed55eb0e2
* llama.cpp - git
c574bddb368424b5996cbee2ec45ec050967d404
*
* MIT License
*
...
...
llama/llama-util.h
View file @
7a1c3e62
/**
* llama.cpp - git
d91f3f0c55663719ea03b76311e8c36ed55eb0e2
* llama.cpp - git
c574bddb368424b5996cbee2ec45ec050967d404
*
* MIT License
*
...
...
llama/llama.cpp
View file @
7a1c3e62
/**
* llama.cpp - git
d91f3f0c55663719ea03b76311e8c36ed55eb0e2
* llama.cpp - git
c574bddb368424b5996cbee2ec45ec050967d404
*
* MIT License
*
...
...
@@ -82,8 +82,14 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
#include "ggml-alloc.h"
#define LLAMA_USE_ALLOCATOR
#else
#define LLAMA_USE_SCRATCH
#define LLAMA_MAX_SCRATCH_BUFFERS 16
#endif
// available llama models
enum
e_model
{
...
...
@@ -353,13 +359,22 @@ struct llama_model {
struct
llama_context
{
llama_context
(
const
llama_model
&
model
)
:
model
(
model
),
t_load_us
(
model
.
t_load_us
),
t_start_us
(
model
.
t_start_us
)
{}
#ifdef GGML_USE_METAL
~
llama_context
()
{
if
(
model_owner
)
{
delete
&
model
;
}
#ifdef GGML_USE_METAL
if
(
ctx_metal
)
{
ggml_metal_free
(
ctx_metal
);
}
}
#endif
#ifdef LLAMA_USE_ALLOCATOR
if
(
alloc
)
{
ggml_allocr_free
(
alloc
);
}
#endif
}
std
::
mt19937
rng
;
bool
has_evaluated_once
=
false
;
...
...
@@ -397,7 +412,17 @@ struct llama_context {
// memory buffers used to evaluate the model
// TODO: move in llama_state
llama_ctx_buffer
buf_compute
;
#ifdef LLAMA_USE_ALLOCATOR
llama_ctx_buffer
buf_alloc
;
ggml_allocr
*
alloc
=
NULL
;
#endif
#ifdef LLAMA_USE_SCRATCH
llama_ctx_buffer
buf_scratch
[
LLAMA_MAX_SCRATCH_BUFFERS
];
int
buf_last
=
0
;
size_t
buf_max_size
[
LLAMA_MAX_SCRATCH_BUFFERS
]
=
{
0
};
#endif
#ifdef GGML_USE_METAL
ggml_metal_context
*
ctx_metal
=
NULL
;
...
...
@@ -407,9 +432,6 @@ struct llama_context {
ggml_mpi_context
*
ctx_mpi
=
NULL
;
#endif
int
buf_last
=
0
;
size_t
buf_max_size
[
LLAMA_MAX_SCRATCH_BUFFERS
]
=
{
0
};
void
use_buf
(
struct
ggml_context
*
ctx
,
int
i
)
{
#if defined(LLAMA_USE_SCRATCH)
size_t
last_size
=
0
;
...
...
@@ -905,6 +927,7 @@ struct llama_context_params llama_context_default_params() {
/*.progress_callback =*/
nullptr
,
/*.progress_callback_user_data =*/
nullptr
,
/*.low_vram =*/
false
,
/*.mul_mat_q =*/
false
,
/*.f16_kv =*/
true
,
/*.logits_all =*/
false
,
/*.vocab_only =*/
false
,
...
...
@@ -1032,6 +1055,7 @@ static void llama_model_load_internal(
int
n_gpu_layers
,
int
main_gpu
,
const
float
*
tensor_split
,
const
bool
mul_mat_q
,
float
rope_freq_base
,
float
rope_freq_scale
,
bool
low_vram
,
...
...
@@ -1160,9 +1184,11 @@ static void llama_model_load_internal(
}
(
void
)
main_gpu
;
(
void
)
mul_mat_q
;
#if defined(GGML_USE_CUBLAS)
fprintf
(
stderr
,
"%s: using CUDA for GPU acceleration
\n
"
,
__func__
);
ggml_cuda_set_main_device
(
main_gpu
);
ggml_cuda_set_mul_mat_q
(
mul_mat_q
);
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
#elif defined(GGML_USE_CLBLAST)
...
...
@@ -1256,12 +1282,16 @@ static void llama_model_load_internal(
const
size_t
scale
=
memory_type
==
GGML_TYPE_F32
?
2
:
1
;
// this is the total memory required to run the inference
const
size_t
mem_required
=
size_t
mem_required
=
ctx_size
+
mmapped_size
-
vram_weights
+
// weights in VRAM not in memory
mmapped_size
-
vram_weights
;
// weights in VRAM not in memory
#ifndef LLAMA_USE_ALLOCATOR
mem_required
+=
MEM_REQ_SCRATCH0
(
hparams
.
n_ctx
).
at
(
model
.
type
)
+
MEM_REQ_SCRATCH1
().
at
(
model
.
type
)
+
MEM_REQ_EVAL
().
at
(
model
.
type
);
#endif
// this is the memory required by one llama_state
const
size_t
mem_required_state
=
...
...
@@ -1367,6 +1397,7 @@ static bool llama_model_load(
int
n_gpu_layers
,
int
main_gpu
,
const
float
*
tensor_split
,
const
bool
mul_mat_q
,
float
rope_freq_base
,
float
rope_freq_scale
,
bool
low_vram
,
...
...
@@ -1377,7 +1408,8 @@ static bool llama_model_load(
llama_progress_callback
progress_callback
,
void
*
progress_callback_user_data
)
{
try
{
llama_model_load_internal
(
fname
,
model
,
vocab
,
n_ctx
,
n_batch
,
n_gqa
,
rms_norm_eps
,
n_gpu_layers
,
main_gpu
,
tensor_split
,
rope_freq_base
,
rope_freq_scale
,
low_vram
,
memory_type
,
llama_model_load_internal
(
fname
,
model
,
vocab
,
n_ctx
,
n_batch
,
n_gqa
,
rms_norm_eps
,
n_gpu_layers
,
main_gpu
,
tensor_split
,
mul_mat_q
,
rope_freq_base
,
rope_freq_scale
,
low_vram
,
memory_type
,
use_mmap
,
use_mlock
,
vocab_only
,
progress_callback
,
progress_callback_user_data
);
return
true
;
}
catch
(
const
std
::
exception
&
err
)
{
...
...
@@ -1386,32 +1418,15 @@ static bool llama_model_load(
}
}
// evaluate the transformer
//
// - lctx: llama context
// - tokens: new batch of tokens to process
// - embd embeddings input
// - n_tokens number of tokens
// - n_past: the context size so far
// - n_threads: number of threads to use
//
static
bool
llama_eval_internal
(
static
struct
ggml_cgraph
*
llama_build_graph
(
llama_context
&
lctx
,
const
llama_token
*
tokens
,
const
float
*
embd
,
int
n_tokens
,
int
n_past
,
int
n_threads
,
const
char
*
cgraph_fname
)
{
int
n_past
)
{
LLAMA_ASSERT
((
!
tokens
&&
embd
)
||
(
tokens
&&
!
embd
));
#ifdef GGML_USE_MPI
ggml_mpi_eval_init
(
lctx
.
ctx_mpi
,
&
n_tokens
,
&
n_past
,
&
n_threads
);
#endif
const
int64_t
t_start_us
=
ggml_time_us
();
const
int
N
=
n_tokens
;
const
auto
&
model
=
lctx
.
model
;
...
...
@@ -1427,10 +1442,8 @@ static bool llama_eval_internal(
const
int64_t
n_head
=
hparams
.
n_head
;
const
int64_t
n_head_kv
=
hparams
.
n_head_kv
;
const
int64_t
n_embd_head
=
hparams
.
n_embd_head
();
const
int64_t
n_vocab
=
hparams
.
n_vocab
;
const
int64_t
n_embd_gqa
=
hparams
.
n_embd_gqa
();
LLAMA_ASSERT
(
n_embd_head
==
hparams
.
n_rot
);
const
float
freq_base
=
hparams
.
rope_freq_base
;
...
...
@@ -1442,26 +1455,35 @@ static bool llama_eval_internal(
auto
&
mem_per_token
=
lctx
.
mem_per_token
;
auto
&
buf_compute
=
lctx
.
buf_compute
;
struct
ggml_init_params
params
=
{
/*.mem_size =*/
buf_compute
.
size
,
/*.mem_buffer =*/
buf_compute
.
addr
,
/*.no_alloc =*/
false
,
};
#ifdef LLAMA_USE_ALLOCATOR
params
.
no_alloc
=
true
;
#endif
struct
ggml_context
*
ctx0
=
ggml_init
(
params
);
ggml_cgraph
*
gf
=
ggml_new_graph
(
ctx0
);
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
n_threads
=
N
>=
32
&&
ggml_cpu_has_blas
()
&&
!
ggml_cpu_has_gpublas
()
?
1
:
n_threads
;
struct
ggml_tensor
*
cur
;
struct
ggml_tensor
*
inpL
;
if
(
tokens
)
{
struct
ggml_tensor
*
inp_tokens
=
ggml_new_tensor_1d
(
ctx0
,
GGML_TYPE_I32
,
N
);
#ifdef LLAMA_USE_ALLOCATOR
ggml_allocr_alloc
(
lctx
.
alloc
,
inp_tokens
);
if
(
!
ggml_allocr_is_measure
(
lctx
.
alloc
))
{
memcpy
(
inp_tokens
->
data
,
tokens
,
N
*
ggml_element_size
(
inp_tokens
));
}
#else
memcpy
(
inp_tokens
->
data
,
tokens
,
N
*
ggml_element_size
(
inp_tokens
));
#endif
ggml_set_name
(
inp_tokens
,
"inp_tokens"
);
inpL
=
ggml_get_rows
(
ctx0
,
model
.
tok_embeddings
,
inp_tokens
);
...
...
@@ -1471,7 +1493,15 @@ static bool llama_eval_internal(
#endif
inpL
=
ggml_new_tensor_2d
(
ctx0
,
GGML_TYPE_F32
,
n_embd
,
N
);
#ifdef LLAMA_USE_ALLOCATOR
ggml_allocr_alloc
(
lctx
.
alloc
,
inpL
);
if
(
!
ggml_allocr_is_measure
(
lctx
.
alloc
))
{
memcpy
(
inpL
->
data
,
embd
,
N
*
n_embd
*
ggml_element_size
(
inpL
));
}
#else
memcpy
(
inpL
->
data
,
embd
,
N
*
n_embd
*
ggml_element_size
(
inpL
));
#endif
}
const
int
i_gpu_start
=
n_layer
-
n_gpu_layers
;
...
...
@@ -1498,6 +1528,17 @@ static bool llama_eval_internal(
}
#endif // GGML_USE_CUBLAS
struct
ggml_tensor
*
KQ_scale
=
ggml_new_tensor_1d
(
ctx0
,
GGML_TYPE_F32
,
1
);
#ifdef LLAMA_USE_ALLOCATOR
ggml_allocr_alloc
(
lctx
.
alloc
,
KQ_scale
);
if
(
!
ggml_allocr_is_measure
(
lctx
.
alloc
))
{
ggml_set_f32
(
KQ_scale
,
1.0
f
/
sqrtf
(
float
(
n_embd
)
/
n_head
));
}
#else
ggml_set_f32
(
KQ_scale
,
1.0
f
/
sqrtf
(
float
(
n_embd
)
/
n_head
));
#endif
ggml_set_name
(
KQ_scale
,
"1/sqrt(n_embd_head)"
);
for
(
int
il
=
0
;
il
<
n_layer
;
++
il
)
{
ggml_format_name
(
inpL
,
"layer_inp_%d"
,
il
);
...
...
@@ -1593,9 +1634,6 @@ static bool llama_eval_internal(
ggml_set_name
(
KQ
,
"KQ"
);
// KQ_scaled = KQ / sqrt(n_embd_head)
struct
ggml_tensor
*
KQ_scale
=
ggml_new_f32
(
ctx0
,
1.0
f
/
sqrtf
(
float
(
n_embd
)
/
n_head
));
ggml_set_name
(
KQ_scale
,
"1/sqrt(n_embd_head)"
);
// KQ_scaled shape [n_past + N, N, n_head, 1]
struct
ggml_tensor
*
KQ_scaled
=
ggml_scale_inplace
(
ctx0
,
KQ
,
KQ_scale
);
offload_func_kq
(
KQ_scaled
);
...
...
@@ -1711,9 +1749,6 @@ static bool llama_eval_internal(
lctx
.
use_buf
(
ctx0
,
0
);
// used at the end to optionally extract the embeddings
struct
ggml_tensor
*
embeddings
=
NULL
;
// norm
{
cur
=
ggml_rms_norm
(
ctx0
,
inpL
,
rms_norm_eps
);
...
...
@@ -1724,8 +1759,6 @@ static bool llama_eval_internal(
cur
=
ggml_mul
(
ctx0
,
cur
,
model
.
norm
);
// offload_func_nr(cur); // TODO CPU + GPU mirrored backend
ggml_set_name
(
cur
,
"result_norm"
);
embeddings
=
cur
;
}
// lm_head
...
...
@@ -1737,12 +1770,88 @@ static bool llama_eval_internal(
// logits -> probs
//cur = ggml_soft_max_inplace(ctx0, cur);
// run the computation
ggml_build_forward_expand
(
gf
,
cur
);
// fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs);
if
(
mem_per_token
==
0
)
{
mem_per_token
=
ggml_used_mem
(
ctx0
)
/
N
;
}
#if 0
printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
ggml_used_mem(ctx0)/1024.0/1024.0,
lctx.get_buf_max_mem(0)/1024.0/1024.0,
lctx.get_buf_max_mem(1)/1024.0/1024.0,
lctx.work_buffer.size()/1024.0/1024.0,
n_past, N);
#endif
ggml_free
(
ctx0
);
return
gf
;
}
// evaluate the transformer
//
// - lctx: llama context
// - tokens: new batch of tokens to process
// - embd embeddings input
// - n_tokens number of tokens
// - n_past: the context size so far
// - n_threads: number of threads to use
//
static
bool
llama_eval_internal
(
llama_context
&
lctx
,
const
llama_token
*
tokens
,
const
float
*
embd
,
int
n_tokens
,
int
n_past
,
int
n_threads
,
const
char
*
cgraph_fname
)
{
LLAMA_ASSERT
((
!
tokens
&&
embd
)
||
(
tokens
&&
!
embd
));
const
int64_t
t_start_us
=
ggml_time_us
();
#ifdef GGML_USE_MPI
ggml_mpi_eval_init
(
lctx
.
ctx_mpi
,
&
n_tokens
,
&
n_past
,
&
n_threads
);
#endif
const
int
N
=
n_tokens
;
const
auto
&
model
=
lctx
.
model
;
const
auto
&
hparams
=
model
.
hparams
;
const
auto
&
kv_self
=
lctx
.
kv_self
;
LLAMA_ASSERT
(
!!
kv_self
.
ctx
);
const
int64_t
n_embd
=
hparams
.
n_embd
;
const
int64_t
n_vocab
=
hparams
.
n_vocab
;
#ifdef LLAMA_USE_ALLOCATOR
ggml_allocr_reset
(
lctx
.
alloc
);
#endif
ggml_cgraph
*
gf
=
llama_build_graph
(
lctx
,
tokens
,
embd
,
n_tokens
,
n_past
);
#ifdef LLAMA_USE_ALLOCATOR
ggml_allocr_alloc_graph
(
lctx
.
alloc
,
gf
);
#endif
// fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
n_threads
=
N
>=
32
&&
ggml_cpu_has_blas
()
&&
!
ggml_cpu_has_gpublas
()
?
1
:
n_threads
;
struct
ggml_tensor
*
res
=
gf
->
nodes
[
gf
->
n_nodes
-
1
];
struct
ggml_tensor
*
embeddings
=
gf
->
nodes
[
gf
->
n_nodes
-
2
];
LLAMA_ASSERT
(
strcmp
(
res
->
name
,
"result_output"
)
==
0
);
LLAMA_ASSERT
(
strcmp
(
embeddings
->
name
,
"result_norm"
)
==
0
);
#if GGML_USE_MPI
const
int64_t
n_layer
=
hparams
.
n_layer
;
ggml_mpi_graph_compute_pre
(
lctx
.
ctx_mpi
,
gf
,
n_layer
);
#endif
...
...
@@ -1754,7 +1863,10 @@ static bool llama_eval_internal(
//}
ggml_metal_set_n_cb
(
lctx
.
ctx_metal
,
n_threads
);
ggml_metal_graph_compute
(
lctx
.
ctx_metal
,
gf
);
ggml_metal_get_tensor
(
lctx
.
ctx_metal
,
cur
);
ggml_metal_get_tensor
(
lctx
.
ctx_metal
,
res
);
if
(
!
lctx
.
embedding
.
empty
())
{
ggml_metal_get_tensor
(
lctx
.
ctx_metal
,
embeddings
);
}
}
else
{
// IMPORTANT:
// Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
...
...
@@ -1785,8 +1897,6 @@ static bool llama_eval_internal(
// update kv token count
lctx
.
kv_self
.
n
=
n_past
+
N
;
struct
ggml_tensor
*
res
=
gf
->
nodes
[
gf
->
n_nodes
-
1
];
if
(
cgraph_fname
)
{
ggml_graph_export
(
gf
,
cgraph_fname
);
}
...
...
@@ -1824,21 +1934,6 @@ static bool llama_eval_internal(
memcpy
(
embedding_out
.
data
(),
(
float
*
)
ggml_get_data
(
embeddings
)
+
(
n_embd
*
(
N
-
1
)),
sizeof
(
float
)
*
n_embd
);
}
if
(
mem_per_token
==
0
)
{
mem_per_token
=
ggml_used_mem
(
ctx0
)
/
N
;
}
#if 0
printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
ggml_used_mem(ctx0)/1024.0/1024.0,
lctx.get_buf_max_mem(0)/1024.0/1024.0,
lctx.get_buf_max_mem(1)/1024.0/1024.0,
lctx.work_buffer.size()/1024.0/1024.0,
n_past, N);
#endif
ggml_free
(
ctx0
);
// measure the performance only for the single-token evals
if
(
N
==
1
)
{
lctx
.
t_eval_us
+=
ggml_time_us
()
-
t_start_us
;
...
...
@@ -1950,7 +2045,9 @@ struct llama_tokenizer {
if
(
token
==
vocab_
.
token_to_id
.
end
())
{
// output any symbols that did not form tokens as bytes.
for
(
int
j
=
0
;
j
<
(
int
)
symbol
.
n
;
++
j
)
{
llama_vocab
::
id
token_id
=
static_cast
<
uint8_t
>
(
symbol
.
text
[
j
])
+
3
;
// NOTE: old version, before #2420 - not sure what are the implications of this
//llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
llama_vocab
::
id
token_id
=
vocab_
.
token_to_id
.
at
(
std
::
string
(
1
,
symbol
.
text
[
j
]));
output
.
push_back
(
token_id
);
}
}
else
{
...
...
@@ -3127,7 +3224,7 @@ struct llama_model * llama_load_model_from_file(
ggml_type
memory_type
=
params
.
f16_kv
?
GGML_TYPE_F16
:
GGML_TYPE_F32
;
if
(
!
llama_model_load
(
path_model
,
*
model
,
model
->
vocab
,
params
.
n_ctx
,
params
.
n_batch
,
params
.
n_gqa
,
params
.
rms_norm_eps
,
params
.
n_gpu_layers
,
params
.
main_gpu
,
params
.
tensor_split
,
params
.
rope_freq_base
,
params
.
rope_freq_scale
,
params
.
low_vram
,
params
.
main_gpu
,
params
.
tensor_split
,
params
.
mul_mat_q
,
params
.
rope_freq_base
,
params
.
rope_freq_scale
,
params
.
low_vram
,
memory_type
,
params
.
use_mmap
,
params
.
use_mlock
,
params
.
vocab_only
,
params
.
progress_callback
,
params
.
progress_callback_user_data
))
{
delete
model
;
...
...
@@ -3204,10 +3301,47 @@ struct llama_context * llama_new_context_with_model(
ctx
->
embedding
.
resize
(
hparams
.
n_embd
);
}
#ifdef LLAMA_USE_ALLOCATOR
{
static
const
size_t
tensor_alignment
=
32
;
// the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
ctx
->
buf_compute
.
resize
(
ggml_tensor_overhead
()
*
GGML_MAX_NODES
+
ggml_graph_overhead
());
// create measure allocator
ctx
->
alloc
=
ggml_allocr_new_measure
(
tensor_alignment
);
// build worst-case graph
int
n_tokens
=
std
::
min
((
int
)
hparams
.
n_ctx
,
params
.
n_batch
);
int
n_past
=
hparams
.
n_ctx
-
n_tokens
;
llama_token
token
=
llama_token_bos
();
// not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
ggml_cgraph
*
gf
=
llama_build_graph
(
*
ctx
,
&
token
,
NULL
,
n_tokens
,
n_past
);
// measure memory requirements for the graph
size_t
alloc_size
=
ggml_allocr_alloc_graph
(
ctx
->
alloc
,
gf
)
+
tensor_alignment
;
fprintf
(
stderr
,
"%s: compute buffer total size = %7.2f MB
\n
"
,
__func__
,
(
ctx
->
buf_compute
.
size
+
alloc_size
)
/
1024.0
/
1024.0
);
// debug - for comparison with scratch buffer
//size_t prev_req =
// MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) +
// MEM_REQ_SCRATCH1().at(ctx->model.type) +
// MEM_REQ_EVAL().at(ctx->model.type);
//fprintf(stderr, "%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
// recreate allocator with exact memory requirements
ggml_allocr_free
(
ctx
->
alloc
);
ctx
->
buf_alloc
.
resize
(
alloc_size
);
ctx
->
alloc
=
ggml_allocr_new
(
ctx
->
buf_alloc
.
addr
,
ctx
->
buf_alloc
.
size
,
tensor_alignment
);
}
#else
ctx
->
buf_compute
.
resize
(
MEM_REQ_EVAL
().
at
(
ctx
->
model
.
type
)
+
ggml_graph_overhead
());
#endif
#ifdef LLAMA_USE_SCRATCH
ctx
->
buf_scratch
[
0
].
resize
(
MEM_REQ_SCRATCH0
(
hparams
.
n_ctx
).
at
(
ctx
->
model
.
type
));
ctx
->
buf_scratch
[
1
].
resize
(
MEM_REQ_SCRATCH1
().
at
(
ctx
->
model
.
type
));
#endif
}
#ifdef GGML_USE_METAL
...
...
@@ -3277,9 +3411,6 @@ struct llama_context * llama_init_from_file(
}
void
llama_free
(
struct
llama_context
*
ctx
)
{
if
(
ctx
->
model_owner
)
{
delete
&
ctx
->
model
;
}
delete
ctx
;
}
...
...
llama/llama.h
View file @
7a1c3e62
/**
* llama.cpp - git
d91f3f0c55663719ea03b76311e8c36ed55eb0e2
* llama.cpp - git
c574bddb368424b5996cbee2ec45ec050967d404
*
* MIT License
*
...
...
@@ -134,6 +134,7 @@ extern "C" {
// Keep the booleans together to avoid misalignment during copy-by-value.
bool
low_vram
;
// if true, reduce VRAM usage at the cost of performance
bool
mul_mat_q
;
// if true, use experimental mul_mat_q kernels
bool
f16_kv
;
// use fp16 for KV cache
bool
logits_all
;
// the llama_eval() call computes all logits, not just the last one
bool
vocab_only
;
// only load the vocabulary, no weights
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment