OpenDAS / ollama · Commits · 544b6739

Unverified commit 544b6739, authored Nov 06, 2025 by Daniel Hiltgen, committed via GitHub on Nov 06, 2025.
Commit message: ggml update to b6840 (#12791)
Parent: c4ba257c · Changes: 103

Showing 20 changed files with 991 additions and 566 deletions (+991 −566):
  ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu              +100    −4
  ml/backend/ggml/ggml/src/ggml-cuda/binbcast.cu               +1     −1
  ml/backend/ggml/ggml/src/ggml-cuda/common.cuh                +0     −7
  ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu                   +61   −172
  ml/backend/ggml/ggml/src/ggml-cuda/cpy.cuh                   +1     −5
  ml/backend/ggml/ggml/src/ggml-cuda/fattn-common.cuh          +1     −0
  ml/backend/ggml/ggml/src/ggml-cuda/fattn-vec.cuh             +2     −7
  ml/backend/ggml/ggml/src/ggml-cuda/fattn.cu                 +12     −7
  ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu             +44    −57
  ml/backend/ggml/ggml/src/ggml-cuda/mmf.cu                   +40     −6
  ml/backend/ggml/ggml/src/ggml-cuda/mmf.cuh                 +313    −31
  ml/backend/ggml/ggml/src/ggml-cuda/mmid.cu                 +164     −0
  ml/backend/ggml/ggml/src/ggml-cuda/mmid.cuh                  +5     −0
  ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu                    +3   −166
  ml/backend/ggml/ggml/src/ggml-cuda/mmvf.cu                  +44    −28
  ml/backend/ggml/ggml/src/ggml-cuda/topk-moe.cu             +119    −68
  ml/backend/ggml/ggml/src/ggml-cuda/topk-moe.cuh              +4     −3
  ml/backend/ggml/ggml/src/ggml-hip/CMakeLists.txt             +4     −2
  ml/backend/ggml/ggml/src/ggml-impl.h                        +48     −2
  ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-device.cpp   +25     −0
ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu
#include "argsort.cuh"
#ifdef GGML_CUDA_USE_CUB
# include <cub/cub.cuh>
using
namespace
cub
;
#endif // GGML_CUDA_USE_CUB
static
__global__
void
init_indices
(
int
*
indices
,
const
int
ncols
,
const
int
nrows
)
{
const
int
col
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
const
int
row
=
blockIdx
.
y
;
if
(
col
<
ncols
&&
row
<
nrows
)
{
indices
[
row
*
ncols
+
col
]
=
col
;
}
}
static
__global__
void
init_offsets
(
int
*
offsets
,
const
int
ncols
,
const
int
nrows
)
{
const
int
idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
idx
<=
nrows
)
{
offsets
[
idx
]
=
idx
*
ncols
;
}
}
#ifdef GGML_CUDA_USE_CUB
static
void
argsort_f32_i32_cuda_cub
(
ggml_cuda_pool
&
pool
,
const
float
*
x
,
int
*
dst
,
const
int
ncols
,
const
int
nrows
,
ggml_sort_order
order
,
cudaStream_t
stream
)
{
ggml_cuda_pool_alloc
<
int
>
temp_indices_alloc
(
pool
,
ncols
*
nrows
);
ggml_cuda_pool_alloc
<
float
>
temp_keys_alloc
(
pool
,
ncols
*
nrows
);
ggml_cuda_pool_alloc
<
int
>
offsets_alloc
(
pool
,
nrows
+
1
);
int
*
temp_indices
=
temp_indices_alloc
.
get
();
float
*
temp_keys
=
temp_keys_alloc
.
get
();
int
*
d_offsets
=
offsets_alloc
.
get
();
static
const
int
block_size
=
256
;
const
dim3
grid_size
((
ncols
+
block_size
-
1
)
/
block_size
,
nrows
);
init_indices
<<<
grid_size
,
block_size
,
0
,
stream
>>>
(
temp_indices
,
ncols
,
nrows
);
const
dim3
offset_grid
((
nrows
+
block_size
-
1
)
/
block_size
);
init_offsets
<<<
offset_grid
,
block_size
,
0
,
stream
>>>
(
d_offsets
,
ncols
,
nrows
);
cudaMemcpyAsync
(
temp_keys
,
x
,
ncols
*
nrows
*
sizeof
(
float
),
cudaMemcpyDeviceToDevice
,
stream
);
size_t
temp_storage_bytes
=
0
;
if
(
order
==
GGML_SORT_ORDER_ASC
)
{
DeviceSegmentedRadixSort
::
SortPairs
(
nullptr
,
temp_storage_bytes
,
temp_keys
,
temp_keys
,
// keys (in-place)
temp_indices
,
dst
,
// values (indices)
ncols
*
nrows
,
nrows
,
// num items, num segments
d_offsets
,
d_offsets
+
1
,
0
,
sizeof
(
float
)
*
8
,
// all bits
stream
);
}
else
{
DeviceSegmentedRadixSort
::
SortPairsDescending
(
nullptr
,
temp_storage_bytes
,
temp_keys
,
temp_keys
,
temp_indices
,
dst
,
ncols
*
nrows
,
nrows
,
d_offsets
,
d_offsets
+
1
,
0
,
sizeof
(
float
)
*
8
,
stream
);
}
ggml_cuda_pool_alloc
<
uint8_t
>
temp_storage_alloc
(
pool
,
temp_storage_bytes
);
void
*
d_temp_storage
=
temp_storage_alloc
.
get
();
if
(
order
==
GGML_SORT_ORDER_ASC
)
{
DeviceSegmentedRadixSort
::
SortPairs
(
d_temp_storage
,
temp_storage_bytes
,
temp_keys
,
temp_keys
,
temp_indices
,
dst
,
ncols
*
nrows
,
nrows
,
d_offsets
,
d_offsets
+
1
,
0
,
sizeof
(
float
)
*
8
,
stream
);
}
else
{
DeviceSegmentedRadixSort
::
SortPairsDescending
(
d_temp_storage
,
temp_storage_bytes
,
temp_keys
,
temp_keys
,
temp_indices
,
dst
,
ncols
*
nrows
,
nrows
,
d_offsets
,
d_offsets
+
1
,
0
,
sizeof
(
float
)
*
8
,
stream
);
}
}
#endif // GGML_CUDA_USE_CUB
// Bitonic sort implementation
template
<
typename
T
>
static
inline
__device__
void
ggml_cuda_swap
(
T
&
a
,
T
&
b
)
{
T
tmp
=
a
;
...
...
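Aside (not part of the commit): the CUB path above follows CUB's standard two-phase pattern — call cub::DeviceSegmentedRadixSort::SortPairs once with a null temp buffer to learn the scratch size, allocate, then call it again to sort each row independently. A self-contained sketch on a toy 2x4 matrix; all names here (example_segmented_argsort, h_keys, ...) are invented for the illustration.

#include <cub/cub.cuh>
#include <cuda_runtime.h>

// Standalone sketch: per-row (segmented) argsort of a 2x4 float matrix with CUB.
static void example_segmented_argsort(cudaStream_t stream) {
    const int nrows = 2, ncols = 4, n = nrows * ncols;
    const float h_keys[8] = {3.f, 1.f, 2.f, 0.f,  9.f, 7.f, 8.f, 6.f};
    const int   h_vals[8] = {0, 1, 2, 3,  0, 1, 2, 3}; // per-row column indices
    const int   h_offs[3] = {0, 4, 8};                 // row boundaries (segments)

    float *d_keys_in, *d_keys_out; int *d_vals_in, *d_vals_out, *d_offs;
    cudaMalloc(&d_keys_in,  n * sizeof(float));
    cudaMalloc(&d_keys_out, n * sizeof(float));
    cudaMalloc(&d_vals_in,  n * sizeof(int));
    cudaMalloc(&d_vals_out, n * sizeof(int));
    cudaMalloc(&d_offs,     3 * sizeof(int));
    cudaMemcpyAsync(d_keys_in, h_keys, n * sizeof(float), cudaMemcpyHostToDevice, stream);
    cudaMemcpyAsync(d_vals_in, h_vals, n * sizeof(int),   cudaMemcpyHostToDevice, stream);
    cudaMemcpyAsync(d_offs,    h_offs, 3 * sizeof(int),   cudaMemcpyHostToDevice, stream);

    // Phase 1: null temp storage -> CUB only reports the scratch size it needs.
    size_t tmp_bytes = 0;
    cub::DeviceSegmentedRadixSort::SortPairs(nullptr, tmp_bytes, d_keys_in, d_keys_out, d_vals_in, d_vals_out,
                                             n, nrows, d_offs, d_offs + 1, 0, sizeof(float) * 8, stream);
    void * d_tmp = nullptr;
    cudaMalloc(&d_tmp, tmp_bytes);

    // Phase 2: the actual per-row sort; d_vals_out now holds the argsort indices row by row.
    cub::DeviceSegmentedRadixSort::SortPairs(d_tmp, tmp_bytes, d_keys_in, d_keys_out, d_vals_in, d_vals_out,
                                             n, nrows, d_offs, d_offs + 1, 0, sizeof(float) * 8, stream);

    cudaStreamSynchronize(stream);
    cudaFree(d_tmp); cudaFree(d_offs); cudaFree(d_vals_out); cudaFree(d_vals_in); cudaFree(d_keys_out); cudaFree(d_keys_in);
}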
@@ -65,7 +141,12 @@ static int next_power_of_2(int x) {
    return n;
}

static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
static void argsort_f32_i32_cuda_bitonic(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
    // bitonic sort requires ncols to be power of 2
    const int ncols_pad = next_power_of_2(ncols);
...
...
@@ -77,9 +158,11 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
    GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);

    if (order == GGML_SORT_ORDER_ASC) {
        k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
        k_argsort_f32_i32<GGML_SORT_ORDER_ASC>
            <<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
    } else if (order == GGML_SORT_ORDER_DESC) {
        k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
        k_argsort_f32_i32<GGML_SORT_ORDER_DESC>
            <<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
    } else {
        GGML_ABORT("fatal error");
    }
...
...
@@ -197,6 +280,19 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    if (src0->type == GGML_TYPE_I32) {
        argsort_i32_i32_cuda((const int32_t *) src0_d, (int *) dst_d, ncols, nrows, order, stream);
    } else {
        argsort_f32_i32_cuda(src0_d, (int *) dst_d, ncols, nrows, order, stream);
#ifdef GGML_CUDA_USE_CUB
        const int ncols_pad = next_power_of_2(ncols);
        const size_t shared_mem = ncols_pad * sizeof(int);
        const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;

        if (shared_mem > max_shared_mem || ncols > 1024) {
            ggml_cuda_pool & pool = ctx.pool();
            argsort_f32_i32_cuda_cub(pool, src0_d, (int *) dst_d, ncols, nrows, order, stream);
        } else {
            argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream);
        }
#else
        argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream);
#endif
    }
}
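Aside (not from the commit): the dispatch above only keeps the bitonic kernel when the padded row fits into shared memory and ncols does not exceed 1024; otherwise it falls back to the CUB segmented sort. A minimal sketch of that sizing arithmetic, assuming a simple loop-based next_power_of_2 consistent with the `return n;` tail shown in the hunk further up.

// Illustration only: padding and shared-memory sizing behind the CUB-vs-bitonic choice.
static int next_power_of_2(int x) {   // assumed implementation, matching the `return n;` tail above
    int n = 1;
    while (n < x) {
        n *= 2;
    }
    return n;
}

// ncols = 800  -> ncols_pad = 1024 -> shared_mem = 1024 * sizeof(int) = 4096 B  -> bitonic kernel
// ncols = 1500 -> ncols > 1024 (ncols_pad = 2048, 8192 B of shared memory)      -> CUB segmented sort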
ml/backend/ggml/ggml/src/ggml-cuda/binbcast.cu
...
...
@@ -272,7 +272,7 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
    const uint3 ne12 = init_fastdiv_values((uint32_t) cne1[2]);
    const uint3 ne13 = init_fastdiv_values((uint32_t) cne1[3]);

    if (block_nums.z > 65535) {
    if (block_nums.z > 65535 || block_nums.y > 65535) {
        int block_num = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size;
        const uint3 prod_012 = init_fastdiv_values((uint32_t) (ne0 * ne1 * ne2));
        const uint3 prod_01  = init_fastdiv_values((uint32_t) (ne0 * ne1));
...
...
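Aside (not from the commit): when the launch falls back to a flat 1-D grid because block_nums.y or block_nums.z exceeds the 65535 limit, each thread has to recover its 4-D coordinates from a flat index; prod_012 and prod_01 exist for exactly that. A minimal sketch of the decomposition, written with plain integer division instead of the fastdiv helpers used in the kernel; the function name is invented for the example.

// Illustration only: recovering (i0, i1, i2, i3) of an ne0 x ne1 x ne2 x ne3 tensor from a flat index.
static inline void example_unflatten_index(int idx, int ne0, int ne1, int ne2,
                                           int * i0, int * i1, int * i2, int * i3) {
    const int prod_012 = ne0 * ne1 * ne2;   // elements per i3 slice
    const int prod_01  = ne0 * ne1;         // elements per i2 slice
    *i3 = idx / prod_012;
    *i2 = (idx - *i3 * prod_012) / prod_01;
    *i1 = (idx - *i3 * prod_012 - *i2 * prod_01) / ne0;
    *i0 =  idx - *i3 * prod_012 - *i2 * prod_01 - *i1 * ne0;
}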
ml/backend/ggml/ggml/src/ggml-cuda/common.cuh
...
...
@@ -982,13 +982,6 @@ struct ggml_cuda_graph {
    bool disable_due_to_failed_graph_capture = false;
    int number_consecutive_updates = 0;
    std::vector<ggml_graph_node_properties> ggml_graph_properties;
    bool use_cpy_indirection = false;
    std::vector<char *> cpy_dest_ptrs;
    char ** dest_ptrs_d;
    int dest_ptrs_size = 0;
    // Index to allow each cpy kernel to be aware of it's position within the graph
    // relative to other cpy nodes.
    int graph_cpynode_index = -1;
#endif
};
...
...
ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu (diff collapsed)
ml/backend/ggml/ggml/src/ggml-cuda/cpy.cuh
...
...
@@ -2,10 +2,6 @@
#define CUDA_CPY_BLOCK_SIZE 64

void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection = false);
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1);

void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void * ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1);

void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream);
ml/backend/ggml/ggml/src/ggml-cuda/fattn-common.cuh
...
...
@@ -895,6 +895,7 @@ void launch_fattn(
    const dim3 block_dim(warp_size, nwarps, 1);
    int max_blocks_per_sm = 1; // Max. number of active blocks limited by occupancy.
    CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, fattn_kernel, block_dim.x * block_dim.y * block_dim.z, nbytes_shared));
    GGML_ASSERT(max_blocks_per_sm > 0);
    int parallel_blocks = max_blocks_per_sm;

    dim3 blocks_num;
...
...
ml/backend/ggml/ggml/src/ggml-cuda/fattn-vec.cuh
...
...
@@ -516,8 +516,8 @@ void ggml_cuda_flash_attn_ext_vec_case_impl(ggml_backend_cuda_context & ctx, ggm
    const int nthreads = ggml_cuda_fattn_vec_get_nthreads_host(cc);
    const int nwarps = nthreads / WARP_SIZE;
    fattn_kernel_t fattn_kernel = flash_attn_ext_vec<D, cols_per_block, type_K, type_V, use_logit_softcap>;
    constexpr bool need_f16_K = false;
    constexpr bool need_f16_V = false;
    const bool need_f16_K = type_K == GGML_TYPE_F16;
    const bool need_f16_V = type_V == GGML_TYPE_F16;
    constexpr size_t nbytes_shared = 0;
    launch_fattn<D, cols_per_block, 1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
}
...
...
@@ -526,11 +526,6 @@ template <int D, ggml_type type_K, ggml_type type_V>
void ggml_cuda_flash_attn_ext_vec_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * KQV = dst;
    const ggml_tensor * Q   = dst->src[0];
    const ggml_tensor * K   = dst->src[1];
    const ggml_tensor * V   = dst->src[2];

    GGML_ASSERT(K->type == type_K);
    GGML_ASSERT(V->type == type_V);

    float logit_softcap;
    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
...
...
ml/backend/ggml/ggml/src/ggml-cuda/fattn.cu
...
...
@@ -116,11 +116,15 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg
}
}
#define FATTN_VEC_CASE(D, type_K, type_V) \
if (Q->ne[0] == (D) && K->type == (type_K) && V->type == (type_V)) { \
ggml_cuda_flash_attn_ext_vec_case<D, type_K, type_V>(ctx, dst); \
return; \
} \
#define FATTN_VEC_CASE(D, type_K, type_V) \
{ \
const bool type_K_okay = K->type == (type_K) || (K->type == GGML_TYPE_F32 && (type_K) == GGML_TYPE_F16); \
const bool type_V_okay = V->type == (type_V) || (V->type == GGML_TYPE_F32 && (type_V) == GGML_TYPE_F16); \
if (Q->ne[0] == (D) && type_K_okay && type_V_okay) { \
ggml_cuda_flash_attn_ext_vec_case<D, type_K, type_V>(ctx, dst); \
return; \
} \
} \
#define FATTN_VEC_CASES_ALL_D(type_K, type_V) \
FATTN_VEC_CASE( 64, type_K, type_V) \
...
...
@@ -247,6 +251,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
#endif // GGML_CUDA_FA_ALL_QUANTS

    switch (K->type) {
        case GGML_TYPE_F32:
        case GGML_TYPE_F16:
            break;
        case GGML_TYPE_Q4_1:
...
...
@@ -272,7 +277,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
    // If Turing tensor cores available, use them:
    if (turing_mma_available(cc) && K->ne[1] % FATTN_KQ_STRIDE == 0 && Q->ne[0] != 40) {
        if (can_use_vector_kernel) {
            if (K->type == GGML_TYPE_F16 && V->type == GGML_TYPE_F16) {
            if (!ggml_is_quantized(K->type) && !ggml_is_quantized(V->type)) {
                if (cc >= GGML_CUDA_CC_ADA_LOVELACE && Q->ne[1] == 1 && Q->ne[3] == 1 && !(gqa_ratio > 4 && K->ne[1] >= 8192)) {
                    return BEST_FATTN_KERNEL_VEC;
                }
...
...
@@ -305,7 +310,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
    // If there are no tensor cores available, use the generic tile kernel:
    if (can_use_vector_kernel) {
        if (K->type == GGML_TYPE_F16 && V->type == GGML_TYPE_F16) {
        if (!ggml_is_quantized(K->type) && !ggml_is_quantized(V->type)) {
            if (Q->ne[1] == 1) {
                if (!gqa_opt_applies) {
                    return BEST_FATTN_KERNEL_VEC;
...
...
ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
...
...
@@ -2774,11 +2774,10 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
}

#ifdef USE_CUDA_GRAPH
static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
    int batch_size, bool use_cuda_graph) {

    // Loop over nodes in GGML graph to obtain info needed for CUDA graph
    cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();

    const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected";
    const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj";
...
...
@@ -2839,33 +2838,11 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
            }
        }

        if (node->op == GGML_OP_CPY) {

            // Store the pointers which are updated for each token, such that these can be sent
            // to the device and accessed using indirection from CUDA graph
            cuda_ctx->cuda_graph->cpy_dest_ptrs.push_back((char *) node->src[1]->data);

            // store a pointer to each copy op CUDA kernel to identify it later
            void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
            if (!ptr) {
                use_cuda_graph = false;
#ifndef NDEBUG
                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
#endif
            }
        }

        if (!use_cuda_graph) {
            break;
        }
    }

    if (use_cuda_graph) {
        cuda_ctx->cuda_graph->use_cpy_indirection = true;
        // copy pointers to GPU so they can be accessed via indirection within CUDA graph
        ggml_cuda_cpy_dest_ptrs_copy(cuda_ctx->cuda_graph.get(), cuda_ctx->cuda_graph->cpy_dest_ptrs.data(), cuda_ctx->cuda_graph->cpy_dest_ptrs.size(), cuda_ctx->stream());
    }

    return use_cuda_graph;
}
...
...
@@ -2884,7 +2861,6 @@ static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_p
static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
    if (node->data != graph_node_properties->node_address &&
        node->op != GGML_OP_CPY &&
        node->op != GGML_OP_VIEW) {
        return false;
    }
...
...
@@ -2905,7 +2881,6 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
    for (int i = 0; i < GGML_MAX_SRC; i++) {
        if (node->src[i] &&
            node->src[i]->data != graph_node_properties->src_address[i] &&
            node->op != GGML_OP_CPY &&
            node->op != GGML_OP_VIEW) {
            return false;
...
...
@@ -2985,18 +2960,15 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
#endif

    //TODO: remove special case once ggml_can_fuse can handle empty nodes
    std::initializer_list<enum ggml_op> topk_moe_ops           = ggml_cuda_topk_moe_ops(false);
    std::initializer_list<enum ggml_op> topk_moe_ops_with_norm = ggml_cuda_topk_moe_ops(true);

    if (ops.size() == topk_moe_ops_with_norm.size() && std::equal(ops.begin(), ops.end(), topk_moe_ops_with_norm.begin())) {
        if (node_idx + topk_moe_ops_with_norm.size() > (size_t) cgraph->n_nodes) {
            return false;
        }

        for (size_t i = 0; i < topk_moe_ops_with_norm.size(); i++) {
            if (cgraph->nodes[node_idx + i]->op != topk_moe_ops_with_norm.begin()[i]) return false;
        }

    std::initializer_list<enum ggml_op> topk_moe_ops =
        ggml_cuda_topk_moe_ops(/*with_norm*/ false, /*delayed_softmax=*/ false);
    std::initializer_list<enum ggml_op> topk_moe_ops_with_norm =
        ggml_cuda_topk_moe_ops(/*with_norm=*/ true, /*delayed_softmax=*/ false);
    std::initializer_list<enum ggml_op> topk_moe_ops_delayed_softmax =
        ggml_cuda_topk_moe_ops(/*with_norm=*/ false, /*delayed_softmax=*/ true);

    if (ops.size() == topk_moe_ops_with_norm.size() &&
        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 8 })) {
        ggml_tensor * softmax = cgraph->nodes[node_idx];
        ggml_tensor * weights = cgraph->nodes[node_idx + 8];
...
...
@@ -3005,18 +2977,20 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
        }
    }

    if (ops.size() == topk_moe_ops.size() && std::equal(ops.begin(), ops.end(), topk_moe_ops.begin())) {
        if (node_idx + topk_moe_ops.size() > (size_t) cgraph->n_nodes) {
            return false;
    if (ops.size() == topk_moe_ops.size() &&
        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 4 })) {
        ggml_tensor * softmax = cgraph->nodes[node_idx];
        ggml_tensor * weights = cgraph->nodes[node_idx + 4];

        if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
            return true;
        }
    }

        for (size_t i = 0; i < topk_moe_ops.size(); i++) {
            if (cgraph->nodes[node_idx + i]->op != topk_moe_ops.begin()[i]) return false;
        }

    if (ops.size() == topk_moe_ops_delayed_softmax.size() &&
        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 2, node_idx + 5 })) {
        ggml_tensor * softmax = cgraph->nodes[node_idx + 4];
        ggml_tensor * weights = cgraph->nodes[node_idx + 5];
        ggml_tensor * softmax = cgraph->nodes[node_idx];
        ggml_tensor * weights = cgraph->nodes[node_idx + 4];

        if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
            return true;
        }
...
...
@@ -3052,7 +3026,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
    }

    //if rms norm is the B operand, then we don't handle broadcast
    if (rms_norm == mul->src[1] && !ggml_are_same_shape(mul->src[0], rms_norm->src[1])) {
    if (rms_norm == mul->src[1] && !ggml_are_same_shape(mul->src[0], rms_norm)) {
        return false;
    }
...
...
@@ -3121,7 +3095,8 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
            if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ true), {})) {
                ggml_tensor * weights          = cgraph->nodes[i + 8];
                ggml_tensor * selected_experts = cgraph->nodes[i + 3];
                ggml_cuda_op_topk_moe(*cuda_ctx, node, weights, selected_experts, /*with norm*/ true);
                ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, selected_experts, /*with norm*/ true,
                                      /*delayed softmax*/ false);
                i += 8;
                continue;
            }
...
...
@@ -3129,11 +3104,23 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
            if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ false), {})) {
                ggml_tensor * weights          = cgraph->nodes[i + 4];
                ggml_tensor * selected_experts = cgraph->nodes[i + 3];
                ggml_cuda_op_topk_moe(*cuda_ctx, node, weights, selected_experts, /*with norm*/ false);
                ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, selected_experts, /*with norm*/ false,
                                      /*delayed softmax*/ false);
                i += 4;
                continue;
            }

            if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ false, /*delayed softmax*/ true), {})) {
                ggml_tensor * weights = cgraph->nodes[i + 5];
                ggml_tensor * ids     = cgraph->nodes[i + 1];

                ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, ids, /*with norm*/ false, /*delayed_softmax*/ true);

                i += 5;
                continue;
            }

            if (node->op == GGML_OP_ADD) {
                int n_fuse = 0;
                ggml_op ops[8];
...
...
@@ -3278,7 +3265,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
    if (use_cuda_graph) {
        cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);

        use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, batch_size, use_cuda_graph);
        use_cuda_graph = check_node_graph_compatibility(cgraph, batch_size, use_cuda_graph);

        // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
        if (use_cuda_graph && cuda_graph_update_required) {
...
...
@@ -3305,10 +3292,6 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
        CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
    }

    if (!use_cuda_graph) {
        cuda_ctx->cuda_graph->use_cpy_indirection = false;
    }

#else
    bool use_cuda_graph = false;
    bool cuda_graph_update_required = false;
...
...
@@ -3922,12 +3905,16 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_CONV_2D_DW:
        case GGML_OP_CONV_TRANSPOSE_2D:
        case GGML_OP_POOL_2D:
        case GGML_OP_SUM:
        case GGML_OP_ACC:
            return true;
        case GGML_OP_SUM:
            return ggml_is_contiguous_rows(op->src[0]);
        case GGML_OP_ARGSORT:
            // TODO: Support arbitrary column width
#ifndef GGML_CUDA_USE_CUB
            return op->src[0]->ne[0] <= 1024;
#else
            return true;
#endif
        case GGML_OP_SUM_ROWS:
        case GGML_OP_MEAN:
        case GGML_OP_GROUP_NORM:
...
...
ml/backend/ggml/ggml/src/ggml-cuda/mmf.cu
#include "ggml.h"
#include "mmf.cuh"
#include "mmid.cuh"
void
ggml_cuda_mul_mat_f
(
ggml_backend_cuda_context
&
ctx
,
const
ggml_tensor
*
src0
,
const
ggml_tensor
*
src1
,
const
ggml_tensor
*
ids
,
ggml_tensor
*
dst
)
{
GGML_ASSERT
(
src1
->
type
==
GGML_TYPE_F32
);
...
...
@@ -37,6 +39,12 @@ void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * sr
    const int64_t ids_s0 = ids ? ids->nb[0] / ggml_type_size(ids->type) : 0;
    const int64_t ids_s1 = ids ? ids->nb[1] / ggml_type_size(ids->type) : 0;

    mmf_ids_data ids_info{};
    mmf_ids_data * ids_info_ptr = nullptr;
    ggml_cuda_pool_alloc<int32_t> ids_src_compact_dev;
    ggml_cuda_pool_alloc<int32_t> ids_dst_compact_dev;
    ggml_cuda_pool_alloc<int32_t> expert_bounds_dev;

    // For MUL_MAT_ID the memory layout is different than for MUL_MAT:
    const int64_t ncols_dst     = ids ? ne2 : ne1;
    const int64_t nchannels_dst = ids ? ne1 : ne2;
...
...
@@ -54,6 +62,33 @@ void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * sr
        nchannels_y = ids->ne[0];
    }

    if (ids && ncols_dst > 16) {
        const int64_t n_expert_used = ids->ne[0];
        const int64_t n_experts     = ne02;
        const int64_t n_tokens      = ne12;
        const int64_t ne_get_rows   = n_tokens * n_expert_used;

        ids_src_compact_dev.alloc(ctx.pool(), ne_get_rows);
        ids_dst_compact_dev.alloc(ctx.pool(), ne_get_rows);
        expert_bounds_dev.alloc(ctx.pool(), n_experts + 1);

        const int si1  = static_cast<int>(ids_s1);
        const int sis1 = static_cast<int>(src1->nb[2] / src1->nb[1]);

        GGML_ASSERT(sis1 > 0);

        ggml_cuda_launch_mm_ids_helper(ids_d, ids_src_compact_dev.get(), ids_dst_compact_dev.get(), expert_bounds_dev.get(),
            static_cast<int>(n_experts), static_cast<int>(n_tokens), static_cast<int>(n_expert_used), static_cast<int>(ne11),
            si1, sis1, ctx.stream());

        CUDA_CHECK(cudaGetLastError());

        ids_info.ids_src_compact   = ids_src_compact_dev.get();
        ids_info.ids_dst_compact   = ids_dst_compact_dev.get();
        ids_info.expert_bounds_dev = expert_bounds_dev.get();
        ids_info.n_experts         = static_cast<int>(n_experts);
        ids_info.sis1              = sis1;
        ids_info_ptr = &ids_info;
    }

    switch (src0->type) {
        case GGML_TYPE_F32: {
            const float * src0_d = (const float *) src0->data;
...
...
@@ -61,7 +96,7 @@ void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * sr
            mul_mat_f_switch_cols_per_block(src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T,
                stride_col_y/vals_per_T, stride_col_dst, ids_s0, ids_s1, ne02, nchannels_y, nchannels_dst, s02/vals_per_T,
                stride_channel_y, stride_channel_dst,
                ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream());
                ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream(), ids_info_ptr);
        } break;
        case GGML_TYPE_F16: {
            const half2 * src0_d = (const half2 *) src0->data;
...
...
@@ -69,7 +104,7 @@ void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * sr
            mul_mat_f_switch_cols_per_block(src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T,
                stride_col_y/vals_per_T, stride_col_dst, ids_s0, ids_s1, ne02, nchannels_y, nchannels_dst, s02/vals_per_T,
                stride_channel_y, stride_channel_dst,
                ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream());
                ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream(), ids_info_ptr);
        } break;
        case GGML_TYPE_BF16: {
            const nv_bfloat162 * src0_d = (const nv_bfloat162 *) src0->data;
...
...
@@ -77,7 +112,7 @@ void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * sr
            mul_mat_f_switch_cols_per_block(src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T,
                stride_col_y/vals_per_T, stride_col_dst, ids_s0, ids_s1, ne02, nchannels_y, nchannels_dst, s02/vals_per_T,
                stride_channel_y, stride_channel_dst,
                ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream());
                ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream(), ids_info_ptr);
        } break;
        default:
            GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type));
...
...
@@ -98,10 +133,9 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
    }

    if (mul_mat_id) {
        if (type == GGML_TYPE_F32 && src1_ncols > 32) {
        if (src0_ne[1] <= 1024 && src1_ncols > 512) {
            return false;
        }
        if ((type == GGML_TYPE_F16 || type == GGML_TYPE_BF16) && src1_ncols > 64) {
        } else if (src0_ne[1] > 1024 && src1_ncols > 128) {
            return false;
        }
    } else {
...
...
ml/backend/ggml/ggml/src/ggml-cuda/mmf.cuh (diff collapsed)
ml/backend/ggml/ggml/src/ggml-cuda/mmid.cu (new file, mode 100644)
#include "common.cuh"
#include "mmid.cuh"
// To reduce shared memory use, store "it" and "iex_used" with 22/10 bits each.
struct
mm_ids_helper_store
{
uint32_t
data
;
__device__
mm_ids_helper_store
(
const
uint32_t
it
,
const
uint32_t
iex_used
)
{
data
=
(
it
&
0x003FFFFF
)
|
(
iex_used
<<
22
);
}
__device__
uint32_t
it
()
const
{
return
data
&
0x003FFFFF
;
}
__device__
uint32_t
iex_used
()
const
{
return
data
>>
22
;
}
};
static_assert
(
sizeof
(
mm_ids_helper_store
)
==
4
,
"unexpected size for mm_ids_helper_store"
);
// Helper function for mul_mat_id, converts ids to a more convenient format.
// ids_src1 describes how to permute the flattened column indices of src1 in order to get a compact src1 tensor sorted by expert.
// ids_dst describes the same mapping but for the dst tensor.
// The upper and lower bounds for the ith expert in the compact src1 tensor are stored in expert_bounds[i:i+1].
template
<
int
n_expert_used_template
>
__launch_bounds__
(
ggml_cuda_get_physical_warp_size
(),
1
)
static
__global__
void
mm_ids_helper
(
const
int32_t
*
__restrict__
ids
,
int32_t
*
__restrict__
ids_src1
,
int32_t
*
__restrict__
ids_dst
,
int32_t
*
__restrict__
expert_bounds
,
const
int
n_tokens
,
const
int
n_expert_used_var
,
const
int
nchannels_y
,
const
int
si1
,
const
int
sis1
)
{
constexpr
int
warp_size
=
ggml_cuda_get_physical_warp_size
();
const
int
n_expert_used
=
n_expert_used_template
==
0
?
n_expert_used_var
:
n_expert_used_template
;
const
int
expert
=
blockIdx
.
x
;
extern
__shared__
char
data_mm_ids_helper
[];
mm_ids_helper_store
*
store
=
(
mm_ids_helper_store
*
)
data_mm_ids_helper
;
int
nex_prev
=
0
;
// Number of columns for experts with a lower index.
int
it_compact
=
0
;
// Running index for the compact slice of this expert.
if
constexpr
(
n_expert_used_template
==
0
)
{
// Generic implementation:
for
(
int
it
=
0
;
it
<
n_tokens
;
++
it
)
{
int
iex_used
=
-
1
;
// The index at which the expert is used, if any.
for
(
int
iex
=
threadIdx
.
x
;
iex
<
n_expert_used
;
iex
+=
warp_size
)
{
const
int
expert_used
=
ids
[
it
*
si1
+
iex
];
nex_prev
+=
expert_used
<
expert
;
if
(
expert_used
==
expert
)
{
iex_used
=
iex
;
}
}
if
(
iex_used
!=
-
1
)
{
store
[
it_compact
]
=
mm_ids_helper_store
(
it
,
iex_used
);
}
if
(
warp_reduce_any
<
warp_size
>
(
iex_used
!=
-
1
))
{
it_compact
++
;
}
}
}
else
{
// Implementation optimized for specific numbers of experts used:
static_assert
(
n_expert_used
==
6
||
warp_size
%
n_expert_used
==
0
,
"bad n_expert_used"
);
const
int
neu_padded
=
n_expert_used
==
6
?
8
:
n_expert_used
;
// Padded to next higher power of 2.
for
(
int
it0
=
0
;
it0
<
n_tokens
;
it0
+=
warp_size
/
neu_padded
)
{
const
int
it
=
it0
+
threadIdx
.
x
/
neu_padded
;
const
int
iex
=
threadIdx
.
x
%
neu_padded
;
// The index at which the expert is used, if any.
const
int
expert_used
=
(
neu_padded
==
n_expert_used
||
iex
<
n_expert_used
)
&&
it
<
n_tokens
?
ids
[
it
*
si1
+
iex
]
:
INT_MAX
;
const
int
iex_used
=
expert_used
==
expert
?
iex
:
-
1
;
nex_prev
+=
expert_used
<
expert
;
// Whether the threads at this token position have used the expert:
const
int
it_compact_add_self
=
warp_reduce_any
<
neu_padded
>
(
iex_used
!=
-
1
);
// Do a scan over threads at lower token positions in warp to get the correct index for writing data:
int
it_compact_add_lower
=
0
;
#pragma unroll
for
(
int
offset
=
neu_padded
;
offset
<
warp_size
;
offset
+=
neu_padded
)
{
const
int
tmp
=
__shfl_up_sync
(
0xFFFFFFFF
,
it_compact_add_self
,
offset
,
warp_size
);
if
(
threadIdx
.
x
>=
static_cast
<
unsigned
int
>
(
offset
))
{
it_compact_add_lower
+=
tmp
;
}
}
if
(
iex_used
!=
-
1
)
{
store
[
it_compact
+
it_compact_add_lower
]
=
mm_ids_helper_store
(
it
,
iex_used
);
}
// The thread with the highest index in the warp always has the sum over the whole warp, use it to increment all threads:
it_compact
+=
__shfl_sync
(
0xFFFFFFFF
,
it_compact_add_lower
+
it_compact_add_self
,
warp_size
-
1
,
warp_size
);
}
}
nex_prev
=
warp_reduce_sum
<
warp_size
>
(
nex_prev
);
for
(
int
itc
=
threadIdx
.
x
;
itc
<
it_compact
;
itc
+=
warp_size
)
{
const
mm_ids_helper_store
store_it
=
store
[
itc
];
const
int
it
=
store_it
.
it
();
const
int
iex_used
=
store_it
.
iex_used
();
ids_src1
[
nex_prev
+
itc
]
=
it
*
sis1
+
iex_used
%
nchannels_y
;
ids_dst
[
nex_prev
+
itc
]
=
it
*
n_expert_used
+
iex_used
;
}
if
(
threadIdx
.
x
!=
0
)
{
return
;
}
expert_bounds
[
expert
]
=
nex_prev
;
if
(
expert
<
static_cast
<
int
>
(
gridDim
.
x
)
-
1
)
{
return
;
}
expert_bounds
[
gridDim
.
x
]
=
nex_prev
+
it_compact
;
}
template
<
int
n_expert_used_template
>
static
void
launch_mm_ids_helper
(
const
int32_t
*
__restrict__
ids
,
int32_t
*
__restrict__
ids_src1
,
int32_t
*
__restrict__
ids_dst
,
int32_t
*
__restrict__
expert_bounds
,
const
int
n_experts
,
const
int
n_tokens
,
const
int
n_expert_used_var
,
const
int
nchannels_y
,
const
int
si1
,
const
int
sis1
,
cudaStream_t
stream
)
{
GGML_ASSERT
(
n_tokens
<
(
1
<<
22
)
&&
"too few bits in mm_ids_helper_store"
);
GGML_ASSERT
(
n_expert_used_var
<
(
1
<<
10
)
&&
"too few bits in mm_ids_helper_store"
);
const
int
id
=
ggml_cuda_get_device
();
const
int
warp_size
=
ggml_cuda_info
().
devices
[
id
].
warp_size
;
const
size_t
smpbo
=
ggml_cuda_info
().
devices
[
id
].
smpbo
;
CUDA_SET_SHARED_MEMORY_LIMIT
(
mm_ids_helper
<
n_expert_used_template
>
,
smpbo
);
const
dim3
num_blocks
(
n_experts
,
1
,
1
);
const
dim3
block_size
(
warp_size
,
1
,
1
);
const
size_t
nbytes_shared
=
n_tokens
*
sizeof
(
mm_ids_helper_store
);
GGML_ASSERT
(
nbytes_shared
<=
smpbo
);
mm_ids_helper
<
n_expert_used_template
><<<
num_blocks
,
block_size
,
nbytes_shared
,
stream
>>>
(
ids
,
ids_src1
,
ids_dst
,
expert_bounds
,
n_tokens
,
n_expert_used_var
,
nchannels_y
,
si1
,
sis1
);
}
void
ggml_cuda_launch_mm_ids_helper
(
const
int32_t
*
__restrict__
ids
,
int32_t
*
__restrict__
ids_src1
,
int32_t
*
__restrict__
ids_dst
,
int32_t
*
__restrict__
expert_bounds
,
const
int
n_experts
,
const
int
n_tokens
,
const
int
n_expert_used
,
const
int
nchannels_y
,
const
int
si1
,
const
int
sis1
,
cudaStream_t
stream
)
{
switch
(
n_expert_used
)
{
case
2
:
launch_mm_ids_helper
<
2
>
(
ids
,
ids_src1
,
ids_dst
,
expert_bounds
,
n_experts
,
n_tokens
,
n_expert_used
,
nchannels_y
,
si1
,
sis1
,
stream
);
break
;
case
4
:
launch_mm_ids_helper
<
4
>
(
ids
,
ids_src1
,
ids_dst
,
expert_bounds
,
n_experts
,
n_tokens
,
n_expert_used
,
nchannels_y
,
si1
,
sis1
,
stream
);
break
;
case
6
:
launch_mm_ids_helper
<
6
>
(
ids
,
ids_src1
,
ids_dst
,
expert_bounds
,
n_experts
,
n_tokens
,
n_expert_used
,
nchannels_y
,
si1
,
sis1
,
stream
);
break
;
case
8
:
launch_mm_ids_helper
<
8
>
(
ids
,
ids_src1
,
ids_dst
,
expert_bounds
,
n_experts
,
n_tokens
,
n_expert_used
,
nchannels_y
,
si1
,
sis1
,
stream
);
break
;
case
16
:
launch_mm_ids_helper
<
16
>
(
ids
,
ids_src1
,
ids_dst
,
expert_bounds
,
n_experts
,
n_tokens
,
n_expert_used
,
nchannels_y
,
si1
,
sis1
,
stream
);
break
;
case
32
:
launch_mm_ids_helper
<
32
>
(
ids
,
ids_src1
,
ids_dst
,
expert_bounds
,
n_experts
,
n_tokens
,
n_expert_used
,
nchannels_y
,
si1
,
sis1
,
stream
);
break
;
default:
launch_mm_ids_helper
<
0
>
(
ids
,
ids_src1
,
ids_dst
,
expert_bounds
,
n_experts
,
n_tokens
,
n_expert_used
,
nchannels_y
,
si1
,
sis1
,
stream
);
break
;
}
}
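Aside (not part of the commit): the comments above describe the output format of mm_ids_helper. As a plain host-side reference of that contract, the sketch below computes the same ids_src1 / ids_dst / expert_bounds layout with ordinary loops; the function name is invented, and si1/sis1 have the same meaning as in the kernel (row stride of `ids`, src1 row-to-channel stride).

#include <cstdint>
#include <vector>

// Host-side reference, illustration only: compact per-expert column permutation plus expert bounds.
static void example_mm_ids_helper_reference(const std::vector<int32_t> & ids, int n_experts, int n_tokens,
                                            int n_expert_used, int nchannels_y, int si1, int sis1,
                                            std::vector<int32_t> & ids_src1, std::vector<int32_t> & ids_dst,
                                            std::vector<int32_t> & expert_bounds) {
    ids_src1.clear();
    ids_dst.clear();
    expert_bounds.assign(n_experts + 1, 0);
    for (int expert = 0; expert < n_experts; ++expert) {
        expert_bounds[expert] = (int32_t) ids_src1.size();          // lower bound for this expert
        for (int it = 0; it < n_tokens; ++it) {
            for (int iex = 0; iex < n_expert_used; ++iex) {
                if (ids[it*si1 + iex] == expert) {
                    ids_src1.push_back(it*sis1 + iex % nchannels_y); // which src1 column to read
                    ids_dst .push_back(it*n_expert_used + iex);      // which dst column to write
                }
            }
        }
    }
    expert_bounds[n_experts] = (int32_t) ids_src1.size();           // upper bound of the last expert
}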
ml/backend/ggml/ggml/src/ggml-cuda/mmid.cuh (new file, mode 100644)
#pragma once

void ggml_cuda_launch_mm_ids_helper(
        const int32_t * ids, int32_t * ids_src1, int32_t * ids_dst, int32_t * expert_bounds,
        int n_experts, int n_tokens, int n_expert_used, int nchannels_y, int si1, int sis1, cudaStream_t stream);
ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu
#include "mmq.cuh"
#include "quantize.cuh"
#include <vector>
// To reduce shared memory use, store "it" and "iex_used" with 22/10 bits each.
struct
mmq_ids_helper_store
{
uint32_t
data
;
__device__
mmq_ids_helper_store
(
const
uint32_t
it
,
const
uint32_t
iex_used
)
{
data
=
(
it
&
0x003FFFFF
)
|
(
iex_used
<<
22
);
}
__device__
uint32_t
it
()
const
{
return
data
&
0x003FFFFF
;
}
__device__
uint32_t
iex_used
()
const
{
return
data
>>
22
;
}
};
static_assert
(
sizeof
(
mmq_ids_helper_store
)
==
4
,
"unexpected size for mmq_ids_helper_store"
);
// Helper function for mul_mat_id, converts ids to a more convenient format.
// ids_src1 describes how to permute the flattened column indices of src1 in order to get a compact src1 tensor sorted by expert.
// ids_dst describes the same mapping but for the dst tensor.
// The upper and lower bounds for the ith expert in the compact src1 tensor are stored in expert_bounds[i:i+1].
template
<
int
n_expert_used_template
>
__launch_bounds__
(
ggml_cuda_get_physical_warp_size
(),
1
)
static
__global__
void
mmq_ids_helper
(
const
int32_t
*
__restrict__
ids
,
int32_t
*
__restrict__
ids_src1
,
int32_t
*
__restrict__
ids_dst
,
int32_t
*
__restrict__
expert_bounds
,
const
int
n_tokens
,
const
int
n_expert_used_var
,
const
int
nchannels_y
,
const
int
si1
,
const
int
sis1
)
{
constexpr
int
warp_size
=
ggml_cuda_get_physical_warp_size
();
const
int
n_expert_used
=
n_expert_used_template
==
0
?
n_expert_used_var
:
n_expert_used_template
;
const
int
expert
=
blockIdx
.
x
;
extern
__shared__
char
data_mmq_ids_helper
[];
mmq_ids_helper_store
*
store
=
(
mmq_ids_helper_store
*
)
data_mmq_ids_helper
;
int
nex_prev
=
0
;
// Number of columns for experts with a lower index.
int
it_compact
=
0
;
// Running index for the compact slice of this expert.
if
constexpr
(
n_expert_used_template
==
0
)
{
// Generic implementation:
for
(
int
it
=
0
;
it
<
n_tokens
;
++
it
)
{
int
iex_used
=
-
1
;
// The index at which the expert is used, if any.
for
(
int
iex
=
threadIdx
.
x
;
iex
<
n_expert_used
;
iex
+=
warp_size
)
{
const
int
expert_used
=
ids
[
it
*
si1
+
iex
];
nex_prev
+=
expert_used
<
expert
;
if
(
expert_used
==
expert
)
{
iex_used
=
iex
;
}
}
if
(
iex_used
!=
-
1
)
{
store
[
it_compact
]
=
mmq_ids_helper_store
(
it
,
iex_used
);
}
if
(
warp_reduce_any
<
warp_size
>
(
iex_used
!=
-
1
))
{
it_compact
++
;
}
}
}
else
{
// Implementation optimized for specific numbers of experts used:
static_assert
(
n_expert_used
==
6
||
warp_size
%
n_expert_used
==
0
,
"bad n_expert_used"
);
const
int
neu_padded
=
n_expert_used
==
6
?
8
:
n_expert_used
;
// Padded to next higher power of 2.
for
(
int
it0
=
0
;
it0
<
n_tokens
;
it0
+=
warp_size
/
neu_padded
)
{
const
int
it
=
it0
+
threadIdx
.
x
/
neu_padded
;
const
int
iex
=
threadIdx
.
x
%
neu_padded
;
// The index at which the expert is used, if any.
const
int
expert_used
=
(
neu_padded
==
n_expert_used
||
iex
<
n_expert_used
)
&&
it
<
n_tokens
?
ids
[
it
*
si1
+
iex
]
:
INT_MAX
;
const
int
iex_used
=
expert_used
==
expert
?
iex
:
-
1
;
nex_prev
+=
expert_used
<
expert
;
// Whether the threads at this token position have used the expert:
const
int
it_compact_add_self
=
warp_reduce_any
<
neu_padded
>
(
iex_used
!=
-
1
);
// Do a scan over threads at lower token positions in warp to get the correct index for writing data:
int
it_compact_add_lower
=
0
;
#pragma unroll
for
(
int
offset
=
neu_padded
;
offset
<
warp_size
;
offset
+=
neu_padded
)
{
const
int
tmp
=
__shfl_up_sync
(
0xFFFFFFFF
,
it_compact_add_self
,
offset
,
warp_size
);
if
(
threadIdx
.
x
>=
static_cast
<
unsigned
int
>
(
offset
))
{
it_compact_add_lower
+=
tmp
;
}
}
if
(
iex_used
!=
-
1
)
{
store
[
it_compact
+
it_compact_add_lower
]
=
mmq_ids_helper_store
(
it
,
iex_used
);
}
// The thread with the highest index in the warp always has the sum over the whole warp, use it to increment all threads:
it_compact
+=
__shfl_sync
(
0xFFFFFFFF
,
it_compact_add_lower
+
it_compact_add_self
,
warp_size
-
1
,
warp_size
);
}
}
nex_prev
=
warp_reduce_sum
<
warp_size
>
(
nex_prev
);
for
(
int
itc
=
threadIdx
.
x
;
itc
<
it_compact
;
itc
+=
warp_size
)
{
const
mmq_ids_helper_store
store_it
=
store
[
itc
];
const
int
it
=
store_it
.
it
();
const
int
iex_used
=
store_it
.
iex_used
();
ids_src1
[
nex_prev
+
itc
]
=
it
*
sis1
+
iex_used
%
nchannels_y
;
ids_dst
[
nex_prev
+
itc
]
=
it
*
n_expert_used
+
iex_used
;
}
if
(
threadIdx
.
x
!=
0
)
{
return
;
}
expert_bounds
[
expert
]
=
nex_prev
;
if
(
expert
<
static_cast
<
int
>
(
gridDim
.
x
)
-
1
)
{
return
;
}
expert_bounds
[
gridDim
.
x
]
=
nex_prev
+
it_compact
;
}
template
<
int
n_expert_used_template
>
static
void
launch_mmq_ids_helper
(
const
int32_t
*
__restrict__
ids
,
int32_t
*
__restrict__
ids_src1
,
int32_t
*
__restrict__
ids_dst
,
int32_t
*
__restrict__
expert_bounds
,
const
int
n_experts
,
const
int
n_tokens
,
const
int
n_expert_used_var
,
const
int
nchannels_y
,
const
int
si1
,
const
int
sis1
,
cudaStream_t
stream
)
{
GGML_ASSERT
(
n_tokens
<
(
1
<<
22
)
&&
"too few bits in mmq_ids_helper_store"
);
GGML_ASSERT
(
n_expert_used_var
<
(
1
<<
10
)
&&
"too few bits in mmq_ids_helper_store"
);
const
int
id
=
ggml_cuda_get_device
();
const
int
warp_size
=
ggml_cuda_info
().
devices
[
id
].
warp_size
;
const
size_t
smpbo
=
ggml_cuda_info
().
devices
[
id
].
smpbo
;
CUDA_SET_SHARED_MEMORY_LIMIT
(
mmq_ids_helper
<
n_expert_used_template
>
,
smpbo
);
const
dim3
num_blocks
(
n_experts
,
1
,
1
);
const
dim3
block_size
(
warp_size
,
1
,
1
);
const
size_t
nbytes_shared
=
n_tokens
*
sizeof
(
mmq_ids_helper_store
);
GGML_ASSERT
(
nbytes_shared
<=
smpbo
);
mmq_ids_helper
<
n_expert_used_template
><<<
num_blocks
,
block_size
,
nbytes_shared
,
stream
>>>
(
ids
,
ids_src1
,
ids_dst
,
expert_bounds
,
n_tokens
,
n_expert_used_var
,
nchannels_y
,
si1
,
sis1
);
}
#include "mmid.cuh"
static
void
ggml_cuda_mul_mat_q_switch_type
(
ggml_backend_cuda_context
&
ctx
,
const
mmq_args
&
args
,
cudaStream_t
stream
)
{
switch
(
args
.
type_x
)
{
...
...
@@ -293,36 +158,8 @@ void ggml_cuda_mul_mat_q(
    const int si1  = ids->nb[1] / ggml_element_size(ids);
    const int sis1 = nb12 / nb11;

    switch (n_expert_used) {
        case 2:
            launch_mmq_ids_helper< 2>((const int32_t *) ids->data, ids_src1.get(), ids_dst.get(), expert_bounds.get(),
                ne02, ne12, n_expert_used, ne11, si1, sis1, stream);
            break;
        case 4:
            launch_mmq_ids_helper< 4>((const int32_t *) ids->data, ids_src1.get(), ids_dst.get(), expert_bounds.get(),
                ne02, ne12, n_expert_used, ne11, si1, sis1, stream);
            break;
        case 6:
            launch_mmq_ids_helper< 6>((const int32_t *) ids->data, ids_src1.get(), ids_dst.get(), expert_bounds.get(),
                ne02, ne12, n_expert_used, ne11, si1, sis1, stream);
            break;
        case 8:
            launch_mmq_ids_helper< 8>((const int32_t *) ids->data, ids_src1.get(), ids_dst.get(), expert_bounds.get(),
                ne02, ne12, n_expert_used, ne11, si1, sis1, stream);
            break;
        case 16:
            launch_mmq_ids_helper<16>((const int32_t *) ids->data, ids_src1.get(), ids_dst.get(), expert_bounds.get(),
                ne02, ne12, n_expert_used, ne11, si1, sis1, stream);
            break;
        case 32:
            launch_mmq_ids_helper<32>((const int32_t *) ids->data, ids_src1.get(), ids_dst.get(), expert_bounds.get(),
                ne02, ne12, n_expert_used, ne11, si1, sis1, stream);
            break;
        default:
            launch_mmq_ids_helper< 0>((const int32_t *) ids->data, ids_src1.get(), ids_dst.get(), expert_bounds.get(),
                ne02, ne12, n_expert_used, ne11, si1, sis1, stream);
            break;
    }
    ggml_cuda_launch_mm_ids_helper((const int32_t *) ids->data, ids_src1.get(), ids_dst.get(), expert_bounds.get(),
        ne02, ne12, n_expert_used, ne11, si1, sis1, stream);
    CUDA_CHECK(cudaGetLastError());
}
...
...
ml/backend/ggml/ggml/src/ggml-cuda/mmvf.cu
...
...
@@ -7,14 +7,14 @@ template <typename T, typename type_acc, int ncols_dst, int block_size>
static __global__ void mul_mat_vec_f(
        const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst,
        const int ncols2, const int nchannels_y, const int stride_row, const int stride_col_y2, const int stride_col_dst,
        const int   channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
        const int   sample_ratio,  const int stride_sample_x,  const int stride_sample_y,  const int stride_sample_dst) {
        const uint3 channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
        const uint3 sample_ratio,  const int stride_sample_x,  const int stride_sample_y,  const int stride_sample_dst) {
    const int row         = blockIdx.x;
    const int channel_dst = blockIdx.y;
    const int channel_x   = ids ? ids[channel_dst] : channel_dst / channel_ratio;
    const int channel_x   = ids ? ids[channel_dst] : fastdiv((uint32_t) channel_dst, channel_ratio);
    const int channel_y   = ids ? channel_dst % nchannels_y : channel_dst;
    const int sample_dst  = blockIdx.z;
    const int sample_x    = sample_dst / sample_ratio;
    const int sample_x    = fastdiv((uint32_t) sample_dst, sample_ratio);
    const int sample_y    = sample_dst;
    const int tid         = threadIdx.x;
...
...
@@ -47,8 +47,8 @@ static __global__ void mul_mat_vec_f(
#pragma unroll
        for (int j = 0; j < ncols_dst; ++j) {
            const float2 tmpy = y2[j*stride_col_y2 + col2];
            sumf[j] += tmpx.x*tmpy.x;
            sumf[j] += tmpx.y*tmpy.y;
            ggml_cuda_mad(sumf[j], tmpx.x, tmpy.x);
            ggml_cuda_mad(sumf[j], tmpx.y, tmpy.y);
        }
    }
} else if constexpr (std::is_same_v<T, half>) {
...
...
@@ -61,8 +61,8 @@ static __global__ void mul_mat_vec_f(
#pragma unroll
        for (int j = 0; j < ncols_dst; ++j) {
            const float2 tmpy = y2[j*stride_col_y2 + col2];
            sumf[j] += tmpx.x*tmpy.x;
            sumf[j] += tmpx.y*tmpy.y;
            ggml_cuda_mad(sumf[j], tmpx.x, tmpy.x);
            ggml_cuda_mad(sumf[j], tmpx.y, tmpy.y);
        }
    }
} else {
...
...
@@ -88,16 +88,32 @@ static __global__ void mul_mat_vec_f(
#endif // FP16_AVAILABLE
        }
    }
} else if constexpr (std::is_same_v<T, nv_bfloat16>) {
    //TODO: add support for ggml_cuda_mad for hip_bfloat162
#if defined(GGML_USE_HIP)
    const int * x2 = (const int *) x;
    for (int col2 = tid; col2 < ncols2; col2 += block_size) {
        const int tmpx = x2[col2];
#pragma unroll
        for (int j = 0; j < ncols_dst; ++j) {
            const float2 tmpy = y2[j*stride_col_y2 + col2];
            sumf[j] += ggml_cuda_cast<float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0])*tmpy.x;
            sumf[j] += ggml_cuda_cast<float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1])*tmpy.y;
            const float tmpx0 = ggml_cuda_cast<float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]);
            const float tmpx1 = ggml_cuda_cast<float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]);
            ggml_cuda_mad(sumf[j], tmpx0, tmpy.x);
            ggml_cuda_mad(sumf[j], tmpx1, tmpy.y);
        }
    }
#else
    const nv_bfloat162 * x2 = (const nv_bfloat162 *) x;
    for (int col2 = tid; col2 < ncols2; col2 += block_size) {
        const nv_bfloat162 tmpx = x2[col2];
#pragma unroll
        for (int j = 0; j < ncols_dst; ++j) {
            const float2 tmpy = y2[j*stride_col_y2 + col2];
            ggml_cuda_mad(sumf[j], tmpx.x, tmpy.x);
            ggml_cuda_mad(sumf[j], tmpx.y, tmpy.y);
        }
    }
#endif
} else {
    static_assert(std::is_same_v<T, void>, "unsupported type");
}
...
...
@@ -140,8 +156,8 @@ static void launch_mul_mat_vec_f_cuda(
    GGML_ASSERT(stride_col_y % 2 == 0);
    GGML_ASSERT(ids || nchannels_dst % nchannels_x == 0);
    GGML_ASSERT(       nsamples_dst  % nsamples_x  == 0);
    const int64_t channel_ratio = nchannels_dst / nchannels_x;
    const int64_t sample_ratio  = nsamples_dst  / nsamples_x;
    const uint3 channel_ratio_fd = ids ? make_uint3(0, 0, 0) : init_fastdiv_values(nchannels_dst / nchannels_x);
    const uint3 sample_ratio_fd  = init_fastdiv_values(nsamples_dst / nsamples_x);

    const int device    = ggml_cuda_get_device();
    const int warp_size = ggml_cuda_info().devices[device].warp_size;
...
...
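Aside (not from the commit): the uint3 values produced by init_fastdiv_values let the kernel replace runtime integer division (channel_dst / channel_ratio, sample_dst / sample_ratio) with a precomputed multiply-high, add and shift. The sketch below shows one standard "magic number" construction of that kind; the struct layout and names are invented for the example and are not ggml's own encoding, which lives in common.cuh.

// Illustration only: division of 32-bit values by a fixed divisor d via a magic multiplier.
struct example_fastdiv {
    uint32_t mp;   // precomputed magic multiplier
    uint32_t L;    // ceil(log2(d))
};

static example_fastdiv example_init_fastdiv(uint32_t d) {
    uint32_t L = 0;
    while (L < 32 && (uint32_t{1} << L) < d) {
        L++;
    }
    const uint32_t mp = (uint32_t) (((uint64_t{1} << 32) * ((uint64_t{1} << L) - d)) / d + 1);
    return {mp, L};
}

static __device__ inline uint32_t example_fastdiv_apply(uint32_t n, example_fastdiv fd) {
    const uint32_t hi = __umulhi(n, fd.mp);  // high 32 bits of n * mp
    return (hi + n) >> fd.L;                 // equals n / d for all 32-bit n
}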
@@ -167,50 +183,50 @@ static void launch_mul_mat_vec_f_cuda(
        case 32: {
            mul_mat_vec_f<T, type_acc, ncols_dst, 32><<<block_nums, block_dims, nbytes_shared, stream>>>
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
                 channel_ratio,    stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio,     stride_sample_x, stride_sample_y, stride_sample_dst);
                 channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio_fd,  stride_sample_x, stride_sample_y, stride_sample_dst);
        } break;
        case 64: {
            mul_mat_vec_f<T, type_acc, ncols_dst, 64><<<block_nums, block_dims, nbytes_shared, stream>>>
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
                 channel_ratio,    stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio,     stride_sample_x, stride_sample_y, stride_sample_dst);
                 channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio_fd,  stride_sample_x, stride_sample_y, stride_sample_dst);
        } break;
        case 96: {
            mul_mat_vec_f<T, type_acc, ncols_dst, 96><<<block_nums, block_dims, nbytes_shared, stream>>>
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
                 channel_ratio,    stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio,     stride_sample_x, stride_sample_y, stride_sample_dst);
                 channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio_fd,  stride_sample_x, stride_sample_y, stride_sample_dst);
        } break;
        case 128: {
            mul_mat_vec_f<T, type_acc, ncols_dst, 128><<<block_nums, block_dims, nbytes_shared, stream>>>
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
                 channel_ratio,    stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio,     stride_sample_x, stride_sample_y, stride_sample_dst);
                 channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio_fd,  stride_sample_x, stride_sample_y, stride_sample_dst);
        } break;
        case 160: {
            mul_mat_vec_f<T, type_acc, ncols_dst, 160><<<block_nums, block_dims, nbytes_shared, stream>>>
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
                 channel_ratio,    stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio,     stride_sample_x, stride_sample_y, stride_sample_dst);
                 channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio_fd,  stride_sample_x, stride_sample_y, stride_sample_dst);
        } break;
        case 192: {
            mul_mat_vec_f<T, type_acc, ncols_dst, 192><<<block_nums, block_dims, nbytes_shared, stream>>>
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
                 channel_ratio,    stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio,     stride_sample_x, stride_sample_y, stride_sample_dst);
                 channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio_fd,  stride_sample_x, stride_sample_y, stride_sample_dst);
        } break;
        case 224: {
            mul_mat_vec_f<T, type_acc, ncols_dst, 224><<<block_nums, block_dims, nbytes_shared, stream>>>
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
                 channel_ratio,    stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio,     stride_sample_x, stride_sample_y, stride_sample_dst);
                 channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio_fd,  stride_sample_x, stride_sample_y, stride_sample_dst);
        } break;
        case 256: {
            mul_mat_vec_f<T, type_acc, ncols_dst, 256><<<block_nums, block_dims, nbytes_shared, stream>>>
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
                 channel_ratio,    stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio,     stride_sample_x, stride_sample_y, stride_sample_dst);
                 channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio_fd,  stride_sample_x, stride_sample_y, stride_sample_dst);
        } break;
        default: {
            GGML_ABORT("fatal error");
...
...
ml/backend/ggml/ggml/src/ggml-cuda/topk-moe.cu (diff collapsed)
ml/backend/ggml/ggml/src/ggml-cuda/topk-moe.cuh
...
...
@@ -6,9 +6,10 @@
void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx, const ggml_tensor * logits, ggml_tensor * weights,
                           ggml_tensor * top_k, const bool with_norm);
                           ggml_tensor * ids, const bool with_norm, const bool delayed_softmax = false);

bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tensor * weights);

std::initializer_list<enum ggml_op> ggml_cuda_topk_moe_ops(bool with_norm);
std::initializer_list<enum ggml_op> ggml_cuda_topk_moe_ops(bool with_norm, bool delayed_softmax = false);
ml/backend/ggml/ggml/src/ggml-hip/CMakeLists.txt
...
...
@@ -28,8 +28,10 @@ if (CXX_IS_HIPCC)
            " Prefer setting the HIP compiler directly. See README for details.")
    endif()
else()
    # Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
    if (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
    # Forward (AMD)GPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
    if (GPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
        set(CMAKE_HIP_ARCHITECTURES ${GPU_TARGETS})
    elseif (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
        set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
    endif()
    cmake_minimum_required(VERSION 3.21)
...
...
ml/backend/ggml/ggml/src/ggml-impl.h
...
...
@@ -565,14 +565,23 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)

static inline int32_t ggml_node_get_use_count(const struct ggml_cgraph * cgraph, int node_idx) {
    const struct ggml_tensor * node = cgraph->nodes[node_idx];

    size_t hash_pos = ggml_hash_find(&cgraph->visited_hash_set, node);
    if (!ggml_bitset_get(cgraph->visited_hash_set.used, hash_pos)) {
        return 0;
    }
    return cgraph->use_counts[hash_pos];
}

// return true if the node's results are only used by N other nodes
// and can be fused into their calculations.
static inline bool ggml_node_has_n_uses(const struct ggml_cgraph * cgraph, int node_idx, int32_t n_uses) {
    const struct ggml_tensor * node = cgraph->nodes[node_idx];

    // check the use count against how many we're replacing
    size_t hash_pos = ggml_hash_find(&cgraph->visited_hash_set, node);
    if (!ggml_bitset_get(cgraph->visited_hash_set.used, hash_pos) || cgraph->use_counts[hash_pos] != n_uses) {
    if (ggml_node_get_use_count(cgraph, node_idx) != n_uses) {
        return false;
    }
...
...
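Caller-side illustration (not from the commit): before folding an intermediate node into its consumer, a backend typically checks that nothing else reads its result; ggml_node_has_n_uses above is the documented way to do that. The wrapper name below is hypothetical.

// Hypothetical helper for illustration; only ggml_node_has_n_uses is real (declared above).
static inline bool example_can_elide_intermediate(const struct ggml_cgraph * cgraph, int node_idx) {
    // the intermediate may be fused away only if its single use is the consumer being fused
    return ggml_node_has_n_uses(cgraph, node_idx, 1);
}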
@@ -638,6 +647,36 @@ static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx
    return ggml_can_fuse_ext(cgraph, idxs, ops, num_ops);
}

GGML_API bool ggml_can_fuse_subgraph_ext(const struct ggml_cgraph * cgraph,
                                         const int * node_idxs,
                                         int count,
                                         const enum ggml_op * ops,
                                         const int * outputs,
                                         int num_outputs);

// Returns true if the subgraph formed by {node_idxs} can be fused
// checks whethers all nodes which are not part of outputs can be elided
// by checking if their num_uses are confined to the subgraph
static inline bool ggml_can_fuse_subgraph(const struct ggml_cgraph * cgraph,
                                          int node_idx,
                                          int count,
                                          const enum ggml_op * ops,
                                          const int * outputs,
                                          int num_outputs) {
    GGML_ASSERT(count < 32);

    if (node_idx + count > cgraph->n_nodes) {
        return false;
    }

    int idxs[32];
    for (int i = 0; i < count; ++i) {
        idxs[i] = node_idx + i;
    }

    return ggml_can_fuse_subgraph_ext(cgraph, idxs, count, ops, outputs, num_outputs);
}

// Management libraries for fetching more accurate free VRAM data
GGML_API int ggml_nvml_init();
GGML_API int ggml_nvml_get_device_memory(const char * uuid, size_t * free, size_t * total);
...
...
@@ -662,6 +701,13 @@ inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::
    return ggml_can_fuse(cgraph, node_idx, ops.begin(), (int) ops.size());
}

inline bool ggml_can_fuse_subgraph(const struct ggml_cgraph * cgraph,
                                   int start_idx,
                                   std::initializer_list<enum ggml_op> ops,
                                   std::initializer_list<int> outputs = {}) {
    return ggml_can_fuse_subgraph(cgraph, start_idx, ops.size(), ops.begin(), outputs.begin(), outputs.size());
}

// expose GGUF internals for test code
GGML_API size_t gguf_type_size(enum gguf_type type);
GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
...
...
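Usage aside (not from the commit): the initializer_list overload of ggml_can_fuse_subgraph shown above is how backend fusion checks are written; the CUDA top-k MoE code earlier in this diff passes its own op list and output indices. The concrete op sequence and helper name below are placeholders chosen for the illustration.

// Illustrative only: a backend-side check that a 3-op window can be fused, where only the
// node at start_idx + 2 is allowed to be read from outside the fused region.
static bool example_try_fuse(const struct ggml_cgraph * cgraph, int node_idx) {
    std::initializer_list<enum ggml_op> ops = { GGML_OP_SOFT_MAX, GGML_OP_ARGSORT, GGML_OP_VIEW };
    // Every intermediate not listed in `outputs` must have all of its uses inside the window.
    return ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 2 });
}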
ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-device.cpp (diff collapsed)