OpenDAS / ollama — Commits

Commit 23125648 (unverified)
chore: update mllama to use ollama engine (#10637)
Authored May 13, 2025 by Michael Yang; committed by GitHub on May 13, 2025.
Parent commit: 0478d440

Changes: 67 files in total. This page shows 20 changed files with 202 additions and 482 deletions (+202 -482).
Makefile.sync                                +10   -7
convert/convert.go                           +14   -14
convert/convert_mllama.go                    +160  -0
convert/reader.go                            +4    -1
fs/ggml/ggml.go                              +1    -0
llama/llama.cpp/include/llama.h              +0    -6
llama/llama.cpp/src/llama-arch.cpp           +0    -44
llama/llama.cpp/src/llama-arch.h             +0    -10
llama/llama.cpp/src/llama-batch.cpp          +0    -3
llama/llama.cpp/src/llama-context.cpp        +8    -15
llama/llama.cpp/src/llama-context.h          +0    -1
llama/llama.cpp/src/llama-cparams.h          +0    -1
llama/llama.cpp/src/llama-graph.cpp          +0    -25
llama/llama.cpp/src/llama-graph.h            +0    -12
llama/llama.cpp/src/llama-hparams.cpp        +0    -4
llama/llama.cpp/src/llama-hparams.h          +0    -7
llama/llama.cpp/src/llama-kv-cache.cpp       +3    -11
llama/llama.cpp/src/llama-model-loader.cpp   +0    -2
llama/llama.cpp/src/llama-model.cpp          +2    -307
llama/llama.cpp/src/llama-model.h            +0    -12
Makefile.sync @ 23125648

@@ -15,11 +15,13 @@ help:
 	@echo "  make -f $(lastword $(MAKEFILE_LIST)) clean sync"
 
 .PHONY: sync
-sync: llama/build-info.cpp llama/llama.cpp ml/backend/ggml/ggml
+sync: llama/build-info.cpp ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal
 
 .PHONY: llama/build-info.cpp
-llama/build-info.cpp: llama/build-info.cpp.in llama/llama.cpp
-	sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' < $< > $@
+llama/build-info.cpp: llama/build-info.cpp.in
+	sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' $< > $@
+
+ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal: ml/backend/ggml/ggml
+	go generate ./$(@D)
 
 .PHONY: llama/llama.cpp
 llama/llama.cpp: llama/vendor/
@@ -30,12 +32,13 @@ ml/backend/ggml/ggml: llama/vendor/ggml/
 	rsync -arvzc -f "merge $@/.rsync-filter" $< $@
 
 PATCHES = $(wildcard llama/patches/*.patch)
+PATCHED = $(join $(dir $(PATCHES)), $(addsuffix ed, $(addprefix ., $(notdir $(PATCHES)))))
 
 .PHONY: apply-patches
 .NOTPARALLEL:
-apply-patches: $(addsuffix ed, $(PATCHES))
+apply-patches: $(PATCHED)
 
-%.patched: %.patch
+llama/patches/.%.patched: llama/patches/%.patch
 	@if git -c user.name=nobody -c 'user.email=<>' -C $(WORKDIR) am -3 $(realpath $<); then touch $@; else git -C $(WORKDIR) am --abort; exit 1; fi
 
 .PHONY: checkout
@@ -57,4 +60,4 @@ format-patches: llama/patches
 .PHONE: clean
 clean: checkout
-	$(RM) $(addsuffix ed, $(PATCHES))
+	$(RM) llama/patches/.*.patched
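The new PATCHED variable maps every patch under llama/patches/ to a hidden per-patch stamp file in the same directory, which apply-patches now depends on and clean now removes. A minimal Go sketch of that path mapping, for illustration only; the patch file name below is a hypothetical placeholder, not one from the repository:

package main

import (
	"fmt"
	"path/filepath"
)

// stampFor mirrors what the Makefile expression
// $(join $(dir $(PATCHES)), $(addsuffix ed, $(addprefix ., $(notdir $(PATCHES)))))
// computes: the hidden ".<patch>ed" stamp file touched after `git am` succeeds.
func stampFor(patch string) string {
	dir, name := filepath.Split(patch)
	return dir + "." + name + "ed"
}

func main() {
	fmt.Println(stampFor("llama/patches/0001-example.patch"))
	// llama/patches/.0001-example.patched
}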
convert/convert.go @ 23125648

 package convert
 
 import (
+	"cmp"
 	"encoding/json"
 	"errors"
 	"fmt"
@@ -14,13 +15,12 @@ import (
 )
 
 type ModelParameters struct {
 	Architectures []string `json:"architectures"`
 	VocabSize     uint32   `json:"vocab_size"`
-	TextModel     TextParameters `json:"text_config"`
-}
-
-type TextParameters struct {
-	VocabSize uint32 `json:"vocab_size"`
+	TextModel     struct {
+		VocabSize uint32 `json:"vocab_size"`
+	} `json:"text_config"`
 }
 
 type AdapterParameters struct {
@@ -173,6 +173,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 	switch p.Architectures[0] {
 	case "LlamaForCausalLM":
 		conv = &llamaModel{}
+	case "MllamaForConditionalGeneration":
+		conv = &mllamaModel{}
 	case "Llama4ForConditionalGeneration":
 		conv = &llama4Model{}
 	case "Mistral3ForConditionalGeneration":
@@ -212,24 +214,22 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 		return err
 	}
 
-	vocabSize := int(p.VocabSize)
-	if vocabSize == 0 {
-		tVocabSize := int(p.TextModel.VocabSize)
-		vocabSize = tVocabSize
-	}
+	vocabSize := int(cmp.Or(p.VocabSize, p.TextModel.VocabSize))
 
 	switch {
 	case vocabSize == 0:
-		slog.Warn("vocabulary size was not explicitly set by the model", "default size", len(t.Vocabulary.Tokens))
+		slog.Debug("vocabulary size was not explicitly set by the model", "default size", len(t.Vocabulary.Tokens))
 	case vocabSize > len(t.Vocabulary.Tokens):
-		slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
+		slog.Debug("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
 		for i := range vocabSize - len(t.Vocabulary.Tokens) {
 			t.Vocabulary.Tokens = append(t.Vocabulary.Tokens, fmt.Sprintf("[PAD%d]", i))
 			t.Vocabulary.Scores = append(t.Vocabulary.Scores, -1)
 			t.Vocabulary.Types = append(t.Vocabulary.Types, tokenTypeUserDefined)
 		}
 	case vocabSize < len(t.Vocabulary.Tokens):
-		return fmt.Errorf("vocabulary is larger than expected '%d' instead of '%d'", len(t.Vocabulary.Tokens), vocabSize)
+		slog.Debug("vocabulary is larger than expected", "want", vocabSize, "got", len(t.Vocabulary.Tokens))
+		p.VocabSize = uint32(len(t.Vocabulary.Tokens))
+		p.TextModel.VocabSize = uint32(len(t.Vocabulary.Tokens))
 	default:
 		slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
 	}
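The vocabulary-size fallback above now goes through cmp.Or from the Go standard library, which returns the first of its arguments that is not the zero value. A small self-contained sketch of that behavior (the numbers are illustrative only):

package main

import (
	"cmp"
	"fmt"
)

func main() {
	var topLevel uint32 = 0        // e.g. vocab_size absent from the top-level config
	var textConfig uint32 = 128256 // e.g. a vocab_size found under text_config

	// cmp.Or returns the first non-zero argument, so the text_config value is
	// used only when the top-level vocab_size is missing (zero).
	fmt.Println(cmp.Or(topLevel, textConfig))      // 128256
	fmt.Println(cmp.Or(uint32(32000), textConfig)) // 32000
}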
convert/convert_mllama.go (new file, 0 → 100644) @ 23125648

package convert

import (
	"strings"

	"github.com/ollama/ollama/fs/ggml"
	"github.com/pdevine/tensor"
	"github.com/pdevine/tensor/native"
)

type mllamaModel struct {
	ModelParameters
	TextModel struct {
		llamaModel

		CrossAttentionLayers []int32 `json:"cross_attention_layers"`
	} `json:"text_config"`
	VisionModel struct {
		NumHiddenLayers           uint32  `json:"num_hidden_layers"`
		NumGlobalLayers           uint32  `json:"num_global_layers"`
		IntermediateLayersIndices []int32 `json:"intermediate_layers_indices"`

		HiddenSize       uint32 `json:"hidden_size"`
		IntermediateSize uint32 `json:"intermediate_size"`

		AttentionHeads uint32 `json:"attention_heads"`

		ImageSize   uint32 `json:"image_size"`
		PatchSize   uint32 `json:"patch_size"`
		NumChannels uint32 `json:"num_channels"`

		MaxNumTiles uint32 `json:"max_num_tiles"`

		NormEpsilon float32 `json:"norm_eps"`
		RopeTheta   float32 `json:"rope.freq_base"`
	} `json:"vision_config"`
}

func (m *mllamaModel) KV(t *Tokenizer) ggml.KV {
	kv := m.ModelParameters.KV(t)
	kv["general.architecture"] = "mllama"

	for k, v := range m.TextModel.KV(t) {
		if strings.HasPrefix(k, "llama.") {
			kv[strings.ReplaceAll(k, "llama.", "mllama.")] = v
		}
	}

	kv["mllama.attention.cross_attention_layers"] = m.TextModel.CrossAttentionLayers

	kv["mllama.vision.block_count"] = m.VisionModel.NumHiddenLayers
	kv["mllama.vision.global.block_count"] = m.VisionModel.NumGlobalLayers
	kv["mllama.vision.intermediate_layers_indices"] = m.VisionModel.IntermediateLayersIndices

	kv["mllama.vision.embedding_length"] = m.VisionModel.HiddenSize
	kv["mllama.vision.feed_forward_length"] = m.VisionModel.IntermediateSize

	kv["mllama.vision.attention.head_count"] = m.VisionModel.AttentionHeads
	kv["mllama.vision.attention.layer_norm_epsilon"] = m.VisionModel.NormEpsilon

	kv["mllama.vision.image_size"] = m.VisionModel.ImageSize
	kv["mllama.vision.patch_size"] = m.VisionModel.PatchSize

	kv["mllama.vision.max_num_tiles"] = m.VisionModel.MaxNumTiles
	kv["mllama.vision.num_channels"] = m.VisionModel.NumChannels

	return kv
}

func (m *mllamaModel) Replacements() []string {
	return append(
		m.TextModel.Replacements(),
		"language_model.", "",
		"gate_attn", "attn_gate",
		"gate_ffn", "ffn_gate",
		"cross_attn.", "cross_attn_",
		"vision_model", "v",
		"class_embedding", "class_embd",
		"patch_embedding", "patch_embd",
		"gated_positional_embedding.tile_embedding", "tile_position_embd",
		"gated_positional_embedding.embedding", "position_embd.weight",
		"gated_positional_embedding", "position_embd",
		"embedding.weight", "weight",
		"pre_tile_positional_embedding", "pre_tile_position_embd",
		"post_tile_positional_embedding", "post_tile_position_embd",
		"layernorm_pre", "pre_ln",
		"layernorm_post", "post_ln",
		"global_transformer.layers", "global.blk",
		"transformer.layers", "blk",
		"mlp.fc1", "ffn_up",
		"mlp.fc2", "ffn_down",
		"multi_modal_projector", "mm.0",
	)
}

func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
	var out []*ggml.Tensor
	var text []Tensor
	for _, t := range ts {
		if t.Name() == "v.position_embd.gate" {
			for _, name := range []string{"v.position_embd.gate", "v.tile_position_embd.gate"} {
				tt := t.Clone()
				tt.SetRepacker(m.repack(name))
				out = append(out, &ggml.Tensor{
					Name:     name,
					Kind:     t.Kind(),
					Shape:    t.Shape(),
					WriterTo: tt,
				})
			}
		} else if t.Name() == "v.pre_tile_position_embd.gate" || t.Name() == "v.post_tile_position_embd.gate" {
			t.SetRepacker(m.repack(t.Name()))
			out = append(out, &ggml.Tensor{
				Name:     t.Name(),
				Kind:     t.Kind(),
				Shape:    t.Shape(),
				WriterTo: t,
			})
		} else if strings.HasPrefix(t.Name(), "v.") || strings.HasPrefix(t.Name(), "mm.") {
			out = append(out, &ggml.Tensor{
				Name:     t.Name(),
				Kind:     t.Kind(),
				Shape:    t.Shape(),
				WriterTo: t,
			})
		} else {
			text = append(text, t)
		}
	}

	return append(out, m.TextModel.Tensors(text)...)
}

func (m *mllamaModel) repack(name string) Repacker {
	return func(_ string, data []float32, shape []uint64) (_ []float32, err error) {
		dims := make([]int, len(shape))
		for i, dim := range shape {
			dims[i] = int(dim)
		}

		var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))

		t, err = tensor.Tanh(t)
		if err != nil {
			return nil, err
		}

		if name == "v.position_embd.gate" {
			t, err = tensor.Sub(float32(1), t)
			if err != nil {
				return nil, err
			}
		}

		t = tensor.Materialize(t)
		// flatten tensor so it can be return as a vector
		if err := t.Reshape(t.Shape().TotalSize()); err != nil {
			return nil, err
		}

		return native.VectorF32(t.(*tensor.Dense))
	}
}
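The repack callback above rewrites the vision gate tensors at conversion time: every value is passed through tanh, and v.position_embd.gate is additionally stored as 1 - tanh(x), so the runtime can multiply by the gate directly. A small self-contained sketch of that arithmetic on plain slices, independent of the tensor package used above (the input values are illustrative):

package main

import (
	"fmt"
	"math"
)

// repackGate mirrors the transform applied by the repacker: tanh for every
// gate value, plus the extra 1 - tanh(x) flip for the positional-embedding gate.
func repackGate(name string, data []float32) []float32 {
	out := make([]float32, len(data))
	for i, x := range data {
		g := float32(math.Tanh(float64(x)))
		if name == "v.position_embd.gate" {
			g = 1 - g
		}
		out[i] = g
	}
	return out
}

func main() {
	gate := []float32{-0.5, 0, 0.5}
	fmt.Println(repackGate("v.position_embd.gate", gate))
	fmt.Println(repackGate("v.tile_position_embd.gate", gate))
}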
convert/reader.go @ 23125648

@@ -38,7 +38,10 @@ const (
 func (t tensorBase) Kind() uint32 {
 	if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") ||
 		t.name == "token_types.weight" ||
-		t.name == "v.positional_embedding_vlm" {
+		t.name == "v.positional_embedding_vlm" ||
+		t.name == "v.tile_position_embd.weight" ||
+		t.name == "v.pre_tile_position_embd.weight" ||
+		t.name == "v.post_tile_position_embd.weight" {
 		// these tensors are always F32
 		return 0
 	}
fs/ggml/ggml.go @ 23125648

@@ -125,6 +125,7 @@ func (kv KV) OllamaEngineRequired() bool {
 		"gemma3",
 		"mistral3",
 		"llama4",
+		"mllama",
 	}, kv.Architecture())
 }
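Adding "mllama" here is what routes the architecture to the ollama engine instead of the vendored llama.cpp runner. The visible closing `}, kv.Architecture())` suggests a membership test of the architecture name against a slice literal; the sketch below shows that pattern with slices.Contains, but the helper name and exact shape of the real function are assumptions, not the actual ollama code:

package main

import (
	"fmt"
	"slices"
)

// engineRequired is a stand-in for the kind of check OllamaEngineRequired
// performs: report whether an architecture must run on the ollama engine.
// The list is abbreviated to the names visible in this hunk.
func engineRequired(arch string) bool {
	return slices.Contains([]string{
		"gemma3",
		"mistral3",
		"llama4",
		"mllama", // added by this commit
	}, arch)
}

func main() {
	fmt.Println(engineRequired("mllama")) // true
	fmt.Println(engineRequired("falcon")) // false
}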
llama/llama.cpp/include/llama.h @ 23125648

@@ -258,7 +258,6 @@ extern "C" {
         llama_token  *  token;
         float        *  embd;
-        int32_t         n_embd;
         llama_pos    *  pos;
         int32_t      *  n_seq_id;
         llama_seq_id ** seq_id;
@@ -366,7 +365,6 @@ extern "C" {
         bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
         bool no_perf;     // whether to measure performance timings
         bool op_offload;  // whether to offload host tensor operations to device
-        bool cross_attn;  // whether to use cross attention
     };
 
     // model quantization parameters
@@ -466,10 +464,6 @@ extern "C" {
                      struct llama_context_params params),
             "use llama_init_from_model instead");
 
-    // TODO (jmorganca): this should most likely be passed in as part of a batch
-    // and not set on the context for all batches.
-    LLAMA_API void llama_set_cross_attention(struct llama_context * ctx, bool cross_attn_state);
-
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
llama/llama.cpp/src/llama-arch.cpp @ 23125648

@@ -6,7 +6,6 @@
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_LLAMA,   "llama"  },
-    { LLM_ARCH_MLLAMA,  "mllama" },
     { LLM_ARCH_LLAMA4,  "llama4" },
     { LLM_ARCH_DECI,    "deci"   },
     { LLM_ARCH_FALCON,  "falcon" },
@@ -145,7 +144,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_SLIDING_WINDOW,         "%s.attention.sliding_window"         },
     { LLM_KV_ATTENTION_SCALE,                  "%s.attention.scale"                  },
     { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,  "%s.attention.block_skip_connection"  },
-    { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
     { LLM_KV_ATTENTION_KEY_LENGTH_MLA,         "%s.attention.key_length_mla"         },
     { LLM_KV_ATTENTION_VALUE_LENGTH_MLA,       "%s.attention.value_length_mla"       },
@@ -275,40 +273,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
     { LLM_TENSOR_FFN_UP_SHEXP,  "blk.%d.ffn_up_shexp" },
     },
     },
-    {
-        LLM_ARCH_MLLAMA,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,           "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM,          "output_norm" },
-            { LLM_TENSOR_OUTPUT,               "output" },
-            { LLM_TENSOR_ROPE_FREQS,           "rope_freqs" },
-            { LLM_TENSOR_ATTN_NORM,            "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_Q,               "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K,               "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V,               "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT,             "blk.%d.attn_output" },
-            { LLM_TENSOR_ATTN_ROT_EMBD,        "blk.%d.attn_rot_embd" },
-            { LLM_TENSOR_FFN_GATE_INP,         "blk.%d.ffn_gate_inp" },
-            { LLM_TENSOR_FFN_NORM,             "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_GATE,             "blk.%d.ffn_gate" },
-            { LLM_TENSOR_FFN_DOWN,             "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,               "blk.%d.ffn_up" },
-            { LLM_TENSOR_FFN_GATE_EXP,         "blk.%d.ffn_gate.%d" },
-            { LLM_TENSOR_FFN_DOWN_EXP,         "blk.%d.ffn_down.%d" },
-            { LLM_TENSOR_FFN_UP_EXP,           "blk.%d.ffn_up.%d" },
-            { LLM_TENSOR_FFN_GATE_EXPS,        "blk.%d.ffn_gate_exps" },
-            { LLM_TENSOR_FFN_DOWN_EXPS,        "blk.%d.ffn_down_exps" },
-            { LLM_TENSOR_FFN_UP_EXPS,          "blk.%d.ffn_up_exps" },
-            { LLM_TENSOR_CROSS_ATTN_K_NORM,    "blk.%d.cross_attn_k_norm" },
-            { LLM_TENSOR_CROSS_ATTN_K_PROJ,    "blk.%d.cross_attn_k_proj" },
-            { LLM_TENSOR_CROSS_ATTN_O_PROJ,    "blk.%d.cross_attn_o_proj" },
-            { LLM_TENSOR_CROSS_ATTN_Q_NORM,    "blk.%d.cross_attn_q_norm" },
-            { LLM_TENSOR_CROSS_ATTN_Q_PROJ,    "blk.%d.cross_attn_q_proj" },
-            { LLM_TENSOR_CROSS_ATTN_V_PROJ,    "blk.%d.cross_attn_v_proj" },
-            { LLM_TENSOR_CROSS_ATTN_ATTN_GATE, "blk.%d.cross_attn_attn_gate" },
-            { LLM_TENSOR_CROSS_ATTN_MLP_GATE,  "blk.%d.cross_attn_mlp_gate" },
-        },
-    },
     {
         LLM_ARCH_DECI,
         {
@@ -1737,14 +1701,6 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     // this tensor is loaded for T5, but never used
     { LLM_TENSOR_DEC_CROSS_ATTN_REL_B,   { LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE } },
     { LLM_TENSOR_BSKCN_TV,               { LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL } },
-    { LLM_TENSOR_CROSS_ATTN_K_NORM,      { LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL } },
-    { LLM_TENSOR_CROSS_ATTN_K_PROJ,      { LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT } },
-    { LLM_TENSOR_CROSS_ATTN_O_PROJ,      { LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT } },
-    { LLM_TENSOR_CROSS_ATTN_Q_NORM,      { LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL } },
-    { LLM_TENSOR_CROSS_ATTN_Q_PROJ,      { LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT } },
-    { LLM_TENSOR_CROSS_ATTN_V_PROJ,      { LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT } },
-    { LLM_TENSOR_CROSS_ATTN_ATTN_GATE,   { LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL } },
-    { LLM_TENSOR_CROSS_ATTN_MLP_GATE,    { LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL } },
     { LLM_TENSOR_CONV1D,                 { LLM_TENSOR_LAYER_INPUT,     GGML_OP_IM2COL } },
     { LLM_TENSOR_POS_NET_NORM,           { LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL } },
     { LLM_TENSOR_POS_NET_NORM1,          { LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL } },
llama/llama.cpp/src/llama-arch.h @ 23125648

@@ -11,7 +11,6 @@
 enum llm_arch {
     LLM_ARCH_LLAMA,
     LLM_ARCH_LLAMA4,
-    LLM_ARCH_MLLAMA,
     LLM_ARCH_DECI,
     LLM_ARCH_FALCON,
     LLM_ARCH_BAICHUAN,
@@ -149,7 +148,6 @@ enum llm_kv {
     LLM_KV_ATTENTION_SLIDING_WINDOW,
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
-    LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
     LLM_KV_ATTENTION_KEY_LENGTH_MLA,
     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
@@ -351,14 +349,6 @@ enum llm_tensor {
     LLM_TENSOR_CLS,
     LLM_TENSOR_CLS_OUT,
     LLM_TENSOR_BSKCN_TV,
-    LLM_TENSOR_CROSS_ATTN_K_NORM,
-    LLM_TENSOR_CROSS_ATTN_K_PROJ,
-    LLM_TENSOR_CROSS_ATTN_O_PROJ,
-    LLM_TENSOR_CROSS_ATTN_Q_NORM,
-    LLM_TENSOR_CROSS_ATTN_Q_PROJ,
-    LLM_TENSOR_CROSS_ATTN_V_PROJ,
-    LLM_TENSOR_CROSS_ATTN_ATTN_GATE,
-    LLM_TENSOR_CROSS_ATTN_MLP_GATE,
     LLM_TENSOR_CONV1D,
     LLM_TENSOR_CONVNEXT_DW,
     LLM_TENSOR_CONVNEXT_NORM,
llama/llama.cpp/src/llama-batch.cpp @ 23125648

@@ -320,7 +320,6 @@ struct llama_batch llama_batch_get_one(
         /*n_tokens =*/ n_tokens,
         /*tokens   =*/ tokens,
         /*embd     =*/ nullptr,
-        /*n_embd   =*/ 0,
         /*pos      =*/ nullptr,
         /*n_seq_id =*/ nullptr,
         /*seq_id   =*/ nullptr,
@@ -333,7 +332,6 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
         /*n_tokens =*/ 0,
         /*tokens   =*/ nullptr,
         /*embd     =*/ nullptr,
-        /*n_embd   =*/ 0,
         /*pos      =*/ nullptr,
         /*n_seq_id =*/ nullptr,
         /*seq_id   =*/ nullptr,
@@ -342,7 +340,6 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
     if (embd) {
         batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
-        batch.n_embd = embd;
     } else {
         batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
     }
llama/llama.cpp/src/llama-context.cpp @ 23125648

@@ -514,7 +514,7 @@ float * llama_context::get_logits_ith(int32_t i) {
             throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
         }
 
-        return logits + j*model.hparams.n_vocab;
+        return logits + j*model.vocab.n_tokens();
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
 #ifndef NDEBUG
@@ -632,10 +632,6 @@ void llama_context::set_warmup(bool value) {
     cparams.warmup = value;
 }
 
-void llama_context::set_cross_attn(bool value) {
-    cparams.cross_attn = value;
-}
-
 void llama_context::set_adapter_lora(
             llama_adapter_lora * adapter,
             float scale) {
@@ -713,7 +709,7 @@ int llama_context::encode(llama_batch & inp_batch) {
     const int64_t n_embd = hparams.n_embd;
 
-    llama_sbatch sbatch = llama_sbatch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true);
+    llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
 
     const llama_ubatch ubatch = sbatch.split_simple(n_tokens);
@@ -867,9 +863,10 @@ int llama_context::decode(llama_batch & inp_batch) {
     const llama_batch & batch = batch_allocr.batch;
 
+    const auto & vocab   = model.vocab;
     const auto & hparams = model.hparams;
 
-    const int32_t n_vocab = hparams.n_vocab;
+    const int32_t n_vocab = vocab.n_tokens();
 
     const int64_t n_tokens_all = batch.n_tokens;
     const int64_t n_embd       = hparams.n_embd;
@@ -1093,7 +1090,7 @@ int llama_context::decode(llama_batch & inp_batch) {
     // make the outputs have the same order they had in the user-provided batch
     // note: this is mostly relevant for recurrent models atm
     if (!sorted_output) {
-        const uint32_t n_vocab = model.hparams.n_vocab;
+        const uint32_t n_vocab = model.vocab.n_tokens();
         const uint32_t n_embd  = model.hparams.n_embd;
 
         GGML_ASSERT((size_t) n_outputs == out_ids.size());
@@ -1148,11 +1145,12 @@ int llama_context::decode(llama_batch & inp_batch) {
 int32_t llama_context::output_reserve(int32_t n_outputs) {
     const auto & hparams = model.hparams;
+    const auto & vocab   = model.vocab;
 
     const int64_t n_outputs_max = std::max<int64_t>(n_outputs, n_seq_max());
 
     const auto n_batch = cparams.n_batch;
-    const auto n_vocab = hparams.n_vocab;
+    const auto n_vocab = vocab.n_tokens();
     const auto n_embd  = hparams.n_embd;
 
     // TODO: use a per-batch flag for logits presence instead
@@ -1687,7 +1685,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
     {
         LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
 
-        const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.hparams.n_vocab);
+        const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens());
 
         io.write(&logits_size, sizeof(logits_size));
@@ -2099,7 +2097,6 @@ llama_context_params llama_context_default_params() {
         /*.flash_attn =*/ false,
         /*.no_perf    =*/ true,
         /*.op_offload =*/ true,
-        /*.cross_attn =*/ false,
     };
 
     return result;
@@ -2225,10 +2222,6 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
     ctx->set_warmup(warmup);
 }
 
-void llama_set_cross_attention(struct llama_context * ctx, bool cross_attention) {
-    ctx->set_cross_attn(cross_attention);
-}
-
 void llama_synchronize(llama_context * ctx) {
     ctx->synchronize();
 }
llama/llama.cpp/src/llama-context.h @ 23125648

@@ -72,7 +72,6 @@ struct llama_context {
     void set_embeddings (bool value);
     void set_causal_attn(bool value);
     void set_warmup(bool value);
-    void set_cross_attn(bool value);
 
     void set_adapter_lora(
             llama_adapter_lora * adapter,
llama/llama.cpp/src/llama-cparams.h @ 23125648

@@ -31,7 +31,6 @@ struct llama_cparams {
     bool no_perf;
     bool warmup;
     bool op_offload;
-    bool cross_attn;
 
     enum llama_pooling_type pooling_type;
llama/llama.cpp/src/llama-graph.cpp @ 23125648

@@ -532,12 +532,6 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
     }
 }
 
-void llm_graph_input_cross_attn_state::set_input(const llama_ubatch * ubatch) {
-    if (ubatch->embd) {
-        ggml_backend_tensor_set(cross_attn_state, ubatch->embd, 0, ggml_nbytes(cross_attn_state));
-    }
-}
-
 //
 // llm_graph_context
 //
@@ -1520,25 +1514,6 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
     return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
 }
 
-ggml_tensor * llm_graph_context::build_inp_cross_attn_state() const {
-    const int64_t n_embd = hparams.n_embd;
-
-    auto inp = std::make_unique<llm_graph_input_cross_attn_state>();
-
-    ggml_tensor * cur = nullptr;
-
-    inp->cross_attn_state = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd, 1601, 4);
-    ggml_set_input(inp->cross_attn_state);
-
-    cur = inp->cross_attn_state;
-
-    cb(cur, "inp_cross_attn_state", -1);
-
-    res->add_input(std::move(inp));
-
-    return cur;
-}
-
 ggml_tensor * llm_graph_context::build_attn(
         llm_graph_input_attn_cross * inp,
         ggml_cgraph * gf,
llama/llama.cpp/src/llama-graph.h @ 23125648

@@ -87,7 +87,6 @@ public:
     ggml_tensor * tokens = nullptr; // I32 [n_batch]
     ggml_tensor * embd   = nullptr; // F32 [n_embd, n_batch]
-    ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061]
 };
 
 class llm_graph_input_pos : public llm_graph_input_i {
@@ -285,16 +284,6 @@ public:
     const llama_cross * cross = nullptr;
 };
 
-class llm_graph_input_cross_attn_state : public llm_graph_input_i {
-public:
-    llm_graph_input_cross_attn_state()          = default;
-    virtual ~llm_graph_input_cross_attn_state() = default;
-
-    void set_input(const llama_ubatch * ubatch) override;
-
-    ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061]
-};
-
 //
 // llm_graph_result
 //
@@ -506,7 +495,6 @@ struct llm_graph_context {
     ggml_tensor * build_inp_cls() const;
     ggml_tensor * build_inp_s_copy() const;
     ggml_tensor * build_inp_s_mask() const;
-    ggml_tensor * build_inp_cross_attn_state() const;
 
     ggml_tensor * build_inp_cross_embd() const;
     ggml_tensor * build_inp_pos_bucket_enc() const;
llama/llama.cpp/src/llama-hparams.cpp @ 23125648

@@ -85,7 +85,3 @@ bool llama_hparams::is_swa(uint32_t il) const {
 
     GGML_ABORT("fatal error");
 }
-
-bool llama_hparams::cross_attention_layers(uint32_t il) const {
-    return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
-}
llama/llama.cpp/src/llama-hparams.h @ 23125648

@@ -2,8 +2,6 @@
 #include "llama.h"
 
-#include <algorithm>
-
 #include <array>
 
 // bump if necessary
@@ -44,7 +42,6 @@ struct llama_hparams {
     uint32_t n_expert = 0;
     uint32_t n_expert_used = 0;
     uint32_t n_rel_attn_bkts = 0;
-    uint32_t n_vocab = 0;
 
     // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
     uint32_t n_embd_head_k_mla = 0;
@@ -59,7 +56,6 @@ struct llama_hparams {
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
 
     std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr = {};
-    std::array<uint32_t, LLAMA_MAX_LAYERS> cross_attn_layers;
 
     uint32_t n_layer_dense_lead = 0;
     uint32_t n_lora_q           = 0;
@@ -163,9 +159,6 @@ struct llama_hparams {
     // Block skip connection
     bool n_bskcn(uint32_t n, uint32_t il) const;
 
-    // cross attention layers
-    bool cross_attention_layers(uint32_t il) const;
-
     bool is_swa(uint32_t il) const;
 };
llama/llama.cpp/src/llama-kv-cache.cpp @ 23125648

@@ -100,16 +100,8 @@ llama_kv_cache_unified::llama_kv_cache_unified(
             throw std::runtime_error("failed to create ggml context for kv cache");
         }
 
-        ggml_tensor * k, * v;
-
-        // for cross attention layers
-        if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
-            k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
-            v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
-        } else {
-            k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
-            v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
-        }
+        ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
+        ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
 
         ggml_format_name(k, "cache_k_l%d", i);
         ggml_format_name(v, "cache_v_l%d", i);
         k_l.push_back(k);
@@ -459,7 +451,7 @@ void llama_kv_cache_unified::set_full() {
 llama_sbatch llama_kv_cache_unified::sbatch_init(
         const llama_batch & batch,
         bool logits_all) {
-    return llama_sbatch(batch, batch.n_embd, true, logits_all);
+    return llama_sbatch(batch, hparams.n_embd, true, logits_all);
 }
 
 llama_ubatch llama_kv_cache_unified::ubatch_next(
llama/llama.cpp/src/llama-model-loader.cpp @ 23125648

@@ -315,8 +315,6 @@ namespace GGUFMeta {
         return true;
     }
 
-    template bool llama_model_loader::get_arr<std::array<unsigned int, 512>>(enum llm_kv kid, std::array<unsigned int, 512>& result, bool required);
-
     template<typename T, size_t N_MAX>
     bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
         const int kid = gguf_find_key(meta.get(), key.c_str());
llama/llama.cpp/src/llama-model.cpp @ 23125648
@@ -433,7 +433,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     // get general kv
     ml.get_key(LLM_KV_GENERAL_NAME, name, false);
-    ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);
 
     // everything past this point is not vocab-related
     if (hparams.vocab_only) {
@@ -445,7 +444,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
     ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
     ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
-    ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false);
 
     if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
         ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
@@ -469,11 +467,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
     std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
     std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
-    std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1);
 
     ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
     ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
-    ml.get_arr(LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, hparams.cross_attn_layers, false);
 
     // n_head_kv is optional, default to n_head
     hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -526,7 +522,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
 
-    if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_MLLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
+    if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
         if (hparams.n_rot != hparams.n_embd_head_k) {
             throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
         }
@@ -589,16 +585,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 hparams.use_kq_norm = false;
             }
         } break;
-        case LLM_ARCH_MLLAMA:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-
-                switch (hparams.n_layer) {
-                    case 40: type = LLM_TYPE_11B; break;
-                    case 100: type = LLM_TYPE_90B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
        case LLM_ARCH_DECI:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1595,7 +1581,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     const int64_t n_embd_head_v = hparams.n_embd_head_v;
     const int64_t n_ff          = hparams.n_ff();
     const int64_t n_embd_gqa    = n_embd_v_gqa;
-    const int64_t n_vocab       = hparams.n_vocab;
+    const int64_t n_vocab       = vocab.n_tokens();
     const int64_t n_token_types = vocab.n_token_types();
     const int64_t n_rot         = hparams.n_rot;
     const int64_t n_expert      = hparams.n_expert;
@@ -1854,52 +1840,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     }
                 }
             } break;
case LLM_ARCH_MLLAMA:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0);
// output
{
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
// if output is NULL, init from the input tok embed
if (output == NULL) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
}
}
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
if (hparams.cross_attention_layers(i)) {
layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128}, 0);
layer.cross_attn_k_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024}, 0);
layer.cross_attn_o_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd}, 0);
layer.cross_attn_q_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128}, 0);
layer.cross_attn_q_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd}, 0);
layer.cross_attn_v_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024}, 0);
layer.cross_attn_attn_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1}, 0);
layer.cross_attn_mlp_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1}, 0);
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
} else {
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
}
}
} break;
            case LLM_ARCH_DECI:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

@@ -4816,246 +4756,6 @@ struct llm_build_llama : public llm_graph_context {
     }
 };
struct llm_build_mllama: public llm_graph_context {
llm_build_mllama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
ggml_tensor * inpCAS;
inpL = build_inp_embd(model.tok_embd);
inpCAS = build_inp_cross_attn_state();
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
// norm
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
if (hparams.cross_attention_layers(il)) {
if (!ubatch.embd && !cparams.cross_attn) {
continue;
}
// cross attention layer
ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_q_proj, cur);
cb(Qcur, "Qcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
cb(Qcur, "Qcur", il);
Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3));
cb(Qcur, "Qcur", il);
Qcur = build_norm(Qcur, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, il);
cb(Qcur, "Qcur", il);
ggml_tensor * Kcur, * Vcur;
if (ubatch.embd) {
Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS);
cb(Kcur, "Kcur", il);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 6404);
cb(Kcur, "Kcur", il);
Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
cb(Kcur, "Kcur", il);
Kcur = build_norm(Kcur, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, il);
cb(Kcur, "Kcur", il);
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self->k_l[il]));
Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS);
cb(Vcur, "Vcur", il);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, 6404);
cb(Vcur, "Vcur", il);
Vcur = ggml_permute(ctx0, Vcur, 0, 2, 1, 3);
cb(Vcur, "Vcur", il);
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self->v_l[il]));
} else {
Kcur = ggml_view_tensor(ctx0, kv_self->k_l[il]);
cb(Kcur, "Kcur (view)", il);
Vcur = ggml_view_tensor(ctx0, kv_self->v_l[il]);
cb(Vcur, "Vcur (view)", il);
}
struct ggml_tensor * kq = ggml_mul_mat(ctx0, Kcur, Qcur);
cb(kq, "kq", il);
// TODO: apply causal masks
struct ggml_tensor * kq_soft_max = ggml_soft_max_ext(ctx0, kq, nullptr, 1.f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
cb(kq_soft_max, "kq_soft_max", il);
Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, Vcur));
cb(Vcur, "Vcur", il);
struct ggml_tensor * kqv = ggml_mul_mat(ctx0, Vcur, kq_soft_max);
cb(kqv, "kqv", il);
struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
cb(kqv_merged, "kqv_merged", il);
cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens);
cb(cur, "kqv_merged_cont", il);
cur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_o_proj, cur);
cb(cur, "cur", il);
// TODO: do this in place once?
cur = ggml_mul(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_attn_gate));
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
// TODO: do this inplace once?
cur = ggml_add_inplace(ctx0, ggml_mul_inplace(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_mlp_gate)), ffn_inp);
cb(cur, "ffn_out", il);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
} else {
// self attention layer
// rope freq factors for llama3; may return nullptr for llama2 and other models
ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
}
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
if (model.layers[il].bk) {
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
}
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
if (model.layers[il].bv) {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il);
}
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
if (il == n_layer - 1) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
n_tokens = n_outputs;
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "ffn_out", il);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
}
cur = inpL;
cur = build_norm(cur,
model.output_norm, NULL,
LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}
};
struct llm_build_deci : public llm_graph_context {
    llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -13428,10 +13128,6 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_llama>(*this, params, gf);
             } break;
-        case LLM_ARCH_MLLAMA:
-            {
-                llm = std::make_unique<llm_build_mllama>(*this, params, gf);
-            } break;
         case LLM_ARCH_DECI:
             {
                 llm = std::make_unique<llm_build_deci>(*this, params, gf);
@@ -13793,7 +13489,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         // use what we call a normal RoPE, operating on pairs of consecutive head values
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_LLAMA4:
-        case LLM_ARCH_MLLAMA:
         case LLM_ARCH_DECI:
         case LLM_ARCH_BAICHUAN:
         case LLM_ARCH_STARCODER:
llama/llama.cpp/src/llama-model.h @ 23125648

@@ -11,7 +11,6 @@
 #include <string>
 #include <unordered_map>
 #include <vector>
-#include <stdexcept>
 
 struct llama_cparams;
 struct llama_ubatch;
@@ -75,7 +74,6 @@ enum llm_type {
     LLM_TYPE_40B,
     LLM_TYPE_65B,
     LLM_TYPE_70B,
-    LLM_TYPE_90B,
     LLM_TYPE_236B,
     LLM_TYPE_290B,
     LLM_TYPE_314B,
@@ -320,16 +318,6 @@ struct llama_layer {
     struct ggml_tensor * bskcn_tv = nullptr;
 
-    // cross attention
-    struct ggml_tensor * cross_attn_k_norm    = nullptr;
-    struct ggml_tensor * cross_attn_k_proj    = nullptr;
-    struct ggml_tensor * cross_attn_o_proj    = nullptr;
-    struct ggml_tensor * cross_attn_q_norm    = nullptr;
-    struct ggml_tensor * cross_attn_q_proj    = nullptr;
-    struct ggml_tensor * cross_attn_v_proj    = nullptr;
-    struct ggml_tensor * cross_attn_attn_gate = nullptr;
-    struct ggml_tensor * cross_attn_mlp_gate  = nullptr;
-
     struct llama_layer_posnet posnet;
 
     struct llama_layer_convnext convnext;