Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinilm
Commits
9038f7ab
Commit
9038f7ab
authored
Aug 20, 2025
by
wooway777
Browse files
issue/29 - fixing tensor length mismatches across requests
parent
c41055b5
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
4 additions
and
4 deletions
+4
-4
src/models/jiuge/jiuge.cpp
src/models/jiuge/jiuge.cpp
+4
-4
No files found.
src/models/jiuge/jiuge.cpp
View file @
9038f7ab
...
@@ -217,17 +217,17 @@ void inferDeviceBatch(const JiugeMeta &meta, DeviceResource &rsrc,
...
@@ -217,17 +217,17 @@ void inferDeviceBatch(const JiugeMeta &meta, DeviceResource &rsrc,
rearrange
(
kv_caches
[
req
]
->
k
[
idev
][
layer
]
->
slice
(
0
,
past_len
,
seq_len
),
k
);
rearrange
(
kv_caches
[
req
]
->
k
[
idev
][
layer
]
->
slice
(
0
,
past_len
,
seq_len
),
k
);
rearrange
(
kv_caches
[
req
]
->
v
[
idev
][
layer
]
->
slice
(
0
,
past_len
,
seq_len
),
v
);
rearrange
(
kv_caches
[
req
]
->
v
[
idev
][
layer
]
->
slice
(
0
,
past_len
,
seq_len
),
v
);
// qk
// qk
rearrange
(
q_rearrange
,
q
);
rearrange
(
q_rearrange
->
slice
(
2
,
0
,
seq_len
)
,
q
);
auto
qk_gemm
=
qk_buf
->
view
({
nkvh
,
ngroup
*
seq_len
,
total_len
});
auto
qk_gemm
=
qk_buf
->
view
({
nkvh
,
ngroup
*
seq_len
,
total_len
});
auto
k_gemm
=
kv_caches
[
req
]
->
k
[
idev
][
layer
]
->
slice
(
0
,
0
,
total_len
)
->
permute
({
1
,
2
,
0
});
auto
k_gemm
=
kv_caches
[
req
]
->
k
[
idev
][
layer
]
->
slice
(
0
,
0
,
total_len
)
->
permute
({
1
,
2
,
0
});
linear
(
qk_gemm
,
rearrange_q_buf
,
k_gemm
,
1.
f
/
float
(
sqrt
(
dh
)),
0.
f
,
nullptr
,
nullptr
);
linear
(
qk_gemm
,
rearrange_q_buf
->
slice
(
1
,
0
,
ngroup
*
seq_len
)
,
k_gemm
,
1.
f
/
float
(
sqrt
(
dh
)),
0.
f
,
nullptr
,
nullptr
);
// softmax
// softmax
auto
qk_softmax
=
qk_buf
->
view
({
nh
,
seq_len
,
total_len
});
auto
qk_softmax
=
qk_buf
->
view
({
nh
,
seq_len
,
total_len
});
causalSoftmax
(
qk_softmax
,
qk_softmax
);
causalSoftmax
(
qk_softmax
,
qk_softmax
);
auto
v_gemm
=
kv_caches
[
req
]
->
v
[
idev
][
layer
]
->
slice
(
0
,
0
,
total_len
)
->
permute
({
1
,
0
,
2
});
auto
v_gemm
=
kv_caches
[
req
]
->
v
[
idev
][
layer
]
->
slice
(
0
,
0
,
total_len
)
->
permute
({
1
,
0
,
2
});
linear
(
attn_val_buf
,
qk_gemm
,
v_gemm
,
1.
f
,
0.
f
,
nullptr
,
nullptr
);
linear
(
attn_val_buf
->
slice
(
1
,
0
,
ngroup
*
seq_len
)
,
qk_gemm
,
v_gemm
,
1.
f
,
0.
f
,
nullptr
,
nullptr
);
// rearrange attn val
// rearrange attn val
rearrange
(
o
,
attn_val_gemm
);
rearrange
(
o
,
attn_val_gemm
->
slice
(
2
,
0
,
seq_len
)
);
token_offset
+=
seq_len
;
token_offset
+=
seq_len
;
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment