Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
a68cb201
Unverified
Commit
a68cb201
authored
Sep 21, 2024
by
Ke Bao
Committed by
GitHub
Sep 21, 2024
Browse files
Fix triton head num (#1482)
parent
014982b5
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
54 additions
and
1 deletion
+54
-1
.github/workflows/pr-test.yml
.github/workflows/pr-test.yml
+6
-0
python/sglang/srt/layers/attention_backend.py
python/sglang/srt/layers/attention_backend.py
+3
-1
python/sglang/test/test_utils.py
python/sglang/test/test_utils.py
+1
-0
test/srt/test_mla.py
test/srt/test_mla.py
+44
-0
No files found.
.github/workflows/pr-test.yml
View file @
a68cb201
...
...
@@ -233,6 +233,12 @@ jobs:
run
:
|
cd test/srt
python3 test_moe_eval_accuracy_large.py
-
name
:
Evaluate MLA Accuracy (TP=2)
timeout-minutes
:
10
run
:
|
cd test/srt
python3 -m unittest test_mla.TestMLA.test_mmlu
finish
:
needs
:
[
...
...
python/sglang/srt/layers/attention_backend.py
View file @
a68cb201
...
...
@@ -346,7 +346,9 @@ class TritonAttnBackend(AttentionBackend):
self
.
decode_attention_fwd
=
decode_attention_fwd
self
.
extend_attention_fwd
=
extend_attention_fwd
self
.
num_head
=
model_runner
.
model_config
.
num_attention_heads
self
.
num_head
=
(
model_runner
.
model_config
.
num_attention_heads
//
model_runner
.
tp_size
)
if
global_server_args_dict
.
get
(
"triton_attention_reduce_in_fp32"
,
False
):
self
.
reduce_dtype
=
torch
.
float32
...
...
python/sglang/test/test_utils.py
View file @
a68cb201
...
...
@@ -25,6 +25,7 @@ from sglang.utils import get_exception_traceback
# Single-model identifiers that individual test modules import by name.
DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"

# Upper bound passed as ``timeout`` when launching a test server
# (presumably seconds -- confirm against popen_launch_server).
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600

# Comma-separated model lists for the nightly evaluation runs; the TP1/TP2
# suffix presumably names the tensor-parallel size each list is run with.
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Meta-Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Meta-Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
...
...
test/srt/test_mla.py
0 → 100644
View file @
a68cb201
import
unittest
from
types
import
SimpleNamespace
from
sglang.srt.utils
import
kill_child_process
from
sglang.test.run_eval
import
run_eval
from
sglang.test.test_utils
import
(
DEFAULT_MLA_MODEL_NAME_FOR_TEST
,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
DEFAULT_URL_FOR_TEST
,
popen_launch_server
,
)
class TestMLA(unittest.TestCase):
    """Accuracy smoke test against a server launched with ``--tp 2``.

    A single server process is started for the whole class using the model
    named by ``DEFAULT_MLA_MODEL_NAME_FOR_TEST`` and is torn down once all
    tests in the class have run.
    """

    @classmethod
    def setUpClass(cls):
        """Launch the shared server and remember its model, URL and process."""
        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=["--tp", "2", "--trust-remote-code"],
        )

    @classmethod
    def tearDownClass(cls):
        """Stop the server started in ``setUpClass``."""
        # Presumably this also reaps the workers spawned by the server;
        # confirm against kill_child_process.
        kill_child_process(cls.process.pid)

    def test_mmlu(self):
        """Run the ``mmlu`` eval on 64 examples and require a score >= 0.5."""
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mmlu",
            num_examples=64,
            num_threads=32,
        )

        metrics = run_eval(args)

        # Use the unittest assertion rather than a bare ``assert``: it is not
        # stripped under ``python -O`` and reports both values on failure.
        self.assertGreaterEqual(metrics["score"], 0.5)
# Allow running this module directly as a script.
if __name__ == "__main__":
    unittest.main()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment