change / sglang · Commits · f239268f

Commit f239268f (unverified)
Authored Nov 19, 2024 by Yineng Zhang; committed via GitHub on Nov 19, 2024.

minor: update gsm8k eval (#2091)

Parent: 929c7621
Showing 1 changed file with 84 additions and 4 deletions.

test/srt/test_nightly_gsm8k_eval.py (+84, -4)
import json
import os
import unittest
from datetime import datetime
from types import SimpleNamespace

from sglang.srt.utils import kill_child_process

...
@@ -14,6 +17,26 @@ from sglang.test.test_utils import (
    popen_launch_server,
)

MODEL_SCORE_THRESHOLDS = {
    "meta-llama/Llama-3.1-8B-Instruct": 0.8316,
    "mistralai/Mistral-7B-Instruct-v0.3": 0.5861,
    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.8672,
    "google/gemma-2-27b-it": 0.9227,
    "meta-llama/Llama-3.1-70B-Instruct": 0.9623,
    "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.6415,
    "Qwen/Qwen2-57B-A14B-Instruct": 0.8791,
    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.8672,
    "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.5544,
    "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.8356,
    "neuralmagic/gemma-2-2b-it-FP8": 0.6059,
    "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.9504,
    "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.6138,
    "neuralmagic/Qwen2-72B-Instruct-FP8": 0.9504,
    "neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.8197,
    "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.8395,
    "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.8435,
}


def parse_models(model_string):
    return [model.strip() for model in model_string.split(",") if model.strip()]
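A quick illustration of parse_models; the input string is invented for the example:

# Illustration only: parse_models trims whitespace and drops empty entries.
print(parse_models(" meta-llama/Llama-3.1-8B-Instruct, ,google/gemma-2-27b-it "))
# -> ['meta-llama/Llama-3.1-8B-Instruct', 'google/gemma-2-27b-it']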
...
@@ -23,10 +46,8 @@ def launch_server(base_url, model, is_fp8, is_tp2):
    other_args = ["--log-level-http", "warning", "--trust-remote-code"]
    if is_fp8:
        if "Llama-3" in model or "gemma-2" in model:
            # compressed-tensors
            other_args.extend(["--kv-cache-dtype", "fp8_e5m2"])
        elif "Qwen2-72B-Instruct-FP8" in model:
            # bug
            other_args.extend(["--quantization", "fp8"])
        else:
            other_args.extend(["--quantization", "fp8", "--kv-cache-dtype", "fp8_e5m2"])
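To make the branching concrete, here is a trace of which extra flags each FP8 family ends up with. This sketch mirrors the branches above rather than calling launch_server, and the reading of the terse "# compressed-tensors" and "# bug" comments is an interpretation, not stated in the diff:

# Expected extra args per representative model (mirrors the branches above):
#   Llama-3 / gemma-2 FP8 checkpoints (compressed-tensors): KV-cache dtype only.
#   Qwen2-72B-Instruct-FP8: quantization flag only (fp8_e5m2 KV cache avoided, per the "# bug" note).
#   All other FP8 models: both flags.
expected = {
    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": ["--kv-cache-dtype", "fp8_e5m2"],
    "neuralmagic/Qwen2-72B-Instruct-FP8": ["--quantization", "fp8"],
    "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": ["--quantization", "fp8", "--kv-cache-dtype", "fp8_e5m2"],
}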
...
@@ -48,6 +69,49 @@ def launch_server(base_url, model, is_fp8, is_tp2):
    return process


def write_results_to_json(model, metrics, mode="a"):
    result = {
        "timestamp": datetime.now().isoformat(),
        "model": model,
        "metrics": metrics,
        "score": metrics["score"],
    }

    existing_results = []
    if mode == "a" and os.path.exists("results.json"):
        try:
            with open("results.json", "r") as f:
                existing_results = json.load(f)
        except json.JSONDecodeError:
            existing_results = []

    if isinstance(existing_results, list):
        existing_results.append(result)
    else:
        existing_results = [result]

    with open("results.json", "w") as f:
        json.dump(existing_results, f, indent=2)


def check_model_scores(results):
    failed_models = []

    for model, score in results:
        threshold = MODEL_SCORE_THRESHOLDS.get(model)
        if threshold is None:
            print(f"Warning: No threshold defined for model {model}")
            continue

        if score < threshold:
            failed_models.append(
                f"\nScore Check Failed: {model}\n"
                f"Model {model} score ({score:.4f}) is below threshold ({threshold:.4f})"
            )

    if failed_models:
        raise AssertionError("\n".join(failed_models))


class TestEvalAccuracyLarge(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
...
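A minimal standalone exercise of write_results_to_json and check_model_scores, assuming both helpers (and MODEL_SCORE_THRESHOLDS) are in scope; the metrics are fabricated for the example:

# Sketch with made-up numbers: mode="w" starts a fresh results.json, "a" appends.
fake_metrics = {"score": 0.84}
write_results_to_json("meta-llama/Llama-3.1-8B-Instruct", fake_metrics, mode="w")
check_model_scores([("meta-llama/Llama-3.1-8B-Instruct", 0.84)])  # 0.84 >= 0.8316: passes
try:
    check_model_scores([("meta-llama/Llama-3.1-8B-Instruct", 0.80)])
except AssertionError as e:
    print(e)  # 0.80 < 0.8316: reported as a failed model

Collecting every miss into one AssertionError means a nightly run reports all regressed models at once instead of stopping at the first.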
@@ -68,6 +132,9 @@ class TestEvalAccuracyLarge(unittest.TestCase):
        kill_child_process(self.process.pid, include_self=True)

    def test_mgsm_en_all_models(self):
        is_first = True
        all_results = []

        for model_group, is_fp8, is_tp2 in self.model_groups:
            for model in model_group:
                with self.subTest(model=model):
...
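The subTest wrapper is what keeps one model's failure from aborting the whole sweep. A standalone toy demonstration of that unittest behavior (not part of the diff):

import unittest

class SubTestDemo(unittest.TestCase):
    def test_each_value(self):
        for n in (1, 2, 3):
            with self.subTest(n=n):
                self.assertLess(n, 3)  # n=3 is recorded as a failure; n=1 and n=2 still pass

if __name__ == "__main__":
    unittest.main()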
@@ -85,11 +152,24 @@ class TestEvalAccuracyLarge(unittest.TestCase):
                    print(
                        f"{'=' * 42}\n"
                        f"{model} - metrics={metrics} score={metrics['score']}\n"
                        f"{'=' * 42}\n"
                    )

                    # loose threshold check
                    assert metrics["score"] > 0.5, f"score={metrics['score']} <= 0.5"

                    write_results_to_json(model, metrics, "w" if is_first else "a")
                    is_first = False

                    all_results.append((model, metrics["score"]))

        self.tearDown()

        try:
            with open("results.json", "r") as f:
                print("\nFinal Results from results.json:")
                print(json.dumps(json.load(f), indent=2))
        except Exception as e:
            print(f"Error reading results.json: {e}")

        # Check all scores after collecting all results
        check_model_scores(all_results)


if __name__ == "__main__":
    unittest.main()
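Since the file ends with the standard unittest entry point, it can be run directly with python3 test/srt/test_nightly_gsm8k_eval.py (assuming a working sglang install and access to the listed models).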