Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
4f8c3aea
Unverified
Commit
4f8c3aea
authored
Nov 22, 2024
by
Yineng Zhang
Committed by
GitHub
Nov 22, 2024
Browse files
minor: update gsm8k threshold (#2125)
parent
2369e882
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing 3 changed files with 43 additions and 33 deletions (+43, −33).
.github/workflows/nightly-eval.yml
.github/workflows/nightly-eval.yml
+4
-4
python/sglang/test/test_utils.py
python/sglang/test/test_utils.py
+16
-12
test/srt/test_nightly_gsm8k_eval.py
test/srt/test_nightly_gsm8k_eval.py
+23
-17
No files found.
.github/workflows/nightly-eval.yml
View file @
4f8c3aea
...
@@ -27,14 +27,14 @@ jobs:
...
@@ -27,14 +27,14 @@ jobs:
bash scripts/ci_install_dependency.sh
bash scripts/ci_install_dependency.sh
pip install --upgrade "evalplus[vllm] @ git+https://github.com/evalplus/evalplus"
pip install --upgrade "evalplus[vllm] @ git+https://github.com/evalplus/evalplus"
-      - name: Test human eval
+      - name: Test gsm8k
        timeout-minutes: 120
        run: |
          cd test/srt
-          python3 test_nightly_human_eval.py
+          python3 test_nightly_gsm8k_eval.py

-      - name: Test gsm8k
+      - name: Test human eval
        timeout-minutes: 120
        run: |
          cd test/srt
-          python3 test_nightly_gsm8k_eval.py
+          python3 test_nightly_human_eval.py
python/sglang/test/test_utils.py
View file @
4f8c3aea
...
@@ -439,13 +439,17 @@ def popen_launch_server(
...
@@ -439,13 +439,17 @@ def popen_launch_server(
     process = subprocess.Popen(command, stdout=None, stderr=None, env=env)

     start_time = time.time()
-    while time.time() - start_time < timeout:
-        try:
-            headers = {
-                "Content-Type": "application/json; charset=utf-8",
-                "Authorization": f"Bearer {api_key}",
-            }
-            response = requests.get(f"{base_url}/health_generate", headers=headers)
-            if response.status_code == 200:
-                return process
-        except requests.RequestException:
+    with requests.Session() as session:
+        while time.time() - start_time < timeout:
+            try:
+                headers = {
+                    "Content-Type": "application/json; charset=utf-8",
+                    "Authorization": f"Bearer {api_key}",
+                }
+                response = session.get(
+                    f"{base_url}/health_generate",
+                    headers=headers,
+                )
+                if response.status_code == 200:
+                    return process
+            except requests.RequestException:
...
...
test/srt/test_nightly_gsm8k_eval.py
View file @
4f8c3aea
 import json
 import os
+import subprocess
 import unittest
+import warnings
 from datetime import datetime
 from types import SimpleNamespace
...
@@ -18,23 +20,23 @@ from sglang.test.test_utils import (
...
@@ -18,23 +20,23 @@ from sglang.test.test_utils import (
)
)
 MODEL_SCORE_THRESHOLDS = {
-    "meta-llama/Llama-3.1-8B-Instruct": 0.8316,
+    "meta-llama/Llama-3.1-8B-Instruct": 0.83,
-    "mistralai/Mistral-7B-Instruct-v0.3": 0.5861,
+    "mistralai/Mistral-7B-Instruct-v0.3": 0.58,
-    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.8672,
+    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.84,
-    "google/gemma-2-27b-it": 0.9227,
+    "google/gemma-2-27b-it": 0.92,
-    "meta-llama/Llama-3.1-70B-Instruct": 0.9623,
+    "meta-llama/Llama-3.1-70B-Instruct": 0.96,
-    "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.6415,
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.64,
-    "Qwen/Qwen2-57B-A14B-Instruct": 0.8791,
+    "Qwen/Qwen2-57B-A14B-Instruct": 0.87,
-    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.8672,
+    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.84,
-    "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.5544,
+    "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.54,
-    "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.8356,
+    "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.83,
-    "neuralmagic/gemma-2-2b-it-FP8": 0.6059,
+    "neuralmagic/gemma-2-2b-it-FP8": 0.60,
-    "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.9504,
+    "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.95,
-    "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.6138,
+    "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.61,
-    "neuralmagic/Qwen2-72B-Instruct-FP8": 0.9504,
+    "neuralmagic/Qwen2-72B-Instruct-FP8": 0.95,
-    "neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.8197,
+    "neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.82,
-    "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.8395,
+    "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.84,
-    "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.8435,
+    "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.84,
 }
...
@@ -65,6 +67,7 @@ def launch_server(base_url, model, is_fp8, is_tp2):
...
@@ -65,6 +67,7 @@ def launch_server(base_url, model, is_fp8, is_tp2):
         base_url,
         timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
         other_args=other_args,
+        return_stdout_stderr=(subprocess.DEVNULL, subprocess.DEVNULL),
     )
     return process
...
@@ -132,6 +135,9 @@ class TestEvalAccuracyLarge(unittest.TestCase):
...
@@ -132,6 +135,9 @@ class TestEvalAccuracyLarge(unittest.TestCase):
         kill_child_process(self.process.pid, include_self=True)

     def test_mgsm_en_all_models(self):
+        warnings.filterwarnings(
+            "ignore", category=ResourceWarning, message="unclosed.*socket"
+        )
         is_first = True
         all_results = []
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment