Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
2561ed01
Unverified
Commit
2561ed01
authored
Sep 03, 2024
by
Yineng Zhang
Committed by
GitHub
Sep 03, 2024
Browse files
feat: update nightly gsm8k eval (#1304)
parent
99994427
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
100 additions
and
38 deletions
+100
-38
.github/workflows/nightly-eval.yml
.github/workflows/nightly-eval.yml
+7
-38
python/sglang/test/test_utils.py
python/sglang/test/test_utils.py
+4
-0
test/srt/test_nightly_gsm8k_eval.py
test/srt/test_nightly_gsm8k_eval.py
+89
-0
No files found.
.github/workflows/nightly-eval.yml
View file @
2561ed01
...
...
@@ -15,9 +15,9 @@ concurrency:
cancel-in-progress
:
true
jobs
:
meta-llama-31-8b-instruct
:
nightly-eval-2-gpu
:
if
:
github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
runs-on
:
1
-gpu-runner
runs-on
:
2
-gpu-runner
steps
:
-
name
:
Checkout code
uses
:
actions/checkout@v3
...
...
@@ -25,42 +25,11 @@ jobs:
-
name
:
Install dependencies
run
:
|
pip install --upgrade pip
pip install -e "python[
dev
]"
pip install -e "python[
all
]"
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
git clone https://github.com/EleutherAI/lm-evaluation-harness
pushd lm-evaluation-harness
pip install -e .
pip install lm_eval[api]
popd
-
name
:
Run eval
timeout-minutes
:
2
0
-
name
:
Nightly gsm8k Accuracy
timeout-minutes
:
6
0
run
:
|
python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --disable-radix-cache &
echo "Waiting for server to start..."
for i in {1..120}; do
if curl -s http://127.0.0.1:30000/health; then
echo "Server is up!"
break
fi
if [ $i -eq 120 ]; then
echo "Server failed to start within 120 seconds"
exit 1
fi
sleep 1
done
lm_eval --model local-completions --tasks gsm8k --model_args model=meta-llama/Meta-Llama-3.1-8B-Instruct,base_url=http://127.0.0.1:30000/v1/completions,num_concurrent=128,max_retries=3,tokenized_requests=False
echo "Stopping server..."
kill -9 $(ps aux | grep sglang | grep Meta-Llama-3.1-8B-Instruct | grep -v grep | awk '{print $2}')
finish
:
needs
:
[
meta-llama-31-8b-instruct
]
runs-on
:
ubuntu-latest
steps
:
-
name
:
Finish
run
:
echo "This is an empty step to ensure that all jobs are completed."
cd test/srt
python3 test_nightly_gsm8k_eval.py
python/sglang/test/test_utils.py
View file @
2561ed01
...
...
@@ -23,6 +23,10 @@ from sglang.utils import get_exception_traceback
DEFAULT_MODEL_NAME_FOR_TEST
=
"meta-llama/Meta-Llama-3.1-8B-Instruct"
DEFAULT_MOE_MODEL_NAME_FOR_TEST
=
"mistralai/Mixtral-8x7B-Instruct-v0.1"
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
=
600
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1
=
"meta-llama/Meta-Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2
=
"meta-llama/Meta-Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1
=
"neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2
=
"neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8"
if
os
.
getenv
(
"SGLANG_IS_IN_CI"
,
"false"
)
==
"true"
:
DEFAULT_PORT_FOR_SRT_TEST_RUNNER
=
5157
...
...
test/srt/test_nightly_gsm8k_eval.py
0 → 100644
View file @
2561ed01
import
unittest
from
types
import
SimpleNamespace
from
sglang.srt.utils
import
kill_child_process
from
sglang.test.run_eval
import
run_eval
from
sglang.test.test_utils
import
(
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1
,
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2
,
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1
,
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2
,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
DEFAULT_URL_FOR_TEST
,
popen_launch_server
,
)
def parse_models(model_string):
    """Split a comma-separated model list into a list of clean model names.

    Whitespace around each name is stripped, and empty fragments (e.g. from
    trailing or doubled commas) are discarded.
    """
    stripped = (fragment.strip() for fragment in model_string.split(","))
    return [name for name in stripped if name]
class TestEvalAccuracyLarge(unittest.TestCase):
    """Nightly accuracy suite: launch each configured model as a local server
    and check its mgsm_en score against a loose lower bound."""

    @classmethod
    def setUpClass(cls):
        # Each group is (models, is_fp8, is_tp2); the flags select the server
        # launch arguments used for every model in that group.
        cls.model_groups = [
            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1), True, False),
            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
        ]
        cls.base_url = DEFAULT_URL_FOR_TEST

    def setUp(self):
        # Handle to the currently running server process, if any.
        self.process = None

    def tearDown(self):
        """Kill the launched server (if any) and forget the handle.

        Resetting ``self.process`` to None makes this method idempotent: the
        test body calls it manually after each model, and unittest calls it
        again automatically — without the reset the same pid would be killed
        twice, which could hit an unrelated recycled pid.
        """
        if self.process:
            kill_child_process(self.process.pid)
            self.process = None

    def launch_server(self, model, is_fp8, is_tp2):
        """Start a server for *model*, storing the process on ``self.process``.

        The extra CLI arguments are chosen per model family: some FP8
        checkpoints need an explicit quantization flag, some only an FP8 KV
        cache, and DeepSeek models get MLA enabled.
        """
        other_args = ["--log-level-http", "warning", "--trust-remote-code"]
        if is_fp8:
            if "Llama-3" in model or "gemma-2" in model:
                # compressed-tensors checkpoints: quantization is auto-detected,
                # only the KV cache dtype needs to be set.
                other_args.extend(["--kv-cache-dtype", "fp8_e5m2"])
            elif "Qwen2-72B-Instruct-FP8" in model:
                # Workaround for a known bug: skip the fp8_e5m2 KV cache here.
                other_args.extend(["--quantization", "fp8"])
            else:
                other_args.extend(
                    ["--quantization", "fp8", "--kv-cache-dtype", "fp8_e5m2"]
                )
        if is_tp2:
            other_args.extend(["--tp", "2"])
        if "DeepSeek" in model:
            other_args.append("--enable-mla")
        self.process = popen_launch_server(
            model,
            self.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=other_args,
        )

    def test_mgsm_en_all_models(self):
        """Run the mgsm_en eval against every model in every group."""
        for model_group, is_fp8, is_tp2 in self.model_groups:
            for model in model_group:
                with self.subTest(model=model):
                    try:
                        self.launch_server(model, is_fp8, is_tp2)
                        args = SimpleNamespace(
                            base_url=self.base_url,
                            model=model,
                            eval_name="mgsm_en",
                            num_examples=None,
                            num_threads=1024,
                        )
                        metrics = run_eval(args)
                        print(
                            f"{'=' * 42}\n{model} - metrics={metrics} score={metrics['score']}\n{'=' * 42}\n"
                        )
                        # Loose threshold — we only guard against gross regressions.
                        # Use unittest's assertion so it survives `python -O` and
                        # reports through the unittest machinery.
                        self.assertGreater(
                            metrics["score"],
                            0.5,
                            f"score={metrics['score']} <= 0.5",
                        )
                    finally:
                        # Always stop this model's server before launching the
                        # next one, even when the eval fails.
                        self.tearDown()
# Allow running this nightly eval suite directly:
#   python3 test_nightly_gsm8k_eval.py
if __name__ == "__main__":
    unittest.main()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment