sglang · Commit 4d1c9db6 (Unverified)
Authored May 10, 2025 by Yineng Zhang; committed by GitHub on May 10, 2025.
feat: support loogle eval (#6190)
Parent: 17c36c55

Showing 3 changed files with 158 additions and 1 deletion:

  python/sglang/README.md             +1    -1
  python/sglang/eval/llama3_eval.py   +0    -0   (file moved)
  python/sglang/eval/loogle_eval.py   +157  -0
python/sglang/README.md

  # Code Structures

+ - `eval`: The evaluation utilities.
  - `lang`: The frontend language.
  - `srt`: The backend engine for running local models. (SRT = SGLang Runtime).
  - `test`: The test utilities.

  ...

@@ -11,6 +12,5 @@
  - `check_env.py`: Check the environment variables and dependencies.
  - `global_config.py`: The global configs and constants.
  - `launch_server.py`: The entry point for launching the local server.
- - `llama3_eval.py`: Evaluation of Llama 3 using the Meta Llama dataset.
  - `utils.py`: Common utilities.
  - `version.py`: Version info.
python/sglang/llama3_eval.py → python/sglang/eval/llama3_eval.py

  File moved.
python/sglang/eval/loogle_eval.py (new file, 0 → 100644)

import argparse
import asyncio
import os
import pickle
from pathlib import Path
from typing import List

import openai
import torch
from bert_score import BERTScorer
from datasets import load_dataset
from tqdm import tqdm


def get_client(api_url: str) -> openai.AsyncOpenAI:
    if os.getenv("OPENAI_API_KEY") is None:
        os.environ["OPENAI_API_KEY"] = "EMPTY"
    return openai.AsyncOpenAI(base_url=api_url)


def get_dataset():
    return load_dataset("bigai-nlco/LooGLE", "longdep_qa", split="test")


async def fetch_response(
    client: openai.AsyncOpenAI,
    context: str,
    question: str,
    semaphore: asyncio.Semaphore,
    index: int,
    model: str,
    output_dir: Path,
):
    output_file = output_dir / f"response_{index}.pkl"
    if output_file.exists():
        return

    prompt = (
        "Please answer the question based on the long texts below.\n"
        f"{context}\n"
        f"Question: {question}\n"
        "Answer:"
    )
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]

    async with semaphore:
        try:
            response = await client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=0.0,
                max_tokens=512,
            )
        except openai.BadRequestError as e:
            with open(output_file, "wb") as f:
                pickle.dump({"error": str(e)}, f)
            return

    with open(output_file, "wb") as f:
        pickle.dump(response, f)


async def benchmark(args):
    dataset = get_dataset()
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    client = get_client(args.api_url)
    semaphore = asyncio.Semaphore(args.max_concurrency)

    tasks: List[asyncio.Task] = []
    for idx, ex in enumerate(dataset):
        tasks.append(
            asyncio.create_task(
                fetch_response(
                    client,
                    ex["context"],
                    ex["question"],
                    semaphore,
                    idx,
                    args.model,
                    output_dir,
                )
            )
        )

    for _ in tqdm(
        asyncio.as_completed(tasks), total=len(tasks), desc="Running benchmark"
    ):
        await _


def analyse(args):
    dataset = get_dataset()
    output_dir = Path(args.output_dir)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    scorer = BERTScorer(lang="en", device=device)

    hyps: List[str] = []
    refs: List[str] = []

    for idx, ex in enumerate(tqdm(dataset, desc="Loading responses")):
        pkl_file = output_dir / f"response_{idx}.pkl"
        if not pkl_file.exists():
            raise FileNotFoundError(pkl_file)

        response = pickle.load(open(pkl_file, "rb"))
        if isinstance(response, dict) and "error" in response:
            continue

        hyps.append(response.choices[0].message.content.strip())
        refs.append(ex["answer"])

    if not hyps:
        print("No valid responses to score!")
        return

    batch_size = 64
    all_f1: List[float] = []
    for i in tqdm(range(0, len(hyps), batch_size), desc="Scoring batches"):
        h_batch = hyps[i : i + batch_size]
        r_batch = refs[i : i + batch_size]
        _, _, f1_scores = scorer.score(h_batch, r_batch, verbose=False)
        all_f1.extend([float(x) for x in f1_scores])

    avg = sum(all_f1) / len(all_f1)
    print(f"Average BERTScore (F1): {avg:.2%}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Run benchmark and evaluation in one go."
    )
    parser.add_argument(
        "--api-url",
        default="http://127.0.0.1:30000/v1",
        help="OpenAI‑compatible API base URL",
    )
    parser.add_argument(
        "--model",
        default="meta-llama/Llama-4-Maverick-17B-128E-Instruct",
        help="Model name or ID",
    )
    parser.add_argument(
        "--max-concurrency", type=int, default=144, help="Maximum concurrent requests"
    )
    parser.add_argument(
        "--output-dir", default="tmp-output-dir", help="Directory for cached responses"
    )

    args = parser.parse_args()

    asyncio.run(benchmark(args))
    analyse(args)
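For reference, a minimal sketch of how the new script might be invoked, using only the flags and defaults defined in its argument parser. The server launch line is an assumption (any OpenAI-compatible endpoint serving the chosen model at the given URL would work; the model path here just mirrors the script's default):

  # Assumption: serve the model with SGLang's local server first.
  python -m sglang.launch_server --model-path meta-llama/Llama-4-Maverick-17B-128E-Instruct --port 30000

  # Then run the LooGLE benchmark and BERTScore analysis in one go.
  python python/sglang/eval/loogle_eval.py \
      --api-url http://127.0.0.1:30000/v1 \
      --model meta-llama/Llama-4-Maverick-17B-128E-Instruct \
      --max-concurrency 144 \
      --output-dir tmp-output-dir

The script caches each response as a pickle in the output directory, so an interrupted run can be resumed by re-running the same command.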