Unverified commit a1d03892 (sglang), authored Sep 09, 2025 by Sundara Raman Ramachandran, committed by GitHub on Sep 10, 2025
[Benchmark] Prefil-only benchmark scripts (#10240)
parent dccf52f9
Showing 3 changed files with 1153 additions and 0 deletions
benchmark/prefill_only/bench_embeddings.py  +148  -0
benchmark/prefill_only/bench_score.py       +192  -0
benchmark/prefill_only/util.py              +813  -0
benchmark/prefill_only/bench_embeddings.py  (new file, 0 → 100644)
"""
SGLang Embeddings Benchmark Script
This script benchmarks SGLang's /v1/embeddings API performance using HTTP requests.
Features:
- HTTP-only implementation
- Uses /v1/embeddings API endpoint directly
- Configurable RPS, duration, and batch sizes
- Progress tracking and detailed metrics
- Poisson and constant request distributions
Usage:
- Update configuration variables at the top of the file
- Ensure SGLang server is running on the configured HTTP_URL
- Run: python bench_embeddings.py
"""
import
asyncio
import
logging
from
transformers
import
AutoTokenizer
from
util
import
(
BenchmarkConfig
,
generate_text_with_token_count
,
run_benchmark_main
,
run_generic_benchmark
,
)
# Configure logging
logging
.
basicConfig
(
level
=
logging
.
INFO
,
format
=
"%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
(
__name__
)
###############################################################################
# CONFIG
###############################################################################
# Create benchmark configuration
config
=
BenchmarkConfig
()
config
.
rps_values
=
[
500
]
config
.
duration_secs_values
=
[
60
]
config
.
num_unique_requests
=
100
config
.
distribution
=
"POISSON"
config
.
profile
=
False
config
.
freeze_gc
=
True
# Enable GC freeze functionality
# Profiler output directory - by default uses present working directory (pwd)
# Uncomment and customize the line below to override the default location:
# config.profiler_dir = "/sglang-oss-trace"
# HTTP Configuration
HTTP_URL
=
"http://localhost:30000/v1/embeddings"
# Embeddings API Config
EMBEDDINGS_MODEL_PATH
=
"/Qwen/Qwen3-Embedding-0.6B"
BATCH_SIZE
=
[
1
]
# Number of items per request (batch size)
# Configurable input token length
EMBEDDINGS_INPUT_TOKENS
=
500
# Default token length
# Load tokenizer once for embeddings text generation
print
(
"Loading tokenizer for embeddings input generation..."
)
embeddings_tokenizer
=
AutoTokenizer
.
from_pretrained
(
EMBEDDINGS_MODEL_PATH
)
# Generate input text with the specified token length using pre-loaded tokenizer
EMBEDDINGS_INPUT_TEXT
=
generate_text_with_token_count
(
EMBEDDINGS_MODEL_PATH
,
EMBEDDINGS_INPUT_TOKENS
,
config
.
special_replicated_token
,
tokenizer
=
embeddings_tokenizer
,
)
###############################################################################
# REQUEST GENERATION (in parallel)
###############################################################################
def
build_embeddings_request
(
index
:
int
,
item_count
:
int
)
->
tuple
:
"""Build a single embeddings request."""
try
:
# For embeddings, input can be a string or list of strings
if
item_count
==
1
:
input_data
=
EMBEDDINGS_INPUT_TEXT
else
:
input_data
=
[
EMBEDDINGS_INPUT_TEXT
for
_
in
range
(
item_count
)]
req
=
{
"input"
:
input_data
,
"model"
:
EMBEDDINGS_MODEL_PATH
,
}
return
(
index
,
req
)
except
Exception
as
e
:
logger
.
error
(
f
"Error building request
{
index
}
:
{
e
}
"
)
return
(
index
,
None
)
def
validate_embeddings_response
(
response_data
:
dict
)
->
bool
:
"""Validate embeddings API response."""
return
"data"
in
response_data
def
build_warmup_embeddings_request
()
->
dict
:
"""Build a warmup request for the embeddings API."""
return
{
"input"
:
EMBEDDINGS_INPUT_TEXT
,
"model"
:
EMBEDDINGS_MODEL_PATH
,
}
###############################################################################
# MAIN
###############################################################################
async
def
run_benchmark
(
rps
,
duration_secs
,
item_count
):
"""Run a single embeddings benchmark with the given RPS value."""
return
await
run_generic_benchmark
(
rps
=
rps
,
duration_secs
=
duration_secs
,
item_count
=
item_count
,
config
=
config
,
http_url
=
HTTP_URL
,
build_request_func
=
build_embeddings_request
,
response_validator
=
validate_embeddings_response
,
api_name
=
"EMBEDDINGS"
,
request_description
=
"embeddings requests"
,
)
async
def
main
():
additional_info
=
{
"Input text length"
:
f
"
{
EMBEDDINGS_INPUT_TOKENS
}
tokens"
,
"Input text preview"
:
(
EMBEDDINGS_INPUT_TEXT
[:
100
]
+
"..."
if
len
(
EMBEDDINGS_INPUT_TEXT
)
>
100
else
EMBEDDINGS_INPUT_TEXT
),
}
await
run_benchmark_main
(
config
,
run_benchmark
,
"EMBEDDINGS"
,
HTTP_URL
,
BATCH_SIZE
,
additional_info
,
build_warmup_embeddings_request
,
)
if
__name__
==
"__main__"
:
asyncio
.
run
(
main
())
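As the docstring above notes, the server must already be running at the configured HTTP_URL before the benchmark starts. A quick way to confirm that is to send one request to the same endpoint by hand. The snippet below is not part of this commit; it is a minimal sanity check that assumes a local SGLang server with the same embedding model loaded, and it uses the synchronous requests library rather than the benchmark's async client.

# Editorial sketch (not in the commit): one-off request to the /v1/embeddings endpoint.
import requests

payload = {
    "model": "/Qwen/Qwen3-Embedding-0.6B",  # same EMBEDDINGS_MODEL_PATH as above; adjust to your setup
    "input": "hello world",                 # a single string; a list of strings also works
}
resp = requests.post("http://localhost:30000/v1/embeddings", json=payload, timeout=30)
resp.raise_for_status()
data = resp.json()
# The benchmark's validator only checks that the "data" field is present.
print(len(data["data"]), "embedding(s) returned")

If this request succeeds, bench_embeddings.py should be able to drive the same endpoint at the configured RPS.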
benchmark/prefill_only/bench_score.py  (new file, 0 → 100644)
"""
SGLang Scoring Benchmark Script
This script benchmarks SGLang's scoring API performance using HTTP requests.
Current Features:
- HTTP-only implementation (open source compatible)
- Uses /v1/score API endpoint directly
- Single item scoring with batching support
- Configurable RPS, duration, and batch sizes
- Progress tracking and detailed metrics
- Poisson and constant request distributions
Usage:
- Update configuration variables at the top of the file
- Ensure SGLang server is running on the configured HTTP_URL
- Run: python bench_score.py
- Each request will contain ITEM_COUNT_VALUES items for batch scoring
"""
import
asyncio
from
transformers
import
AutoTokenizer
from
util
import
(
BenchmarkConfig
,
generate_text_with_token_count
,
run_benchmark_main
,
run_generic_benchmark
,
)
###############################################################################
# CONFIG
###############################################################################
# Create benchmark configuration
config
=
BenchmarkConfig
()
config
.
rps_values
=
[
160
]
config
.
duration_secs_values
=
[
60
]
config
.
num_unique_requests
=
100
config
.
distribution
=
"POISSON"
config
.
profile
=
False
config
.
freeze_gc
=
True
# Enable GC freeze functionality
# Profiler output directory - by default uses present working directory (pwd)
# Uncomment and customize the line below to override the default location:
# config.profiler_dir = "/sglang-oss-trace"
# HTTP Configuration
HTTP_URL
=
"http://localhost:30000/v1/score"
# Use score API directly
# Score API Config
# ITEM_COUNT_VALUES determines number of items per score request (batch size)
SCORE_QUERY_TOKENS
=
120
SCORE_ITEM_TOKENS
=
180
SCORE_MODEL_PATH
=
"Qwen/Qwen3-0.6B"
SCORE_LABEL_TOKEN_IDS
=
[
9454
,
2753
]
# Yes/No token IDs
ITEM_COUNT_VALUES
=
[
10
]
# Number of items per request
# Special token to replicate for precise token counting
SPECIAL_REPLICATED_TOKEN
=
"<|im_start|>"
###############################################################################
# REQUEST GENERATION (in parallel)
###############################################################################
def
create_score_request_builder
():
"""Create a score request builder function with shared tokenizer."""
# Load tokenizer once here to verify special token and get precise counts
print
(
"Loading tokenizer..."
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
SCORE_MODEL_PATH
)
# Verify that our special token produces exactly 1 token
special_token_count
=
len
(
tokenizer
.
encode
(
config
.
special_replicated_token
,
add_special_tokens
=
False
)
)
print
(
f
"Special token '
{
config
.
special_replicated_token
}
' produces "
f
"
{
special_token_count
}
token(s)"
)
def
generate_text_with_token_count_local
(
num_toks
):
"""Generate text with precise token count using replicated token."""
return
generate_text_with_token_count
(
SCORE_MODEL_PATH
,
num_toks
,
config
.
special_replicated_token
,
tokenizer
=
tokenizer
,
)
def
build_score_request
(
index
:
int
,
item_count
:
int
)
->
tuple
:
"""Build a single score request."""
try
:
# Generate query and items for score API
query
=
generate_text_with_token_count_local
(
SCORE_QUERY_TOKENS
)
items
=
[
generate_text_with_token_count_local
(
SCORE_ITEM_TOKENS
)
for
_
in
range
(
item_count
)
]
# Return as dict for score API format
score_data
=
{
"query"
:
query
,
"items"
:
items
,
"label_token_ids"
:
SCORE_LABEL_TOKEN_IDS
,
"model"
:
SCORE_MODEL_PATH
,
}
return
(
index
,
score_data
)
except
Exception
as
e
:
print
(
f
"Error building request
{
index
}
:
{
e
}
"
)
return
(
index
,
None
)
return
build_score_request
def
validate_score_response
(
response_data
:
dict
)
->
bool
:
"""Validate score API response."""
return
"scores"
in
response_data
or
"logprobs"
in
response_data
def
build_warmup_score_request
()
->
dict
:
"""Build a warmup request for the score API."""
# Load tokenizer once for warmup generation
tokenizer
=
AutoTokenizer
.
from_pretrained
(
SCORE_MODEL_PATH
)
warmup_query
=
generate_text_with_token_count
(
SCORE_MODEL_PATH
,
SCORE_QUERY_TOKENS
,
config
.
special_replicated_token
,
tokenizer
=
tokenizer
,
)
warmup_items
=
[
generate_text_with_token_count
(
SCORE_MODEL_PATH
,
SCORE_ITEM_TOKENS
,
config
.
special_replicated_token
,
tokenizer
=
tokenizer
,
)
for
_
in
range
(
3
)
]
return
{
"query"
:
warmup_query
,
"items"
:
warmup_items
,
"label_token_ids"
:
SCORE_LABEL_TOKEN_IDS
,
"model"
:
SCORE_MODEL_PATH
,
# Add missing parameters for consistency with the original warmup
"apply_softmax"
:
True
,
"item_first"
:
False
,
}
###############################################################################
# MAIN
###############################################################################
async
def
run_benchmark
(
rps
,
duration_secs
,
item_count
):
"""Run a single benchmark with the given RPS value."""
# Create the request builder function with shared tokenizer
build_request_func
=
create_score_request_builder
()
return
await
run_generic_benchmark
(
rps
=
rps
,
duration_secs
=
duration_secs
,
item_count
=
item_count
,
config
=
config
,
http_url
=
HTTP_URL
,
build_request_func
=
build_request_func
,
response_validator
=
validate_score_response
,
api_name
=
"SINGLE_ITEM_SCORING"
,
request_description
=
"score requests"
,
)
async
def
main
():
"""Main function that runs benchmarks for all RPS values."""
additional_info
=
{
"Query tokens per request"
:
SCORE_QUERY_TOKENS
,
"Item tokens per item"
:
SCORE_ITEM_TOKENS
,
}
await
run_benchmark_main
(
config
,
run_benchmark
,
"SINGLE_ITEM_SCORING"
,
HTTP_URL
,
ITEM_COUNT_VALUES
,
additional_info
,
build_warmup_score_request
,
)
if
__name__
==
"__main__"
:
asyncio
.
run
(
main
())
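Similarly, a single hand-built request against the /v1/score endpoint can confirm the server and model are ready before running bench_score.py. The sketch below is not part of this commit; its payload mirrors build_warmup_score_request() above, and since the response schema is not shown in this diff, it only checks the same fields the benchmark's validator checks.

# Editorial sketch (not in the commit): one-off request to the /v1/score endpoint.
import requests

payload = {
    "query": "Is the sky blue?",
    "items": ["Yes, on a clear day.", "The sky is green."],
    "label_token_ids": [9454, 2753],   # same Yes/No token IDs as SCORE_LABEL_TOKEN_IDS
    "model": "Qwen/Qwen3-0.6B",
    "apply_softmax": True,
    "item_first": False,
}
resp = requests.post("http://localhost:30000/v1/score", json=payload, timeout=30)
resp.raise_for_status()
data = resp.json()
# validate_score_response() accepts either a "scores" or a "logprobs" field.
print("valid response:", "scores" in data or "logprobs" in data)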
benchmark/score/bench_score.py → benchmark/prefill_only/util.py  (renamed; this diff is collapsed in this view)
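The moved util.py carries the shared benchmark machinery both scripts import (BenchmarkConfig, generate_text_with_token_count, run_generic_benchmark, run_benchmark_main), but its 813 lines are collapsed here. The sketch below is purely illustrative and assumed, not the actual contents of benchmark/prefill_only/util.py; it shows one plausible reading of how a token-exact input generator and the POISSON/constant request spacing mentioned in both scripts could work.

# Editorial sketch (assumed, not the real util.py implementation).
import random


def generate_text_with_token_count(model_path, num_toks, special_token, tokenizer=None):
    # Assumed behavior: repeat a marker that encodes to exactly one token,
    # so the resulting string has a precise token count under the given tokenizer.
    if tokenizer is not None:
        assert len(tokenizer.encode(special_token, add_special_tokens=False)) == 1
    return special_token * num_toks


def inter_arrival_times(rps, duration_secs, distribution="POISSON"):
    # Assumed behavior: yield gaps (in seconds) between requests so that the
    # average rate matches `rps` over `duration_secs`.
    total = int(rps * duration_secs)
    for _ in range(total):
        if distribution == "POISSON":
            yield random.expovariate(rps)  # exponential gaps give Poisson arrivals
        else:
            yield 1.0 / rps                # constant spacing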