Unverified commit a1d03892, authored by Sundara Raman Ramachandran and committed by GitHub

[Benchmark] Prefill-only benchmark scripts (#10240)

parent dccf52f9
"""
SGLang Embeddings Benchmark Script
This script benchmarks SGLang's /v1/embeddings API performance using HTTP requests.
Features:
- HTTP-only implementation
- Uses /v1/embeddings API endpoint directly
- Configurable RPS, duration, and batch sizes
- Progress tracking and detailed metrics
- Poisson and constant request distributions
Usage:
- Update configuration variables at the top of the file
- Ensure SGLang server is running on the configured HTTP_URL
- Run: python bench_embeddings.py
"""
import asyncio
import logging
from transformers import AutoTokenizer
from util import (
BenchmarkConfig,
generate_text_with_token_count,
run_benchmark_main,
run_generic_benchmark,
)
# Configure logging
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)
###############################################################################
# CONFIG
###############################################################################
# Create benchmark configuration
config = BenchmarkConfig()
config.rps_values = [500]
config.duration_secs_values = [60]
config.num_unique_requests = 100
config.distribution = "POISSON"
config.profile = False
config.freeze_gc = True # Enable GC freeze functionality
# Profiler output directory - defaults to the current working directory (pwd)
# Uncomment and customize the line below to override the default location:
# config.profiler_dir = "/sglang-oss-trace"
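# NOTE (illustrative only, not used by the benchmark itself): "POISSON" is
# assumed to mean exponentially distributed inter-arrival gaps with mean 1/rps,
# while the constant mode presumably sends one request every 1/rps seconds.
# A minimal sketch of a Poisson send-time schedule, under that assumption:
def _example_poisson_schedule(rps: float, duration_secs: float) -> list:
    """Return request send offsets (seconds) for a Poisson arrival process."""
    import random

    t, send_times = 0.0, []
    while t < duration_secs:
        t += random.expovariate(rps)  # exponential gap with mean 1/rps seconds
        send_times.append(t)
    return send_times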
# HTTP Configuration
HTTP_URL = "http://localhost:30000/v1/embeddings"
# Embeddings API Config
EMBEDDINGS_MODEL_PATH = "Qwen/Qwen3-Embedding-0.6B"
BATCH_SIZE = [1] # Number of items per request (batch size)
# Configurable input token length
EMBEDDINGS_INPUT_TOKENS = 500 # Default token length
# Load tokenizer once for embeddings text generation
print("Loading tokenizer for embeddings input generation...")
embeddings_tokenizer = AutoTokenizer.from_pretrained(EMBEDDINGS_MODEL_PATH)
# Generate input text with the specified token length using pre-loaded tokenizer
EMBEDDINGS_INPUT_TEXT = generate_text_with_token_count(
EMBEDDINGS_MODEL_PATH,
EMBEDDINGS_INPUT_TOKENS,
config.special_replicated_token,
tokenizer=embeddings_tokenizer,
)
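# Sanity check (assumption: the helper replicates a single token, so the count
# below should come out at or very close to EMBEDDINGS_INPUT_TOKENS):
#   len(embeddings_tokenizer.encode(EMBEDDINGS_INPUT_TEXT, add_special_tokens=False))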
###############################################################################
# REQUEST GENERATION (in parallel)
###############################################################################
def build_embeddings_request(index: int, item_count: int) -> tuple:
"""Build a single embeddings request."""
try:
# For embeddings, input can be a string or list of strings
if item_count == 1:
input_data = EMBEDDINGS_INPUT_TEXT
else:
input_data = [EMBEDDINGS_INPUT_TEXT for _ in range(item_count)]
req = {
"input": input_data,
"model": EMBEDDINGS_MODEL_PATH,
}
return (index, req)
except Exception as e:
logger.error(f"Error building request {index}: {e}")
return (index, None)
def validate_embeddings_response(response_data: dict) -> bool:
"""Validate embeddings API response."""
return "data" in response_data
def build_warmup_embeddings_request() -> dict:
"""Build a warmup request for the embeddings API."""
return {
"input": EMBEDDINGS_INPUT_TEXT,
"model": EMBEDDINGS_MODEL_PATH,
}
###############################################################################
# MAIN
###############################################################################
async def run_benchmark(rps, duration_secs, item_count):
"""Run a single embeddings benchmark with the given RPS value."""
return await run_generic_benchmark(
rps=rps,
duration_secs=duration_secs,
item_count=item_count,
config=config,
http_url=HTTP_URL,
build_request_func=build_embeddings_request,
response_validator=validate_embeddings_response,
api_name="EMBEDDINGS",
request_description="embeddings requests",
)
async def main():
additional_info = {
"Input text length": f"{EMBEDDINGS_INPUT_TOKENS} tokens",
"Input text preview": (
EMBEDDINGS_INPUT_TEXT[:100] + "..."
if len(EMBEDDINGS_INPUT_TEXT) > 100
else EMBEDDINGS_INPUT_TEXT
),
}
await run_benchmark_main(
config,
run_benchmark,
"EMBEDDINGS",
HTTP_URL,
BATCH_SIZE,
additional_info,
build_warmup_embeddings_request,
)
if __name__ == "__main__":
asyncio.run(main())
"""
SGLang Scoring Benchmark Script
This script benchmarks SGLang's scoring API performance using HTTP requests.
Current Features:
- HTTP-only implementation (open source compatible)
- Uses /v1/score API endpoint directly
- Single item scoring with batching support
- Configurable RPS, duration, and batch sizes
- Progress tracking and detailed metrics
- Poisson and constant request distributions
Usage:
- Update configuration variables at the top of the file
- Ensure SGLang server is running on the configured HTTP_URL
- Run: python bench_score.py
- Each request will contain ITEM_COUNT_VALUES items for batch scoring
"""
import asyncio
from transformers import AutoTokenizer
from util import (
BenchmarkConfig,
generate_text_with_token_count,
run_benchmark_main,
run_generic_benchmark,
)
###############################################################################
# CONFIG
###############################################################################
# Create benchmark configuration
config = BenchmarkConfig()
config.rps_values = [160]
config.duration_secs_values = [60]
config.num_unique_requests = 100
config.distribution = "POISSON"
config.profile = False
config.freeze_gc = True # Enable GC freeze functionality
# Profiler output directory - defaults to the current working directory (pwd)
# Uncomment and customize the line below to override the default location:
# config.profiler_dir = "/sglang-oss-trace"
# HTTP Configuration
HTTP_URL = "http://localhost:30000/v1/score" # Use score API directly
# Score API Config
# ITEM_COUNT_VALUES determines number of items per score request (batch size)
SCORE_QUERY_TOKENS = 120
SCORE_ITEM_TOKENS = 180
SCORE_MODEL_PATH = "Qwen/Qwen3-0.6B"
SCORE_LABEL_TOKEN_IDS = [9454, 2753] # Yes/No token IDs
ITEM_COUNT_VALUES = [10] # Number of items per request
# Special token to replicate for precise token counting
SPECIAL_REPLICATED_TOKEN = "<|im_start|>"
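# NOTE (illustrative): the label token IDs above are expected to correspond to
# the "Yes"/"No" tokens of the Qwen3 tokenizer, and the replicated token should
# encode to exactly one token so that repeating it N times yields an N-token
# input. Both can be double-checked once a tokenizer is loaded:
#   tokenizer = AutoTokenizer.from_pretrained(SCORE_MODEL_PATH)
#   print(tokenizer.convert_ids_to_tokens(SCORE_LABEL_TOKEN_IDS))
#   print(len(tokenizer.encode(SPECIAL_REPLICATED_TOKEN, add_special_tokens=False)))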
###############################################################################
# REQUEST GENERATION (in parallel)
###############################################################################
def create_score_request_builder():
"""Create a score request builder function with shared tokenizer."""
# Load tokenizer once here to verify special token and get precise counts
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(SCORE_MODEL_PATH)
# Verify that our special token produces exactly 1 token
special_token_count = len(
tokenizer.encode(config.special_replicated_token, add_special_tokens=False)
)
print(
f"Special token '{config.special_replicated_token}' produces "
f"{special_token_count} token(s)"
)
def generate_text_with_token_count_local(num_toks):
"""Generate text with precise token count using replicated token."""
return generate_text_with_token_count(
SCORE_MODEL_PATH,
num_toks,
config.special_replicated_token,
tokenizer=tokenizer,
)
def build_score_request(index: int, item_count: int) -> tuple:
"""Build a single score request."""
try:
# Generate query and items for score API
query = generate_text_with_token_count_local(SCORE_QUERY_TOKENS)
items = [
generate_text_with_token_count_local(SCORE_ITEM_TOKENS)
for _ in range(item_count)
]
# Return as dict for score API format
score_data = {
"query": query,
"items": items,
"label_token_ids": SCORE_LABEL_TOKEN_IDS,
"model": SCORE_MODEL_PATH,
}
return (index, score_data)
except Exception as e:
print(f"Error building request {index}: {e}")
return (index, None)
return build_score_request
def validate_score_response(response_data: dict) -> bool:
"""Validate score API response."""
return "scores" in response_data or "logprobs" in response_data
def build_warmup_score_request() -> dict:
"""Build a warmup request for the score API."""
# Load tokenizer once for warmup generation
tokenizer = AutoTokenizer.from_pretrained(SCORE_MODEL_PATH)
warmup_query = generate_text_with_token_count(
SCORE_MODEL_PATH,
SCORE_QUERY_TOKENS,
config.special_replicated_token,
tokenizer=tokenizer,
)
warmup_items = [
generate_text_with_token_count(
SCORE_MODEL_PATH,
SCORE_ITEM_TOKENS,
config.special_replicated_token,
tokenizer=tokenizer,
)
for _ in range(3)
]
return {
"query": warmup_query,
"items": warmup_items,
"label_token_ids": SCORE_LABEL_TOKEN_IDS,
"model": SCORE_MODEL_PATH,
# Add missing parameters for consistency with the original warmup
"apply_softmax": True,
"item_first": False,
}
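# The extra flags above are passed through as-is: "apply_softmax" presumably
# normalizes the label-token logits into probabilities, and "item_first"
# presumably controls whether the item text precedes the query when the prompt
# is assembled; both readings are assumptions based on the parameter names.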
###############################################################################
# MAIN
###############################################################################
async def run_benchmark(rps, duration_secs, item_count):
"""Run a single benchmark with the given RPS value."""
# Create the request builder function with shared tokenizer
build_request_func = create_score_request_builder()
return await run_generic_benchmark(
rps=rps,
duration_secs=duration_secs,
item_count=item_count,
config=config,
http_url=HTTP_URL,
build_request_func=build_request_func,
response_validator=validate_score_response,
api_name="SINGLE_ITEM_SCORING",
request_description="score requests",
)
async def main():
"""Main function that runs benchmarks for all RPS values."""
additional_info = {
"Query tokens per request": SCORE_QUERY_TOKENS,
"Item tokens per item": SCORE_ITEM_TOKENS,
}
await run_benchmark_main(
config,
run_benchmark,
"SINGLE_ITEM_SCORING",
HTTP_URL,
ITEM_COUNT_VALUES,
additional_info,
build_warmup_score_request,
)
if __name__ == "__main__":
asyncio.run(main())