Commit 909abb58 authored by maxiao

adapt to sglang v0.5.2rc1 on dcu
import concurrent.futures
import os
import random
import time
from concurrent.futures import ProcessPoolExecutor
from statistics import mean
import requests
from tqdm import tqdm
from transformers import AutoTokenizer
from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
###############################################################################
# CONFIG
###############################################################################
ENDPOINT_URL = "http://127.0.0.1:30000"
TOKENIZER_DIR = "/models/meta-llama/Llama-3.2-3B"
# Benchmark configurations
NUM_REQUESTS = 10 # Total number of requests (each with BATCH_SIZE prompts)
NUM_TOKENS = 32000 # Tokens per prompt
BATCH_SIZE = 8 # Number of prompts per request
GEN_TOKENS = 0 # Tokens to generate per prompt
###############################################################################
# REQUEST GENERATION (in parallel)
###############################################################################
def generate_random_prompt(index, tokenizer_dir, num_tokens):
"""Generate a single random prompt with specified token count."""
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
vocab_size = tokenizer.vocab_size
def generate_random_text(num_toks):
random_token_ids = [random.randint(0, vocab_size - 1) for _ in range(num_toks)]
return tokenizer.decode(random_token_ids, clean_up_tokenization_spaces=True)
random_text = generate_random_text(num_tokens)
return f"Prompt {index}: {random_text}"
def prepare_all_prompts(num_requests, batch_size, num_tokens, tokenizer_dir):
"""Generate prompts for all requests in parallel."""
total_prompts = num_requests * batch_size
all_prompts = [None] * total_prompts
max_workers = min(os.cpu_count() or 1, total_prompts)
with ProcessPoolExecutor(max_workers=max_workers) as executor:
futures = [
executor.submit(generate_random_prompt, i, tokenizer_dir, num_tokens)
for i in range(total_prompts)
]
for future in tqdm(
concurrent.futures.as_completed(futures),
total=total_prompts,
desc="Generating prompts",
):
index = futures.index(future)
all_prompts[index] = future.result()
batched_prompts = [
all_prompts[i * batch_size : (i + 1) * batch_size] for i in range(num_requests)
]
print(
f"Generated {total_prompts} prompts with {num_tokens} tokens each, grouped into {num_requests} requests of {batch_size} prompts.\n"
)
return batched_prompts
###############################################################################
# HTTP CALLS
###############################################################################
def send_batch_request(endpoint, prompts, gen_tokens, request_id):
"""Send a batch of prompts to the /generate endpoint synchronously."""
sampling_params = {
"max_new_tokens": gen_tokens,
"temperature": 0.7,
"stop": "\n",
}
data = {"text": prompts, "sampling_params": sampling_params}
start_time = time.perf_counter()
try:
response = requests.post(
endpoint.base_url + "/generate", json=data, timeout=3600
)
if response.status_code != 200:
error = response.json()
raise RuntimeError(f"Request {request_id} failed: {error}")
result = response.json()
elapsed_time = (time.perf_counter() - start_time) * 1000 # Convert to ms
avg_per_prompt = elapsed_time / len(prompts) if prompts else 0
return request_id, elapsed_time, avg_per_prompt, True, len(prompts)
except Exception as e:
print(f"[Request] Error for request {request_id}: {e}")
return request_id, 0, 0, False, len(prompts)
def run_benchmark(endpoint, batched_prompts, batch_size, gen_tokens):
"""Run the benchmark sequentially."""
results = []
num_requests = len(batched_prompts)
# Record start time for total latency
benchmark_start_time = time.perf_counter()
for i, batch_prompts in enumerate(batched_prompts):
request_id = i + 1
assert (
len(batch_prompts) == batch_size
), f"Request {request_id} should have {batch_size} prompts, got {len(batch_prompts)}"
print(
f"[Request] Sending request {request_id}/{num_requests} with {len(batch_prompts)} prompts at {int(time.time()*1000)}"
)
result = send_batch_request(endpoint, batch_prompts, gen_tokens, request_id)
results.append(result)
# Calculate total latency
total_latency = (time.perf_counter() - benchmark_start_time) * 1000 # Convert to ms
return results, total_latency
###############################################################################
# RESULTS
###############################################################################
def process_results(results, total_latency, num_requests):
"""Process and display benchmark results."""
total_time = 0
successful_requests = 0
failed_requests = 0
request_latencies = []
per_prompt_latencies = []
total_prompts = 0
for request_id, elapsed_time, avg_per_prompt, success, batch_size in results:
if success:
successful_requests += 1
total_prompts += batch_size
request_latencies.append(elapsed_time)
per_prompt_latencies.append(avg_per_prompt)
total_time += elapsed_time / 1000 # Convert to seconds
else:
failed_requests += 1
avg_request_latency = mean(request_latencies) if request_latencies else 0
avg_per_prompt_latency = mean(per_prompt_latencies) if per_prompt_latencies else 0
throughput = total_prompts / total_time if total_time > 0 else 0
print("\nBenchmark Summary:")
print(f" Total requests sent: {len(results)}")
print(f" Total prompts sent: {total_prompts}")
print(f" Successful requests: {successful_requests}")
print(f" Failed requests: {failed_requests}")
print(f" Total latency (all requests): {total_latency:.2f} ms")
print(f" Avg per request latency: {avg_request_latency:.2f} ms")
print(f" Avg per prompt latency: {avg_per_prompt_latency:.2f} ms")
print(f" Throughput: {throughput:.2f} prompts/second\n")
###############################################################################
# MAIN
###############################################################################
def main():
# Initialize endpoint
endpoint = RuntimeEndpoint(ENDPOINT_URL)
# Generate prompts
batched_prompts = prepare_all_prompts(
NUM_REQUESTS, BATCH_SIZE, NUM_TOKENS, TOKENIZER_DIR
)
# Flush cache before benchmark
# endpoint.flush_cache()
# Run benchmark
print(
f"Starting benchmark: NUM_TOKENS={NUM_TOKENS}, BATCH_SIZE={BATCH_SIZE}, NUM_REQUESTS={NUM_REQUESTS}\n"
)
results, total_latency = run_benchmark(
endpoint, batched_prompts, BATCH_SIZE, GEN_TOKENS
)
# Process and display results
process_results(results, total_latency, NUM_REQUESTS)
if __name__ == "__main__":
random.seed(0)
main()
import random
import time
from statistics import mean
from transformers import AutoTokenizer
# CONFIG
TOKENIZER_DIR = (
"/shared/public/sharing/fait360brew/training/models/meta-llama/Llama-3.2-3B"
)
NUM_TOKENS = 20000 # Each prompt should contain this many tokens
BATCH_SIZES = [1, 2, 4, 8] # Test different batch sizes
NUM_RUNS = 5 # Number of runs for each batch size to get reliable measurements
def generate_random_prompts(num_prompts, num_tokens, tokenizer):
"""Generate random prompts with specified token count."""
vocab_size = tokenizer.vocab_size
all_prompts = []
print(f"Generating {num_prompts} random prompts with {num_tokens} tokens each...")
for i in range(num_prompts):
# Generate random token IDs - this directly gives us the exact token count
random_token_ids = [
random.randint(0, vocab_size - 1) for _ in range(num_tokens)
]
random_text = tokenizer.decode(
random_token_ids, clean_up_tokenization_spaces=True
)
prompt = f"Prompt {i}: {random_text}"
tokens = tokenizer.encode(prompt)
print(f" Prompt {i}: {len(tokens)} tokens")
all_prompts.append(prompt)
return all_prompts
def benchmark_sequential_vs_batch(prompts, batch_size, tokenizer):
"""Compare sequential vs batch tokenization for a given batch size."""
# Sequential tokenization using encode()
sequential_times = []
for run in range(NUM_RUNS):
batch_prompts = prompts[:batch_size] # Use same prompts for fair comparison
start_time = time.perf_counter()
for prompt in batch_prompts:
tokens = tokenizer.encode(prompt)
sequential_time = (time.perf_counter() - start_time) * 1000
sequential_times.append(sequential_time)
# Batch tokenization using tokenizer()
batch_times = []
for run in range(NUM_RUNS):
batch_prompts = prompts[:batch_size] # Use same prompts for fair comparison
start_time = time.perf_counter()
tokens = tokenizer(batch_prompts)
batch_time = (time.perf_counter() - start_time) * 1000
batch_times.append(batch_time)
return {
"batch_size": batch_size,
"avg_sequential_ms": mean(sequential_times),
"avg_batch_ms": mean(batch_times),
"speedup_factor": (
mean(sequential_times) / mean(batch_times) if mean(batch_times) > 0 else 0
),
"sequential_runs": sequential_times,
"batch_runs": batch_times,
}
def main():
print("Tokenizer Benchmark: Sequential vs Batch Processing")
print("-" * 60)
print(f"Tokenizer: {TOKENIZER_DIR}")
print(f"Tokens per prompt: {NUM_TOKENS}")
print(f"Number of runs per batch size: {NUM_RUNS}")
print("-" * 60)
# Load tokenizer once for all operations
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)
# The largest batch size determines how many prompts we need
max_batch_size = max(BATCH_SIZES)
all_prompts = generate_random_prompts(max_batch_size, NUM_TOKENS, tokenizer)
results = []
print("\nRunning benchmark...")
for batch_size in BATCH_SIZES:
print(f"\nBenchmarking batch size: {batch_size}")
result = benchmark_sequential_vs_batch(all_prompts, batch_size, tokenizer)
results.append(result)
print(f" Sequential tokenization (encode):")
for i, run_time in enumerate(result["sequential_runs"]):
print(f" Run {i+1}: {run_time:.2f} ms")
print(f" Average: {result['avg_sequential_ms']:.2f} ms")
print(f" Batch tokenization (tokenizer):")
for i, run_time in enumerate(result["batch_runs"]):
print(f" Run {i+1}: {run_time:.2f} ms")
print(f" Average: {result['avg_batch_ms']:.2f} ms")
print(f" Speedup factor: {result['speedup_factor']:.2f}x")
print("\n" + "=" * 60)
print("SUMMARY OF RESULTS")
print("=" * 60)
print(
f"{'Batch Size':<10} {'Sequential (ms)':<18} {'Batch (ms)':<18} {'Speedup':<10}"
)
print("-" * 60)
for result in results:
print(
f"{result['batch_size']:<10} {result['avg_sequential_ms']:.2f} ms{' ' * 8} {result['avg_batch_ms']:.2f} ms{' ' * 8} {result['speedup_factor']:.2f}x"
)
if __name__ == "__main__":
random.seed(0)
main()
## How to reproduce the benchmark results for SGLang v0.3.0 compared to vLLM v0.6.0
In short, with multi-step scheduling enabled, in the online scenarios we benchmarked, vLLM's median TTFT is about **3 times** that of SGLang and its median ITL is about **10 times** that of SGLang (for example, at RPS 4 on the 8B model below: 100.48 ms / 31.98 ms ≈ 3.1x for TTFT and 129.32 ms / 11.93 ms ≈ 10.8x for ITL). Lower median TTFT and ITL are better. vLLM's multi-step optimization did not improve throughput while keeping median TTFT and ITL low. Also, under the maximum-throughput benchmark, if vLLM uses its default configuration instead of setting GPU memory utilization to 0.95, its maximum throughput is **lower** than that of SGLang.
## Online benchmark results
### Llama 3.1 8B Instruct 1 x A100 80G
| RPS | Num Prompts | Engine | Median E2E Latency (ms) | Median TTFT (ms) | Median TPOT (ms) | Median ITL (ms) |
|------|-------------|--------|--------------------|-------------|-------------|------------|
| 4 | 1200 | SGLang | 1564.17 | **31.98** | 13.17 | **11.93** |
| 4 | 1200 | vLLM | 1691.97 | **100.48** | 14.14 | **129.32** |
| 8 | 2400 | SGLang | 2175.02 | **35.68** | 17.85 | **14.41** |
| 8 | 2400 | vLLM | 2137.16 | **120.39** | 17.09 | **158.63** |
### Llama 3.1 70B Instruct 4 x H100 80G
| RPS | Num Prompts | Engine | Median E2E Latency (ms) | Median TTFT (ms) | Median TPOT (ms) | Median ITL (ms) |
|------|-------------|--------|--------------------|-------------|-------------|------------|
| 4 | 1200 | SGLang | 3005.24 | **53.94** | 25.03 | **21.67** |
| 4 | 1200 | vLLM | 2915.60 | **179.15** | 23.58 | **231.23** |
| 8 | 2400 | SGLang | 4064.98 | **58.11** | 33.07 | **24.45** |
| 8 | 2400 | vLLM | 3752.38 | **207.12** | 29.15 | **275.32** |
## Offline benchmark results
### Llama 3.1 8B Instruct 1 x A100 80G
| RPS | Num Prompts | Engine | Request throughput (req/s) | Output token throughput (tok/s) |
|------|-------------|--------|--------------------|-------------------------|
| inf | 5000 | SGLang | 22.03 | **4281.51** |
| inf | 5000 | vLLM | 21.27 | **4132.37** |
### Llama 3.1 70B Instruct 4 x H100 80G
| RPS | Num Prompts | Engine | Request throughput (req/s) | Output token throughput (tok/s) |
|------|-------------|--------|--------------------|-------------------------|
| inf | 5000 | SGLang | 19.84 | **3856.01** |
| inf | 5000 | vLLM | 19.04 | **3700.64** |
## Installation
```bash
# install sglang v0.3.0
pip install --upgrade pip
pip install "sglang[all]"==0.3.0
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
# install vllm v0.6.0
pip install vllm==0.6.0
```
## Notes
We followed the reproduction method in https://github.com/vllm-project/vllm/issues/8176 and added the `--num-scheduler-steps 10` parameter when starting the vLLM server. vLLM's `gpu_memory_utilization` defaults to 0.9 at both TP 1 and TP 4, while SGLang's `mem_frac` defaults to 0.88 at TP 1 and 0.85 at TP 4, so we manually set SGLang's `mem_frac` to 0.88 at TP 4.
## Online benchmarks
```bash
# Llama 3.1 8B Instruct on 1 x A100
python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache
python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct --disable-log-requests --num-scheduler-steps 10 --max_model_len 4096
# Llama 3.1 70B Instruct on 4 x H100
python -m sglang.launch_server --model-path meta-llama/Llama-3.1-70B-Instruct --disable-radix-cache --tp 4
python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-70B-Instruct --disable-log-requests --num-scheduler-steps 10 --tensor 4 --max_model_len 4096
# bench serving
python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 1200 --request-rate 4
python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 2400 --request-rate 8
python3 -m sglang.bench_serving --backend vllm --dataset-name sharegpt --num-prompts 1200 --request-rate 4
python3 -m sglang.bench_serving --backend vllm --dataset-name sharegpt --num-prompts 2400 --request-rate 8
```
## Offline benchmarks
```bash
# Llama 3.1 8B Instruct on 1 x A100
python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache
python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct --disable-log-requests --num-scheduler-steps 10 --max_model_len 4096
# Llama 3.1 70B Instruct on 4 x H100
python -m sglang.launch_server --model-path meta-llama/Llama-3.1-70B-Instruct --disable-radix-cache --tp 4 --mem-frac 0.88
python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-70B-Instruct --disable-log-requests --num-scheduler-steps 10 --tensor 4 --max_model_len 4096
# bench serving
python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 5000
python3 -m sglang.bench_serving --backend vllm --dataset-name sharegpt --num-prompts 5000
```
# Create dummy weights:
# 1. Create a folder `~/llama-3.1-405b-fp8-dummy` and create `config.json` and tokenizer under this folder.
# 2. Get `config.json` from ./config.md
# 3. Download the tokenizer
# wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer.json
# wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer_config.json
# Launch sglang
# python -m sglang.launch_server --model-path ~/llama-3.1-405b-fp8-dummy/ --load-format dummy --tp 8 --quantization fp8 --disable-radix --mem-frac 0.87
# offline
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 3000 --random-input 1024 --random-output 1024 > sglang_log11
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 4000 --random-input 1024 --random-output 512 > sglang_log12
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 800 --random-input 4096 --random-output 2048 > sglang_log13
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 1500 --random-input 4096 --random-output 1024 > sglang_log14
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 6000 --random-input 256 --random-output 512 > sglang_log15
python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompt 2000 > sglang_log21
# online
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 300 --request-rate 1 --random-input 1024 --random-output 1024 > sglang_log31
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 > sglang_log32
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 > sglang_log33
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 > sglang_log34
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 --random-output 1024 > sglang_log35
# Launch trtllm
# https://github.com/sgl-project/tensorrt-demo
# offline
python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 3000 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log11
python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 4000 --random-input 1024 --random-output 512 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log12
python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 800 --random-input 4096 --random-output 2048 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log13
python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 1500 --random-input 4096 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log14
python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 6000 --random-input 256 --random-output 512 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log15
python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name sharegpt --num-prompt 2000 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log21
# online
python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 300 --request-rate 1 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log31
python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log32
python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log33
python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log34
python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log35
# Create dummy weights:
# 1. Create a folder `~/llama-3.1-405b-fp8-dummy` and create `config.json` and tokenizer under this folder.
# 2. Get `config.json` from ./config.md
# 3. Download the tokenizer
# wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer.json
# wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer_config.json
# Launch vllm
# python3 -m vllm.entrypoints.openai.api_server --model ~/llama-3.1-405b-fp8-dummy/ --load-format dummy --disable-log-requests --tensor-parallel-size 8 --max-model-len 10000
# offline
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 3000 --random-input 1024 --random-output 1024 > vllm_log11
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 4000 --random-input 1024 --random-output 512 > vllm_log12
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 800 --random-input 4096 --random-output 2048 > vllm_log13
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 1500 --random-input 4096 --random-output 1024 > vllm_log14
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 6000 --random-input 256 --random-output 512 > vllm_log15
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name sharegpt --num-prompt 2000 > vllm_log21
# online
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 300 --request-rate 1 --random-input 1024 --random-output 1024 > vllm_log31
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 > vllm_log32
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 > vllm_log33
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 > vllm_log34
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 --random-output 1024 > vllm_log35
# How to reproduce the benchmark results of SGLang
## Prerequisite
### Install the latest SGLang
```bash
git clone https://github.com/sgl-project/sglang.git
cd sglang
git checkout v0.2.7
pip install --upgrade pip
pip install -e "python[all]"
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
```
### Set up ulimit and HF_TOKEN
```bash
ulimit -n 65535
# Change the token to a real and usable one, with access permissions for the Llama 3 models.
export HF_TOKEN=hf_token
```
### Launch the server
```bash
# Meta-Llama-3.1-8B-Instruct
python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache
# Meta-Llama-3.1-70B-Instruct
python -m sglang.launch_server --model-path meta-llama/Llama-3.1-70B-Instruct --disable-radix-cache --tp 8
# Meta-Llama-3-70B-Instruct-FP8
python -m sglang.launch_server --model-path neuralmagic/Meta-Llama-3-70B-Instruct-FP8 --disable-radix-cache --tp 8
```
## Benchmark
### Hardware Requirements
- 8B models: Single NVIDIA A100 80GB GPU
- 70B models: 8 x NVIDIA A100 80GB GPUs with Tensor Parallelism (TP) 8
- 70B FP8 models: 8 x NVIDIA H100 GPUs with Tensor Parallelism (TP) 8
Please ensure you have the appropriate hardware before running the benchmarks.
#### Offline benchmark
```bash
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 4000 --random-input 1024 --random-output 1024 --output-file offline.jsonl
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 5000 --random-input 1024 --random-output 512 --output-file offline.jsonl
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 1000 --random-input 4096 --random-output 2048 --output-file offline.jsonl
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 2000 --random-input 4096 --random-output 1024 --output-file offline.jsonl
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 6000 --random-input 256 --random-output 512 --output-file offline.jsonl
python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 3000 --output-file offline.jsonl
cat offline.jsonl | cut -d':' -f12 | cut -d',' -f1
```
#### Online benchmark
```bash
python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 300 --request-rate 1 --output-file online.jsonl
python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 600 --request-rate 2 --output-file online.jsonl
python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 1200 --request-rate 4 --output-file online.jsonl
python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 2400 --request-rate 8 --output-file online.jsonl
python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 3200 --request-rate 16 --output-file online.jsonl
cat online.jsonl | cut -d':' -f9 | cut -d',' -f1
```
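The `cut` pipelines above depend on the exact position of fields inside each JSON line, which can change between `bench_serving` versions. A more robust alternative is to parse the records as JSON. The sketch below is a minimal example; the key names listed in `KEYS` are assumptions (check what your `bench_serving` version actually writes), and keys missing from a record are simply skipped.

```python
import json
import sys

# Assumed metric keys; adjust them to match the fields your bench_serving
# version writes into offline.jsonl / online.jsonl.
KEYS = ["backend", "request_throughput", "output_throughput", "median_ttft_ms", "median_itl_ms"]


def summarize(path):
    """Print the selected metrics from every JSON record in a .jsonl file."""
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            record = json.loads(line)
            # Only report keys that are actually present in this record.
            print({k: record[k] for k in KEYS if k in record})


if __name__ == "__main__":
    summarize(sys.argv[1] if len(sys.argv) > 1 else "online.jsonl")
```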
## Other
We tried vLLM 0.5.3.post1, but it often crashes under high load and showed similar or worse performance than vLLM 0.5.2 in our partial benchmarks, so we use the older version, vLLM 0.5.2.
For TensorRT-LLM, preparation steps are described at https://github.com/sgl-project/tensorrt-demo. Specifically, we used a batch size of 512, a max input length of 8192, and a max number of tokens of 8192. The instance count for preprocessing and postprocessing in Triton Server is 16.
```bash
# vLLM
pip install vllm==0.5.2
pip install jsonschema==4.21.1
# Meta-Llama-3-8B-Instruct
python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-8B-Instruct --disable-log-requests
# meta-llama/Meta-Llama-3-70B-Instruct
python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B-Instruct --disable-log-requests --tensor 8
# neuralmagic/Meta-Llama-3-70B-Instruct-FP8
python -m vllm.entrypoints.openai.api_server --model neuralmagic/Meta-Llama-3-70B-Instruct-FP8 --disable-log-requests --tensor 8
```
```bash
wget https://raw.githubusercontent.com/sgl-project/sglang/main/python/sglang/bench_serving.py
```
```bash
# vLLM Offline
python3 bench_serving.py --backend vllm --dataset-name random --num-prompts 4000 --random-input 1024 --random-output 1024 --output-file offline_vllm.jsonl
python3 bench_serving.py --backend vllm --dataset-name random --num-prompts 5000 --random-input 1024 --random-output 512 --output-file offline_vllm.jsonl
python3 bench_serving.py --backend vllm --dataset-name random --num-prompts 1000 --random-input 4096 --random-output 2048 --output-file offline_vllm.jsonl
python3 bench_serving.py --backend vllm --dataset-name random --num-prompts 2000 --random-input 4096 --random-output 1024 --output-file offline_vllm.jsonl
python3 bench_serving.py --backend vllm --dataset-name random --num-prompts 6000 --random-input 256 --random-output 512 --output-file offline_vllm.jsonl
python3 bench_serving.py --backend vllm --dataset-name sharegpt --num-prompts 3000 --output-file offline_vllm.jsonl
cat offline_vllm.jsonl | cut -d':' -f12 | cut -d',' -f1
```
```bash
# vLLM Online
python3 bench_serving.py --backend vllm --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 300 --request-rate 1 --output-file online_vllm.jsonl
python3 bench_serving.py --backend vllm --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 600 --request-rate 2 --output-file online_vllm.jsonl
python3 bench_serving.py --backend vllm --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 1200 --request-rate 4 --output-file online_vllm.jsonl
python3 bench_serving.py --backend vllm --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 2400 --request-rate 8 --output-file online_vllm.jsonl
python3 bench_serving.py --backend vllm --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 3200 --request-rate 16 --output-file online_vllm.jsonl
cat online_vllm.jsonl | cut -d':' -f9 | cut -d',' -f1
```
```bash
# TensorRT LLM Offline 8B
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --num-prompts 4000 --random-input 1024 --random-output 1024 --output-file offline_trt_8b.jsonl
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --num-prompts 5000 --random-input 1024 --random-output 512 --output-file offline_trt_8b.jsonl
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --num-prompts 1000 --random-input 4096 --random-output 2048 --output-file offline_trt_8b.jsonl
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --num-prompts 2000 --random-input 4096 --random-output 1024 --output-file offline_trt_8b.jsonl
python3 bench_serving.py --backend trt --dataset-name random --num-prompts 6000 --random-input 256 --random-output 512 --output-file offline_trt_8b.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name sharegpt --num-prompts 3000 --output-file offline_trt_8b.jsonl
cat offline_trt_8b.jsonl | cut -d':' -f12 | cut -d',' -f1
```
```bash
# TensorRT LLM Online 8B
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 300 --request-rate 1 --output-file online_trt_8b.jsonl
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 600 --request-rate 2 --output-file online_trt_8b.jsonl
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 1200 --request-rate 4 --output-file online_trt_8b.jsonl
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 2400 --request-rate 8 --output-file online_trt_8b.jsonl
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 3200 --request-rate 16 --output-file online_trt_8b.jsonl
cat online_trt_8b.jsonl | cut -d':' -f9 | cut -d',' -f1
```
```bash
# TensorRT LLM Offline 70B
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --num-prompts 4000 --random-input 1024 --random-output 1024 --output-file offline_trt_70b.jsonl
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --num-prompts 5000 --random-input 1024 --random-output 512 --output-file offline_trt_70b.jsonl
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --num-prompts 1000 --random-input 4096 --random-output 2048 --output-file offline_trt_70b.jsonl
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --num-prompts 2000 --random-input 4096 --random-output 1024 --output-file offline_trt_70b.jsonl
python3 bench_serving.py --backend trt --dataset-name random --num-prompts 6000 --random-input 256 --random-output 512 --output-file offline_trt_70b.jsonl --model meta-llama/Meta-Llama-3-70B-Instruct
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name sharegpt --num-prompts 3000 --output-file offline_trt_70b.jsonl
cat offline_trt_70b.jsonl | cut -d':' -f12 | cut -d',' -f1
```
```bash
# TensorRT LLM Online 70B
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 300 --request-rate 1 --output-file online_trt_70b.jsonl
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 600 --request-rate 2 --output-file online_trt_70b.jsonl
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 1200 --request-rate 4 --output-file online_trt_70b.jsonl
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 2400 --request-rate 8 --output-file online_trt_70b.jsonl
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 3200 --request-rate 16 --output-file online_trt_70b.jsonl
cat online_trt_70b.jsonl | cut -d':' -f9 | cut -d',' -f1
```
### `config.json` used for TensorRT LLM
```
{
"architecture": "LlamaForCausalLM",
"dtype": "float16",
"logits_dtype": "float32",
"vocab_size": 128256,
"max_position_embeddings": 8192,
"hidden_size": 16384,
"num_hidden_layers": 126,
"num_attention_heads": 128,
"num_key_value_heads": 16,
"head_size": 128,
"qk_layernorm": false,
"hidden_act": "silu",
"intermediate_size": 53248,
"norm_epsilon": 1e-05,
"position_embedding_type": "rope_gpt_neox",
"use_parallel_embedding": false,
"embedding_sharding_dim": 0,
"share_embedding_table": false,
"mapping": {
"world_size": 8,
"tp_size": 8,
"pp_size": 1,
"gpus_per_node": 8
},
"quantization": {
"quant_algo": "FP8",
"kv_cache_quant_algo": null,
"group_size": 128,
"smoothquant_val": null,
"has_zero_point": false,
"pre_quant_scale": false,
"exclude_modules": [
"lm_head"
]
},
"kv_dtype": "float16",
"rotary_scaling": null,
"residual_mlp": false,
"moe_normalization_mode": null,
"rotary_base": 500000.0,
"moe_num_experts": 0,
"moe_top_k": 0,
"moe_tp_mode": 2,
"attn_bias": false,
"disable_weight_only_quant_plugin": false,
"mlp_bias": false
}
```
### `config.json` used for vLLM and SGLang
```
{
"_name_or_path": "dummy_fp8",
"architectures": [
"LlamaForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"bos_token_id": 128000,
"eos_token_id": 128009,
"hidden_act": "silu",
"hidden_size": 16384,
"initializer_range": 0.02,
"intermediate_size": 53248,
"mlp_bias": false,
"model_type": "llama",
"num_attention_heads": 128,
"num_hidden_layers": 126,
"num_key_value_heads": 8,
"pretraining_tp": 1,
"quantization_config": {
"activation_scheme": "static",
"ignored_layers": [
"lm_head"
],
"quant_method": "fp8"
},
"rope_scaling": {
"factor": 8.0,
"low_freq_factor": 1.0,
"high_freq_factor": 4.0,
"original_max_position_embeddings": 8192,
"rope_type": "llama3"
},
"max_position_embeddings": 131072,
"rms_norm_eps": 1e-05,
"rope_scaling": null,
"rope_theta": 500000.0,
"tie_word_embeddings": false,
"torch_dtype": "bfloat16",
"transformers_version": "4.41.1",
"use_cache": true,
"vocab_size": 128256
}
```
## Download data
```
git clone https://hf-mirror.com/datasets/google/boolq
```
## Convert parquet to json
```
bash parquet_to_json.sh
```
## Run benchmark
### Benchmark sglang
```
python -m sglang.launch_server --model-path ramblingpolymath/Qwen3-32B-W8A8 --port 30000
```
```
python3 bench_sglang.py
```
import argparse
import json
import time
import numpy as np
from sglang.api import set_default_backend
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import read_jsonl
def get_example(lines, i, answer):
prompt = "Question: " + lines[i]["question"] + lines[i]["passage"] + "\nAnswer:"
if answer:
prompt += str(lines[i]["answer"])
return prompt
def few_shot_examples(lines, k):
prompts = ""
for i in range(k):
prompts += get_example(lines, i, True) + "\n\n"
return prompts
def main(args):
# Select backend
set_default_backend(select_sglang_backend(args))
# Read data
train_data_path = args.train_data_path
test_data_path = args.test_data_path
lines_train = list(read_jsonl(train_data_path))
lines_test = list(read_jsonl(test_data_path))
# Construct prompts
num_questions = args.num_questions
num_shots = args.num_shots
few_shots = few_shot_examples(lines_train, num_shots)
questions = []
answer = []
for i in range(len(lines_test[:num_questions])):
questions.append(get_example(lines_test, i, False))
answer.append(str(lines_test[i]["answer"]))
arguments = [{"question": q} for q in questions]
#####################################
######### SGL Program Begin #########
#####################################
import sglang as sgl
@sgl.function
def few_shot_boolq(s, question):
s += few_shots + question
s += sgl.gen("answer", max_tokens=5, stop=["\n"])
#####################################
########## SGL Program End ##########
#####################################
# Run requests
tic = time.perf_counter()
states = few_shot_boolq.run_batch(
arguments,
temperature=0,
num_threads=args.parallel,
progress_bar=True,
)
latency = time.perf_counter() - tic
preds = []
for i in range(len(states)):
preds.append(states[i]["answer"])
# Compute accuracy
acc = np.mean(np.array(preds) == np.array(answer))
# Compute speed
num_output_tokens = sum(
s.get_meta_info("answer")["completion_tokens"] for s in states
)
output_throughput = num_output_tokens / latency
# Print results
print(f"Accuracy: {acc:.3f}")
print(f"Latency: {latency:.3f} s")
print(f"Output throughput: {output_throughput:.3f} token/s")
# Results
with open(args.result_file, "a") as fout:
value = {
"task": "boolq",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"accuracy": round(acc, 3),
"num_requests": args.num_questions,
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
},
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--num-shots", type=int, default=5)
parser.add_argument(
"--train-data-path", type=str, default="./boolq/data/train-00000-of-00001.json"
)
parser.add_argument(
"--test-data-path",
type=str,
default="./boolq/data/validation-00000-of-00001.json",
)
parser.add_argument("--num-questions", type=int, default=200)
args = add_common_sglang_args_and_parse(parser)
main(args)
import sys
import pyarrow.parquet as pq
def convert_parquet_to_json(input_file, output_file):
# read parquet file
table = pq.read_table(input_file)
# turn parquet data to dataframe
df = table.to_pandas()
# turn dataframe to json form
json_data = df.to_json(orient="records", lines=True)
# write json to file
with open(output_file, "w") as f:
f.write(json_data)
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python convert_parquet_to_json.py <input_file> <output_file>")
        sys.exit(1)
input_file = sys.argv[1]
output_file = sys.argv[2]
convert_parquet_to_json(input_file, output_file)
#!/bin/bash
# define input and output directories
input_dir="./boolq/data"
output_dir="./boolq/data"
# define the files to be processed
files=(
"train-00000-of-00001.parquet"
"validation-00000-of-00001.parquet"
)
# for the files above, use the python script to convert the format
for file in "${files[@]}"; do
input_file="${input_dir}/${file}"
output_file="${output_dir}/${file%.parquet}.json"
echo "Converting ${input_file} to ${output_file} ..."
python3 convert_parquet_to_json.py "${input_file}" "${output_file}"
if [ $? -eq 0 ]; then
echo "Conversion successful: ${output_file}"
else
echo "Conversion failed: ${input_file}"
fi
done
## Download data
```
git lfs clone https://huggingface.co/datasets/ceval/ceval-exam
```
## Run benchmark
### Benchmark sglang
```
python -m sglang.launch_server --model-path ramblingpolymath/Qwen3-32B-W8A8 --port 30000
```
```
python3 bench_sglang.py
```
import argparse
import json
import os
import random
import re
import time
import numpy as np
from datasets import load_dataset
from sglang.api import set_default_backend
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
choices = ["A", "B", "C", "D"]
def get_one_example(line, include_answer):
res = line["question"]
res += f"\nA. {line['A']}"
res += f"\nB. {line['B']}"
res += f"\nC. {line['C']}"
res += f"\nD. {line['D']}"
if include_answer:
res += f"\nAnswer: {line['answer']} \n\n"
return res
def get_few_shot_examples(lines):
res = ""
for line in lines:
res += get_one_example(line, True) + "\n\n"
return res
def get_answer_value(response):
pattern = r"(Answer:|answer:|答案是|答案是:|正确答案是:|答案:|Assistant:)\s*([A-D])(?![\w])"
match = re.search(pattern, response)
if match:
return match.group(2)
return random.choice(choices)
def main(args):
# Read data && Construct prompts
arguments = []
labels = []
examples = "examples:\n"
data_path = args.data_path
for subject in os.listdir(data_path):
subject_path = os.path.join(data_path, subject)
if os.path.isdir(subject_path) and subject != ".git":
dataset = load_dataset(data_path, name=subject)
dev_lines_temp = dataset["dev"]
val_lines_temp = dataset["val"]
            few_shot_examples = get_few_shot_examples(dev_lines_temp)
examples += f"{few_shot_examples}"
for val_line in val_lines_temp:
arguments.append(
{
"examples": few_shot_examples,
"question": get_one_example(val_line, False),
}
)
labels.append(val_line["answer"])
#####################################
######### SGL Program Begin #########
#####################################
import sglang as sgl
@sgl.function
def few_shot_ceval(s, examples, question):
s += examples + question + sgl.gen("Answer")
#####################################
########## SGL Program End ##########
#####################################
num_questions = args.num_questions if args.num_questions else len(arguments)
# Select backend
set_default_backend(select_sglang_backend(args))
# Run requests
tic = time.perf_counter()
states = few_shot_ceval.run_batch(
arguments[:num_questions],
temperature=0,
num_threads=args.parallel,
progress_bar=True,
)
latency = time.perf_counter() - tic
preds = [get_answer_value(states[i]["Answer"]) for i in range(num_questions)]
# Compute accuracy
acc = np.mean(np.array(preds) == np.array(labels[:num_questions]))
# Compute speed
num_output_tokens = sum(
s.get_meta_info("Answer")["completion_tokens"] for s in states
)
output_throughput = num_output_tokens / latency
# Print results
print(f"Accuracy: {acc:.3f}")
print(f"Latency: {latency:.3f} s")
print(f"Output throughput: {output_throughput:.3f} token/s")
# Write results
with open(args.result_file, "a") as fout:
value = {
"task": "ceval",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"accuracy": round(acc, 3),
"num_requests": args.num_questions,
"other": {
"parallel": args.parallel,
},
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--data-path", type=str, default="ceval-exam")
parser.add_argument("--num-questions", type=int, default=None)
args = add_common_sglang_args_and_parse(parser)
main(args)
# DeepSeek V3.1/V3/R1 Support
The SGLang and DeepSeek teams collaborated to get DeepSeek V3 FP8 running on NVIDIA and AMD GPUs **from day one**. SGLang also supports [MLA optimization](https://lmsys.org/blog/2024-09-04-sglang-v0-3/#deepseek-multi-head-latent-attention-mla-throughput-optimizations) and [DP attention](https://lmsys.org/blog/2024-12-04-sglang-v0-4/#data-parallelism-attention-for-deepseek-models), making SGLang one of the best open-source LLM engines for running DeepSeek models. SGLang is the inference engine recommended by the official [DeepSeek team](https://github.com/deepseek-ai/DeepSeek-V3/tree/main?tab=readme-ov-file#62-inference-with-sglang-recommended).
Special thanks to Meituan's Search & Recommend Platform Team and Baseten's Model Performance Team for implementing the model, and DataCrunch for providing GPU resources.
For optimizations made on the DeepSeek series models regarding SGLang, please refer to [DeepSeek Model Optimizations in SGLang](https://docs.sglang.ai/basic_usage/deepseek.html).
## Installation & Launch
If you encounter errors when starting the server, make sure the weights have finished downloading. It is recommended to download them beforehand, or to restart the server until all weights are downloaded.
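One way to download the weights beforehand is to pre-populate the local Hugging Face cache. Below is a minimal sketch using `huggingface_hub.snapshot_download`; it assumes `huggingface_hub` is installed and that there is enough free disk space for the full checkpoint (several hundred GB).

```python3
# Pre-download the DeepSeek-V3 weights into the local Hugging Face cache so the
# server does not have to fetch them on first start. Re-running the script
# reuses files that were already downloaded.
from huggingface_hub import snapshot_download

snapshot_download(repo_id="deepseek-ai/DeepSeek-V3")
```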
### Using Docker (Recommended)
```bash
# Pull latest image
# https://hub.docker.com/r/lmsysorg/sglang/tags
docker pull lmsysorg/sglang:latest
# Launch
docker run --gpus all --shm-size 32g -p 30000:30000 -v ~/.cache/huggingface:/root/.cache/huggingface --ipc=host --network=host --privileged lmsysorg/sglang:latest \
python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code --port 30000
```
If you are using RDMA, please note that:
1. `--network host` and `--privileged` are required by RDMA. If you don't need RDMA, you can remove them.
2. You may need to set `NCCL_IB_GID_INDEX` if you are using RoCE, for example: `export NCCL_IB_GID_INDEX=3`.
Add [performance optimization options](#performance-optimization-options) as needed.
### Using pip
```bash
# Installation
pip install "sglang[all]>=0.5.2rc1"
# Launch
python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code
```
Add [performance optimization options](#performance-optimization-options) as needed.
<a id="option_args"></a>
### Performance Optimization Options
[MLA optimizations](https://lmsys.org/blog/2024-09-04-sglang-v0-3/#deepseek-multi-head-latent-attention-mla-throughput-optimizations) are enabled by default. Here are some optional optimizations that can be enabled as needed.
- [Data Parallelism Attention](https://lmsys.org/blog/2024-12-04-sglang-v0-4/#data-parallelism-attention-for-deepseek-models): For high QPS scenarios, add the `--enable-dp-attention` argument to boost throughput.
- [Torch.compile Optimization](https://lmsys.org/blog/2024-09-04-sglang-v0-3/#torchcompile-latency-optimizations): Add the `--enable-torch-compile` argument to enable it. This will take some time while the server starts. The maximum batch size for torch.compile optimization can be controlled with `--torch-compile-max-bs`; it is recommended to set it between `1` and `8` (e.g., `--torch-compile-max-bs 8`).
### Usage: Chat with DeepSeek
#### DeepSeek V3/R1
```python3
import openai
client = openai.Client(
base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
# Chat completion
response = client.chat.completions.create(
model="default",
messages=[
{"role": "system", "content": "You are a helpful AI assistant"},
{"role": "user", "content": "List 3 countries and their capitals."},
],
temperature=0,
max_tokens=64,
)
print(response)
```
#### DeepSeek V3.1
In addition to the basic usage shown in the DeepSeek V3/R1 example, DeepSeek V3.1 supports a request-level thinking/non-thinking toggle. Switch the `"thinking"` field in `extra_body={"chat_template_kwargs": {"thinking": True}}` to enable or disable thinking mode.
##### Non Thinking
```python3
import openai
client = openai.Client(
base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
# Chat completion
response = client.chat.completions.create(
model="default",
messages=[
{"role": "system", "content": "You are a helpful AI assistant"},
{"role": "user", "content": "Answer the following with the second letter of the correct answer only: What is the capital of France?"},
],
temperature=0,
max_tokens=1024,
extra_body = {"chat_template_kwargs": {"thinking": False}}
)
print(response.choices[0].message.content)
```
Answer:
```
h
```
* The expected response is 'A', since the answer to the question is 'Paris' and its second letter is 'a'; with thinking disabled, the model answers incorrectly.
##### Thinking
```python3
import openai
client = openai.Client(
base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
# Chat completion
response = client.chat.completions.create(
model="default",
messages=[
{"role": "system", "content": "You are a helpful AI assistant"},
{"role": "user", "content": "Answer the following with the second letter of the correct answer only: What is the capital of France?"},
],
temperature=0,
max_tokens=1024,
extra_body = {"chat_template_kwargs": {"thinking": True}}
)
print(response)
```
Answer:
```
First, the question is: "What is the capital of France?" I know that the capital of France is Paris.
The user says: "Answer the following with the second letter of the correct answer only." So, I need to provide only the second letter of the correct answer.
The correct answer is "Paris". Now, I need to find the second letter of "Paris".
Let's spell it out: P-A-R-I-S.
- First letter: P
- Second letter: A
- Third letter: R
- Fourth letter: I
- Fifth letter: S
So, the second letter is "A".
I should only output the second letter, which is "A". No additional text or explanation, just the letter.
The user emphasized "the second letter of the correct answer only", so my response should be just "A".
Finally, I need to make sure that this is the correct answer. Yes, Paris is indeed the capital of France.</think>A
```
* The response contains the thinking trace (ending with `</think>`), and the model was able to derive the correct answer, 'A', from it.
### Example: Serving with two H20\*8 nodes
For example, there are two H20 nodes, each with 8 GPUs. The first node's IP is `10.0.0.1`, and the second node's IP is `10.0.0.2`. Please **use the first node's IP** for both commands.
If the command fails, try setting the `GLOO_SOCKET_IFNAME` parameter. For more information, see [Common Environment Variables](https://pytorch.org/docs/stable/distributed.html#common-environment-variables).
If the nodes use NVIDIA InfiniBand and you encounter hanging issues during startup, consider setting `export NCCL_IB_GID_INDEX=3`. For more information, see [this](https://github.com/sgl-project/sglang/issues/3516#issuecomment-2668493307).
```bash
# node 1
python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3 --tp 16 --dist-init-addr 10.0.0.1:5000 --nnodes 2 --node-rank 0 --trust-remote-code
# node 2
python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3 --tp 16 --dist-init-addr 10.0.0.1:5000 --nnodes 2 --node-rank 1 --trust-remote-code
```
If you have two H100 nodes, the usage is similar to the H20 example above.
> **Note that the launch command here does not enable Data Parallelism Attention or `torch.compile` Optimization**. For optimal performance, please refer to the command options in [Performance Optimization Options](#option_args).
### Example: Serving with two H200\*8 nodes and docker
There are two H200 nodes, each with 8 GPUs. The first node's IP is `192.168.114.10`, and the second node's IP is `192.168.114.11`. Configure the endpoint to expose it to another Docker container using `--host 0.0.0.0` and `--port 40000`, and set up communications with `--dist-init-addr 192.168.114.10:20000`.
A single H200 node with 8 GPUs can run DeepSeek V3; the dual-node setup here is just to demonstrate multi-node usage.
```bash
# node 1
docker run --gpus all \
--shm-size 32g \
--network=host \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--name sglang_multinode1 \
-it \
--rm \
--env "HF_TOKEN=$HF_TOKEN" \
--ipc=host \
lmsysorg/sglang:latest \
python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3 --tp 16 --dist-init-addr 192.168.114.10:20000 --nnodes 2 --node-rank 0 --trust-remote-code --host 0.0.0.0 --port 40000
```
```bash
# node 2
docker run --gpus all \
--shm-size 32g \
--network=host \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--name sglang_multinode2 \
-it \
--rm \
--env "HF_TOKEN=$HF_TOKEN" \
--ipc=host \
lmsysorg/sglang:latest \
python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3 --tp 16 --dist-init-addr 192.168.114.10:20000 --nnodes 2 --node-rank 1 --trust-remote-code --host 0.0.0.0 --port 40000
```
To ensure functionality, we include a test from a client Docker container.
```bash
docker run --gpus all \
--shm-size 32g \
--network=host \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--name sglang_multinode_client \
-it \
--rm \
--env "HF_TOKEN=$HF_TOKEN" \
--ipc=host \
lmsysorg/sglang:latest \
python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1 --random-output 512 --random-range-ratio 1 --num-prompts 1 --host 0.0.0.0 --port 40000 --output-file "deepseekv3_multinode.jsonl"
```
> **Note that the launch command here does not enable Data Parallelism Attention or `torch.compile` Optimization**. For optimal performance, please refer to the command options in [Performance Optimization Options](#option_args).
### Example: Serving with four A100\*8 nodes
To serve DeepSeek-V3 with A100 GPUs, first convert the [FP8 model checkpoints](https://huggingface.co/deepseek-ai/DeepSeek-V3) to BF16 using the official [conversion script](https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/fp8_cast_bf16.py).
Since the BF16 model is over 1.3 TB, we need four A100 nodes, each with 8 x 80GB GPUs. Assuming the first node's IP is `10.0.0.1` and the converted model path is `/path/to/DeepSeek-V3-BF16`, we can launch the server with the following commands.
```bash
# node 1
python3 -m sglang.launch_server --model-path /path/to/DeepSeek-V3-BF16 --tp 32 --dist-init-addr 10.0.0.1:5000 --nnodes 4 --node-rank 0 --trust-remote-code --host 0.0.0.0 --port 30000
# node 2
python3 -m sglang.launch_server --model-path /path/to/DeepSeek-V3-BF16 --tp 32 --dist-init-addr 10.0.0.1:5000 --nnodes 4 --node-rank 1 --trust-remote-code
# node 3
python3 -m sglang.launch_server --model-path /path/to/DeepSeek-V3-BF16 --tp 32 --dist-init-addr 10.0.0.1:5000 --nnodes 4 --node-rank 2 --trust-remote-code
# node 4
python3 -m sglang.launch_server --model-path /path/to/DeepSeek-V3-BF16 --tp 32 --dist-init-addr 10.0.0.1:5000 --nnodes 4 --node-rank 3 --trust-remote-code
```
> **Note that the launch command here does not enable Data Parallelism Attention or `torch.compile` Optimization**. For optimal performance, please refer to the command options in [Performance Optimization Options](#option_args).
Then we can benchmark the accuracy and latency by accessing the first node's exposed port with the following example commands.
```bash
# bench accuracy
python3 benchmark/gsm8k/bench_sglang.py --num-questions 1319 --host http://10.0.0.1 --port 30000
# bench latency
python3 -m sglang.bench_one_batch_server --model None --base-url http://10.0.0.1:30000 --batch-size 1 --input-len 128 --output-len 128
```
### Example: Serving with 8 A100/A800 with AWQ Quantization
**Recommended Usage**
Add the `--quantization moe_wna16` flag to enable the moe_wna16 kernel for better performance.
One example is as follows:
```bash
python3 -m sglang.launch_server --model cognitivecomputations/DeepSeek-R1-AWQ --tp 8 --trust-remote-code --quantization moe_wna16
```
Alternatively, you can use `--quantization awq_marlin` as follows:
```bash
python3 -m sglang.launch_server --model cognitivecomputations/DeepSeek-R1-AWQ --tp 8 --trust-remote-code --quantization awq_marlin --dtype float16
```
Note that `awq_marlin` only supports `float16` now, which may lead to some precision loss.
### Example: Serving with 16 A100/A800 with int8 Quantization
There are block-wise and per-channel quantization methods, and the quantization parameters have already been uploaded to Hugging Face:
- [meituan/DeepSeek-R1-Block-INT8](https://huggingface.co/meituan/DeepSeek-R1-Block-INT8)
- [meituan/DeepSeek-R1-Channel-INT8](https://huggingface.co/meituan/DeepSeek-R1-Channel-INT8)
Assuming the master node's IP is `MASTER_IP`, the checkpoint path is `/path/to/DeepSeek-R1-INT8`, and the port is 5000, we can launch the server with the following commands:
```bash
#master
python3 -m sglang.launch_server \
--model meituan/DeepSeek-R1-Block-INT8 --tp 16 --dist-init-addr \
MASTER_IP:5000 --nnodes 2 --node-rank 0 --trust-remote-code --enable-torch-compile --torch-compile-max-bs 8
#cluster
python3 -m sglang.launch_server \
--model meituan/DeepSeek-R1-Block-INT8 --tp 16 --dist-init-addr \
MASTER_IP:5000 --nnodes 2 --node-rank 1 --trust-remote-code --enable-torch-compile --torch-compile-max-bs 8
```
> **Note that the launch command here enables `torch.compile` Optimization**. For optimal performance, please refer to the command options in [Performance Optimization Options](#option_args).
Then on the **master node**, supposing the ShareGPT data is located at `/path/to/ShareGPT_V3_unfiltered_cleaned_split.json`, you can run the following commands to benchmark the launched server:
```bash
# bench accuracy
python3 benchmark/gsm8k/bench_sglang.py --num-questions 1319
# bench serving
python3 -m sglang.bench_serving --dataset-path /path/to/ShareGPT_V3_unfiltered_cleaned_split.json --dataset-name random --random-input 128 --random-output 128 --num-prompts 1000 --request-rate 128 --random-range-ratio 1.0
```
> **Note: using `--parallel 200` can accelerate accuracy benchmarking**.
### Example: Serving with 32 L40S with int8 Quantization
Run with the per-channel quantization model:
- [meituan/DeepSeek-R1-Channel-INT8](https://huggingface.co/meituan/DeepSeek-R1-Channel-INT8)
Assuming that the master node IP is `MASTER_IP`, the checkpoint path is `/path/to/DeepSeek-R1-Channel-INT8`, and the port is 5000, the server can be launched with the following commands:
```bash
# master node (rank 0)
python3 -m sglang.launch_server --model meituan/DeepSeek-R1-Channel-INT8 --tp 32 --quantization w8a8_int8 \
--dist-init-addr MASTER_IP:5000 --nnodes 4 --node-rank 0 --trust-remote-code \
--enable-torch-compile --torch-compile-max-bs 32
# worker nodes (ranks 1-3)
python3 -m sglang.launch_server --model meituan/DeepSeek-R1-Channel-INT8 --tp 32 --quantization w8a8_int8 \
--dist-init-addr MASTER_IP:5000 --nnodes 4 --node-rank 1 --trust-remote-code \
--enable-torch-compile --torch-compile-max-bs 32
python3 -m sglang.launch_server --model meituan/DeepSeek-R1-Channel-INT8 --tp 32 --quantization w8a8_int8 \
--dist-init-addr MASTER_IP:5000 --nnodes 4 --node-rank 2 --trust-remote-code \
--enable-torch-compile --torch-compile-max-bs 32
python3 -m sglang.launch_server --model meituan/DeepSeek-R1-Channel-INT8 --tp 32 --quantization w8a8_int8 \
--dist-init-addr MASTER_IP:5000 --nnodes 4 --node-rank 3 --trust-remote-code \
--enable-torch-compile --torch-compile-max-bs 32
```
The benchmarking method is the same as described in the previous [16 x A100](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-16-a100a800-with-int8-quantization) example.
### Example: Serving on any cloud or Kubernetes with SkyPilot
SkyPilot helps find the cheapest available GPUs across any cloud or existing Kubernetes cluster and launches distributed serving with a single command. See details [here](https://github.com/skypilot-org/skypilot/tree/master/llm/deepseek-r1).
To serve on multiple nodes:
```bash
git clone https://github.com/skypilot-org/skypilot.git
# Serve on 2 H100/H200x8 nodes
sky launch -c r1 llm/deepseek-r1/deepseek-r1-671B.yaml --retry-until-up
# Serve on 4 A100x8 nodes
sky launch -c r1 llm/deepseek-r1/deepseek-r1-671B-A100.yaml --retry-until-up
```
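Once the cluster is provisioned, the standard SkyPilot commands can be used to inspect and tear it down; a brief sketch, where `r1` is the cluster name set by `-c r1` above:
```bash
# check the cluster status
sky status r1
# tear the cluster down when the benchmark is finished
sky down r1
```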
#### Troubleshooting
If you encounter the following error with an fp16/bf16 checkpoint:
```bash
ValueError: Weight output_partition_size = 576 is not divisible by weight quantization block_n = 128.
```
edit your `config.json` and remove the `quantization_config` block, which looks like this:
```json
"quantization_config": {
"activation_scheme": "dynamic",
"fmt": "e4m3",
"quant_method": "fp8",
"weight_block_size": [128, 128]
},
```
Removing this block typically resolves the error. For more details, see the discussion in [sgl-project/sglang#3491](https://github.com/sgl-project/sglang/issues/3491#issuecomment-2650779851).
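If you prefer not to edit the file by hand, the block can also be stripped programmatically. A small sketch, assuming `jq` is installed and the checkpoint lives at the `/path/to/DeepSeek-V3-BF16` path used earlier:
```bash
cd /path/to/DeepSeek-V3-BF16
# keep a backup of the original config, then write a copy without quantization_config
cp config.json config.json.bak
jq 'del(.quantization_config)' config.json.bak > config.json
```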
## DeepSeek V3 Optimization Plan
https://github.com/sgl-project/sglang/issues/2591
## Install
```
pip3 install dspy-ai
```
Turn off the cache at https://github.com/stanfordnlp/dspy/blob/34d8420383ec752037aa271825c1d3bf391e1277/dsp/modules/cache_utils.py#L10 by setting:
```
cache_turn_on = False
```
or set the environment variable
```
export DSP_CACHEBOOL=false
```
## Benchmark SGLang
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```
```
python3 bench_dspy_intro.py --backend sglang
```
## Benchmark TGI
```
docker run --name tgi --rm -ti --gpus all --network host \
-v /home/ubuntu/model_weights/Llama-2-7b-chat-hf:/Llama-2-7b-chat-hf \
ghcr.io/huggingface/text-generation-inference:1.3.0 \
--model-id /Llama-2-7b-chat-hf --num-shard 1 --trust-remote-code \
--max-input-length 2048 --max-total-tokens 4096 \
--port 24000
```
```
python3 bench_dspy_intro.py --backend tgi
```
## Benchmark vLLM
```
python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
```
```
python3 bench_dspy_intro.py --backend vllm
```
"""
Adapted from
https://github.com/stanfordnlp/dspy/blob/34d8420383ec752037aa271825c1d3bf391e1277/intro.ipynb#L9
"""
import argparse
import dspy
from dspy.datasets import HotPotQA
class BasicQA(dspy.Signature):
"""Answer questions with short factoid answers."""
question = dspy.InputField()
answer = dspy.OutputField(desc="often between 1 and 5 words")
class GenerateAnswer(dspy.Signature):
"""Answer questions with short factoid answers."""
context = dspy.InputField(desc="may contain relevant facts")
question = dspy.InputField()
answer = dspy.OutputField(desc="often between 1 and 5 words")
class RAG(dspy.Module):
def __init__(self, num_passages=3):
super().__init__()
self.retrieve = dspy.Retrieve(k=num_passages)
self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
def forward(self, question):
context = self.retrieve(question).passages
prediction = self.generate_answer(context=context, question=question)
return dspy.Prediction(context=context, answer=prediction.answer)
def main(args):
# lm = dspy.OpenAI(model='gpt-3.5-turbo')
if args.backend == "tgi":
lm = dspy.HFClientTGI(
model="meta-llama/Llama-2-7b-chat-hf",
port=args.port,
url="http://localhost",
)
elif args.backend == "sglang":
lm = dspy.HFClientSGLang(
model="meta-llama/Llama-2-7b-chat-hf",
port=args.port,
url="http://localhost",
)
elif args.backend == "vllm":
lm = dspy.HFClientVLLM(
model="meta-llama/Llama-2-7b-chat-hf",
port=args.port,
url="http://localhost",
)
else:
raise ValueError(f"Invalid backend: {args.backend}")
colbertv2_wiki17_abstracts = dspy.ColBERTv2(
url="http://20.102.90.50:2017/wiki17_abstracts"
)
dspy.settings.configure(lm=lm, rm=colbertv2_wiki17_abstracts)
# Load the dataset.
dataset = HotPotQA(
train_seed=1, train_size=20, eval_seed=2023, dev_size=args.dev_size, test_size=0
)
# Tell DSPy that the 'question' field is the input. Any other fields are labels and/or metadata.
trainset = [x.with_inputs("question") for x in dataset.train]
devset = [x.with_inputs("question") for x in dataset.dev]
print(len(trainset), len(devset))
train_example = trainset[0]
print(f"Question: {train_example.question}")
print(f"Answer: {train_example.answer}")
dev_example = devset[18]
print(f"Question: {dev_example.question}")
print(f"Answer: {dev_example.answer}")
print(f"Relevant Wikipedia Titles: {dev_example.gold_titles}")
print(
f"For this dataset, training examples have input keys {train_example.inputs().keys()} and label keys {train_example.labels().keys()}"
)
print(
f"For this dataset, dev examples have input keys {dev_example.inputs().keys()} and label keys {dev_example.labels().keys()}"
)
# Define the predictor.
generate_answer = dspy.Predict(BasicQA)
# Call the predictor on a particular input.
pred = generate_answer(question=dev_example.question)
# Print the input and the prediction.
print(f"Question: {dev_example.question}")
print(f"Predicted Answer: {pred.answer}")
lm.inspect_history(n=1)
# Define the predictor. Notice we're just changing the class. The signature BasicQA is unchanged.
generate_answer_with_chain_of_thought = dspy.ChainOfThought(BasicQA)
# Call the predictor on the same input.
pred = generate_answer_with_chain_of_thought(question=dev_example.question)
# Print the input, the chain of thought, and the prediction.
print(f"Question: {dev_example.question}")
print(f"Thought: {pred.rationale.split('.', 1)[1].strip()}")
print(f"Predicted Answer: {pred.answer}")
retrieve = dspy.Retrieve(k=3)
topK_passages = retrieve(dev_example.question).passages
print(
f"Top {retrieve.k} passages for question: {dev_example.question} \n",
"-" * 30,
"\n",
)
for idx, passage in enumerate(topK_passages):
print(f"{idx+1}]", passage, "\n")
retrieve("When was the first FIFA World Cup held?").passages[0]
from dspy.teleprompt import BootstrapFewShot
# Validation logic: check that the predicted answer is correct.
# Also check that the retrieved context does actually contain that answer.
def validate_context_and_answer(example, pred, trace=None):
answer_EM = dspy.evaluate.answer_exact_match(example, pred)
answer_PM = dspy.evaluate.answer_passage_match(example, pred)
return answer_EM and answer_PM
# Set up a basic teleprompter, which will compile our RAG program.
teleprompter = BootstrapFewShot(metric=validate_context_and_answer)
# Compile!
compiled_rag = teleprompter.compile(RAG(), trainset=trainset)
# Ask any question you like to this simple RAG program.
my_question = "What castle did David Gregory inherit?"
# Get the prediction. This contains `pred.context` and `pred.answer`.
pred = compiled_rag(my_question)
# Print the contexts and the answer.
print(f"Question: {my_question}")
print(f"Predicted Answer: {pred.answer}")
print(f"Retrieved Contexts (truncated): {[c[:200] + '...' for c in pred.context]}")
from dspy.evaluate.evaluate import Evaluate
# Set up the `evaluate_on_hotpotqa` function. We'll use this many times below.
evaluate_on_hotpotqa = Evaluate(
devset=devset,
num_threads=args.num_threads,
display_progress=True,
display_table=5,
)
# Evaluate the `compiled_rag` program with the `answer_exact_match` metric.
metric = dspy.evaluate.answer_exact_match
evaluate_on_hotpotqa(compiled_rag, metric=metric)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--port", type=int)
parser.add_argument("--num-threads", type=int, default=32)
parser.add_argument("--dev-size", type=int, default=150)
parser.add_argument(
"--backend", type=str, choices=["sglang", "tgi", "vllm"], default="sglang"
)
args = parser.parse_args()
if args.port is None:
default_port = {
"vllm": 21000,
"lightllm": 22000,
"tgi": 24000,
"sglang": 30000,
}
args.port = default_port.get(args.backend, None)
main(args)
## Download the dataset
```
wget -O agent_calls.jsonl "https://drive.google.com/uc?export=download&id=19qLpD45e9JGTKF2cUjJJegwzSUEZEKht"
```
## Run benchmark
Ensure that this benchmark is run serially (using `--parallel 1`) to preserve any potential dependencies between requests.
### Benchmark sglang
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```
```
python3 bench_sglang.py --num-events 1000 --parallel 1
```
### Benchmark vllm
```
python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
```
```
python3 bench_other.py --num-events 1000 --backend vllm --parallel 1
```
### Benchmark guidance
```
python3 bench_other.py --num-events 1000 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
```
### Benchmark lmql
```
python3 bench_other.py --num-events 1000 --backend lmql --parallel 1
```
import sglang as sgl
# The top five agent functions below account for ~70% of LLM calls
# reference: https://github.com/joonspk-research/generative_agents/
@sgl.function
def poignancy_event(s, persona_name, persona_iss, event):
s += "Here is a brief description of " + persona_name + ".\n"
s += persona_iss + "\n"
s += "On the scale of 1 to 10, where 1 is purely mundane (e.g., brushing teeth, making bed) and 10 is extremely poignant (e.g., a break up, college acceptance), rate the likely poignancy of the following event for"
s += persona_name + ".\n\n"
s += "Event: " + event
s += "Rate (return a number between 1 to 10):"
s += sgl.gen(name="Rate", max_tokens=2)
def poignancy_event_prompt(persona_name, persona_iss, event):
# return prompt and max_tokens
s = ""
s += "Here is a brief description of " + persona_name + ".\n"
s += persona_iss + "\n"
s += "On the scale of 1 to 10, where 1 is purely mundane (e.g., brushing teeth, making bed) and 10 is extremely poignant (e.g., a break up, college acceptance), rate the likely poignancy of the following event for"
s += persona_name + ".\n\n"
s += "Event: " + event
s += "Rate (return a number between 1 to 10):"
return {"prompt": s, "max_tokens": 2, "stop": None}
@sgl.function
def generate_event_triple(s, persona_name, action):
s += """Task: Turn the input into (subject, predicate, object).
Input: Sam Johnson is eating breakfast.
Output: (Dolores Murphy, eat, breakfast)
---
Input: Joon Park is brewing coffee.
Output: (Joon Park, brew, coffee)
---
Input: Jane Cook is sleeping.
Output: (Jane Cook, is, sleep)
---
Input: Michael Bernstein is writing email on a computer.
Output: (Michael Bernstein, write, email)
---
Input: Percy Liang is teaching students in a classroom.
Output: (Percy Liang, teach, students)
---
Input: Merrie Morris is running on a treadmill.
Output: (Merrie Morris, run, treadmill)
---"""
s += persona_name + "is" + action + ".\n"
s += "(" + persona_name + ","
s += sgl.gen(name="Triple", max_tokens=20, stop=")")
def generate_event_triple_prompt(persona_name, action):
s = ""
s += """Task: Turn the input into (subject, predicate, object).
Input: Sam Johnson is eating breakfast.
Output: (Dolores Murphy, eat, breakfast)
---
Input: Joon Park is brewing coffee.
Output: (Joon Park, brew, coffee)
---
Input: Jane Cook is sleeping.
Output: (Jane Cook, is, sleep)
---
Input: Michael Bernstein is writing email on a computer.
Output: (Michael Bernstein, write, email)
---
Input: Percy Liang is teaching students in a classroom.
Output: (Percy Liang, teach, students)
---
Input: Merrie Morris is running on a treadmill.
Output: (Merrie Morris, run, treadmill)
---"""
s += persona_name + "is" + action + ".\n"
s += "(" + persona_name + ","
return {"prompt": s, "max_tokens": 20, "stop": ")"}
@sgl.function
def generate_pronunciatio(s, action):
s += "Convert an action description to an emoji (important: use two or less emojis).\n"
s += "Action description: " + action + ".\n"
s += "Emoji:" + sgl.gen(name="Emoji", max_tokens=6)
def generate_pronunciatio_prompt(action):
s = ""
s += "Convert an action description to an emoji (important: use two or less emojis).\n"
s += "Action description: " + action + ".\n"
s += "Emoji:"
return {"prompt": s, "max_tokens": 6, "stop": None}
@sgl.function
def action_location_sector(
s,
persona_name,
living_sector,
living_sector_areas,
current_sector,
current_sector_areas,
daily_plan,
sector_options,
current_action,
next_action,
):
s += """Task -- choose an appropriate area from the area options for a task at hand.
Sam Kim lives in {Sam Kim's house} that has Sam Kim's room, bathroom, kitchen.
Sam Kim is currently in {Sam Kim's house} that has Sam Kim's room, bathroom, kitchen.
Area options: {Sam Kim's house, The Rose and Crown Pub, Hobbs Cafe, Oak Hill College, Johnson Park, Harvey Oak Supply Store, The Willows Market and Pharmacy}.
* Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place.
* Must be one of the "Area options," verbatim.
For taking a walk, Sam Kim should go to the following area: {Johnson Park}
---
Jane Anderson lives in {Oak Hill College Student Dormatory} that has Jane Anderson's room.
Jane Anderson is currently in {Oak Hill College} that has a classroom, library
Area options: {Oak Hill College Student Dormatory, The Rose and Crown Pub, Hobbs Cafe, Oak Hill College, Johnson Park, Harvey Oak Supply Store, The Willows Market and Pharmacy}.
* Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place.
* Must be one of the "Area options," verbatim.
For eating dinner, Jane Anderson should go to the following area: {Hobbs Cafe}
---"""
s += (
persona_name
+ " lives in "
+ living_sector
+ " that has "
+ living_sector_areas
+ ".\n"
)
s += (
persona_name
+ " is currently in "
+ current_sector
+ " that has "
+ current_sector_areas
+ ".\n"
)
s += daily_plan + ".\n"
s += "Area options: " + sector_options + ".\n"
s += """* Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place.
* Must be one of the "Area options," verbatim.\n"""
s += (
persona_name
+ " is "
+ current_action
+ ". For "
+ next_action
+ ", "
+ persona_name
+ " should go to the following area: {"
)
s += sgl.gen(name="Location", max_tokens=10, stop="}")
def action_location_sector_prompt(
persona_name,
living_sector,
living_sector_areas,
current_sector,
current_sector_areas,
daily_plan,
sector_options,
current_action,
next_action,
):
s = ""
s += """Task -- choose an appropriate area from the area options for a task at hand.
Sam Kim lives in {Sam Kim's house} that has Sam Kim's room, bathroom, kitchen.
Sam Kim is currently in {Sam Kim's house} that has Sam Kim's room, bathroom, kitchen.
Area options: {Sam Kim's house, The Rose and Crown Pub, Hobbs Cafe, Oak Hill College, Johnson Park, Harvey Oak Supply Store, The Willows Market and Pharmacy}.
* Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place.
* Must be one of the "Area options," verbatim.
For taking a walk, Sam Kim should go to the following area: {Johnson Park}
---
Jane Anderson lives in {Oak Hill College Student Dormatory} that has Jane Anderson's room.
Jane Anderson is currently in {Oak Hill College} that has a classroom, library
Area options: {Oak Hill College Student Dormatory, The Rose and Crown Pub, Hobbs Cafe, Oak Hill College, Johnson Park, Harvey Oak Supply Store, The Willows Market and Pharmacy}.
* Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place.
* Must be one of the "Area options," verbatim.
For eating dinner, Jane Anderson should go to the following area: {Hobbs Cafe}
---"""
s += (
persona_name
+ " lives in "
+ living_sector
+ " that has "
+ living_sector_areas
+ ".\n"
)
s += (
persona_name
+ " is currently in "
+ current_sector
+ " that has "
+ current_sector_areas
+ ".\n"
)
s += daily_plan + ".\n"
s += "Area options: " + sector_options + ".\n"
s += """* Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place.
* Must be one of the "Area options," verbatim.\n"""
s += (
persona_name
+ " is "
+ current_action
+ ". For "
+ next_action
+ ", "
+ persona_name
+ " should go to the following area: {"
)
return {"prompt": s, "max_tokens": 10, "stop": "}"}
@sgl.function
def action_location_object(
s, persona_name, target_sector, target_sector_areas, current_action, next_action
):
s += """
Jane Anderson is in kitchen in Jane Anderson's house.
Jane Anderson is going to Jane Anderson's house that has the following areas: {kitchen, bedroom, bathroom}
Stay in the current area if the activity can be done there. Never go into other people's rooms unless necessary.
For cooking, Jane Anderson should go to the following area in Jane Anderson's house:
Answer: {kitchen}
---
Tom Watson is in common room in Tom Watson's apartment.
Tom Watson is going to Hobbs Cafe that has the following areas: {cafe}
Stay in the current area if the activity can be done there. Never go into other people's rooms unless necessary.
For getting coffee, Tom Watson should go to the following area in Hobbs Cafe:
Answer: {cafe}
---"""
s += (
persona_name
+ " is going to "
+ target_sector
+ " that has the following areas: {"
+ target_sector_areas
+ "}\n"
)
s += """* Stay in the current area if the activity can be done there.
* NEVER go into other people's rooms unless necessary."""
s += (
persona_name
+ " is "
+ current_action
+ ". For "
+ next_action
+ ", "
+ persona_name
+ "should go to the following area in "
+ target_sector
)
s += " (MUST pick one of {" + target_sector_areas + "}):\n"
s += "Answer: {" + sgl.gen(name="Area", max_tokens=5, stop="}")
def action_location_object_prompt(
persona_name, target_sector, target_sector_areas, current_action, next_action
):
s = ""
s += """
Jane Anderson is in kitchen in Jane Anderson's house.
Jane Anderson is going to Jane Anderson's house that has the following areas: {kitchen, bedroom, bathroom}
Stay in the current area if the activity can be done there. Never go into other people's rooms unless necessary.
For cooking, Jane Anderson should go to the following area in Jane Anderson's house:
Answer: {kitchen}
---
Tom Watson is in common room in Tom Watson's apartment.
Tom Watson is going to Hobbs Cafe that has the following areas: {cafe}
Stay in the current area if the activity can be done there. Never go into other people's rooms unless necessary.
For getting coffee, Tom Watson should go to the following area in Hobbs Cafe:
Answer: {cafe}
---"""
s += (
persona_name
+ " is going to "
+ target_sector
+ " that has the following areas: {"
+ target_sector_areas
+ "}\n"
)
s += """* Stay in the current area if the activity can be done there.
* NEVER go into other people's rooms unless necessary."""
s += (
persona_name
+ " is "
+ current_action
+ ". For "
+ next_action
+ ", "
+ persona_name
+ "should go to the following area in "
+ target_sector
)
s += " (MUST pick one of {" + target_sector_areas + "}):\n"
s += "Answer: {"
return {"prompt": s, "max_tokens": 5, "stop": "}"}
import argparse
import json
import time
from agent_functions import (
action_location_object_prompt,
action_location_sector_prompt,
generate_event_triple_prompt,
generate_pronunciatio_prompt,
poignancy_event_prompt,
)
from tqdm import tqdm
from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
from sglang.utils import dump_state_text, read_jsonl
def main(args):
lines = read_jsonl(args.data_path)[: args.num_events]
mapping = {
"poignancy_event": poignancy_event_prompt,
"generate_event_triple": generate_event_triple_prompt,
"generate_pronunciatio": generate_pronunciatio_prompt,
"action_location_sector": action_location_sector_prompt,
"action_location_object": action_location_object_prompt,
}
arguments = [mapping[k](**v) for l in lines for k, v in l.items()]
states = []
# Select backend
call_generate = get_call_generate(args)
def get_one_answer(arg):
answer = call_generate(**arg, temperature=0)
states.append(answer)
async def get_one_answer_async(arg):
answer = await call_generate(**arg, temperature=0)
states.append(answer)
tic = time.perf_counter()
# agent calls are always executed sequentially to preserve their dependencies
if args.backend != "lmql":
for arg in tqdm(arguments):
get_one_answer(arg)
else:
import asyncio
loop = asyncio.get_event_loop()
for arg in tqdm(arguments):
loop.run_until_complete(get_one_answer_async(arg))
latency = time.perf_counter() - tic
print(f"Latency: {latency:.3f}")
# Write results
dump_state_text(f"tmp_output_{args.backend}.txt", states)
with open(args.result_file, "a") as fout:
value = {
"task": "Generative Agents",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
# to pack weighted functions as a single agent
"num_requests": len(arguments) / len(mapping),
"other": {
"parallel": args.parallel,
},
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--data-path", type=str, default="agent_calls.jsonl")
parser.add_argument("--num-events", type=int, default=10)
args = add_common_other_args_and_parse(parser)
main(args)