Commit 909abb58 authored by maxiao

adapt to sglang v0.5.2rc1 on dcu

import argparse
import json
import time
from agent_functions import (
action_location_object,
action_location_sector,
generate_event_triple,
generate_pronunciatio,
poignancy_event,
)
import sglang as sgl
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import dump_state_text, read_jsonl
def main(args):
lines = read_jsonl(args.data_path)[: args.num_events]
mapping = {
"poignancy_event": poignancy_event,
"generate_event_triple": generate_event_triple,
"generate_pronunciatio": generate_pronunciatio,
"action_location_sector": action_location_sector,
"action_location_object": action_location_object,
}
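# Each JSONL record keys a single agent-function name (a key of `mapping`) to the
# kwargs for that call; the comprehension below resolves those names to functions.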
arguments = [{mapping[k]: v for k, v in l.items()} for l in lines]
# Select backend
backend = select_sglang_backend(args)
sgl.set_default_backend(backend)
states = []
# Run requests
tic = time.perf_counter()
for a in arguments:
# only a single key in the dict
for func, arg in a.items():
result = func.run(**arg)
result.sync()
states.append(result)
latency = time.perf_counter() - tic
# Compute accuracy
print(f"Latency: {latency:.3f}")
# Write results
dump_state_text(f"tmp_output_{args.backend}.txt", states)
with open(args.result_file, "a") as fout:
value = {
"task": "Generative Agents",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
# count the agent functions bundled per event as a single logical request
"num_requests": len(arguments) / len(mapping),
"other": {
"num_events": args.num_events,
"parallel": args.parallel,
},
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--data-path", type=str, default="agent_calls.jsonl")
parser.add_argument("--num-events", type=int, default=10)
args = add_common_sglang_args_and_parse(parser)
main(args)
# How to reproduce the GPT-OSS results with SGLang
### Install the latest SGLang
```bash
git clone https://github.com/sgl-project/sglang.git
cd sglang
git checkout v0.5.1.post3
pip install --upgrade pip
pip install -e "python[all]"
```
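As an optional sanity check (not part of the upstream instructions), you can confirm the editable install resolves to the pinned tag by printing the package version:
```bash
python3 -c "import sglang; print(sglang.__version__)"
```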
### Reproduce the benchmark throughput result (Batch Size 1)
Launch Command
```bash
# MXFP4 120B on H100
python3 -m sglang.launch_server --model openai/gpt-oss-120b --tp 8 --attention-backend triton
# BF16 120B on H100
python3 -m sglang.launch_server --model lmsys/gpt-oss-120b-bf16 --tp 8 --attention-backend triton
# MXFP4 120B on B200
python3 -m sglang.launch_server --model openai/gpt-oss-120b --tp 4
# BF16 120B on B200
python3 -m sglang.launch_server --model lmsys/gpt-oss-120b-bf16 --tp 4
```
Benchmark Command
```bash
# MXFP4 120B on H100
python3 -m sglang.bench_one_batch_server --model openai/gpt-oss-120b --base-url http://localhost:30000 --batch-size 1 --input-len 1024 --output-len 512 --show-report
```
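The BF16 and B200 runs use the same benchmark flags with the matching model id; for example, against the BF16 server launched above (a sketch, not from the upstream README):
```bash
# BF16 120B (server launched with lmsys/gpt-oss-120b-bf16)
python3 -m sglang.bench_one_batch_server --model lmsys/gpt-oss-120b-bf16 --base-url http://localhost:30000 --batch-size 1 --input-len 1024 --output-len 512 --show-report
```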
### Reproduce the benchmark throughput result (Batch Size 32)
Launch Command
```bash
# MXFP4 120B on H100
python3 -m sglang.launch_server --model openai/gpt-oss-120b --tp 8
# BF16 120B on H100
python3 -m sglang.launch_server --model lmsys/gpt-oss-120b-bf16 --tp 8
# MXFP4 120B on B200
python3 -m sglang.launch_server --model openai/gpt-oss-120b --tp 4
# BF16 120B on B200
python3 -m sglang.launch_server --model lmsys/gpt-oss-120b-bf16 --tp 4
```
Benchmark Command
```bash
python3 -m sglang.bench_one_batch_server --model openai/gpt-oss-120b --base-url http://localhost:30000 --batch-size 32 --input-len 1024 8192 --output-len 512 --show-report
```
### Reproduce the evaluation result
Install gpt-oss
```bash
git clone https://github.com/openai/gpt-oss.git
cd gpt-oss
pip install -e .
```
Evaluation Command
```bash
DATASET=gpqa
BASE_URL=YOUR_BASE_URL
OPENAI_API_KEY=dummy python -m gpt_oss.evals \
--base-url ${BASE_URL}/v1 \
--model dummy \
--reasoning-effort low,medium,high \
--eval $DATASET \
--n-threads 1000
```
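For example, if the server was launched locally with the default port as in the sections above, the base URL would be (illustrative value, substitute your own endpoint):
```bash
BASE_URL=http://localhost:30000
```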
### Reproduce the benchmark result of acceptance length
> Note: On B200, if `--speculative-eagle-topk` is 1, set `--attention-backend trtllm_mha`
```bash
git clone https://github.com/sgl-project/SpecForge.git
cd SpecForge/benchmarks
config_list=(
"1,0,0,0"
"1,3,1,4"
"1,5,4,8"
)
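# Each config entry appears to bundle one speculative-decoding setting; the last three
# fields line up with num-steps / eagle-topk / num-draft-tokens used elsewhere in this
# doc, and "0,0,0" is the no-speculation baseline (field meanings assumed, not verified).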
python3 bench_model_speedup.py \
--model-path openai/gpt-oss-120b \
--speculative-draft-model-path lmsys/EAGLE3-gpt-oss-120b-bf16 \
--port 20001 \
--trust-remote-code \
--mem-fraction-static 0.8 \
--tp-size 4 \
--attention-backend fa3 \
--config-list "${config_list[@]}" \
--benchmark-list mtbench:80 gsm8k:200 humaneval:200 math500:200 \
--output lmsys_gpt-oss-120b_Eagle3_result.jsonl
python3 bench_model_speedup.py \
--model-path openai/gpt-oss-120b \
--speculative-draft-model-path nvidia/gpt-oss-120b-Eagle3 \
--port 20001 \
--trust-remote-code \
--mem-fraction-static 0.8 \
--tp-size 4 \
--attention-backend fa3 \
--config-list "${config_list[@]}" \
--benchmark-list mtbench:80 gsm8k:200 humaneval:200 math500:200 \
--output nv_gpt-oss-120b_Eagle3_result.jsonl
```
### Reproduce the result of speculative decoding speedup
Launch Command
```bash
# On Hopper:
# - Tree decoding (topk > 1) and chain decoding (topk = 1) are supported on both FA3 and Triton backends.
python3 -m sglang.launch_server --model openai/gpt-oss-120b --speculative-algorithm EAGLE3 --speculative-draft-model-path lmsys/EAGLE3-gpt-oss-120b-bf16 --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 --tp 4
python3 -m sglang.launch_server --model openai/gpt-oss-120b --speculative-algorithm EAGLE3 --speculative-draft-model-path lmsys/EAGLE3-gpt-oss-120b-bf16 --speculative-num-steps 5 --speculative-eagle-topk 4 --speculative-num-draft-tokens 8 --tp 4
# On Blackwell:
# - Chain decoding (topk = 1) is supported on TRTLLM-MHA backend. Tree decoding (topk > 1) is in progress, stay tuned!
# - Both tree decoding (topk > 1) and chain decoding (topk = 1) are supported on the Triton backend.
python3 -m sglang.launch_server --model openai/gpt-oss-120b --speculative-algo EAGLE3 --speculative-draft lmsys/EAGLE3-gpt-oss-120b-bf16 --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 --tp 4
python3 -m sglang.launch_server --model openai/gpt-oss-120b --speculative-algo EAGLE3 --speculative-draft lmsys/EAGLE3-gpt-oss-120b-bf16 --speculative-num-steps 5 --speculative-eagle-topk 4 --speculative-num-draft-tokens 8 --attention-backend triton --tp 4
```
Benchmark Command
```bash
config_list=(
"1,0,0,0"
"1,3,1,4"
"1,5,4,8"
)
python3 bench_model_speedup.py \
--model-path openai/gpt-oss-120b \
--speculative-draft-model-path lmsys/EAGLE3-gpt-oss-120b-bf16 \
--port 20001 \
--trust-remote-code \
--mem-fraction-static 0.8 \
--tp-size 4 \
--attention-backend fa3 \
--config-list "${config_list[@]}" \
--benchmark-list gsm8k:200 humaneval:200 math500:200 \
--output lmsys_gpt-oss-120b_Eagle3_result.jsonl
```
The best speedups are obtained with the following settings:
- **1.39x** speedup with the `--speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4` setting.
- **1.52x** speedup with the `--speculative-num-steps 5 --speculative-eagle-topk 4 --speculative-num-draft-tokens 8` setting.
## Run benchmark
### Benchmark sglang
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```
```
python3 bench_sglang.py --num-questions 200
```
### Benchmark vllm
```
python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
```
```
python3 bench_other.py --num-questions 200 --backend vllm
```
### Benchmark lightllm
```
# A10G
python -m lightllm.server.api_server --tokenizer_mode auto --model_dir ~/model_weights/llama-2-7b-chat-hf --max_total_token_num 16000 --port 22000
```
```
python3 bench_other.py --num-questions 200 --backend lightllm
```
### Benchmark guidance
```
python3 bench_other.py --num-questions 200 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
```
### Benchmark lmql
```
CUDA_VISIBLE_DEVICES=0,1 lmql serve-model meta-llama/Llama-2-7b-chat-hf --cuda --port 23000
```
```
python3 bench_other.py --num-questions 100 --backend lmql --parallel 2
```
import argparse
import ast
import asyncio
import json
import re
import time
from concurrent.futures import ThreadPoolExecutor
import numpy as np
from tqdm import tqdm
from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
from sglang.utils import download_and_cache_file, dump_state_text, read_jsonl
INVALID = -9999999
def get_one_example(lines, i, include_answer):
ret = "Question: " + lines[i]["question"] + "\nAnswer:"
if include_answer:
ret += " " + lines[i]["answer"]
return ret
def get_few_shot_examples(lines, k):
ret = ""
for i in range(k):
ret += get_one_example(lines, i, True) + "\n\n"
return ret
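# Pull the final number out of a model answer, e.g. "So the total is 1,234." -> 1234;
# returns INVALID if the answer contains no digits.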
def get_answer_value(answer_str):
answer_str = answer_str.replace(",", "")
numbers = re.findall(r"\d+", answer_str)
if len(numbers) < 1:
return INVALID
try:
return ast.literal_eval(numbers[-1])
except SyntaxError:
return INVALID
def main(args):
# Select backend
call_generate = get_call_generate(args)
# Read data
url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl"
filename = download_and_cache_file(url)
lines = list(read_jsonl(filename))
# Construct prompts
num_questions = args.num_questions
num_shots = args.num_shots
few_shot_examples = get_few_shot_examples(lines, num_shots)
questions = []
labels = []
for i in range(len(lines[:num_questions])):
questions.append(get_one_example(lines, i, False))
labels.append(get_answer_value(lines[i]["answer"]))
assert all(l != INVALID for l in labels)
states = [None] * len(labels)
# Run requests
if args.backend != "lmql":
# Use thread pool
def get_one_answer(i):
answer = call_generate(
prompt=few_shot_examples + questions[i],
temperature=0,
max_tokens=256,
stop=["Question", "Assistant:", "<|separator|>"],
)
states[i] = answer
tic = time.perf_counter()
if args.parallel == 1:
for i in tqdm(range(len(questions))):
get_one_answer(i)
else:
with ThreadPoolExecutor(args.parallel) as executor:
list(
tqdm(
executor.map(get_one_answer, list(range(len(questions)))),
total=len(questions),
)
)
else:
# Use asyncio
async def batched_call(batch_size):
for i in range(0, len(questions), batch_size):
tasks = []
for q in questions[i : i + batch_size]:
tasks.append(
call_generate(
few_shot_examples + q,
temperature=0,
max_tokens=256,
stop="Question",
)
)
rets = await asyncio.gather(*tasks)
for j in range(len(rets)):
states[i + j] = rets[j]
tic = time.perf_counter()
asyncio.run(batched_call(batch_size=args.parallel))
latency = time.perf_counter() - tic
preds = []
for i in range(len(states)):
preds.append(get_answer_value(states[i]))
# Compute accuracy
acc = np.mean(np.array(preds) == np.array(labels))
invalid = np.mean(np.array(preds) == INVALID)
# Print results
print(f"Accuracy: {acc:.3f}")
print(f"Invalid: {invalid:.3f}")
print(f"Latency: {latency:.3f} s")
# Dump results
dump_state_text(f"tmp_output_{args.backend}.txt", states)
with open(args.result_file, "a") as fout:
value = {
"task": "gsm8k",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"accuracy": round(acc, 3),
"num_requests": args.num_questions,
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
},
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--num-shots", type=int, default=5)
parser.add_argument("--data-path", type=str, default="test.jsonl")
parser.add_argument("--num-questions", type=int, default=200)
args = add_common_other_args_and_parse(parser)
main(args)
import argparse
import ast
import json
import os
import re
import time
import numpy as np
from sglang.lang.api import set_default_backend
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
dump_bench_raw_result,
select_sglang_backend,
)
from sglang.utils import download_and_cache_file, dump_state_text, read_jsonl
INVALID = -9999999
def get_one_example(lines, i, include_answer):
ret = "Question: " + lines[i]["question"] + "\nAnswer:"
if include_answer:
ret += " " + lines[i]["answer"]
return ret
def get_few_shot_examples(lines, k):
ret = ""
for i in range(k):
ret += get_one_example(lines, i, True) + "\n\n"
return ret
def get_answer_value(answer_str):
answer_str = answer_str.replace(",", "")
numbers = re.findall(r"\d+", answer_str)
if len(numbers) < 1:
return INVALID
try:
return ast.literal_eval(numbers[-1])
except SyntaxError:
return INVALID
def main(args):
# Select backend
set_default_backend(select_sglang_backend(args))
# Read data
data_path = args.data_path
url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl"
if not os.path.isfile(data_path):
data_path = download_and_cache_file(url)
lines = list(read_jsonl(data_path))
# Construct prompts
num_questions = args.num_questions
num_shots = args.num_shots
few_shot_examples = get_few_shot_examples(lines, num_shots)
questions = []
labels = []
for i in range(len(lines[:num_questions])):
questions.append(get_one_example(lines, i, False))
labels.append(get_answer_value(lines[i]["answer"]))
assert all(l != INVALID for l in labels)
arguments = [{"question": q} for q in questions]
#####################################
######### SGL Program Begin #########
#####################################
import sglang as sgl
@sgl.function
def few_shot_gsm8k(s, question):
s += few_shot_examples + question
s += sgl.gen(
"answer", max_tokens=512, stop=["Question", "Assistant:", "<|separator|>"]
)
#####################################
########## SGL Program End ##########
#####################################
# Run requests
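# run_batch executes few_shot_gsm8k once per argument dict, using up to
# args.parallel worker threads against the selected backend.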
tic = time.perf_counter()
states = few_shot_gsm8k.run_batch(
arguments,
temperature=0,
num_threads=args.parallel,
progress_bar=True,
)
latency = time.perf_counter() - tic
preds = []
for i in range(len(states)):
preds.append(get_answer_value(states[i]["answer"]))
# Compute accuracy
acc = np.mean(np.array(preds) == np.array(labels))
invalid = np.mean(np.array(preds) == INVALID)
# Compute speed
num_output_tokens = sum(
s.get_meta_info("answer")["completion_tokens"] for s in states
)
output_throughput = num_output_tokens / latency
# Print results
print(f"Accuracy: {acc:.3f}")
print(f"Invalid: {invalid:.3f}")
print(f"Latency: {latency:.3f} s")
print(f"Output throughput: {output_throughput:.3f} token/s")
# Dump results
dump_state_text(f"tmp_output_{args.backend}.txt", states)
dump_bench_raw_result(
path=args.raw_result_file,
states=states,
preds=preds,
labels=labels,
)
with open(args.result_file, "a") as fout:
value = {
"task": "gsm8k",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"accuracy": round(acc, 3),
"num_requests": args.num_questions,
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
},
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--num-shots", type=int, default=5)
parser.add_argument("--data-path", type=str, default="test.jsonl")
parser.add_argument("--num-questions", type=int, default=200)
args = add_common_sglang_args_and_parse(parser)
main(args)
## Run benchmark
### Benchmark sglang
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```
```
python3 bench_sglang.py --num-questions 200
```
### Benchmark vllm
```
python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
```
```
python3 bench_other.py --num-questions 200 --backend vllm
```
### Benchmark lightllm
```
# A10G
python -m lightllm.server.api_server --tokenizer_mode auto --model_dir ~/model_weights/llama-2-7b-chat-hf --max_total_token_num 16000 --port 22000
```
```
python3 bench_other.py --num-questions 200 --backend lightllm
```
### Benchmark guidance
```
CUDA_VISIBLE_DEVICES=0,1 python3 bench_other.py --num-questions 200 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
```
### Benchmark lmql
```
lmql serve-model meta-llama/Llama-2-7b-chat-hf --cuda --port 23000
```
```
python3 bench_other.py --num-questions 200 --backend lmql --port 23000 --parallel 1
```
import argparse
import asyncio
import json
import time
from concurrent.futures import ThreadPoolExecutor
import numpy as np
from tqdm import tqdm
from sglang.test.test_utils import add_common_other_args_and_parse, get_call_select
from sglang.utils import download_and_cache_file, read_jsonl
def get_one_example(lines, i, include_answer):
ret = lines[i]["activity_label"] + ": " + lines[i]["ctx"] + " "
if include_answer:
ret += lines[i]["endings"][lines[i]["label"]]
return ret
def get_few_shot_examples(lines, k):
ret = ""
for i in range(k):
ret += get_one_example(lines, i, True) + "\n\n"
return ret
def main(args):
# Select backend
call_select = get_call_select(args)
# Read data
url = "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl"
filename = download_and_cache_file(url)
lines = list(read_jsonl(filename))
# Construct prompts
num_questions = args.num_questions
num_shots = args.num_shots
few_shot_examples = get_few_shot_examples(lines, num_shots)
questions = []
choices = []
labels = []
for i in range(len(lines[:num_questions])):
questions.append(get_one_example(lines, i, False))
choices.append(lines[i]["endings"])
labels.append(lines[i]["label"])
preds = [None] * len(labels)
# Run requests
if args.backend != "lmql":
# Use thread pool
def get_one_answer(i):
preds[i] = call_select(
context=few_shot_examples + questions[i], choices=choices[i]
)
tic = time.perf_counter()
if args.parallel == 1:
for i in tqdm(range(len(questions))):
get_one_answer(i)
else:
with ThreadPoolExecutor(args.parallel) as executor:
list(
tqdm(
executor.map(get_one_answer, list(range(len(questions)))),
total=len(questions),
)
)
else:
# Use asyncio
async def batched_call(batch_size):
for i in range(0, len(questions), batch_size):
tasks = []
for q, c in zip(
questions[i : i + batch_size], choices[i : i + batch_size]
):
tasks.append(call_select(context=few_shot_examples + q, choices=c))
rets = await asyncio.gather(*tasks)
for j in range(len(rets)):
preds[i + j] = rets[j]
tic = time.perf_counter()
asyncio.run(batched_call(batch_size=args.parallel))
latency = time.perf_counter() - tic
# Compute accuracy
acc = np.mean(np.array(preds) == np.array(labels))
print(f"Latency: {latency:.3f}")
print(f"Accuracy: {acc:.3f}")
# Write results
with open(args.result_file, "a") as fout:
value = {
"task": "hellaswag",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"accuracy": round(acc, 3),
"num_requests": args.num_questions,
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
},
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--num-shots", type=int, default=20)
parser.add_argument("--data-path", type=str, default="hellaswag_val.jsonl")
parser.add_argument("--num-questions", type=int, default=200)
args = add_common_other_args_and_parse(parser)
main(args)
import argparse
import json
import os
import time
import numpy as np
from sglang.lang.api import set_default_backend
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import download_and_cache_file, read_jsonl
def get_one_example(lines, i, include_answer):
ret = lines[i]["activity_label"] + ": " + lines[i]["ctx"] + " "
if include_answer:
ret += lines[i]["endings"][lines[i]["label"]]
return ret
def get_few_shot_examples(lines, k):
ret = ""
for i in range(k):
ret += get_one_example(lines, i, True) + "\n\n"
return ret
def main(args):
# Select backend
set_default_backend(select_sglang_backend(args))
# Read data
data_path = args.data_path
url = "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl"
if not os.path.isfile(data_path):
data_path = download_and_cache_file(url)
lines = list(read_jsonl(data_path))
# Construct prompts
num_questions = args.num_questions
num_shots = args.num_shots
few_shot_examples = get_few_shot_examples(lines, num_shots)
questions = []
choices = []
labels = []
for i in range(len(lines[:num_questions])):
questions.append(get_one_example(lines, i, False))
choices.append(lines[i]["endings"])
labels.append(lines[i]["label"])
arguments = [{"question": q, "choices": c} for q, c in zip(questions, choices)]
#####################################
######### SGL Program Begin #########
#####################################
import sglang as sgl
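# sgl.select asks the backend to score each candidate ending and returns the
# highest-likelihood one, so no free-form generation is needed here.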
@sgl.function
def few_shot_hellaswag(s, question, choices):
s += few_shot_examples + question
s += sgl.select("answer", choices=choices)
#####################################
########## SGL Program End ##########
#####################################
# Run requests
tic = time.perf_counter()
rets = few_shot_hellaswag.run_batch(
arguments,
temperature=0,
num_threads=args.parallel,
progress_bar=True,
)
preds = [choices[i].index(rets[i]["answer"]) for i in range(len(rets))]
latency = time.perf_counter() - tic
# Compute accuracy
acc = np.mean(np.array(preds) == np.array(labels))
print(f"Latency: {latency:.3f}")
print(f"Accuracy: {acc:.3f}")
# Write results
with open(args.result_file, "a") as fout:
value = {
"task": "hellaswag",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"accuracy": round(acc, 3),
"num_requests": args.num_questions,
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
},
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--num-shots", type=int, default=20)
parser.add_argument("--data-path", type=str, default="hellaswag_val.jsonl")
parser.add_argument("--num-questions", type=int, default=200)
args = add_common_sglang_args_and_parse(parser)
main(args)
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages:/usr/local/lib/python3.12/dist-packages/torch/lib
python3 benchmark/hf3fs/bench_client.py
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages:/usr/local/lib/python3.12/dist-packages/torch/lib
SGLANG_HICACHE_HF3FS_CONFIG_PATH=/sgl-workspace/sglang/benchmark/hf3fs/hf3fs.json \
python3 benchmark/hf3fs/bench_storage.py
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages:/usr/local/lib/python3.12/dist-packages/torch/lib
export SGLANG_HICACHE_HF3FS_CONFIG_PATH=/sgl-workspace/sglang/benchmark/hf3fs/hf3fs.json
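# Write a minimal hf3fs config: a 1 TiB file at /data/hf3fs-test-0, 16 jobs, 8 entries
# (these are the values used for this benchmark run).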
echo '{"file_path_prefix": "/data/hf3fs-test-0", "file_size": 1099511627776, "numjobs": 16, "entries": 8}' > \
${SGLANG_HICACHE_HF3FS_CONFIG_PATH}
python3 benchmark/hf3fs/bench_zerocopy.py
####################################################################################################
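# Single-node Qwen3-32B: launch the server with hierarchical caching backed by hf3fs,
# then replay multi-turn ShareGPT traffic against it with bench_multiturn.py.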
rm -rf nohup.out && \
nohup python3 -m sglang.launch_server \
--model-path /code/models/Qwen3-32B/ \
--host 0.0.0.0 --port 33301 \
--page-size 64 \
--enable-hierarchical-cache \
--hicache-ratio 2 --hicache-size 0 \
--hicache-write-policy write_through \
--hicache-storage-backend hf3fs &
rm -rf bench_multiturn.out && \
nohup python3 benchmark/hicache/bench_multiturn.py \
--model-path /code/models/Qwen3-32B \
--dataset-path /code/models/ShareGPT_V3_unfiltered_cleaned_split.json \
--port 33301 \
--request-length 2048 --num-clients 512 --num-rounds 3 --max-parallel 8 \
> bench_multiturn.out &
####################################################################################################
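# Two-node DeepSeek-R1 (tp 16 across 2 nodes): same workflow, with a larger hicache
# size and more concurrent benchmark clients.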
rm -rf nohup.out && \
nohup python3 -m sglang.launch_server \
--model-path /code/models/DeepSeek-R1/ \
--tp 16 --nnodes 2 --node-rank 0 \
--dist-init-addr 10.74.249.153:5000 \
--host 0.0.0.0 --port 33301 \
--page-size 64 \
--enable-hierarchical-cache \
--hicache-ratio 2 --hicache-size 60 \
--hicache-write-policy write_through \
--hicache-storage-backend hf3fs &
rm -rf bench_multiturn.out && \
nohup python3 benchmark/hicache/bench_multiturn.py \
--model-path /code/models/Qwen3-32B \
--dataset-path /code/models/ShareGPT_V3_unfiltered_cleaned_split.json \
--port 33301 \
--request-length 2048 --num-clients 1024 --num-rounds 3 --max-parallel 8 \
> bench_multiturn.out &
####################################################################################################
ps aux | grep "sglang.launch_server" | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep "bench_multiturn.py" | grep -v grep | awk '{print $2}' | xargs kill -9