sglang · Commit 97e0f7d2 (Unverified)

docs: update comment (#721)

Authored Jul 25, 2024 by Yineng Zhang; committed by GitHub Jul 25, 2024
Parent: d5146bae

Showing 4 changed files with 108 additions and 29 deletions (+108 -29)
benchmark/blog_v0_2/405b_sglang.sh  +3 -10
benchmark/blog_v0_2/405b_trt.sh     +1 -8
benchmark/blog_v0_2/405b_vllm.sh    +4 -11
benchmark/blog_v0_2/config.md       +100 -0
benchmark/blog_v0_2/405b_sglang.sh (view file @ 97e0f7d2)

  # create ~/llama-3.1-405b-fp8-dummy and create config.json and tokenizer:
- # config.json from https://gist.github.com/zhyncs/748597c44d47b45fa15866a4ae2c2b29?permalink_comment_id=5128893
+ # config.json from ./config.md
- # wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer.json?download=true
+ # wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer.json
- # wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer_config.json?download=true
+ # wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer_config.json
  # Launch sglang
  # python -m sglang.launch_server --model ~/llama-3.1-405b-fp8-dummy/ --load-format dummy --tp 8 --quant fp8 --disable-radix --mem-frac 0.88
@@ -19,10 +19,3 @@ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-pro
  python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 > sglang/log33
  python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 > sglang/log34
  python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 > sglang/log35
- # python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 1000 --request-rate 32 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 > sglang/log36
- # python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompt 1000 --request-rate 1 > sglang/log41
- # python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompt 1000 --request-rate 2 > sglang/log42
- # python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompt 1000 --request-rate 4 > sglang/log43
- # python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompt 1000 --request-rate 8 > sglang/log44
- # python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompt 1000 --request-rate 16 > sglang/log45
- # python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompt 1000 --request-rate 32 > sglang/log46
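For reference, the setup the updated comments describe amounts to a few shell steps. This is a minimal sketch only, assuming the directory name from the comments and the config block added in ./config.md below; it is not part of the script itself.

```
mkdir -p ~/llama-3.1-405b-fp8-dummy
cd ~/llama-3.1-405b-fp8-dummy
# config.json: paste the "used for vLLM and SGLang" block from ./config.md
wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer.json
wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer_config.json
# then launch as commented above:
# python -m sglang.launch_server --model ~/llama-3.1-405b-fp8-dummy/ --load-format dummy --tp 8 --quant fp8 --disable-radix --mem-frac 0.88
```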
benchmark/blog_v0_2/405b_trt.sh (view file @ 97e0f7d2)

  # Launch trtllm
- # https://gist.github.com/zhyncs/748597c44d47b45fa15866a4ae2c2b29?permalink_comment_id=5129302
+ # https://github.com/sgl-project/tensorrt-demo
  # offline
  python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 2500 --random-input 1024 --random-output 1024 --random-range-ratio 0.5 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log11
@@ -14,10 +14,3 @@ python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random
  python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log33
  python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log34
  python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log35
- # python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 1000 --request-rate 32 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log36
- # python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name sharegpt --num-prompt 1000 --request-rate 1 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log41
- # python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name sharegpt --num-prompt 1000 --request-rate 2 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log42
- # python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name sharegpt --num-prompt 1000 --request-rate 4 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log43
- # python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name sharegpt --num-prompt 1000 --request-rate 8 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log44
- # python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name sharegpt --num-prompt 1000 --request-rate 16 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log45
- # python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name sharegpt --num-prompt 1000 --request-rate 32 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log46
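The three kept benchmark lines differ only in their (num-prompt, request-rate) pair and log index; the same sweep written as a loop looks roughly like the sketch below (purely illustrative, the script keeps them as separate lines):

```
i=33
for pair in "1200 4" "2400 8" "3200 16"; do
  set -- $pair  # $1 = num-prompt, $2 = request-rate
  python3 ../../python/sglang/bench_serving.py --backend trt \
    --dataset-name random --num-prompt "$1" --request-rate "$2" \
    --random-input 4096 --random-output 1024 --random-range-ratio 0.125 \
    --model meta-llama/Meta-Llama-3-8B-Instruct > "trtllm/log$i"
  i=$((i + 1))
done
```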
benchmark/blog_v0_2/405b_vllm.sh (view file @ 97e0f7d2)

  # create ~/llama-3.1-405b-fp8-dummy and create config.json and tokenizer:
- # config.json from https://gist.github.com/zhyncs/748597c44d47b45fa15866a4ae2c2b29?permalink_comment_id=5128893
+ # config.json from ./config.md
- # (remove the new llama3 rope_scaling entry to run with vLLM 0.5.2)
+ # remove the new llama3 rope_scaling entry to run with vLLM 0.5.2
- # wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer.json?download=true
+ # wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer.json
- # wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer_config.json?download=true
+ # wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer_config.json
  # Launch vllm
  # python3 -m vllm.entrypoints.openai.api_server --model ~/llama-3.1-405b-fp8-dummy/ --load-format dummy --disable-log-requests --tensor-parallel-size 8 --max-model-len 10000
@@ -20,10 +20,3 @@ python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name rando
  python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 > vllm/log33
  python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 > vllm/log34
  python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 > vllm/log35
- # python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 1000 --request-rate 32 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 > vllm/log36
- # python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name sharegpt --num-prompt 1000 --request-rate 1 > vllm/log41
- # python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name sharegpt --num-prompt 1000 --request-rate 2 > vllm/log42
- # python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name sharegpt --num-prompt 1000 --request-rate 4 > vllm/log43
- # python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name sharegpt --num-prompt 1000 --request-rate 8 > vllm/log44
- # python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name sharegpt --num-prompt 1000 --request-rate 16 > vllm/log45
- # python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name sharegpt --num-prompt 1000 --request-rate 32 > vllm/log46
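The rope_scaling note above can be applied mechanically; a hedged sketch using jq (assuming jq is installed; hand-editing config.json works just as well):

```
# Drop the llama3 rope_scaling entry so vLLM 0.5.2 accepts the config
# (a sketch; the comment above only asks for the entry to be removed).
cfg=~/llama-3.1-405b-fp8-dummy/config.json
jq 'del(.rope_scaling)' "$cfg" > "$cfg.tmp" && mv "$cfg.tmp" "$cfg"
```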
benchmark/blog_v0_2/config.md (new file, 0 → 100644, view file @ 97e0f7d2)
### used for TensorRT LLM
```
{
    "architecture": "LlamaForCausalLM",
    "dtype": "float16",
    "logits_dtype": "float32",
    "vocab_size": 128256,
    "max_position_embeddings": 8192,
    "hidden_size": 16384,
    "num_hidden_layers": 126,
    "num_attention_heads": 128,
    "num_key_value_heads": 16,
    "head_size": 128,
    "qk_layernorm": false,
    "hidden_act": "silu",
    "intermediate_size": 53248,
    "norm_epsilon": 1e-05,
    "position_embedding_type": "rope_gpt_neox",
    "use_parallel_embedding": false,
    "embedding_sharding_dim": 0,
    "share_embedding_table": false,
    "mapping": {
        "world_size": 8,
        "tp_size": 8,
        "pp_size": 1,
        "gpus_per_node": 8
    },
    "quantization": {
        "quant_algo": "FP8",
        "kv_cache_quant_algo": null,
        "group_size": 128,
        "smoothquant_val": null,
        "has_zero_point": false,
        "pre_quant_scale": false,
        "exclude_modules": [
            "lm_head"
        ]
    },
    "kv_dtype": "float16",
    "rotary_scaling": null,
    "residual_mlp": false,
    "moe_normalization_mode": null,
    "rotary_base": 500000.0,
    "moe_num_experts": 0,
    "moe_top_k": 0,
    "moe_tp_mode": 2,
    "attn_bias": false,
    "disable_weight_only_quant_plugin": false,
    "mlp_bias": false
}
```
### used for vLLM and SGLang
```
{
    "_name_or_path": "dummy_fp8",
    "architectures": [
        "LlamaForCausalLM"
    ],
    "attention_bias": false,
    "attention_dropout": 0.0,
    "bos_token_id": 128000,
    "eos_token_id": 128009,
    "hidden_act": "silu",
    "hidden_size": 16384,
    "initializer_range": 0.02,
    "intermediate_size": 53248,
    "mlp_bias": false,
    "model_type": "llama",
    "num_attention_heads": 128,
    "num_hidden_layers": 126,
    "num_key_value_heads": 8,
    "pretraining_tp": 1,
    "quantization_config": {
        "activation_scheme": "static",
        "ignored_layers": [
            "lm_head"
        ],
        "quant_method": "fp8"
    },
    "rope_scaling": {
        "factor": 8.0,
        "low_freq_factor": 1.0,
        "high_freq_factor": 4.0,
        "original_max_position_embeddings": 8192,
        "rope_type": "llama3"
    },
    "max_position_embeddings": 131072,
    "rms_norm_eps": 1e-05,
    "rope_scaling": null,
    "rope_theta": 500000.0,
    "tie_word_embeddings": false,
    "torch_dtype": "bfloat16",
    "transformers_version": "4.41.1",
    "use_cache": true,
    "vocab_size": 128256
}
```
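Note that the second block lists `rope_scaling` twice (the llama3 object, then `null`), and standard JSON parsers keep only the last occurrence, so a quick parse check is worthwhile once the file is saved as the dummy checkpoint's config.json. A minimal sketch, assuming the path from the scripts above:

```
python3 - <<'EOF'
import json, os

# Load the saved config and show which rope_scaling value survived
# (json.load keeps the last duplicate key, so this prints None here).
path = os.path.expanduser("~/llama-3.1-405b-fp8-dummy/config.json")
with open(path) as f:
    cfg = json.load(f)
print(cfg["model_type"], cfg["num_hidden_layers"], cfg["rope_scaling"])
EOF
```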