"src/vscode:/vscode.git/clone" did not exist on "37a5f1b3b69ed284086fb31fb1b49668cba6c365"
Unverified commit 4aa5dd2c authored by Lianmin Zheng, committed by GitHub

Update version to v0.1.13 (#280)

parent 13662fd5
......@@ -11,8 +11,7 @@ We tested our system on the following common LLM workloads and reported the achi
- **[DSPy RAG](https://github.com/stanfordnlp/dspy)**: A retrieval-augmented generation pipeline in the DSPy tutorial.
- **[LLaVA Bench](https://github.com/haotian-liu/LLaVA)**: Running LLaVA v1.5, a vision language model on the LLaVA-in-the-wild benchmark.
We tested both Llama-7B on one NVIDIA A10G GPU (24GB) and Mixtral-8x7B on 8 NVIDIA A10G GPUs with tensor parallelism, using FP16 precision. We used vllm v0.2.5, guidance v0.1.8, and Hugging Face TGI v1.3.0 as baseline systems.
We tested both Llama-7B on one NVIDIA A10G GPU (24GB) and Mixtral-8x7B on 8 NVIDIA A10G GPUs with tensor parallelism, using FP16 precision. We used vllm v0.2.5, guidance v0.1.8, Hugging Face TGI v1.3.0, and SGLang v0.1.5.
- Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1
![llama_7b](../assets/llama_7b.jpg)
......
......@@ -5,14 +5,7 @@ It can be used in SGLang runtime to accelerate attention computation.
### Install flashinfer
You can install flashinfer via pip as follows for CUDA 12.1.
```bash
pip install flashinfer -i https://flashinfer.ai/whl/cu121/
```
You can find wheels for other CUDA versions at https://github.com/flashinfer-ai/flashinfer?tab=readme-ov-file#installation. If there is no prebuilt version for your environment,
please build it from source (the compilation takes a long time).
See https://docs.flashinfer.ai/installation.html.
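To confirm the installed wheel works with your local CUDA toolkit, a quick import check is usually enough. This is a minimal sketch; nothing beyond a successful import is assumed about flashinfer's module layout.
```python
# Sanity check: the flashinfer wheel must match the local CUDA toolkit,
# otherwise the import typically fails with a loader or ABI error.
import torch
import flashinfer

print("torch CUDA version:", torch.version.cuda)
print("flashinfer loaded from:", flashinfer.__file__)
```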
### Run a Server With Flashinfer Mode
......
......@@ -37,6 +37,23 @@ python3 bench_sglang.py --nsub 3
# Average accuracy: 0.413
```
#### GSM-8K
```
cd benchmark/gsm8k
```
Follow README.md to download the data.
```
python3 bench_sglang.py --num-q 200
# Expected performance on A10G
# Latency: 32.103
# Accuracy: 0.250
```
#### More
Please also test `benchmark/hellaswag` and `benchmark/latency_throughput`.
### More Models
#### LLaVA
......@@ -48,6 +65,9 @@ python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenize
```
cd benchmark/llava_bench
python3 bench_sglang.py
# Expected performance on A10G
# Latency: 50.031
```
## SGLang Unit Tests
......
......@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "sglang"
version = "0.1.12"
version = "0.1.13"
description = "A structured generation langauge for LLMs."
readme = "README.md"
requires-python = ">=3.8"
......
__version__ = "0.1.12"
__version__ = "0.1.13"
from sglang.api import *
from sglang.global_config import global_config
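Since the version string is bumped in both `pyproject.toml` and `sglang/__init__.py`, a quick check that a reinstalled package reports the new number can catch a missed spot. A minimal sketch:
```python
# After reinstalling from this commit, the module-level version
# should match the version field in pyproject.toml.
import sglang

assert sglang.__version__ == "0.1.13"
print(sglang.__version__)
```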
......@@ -17,7 +17,7 @@ class RadixAttention(nn.Module):
from sglang.srt.managers.router.model_runner import global_server_args_dict
if global_server_args_dict["enable_flashinfer"]:
if global_server_args_dict.get("enable_flashinfer", False):
self.prefill_forward = self.prefill_forward_flashinfer
self.extend_forward = self.prefill_forward_flashinfer
self.decode_forward = self.decode_forward_flashinfer
......
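The change from direct indexing to `.get()` with a `False` default makes these call sites tolerant of a missing key, for example when the module is exercised outside the full server setup. A minimal sketch of the difference; the dict contents here are illustrative only, not taken from the server:
```python
# Illustrative only: in tests that never populate the server args,
# the flashinfer flag may simply be absent from the dict.
server_args = {}

# server_args["enable_flashinfer"]  # direct indexing would raise KeyError
enable_flashinfer = server_args.get("enable_flashinfer", False)  # falls back to False

print(enable_flashinfer)  # -> False
```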
......@@ -7,7 +7,7 @@ import triton.language as tl
from sglang.srt.managers.router.model_runner import global_server_args_dict
from sglang.srt.utils import wrap_kernel_launcher
if global_server_args_dict["attention_reduce_in_fp32"]:
if global_server_args_dict.get("attention_reduce_in_fp32", False):
REDUCE_TRITON_TYPE = tl.float32
REDUCE_TORCH_TYPE = torch.float32
else:
......
......@@ -222,7 +222,7 @@ class InputMetadata:
if forward_mode == ForwardMode.EXTEND:
ret.init_extend_args()
if global_server_args_dict["enable_flashinfer"]:
if global_server_args_dict.get("enable_flashinfer", False):
ret.init_flashinfer_args(tp_size)
return ret
......
......@@ -28,8 +28,8 @@ def test_generate_worker(model_path, tp_rank, tp_size):
reqs = []
for i in range(len(prompts)):
req = Req(i, None, None)
req.input_ids = tokenizer.encode(prompts[i])[:cut_num]
input_ids = tokenizer.encode(prompts[i])[:cut_num]
req = Req(i, prompts[i], input_ids)
req.sampling_params = sampling_params
reqs.append(req)
......@@ -60,7 +60,7 @@ def test_generate_worker(model_path, tp_rank, tp_size):
# Decode
for i in range(6):
batch.prepare_for_decode(next_token_ids.cpu().numpy())
logits = model.forward(batch, ForwardMode.DECODE)
logits, _ = model.forward(batch, ForwardMode.DECODE)
next_token_ids, next_token_probs = batch.sample(logits)
print(
......
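Across these test files, `model.forward` and `model.forward_decode` are now unpacked as a pair, with the second element discarded. The diff does not show what that second value is, so the sketch below only illustrates the unpacking pattern; `model`, `batch`, and `ForwardMode` come from the surrounding test.
```python
# The forward call now returns two values; the tests only need the logits,
# so the second element (not shown in this diff) is ignored with `_`.
logits, _ = model.forward(batch, ForwardMode.DECODE)
next_token_ids, next_token_probs = batch.sample(logits)
```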
......@@ -71,7 +71,7 @@ def test_generate_worker(
) = model.token_to_kv_pool.alloc_contiguous(batch_size)
model.req_to_token_pool.req_to_token[req_pool_indices, seq_lens] = out_cache_loc
seq_lens.add_(1)
logits = model.forward_decode(
logits, _ = model.forward_decode(
torch.from_numpy(predict_ids).cuda().reshape(-1),
req_pool_indices,
seq_lens,
......@@ -80,6 +80,7 @@ def test_generate_worker(
None,
out_cache_cont_start,
out_cache_cont_end,
False,
)
prob_out = torch.softmax(logits, dim=-1)
predict_ids = torch.argmax(prob_out, dim=1, keepdim=True)
......
......@@ -63,7 +63,7 @@ def decode(step, model, tp_rank, batch_size, predict_ids, params, print_logits):
) = model.token_to_kv_pool.alloc_contiguous(batch_size)
model.req_to_token_pool.req_to_token[req_pool_indices, seq_lens] = out_cache_loc
seq_lens.add_(1)
logits = model.forward_decode(
logits, _ = model.forward_decode(
torch.from_numpy(predict_ids).cuda().reshape(-1),
req_pool_indices,
seq_lens,
......@@ -72,6 +72,7 @@ def decode(step, model, tp_rank, batch_size, predict_ids, params, print_logits):
None,
out_cache_cont_start,
out_cache_cont_end,
False,
)
prob_out = torch.softmax(logits, dim=-1)
predict_ids = torch.argmax(prob_out, dim=1, keepdim=True)
......@@ -92,7 +93,7 @@ def test_generate_worker(
# Prepare data
prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: <image>\nDescribe this picture ASSISTANT:"
image_path = "/home/ubuntu/sglang/test/lang/image.png"
image_path = "/home/ubuntu/sglang/test/lang/test_image.png"
image = load_image(image_path)
processor = get_processor("llava-hf/llava-1.5-7b-hf")
......