Unverified commit c31f084c, authored by Yineng Zhang, committed by GitHub

chore: update vllm to 0.5.4 (#966)

parent a01ddd96
@@ -34,8 +34,7 @@ jobs:
           pip cache purge
           pip install --upgrade pip
           pip install -e "python[all]"
-          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/ --force-reinstall
-          pip install --upgrade transformers
+          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
       - name: Benchmark Serving Throughput
         run: |
......
@@ -34,8 +34,7 @@ jobs:
           pip cache purge
           pip install --upgrade pip
           pip install -e "python[all]"
-          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/ --force-reinstall
-          pip install --upgrade transformers
+          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
           pip install accelerate
       - name: Test Frontend Language
......
@@ -49,7 +49,7 @@ pip install --upgrade pip
 pip install "sglang[all]"
 
 # Install FlashInfer CUDA kernels
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```
 
 ### Method 2: From source
@@ -62,7 +62,7 @@ pip install --upgrade pip
 pip install -e "python[all]"
 
 # Install FlashInfer CUDA kernels
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```
 
 ### Method 3: Using docker
......
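The FlashInfer wheel index path encodes both the CUDA toolkit and the torch minor version, so it has to track the torch bump that comes with vllm 0.5.4. As a minimal sketch (illustrative, not part of this change), the matching index URL can be derived from the local torch build, assuming a CUDA-enabled torch install:

```python
# Illustrative sketch: derive the FlashInfer wheel index that matches the
# local torch build. Assumes a CUDA-enabled torch (torch.version.cuda is
# None on CPU-only builds).
import torch

def flashinfer_index_url() -> str:
    cuda_tag = "cu" + torch.version.cuda.replace(".", "")   # e.g. "cu121"
    major, minor = torch.__version__.split(".")[:2]         # e.g. "2", "4"
    return f"https://flashinfer.ai/whl/{cuda_tag}/torch{major}.{minor}/"

print(flashinfer_index_url())  # expect .../cu121/torch2.4/ after this change
```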
@@ -29,6 +29,6 @@ RUN pip3 --no-cache-dir install --upgrade pip \
     && git clone --depth=1 https://github.com/sgl-project/sglang.git \
     && cd sglang \
     && pip --no-cache-dir install -e "python[all]" \
-    && pip3 --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
+    && pip3 --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 
 ENV DEBIAN_FRONTEND=interactive
@@ -23,7 +23,7 @@ dependencies = [
 srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
        "packaging", "pillow", "psutil", "pydantic", "python-multipart",
        "torch", "uvicorn", "uvloop", "zmq",
-       "vllm==0.5.3.post1", "outlines>=0.0.44"]
+       "vllm==0.5.4", "outlines>=0.0.44"]
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
......
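Because the pin is exact, a stale environment fails in confusing ways at import time rather than at install time. A quick sketch (illustrative only, not in this commit) that asserts the installed version satisfies the new pin before launching anything:

```python
# Illustrative sketch: verify the environment matches the new exact pin.
from importlib.metadata import PackageNotFoundError, version

try:
    installed = version("vllm")
except PackageNotFoundError:
    raise SystemExit("vllm is not installed; run: pip install 'sglang[srt]'")

assert installed == "0.5.4", f"expected vllm==0.5.4, got vllm=={installed}"
```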
@@ -14,6 +14,7 @@ PACKAGE_LIST = [
     "sglang",
     "flashinfer",
     "triton",
+    "transformers",
     "requests",
     "tqdm",
     "numpy",
......
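Adding "transformers" here surfaces its version in environment reports now that the separate `pip install --upgrade transformers` CI step is gone. A hedged sketch of how such a list is typically consumed (the printing loop is a stand-in, not the project's actual reporting helper):

```python
# Illustrative sketch: dump installed versions for an environment report.
from importlib.metadata import PackageNotFoundError, version

PACKAGE_LIST = ["sglang", "flashinfer", "triton", "transformers",
                "requests", "tqdm", "numpy"]

for name in PACKAGE_LIST:
    try:
        print(f"{name}=={version(name)}")
    except PackageNotFoundError:
        print(f"{name}: not installed")
```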
@@ -18,9 +18,7 @@ import torch
 from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner
 
 MODELS = [
-    # (model_name, tp_size)
     ("meta-llama/Meta-Llama-3.1-8B-Instruct", 1),
-    # ("meta-llama/Meta-Llama-3.1-8B-Instruct", 2),
 ]
 
 TORCH_DTYPES = [torch.float16]
@@ -51,7 +49,7 @@ class TestCausalModels(unittest.TestCase):
             hf_logprobs = torch.Tensor(hf_outputs.top_input_logprobs[i])
             srt_logprobs = torch.Tensor(srt_outputs.top_input_logprobs[i])
-            tolerance = 2e-2
+            tolerance = 3e-2
             assert torch.all(
                 abs(hf_logprobs - srt_logprobs) < tolerance
             ), f"prefill logprobs not all close"
......
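The assertion compares HuggingFace and SRT prefill logprobs elementwise, and the bound is widened from 2e-2 to 3e-2, presumably to absorb small numeric drift from the kernel changes in vllm 0.5.4. A self-contained sketch of the same check, with made-up logprob values:

```python
# Illustrative sketch of the elementwise tolerance check used above.
import torch

def logprobs_close(hf: torch.Tensor, srt: torch.Tensor,
                   tolerance: float = 3e-2) -> bool:
    # Same shape as the test: every absolute difference must stay
    # strictly below the tolerance.
    return bool(torch.all((hf - srt).abs() < tolerance))

hf  = torch.tensor([-1.000, -2.500, -0.750])
srt = torch.tensor([-1.025, -2.485, -0.760])   # max abs diff = 0.025

assert logprobs_close(hf, srt)             # passes with the new 3e-2 bound
assert not logprobs_close(hf, srt, 2e-2)   # would have failed at the old bound
```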
@@ -20,7 +20,7 @@ if __name__ == "__main__":
     arg_parser.add_argument(
         "--timeout-per-file",
         type=int,
-        default=1000,
+        default=2000,
         help="The time limit for running one file in seconds.",
     )
     arg_parser.add_argument(
......
@@ -11,7 +11,7 @@ class TestAccuracy(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = f"http://localhost:8157"
+        cls.base_url = "http://127.0.0.1:8157"
         cls.process = popen_launch_server(
             cls.model,
             cls.base_url,
......
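Swapping `localhost` for the literal `127.0.0.1` (and dropping the no-op f-string) likely sidesteps hosts where `localhost` resolves to the IPv6 `::1` first while the test server binds only IPv4. A quick way to see the difference (illustrative; output varies by machine):

```python
# Illustrative: on dual-stack machines "localhost" can resolve to ::1 as
# well as 127.0.0.1, while "127.0.0.1" is unambiguously IPv4.
import socket

for host in ("localhost", "127.0.0.1"):
    families = {info[0].name for info in socket.getaddrinfo(host, 8157)}
    print(host, "->", sorted(families))   # e.g. ['AF_INET', 'AF_INET6'] vs ['AF_INET']
```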
@@ -11,7 +11,7 @@ class TestAccuracy(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = f"http://localhost:8157"
+        cls.base_url = "http://127.0.0.1:8157"
         cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300)
 
     @classmethod
......
@@ -14,7 +14,7 @@ class TestOpenAIServer(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = f"http://localhost:8157"
+        cls.base_url = "http://127.0.0.1:8157"
         cls.api_key = "sk-123456"
         cls.process = popen_launch_server(
             cls.model, cls.base_url, timeout=300, api_key=cls.api_key
......
@@ -13,7 +13,7 @@ class TestSRTEndpoint(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = f"http://localhost:{8157}"
+        cls.base_url = "http://127.0.0.1:8157"
         cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300)
 
     @classmethod
......
@@ -11,7 +11,7 @@ class TestAccuracy(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = f"http://localhost:8157"
+        cls.base_url = "http://127.0.0.1:8157"
         cls.process = popen_launch_server(
             cls.model, cls.base_url, timeout=300, other_args=["--enable-torch-compile"]
         )
......
@@ -13,7 +13,7 @@ class TestOpenAIVisionServer(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = "liuhaotian/llava-v1.6-vicuna-7b"
-        cls.base_url = "http://localhost:8157"
+        cls.base_url = "http://127.0.0.1:8157"
         cls.api_key = "sk-123456"
         cls.process = popen_launch_server(
             cls.model,
......