Commit 705f6a35 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.5.2' into v0.5.2-dtk24.04.1

parents af837396 4cf256ae
import requests
from PIL import Image
from vllm import LLM, SamplingParams
def run_fuyu():
llm = LLM(model="adept/fuyu-8b", max_model_len=4096)
# single-image prompt
prompt = "What is the highest life expectancy at of male?\n"
url = "https://huggingface.co/adept/fuyu-8b/resolve/main/chart.png"
image = Image.open(requests.get(url, stream=True).raw)
sampling_params = SamplingParams(temperature=0, max_tokens=64)
outputs = llm.generate(
{
"prompt": prompt,
"multi_modal_data": {
"image": image
},
},
sampling_params=sampling_params)
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
if __name__ == "__main__":
run_fuyu()
import argparse
import os
import subprocess
import torch
from PIL import Image
from vllm import LLM
from vllm.multimodal.image import ImageFeatureData, ImagePixelData
# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
# You can use `.buildkite/download-images.sh` to download them
def run_llava_pixel_values(*, disable_image_processor: bool = False):
llm = LLM(
model="llava-hf/llava-1.5-7b-hf",
image_input_type="pixel_values",
image_token_id=32000,
image_input_shape="1,3,336,336",
image_feature_size=576,
disable_image_processor=disable_image_processor,
)
def run_llava():
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
prompt = "<image>" * 576 + (
"\nUSER: What is the content of this image?\nASSISTANT:")
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
if disable_image_processor:
image = torch.load("images/stop_sign_pixel_values.pt")
else:
image = Image.open("images/stop_sign.jpg")
image = Image.open("images/stop_sign.jpg")
outputs = llm.generate({
"prompt": prompt,
"multi_modal_data": ImagePixelData(image),
"multi_modal_data": {
"image": image
},
})
for o in outputs:
......@@ -40,45 +28,11 @@ def run_llava_pixel_values(*, disable_image_processor: bool = False):
print(generated_text)
def run_llava_image_features():
llm = LLM(
model="llava-hf/llava-1.5-7b-hf",
image_input_type="image_features",
image_token_id=32000,
image_input_shape="1,576,1024",
image_feature_size=576,
)
prompt = "<image>" * 576 + (
"\nUSER: What is the content of this image?\nASSISTANT:")
image: torch.Tensor = torch.load("images/stop_sign_image_features.pt")
outputs = llm.generate({
"prompt": prompt,
"multi_modal_data": ImageFeatureData(image),
})
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
def main(args):
if args.type == "pixel_values":
run_llava_pixel_values()
else:
run_llava_image_features()
def main():
run_llava()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Demo on Llava")
parser.add_argument("--type",
type=str,
choices=["pixel_values", "image_features"],
default="pixel_values",
help="image input type")
args = parser.parse_args()
# Download from s3
s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/"
local_directory = "images"
......@@ -95,4 +49,4 @@ if __name__ == "__main__":
local_directory,
"--no-sign-request",
])
main(args)
main()
from io import BytesIO
import requests
from PIL import Image
from vllm import LLM, SamplingParams
def run_llava_next():
llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=4096)
prompt = "[INST] <image>\nWhat is shown in this image? [/INST]"
url = "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg"
image = Image.open(BytesIO(requests.get(url).content))
sampling_params = SamplingParams(temperature=0.8,
top_p=0.95,
max_tokens=100)
outputs = llm.generate(
{
"prompt": prompt,
"multi_modal_data": {
"image": image
}
},
sampling_params=sampling_params)
generated_text = ""
for o in outputs:
generated_text += o.outputs[0].text
print(f"LLM output:{generated_text}")
if __name__ == "__main__":
run_llava_next()
......@@ -2,6 +2,7 @@ import argparse
from typing import List, Tuple
from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.utils import FlexibleArgumentParser
def create_test_prompts() -> List[Tuple[str, SamplingParams]]:
......@@ -55,7 +56,7 @@ def main(args: argparse.Namespace):
if __name__ == '__main__':
parser = argparse.ArgumentParser(
parser = FlexibleArgumentParser(
description='Demo on using the LLMEngine class directly')
parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()
......
......@@ -5,7 +5,7 @@ distributively on a multi-nodes cluster.
Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html
"""
from typing import Dict
from typing import Any, Dict, List
import numpy as np
import ray
......@@ -40,8 +40,8 @@ class LLMPredictor:
# The output is a list of RequestOutput objects that contain the prompt,
# generated text, and other information.
outputs = self.llm.generate(batch["text"], sampling_params)
prompt = []
generated_text = []
prompt: List[str] = []
generated_text: List[str] = []
for output in outputs:
prompt.append(output.prompt)
generated_text.append(' '.join([o.text for o in output.outputs]))
......@@ -71,7 +71,7 @@ def scheduling_strategy_fn():
pg, placement_group_capture_child_tasks=True))
resources_kwarg = {}
resources_kwarg: Dict[str, Any] = {}
if tensor_parallel_size == 1:
# For tensor_parallel_size == 1, we simply set num_gpus=1.
resources_kwarg["num_gpus"] = 1
......
import gc
import time
from typing import List
from vllm import LLM, SamplingParams
def time_generation(llm: LLM, prompts: List[str],
sampling_params: SamplingParams):
# Generate texts from the prompts. The output is a list of RequestOutput
# objects that contain the prompt, generated text, and other information.
# Warmup first
llm.generate(prompts, sampling_params)
llm.generate(prompts, sampling_params)
start = time.time()
outputs = llm.generate(prompts, sampling_params)
end = time.time()
print((end - start) / sum([len(o.outputs[0].token_ids) for o in outputs]))
# Print the outputs.
for output in outputs:
generated_text = output.outputs[0].text
print(f"text: {generated_text!r}")
if __name__ == "__main__":
template = (
"Below is an instruction that describes a task. Write a response "
"that appropriately completes the request.\n\n### Instruction:\n{}"
"\n\n### Response:\n")
# Sample prompts.
prompts = [
"Write about the president of the United States.",
]
prompts = [template.format(prompt) for prompt in prompts]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.0, max_tokens=200)
# Create an LLM without spec decoding
llm = LLM(model="meta-llama/Llama-2-13b-chat-hf")
print("Without speculation")
time_generation(llm, prompts, sampling_params)
del llm
gc.collect()
# Create an LLM with spec decoding
llm = LLM(
model="meta-llama/Llama-2-13b-chat-hf",
speculative_model="ibm-fms/llama-13b-accelerator",
# These are currently required for MLPSpeculator decoding
use_v2_block_manager=True,
)
print("With speculation")
time_generation(llm, prompts, sampling_params)
File mode changed from 100755 to 100644
"""An example showing how to use vLLM to serve VLMs.
Launch the vLLM server with the following command:
python -m vllm.entrypoints.openai.api_server \
--model llava-hf/llava-1.5-7b-hf \
--chat-template template_llava.jinja
"""
import base64
import requests
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
# Use image url in the payload
chat_completion_from_url = client.chat.completions.create(
messages=[{
"role":
"user",
"content": [
{
"type": "text",
"text": "What’s in this image?"
},
{
"type": "image_url",
"image_url": {
"url": image_url
},
},
],
}],
model=model,
)
result = chat_completion_from_url.choices[0].message.content
print(f"Chat completion output:{result}")
# Use base64 encoded image in the payload
def encode_image_base64_from_url(image_url: str) -> str:
"""Encode an image retrieved from a remote url to base64 format."""
with requests.get(image_url) as response:
response.raise_for_status()
result = base64.b64encode(response.content).decode('utf-8')
return result
image_base64 = encode_image_base64_from_url(image_url=image_url)
chat_completion_from_base64 = client.chat.completions.create(
messages=[{
"role":
"user",
"content": [
{
"type": "text",
"text": "What’s in this image?"
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{image_base64}"
},
},
],
}],
model=model,
)
result = chat_completion_from_base64.choices[0].message.content
print(f"Chat completion output:{result}")
import os
import subprocess
from PIL import Image
from vllm import LLM
# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
# You can use `.buildkite/download-images.sh` to download them
def run_paligemma():
llm = LLM(model="google/paligemma-3b-mix-224")
prompt = "caption es"
image = Image.open("images/stop_sign.jpg")
outputs = llm.generate({
"prompt": prompt,
"multi_modal_data": {
"image": image
},
})
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
def main():
run_paligemma()
if __name__ == "__main__":
# Download from s3
s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/"
local_directory = "images"
# Make sure the local directory exists or create it
os.makedirs(local_directory, exist_ok=True)
# Use AWS CLI to sync the directory, assume anonymous access
subprocess.check_call([
"aws",
"s3",
"sync",
s3_bucket_path,
local_directory,
"--no-sign-request",
])
main()
import os
import subprocess
from PIL import Image
from vllm import LLM, SamplingParams
# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
# You can use `.buildkite/download-images.sh` to download them
def run_phi3v():
model_path = "microsoft/Phi-3-vision-128k-instruct"
# Note: The default setting of max_num_seqs (256) and
# max_model_len (128k) for this model may cause OOM.
# You may lower either to run this example on lower-end GPUs.
# In this example, we override max_num_seqs to 5 while
# keeping the original context length of 128k.
llm = LLM(
model=model_path,
trust_remote_code=True,
max_num_seqs=5,
)
image = Image.open("images/cherry_blossom.jpg")
# single-image prompt
prompt = "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n" # noqa: E501
sampling_params = SamplingParams(temperature=0, max_tokens=64)
outputs = llm.generate(
{
"prompt": prompt,
"multi_modal_data": {
"image": image
},
},
sampling_params=sampling_params)
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
if __name__ == "__main__":
s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/"
local_directory = "images"
# Make sure the local directory exists or create it
os.makedirs(local_directory, exist_ok=True)
# Use AWS CLI to sync the directory, assume anonymous access
subprocess.check_call([
"aws",
"s3",
"sync",
s3_bucket_path,
local_directory,
"--no-sign-request",
])
run_phi3v()
# Setup OpenTelemetry POC
1. Install OpenTelemetry packages:
```
pip install \
opentelemetry-sdk \
opentelemetry-api \
opentelemetry-exporter-otlp \
opentelemetry-semantic-conventions-ai
```
1. Start Jaeger in a docker container:
```
# From: https://www.jaegertracing.io/docs/1.57/getting-started/
docker run --rm --name jaeger \
-e COLLECTOR_ZIPKIN_HOST_PORT=:9411 \
-p 6831:6831/udp \
-p 6832:6832/udp \
-p 5778:5778 \
-p 16686:16686 \
-p 4317:4317 \
-p 4318:4318 \
-p 14250:14250 \
-p 14268:14268 \
-p 14269:14269 \
-p 9411:9411 \
jaegertracing/all-in-one:1.57
```
1. In a new shell, export Jaeger IP:
```
export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger)
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
```
Then set vLLM's service name for OpenTelemetry, enable insecure connections to Jaeger and run vLLM:
```
export OTEL_SERVICE_NAME="vllm-server"
export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
```
1. In a new shell, send requests with trace context from a dummy client
```
export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger)
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
export OTEL_SERVICE_NAME="client-service"
python dummy_client.py
```
1. Open Jaeger webui: http://localhost:16686/
In the search pane, select `vllm-server` service and hit `Find Traces`. You should get a list of traces, one for each request.
![Traces](https://i.imgur.com/GYHhFjo.png)
1. Clicking on a trace will show its spans and their tags. In this demo, each trace has 2 spans. One from the dummy client containing the prompt text and one from vLLM containing metadata about the request.
![Spans details](https://i.imgur.com/OPf6CBL.png)
## Exporter Protocol
OpenTelemetry supports either `grpc` or `http/protobuf` as the transport protocol for trace data in the exporter.
By default, `grpc` is used. To set `http/protobuf` as the protocol, configure the `OTEL_EXPORTER_OTLP_TRACES_PROTOCOL` environment variable as follows:
```
export OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=http/protobuf
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
```
## Instrumentation of FastAPI
OpenTelemetry allows automatic instrumentation of FastAPI.
1. Install the instrumentation library
```
pip install opentelemetry-instrumentation-fastapi
```
1. Run vLLM with `opentelemetry-instrument`
```
opentelemetry-instrument python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m"
```
1. Send a request to vLLM and find its trace in Jaeger. It should contain spans from FastAPI.
![FastAPI Spans](https://i.imgur.com/hywvoOJ.png)
\ No newline at end of file
import requests
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
OTLPSpanExporter)
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import (BatchSpanProcessor,
ConsoleSpanExporter)
from opentelemetry.trace import SpanKind, set_tracer_provider
from opentelemetry.trace.propagation.tracecontext import (
TraceContextTextMapPropagator)
trace_provider = TracerProvider()
set_tracer_provider(trace_provider)
trace_provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter()))
trace_provider.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))
tracer = trace_provider.get_tracer("dummy-client")
url = "http://localhost:8000/v1/completions"
with tracer.start_as_current_span("client-span", kind=SpanKind.CLIENT) as span:
prompt = "San Francisco is a"
span.set_attribute("prompt", prompt)
headers = {}
TraceContextTextMapPropagator().inject(headers)
payload = {
"model": "facebook/opt-125m",
"prompt": prompt,
"max_tokens": 10,
"best_of": 20,
"n": 3,
"use_beam_search": "true",
"temperature": 0.0,
# "stream": True,
}
response = requests.post(url, headers=headers, json=payload)
{
"__inputs": [
{
"name": "DS_PROMETHEUS",
"label": "prometheus",
"description": "",
"type": "datasource",
"pluginId": "prometheus",
"pluginName": "Prometheus"
}
],
"__elements": {},
"__requires": [
......@@ -1215,11 +1207,21 @@
"templating": {
"list": [
{
"type": "datasource",
"name": "DS_PROMETHEUS",
"label": "datasource",
"current": {},
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"hide": 0,
"includeAll": false,
"multi": false,
"options": [],
"query": "prometheus",
"queryValue": "",
"refresh": 1,
"regex": "",
"skipUrlSync": false
},
{
"definition": "label_values(model_name)",
"hide": 0,
"includeAll": false,
......@@ -1250,3 +1252,4 @@
"version": 1,
"weekStart": ""
}
......@@ -20,15 +20,15 @@ llm = LLM(
tensor_parallel_size=8,
)
"""
import argparse
import dataclasses
import os
import shutil
from pathlib import Path
from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
parser = argparse.ArgumentParser()
parser = FlexibleArgumentParser()
EngineArgs.add_cli_args(parser)
parser.add_argument("--output",
"-o",
......
......@@ -3,18 +3,13 @@ import dataclasses
import json
import os
import uuid
from functools import partial
from tensorizer import stream_io
from vllm import LLM
from vllm.distributed import (init_distributed_environment,
initialize_model_parallel)
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.model_executor.model_loader.tensorizer import (TensorizerArgs,
TensorizerConfig,
serialize_vllm_model)
tensorize_vllm_model)
from vllm.utils import FlexibleArgumentParser
# yapf conflicts with isort for this docstring
# yapf: disable
......@@ -61,6 +56,12 @@ Which downloads the model tensors from your S3 bucket and deserializes them.
You can also provide a `--keyfile` argument to decrypt the model weights if
they were serialized with encryption.
To support distributed tensor-parallel models, each model shard will be
serialized to a separate file. The tensorizer_uri is then specified as a string
template with a format specifier such as '%03d' that will be rendered with the
shard's rank. Sharded models serialized with this script will be named as
model-rank-%03d.tensors
For more information on the available arguments for serializing, run
`python -m examples.tensorize_vllm_model serialize --help`.
......@@ -96,7 +97,7 @@ deserialization in this example script, although `--tensorizer-uri` and
def parse_args():
parser = argparse.ArgumentParser(
parser = FlexibleArgumentParser(
description="An example script that can be used to serialize and "
"deserialize vLLM models. These models "
"can be loaded using tensorizer directly to the GPU "
......@@ -168,77 +169,72 @@ def parse_args():
def deserialize():
llm = LLM(model=args.model,
load_format="tensorizer",
tensor_parallel_size=args.tensor_parallel_size,
model_loader_extra_config=tensorizer_config
)
return llm
if __name__ == '__main__':
args = parse_args()
args = parse_args()
s3_access_key_id = (getattr(args, 's3_access_key_id', None)
or os.environ.get("S3_ACCESS_KEY_ID", None))
s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
or os.environ.get("S3_SECRET_ACCESS_KEY", None))
s3_endpoint = (getattr(args, 's3_endpoint', None)
or os.environ.get("S3_ENDPOINT_URL", None))
credentials = {
"s3_access_key_id": s3_access_key_id,
"s3_secret_access_key": s3_secret_access_key,
"s3_endpoint": s3_endpoint
}
s3_access_key_id = (getattr(args, 's3_access_key_id', None)
or os.environ.get("S3_ACCESS_KEY_ID", None))
s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
or os.environ.get("S3_SECRET_ACCESS_KEY", None))
s3_endpoint = (getattr(args, 's3_endpoint', None)
or os.environ.get("S3_ENDPOINT_URL", None))
_read_stream, _write_stream = (partial(
stream_io.open_stream,
mode=mode,
s3_access_key_id=s3_access_key_id,
s3_secret_access_key=s3_secret_access_key,
s3_endpoint=s3_endpoint,
) for mode in ("rb", "wb+"))
credentials = {
"s3_access_key_id": s3_access_key_id,
"s3_secret_access_key": s3_secret_access_key,
"s3_endpoint": s3_endpoint
}
model_ref = args.model
model_ref = args.model
model_name = model_ref.split("/")[1]
model_name = model_ref.split("/")[1]
os.environ["MASTER_ADDR"] = "127.0.0.1"
os.environ["MASTER_PORT"] = "8080"
keyfile = args.keyfile if args.keyfile else None
init_distributed_environment(world_size=1, rank=0, local_rank=0)
initialize_model_parallel()
if args.model_loader_extra_config:
config = json.loads(args.model_loader_extra_config)
tensorizer_args = \
TensorizerConfig(**config)._construct_tensorizer_args()
tensorizer_args.tensorizer_uri = args.path_to_tensors
else:
tensorizer_args = None
keyfile = args.keyfile if args.keyfile else None
if args.command == "serialize":
eng_args_dict = {f.name: getattr(args, f.name) for f in
dataclasses.fields(EngineArgs)}
engine_args = EngineArgs.from_cli_args(
argparse.Namespace(**eng_args_dict)
)
if args.model_loader_extra_config:
config = json.loads(args.model_loader_extra_config)
tensorizer_args = TensorizerConfig(**config)._construct_tensorizer_args()
tensorizer_args.tensorizer_uri = args.path_to_tensors
else:
tensorizer_args = None
if args.command == "serialize":
eng_args_dict = {f.name: getattr(args, f.name) for f in
dataclasses.fields(EngineArgs)}
engine_args = EngineArgs.from_cli_args(argparse.Namespace(**eng_args_dict))
engine = LLMEngine.from_engine_args(engine_args)
input_dir = args.serialized_directory.rstrip('/')
suffix = args.suffix if args.suffix else uuid.uuid4().hex
base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
if engine_args.tensor_parallel_size > 1:
model_path = f"{base_path}/model-rank-%03d.tensors"
else:
model_path = f"{base_path}/model.tensors"
input_dir = args.serialized_directory.rstrip('/')
suffix = args.suffix if args.suffix else uuid.uuid4().hex
base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
model_path = f"{base_path}/model.tensors"
tensorizer_config = TensorizerConfig(
tensorizer_uri=model_path,
**credentials)
serialize_vllm_model(engine, tensorizer_config, keyfile)
elif args.command == "deserialize":
if not tensorizer_args:
tensorizer_config = TensorizerConfig(
tensorizer_uri=args.path_to_tensors,
encryption_keyfile = keyfile,
**credentials
)
deserialize()
else:
raise ValueError("Either serialize or deserialize must be specified.")
tensorizer_uri=model_path,
encryption_keyfile=keyfile,
**credentials)
tensorize_vllm_model(engine_args, tensorizer_config)
elif args.command == "deserialize":
if not tensorizer_args:
tensorizer_config = TensorizerConfig(
tensorizer_uri=args.path_to_tensors,
encryption_keyfile = keyfile,
**credentials
)
deserialize()
else:
raise ValueError("Either serialize or deserialize must be specified.")
......@@ -36,12 +36,12 @@ tool_version_check() {
fi
}
tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-dev.txt | cut -d'=' -f3)"
tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-dev.txt | cut -d'=' -f3)"
tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-dev.txt | cut -d'=' -f3)"
tool_version_check "isort" "$ISORT_VERSION" "$(grep isort requirements-dev.txt | cut -d'=' -f3)"
tool_version_check "codespell" "$CODESPELL_VERSION" "$(grep codespell requirements-dev.txt | cut -d'=' -f3)"
tool_version_check "clang-format" "$CLANGFORMAT_VERSION" "$(grep clang-format requirements-dev.txt | cut -d'=' -f3)"
tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-lint.txt | cut -d'=' -f3)"
tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-lint.txt | cut -d'=' -f3)"
tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-lint.txt | cut -d'=' -f3)"
tool_version_check "isort" "$ISORT_VERSION" "$(grep isort requirements-lint.txt | cut -d'=' -f3)"
tool_version_check "codespell" "$CODESPELL_VERSION" "$(grep codespell requirements-lint.txt | cut -d'=' -f3)"
tool_version_check "clang-format" "$CLANGFORMAT_VERSION" "$(grep clang-format requirements-lint.txt | cut -d'=' -f3)"
YAPF_FLAGS=(
'--recursive'
......@@ -96,22 +96,23 @@ echo 'vLLM yapf: Done'
# Run mypy
echo 'vLLM mypy:'
mypy tests --config-file pyproject.toml
mypy vllm/*.py --config-file pyproject.toml
mypy vllm/attention --config-file pyproject.toml
mypy vllm/core --config-file pyproject.toml
mypy vllm/distributed --config-file pyproject.toml
mypy vllm/engine --config-file pyproject.toml
mypy vllm/entrypoints --config-file pyproject.toml
mypy vllm/executor --config-file pyproject.toml
mypy vllm/logging --config-file pyproject.toml
mypy vllm/lora --config-file pyproject.toml
mypy vllm/model_executor --config-file pyproject.toml
mypy vllm/multimodal --config-file pyproject.toml
mypy vllm/usage --config-file pyproject.toml
mypy vllm/*.py --config-file pyproject.toml
mypy vllm/prompt_adapter --config-file pyproject.toml
mypy vllm/spec_decode --config-file pyproject.toml
mypy vllm/transformers_utils --config-file pyproject.toml
mypy vllm/engine --config-file pyproject.toml
mypy vllm/usage --config-file pyproject.toml
mypy vllm/worker --config-file pyproject.toml
mypy vllm/spec_decode --config-file pyproject.toml
mypy vllm/model_executor --config-file pyproject.toml
mypy vllm/lora --config-file pyproject.toml
mypy vllm/logging --config-file pyproject.toml
mypy vllm/model_executor --config-file pyproject.toml
# If git diff returns a file that is in the skip list, the file may be checked anyway:
......
......@@ -5,7 +5,7 @@ requires = [
"ninja",
"packaging",
"setuptools >= 49.4.0",
"torch == 2.3.0",
"torch == 2.3.1",
"wheel",
]
build-backend = "setuptools.build_meta"
......@@ -69,7 +69,5 @@ skip_gitignore = true
[tool.pytest.ini_options]
markers = [
"skip_global_cleanup",
"llm: run tests for vLLM API only",
"openai: run tests for OpenAI API only",
"llava: run tests for LLaVA models only",
"vlm: run tests for vision language models only",
]
......@@ -3,5 +3,5 @@ cmake>=3.21
ninja
packaging
setuptools>=49.4.0
torch==2.3.0
torch==2.3.1
wheel
......@@ -2,10 +2,11 @@ cmake >= 3.21
ninja # For faster builds.
psutil
sentencepiece # Required for LLaMA tokenizer.
numpy
numpy < 2.0.0
requests
tqdm
py-cpuinfo
transformers >= 4.40.0 # Required for StarCoder2 & Llava, Llama 3.
transformers >= 4.42.4 # Required for Gemma 2 and for additional chat template parameters.
tokenizers >= 0.19.1 # Required for Llama 3.
fastapi
aiohttp
......@@ -16,7 +17,8 @@ pillow # Required for image processing
prometheus_client >= 0.18.0
prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer == 0.10.1
outlines >= 0.0.43 # Requires torch >= 2.1.0
lm-format-enforcer == 0.10.3
outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
typing_extensions
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
pyzmq
......@@ -2,5 +2,6 @@
-r requirements-common.txt
# Dependencies for x86_64 CPUs
torch == 2.3.0+cpu
triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error.
\ No newline at end of file
torch == 2.3.1+cpu; platform_machine != "ppc64le"
torchvision == 0.18.1+cpu; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch
triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment