Commit 705f6a35 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.5.2' into v0.5.2-dtk24.04.1

parents af837396 4cf256ae
import requests
from PIL import Image
from vllm import LLM, SamplingParams
def run_fuyu():
    """Ask Fuyu-8B a question about a remote chart image and print the reply.

    The chart is downloaded over HTTP, decoded with PIL and passed to
    ``LLM.generate`` as the ``"image"`` entry of ``multi_modal_data``.

    Raises:
        requests.HTTPError: if the image download returns an error status.
    """
    # Function-scope import so this block does not depend on a module-level
    # import that the example did not originally have.
    from io import BytesIO

    llm = LLM(model="adept/fuyu-8b", max_model_len=4096)

    # single-image prompt
    prompt = "What is the highest life expectancy at of male?\n"
    url = "https://huggingface.co/adept/fuyu-8b/resolve/main/chart.png"

    # Check the HTTP status before decoding: otherwise an error page would be
    # handed to PIL and surface as a confusing "cannot identify image" error.
    # Reading `.content` (instead of the undecoded `.raw` stream) also lets
    # requests undo any transfer/content encoding applied by the server.
    response = requests.get(url)
    response.raise_for_status()
    image = Image.open(BytesIO(response.content))

    sampling_params = SamplingParams(temperature=0, max_tokens=64)

    outputs = llm.generate(
        {
            "prompt": prompt,
            "multi_modal_data": {
                "image": image
            },
        },
        sampling_params=sampling_params)

    # Print the first completion of every returned request output.
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
if __name__ == "__main__":
run_fuyu()
import argparse
import os import os
import subprocess import subprocess
import torch
from PIL import Image from PIL import Image
from vllm import LLM from vllm import LLM
from vllm.multimodal.image import ImageFeatureData, ImagePixelData
# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`. # The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
# You can use `.buildkite/download-images.sh` to download them # You can use `.buildkite/download-images.sh` to download them
def run_llava_pixel_values(*, disable_image_processor: bool = False): def run_llava():
llm = LLM( llm = LLM(model="llava-hf/llava-1.5-7b-hf")
model="llava-hf/llava-1.5-7b-hf",
image_input_type="pixel_values",
image_token_id=32000,
image_input_shape="1,3,336,336",
image_feature_size=576,
disable_image_processor=disable_image_processor,
)
prompt = "<image>" * 576 + ( prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
"\nUSER: What is the content of this image?\nASSISTANT:")
if disable_image_processor: image = Image.open("images/stop_sign.jpg")
image = torch.load("images/stop_sign_pixel_values.pt")
else:
image = Image.open("images/stop_sign.jpg")
outputs = llm.generate({ outputs = llm.generate({
"prompt": prompt, "prompt": prompt,
"multi_modal_data": ImagePixelData(image), "multi_modal_data": {
"image": image
},
}) })
for o in outputs: for o in outputs:
...@@ -40,45 +28,11 @@ def run_llava_pixel_values(*, disable_image_processor: bool = False): ...@@ -40,45 +28,11 @@ def run_llava_pixel_values(*, disable_image_processor: bool = False):
print(generated_text) print(generated_text)
def run_llava_image_features(): def main():
llm = LLM( run_llava()
model="llava-hf/llava-1.5-7b-hf",
image_input_type="image_features",
image_token_id=32000,
image_input_shape="1,576,1024",
image_feature_size=576,
)
prompt = "<image>" * 576 + (
"\nUSER: What is the content of this image?\nASSISTANT:")
image: torch.Tensor = torch.load("images/stop_sign_image_features.pt")
outputs = llm.generate({
"prompt": prompt,
"multi_modal_data": ImageFeatureData(image),
})
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
def main(args):
if args.type == "pixel_values":
run_llava_pixel_values()
else:
run_llava_image_features()
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Demo on Llava")
parser.add_argument("--type",
type=str,
choices=["pixel_values", "image_features"],
default="pixel_values",
help="image input type")
args = parser.parse_args()
# Download from s3 # Download from s3
s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/" s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/"
local_directory = "images" local_directory = "images"
...@@ -95,4 +49,4 @@ if __name__ == "__main__": ...@@ -95,4 +49,4 @@ if __name__ == "__main__":
local_directory, local_directory,
"--no-sign-request", "--no-sign-request",
]) ])
main(args) main()
from io import BytesIO
import requests
from PIL import Image
from vllm import LLM, SamplingParams
def run_llava_next():
    """Describe a remote image with LLaVA-NeXT (Mistral-7B) and print the text.

    The image is downloaded over HTTP, decoded with PIL and passed to
    ``LLM.generate`` as the ``"image"`` entry of ``multi_modal_data``.

    Raises:
        requests.HTTPError: if the image download returns an error status.
    """
    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=4096)

    prompt = "[INST] <image>\nWhat is shown in this image? [/INST]"
    url = "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg"

    # Check the HTTP status before decoding so a failed download is reported
    # as an HTTP error rather than as an undecodable image.
    response = requests.get(url)
    response.raise_for_status()
    image = Image.open(BytesIO(response.content))

    sampling_params = SamplingParams(temperature=0.8,
                                     top_p=0.95,
                                     max_tokens=100)

    outputs = llm.generate(
        {
            "prompt": prompt,
            "multi_modal_data": {
                "image": image
            }
        },
        sampling_params=sampling_params)

    # Concatenate the first completion of every request output in one pass.
    generated_text = "".join(o.outputs[0].text for o in outputs)

    print(f"LLM output:{generated_text}")
if __name__ == "__main__":
run_llava_next()
...@@ -2,6 +2,7 @@ import argparse ...@@ -2,6 +2,7 @@ import argparse
from typing import List, Tuple from typing import List, Tuple
from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.utils import FlexibleArgumentParser
def create_test_prompts() -> List[Tuple[str, SamplingParams]]: def create_test_prompts() -> List[Tuple[str, SamplingParams]]:
...@@ -55,7 +56,7 @@ def main(args: argparse.Namespace): ...@@ -55,7 +56,7 @@ def main(args: argparse.Namespace):
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser( parser = FlexibleArgumentParser(
description='Demo on using the LLMEngine class directly') description='Demo on using the LLMEngine class directly')
parser = EngineArgs.add_cli_args(parser) parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args() args = parser.parse_args()
......
...@@ -5,7 +5,7 @@ distributively on a multi-nodes cluster. ...@@ -5,7 +5,7 @@ distributively on a multi-nodes cluster.
Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html
""" """
from typing import Dict from typing import Any, Dict, List
import numpy as np import numpy as np
import ray import ray
...@@ -40,8 +40,8 @@ class LLMPredictor: ...@@ -40,8 +40,8 @@ class LLMPredictor:
# The output is a list of RequestOutput objects that contain the prompt, # The output is a list of RequestOutput objects that contain the prompt,
# generated text, and other information. # generated text, and other information.
outputs = self.llm.generate(batch["text"], sampling_params) outputs = self.llm.generate(batch["text"], sampling_params)
prompt = [] prompt: List[str] = []
generated_text = [] generated_text: List[str] = []
for output in outputs: for output in outputs:
prompt.append(output.prompt) prompt.append(output.prompt)
generated_text.append(' '.join([o.text for o in output.outputs])) generated_text.append(' '.join([o.text for o in output.outputs]))
...@@ -71,7 +71,7 @@ def scheduling_strategy_fn(): ...@@ -71,7 +71,7 @@ def scheduling_strategy_fn():
pg, placement_group_capture_child_tasks=True)) pg, placement_group_capture_child_tasks=True))
resources_kwarg = {} resources_kwarg: Dict[str, Any] = {}
if tensor_parallel_size == 1: if tensor_parallel_size == 1:
# For tensor_parallel_size == 1, we simply set num_gpus=1. # For tensor_parallel_size == 1, we simply set num_gpus=1.
resources_kwarg["num_gpus"] = 1 resources_kwarg["num_gpus"] = 1
......
import gc
import time
from typing import List
from vllm import LLM, SamplingParams
def time_generation(llm: LLM, prompts: List[str],
                    sampling_params: SamplingParams):
    """Time one ``llm.generate`` call and print the seconds spent per token.

    Two untimed warmup calls are made first. The timed call's duration is
    divided by the total number of token ids in the first completion of each
    output. The generated texts are printed afterwards.

    Args:
        llm: Engine whose ``generate(prompts, sampling_params)`` is timed.
        prompts: Prompts passed unchanged to every ``generate`` call.
        sampling_params: Sampling settings passed to every ``generate`` call.
    """
    # Generate texts from the prompts. The output is a list of RequestOutput
    # objects that contain the prompt, generated text, and other information.
    # Warmup first
    llm.generate(prompts, sampling_params)
    llm.generate(prompts, sampling_params)

    # perf_counter is monotonic and high-resolution, so the measured interval
    # cannot be skewed by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    outputs = llm.generate(prompts, sampling_params)
    elapsed = time.perf_counter() - start

    num_tokens = sum(len(o.outputs[0].token_ids) for o in outputs)
    if num_tokens:
        print(elapsed / num_tokens)
    else:
        # Nothing was generated (e.g. empty prompt list); avoid dividing by 0.
        print("No tokens were generated; per-token latency is undefined.")

    # Print the outputs.
    for output in outputs:
        generated_text = output.outputs[0].text
        print(f"text: {generated_text!r}")
if __name__ == "__main__":
    # Instruction-style wrapper applied to every raw prompt below.
    template = (
        "Below is an instruction that describes a task. Write a response "
        "that appropriately completes the request.\n\n### Instruction:\n{}"
        "\n\n### Response:\n")

    # Sample prompts, rendered through the template.
    prompts = [
        "Write about the president of the United States.",
    ]
    prompts = list(map(template.format, prompts))

    # Sampling settings shared by both runs.
    sampling_params = SamplingParams(temperature=0.0, max_tokens=200)

    # First run: plain model, no speculative decoding.
    llm = LLM(model="meta-llama/Llama-2-13b-chat-hf")
    print("Without speculation")
    time_generation(llm, prompts, sampling_params)

    # Drop the first engine and collect garbage before building the second.
    del llm
    gc.collect()

    # Second run: same model with a speculative model attached.
    llm = LLM(
        model="meta-llama/Llama-2-13b-chat-hf",
        speculative_model="ibm-fms/llama-13b-accelerator",
        # These are currently required for MLPSpeculator decoding
        use_v2_block_manager=True,
    )
    print("With speculation")
    time_generation(llm, prompts, sampling_params)
File mode changed from 100755 to 100644
"""An example showing how to use vLLM to serve VLMs.
Launch the vLLM server with the following command:
python -m vllm.entrypoints.openai.api_server \
--model llava-hf/llava-1.5-7b-hf \
--chat-template template_llava.jinja
"""
import base64
import requests
from openai import OpenAI
# Point the OpenAI client at vLLM's API server instead of OpenAI itself.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# Use the first model the server lists.
models = client.models.list()
model = models.data[0].id

image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"

# Use image url in the payload: one user message with a text part followed by
# an image part that references the remote URL directly.
url_text_part = {"type": "text", "text": "What’s in this image?"}
url_image_part = {"type": "image_url", "image_url": {"url": image_url}}
chat_completion_from_url = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [url_text_part, url_image_part],
    }],
    model=model,
)

result = chat_completion_from_url.choices[0].message.content
print(f"Chat completion output:{result}")
# Use base64 encoded image in the payload
def encode_image_base64_from_url(image_url: str) -> str:
    """Download ``image_url`` and return its body as base64 text.

    Raises:
        requests.HTTPError: if the download returns an error status.
    """
    with requests.get(image_url) as response:
        response.raise_for_status()
        encoded_bytes = base64.b64encode(response.content)
        return encoded_bytes.decode('utf-8')
image_base64 = encode_image_base64_from_url(image_url=image_url)

# Same question as before, but the image travels inline as a base64 data URL.
base64_text_part = {"type": "text", "text": "What’s in this image?"}
base64_image_part = {
    "type": "image_url",
    "image_url": {
        "url": f"data:image/jpeg;base64,{image_base64}"
    },
}
chat_completion_from_base64 = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [base64_text_part, base64_image_part],
    }],
    model=model,
)

result = chat_completion_from_base64.choices[0].message.content
print(f"Chat completion output:{result}")
import os
import subprocess
from PIL import Image
from vllm import LLM
# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
# You can use `.buildkite/download-images.sh` to download them
def run_paligemma():
    """Run PaliGemma on the local stop-sign image and print each completion.

    Expects ``images/stop_sign.jpg`` to exist relative to the working
    directory.
    """
    llm = LLM(model="google/paligemma-3b-mix-224")

    request = {
        "prompt": "caption es",
        "multi_modal_data": {
            "image": Image.open("images/stop_sign.jpg")
        },
    }

    # Print the first completion of every returned request output.
    for request_output in llm.generate(request):
        print(request_output.outputs[0].text)
def main():
run_paligemma()
if __name__ == "__main__":
    # Download from s3
    s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/"
    local_directory = "images"

    # Make sure the local directory exists or create it
    os.makedirs(local_directory, exist_ok=True)

    # Use AWS CLI to sync the directory, assume anonymous access.
    # check_call raises if the command exits with a non-zero status.
    sync_command = [
        "aws",
        "s3",
        "sync",
        s3_bucket_path,
        local_directory,
        "--no-sign-request",
    ]
    subprocess.check_call(sync_command)

    main()
import os
import subprocess
from PIL import Image
from vllm import LLM, SamplingParams
# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
# You can use `.buildkite/download-images.sh` to download them
def run_phi3v():
    """Ask Phi-3-vision about the local cherry-blossom image and print replies.

    Expects ``images/cherry_blossom.jpg`` to exist relative to the working
    directory.
    """
    # Note: The default setting of max_num_seqs (256) and
    # max_model_len (128k) for this model may cause OOM.
    # You may lower either to run this example on lower-end GPUs.

    # In this example, we override max_num_seqs to 5 while
    # keeping the original context length of 128k.
    llm = LLM(
        model="microsoft/Phi-3-vision-128k-instruct",
        trust_remote_code=True,
        max_num_seqs=5,
    )

    # single-image prompt paired with its image
    request = {
        "prompt": "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n",  # noqa: E501
        "multi_modal_data": {
            "image": Image.open("images/cherry_blossom.jpg")
        },
    }

    outputs = llm.generate(
        request,
        sampling_params=SamplingParams(temperature=0, max_tokens=64))

    # Print the first completion of every returned request output.
    for request_output in outputs:
        print(request_output.outputs[0].text)
if __name__ == "__main__":
    s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/"
    local_directory = "images"

    # Make sure the local directory exists or create it
    os.makedirs(local_directory, exist_ok=True)

    # Use AWS CLI to sync the directory, assume anonymous access.
    # check_call raises if the command exits with a non-zero status.
    sync_command = [
        "aws",
        "s3",
        "sync",
        s3_bucket_path,
        local_directory,
        "--no-sign-request",
    ]
    subprocess.check_call(sync_command)

    run_phi3v()
# Setup OpenTelemetry POC
1. Install OpenTelemetry packages:
```
pip install \
opentelemetry-sdk \
opentelemetry-api \
opentelemetry-exporter-otlp \
opentelemetry-semantic-conventions-ai
```
1. Start Jaeger in a docker container:
```
# From: https://www.jaegertracing.io/docs/1.57/getting-started/
docker run --rm --name jaeger \
-e COLLECTOR_ZIPKIN_HOST_PORT=:9411 \
-p 6831:6831/udp \
-p 6832:6832/udp \
-p 5778:5778 \
-p 16686:16686 \
-p 4317:4317 \
-p 4318:4318 \
-p 14250:14250 \
-p 14268:14268 \
-p 14269:14269 \
-p 9411:9411 \
jaegertracing/all-in-one:1.57
```
1. In a new shell, export Jaeger IP:
```
export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger)
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
```
Then set vLLM's service name for OpenTelemetry, enable insecure connections to Jaeger and run vLLM:
```
export OTEL_SERVICE_NAME="vllm-server"
export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
```
1. In a new shell, send requests with trace context from a dummy client:
```
export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger)
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
export OTEL_SERVICE_NAME="client-service"
python dummy_client.py
```
1. Open Jaeger webui: http://localhost:16686/
In the search pane, select the `vllm-server` service and click `Find Traces`. You should get a list of traces, one for each request.
![Traces](https://i.imgur.com/GYHhFjo.png)
1. Clicking on a trace will show its spans and their tags. In this demo, each trace has 2 spans: one from the dummy client containing the prompt text, and one from vLLM containing metadata about the request.
![Spans details](https://i.imgur.com/OPf6CBL.png)
## Exporter Protocol
OpenTelemetry supports either `grpc` or `http/protobuf` as the transport protocol for trace data in the exporter.
By default, `grpc` is used. To set `http/protobuf` as the protocol, configure the `OTEL_EXPORTER_OTLP_TRACES_PROTOCOL` environment variable as follows:
```
export OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=http/protobuf
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
```
## Instrumentation of FastAPI
OpenTelemetry allows automatic instrumentation of FastAPI.
1. Install the instrumentation library
```
pip install opentelemetry-instrumentation-fastapi
```
1. Run vLLM with `opentelemetry-instrument`
```
opentelemetry-instrument python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m"
```
1. Send a request to vLLM and find its trace in Jaeger. It should contain spans from FastAPI.
![FastAPI Spans](https://i.imgur.com/hywvoOJ.png)
\ No newline at end of file
import requests
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
OTLPSpanExporter)
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import (BatchSpanProcessor,
ConsoleSpanExporter)
from opentelemetry.trace import SpanKind, set_tracer_provider
from opentelemetry.trace.propagation.tracecontext import (
TraceContextTextMapPropagator)
# Install a tracer provider whose spans are exported both through the OTLP
# gRPC exporter and to the console.
trace_provider = TracerProvider()
set_tracer_provider(trace_provider)

trace_provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter()))
trace_provider.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))

tracer = trace_provider.get_tracer("dummy-client")

url = "http://localhost:8000/v1/completions"
with tracer.start_as_current_span("client-span", kind=SpanKind.CLIENT) as span:
    prompt = "San Francisco is a"
    span.set_attribute("prompt", prompt)

    # Inject the current trace context into the outgoing request headers so
    # the receiving server can continue the same trace.
    headers = {}
    TraceContextTextMapPropagator().inject(headers)

    payload = {
        "model": "facebook/opt-125m",
        "prompt": prompt,
        "max_tokens": 10,
        "best_of": 20,
        "n": 3,
        # Send a real JSON boolean; the original sent the string "true",
        # which only works if the server coerces strings to booleans.
        "use_beam_search": True,
        "temperature": 0.0,
        # "stream": True,
    }
    response = requests.post(url, headers=headers, json=payload)
{ {
"__inputs": [ "__inputs": [
{
"name": "DS_PROMETHEUS",
"label": "prometheus",
"description": "",
"type": "datasource",
"pluginId": "prometheus",
"pluginName": "Prometheus"
}
], ],
"__elements": {}, "__elements": {},
"__requires": [ "__requires": [
...@@ -1215,11 +1207,21 @@ ...@@ -1215,11 +1207,21 @@
"templating": { "templating": {
"list": [ "list": [
{ {
"type": "datasource",
"name": "DS_PROMETHEUS",
"label": "datasource",
"current": {}, "current": {},
"datasource": { "hide": 0,
"type": "prometheus", "includeAll": false,
"uid": "${DS_PROMETHEUS}" "multi": false,
}, "options": [],
"query": "prometheus",
"queryValue": "",
"refresh": 1,
"regex": "",
"skipUrlSync": false
},
{
"definition": "label_values(model_name)", "definition": "label_values(model_name)",
"hide": 0, "hide": 0,
"includeAll": false, "includeAll": false,
...@@ -1250,3 +1252,4 @@ ...@@ -1250,3 +1252,4 @@
"version": 1, "version": 1,
"weekStart": "" "weekStart": ""
} }
...@@ -20,15 +20,15 @@ llm = LLM( ...@@ -20,15 +20,15 @@ llm = LLM(
tensor_parallel_size=8, tensor_parallel_size=8,
) )
""" """
import argparse
import dataclasses import dataclasses
import os import os
import shutil import shutil
from pathlib import Path from pathlib import Path
from vllm import LLM, EngineArgs from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
parser = argparse.ArgumentParser() parser = FlexibleArgumentParser()
EngineArgs.add_cli_args(parser) EngineArgs.add_cli_args(parser)
parser.add_argument("--output", parser.add_argument("--output",
"-o", "-o",
......
...@@ -3,18 +3,13 @@ import dataclasses ...@@ -3,18 +3,13 @@ import dataclasses
import json import json
import os import os
import uuid import uuid
from functools import partial
from tensorizer import stream_io
from vllm import LLM from vllm import LLM
from vllm.distributed import (init_distributed_environment,
initialize_model_parallel)
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.model_executor.model_loader.tensorizer import (TensorizerArgs, from vllm.model_executor.model_loader.tensorizer import (TensorizerArgs,
TensorizerConfig, TensorizerConfig,
serialize_vllm_model) tensorize_vllm_model)
from vllm.utils import FlexibleArgumentParser
# yapf conflicts with isort for this docstring # yapf conflicts with isort for this docstring
# yapf: disable # yapf: disable
...@@ -61,6 +56,12 @@ Which downloads the model tensors from your S3 bucket and deserializes them. ...@@ -61,6 +56,12 @@ Which downloads the model tensors from your S3 bucket and deserializes them.
You can also provide a `--keyfile` argument to decrypt the model weights if You can also provide a `--keyfile` argument to decrypt the model weights if
they were serialized with encryption. they were serialized with encryption.
To support distributed tensor-parallel models, each model shard will be
serialized to a separate file. The tensorizer_uri is then specified as a string
template with a format specifier such as '%03d' that will be rendered with the
shard's rank. Sharded models serialized with this script will be named as
model-rank-%03d.tensors
For more information on the available arguments for serializing, run For more information on the available arguments for serializing, run
`python -m examples.tensorize_vllm_model serialize --help`. `python -m examples.tensorize_vllm_model serialize --help`.
...@@ -96,7 +97,7 @@ deserialization in this example script, although `--tensorizer-uri` and ...@@ -96,7 +97,7 @@ deserialization in this example script, although `--tensorizer-uri` and
def parse_args(): def parse_args():
parser = argparse.ArgumentParser( parser = FlexibleArgumentParser(
description="An example script that can be used to serialize and " description="An example script that can be used to serialize and "
"deserialize vLLM models. These models " "deserialize vLLM models. These models "
"can be loaded using tensorizer directly to the GPU " "can be loaded using tensorizer directly to the GPU "
...@@ -168,77 +169,72 @@ def parse_args(): ...@@ -168,77 +169,72 @@ def parse_args():
def deserialize(): def deserialize():
llm = LLM(model=args.model, llm = LLM(model=args.model,
load_format="tensorizer", load_format="tensorizer",
tensor_parallel_size=args.tensor_parallel_size,
model_loader_extra_config=tensorizer_config model_loader_extra_config=tensorizer_config
) )
return llm return llm
if __name__ == '__main__':
args = parse_args()
args = parse_args() s3_access_key_id = (getattr(args, 's3_access_key_id', None)
or os.environ.get("S3_ACCESS_KEY_ID", None))
s3_access_key_id = (getattr(args, 's3_access_key_id', None) s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
or os.environ.get("S3_ACCESS_KEY_ID", None)) or os.environ.get("S3_SECRET_ACCESS_KEY", None))
s3_secret_access_key = (getattr(args, 's3_secret_access_key', None) s3_endpoint = (getattr(args, 's3_endpoint', None)
or os.environ.get("S3_SECRET_ACCESS_KEY", None)) or os.environ.get("S3_ENDPOINT_URL", None))
s3_endpoint = (getattr(args, 's3_endpoint', None)
or os.environ.get("S3_ENDPOINT_URL", None))
credentials = {
"s3_access_key_id": s3_access_key_id,
"s3_secret_access_key": s3_secret_access_key,
"s3_endpoint": s3_endpoint
}
_read_stream, _write_stream = (partial( credentials = {
stream_io.open_stream, "s3_access_key_id": s3_access_key_id,
mode=mode, "s3_secret_access_key": s3_secret_access_key,
s3_access_key_id=s3_access_key_id, "s3_endpoint": s3_endpoint
s3_secret_access_key=s3_secret_access_key, }
s3_endpoint=s3_endpoint,
) for mode in ("rb", "wb+"))
model_ref = args.model model_ref = args.model
model_name = model_ref.split("/")[1] model_name = model_ref.split("/")[1]
os.environ["MASTER_ADDR"] = "127.0.0.1" keyfile = args.keyfile if args.keyfile else None
os.environ["MASTER_PORT"] = "8080"
init_distributed_environment(world_size=1, rank=0, local_rank=0) if args.model_loader_extra_config:
initialize_model_parallel() config = json.loads(args.model_loader_extra_config)
tensorizer_args = \
TensorizerConfig(**config)._construct_tensorizer_args()
tensorizer_args.tensorizer_uri = args.path_to_tensors
else:
tensorizer_args = None
keyfile = args.keyfile if args.keyfile else None if args.command == "serialize":
eng_args_dict = {f.name: getattr(args, f.name) for f in
dataclasses.fields(EngineArgs)}
engine_args = EngineArgs.from_cli_args(
argparse.Namespace(**eng_args_dict)
)
if args.model_loader_extra_config: input_dir = args.serialized_directory.rstrip('/')
config = json.loads(args.model_loader_extra_config) suffix = args.suffix if args.suffix else uuid.uuid4().hex
tensorizer_args = TensorizerConfig(**config)._construct_tensorizer_args() base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
tensorizer_args.tensorizer_uri = args.path_to_tensors if engine_args.tensor_parallel_size > 1:
else: model_path = f"{base_path}/model-rank-%03d.tensors"
tensorizer_args = None else:
model_path = f"{base_path}/model.tensors"
if args.command == "serialize":
eng_args_dict = {f.name: getattr(args, f.name) for f in
dataclasses.fields(EngineArgs)}
engine_args = EngineArgs.from_cli_args(argparse.Namespace(**eng_args_dict))
engine = LLMEngine.from_engine_args(engine_args)
input_dir = args.serialized_directory.rstrip('/')
suffix = args.suffix if args.suffix else uuid.uuid4().hex
base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
model_path = f"{base_path}/model.tensors"
tensorizer_config = TensorizerConfig(
tensorizer_uri=model_path,
**credentials)
serialize_vllm_model(engine, tensorizer_config, keyfile)
elif args.command == "deserialize":
if not tensorizer_args:
tensorizer_config = TensorizerConfig( tensorizer_config = TensorizerConfig(
tensorizer_uri=args.path_to_tensors, tensorizer_uri=model_path,
encryption_keyfile = keyfile, encryption_keyfile=keyfile,
**credentials **credentials)
)
deserialize() tensorize_vllm_model(engine_args, tensorizer_config)
else:
raise ValueError("Either serialize or deserialize must be specified.") elif args.command == "deserialize":
if not tensorizer_args:
tensorizer_config = TensorizerConfig(
tensorizer_uri=args.path_to_tensors,
encryption_keyfile = keyfile,
**credentials
)
deserialize()
else:
raise ValueError("Either serialize or deserialize must be specified.")
...@@ -36,12 +36,12 @@ tool_version_check() { ...@@ -36,12 +36,12 @@ tool_version_check() {
fi fi
} }
tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-dev.txt | cut -d'=' -f3)" tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-lint.txt | cut -d'=' -f3)"
tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-dev.txt | cut -d'=' -f3)" tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-lint.txt | cut -d'=' -f3)"
tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-dev.txt | cut -d'=' -f3)" tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-lint.txt | cut -d'=' -f3)"
tool_version_check "isort" "$ISORT_VERSION" "$(grep isort requirements-dev.txt | cut -d'=' -f3)" tool_version_check "isort" "$ISORT_VERSION" "$(grep isort requirements-lint.txt | cut -d'=' -f3)"
tool_version_check "codespell" "$CODESPELL_VERSION" "$(grep codespell requirements-dev.txt | cut -d'=' -f3)" tool_version_check "codespell" "$CODESPELL_VERSION" "$(grep codespell requirements-lint.txt | cut -d'=' -f3)"
tool_version_check "clang-format" "$CLANGFORMAT_VERSION" "$(grep clang-format requirements-dev.txt | cut -d'=' -f3)" tool_version_check "clang-format" "$CLANGFORMAT_VERSION" "$(grep clang-format requirements-lint.txt | cut -d'=' -f3)"
YAPF_FLAGS=( YAPF_FLAGS=(
'--recursive' '--recursive'
...@@ -96,22 +96,23 @@ echo 'vLLM yapf: Done' ...@@ -96,22 +96,23 @@ echo 'vLLM yapf: Done'
# Run mypy # Run mypy
echo 'vLLM mypy:' echo 'vLLM mypy:'
mypy tests --config-file pyproject.toml
mypy vllm/*.py --config-file pyproject.toml
mypy vllm/attention --config-file pyproject.toml mypy vllm/attention --config-file pyproject.toml
mypy vllm/core --config-file pyproject.toml mypy vllm/core --config-file pyproject.toml
mypy vllm/distributed --config-file pyproject.toml mypy vllm/distributed --config-file pyproject.toml
mypy vllm/engine --config-file pyproject.toml
mypy vllm/entrypoints --config-file pyproject.toml mypy vllm/entrypoints --config-file pyproject.toml
mypy vllm/executor --config-file pyproject.toml mypy vllm/executor --config-file pyproject.toml
mypy vllm/logging --config-file pyproject.toml
mypy vllm/lora --config-file pyproject.toml
mypy vllm/model_executor --config-file pyproject.toml
mypy vllm/multimodal --config-file pyproject.toml mypy vllm/multimodal --config-file pyproject.toml
mypy vllm/usage --config-file pyproject.toml mypy vllm/prompt_adapter --config-file pyproject.toml
mypy vllm/*.py --config-file pyproject.toml mypy vllm/spec_decode --config-file pyproject.toml
mypy vllm/transformers_utils --config-file pyproject.toml mypy vllm/transformers_utils --config-file pyproject.toml
mypy vllm/engine --config-file pyproject.toml mypy vllm/usage --config-file pyproject.toml
mypy vllm/worker --config-file pyproject.toml mypy vllm/worker --config-file pyproject.toml
mypy vllm/spec_decode --config-file pyproject.toml
mypy vllm/model_executor --config-file pyproject.toml
mypy vllm/lora --config-file pyproject.toml
mypy vllm/logging --config-file pyproject.toml
mypy vllm/model_executor --config-file pyproject.toml
# If git diff returns a file that is in the skip list, the file may be checked anyway: # If git diff returns a file that is in the skip list, the file may be checked anyway:
......
...@@ -5,7 +5,7 @@ requires = [ ...@@ -5,7 +5,7 @@ requires = [
"ninja", "ninja",
"packaging", "packaging",
"setuptools >= 49.4.0", "setuptools >= 49.4.0",
"torch == 2.3.0", "torch == 2.3.1",
"wheel", "wheel",
] ]
build-backend = "setuptools.build_meta" build-backend = "setuptools.build_meta"
...@@ -69,7 +69,5 @@ skip_gitignore = true ...@@ -69,7 +69,5 @@ skip_gitignore = true
[tool.pytest.ini_options] [tool.pytest.ini_options]
markers = [ markers = [
"skip_global_cleanup", "skip_global_cleanup",
"llm: run tests for vLLM API only", "vlm: run tests for vision language models only",
"openai: run tests for OpenAI API only",
"llava: run tests for LLaVA models only",
] ]
...@@ -3,5 +3,5 @@ cmake>=3.21 ...@@ -3,5 +3,5 @@ cmake>=3.21
ninja ninja
packaging packaging
setuptools>=49.4.0 setuptools>=49.4.0
torch==2.3.0 torch==2.3.1
wheel wheel
...@@ -2,10 +2,11 @@ cmake >= 3.21 ...@@ -2,10 +2,11 @@ cmake >= 3.21
ninja # For faster builds. ninja # For faster builds.
psutil psutil
sentencepiece # Required for LLaMA tokenizer. sentencepiece # Required for LLaMA tokenizer.
numpy numpy < 2.0.0
requests requests
tqdm
py-cpuinfo py-cpuinfo
transformers >= 4.40.0 # Required for StarCoder2 & Llava, Llama 3. transformers >= 4.42.4 # Required for Gemma 2 and for additional chat template parameters.
tokenizers >= 0.19.1 # Required for Llama 3. tokenizers >= 0.19.1 # Required for Llama 3.
fastapi fastapi
aiohttp aiohttp
...@@ -16,7 +17,8 @@ pillow # Required for image processing ...@@ -16,7 +17,8 @@ pillow # Required for image processing
prometheus_client >= 0.18.0 prometheus_client >= 0.18.0
prometheus-fastapi-instrumentator >= 7.0.0 prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer == 0.10.1 lm-format-enforcer == 0.10.3
outlines >= 0.0.43 # Requires torch >= 2.1.0 outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
typing_extensions typing_extensions
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
pyzmq
...@@ -2,5 +2,6 @@ ...@@ -2,5 +2,6 @@
-r requirements-common.txt -r requirements-common.txt
# Dependencies for x86_64 CPUs # Dependencies for x86_64 CPUs
torch == 2.3.0+cpu torch == 2.3.1+cpu; platform_machine != "ppc64le"
triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error. torchvision == 0.18.1+cpu; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch
\ No newline at end of file triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment