Unverified Commit d9701151 authored by Cyrus Leung's avatar Cyrus Leung Committed by GitHub
Browse files

[CI/Build] vLLM cache directory for images (#6444)

parent 37d77660
#!/bin/bash
set -ex
set -o pipefail
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
# aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
mkdir -p images
cd images
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg
cd -
......@@ -12,7 +12,6 @@ steps:
fast_check_only: true
commands:
- pytest -v -s async_engine # Async Engine
- bash ../.buildkite/download-images.sh # Inputs
- pytest -v -s test_inputs.py
- pytest -v -s multimodal
- pytest -v -s test_utils.py # Utils
......@@ -82,7 +81,6 @@ steps:
working_dir: "/vllm-workspace/tests"
num_gpus: 2
commands:
- bash ../.buildkite/download-images.sh
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
......@@ -155,7 +153,6 @@ steps:
- label: Inputs Test
#mirror_hardwares: [amd]
commands:
- bash ../.buildkite/download-images.sh
- pytest -v -s test_inputs.py
- pytest -v -s multimodal
......@@ -175,7 +172,6 @@ steps:
- label: Vision Language Models Test
mirror_hardwares: [amd]
commands:
- bash ../.buildkite/download-images.sh
- pytest -v -s models -m vlm
- label: Prefix Caching Test
......
import os
import subprocess
from PIL import Image
from vllm import LLM
# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
# You can use `.buildkite/download-images.sh` to download them
from vllm.assets.image import ImageAsset
def run_llava():
......@@ -14,7 +7,7 @@ def run_llava():
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
image = Image.open("images/stop_sign.jpg")
image = ImageAsset("stop_sign").pil_image
outputs = llm.generate({
"prompt": prompt,
......@@ -28,25 +21,5 @@ def run_llava():
print(generated_text)
def main():
run_llava()
if __name__ == "__main__":
# Download from s3
s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/"
local_directory = "images"
# Make sure the local directory exists or create it
os.makedirs(local_directory, exist_ok=True)
# Use AWS CLI to sync the directory, assume anonymous access
subprocess.check_call([
"aws",
"s3",
"sync",
s3_bucket_path,
local_directory,
"--no-sign-request",
])
main()
run_llava()
import os
import subprocess
from PIL import Image
from vllm import LLM
# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
# You can use `.buildkite/download-images.sh` to download them
from vllm.assets.image import ImageAsset
def run_paligemma():
......@@ -14,7 +7,7 @@ def run_paligemma():
prompt = "caption es"
image = Image.open("images/stop_sign.jpg")
image = ImageAsset("stop_sign").pil_image
outputs = llm.generate({
"prompt": prompt,
......@@ -28,25 +21,5 @@ def run_paligemma():
print(generated_text)
def main():
run_paligemma()
if __name__ == "__main__":
# Download from s3
s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/"
local_directory = "images"
# Make sure the local directory exists or create it
os.makedirs(local_directory, exist_ok=True)
# Use AWS CLI to sync the directory, assume anonymous access
subprocess.check_call([
"aws",
"s3",
"sync",
s3_bucket_path,
local_directory,
"--no-sign-request",
])
main()
run_paligemma()
import os
import subprocess
from PIL import Image
from vllm import LLM, SamplingParams
# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
# You can use `.buildkite/download-images.sh` to download them
from vllm.assets.image import ImageAsset
def run_phi3v():
......@@ -24,7 +17,7 @@ def run_phi3v():
max_num_seqs=5,
)
image = Image.open("images/cherry_blossom.jpg")
image = ImageAsset("cherry_blossom").pil_image
# single-image prompt
prompt = "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n" # noqa: E501
......@@ -44,19 +37,4 @@ def run_phi3v():
if __name__ == "__main__":
s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/"
local_directory = "images"
# Make sure the local directory exists or create it
os.makedirs(local_directory, exist_ok=True)
# Use AWS CLI to sync the directory, assume anonymous access
subprocess.check_call([
"aws",
"s3",
"sync",
s3_bucket_path,
local_directory,
"--no-sign-request",
])
run_phi3v()
......@@ -3,11 +3,7 @@ import gc
import os
import sys
from collections import UserList
from dataclasses import dataclass
from functools import cached_property
from pathlib import Path
from typing import (Any, Dict, List, Literal, Optional, Tuple, TypedDict,
TypeVar)
from typing import Any, Dict, List, Optional, Tuple, TypedDict, TypeVar
import pytest
import torch
......@@ -18,12 +14,12 @@ from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq,
AutoTokenizer, BatchEncoding)
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.config import TokenizerPoolConfig
from vllm.distributed import (destroy_distributed_environment,
destroy_model_parallel)
from vllm.inputs import TextPrompt
from vllm.logger import init_logger
from vllm.multimodal.utils import fetch_image
from vllm.sequence import SampleLogprobs
from vllm.utils import cuda_device_count_stateless, is_cpu
......@@ -33,9 +29,6 @@ _TEST_DIR = os.path.dirname(__file__)
_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
_IMAGE_DIR = Path(_TEST_DIR) / "images"
"""You can use `.buildkite/download-images.sh` to download the assets."""
def _read_prompts(filename: str) -> List[str]:
with open(filename, "r") as f:
......@@ -43,20 +36,6 @@ def _read_prompts(filename: str) -> List[str]:
return prompts
@dataclass(frozen=True)
class ImageAsset:
name: Literal["stop_sign", "cherry_blossom", "boardwalk"]
@cached_property
def pil_image(self) -> Image.Image:
if self.name == "boardwalk":
return fetch_image(
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
)
return Image.open(_IMAGE_DIR / f"{self.name}.jpg")
class _ImageAssetPrompts(TypedDict):
stop_sign: str
cherry_blossom: str
......
from pathlib import Path
import vllm.envs as envs
def get_cache_dir():
"""Get the path to the cache for storing downloaded assets."""
path = Path(envs.VLLM_ASSETS_CACHE)
path.mkdir(parents=True, exist_ok=True)
return path
import shutil
from dataclasses import dataclass
from functools import cached_property, lru_cache
from typing import Literal
import requests
from PIL import Image
from vllm.multimodal.utils import fetch_image
from .base import get_cache_dir
@lru_cache
def get_air_example_data_2_asset(filename: str) -> Image.Image:
"""
Download and open an image from
``s3://air-example-data-2/vllm_opensource_llava/``.
"""
image_directory = get_cache_dir() / "air-example-data-2"
image_directory.mkdir(parents=True, exist_ok=True)
image_path = image_directory / filename
if not image_path.exists():
base_url = "https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava"
with requests.get(f"{base_url}/{filename}", stream=True) as response:
response.raise_for_status()
with image_path.open("wb") as f:
shutil.copyfileobj(response.raw, f)
return Image.open(image_path)
@dataclass(frozen=True)
class ImageAsset:
name: Literal["stop_sign", "cherry_blossom", "boardwalk"]
@cached_property
def pil_image(self) -> Image.Image:
if self.name == "boardwalk":
return fetch_image(
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
)
return get_air_example_data_2_asset(f"{self.name}.jpg")
......@@ -189,10 +189,10 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
if cuda_visible_devices is None:
cuda_visible_devices = ",".join(str(i) for i in range(num_dev))
VLLM_CONFIG_ROOT = envs.VLLM_CONFIG_ROOT
path = os.path.expanduser(
f"{VLLM_CONFIG_ROOT}/vllm/gpu_p2p_access_cache_for_{cuda_visible_devices}.json"
)
path = os.path.join(
envs.VLLM_CACHE_ROOT,
f"gpu_p2p_access_cache_for_{cuda_visible_devices}.json")
os.makedirs(os.path.dirname(path), exist_ok=True)
from vllm.distributed.parallel_state import get_world_group
if ((not is_distributed or get_world_group().local_rank == 0)
......
......@@ -17,7 +17,8 @@ if TYPE_CHECKING:
S3_ACCESS_KEY_ID: Optional[str] = None
S3_SECRET_ACCESS_KEY: Optional[str] = None
S3_ENDPOINT_URL: Optional[str] = None
VLLM_CONFIG_ROOT: str = ""
VLLM_CACHE_ROOT: str = os.path.expanduser("~/.cache/vllm")
VLLM_CONFIG_ROOT: str = os.path.expanduser("~/.config/vllm")
VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai"
VLLM_NO_USAGE_STATS: bool = False
VLLM_DO_NOT_TRACK: bool = False
......@@ -31,10 +32,11 @@ if TYPE_CHECKING:
VLLM_OPENVINO_KVCACHE_SPACE: int = 0
VLLM_OPENVINO_CPU_KV_CACHE_PRECISION: Optional[str] = None
VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS: bool = False
VLLM_XLA_CACHE_PATH: str = "~/.vllm/xla_cache/"
VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache")
VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024
VLLM_USE_RAY_COMPILED_DAG: bool = False
VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets")
VLLM_IMAGE_FETCH_TIMEOUT: int = 5
VLLM_TARGET_DEVICE: str = "cuda"
MAX_JOBS: Optional[str] = None
......@@ -45,6 +47,21 @@ if TYPE_CHECKING:
CMAKE_BUILD_TYPE: Optional[str] = None
VERBOSE: bool = False
def get_default_cache_root():
return os.getenv(
"XDG_CACHE_HOME",
os.path.join(os.path.expanduser("~"), ".cache"),
)
def get_default_config_root():
return os.getenv(
"XDG_CONFIG_HOME",
os.path.join(os.path.expanduser("~"), ".config"),
)
# The begin-* and end* here are used by the documentation generator
# to extract the used env vars.
......@@ -89,15 +106,28 @@ environment_variables: Dict[str, Callable[[], Any]] = {
lambda: bool(int(os.getenv('VERBOSE', '0'))),
# Root directory for VLLM configuration files
# Defaults to `~/.config/vllm` unless `XDG_CONFIG_HOME` is set
# Note that this not only affects how vllm finds its configuration files
# during runtime, but also affects how vllm installs its configuration
# files during **installation**.
"VLLM_CONFIG_ROOT":
lambda: os.environ.get("VLLM_CONFIG_ROOT", None) or os.getenv(
"XDG_CONFIG_HOME", None) or os.path.expanduser("~/.config"),
lambda: os.path.expanduser(
os.getenv(
"VLLM_CONFIG_ROOT",
os.path.join(get_default_config_root(), "vllm"),
)),
# ================== Runtime Env Vars ==================
# Root directory for VLLM cache files
# Defaults to `~/.cache/vllm` unless `XDG_CACHE_HOME` is set
"VLLM_CACHE_ROOT":
lambda: os.path.expanduser(
os.getenv(
"VLLM_CACHE_ROOT",
os.path.join(get_default_cache_root(), "vllm"),
)),
# used in distributed environment to determine the master address
'VLLM_HOST_IP':
lambda: os.getenv('VLLM_HOST_IP', "") or os.getenv("HOST_IP", ""),
......@@ -242,6 +272,14 @@ environment_variables: Dict[str, Callable[[], Any]] = {
"VLLM_WORKER_MULTIPROC_METHOD":
lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork"),
# Path to the cache for storing downloaded assets
"VLLM_ASSETS_CACHE":
lambda: os.path.expanduser(
os.getenv(
"VLLM_ASSETS_CACHE",
os.path.join(get_default_cache_root(), "vllm", "assets"),
)),
# Timeout for fetching images when serving multimodal models
# Default is 5 seconds
"VLLM_IMAGE_FETCH_TIMEOUT":
......@@ -250,7 +288,11 @@ environment_variables: Dict[str, Callable[[], Any]] = {
# Path to the XLA persistent cache directory.
# Only used for XLA devices such as TPUs.
"VLLM_XLA_CACHE_PATH":
lambda: os.getenv("VLLM_XLA_CACHE_PATH", "~/.vllm/xla_cache/"),
lambda: os.path.expanduser(
os.getenv(
"VLLM_ASSETS_CACHE",
os.path.join(get_default_cache_root(), "vllm", "xla_cache"),
)),
"VLLM_FUSED_MOE_CHUNK_SIZE":
lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "65536")),
......@@ -262,7 +304,7 @@ environment_variables: Dict[str, Callable[[], Any]] = {
# end-env-vars-definition
def __getattr__(name):
def __getattr__(name: str):
# lazy evaluation of environment variables
if name in environment_variables:
return environment_variables[name]()
......
......@@ -19,9 +19,8 @@ import vllm.envs as envs
from vllm.version import __version__ as VLLM_VERSION
_config_home = envs.VLLM_CONFIG_ROOT
_USAGE_STATS_JSON_PATH = os.path.join(_config_home, "vllm/usage_stats.json")
_USAGE_STATS_DO_NOT_TRACK_PATH = os.path.join(_config_home,
"vllm/do_not_track")
_USAGE_STATS_JSON_PATH = os.path.join(_config_home, "usage_stats.json")
_USAGE_STATS_DO_NOT_TRACK_PATH = os.path.join(_config_home, "do_not_track")
_USAGE_STATS_ENABLED = None
_USAGE_STATS_SERVER = envs.VLLM_USAGE_STATS_SERVER
......
......@@ -98,8 +98,7 @@ class TPUWorker(LoraNotSupportedWorkerBase):
# Use persistent cache to avoid XLA recompilation.
# NOTE(woosuk): This does not completely eliminate the recompilation
# overhead because dynamo does not cache the compiled results.
xr.initialize_cache(os.path.expanduser(envs.VLLM_XLA_CACHE_PATH),
readonly=False)
xr.initialize_cache(envs.VLLM_XLA_CACHE_PATH, readonly=False)
def load_model(self):
self.model_runner.load_model()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment