Unverified commit fd389df9, authored by StonyPort and committed by GitHub
Browse files

Reduce the image processing latency in VLM (#11541)


Co-authored-by: qiuxuan.lzw <qiuxuan.lzw@alibaba-inc.com>
parent b0d1d717
...@@ -221,6 +221,10 @@ class Envs: ...@@ -221,6 +221,10 @@ class Envs:
SGLANG_TRITON_PREFILL_TRUNCATION_ALIGN_SIZE = EnvInt(4096) SGLANG_TRITON_PREFILL_TRUNCATION_ALIGN_SIZE = EnvInt(4096)
SGLANG_TRITON_DECODE_SPLIT_TILE_SIZE = EnvInt(256) SGLANG_TRITON_DECODE_SPLIT_TILE_SIZE = EnvInt(256)
# VLM
SGLANG_IMAGE_MAX_PIXELS = EnvInt(16384 * 28 * 28)
SGLANG_RESIZE_RESAMPLE = EnvStr("")
# fmt: on # fmt: on
......
...@@ -313,7 +313,9 @@ class BaseMultimodalProcessor(ABC): ...@@ -313,7 +313,9 @@ class BaseMultimodalProcessor(ABC):
try: try:
if modality == Modality.IMAGE: if modality == Modality.IMAGE:
img, _ = load_image(data) img, _ = load_image(data)
return img.convert("RGB") if discard_alpha_channel else img if discard_alpha_channel and img.mode != "RGB":
img = img.convert("RGB")
return img
elif modality == Modality.VIDEO: elif modality == Modality.VIDEO:
return load_video(data, frame_count_limit) return load_video(data, frame_count_limit)
elif modality == Modality.AUDIO: elif modality == Modality.AUDIO:
......
...@@ -9,6 +9,7 @@ import torchvision ...@@ -9,6 +9,7 @@ import torchvision
from PIL import Image from PIL import Image
from torchvision.transforms import InterpolationMode from torchvision.transforms import InterpolationMode
from sglang.srt.environ import envs
from sglang.srt.layers.rotary_embedding import MRotaryEmbedding from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
from sglang.srt.models.qwen2_vl import Qwen2VLForConditionalGeneration from sglang.srt.models.qwen2_vl import Qwen2VLForConditionalGeneration
...@@ -23,8 +24,14 @@ from sglang.utils import logger ...@@ -23,8 +24,14 @@ from sglang.utils import logger
IMAGE_FACTOR = 28 IMAGE_FACTOR = 28
MIN_PIXELS = 4 * 28 * 28 MIN_PIXELS = 4 * 28 * 28
MAX_PIXELS = 16384 * 28 * 28 MAX_PIXELS = envs.SGLANG_IMAGE_MAX_PIXELS.get()
MAX_RATIO = 200 MAX_RATIO = 200
RESIZE_RESAMPLE = getattr(Image, envs.SGLANG_RESIZE_RESAMPLE.get(), None)
if envs.SGLANG_RESIZE_RESAMPLE.is_set() and RESIZE_RESAMPLE is None:
logger.warning(
f"Invalid RESIZE_RESAMPLE value: '{envs.SGLANG_RESIZE_RESAMPLE.get()}'. "
f"Ignoring and using default."
)
VIDEO_TOTAL_PIXELS = int( VIDEO_TOTAL_PIXELS = int(
float(os.environ.get("VIDEO_MAX_PIXELS", 128000 * 28 * 28 * 0.9)) float(os.environ.get("VIDEO_MAX_PIXELS", 128000 * 28 * 28 * 0.9))
) )
...@@ -86,7 +93,7 @@ def resize_image( ...@@ -86,7 +93,7 @@ def resize_image(
min_pixels=min_pixels, min_pixels=min_pixels,
max_pixels=max_pixels, max_pixels=max_pixels,
) )
image = image.resize((resized_width, resized_height)) image = image.resize((resized_width, resized_height), resample=RESIZE_RESAMPLE)
return image return image
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment