"encoding/functions/encoding.py" did not exist on "30be3799bb080903f1b8bac71c836f09147907a6"
Unverified commit a5b14ad0, authored by Kaichen Zhang - NTU, committed by GitHub

[Feat/WIP] add llava-onevision, with support for (1) siglip encoder, (2) qwen2 decoder, (3) openai api compatible server. (#1123)

Co-authored-by: Bo Li <drluodian@gmail.com>
parent 5fafcac0
...@@ -231,8 +231,13 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct ...
  - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
  - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
  - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 30000`
  - `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --host=127.0.0.1 --tp-size=1 --chat-template=llava_llama_3`
  - `python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --host="127.0.0.1" --tp-size=8 --chat-template=chatml-llava`
- LLaVA-NeXT-Video
  - see [examples/usage/llava_video](examples/usage/llava_video)
- [LLaVA-OneVision](https://arxiv.org/abs/2408.03326)
  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --host=127.0.0.1 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384`
  - see [test/srt/test_llava_onevision_openai_server.py](test/srt/test_llava_onevision_openai_server.py)
- Yi-VL
  - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
- StableLM
......
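As a quick reference for the server commands above, here is a minimal sketch of an image request against the OpenAI-compatible endpoint (assuming the default `http://127.0.0.1:30000` address from those commands); the full client and test script added by this commit follows:

```python
import openai

client = openai.Client(api_key="EMPTY", base_url="http://127.0.0.1:30000/v1")
response = client.chat.completions.create(
    model="default",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png"
                    },
                },
                {"type": "text", "text": "Please describe this image."},
            ],
        }
    ],
    max_tokens=128,
)
print(response.choices[0].message.content)
```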
import base64
import io
import os
import sys
import time
import numpy as np
import openai
import requests
from decord import VideoReader, cpu
from PIL import Image
# pip install httpx==0.23.3
# pip install decord
# pip install protobuf==3.20.0
def download_video(url, cache_dir):
file_path = os.path.join(cache_dir, "jobs.mp4")
os.makedirs(cache_dir, exist_ok=True)
response = requests.get(url)
response.raise_for_status()
with open(file_path, "wb") as f:
f.write(response.content)
print(f"File downloaded and saved to: {file_path}")
return file_path
def create_openai_client(base_url):
return openai.Client(api_key="EMPTY", base_url=base_url)
def image_stream_request_test(client):
print("----------------------Image Stream Request Test----------------------")
stream_request = client.chat.completions.create(
model="default",
messages=[
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png"
},
},
{
"type": "text",
"text": "Please describe this image. Please list the benchmarks and the models.",
},
],
},
],
temperature=0.7,
max_tokens=1024,
stream=True,
)
stream_response = ""
for chunk in stream_request:
if chunk.choices[0].delta.content is not None:
content = chunk.choices[0].delta.content
stream_response += content
sys.stdout.write(content)
sys.stdout.flush()
print("-" * 30)
def video_stream_request_test(client, video_path):
print("------------------------Video Stream Request Test----------------------")
messages = prepare_video_messages(video_path)
start_time = time.time()
video_request = client.chat.completions.create(
model="default",
messages=messages,
temperature=0,
max_tokens=1024,
stream=True,
)
print("-" * 30)
video_response = ""
for chunk in video_request:
if chunk.choices[0].delta.content is not None:
content = chunk.choices[0].delta.content
video_response += content
sys.stdout.write(content)
sys.stdout.flush()
print("-" * 30)
def image_speed_test(client):
print("----------------------Image Speed Test----------------------")
start_time = time.time()
request = client.chat.completions.create(
model="default",
messages=[
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png"
},
},
{
"type": "text",
"text": "Please describe this image. Please list the benchmarks and the models.",
},
],
},
],
temperature=0,
max_tokens=1024,
)
end_time = time.time()
response = request.choices[0].message.content
print(response)
print("-" * 30)
print_speed_test_results(request, start_time, end_time)
def video_speed_test(client, video_path):
print("------------------------Video Speed Test------------------------")
messages = prepare_video_messages(video_path)
start_time = time.time()
video_request = client.chat.completions.create(
model="default",
messages=messages,
temperature=0,
max_tokens=1024,
)
end_time = time.time()
video_response = video_request.choices[0].message.content
print(video_response)
print("-" * 30)
print_speed_test_results(video_request, start_time, end_time)
def prepare_video_messages(video_path):
max_frames_num = 32
vr = VideoReader(video_path, ctx=cpu(0))
total_frame_num = len(vr)
uniform_sampled_frames = np.linspace(
0, total_frame_num - 1, max_frames_num, dtype=int
)
frame_idx = uniform_sampled_frames.tolist()
frames = vr.get_batch(frame_idx).asnumpy()
base64_frames = []
for frame in frames:
pil_img = Image.fromarray(frame)
buff = io.BytesIO()
pil_img.save(buff, format="JPEG")
base64_str = base64.b64encode(buff.getvalue()).decode("utf-8")
base64_frames.append(base64_str)
messages = [{"role": "user", "content": []}]
frame_format = {
"type": "image_url",
"image_url": {"url": "data:image/jpeg;base64,{}"},
}
for base64_frame in base64_frames:
frame_format["image_url"]["url"] = "data:image/jpeg;base64,{}".format(
base64_frame
)
messages[0]["content"].append(frame_format.copy())
prompt = {"type": "text", "text": "Please describe the video in detail."}
messages[0]["content"].append(prompt)
return messages
def print_speed_test_results(request, start_time, end_time):
total_tokens = request.usage.total_tokens
completion_tokens = request.usage.completion_tokens
prompt_tokens = request.usage.prompt_tokens
print(f"Total tokens: {total_tokens}")
print(f"Completion tokens: {completion_tokens}")
print(f"Prompt tokens: {prompt_tokens}")
print(f"Time taken: {end_time - start_time} seconds")
print(f"Token per second: {total_tokens / (end_time - start_time)}")
print(f"Completion token per second: {completion_tokens / (end_time - start_time)}")
print(f"Prompt token per second: {prompt_tokens / (end_time - start_time)}")
def main():
url = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"
cache_dir = os.path.expanduser("~/.cache")
video_path = download_video(url, cache_dir)
client = create_openai_client("http://127.0.0.1:30000/v1")
image_stream_request_test(client)
video_stream_request_test(client, video_path)
image_speed_test(client)
video_speed_test(client, video_path)
if __name__ == "__main__":
main()
...@@ -121,6 +121,20 @@ def batch(video_dir, save_dir, cur_chunk, num_chunks, num_frames=16, batch_size= ...@@ -121,6 +121,20 @@ def batch(video_dir, save_dir, cur_chunk, num_chunks, num_frames=16, batch_size=
if __name__ == "__main__": if __name__ == "__main__":
url = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"
cache_dir = os.path.expanduser("~/.cache")
file_path = os.path.join(cache_dir, "jobs.mp4")
os.makedirs(cache_dir, exist_ok=True)
response = requests.get(url)
response.raise_for_status() # Raise an exception for bad responses
with open(file_path, "wb") as f:
f.write(response.content)
print(f"File downloaded and saved to: {file_path}")
# Create the parser # Create the parser
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Run video processing with specified port." description="Run video processing with specified port."
...@@ -148,7 +162,7 @@ if __name__ == "__main__": ...@@ -148,7 +162,7 @@ if __name__ == "__main__":
parser.add_argument( parser.add_argument(
"--video-dir", "--video-dir",
type=str, type=str,
default="./videos/Q98Z4OTh8RwmDonc.mp4", default=os.path.expanduser("~/.cache/jobs.mp4"),
help="The directory or path for the processed video files.", help="The directory or path for the processed video files.",
) )
parser.add_argument( parser.add_argument(
......
...@@ -20,7 +20,7 @@ dependencies = [ ...@@ -20,7 +20,7 @@ dependencies = [
] ]
[project.optional-dependencies] [project.optional-dependencies]
srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular", srt = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
"packaging", "pillow", "psutil", "pydantic", "python-multipart", "packaging", "pillow", "psutil", "pydantic", "python-multipart",
"torch", "uvicorn", "uvloop", "zmq", "torch", "uvicorn", "uvloop", "zmq",
"vllm==0.5.4", "outlines>=0.0.44"] "vllm==0.5.4", "outlines>=0.0.44"]
......
...@@ -137,7 +137,7 @@ register_chat_template( ...@@ -137,7 +137,7 @@ register_chat_template(
register_chat_template( register_chat_template(
ChatTemplate( ChatTemplate(
name="chatml-llava", name="chatml-llava",
default_system_prompt="Answer the questions.", default_system_prompt="You are a helpful assistant.",
role_prefix_and_suffix={ role_prefix_and_suffix={
"system": ("<|im_start|>system\n", "<|im_end|>\n"), "system": ("<|im_start|>system\n", "<|im_end|>\n"),
"user": ("<|im_start|>user\n", "<|im_end|>\n"), "user": ("<|im_start|>user\n", "<|im_end|>\n"),
...@@ -145,7 +145,7 @@ register_chat_template( ...@@ -145,7 +145,7 @@ register_chat_template(
}, },
style=ChatTemplateStyle.PLAIN, style=ChatTemplateStyle.PLAIN,
stop_str=("<|im_end|>",), stop_str=("<|im_end|>",),
image_token=" <image>\n", image_token="<image>\n",
) )
) )
...@@ -322,12 +322,17 @@ def match_chat_ml(model_path: str): ...@@ -322,12 +322,17 @@ def match_chat_ml(model_path: str):
if "tinyllama" in model_path: if "tinyllama" in model_path:
return get_chat_template("chatml") return get_chat_template("chatml")
# Now the suffix for qwen2 chat model is "instruct" # Now the suffix for qwen2 chat model is "instruct"
if "qwen" in model_path and ("chat" in model_path or "instruct" in model_path): if (
"qwen" in model_path
and ("chat" in model_path or "instruct" in model_path)
and ("llava" not in model_path)
):
return get_chat_template("qwen") return get_chat_template("qwen")
if ( if (
"llava-v1.6-34b" in model_path "llava-v1.6-34b" in model_path
or "llava-v1.6-yi-34b" in model_path or "llava-v1.6-yi-34b" in model_path
or "llava-next-video-34b" in model_path or "llava-next-video-34b" in model_path
or "llava-onevision-qwen2" in model_path
): ):
return get_chat_template("chatml-llava") return get_chat_template("chatml-llava")
......
...@@ -34,6 +34,7 @@ class SeparatorStyle(IntEnum): ...@@ -34,6 +34,7 @@ class SeparatorStyle(IntEnum):
NO_COLON_TWO = auto() NO_COLON_TWO = auto()
ADD_NEW_LINE_SINGLE = auto() ADD_NEW_LINE_SINGLE = auto()
LLAMA2 = auto() LLAMA2 = auto()
LLAMA3 = auto()
CHATGLM = auto() CHATGLM = auto()
CHATML = auto() CHATML = auto()
CHATINTERN = auto() CHATINTERN = auto()
...@@ -137,6 +138,20 @@ class Conversation: ...@@ -137,6 +138,20 @@ class Conversation:
else: else:
ret += role + ":" ret += role + ":"
return ret return ret
elif self.sep_style == SeparatorStyle.LLAMA3:
ret = "<|begin_of_text|>"
if self.system_message:
ret += system_prompt
else:
ret += ""
for i, (role, message) in enumerate(self.messages):
if message:
ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n"
ret += f"{message.strip()}<|eot_id|>"
else:
ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n"
# print(ret)
return ret
elif self.sep_style == SeparatorStyle.LLAMA2: elif self.sep_style == SeparatorStyle.LLAMA2:
seps = [self.sep, self.sep2] seps = [self.sep, self.sep2]
if self.system_message: if self.system_message:
...@@ -379,12 +394,23 @@ def generate_chat_conv( ...@@ -379,12 +394,23 @@ def generate_chat_conv(
conv.append_message(conv.roles[0], message.content) conv.append_message(conv.roles[0], message.content)
else: else:
real_content = "" real_content = ""
# calculate number of image_url
num_image_url = 0
for content in message.content:
if content.type == "image_url":
num_image_url += 1
if num_image_url > 1:
image_token = "<image>"
else:
image_token = "<image>\n"
for content in message.content: for content in message.content:
if content.type == "text": if content.type == "text":
if num_image_url > 16:
real_content += "\n" # for video
real_content += content.text real_content += content.text
elif content.type == "image_url": elif content.type == "image_url":
# NOTE: Only works for llava # NOTE: Only works for llava
real_content += "<image>\n" real_content += image_token
conv.append_image(content.image_url.url) conv.append_image(content.image_url.url)
conv.append_message(conv.roles[0], real_content) conv.append_message(conv.roles[0], real_content)
elif msg_role == "assistant": elif msg_role == "assistant":
...@@ -425,6 +451,18 @@ register_conv_template( ...@@ -425,6 +451,18 @@ register_conv_template(
) )
) )
register_conv_template(
Conversation(
name="chatml-llava",
system_template="<|im_start|>system\n{system_message}",
system_message="You are a helpful assistant.",
roles=("<|im_start|>user", "<|im_start|>assistant"),
sep_style=SeparatorStyle.CHATML,
sep="<|im_end|>",
stop_str=["<|endoftext|>", "<|im_end|>"],
)
)
register_conv_template( register_conv_template(
Conversation( Conversation(
name="vicuna_v1.1", name="vicuna_v1.1",
...@@ -437,6 +475,17 @@ register_conv_template( ...@@ -437,6 +475,17 @@ register_conv_template(
) )
) )
register_conv_template(
Conversation(
name="llava_llama_3",
system_message="You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.",
system_template="<|start_header_id|>system<|end_header_id|>\n\n{system_message}<|eot_id|>",
roles=("user", "assistant"),
sep_style=SeparatorStyle.LLAMA3,
sep="",
stop_str=["<|end_of_text|>", "<|eot_id|>"],
)
)
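For intuition only (not part of the diff), here is roughly the prompt string the new LLAMA3 separator style produces for the `llava_llama_3` template with a single-image user turn; the truncated system message and exact whitespace are illustrative:

```python
expected_prompt = (
    "<|begin_of_text|>"
    "<|start_header_id|>system<|end_header_id|>\n\n"
    "You are a helpful language and vision assistant. ...<|eot_id|>"
    "<|start_header_id|>user<|end_header_id|>\n\n"
    "<image>\nDescribe this image.<|eot_id|>"
    "<|start_header_id|>assistant<|end_header_id|>\n\n"
)
```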
# Reference: https://github.com/InternLM/lmdeploy/blob/387bf54b4f124e72aab30ae9755f562e435d3d01/lmdeploy/model.py#L425-L442 # Reference: https://github.com/InternLM/lmdeploy/blob/387bf54b4f124e72aab30ae9755f562e435d3d01/lmdeploy/model.py#L425-L442
register_conv_template( register_conv_template(
Conversation( Conversation(
......
...@@ -131,11 +131,49 @@ class TokenizerManager: ...@@ -131,11 +131,49 @@ class TokenizerManager:
self.model_update_lock = asyncio.Lock() self.model_update_lock = asyncio.Lock()
self.model_update_result = None self.model_update_result = None
async def get_pixel_values(self, image_data): async def get_pixel_values(self, image_data, aspect_ratio=None):
aspect_ratio = getattr(self.hf_config, "image_aspect_ratio", None) aspect_ratio = (
getattr(self.hf_config, "image_aspect_ratio", None)
if aspect_ratio is None
else aspect_ratio
)
grid_pinpoints = ( grid_pinpoints = (
self.hf_config.image_grid_pinpoints if aspect_ratio == "anyres" else None self.hf_config.image_grid_pinpoints
if hasattr(self.hf_config, "image_grid_pinpoints")
and "anyres" in aspect_ratio
else None
) )
if isinstance(image_data, list) and len(image_data) > 0:
pixel_values, image_hash, image_size = [], [], []
if len(image_data) > 1:
aspect_ratio = "pad" # LLaVA OneVision Handling: more than one image --> interleaved image mode or video mode. We do not use anyres
for img_data in image_data:
pixel_v, image_h, image_s = await self._process_single_image(
img_data, aspect_ratio, grid_pinpoints
)
pixel_values.append(pixel_v)
image_hash.append(image_h)
image_size.append(image_s)
pixel_values = np.stack(pixel_values, axis=0)
else:
pixel_values, image_hash, image_size = await self._process_single_image(
image_data[0], aspect_ratio, grid_pinpoints
)
image_hash = [image_hash]
image_size = [image_size]
elif isinstance(image_data, str):
pixel_values, image_hash, image_size = await self._process_single_image(
image_data, aspect_ratio, grid_pinpoints
)
image_hash = [image_hash]
image_size = [image_size]
else:
pixel_values, image_hash, image_size = None, None, None
return pixel_values, image_hash, image_size
async def _process_single_image(self, image_data, aspect_ratio, grid_pinpoints):
if self.executor is not None: if self.executor is not None:
loop = asyncio.get_event_loop() loop = asyncio.get_event_loop()
return await loop.run_in_executor( return await loop.run_in_executor(
...@@ -194,8 +232,8 @@ class TokenizerManager: ...@@ -194,8 +232,8 @@ class TokenizerManager:
) )
if self.is_generation: if self.is_generation:
pixel_values, image_hash, image_size = await self._get_pixel_values( pixel_values, image_hash, image_size = await self.get_pixel_values(
obj.image_data if not_use_index else obj.image_data[index] obj.image_data
) )
return_logprob = ( return_logprob = (
obj.return_logprob if not_use_index else obj.return_logprob[index] obj.return_logprob if not_use_index else obj.return_logprob[index]
...@@ -704,7 +742,7 @@ def get_pixel_values( ...@@ -704,7 +742,7 @@ def get_pixel_values(
tuple(int(x * 255) for x in processor.image_processor.image_mean), tuple(int(x * 255) for x in processor.image_processor.image_mean),
) )
pixel_values = processor.image_processor(image)["pixel_values"][0] pixel_values = processor.image_processor(image)["pixel_values"][0]
elif image_aspect_ratio == "anyres": elif image_aspect_ratio == "anyres" or "anyres_max" in image_aspect_ratio:
pixel_values = process_anyres_image( pixel_values = process_anyres_image(
image, processor.image_processor, image_grid_pinpoints image, processor.image_processor, image_grid_pinpoints
) )
......
...@@ -322,11 +322,16 @@ class ModelTpServer: ...@@ -322,11 +322,16 @@ class ModelTpServer:
if self.model_runner.is_generation: if self.model_runner.is_generation:
req.pixel_values = recv_req.pixel_values req.pixel_values = recv_req.pixel_values
if req.pixel_values is not None: if req.pixel_values is not None:
image_hash = (
hash(tuple(recv_req.image_hash))
if isinstance(recv_req.image_hash, list)
else recv_req.image_hash
)
req.pad_value = [ req.pad_value = [
(recv_req.image_hash) % self.model_config.vocab_size, (image_hash) % self.model_config.vocab_size,
(recv_req.image_hash >> 16) % self.model_config.vocab_size, (image_hash >> 16) % self.model_config.vocab_size,
(recv_req.image_hash >> 32) % self.model_config.vocab_size, (image_hash >> 32) % self.model_config.vocab_size,
(recv_req.image_hash >> 64) % self.model_config.vocab_size, (image_hash >> 64) % self.model_config.vocab_size,
] ]
req.image_size = recv_req.image_size req.image_size = recv_req.image_size
( (
......
...@@ -13,10 +13,25 @@ See the License for the specific language governing permissions and ...@@ -13,10 +13,25 @@ See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
""" """
# Source: https://github.com/haotian-liu/LLaVA/blob/main/llava/mm_utils.py # Source: https://github.com/LLaVA-VL/LLaVA-NeXT/blob/main/llava/mm_utils.py
"""
Utilities for multi-modal models.
This python file mainly contains utilities that were used in the
image processing logic of llava-next including operations such as
anyres and anyres_max
Currently supports the anyres and anyres_max operation for CLIP and
SigLip. For more information, you may refer to the paper or the blog
LLaVA-NeXT : https://llava-vl.github.io/blog/2024-01-30-llava-next/
LLaVA-Onevision : https://arxiv.org/pdf/2408.03326
"""
import ast import ast
import base64 import base64
import math import math
import re
from io import BytesIO from io import BytesIO
import numpy as np import numpy as np
...@@ -40,10 +55,13 @@ def select_best_resolution(original_size, possible_resolutions): ...@@ -40,10 +55,13 @@ def select_best_resolution(original_size, possible_resolutions):
min_wasted_resolution = float("inf") min_wasted_resolution = float("inf")
for width, height in possible_resolutions: for width, height in possible_resolutions:
# Calculate the downscaled size to keep the aspect ratio
scale = min(width / original_width, height / original_height) scale = min(width / original_width, height / original_height)
downscaled_width, downscaled_height = int(original_width * scale), int( downscaled_width, downscaled_height = int(original_width * scale), int(
original_height * scale original_height * scale
) )
# Calculate effective and wasted resolutions
effective_resolution = min( effective_resolution = min(
downscaled_width * downscaled_height, original_width * original_height downscaled_width * downscaled_height, original_width * original_height
) )
...@@ -129,6 +147,26 @@ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): ...@@ -129,6 +147,26 @@ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
Returns: Returns:
tuple: The shape of the image patch grid in the format (width, height). tuple: The shape of the image patch grid in the format (width, height).
""" """
if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
assert patch_size in [
224,
336,
384,
448,
512,
], "patch_size should be in [224, 336, 384, 448, 512]"
# Use regex to extract the range from the input string
matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
range_start = tuple(map(int, matches[0]))
range_end = tuple(map(int, matches[-1]))
# Generate a matrix of tuples from (range_start[0], range_start[1]) to (range_end[0], range_end[1])
grid_pinpoints = [
(i, j)
for i in range(range_start[0], range_end[0] + 1)
for j in range(range_start[1], range_end[1] + 1)
]
# Multiply all elements by patch_size
grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]
if type(grid_pinpoints) is list: if type(grid_pinpoints) is list:
possible_resolutions = grid_pinpoints possible_resolutions = grid_pinpoints
else: else:
...@@ -149,6 +187,31 @@ def process_anyres_image(image, processor, grid_pinpoints): ...@@ -149,6 +187,31 @@ def process_anyres_image(image, processor, grid_pinpoints):
Returns: Returns:
np.array: An np array containing the processed image patches. np.array: An np array containing the processed image patches.
""" """
if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
try:
patch_size = processor.size[0]
except Exception as e:
patch_size = processor.size["shortest_edge"]
assert patch_size in [
224,
336,
384,
448,
512,
], "patch_size should be in [224, 336, 384, 448, 512]"
# Use regex to extract the range from the input string
matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
range_start = tuple(map(int, matches[0]))
range_end = tuple(map(int, matches[-1]))
# Generate a matrix of tuples from (range_start[0], range_start[1]) to (range_end[0], range_end[1])
grid_pinpoints = [
(i, j)
for i in range(range_start[0], range_end[0] + 1)
for j in range(range_start[1], range_end[1] + 1)
]
# Multiply all elements by patch_size
grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]
if type(grid_pinpoints) is list: if type(grid_pinpoints) is list:
possible_resolutions = grid_pinpoints possible_resolutions = grid_pinpoints
else: else:
...@@ -156,15 +219,24 @@ def process_anyres_image(image, processor, grid_pinpoints): ...@@ -156,15 +219,24 @@ def process_anyres_image(image, processor, grid_pinpoints):
best_resolution = select_best_resolution(image.size, possible_resolutions) best_resolution = select_best_resolution(image.size, possible_resolutions)
image_padded = resize_and_pad_image(image, best_resolution) image_padded = resize_and_pad_image(image, best_resolution)
patches = divide_to_patches(image_padded, processor.crop_size["height"]) # For Siglip processor, only have size but no crop size
crop_size = (
image_original_resize = image.resize( processor.crop_size["height"]
(processor.size["shortest_edge"], processor.size["shortest_edge"]) if "crop_size" in processor.__dict__
else processor.size["height"]
) )
shortest_edge = (
processor.size["shortest_edge"]
if "shortest_edge" in processor.size
else processor.size["height"]
)
patches = divide_to_patches(image_padded, crop_size)
image_original_resize = image.resize((shortest_edge, shortest_edge))
image_patches = [image_original_resize] + patches image_patches = [image_original_resize] + patches
image_patches = [ image_patches = [
processor.preprocess(image_patch)["pixel_values"][0] processor.preprocess(image_patch.convert("RGB"))["pixel_values"][0]
for image_patch in image_patches for image_patch in image_patches
] ]
return np.stack(image_patches, axis=0) return np.stack(image_patches, axis=0)
...@@ -255,7 +327,7 @@ def process_images(images, image_processor, model_cfg): ...@@ -255,7 +327,7 @@ def process_images(images, image_processor, model_cfg):
) )
image = image_processor.preprocess(image)["pixel_values"][0] image = image_processor.preprocess(image)["pixel_values"][0]
new_images.append(image) new_images.append(image)
elif image_aspect_ratio == "anyres": elif "anyres" in image_aspect_ratio:
for image in images: for image in images:
image = process_anyres_image( image = process_anyres_image(
image, image_processor, model_cfg.image_grid_pinpoints image, image_processor, model_cfg.image_grid_pinpoints
......
...@@ -88,14 +88,19 @@ class InputMetadata: ...@@ -88,14 +88,19 @@ class InputMetadata:
reqs = batch.reqs reqs = batch.reqs
self.pixel_values = [r.pixel_values for r in reqs] self.pixel_values = [r.pixel_values for r in reqs]
self.image_sizes = [r.image_size for r in reqs] self.image_sizes = [r.image_size for r in reqs]
self.image_offsets = [ self.image_offsets = []
( for r in reqs:
(r.image_offset - batch.prefix_lens_cpu[i]) if isinstance(r.image_offset, list):
if r.image_offset is not None self.image_offsets.append(
else 0 [
) (image_offset - len(r.prefix_indices))
for i, r in enumerate(reqs) for image_offset in r.image_offset
] ]
)
elif isinstance(r.image_offset, int):
self.image_offsets.append(r.image_offset - len(r.prefix_indices))
elif r.image_offset is None:
self.image_offsets.append(0)
def compute_positions(self, batch: ScheduleBatch): def compute_positions(self, batch: ScheduleBatch):
position_ids_offsets = batch.position_ids_offsets position_ids_offsets = batch.position_ids_offsets
......
...@@ -15,6 +15,8 @@ limitations under the License. ...@@ -15,6 +15,8 @@ limitations under the License.
"""Inference-only LLaVa model compatible with HuggingFace weights.""" """Inference-only LLaVa model compatible with HuggingFace weights."""
import math
import re
from typing import Iterable, List, Optional, Tuple from typing import Iterable, List, Optional, Tuple
import numpy as np import numpy as np
...@@ -26,6 +28,8 @@ from transformers import ( ...@@ -26,6 +28,8 @@ from transformers import (
LlavaConfig, LlavaConfig,
MistralConfig, MistralConfig,
Qwen2Config, Qwen2Config,
SiglipVisionConfig,
SiglipVisionModel,
) )
from transformers.models.llava.modeling_llava import LlavaMultiModalProjector from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
from vllm.config import CacheConfig from vllm.config import CacheConfig
...@@ -63,34 +67,61 @@ class LlavaLlamaForCausalLM(nn.Module): ...@@ -63,34 +67,61 @@ class LlavaLlamaForCausalLM(nn.Module):
) )
def pad_input_ids(self, input_ids, pad_value, pt_shape=None, image_size=None): def pad_input_ids(self, input_ids, pad_value, pt_shape=None, image_size=None):
new_image_feature_len = self.image_feature_len
# now only support spatial_unpad + anyres # hardcode for spatial_unpad + anyres
if self.mm_patch_merge_type.startswith("spatial"): image_aspect_ratio = "anyres" if len(image_size) == 1 else "pad"
offset_list = []
for image_s in image_size:
if len(image_size) > 16:
# 2x2 pooling with stride 2
new_image_feature_len = (
math.ceil(self.image_size / self.patch_size / 2) ** 2
)
else:
new_image_feature_len = self.image_feature_len # multiimage
height = width = self.num_patches_per_side height = width = self.num_patches_per_side
if pt_shape[0] > 1: if "anyres" in image_aspect_ratio:
if self.image_aspect_ratio == "anyres": num_patch_width, num_patch_height = get_anyres_image_grid_shape(
num_patch_width, num_patch_height = get_anyres_image_grid_shape( image_s,
image_size, self.image_grid_pinpoints,
self.image_grid_pinpoints, self.vision_tower.config.image_size,
self.vision_tower.config.image_size, )
h = num_patch_height * height
w = num_patch_width * width
new_h, new_w = unpad_image_shape(h, w, image_s)
if "anyres_max" in self.config.image_aspect_ratio:
matched_anyres_max_num_patches = re.match(
r"anyres_max_(\d+)", self.config.image_aspect_ratio
)
if matched_anyres_max_num_patches:
max_num_patches = int(matched_anyres_max_num_patches.group(1))
# times = math.sqrt(h * w / (max_num_patches * unit**2))
times = math.sqrt(
new_h * new_w / (max_num_patches * self.image_feature_len)
) )
if "unpad" in self.mm_patch_merge_type: if times > 1.1:
h = num_patch_height * height new_h = int(new_h // times)
w = num_patch_width * width new_w = int(new_w // times)
new_h, new_w = unpad_image_shape(h, w, image_size) new_image_feature_len += new_h * (new_w + 1)
new_image_feature_len += new_h * (new_w + 1)
pad_ids = pad_value * (
pad_ids = pad_value * ( (new_image_feature_len + len(pad_value)) // len(pad_value)
(new_image_feature_len + len(pad_value)) // len(pad_value) )
) # print("calculated new_image_feature_len: ", new_image_feature_len)
offset = input_ids.index(self.config.image_token_index) try:
# old_len + pad_len - 1, because we need to remove image_token_id offset = input_ids.index(self.config.image_token_index)
new_input_ids = ( except ValueError:
input_ids[:offset] offset = 0
+ pad_ids[:new_image_feature_len] # old_len + pad_len - 1, because we need to remove image_token_id
+ input_ids[offset + 1 :] input_ids = (
) input_ids[:offset]
return new_input_ids, offset + pad_ids[:new_image_feature_len]
+ input_ids[offset + 1 :]
)
offset_list.append(offset)
return input_ids, offset_list
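As a rough worked example of the per-frame token budget computed in `pad_input_ids` above, assuming a SigLIP-style tower with a 384-pixel input and 14-pixel patches (an assumption, not stated in this diff): a video request with more than 16 frames uses the 2x2-pooled count, while a single image keeps the full count.

```python
import math

image_size, patch_size = 384, 14  # assumed SigLIP-so400m-like dimensions
single_image_tokens = (image_size // patch_size) ** 2             # 27 * 27 = 729
video_frame_tokens = math.ceil(image_size / patch_size / 2) ** 2  # 14 * 14 = 196 after 2x2 pooling
```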
def encode_images(self, pixel_values: torch.Tensor) -> torch.Tensor: def encode_images(self, pixel_values: torch.Tensor) -> torch.Tensor:
image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
...@@ -124,7 +155,6 @@ class LlavaLlamaForCausalLM(nn.Module): ...@@ -124,7 +155,6 @@ class LlavaLlamaForCausalLM(nn.Module):
# Embed text input # Embed text input
input_embeds = self.language_model.model.embed_tokens(input_ids) input_embeds = self.language_model.model.embed_tokens(input_ids)
# Embed vision input # Embed vision input
need_vision = ( need_vision = (
(positions[input_metadata.extend_start_loc] < self.image_feature_len) (positions[input_metadata.extend_start_loc] < self.image_feature_len)
...@@ -163,27 +193,73 @@ class LlavaLlamaForCausalLM(nn.Module): ...@@ -163,27 +193,73 @@ class LlavaLlamaForCausalLM(nn.Module):
if self.mm_patch_merge_type.startswith("spatial"): if self.mm_patch_merge_type.startswith("spatial"):
new_image_features = [] new_image_features = []
height = width = self.num_patches_per_side
for image_idx, image_feature in enumerate(image_features): for image_idx, image_feature in enumerate(image_features):
if image_feature.shape[0] > 1: if len(image_sizes[image_idx]) == 1:
image_aspect_ratio = (
self.config.image_aspect_ratio
) # single image
else:
image_aspect_ratio = "pad" # multi image
# image_aspect_ratio = (
# "anyres" if len(image_sizes[image_idx]) == 1 else "pad"
# )
if (
image_feature.shape[0] > 1
and "anyres" in image_aspect_ratio
):
base_image_feature = image_feature[0] base_image_feature = image_feature[0]
image_feature = image_feature[1:] image_feature = image_feature[1:]
height = width = self.num_patches_per_side
assert height * width == base_image_feature.shape[0] assert height * width == base_image_feature.shape[0]
if self.image_aspect_ratio == "anyres":
( if "anyres_max" in image_aspect_ratio:
num_patch_width, matched_anyres_max_num_patches = re.match(
num_patch_height, r"anyres_max_(\d+)", image_aspect_ratio
) = get_anyres_image_grid_shape(
image_sizes[image_idx],
self.image_grid_pinpoints,
self.vision_tower.config.image_size,
) )
if matched_anyres_max_num_patches:
max_num_patches = int(
matched_anyres_max_num_patches.group(1)
)
if (
image_aspect_ratio == "anyres"
or "anyres_max" in image_aspect_ratio
):
vision_tower_image_size = self.image_size
try:
num_patch_width, num_patch_height = (
get_anyres_image_grid_shape(
image_sizes[image_idx][0],
self.config.image_grid_pinpoints,
vision_tower_image_size,
)
)
except Exception as e:
print(f"Error: {e}")
num_patch_width, num_patch_height = 2, 2
image_feature = image_feature.view( image_feature = image_feature.view(
num_patch_height, num_patch_width, height, width, -1 num_patch_height, num_patch_width, height, width, -1
) )
else: else:
raise NotImplementedError() image_feature = image_feature.view(
2, 2, height, width, -1
)
# (
# num_patch_width,
# num_patch_height,
# ) = get_anyres_image_grid_shape(
# image_sizes[image_idx][0],
# self.image_grid_pinpoints,
# self.vision_tower.config.image_size,
# )
# image_feature = image_feature.view(
# num_patch_height, num_patch_width, height, width, -1
# )
if "unpad" in self.mm_patch_merge_type: if "unpad" in self.mm_patch_merge_type:
unit = image_feature.shape[2]
image_feature = image_feature.permute( image_feature = image_feature.permute(
4, 0, 2, 1, 3 4, 0, 2, 1, 3
).contiguous() ).contiguous()
...@@ -191,8 +267,23 @@ class LlavaLlamaForCausalLM(nn.Module): ...@@ -191,8 +267,23 @@ class LlavaLlamaForCausalLM(nn.Module):
2, 3 2, 3
) )
image_feature = unpad_image( image_feature = unpad_image(
image_feature, image_sizes[image_idx] image_feature, image_sizes[image_idx][0]
) )
if (
"anyres_max" in image_aspect_ratio
and matched_anyres_max_num_patches
):
c, h, w = image_feature.shape
times = math.sqrt(
h * w / (max_num_patches * unit**2)
)
if times > 1.1:
image_feature = image_feature[None]
image_feature = nn.functional.interpolate(
image_feature,
[int(h // times), int(w // times)],
mode="bilinear",
)[0]
image_feature = torch.cat( image_feature = torch.cat(
( (
image_feature, image_feature,
...@@ -213,16 +304,31 @@ class LlavaLlamaForCausalLM(nn.Module): ...@@ -213,16 +304,31 @@ class LlavaLlamaForCausalLM(nn.Module):
image_feature = torch.cat( image_feature = torch.cat(
(base_image_feature, image_feature), dim=0 (base_image_feature, image_feature), dim=0
) )
image_feature = image_feature.unsqueeze(0)
else: else:
image_feature = image_feature[0] if image_feature.shape[0] > 16: # video
if "unpad" in self.mm_patch_merge_type: # 2x2 pooling
image_feature = torch.cat( num_of_frames = image_feature.shape[0]
( image_feature = image_feature.view(
image_feature, num_of_frames, height, width, -1
self.language_model.model.image_newline[None],
),
dim=0,
) )
image_feature = image_feature.permute(
0, 3, 1, 2
).contiguous() # N, C, H, W
height, width = image_feature.shape[2:]
scaled_shape = [
math.ceil(height / 2),
math.ceil(width / 2),
]
image_feature = nn.functional.interpolate(
image_feature, size=scaled_shape, mode="bilinear"
)
image_feature = (
image_feature.flatten(2)
.transpose(1, 2)
.contiguous()
) # N, C, H*W
new_image_features.append(image_feature) new_image_features.append(image_feature)
image_features = new_image_features image_features = new_image_features
...@@ -233,21 +339,22 @@ class LlavaLlamaForCausalLM(nn.Module): ...@@ -233,21 +339,22 @@ class LlavaLlamaForCausalLM(nn.Module):
continue continue
start_idx = extend_start_loc_cpu[i] start_idx = extend_start_loc_cpu[i]
pad_len, pad_dim = image_features[pt].shape # 576, 4096 pad_dim = image_features[pt].shape[-1] # 576, 4096
dim = input_embeds.shape[1] dim = input_embeds.shape[1]
assert ( assert (
pad_dim == dim pad_dim == dim
), "invalid pad_dim={}, input_embed_dim={}!".format(pad_dim, dim) ), "invalid pad_dim={}, input_embed_dim={}!".format(pad_dim, dim)
# Fill in the placeholder for the image # Fill in the placeholder for the image
try: try:
input_embeds[ for j, image_off in enumerate(image_offsets[i]):
start_idx # print("actual image_features length: ", image_features[pt][j].shape[0])
+ image_offsets[i] : start_idx pad_len = image_features[pt][j].shape[0]
+ image_offsets[i] input_embeds[
+ pad_len start_idx + image_off : start_idx + image_off + pad_len
] = image_features[pt] ] = image_features[pt][j]
except RuntimeError as e: except RuntimeError as e:
print(f"RuntimeError in llava image encoding: {e}") print(f"RuntimeError in llava image encoding: {e}")
print(image_features[pt].shape)
print(input_embeds.shape) print(input_embeds.shape)
print(start_idx, image_offsets[i]) print(start_idx, image_offsets[i])
pt += 1 pt += 1
...@@ -262,9 +369,16 @@ class LlavaLlamaForCausalLM(nn.Module): ...@@ -262,9 +369,16 @@ class LlavaLlamaForCausalLM(nn.Module):
# load clip vision model by cfg['mm_vision_tower']: # load clip vision model by cfg['mm_vision_tower']:
# huggingface_name or path_of_clip_relative_to_llava_model_dir # huggingface_name or path_of_clip_relative_to_llava_model_dir
vision_path = self.config.mm_vision_tower vision_path = self.config.mm_vision_tower
self.vision_tower = CLIPVisionModel.from_pretrained( if "clip" in vision_path:
vision_path, torch_dtype=torch.float16 self.vision_tower = CLIPVisionModel.from_pretrained(
).cuda() vision_path, torch_dtype=torch.float16
).cuda()
elif "siglip" in vision_path:
self.vision_tower = SiglipVisionModel.from_pretrained(
vision_path, torch_dtype=torch.float16
).cuda()
# Siglip needs all feature tokens
self.config.mm_vision_select_feature = "full"
self.vision_tower.eval() self.vision_tower.eval()
self.vision_feature_layer = self.config.mm_vision_select_layer self.vision_feature_layer = self.config.mm_vision_select_layer
...@@ -276,8 +390,11 @@ class LlavaLlamaForCausalLM(nn.Module): ...@@ -276,8 +390,11 @@ class LlavaLlamaForCausalLM(nn.Module):
self.image_aspect_ratio = getattr(self.config, "image_aspect_ratio", "square") self.image_aspect_ratio = getattr(self.config, "image_aspect_ratio", "square")
self.image_grid_pinpoints = getattr(self.config, "image_grid_pinpoints", None) self.image_grid_pinpoints = getattr(self.config, "image_grid_pinpoints", None)
self.image_feature_len = int((self.image_size / self.patch_size) ** 2) self.image_feature_len = int((self.image_size // self.patch_size) ** 2)
if self.vision_feature_select_strategy == "patch": if (
self.vision_feature_select_strategy == "patch"
or self.vision_feature_select_strategy == "full"
):
pass pass
elif self.vision_feature_select_strategy == "cls_patch": elif self.vision_feature_select_strategy == "cls_patch":
self.image_feature_len += 1 self.image_feature_len += 1
......
import base64
import io
import json import json
import os
import sys
import time
import unittest import unittest
import numpy as np
import openai import openai
import requests
from decord import VideoReader, cpu
from PIL import Image
from sglang.srt.hf_transformers_utils import get_tokenizer from sglang.srt.hf_transformers_utils import get_tokenizer
from sglang.srt.utils import kill_child_process from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server from sglang.test.test_utils import DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server
# python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --tokenizer-path lmms-lab/llavanext-qwen-siglip-tokenizer --port=30000 --host=127.0.0.1 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384
class TestOpenAIVisionServer(unittest.TestCase): class TestOpenAIVisionServer(unittest.TestCase):
@classmethod @classmethod
def setUpClass(cls): def setUpClass(cls):
cls.model = "liuhaotian/llava-v1.6-vicuna-7b" cls.model = "lmms-lab/llava-onevision-qwen2-0.5b-ov"
cls.base_url = DEFAULT_URL_FOR_UNIT_TEST cls.base_url = DEFAULT_URL_FOR_UNIT_TEST
cls.api_key = "sk-123456" cls.api_key = "sk-123456"
cls.process = popen_launch_server( cls.process = popen_launch_server(
...@@ -21,9 +31,11 @@ class TestOpenAIVisionServer(unittest.TestCase): ...@@ -21,9 +31,11 @@ class TestOpenAIVisionServer(unittest.TestCase):
api_key=cls.api_key, api_key=cls.api_key,
other_args=[ other_args=[
"--chat-template", "--chat-template",
"vicuna_v1.1", "chatml-llava",
"--tokenizer-path", "--tokenizer-path",
"llava-hf/llava-1.5-7b-hf", "lmms-lab/llavanext-qwen-siglip-tokenizer",
"--chunked-prefill-size",
"16384",
"--log-requests", "--log-requests",
], ],
) )
...@@ -68,6 +80,81 @@ class TestOpenAIVisionServer(unittest.TestCase): ...@@ -68,6 +80,81 @@ class TestOpenAIVisionServer(unittest.TestCase):
assert response.usage.completion_tokens > 0 assert response.usage.completion_tokens > 0
assert response.usage.total_tokens > 0 assert response.usage.total_tokens > 0
def prepare_video_messages(self, video_path):
max_frames_num = 32
vr = VideoReader(video_path, ctx=cpu(0))
total_frame_num = len(vr)
uniform_sampled_frames = np.linspace(
0, total_frame_num - 1, max_frames_num, dtype=int
)
frame_idx = uniform_sampled_frames.tolist()
frames = vr.get_batch(frame_idx).asnumpy()
base64_frames = []
for frame in frames:
pil_img = Image.fromarray(frame)
buff = io.BytesIO()
pil_img.save(buff, format="JPEG")
base64_str = base64.b64encode(buff.getvalue()).decode("utf-8")
base64_frames.append(base64_str)
messages = [{"role": "user", "content": []}]
frame_format = {
"type": "image_url",
"image_url": {"url": "data:image/jpeg;base64,{}"},
}
for base64_frame in base64_frames:
frame_format["image_url"]["url"] = "data:image/jpeg;base64,{}".format(
base64_frame
)
messages[0]["content"].append(frame_format.copy())
prompt = {"type": "text", "text": "Please describe the video in detail."}
messages[0]["content"].append(prompt)
return messages
def test_video_chat_completion(self):
url = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"
cache_dir = os.path.expanduser("~/.cache")
file_path = os.path.join(cache_dir, "jobs.mp4")
os.makedirs(cache_dir, exist_ok=True)
if not os.path.exists(file_path):
response = requests.get(url)
response.raise_for_status()
with open(file_path, "wb") as f:
f.write(response.content)
client = openai.Client(api_key=self.api_key, base_url=self.base_url)
messages = self.prepare_video_messages(file_path)
start_time = time.time()
video_request = client.chat.completions.create(
model="default",
messages=messages,
temperature=0,
max_tokens=1024,
stream=True,
)
print("-" * 30)
video_response = ""
for chunk in video_request:
if chunk.choices[0].delta.content is not None:
content = chunk.choices[0].delta.content
video_response += content
sys.stdout.write(content)
sys.stdout.flush()
print("-" * 30)
# Add assertions to validate the video response
self.assertIsNotNone(video_response)
self.assertGreater(len(video_response), 0)
def test_regex(self): def test_regex(self):
client = openai.Client(api_key=self.api_key, base_url=self.base_url) client = openai.Client(api_key=self.api_key, base_url=self.base_url)
......