Commit 469e903b authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.2' into v0.8.2-dev

parents 389ebcf7 25f560a6
......@@ -6,16 +6,15 @@ typically specific to a small subset of models.
import re
import types
from pathlib import PosixPath
from typing import Callable, List, Optional, Tuple, Union
from typing import Optional, Union
import torch
from PIL.Image import Image
from transformers import (AutoConfig, AutoTokenizer, BatchEncoding,
from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
GenerationConfig)
from vllm.sequence import SampleLogprobs
from vllm.transformers_utils.tokenizer import patch_padding_side
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
from .....conftest import HfRunner, ImageAsset, _ImageAssets
from .types import RunnerOutput
......@@ -49,7 +48,7 @@ def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput,
def qwen_vllm_to_hf_output(
vllm_output: RunnerOutput,
model: str) -> Tuple[List[int], str, Optional[SampleLogprobs]]:
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
"""Sanitize vllm output [qwen models] to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
......@@ -60,7 +59,7 @@ def qwen_vllm_to_hf_output(
def qwen2_vllm_to_hf_output(
vllm_output: RunnerOutput,
model: str) -> Tuple[List[int], str, Optional[SampleLogprobs]]:
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
"""Sanitize vllm output [qwen2 models] to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
......@@ -78,7 +77,7 @@ def llava_image_vllm_to_hf_output(vllm_output: RunnerOutput,
def llava_video_vllm_to_hf_output(
vllm_output: RunnerOutput,
model: str) -> Tuple[List[int], str, Optional[SampleLogprobs]]:
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
config = AutoConfig.from_pretrained(model)
mm_token_id = config.video_token_index
return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)
......@@ -211,43 +210,9 @@ def get_llava_embeddings(image_assets: _ImageAssets):
return [asset.image_embeds for asset in image_assets]
####### postprocessors to run on HF BatchEncoding
def cast_dtype_post_processor(
hf_inp_key: str) -> Callable[[BatchEncoding, str], BatchEncoding]:
"""Gets a handle to a post processor which converts a given key into a
target data type."""
def process(hf_inputs: BatchEncoding, dtype: str):
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
hf_inputs[hf_inp_key] = hf_inputs[hf_inp_key].to(torch_dtype)
return hf_inputs
return process
def ignore_inputs_post_processor(
hf_inp_key: str) -> Callable[[BatchEncoding, str], BatchEncoding]:
"""Gets a handle to a post processor which ignores a given key."""
def process(hf_inputs: BatchEncoding, dtype: str):
del hf_inputs[hf_inp_key]
return hf_inputs
return process
def wrap_inputs_post_processor(hf_inputs: BatchEncoding, dtype: str):
return {"model_inputs": hf_inputs}
def molmo_post_processor(hf_inputs: BatchEncoding, dtype: str):
hf_inputs = cast_dtype_post_processor("images")(hf_inputs, dtype)
return {k: v.unsqueeze(0) for k, v in hf_inputs.items()}
####### Prompt path encoders for models that need models on disk
def qwen_prompt_path_encoder(
tmp_path: PosixPath, prompt: str, assets: Union[List[ImageAsset],
tmp_path: PosixPath, prompt: str, assets: Union[list[ImageAsset],
_ImageAssets]) -> str:
"""Given a temporary dir path, export one or more image assets into the
tempdir & replace its contents with the local path to the string so that
......@@ -257,7 +222,7 @@ def qwen_prompt_path_encoder(
Args:
tmp_path: Tempdir for test under consideration.
prompt: Prompt with image placeholders.
assets: List of image assets whose len equals the num placeholders.
assets: list of image assets whose len equals the num placeholders.
"""
# Ensure that the number of placeholders matches the number of assets;
# If this is not true, the test is probably written incorrectly.
......@@ -295,8 +260,7 @@ def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
for k in inputs.keys() # noqa
if k not in ("seq_lens", "sft_format")
}
inputs = BatchEncoding(data=inputs, tensor_type="pt")
return inputs
return BatchFeature(data=inputs, tensor_type="pt")
hf_model.processor = processor
hf_model.model.get_output_embeddings = lambda: \
......@@ -304,8 +268,20 @@ def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
return hf_model
def glm_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
"""Patches and returns an instance of the HfRunner to use for GLM4."""
def gemma3_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
"""Patches and returns an instance of the HfRunner to use for Gemma 3."""
hf_processor = hf_model.processor
def processor(*args, **kwargs):
return hf_processor(*args, do_pan_and_scan=True, **kwargs)
hf_model.processor = processor
return hf_model
def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
"""Patches and returns an instance of the HfRunner to use for GLM4V."""
hf_processor = hf_model.processor
patch_padding_side(hf_processor)
......@@ -313,12 +289,20 @@ def glm_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
if images is None:
return hf_processor(*args, **kwargs)
images = [images] if isinstance(images, Image) else images
contents = re.findall(
r"<\|begin_of_image\|><\|endoftext\|><\|end_of_image\|>(.*?)<\|assistant\|>",
text,
)
assert len(contents) == len(images)
return hf_processor.apply_chat_template(
[{
"role": "user",
"image": images,
"content": text
}],
"image": image,
"content": content
} for image, content in zip(images, contents)],
add_generation_prompt=True,
tokenize=True,
return_dict=True,
......@@ -350,7 +334,7 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self.max_num = self.config.max_dynamic_patch
self.image_size = self.vision_config.image_size
def __call__(self, text: str, images: Union[Image, List[Image]],
def __call__(self, text: str, images: Union[Image, list[Image]],
**kwargs):
# yapf: disable
from vllm.model_executor.models.h2ovl import (
......@@ -410,7 +394,7 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self.max_num = self.config.max_dynamic_patch
self.image_size = self.vision_config.image_size
def __call__(self, text: str, images: Union[Image, List[Image]],
def __call__(self, text: str, images: Union[Image, list[Image]],
**kwargs):
from vllm.model_executor.models.internvl import (
IMG_CONTEXT, IMG_END, IMG_START,
......@@ -509,10 +493,52 @@ def mantis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
return hf_model
def minicpmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
def minicpmv_25_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
orig_generate = hf_model.model.generate
def _generate(self, *args, **kwargs):
def _generate(
self,
*args,
input_ids=None,
pixel_values=None,
image_sizes=None,
image_bound=None,
tgt_sizes=None,
**kwargs,
):
model_inputs = {
"input_ids": input_ids,
"pixel_values": pixel_values,
"image_sizes": image_sizes,
"image_bound": image_bound,
"tgt_sizes": tgt_sizes,
}
for k in list(model_inputs.keys()):
if model_inputs[k] is None:
model_inputs.pop(k)
return orig_generate(model_inputs, *args, decode_text=False, **kwargs)
hf_model.model.generate = types.MethodType(_generate, hf_model.model)
return hf_model
def minicpmo_26_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
orig_generate = hf_model.model.generate
def _generate(self, *args, image_sizes=None, **kwargs):
return orig_generate(*args, decode_text=False, **kwargs)
hf_model.model.generate = types.MethodType(_generate, hf_model.model)
return hf_model
def minicpmv_26_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
orig_generate = hf_model.model.generate
def _generate(self, *args, image_sizes=None, **kwargs):
return orig_generate(*args, decode_text=False, **kwargs)
hf_model.model.generate = types.MethodType(_generate, hf_model.model)
......@@ -531,10 +557,11 @@ def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
def _generate(self, max_new_tokens=None, do_sample=None, **kwargs):
batch = {
k: kwargs.pop(k)
k: kwargs.pop(k).unsqueeze(0)
for k in ("input_ids", "images", "image_input_idx", "image_masks")
if k in kwargs
}
batch = BatchFeature(batch).to(dtype=self.dtype)
return self.generate_from_batch(
batch,
......
......@@ -3,7 +3,6 @@
types / modalities.
"""
from pathlib import PosixPath
from typing import Type
from .....conftest import HfRunner, VllmRunner, _ImageAssets, _VideoAssets
from . import builders, core
......@@ -13,8 +12,8 @@ from .types import ExpandableVLMTestArgs, VLMTestInfo
####### Entrypoints for running different test types
def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: _ImageAssets):
assert test_case.size_wrapper is not None
inputs = builders.build_single_image_inputs_from_test_info(
......@@ -36,8 +35,8 @@ def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: _ImageAssets):
assert test_case.size_wrapper is not None
inputs = builders.build_multi_image_inputs_from_test_info(
......@@ -59,8 +58,8 @@ def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
def run_embedding_test(*, model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: _ImageAssets):
assert test_case.size_wrapper is not None
inputs, vllm_embeddings = builders.build_embedding_inputs_from_test_info(
......@@ -85,8 +84,8 @@ def run_video_test(
*,
model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
video_assets: _VideoAssets,
):
assert test_case.size_wrapper is not None
......@@ -111,8 +110,8 @@ def run_video_test(
def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner]):
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner]):
# Custom test cases can provide inputs directly, but they need to
# explicitly provided a CustomTestConfig, which wraps the inputs and
# the limit_mm_per_prompt
......
# SPDX-License-Identifier: Apache-2.0
"""Types for writing multimodal model tests."""
from collections.abc import Iterable
from enum import Enum
from pathlib import PosixPath
from typing import (Any, Callable, Dict, Iterable, List, NamedTuple, Optional,
Tuple, Type, Union)
from typing import Any, Callable, NamedTuple, Optional, Union
import torch
from PIL.Image import Image
from pytest import MarkDecorator
from transformers import AutoModelForCausalLM, BatchEncoding
from transformers import AutoModelForCausalLM
from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm.config import TaskOption
from vllm.sequence import SampleLogprobs
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils import identity
from .....conftest import IMAGE_ASSETS, HfRunner, ImageAsset, _ImageAssets
from ....utils import check_logprobs_close
......@@ -35,7 +34,7 @@ VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?"
IMAGE_SIZE_FACTORS = [(), (1.0, ), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
EMBEDDING_SIZE_FACTORS = [(), (1.0, ), (1.0, 1.0, 1.0)]
RunnerOutput = Tuple[List[int], str, Optional[SampleLogprobs]]
RunnerOutput = tuple[list[int], str, Optional[SampleLogprobs]]
# yapf: enable
......@@ -53,8 +52,8 @@ class SizeType(Enum):
class CustomTestOptions(NamedTuple):
inputs: List[Tuple[List[str], List[Union[List[Image], Image]]]]
limit_mm_per_prompt: Dict[str, int]
inputs: list[tuple[list[str], list[Union[list[Image], Image]]]]
limit_mm_per_prompt: dict[str, int]
# kwarg to pass multimodal data in as to vllm/hf runner instances.
runner_mm_key: str = "images"
......@@ -63,13 +62,13 @@ class ImageSizeWrapper(NamedTuple):
type: SizeType
# A size factor is a wrapper of 0+ floats,
# while a fixed size contains an iterable of integer pairs
data: Union[Iterable[float], Iterable[Tuple[int, int]]]
data: Union[Iterable[float], Iterable[tuple[int, int]]]
class VLMTestInfo(NamedTuple):
"""Holds the configuration for 1+ tests for one model architecture."""
models: List[str]
models: list[str]
test_type: Union[VLMTestType, Iterable[VLMTestType]]
# Should be None only if this is a CUSTOM_INPUTS test
......@@ -97,24 +96,19 @@ class VLMTestInfo(NamedTuple):
max_num_seqs: int = 256
task: TaskOption = "auto"
tensor_parallel_size: int = 1
vllm_runner_kwargs: Optional[Dict[str, Any]] = None
vllm_runner_kwargs: Optional[dict[str, Any]] = None
# Optional callable which gets a list of token IDs from the model tokenizer
get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]] = None
# Optional list of strings to stop generation, useful when stop tokens are
# not special tokens in the tokenizer
stop_str: Optional[List[str]] = None
stop_str: Optional[list[str]] = None
# Exposed options for HF runner
hf_model_kwargs: Optional[Dict[str, Any]] = None
hf_model_kwargs: Optional[dict[str, Any]] = None
# Indicates we should explicitly pass the EOS from the tokenizer
use_tokenizer_eos: bool = False
auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM
# Callable to pass to the HF runner to run on inputs; for now, we also pass
# the data type to input post processing, because almost all of the uses of
# postprocess_inputs are to fix the data types of BatchEncoding values.
postprocess_inputs: Callable[[BatchEncoding, str],
BatchEncoding] = identity
auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM
patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]] = None
# Post processors that if defined, will run oun the outputs of the
......@@ -128,12 +122,12 @@ class VLMTestInfo(NamedTuple):
# Default expandable params per test; these defaults can be overridden in
# instances of this object; the complete set of test cases for the model
# is all combinations of .models + all fields below
max_tokens: Union[int, Tuple[int]] = 128
num_logprobs: Union[int, Tuple[int]] = 5
dtype: Union[str, Iterable[str]] = "half"
max_tokens: Union[int, tuple[int]] = 128
num_logprobs: Union[int, tuple[int]] = 5
dtype: Union[str, Union[list[str], tuple[str, ...]]] = "auto"
distributed_executor_backend: Optional[Union[str, Iterable[str]]] = None
# Only expanded in video tests
num_video_frames: Union[int, Tuple[int]] = 16
num_video_frames: Union[int, tuple[int]] = 16
# Fixed image sizes / image size factors; most tests use image_size_factors
# The values provided for these two fields will be stacked and expanded
......@@ -141,19 +135,19 @@ class VLMTestInfo(NamedTuple):
# once per tests (much like concatenating and wrapping in one parametrize
# call)
image_size_factors: Iterable[Iterable[float]] = IMAGE_SIZE_FACTORS
image_sizes: Optional[Iterable[Iterable[Tuple[int, int]]]] = None
image_sizes: Optional[Iterable[Iterable[tuple[int, int]]]] = None
# Hack for updating a prompt to take into a local path; currently only used
# for Qwen-VL, which requires encoding the image path / url into the prompt
# for HF runner
prompt_path_encoder: Optional[
Callable[[PosixPath, str, Union[List[ImageAsset], _ImageAssets]],
Callable[[PosixPath, str, Union[list[ImageAsset], _ImageAssets]],
str]] = None # noqa: E501
# Allows configuring a test to run with custom inputs
custom_test_opts: Optional[List[CustomTestOptions]] = None
custom_test_opts: Optional[list[CustomTestOptions]] = None
marks: Optional[List[MarkDecorator]] = None
marks: Optional[list[MarkDecorator]] = None
def get_non_parametrized_runner_kwargs(self):
"""Returns a dictionary of expandable kwargs for items that are used
......@@ -171,7 +165,6 @@ class VLMTestInfo(NamedTuple):
"vllm_output_post_proc": self.vllm_output_post_proc,
"auto_cls": self.auto_cls,
"use_tokenizer_eos": self.use_tokenizer_eos,
"postprocess_inputs": self.postprocess_inputs,
"comparator": self.comparator,
"get_stop_token_ids": self.get_stop_token_ids,
"hf_model_kwargs": self.hf_model_kwargs,
......
......@@ -9,6 +9,8 @@ import torch
from transformers import AutoModelForSequenceClassification
from ....utils import models_path_prefix
from vllm.platforms import current_platform
@pytest.mark.parametrize(
"model",
......@@ -17,24 +19,24 @@ from ....utils import models_path_prefix
marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
],
)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("dtype",
["half"] if current_platform.is_rocm() else ["float"])
def test_classification_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
monkeypatch,
) -> None:
if current_platform.is_rocm():
# ROCm Triton FA does not currently support sliding window attention
# switch to use ROCm CK FA backend
monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")
with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.classify(example_prompts)
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
def print_model(model):
print(model)
vllm_model.apply_model(print_model)
with hf_runner(model,
dtype=dtype,
auto_cls=AutoModelForSequenceClassification) as hf_model:
......@@ -45,4 +47,8 @@ def test_classification_models(
hf_output = torch.tensor(hf_output)
vllm_output = torch.tensor(vllm_output)
assert torch.allclose(hf_output, vllm_output, 1e-3)
# the tolerance value of 1e-2 is selected based on the
# half datatype tests in
# tests/models/embedding/language/test_embedding.py
assert torch.allclose(hf_output, vllm_output,
1e-3 if dtype == "float" else 1e-2)
......@@ -7,10 +7,11 @@ import os
import pytest
from vllm.config import PoolerConfig
from ....utils import models_path_prefix
from vllm.platforms import current_platform
from ..utils import check_embeddings_close
from vllm.platforms import current_platform
@pytest.mark.parametrize(
......@@ -21,15 +22,15 @@ from vllm.platforms import current_platform
marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
pytest.param(os.path.join(models_path_prefix, "sentence-transformers/all-MiniLM-L12-v2")),
pytest.param(os.path.join(models_path_prefix, "intfloat/multilingual-e5-large")),
pytest.param(os.path.join(models_path_prefix, "Alibaba-NLP/gte-Qwen2-7B-instruct")),
# [Decoder-only]
pytest.param(os.path.join(models_path_prefix, "BAAI/bge-multilingual-gemma2"),
marks=[pytest.mark.core_model]),
pytest.param(os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct"),
marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
pytest.param(os.path.join(models_path_prefix, "Alibaba-NLP/gte-Qwen2-1.5B-instruct")),
pytest.param(os.path.join(models_path_prefix, "Alibaba-NLP/gte-Qwen2-7B-instruct")),
pytest.param(os.path.join(models_path_prefix, "ssmits/Qwen2-7B-Instruct-embed-base")),
# [Encoder-decoder]
# [Cross-Encoder]
pytest.param(os.path.join(models_path_prefix, "sentence-transformers/stsb-roberta-base-v2")),
],
)
......@@ -44,13 +45,21 @@ def test_models(
example_prompts,
model,
dtype: str,
monkeypatch,
) -> None:
if model == "BAAI/bge-multilingual-gemma2" and current_platform.is_rocm():
# ROCm Triton FA does not currently support sliding window attention
# switch to use ROCm CK FA backend
monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")
vllm_extra_kwargs = {}
if model == os.path.join(models_path_prefix, "ssmits/Qwen2-7B-Instruct-embed-base"):
vllm_extra_kwargs["override_pooler_config"] = \
PoolerConfig(pooling_type="MEAN")
if model == os.path.join(models_path_prefix, "Alibaba-NLP/gte-Qwen2-7B-instruct"):
vllm_extra_kwargs["hf_overrides"] = {"is_causal": False}
vllm_extra_kwargs["hf_overrides"] = {"is_causal": True}
# The example_prompts has ending "\n", for example:
# "Write a short story about a robot that dreams for the first time.\n"
......@@ -71,13 +80,6 @@ def test_models(
**vllm_extra_kwargs) as vllm_model:
vllm_outputs = vllm_model.encode(example_prompts)
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
def print_model(model):
print(model)
vllm_model.apply_model(print_model)
check_embeddings_close(
embeddings_0_lst=hf_outputs,
embeddings_1_lst=vllm_outputs,
......
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import importlib.util
import math
from array import array
from typing import List
import os
import openai
......@@ -13,13 +13,14 @@ from scipy.spatial.distance import cosine
import vllm
import vllm.config
from vllm.utils import STR_BACKEND_ENV_VAR
from ....utils import RemoteOpenAIServer
from ....utils import models_path_prefix
# GritLM embedding implementation is only supported by XFormers backend.
pytest.mark.skipif(not importlib.util.find_spec("xformers"),
reason="GritLM requires XFormers")
pytestmark = pytest.mark.skipif(not importlib.util.find_spec("xformers"),
reason="GritLM requires XFormers")
MODEL_NAME = os.path.join(models_path_prefix, "parasail-ai/GritLM-7B-vllm")
MAX_MODEL_LEN = 4000
......@@ -32,36 +33,34 @@ def _arr(arr):
return array("i", arr)
def test_find_array(monkeypatch):
def test_find_array(monkeypatch: pytest.MonkeyPatch):
# GritLM embedding implementation is only supported by XFormers backend.
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
from vllm.model_executor.models.gritlm import GritLMPooler
from vllm.model_executor.models.gritlm import GritLMPooler
# Create an LLM object to get the model config.
llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN)
pooler = GritLMPooler(model_config=llm.llm_engine.model_config)
# Create an LLM object to get the model config.
llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN)
pooler = GritLMPooler(model_config=llm.llm_engine.model_config)
arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1
assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1
assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1
with pytest.raises(ValueError):
pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1)
with pytest.raises(ValueError):
pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1)
@pytest.fixture(scope="module")
def server_embedding():
# GritLM embedding implementation is only supported by XFormers backend.
with pytest.MonkeyPatch.context() as mp:
mp.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest.fixture(scope="module")
......@@ -72,9 +71,12 @@ def server_generate():
@pytest_asyncio.fixture
async def client_embedding(server_embedding: RemoteOpenAIServer):
async with server_embedding.get_async_client() as async_client:
yield async_client
async def client_embedding(monkeypatch: pytest.MonkeyPatch,
server_embedding: RemoteOpenAIServer):
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
async with server_embedding.get_async_client() as async_client:
yield async_client
@pytest_asyncio.fixture
......@@ -83,14 +85,20 @@ async def client_generate(server_generate: RemoteOpenAIServer):
yield async_client
def run_llm_encode(llm: vllm.LLM, queries: List[str],
instruction: str) -> List[float]:
def run_llm_encode(
llm: vllm.LLM,
queries: list[str],
instruction: str,
) -> list[float]:
outputs = llm.encode([instruction + q for q in queries], )
return [output.outputs.embedding for output in outputs]
async def run_client_embeddings(client: vllm.LLM, queries: List[str],
instruction: str) -> List[float]:
async def run_client_embeddings(
client: vllm.LLM,
queries: list[str],
instruction: str,
) -> list[float]:
outputs = await client.embeddings.create(
model=MODEL_NAME,
input=[instruction + q for q in queries],
......@@ -109,7 +117,7 @@ def get_test_data():
README.md in https://github.com/ContextualAI/gritlm
"""
q_instruction = gritlm_instruction(
"Given a scientific paper title, retrieve the paper's abstract")
"Given a scientific paper title, retrieve the paper's abstract", )
queries = [
"Bitcoin: A Peer-to-Peer Electronic Cash System",
"Generative Representational Instruction Tuning",
......@@ -125,7 +133,7 @@ def get_test_data():
return queries, q_instruction, documents, d_instruction
def validate_embed_output(q_rep: List[float], d_rep: List[float]):
def validate_embed_output(q_rep: list[float], d_rep: list[float]):
cosine_sim_q0_d0 = 1 - cosine(q_rep[0], d_rep[0])
assert math.isclose(cosine_sim_q0_d0, 0.609, abs_tol=0.001)
......@@ -139,31 +147,32 @@ def validate_embed_output(q_rep: List[float], d_rep: List[float]):
assert math.isclose(cosine_sim_q1_d1, 0.532, abs_tol=0.001)
def test_gritlm_offline_embedding(monkeypatch):
def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch):
# GritLM embedding implementation is only supported by XFormers backend.
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
queries, q_instruction, documents, d_instruction = get_test_data()
queries, q_instruction, documents, d_instruction = get_test_data()
llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN)
llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN)
d_rep = run_llm_encode(
llm,
documents,
d_instruction,
)
q_rep = run_llm_encode(
llm,
queries,
q_instruction,
)
d_rep = run_llm_encode(
llm,
documents,
d_instruction,
)
q_rep = run_llm_encode(
llm,
queries,
q_instruction,
)
validate_embed_output(q_rep, d_rep)
validate_embed_output(q_rep, d_rep)
@pytest.mark.asyncio
async def test_gritlm_api_server_embedding(
client_embedding: openai.AsyncOpenAI):
client_embedding: openai.AsyncOpenAI, ):
queries, q_instruction, documents, d_instruction = get_test_data()
d_rep = await run_client_embeddings(
......
# SPDX-License-Identifier: Apache-2.0
from typing import List, Sequence
from collections.abc import Sequence
import torch
import torch.nn.functional as F
......@@ -8,8 +8,8 @@ import torch.nn.functional as F
def check_embeddings_close(
*,
embeddings_0_lst: Sequence[List[float]],
embeddings_1_lst: Sequence[List[float]],
embeddings_0_lst: Sequence[list[float]],
embeddings_1_lst: Sequence[list[float]],
name_0: str,
name_1: str,
tol: float = 1e-3,
......
# SPDX-License-Identifier: Apache-2.0
from functools import partial
from typing import Callable, Dict, List, Type
from typing import Callable
import os
import pytest
import torch
import torch.nn.functional as F
from PIL import Image
from transformers import BatchEncoding, Qwen2VLForConditionalGeneration
from transformers import Qwen2VLForConditionalGeneration
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ....utils import large_gpu_test, models_path_prefix
......@@ -68,7 +68,7 @@ def get_messages(image: Image.Image, text: str, embed_text: bool):
def apply_chat_template_and_add_eos(
messages: List[Dict],
messages: list[dict],
apply_chat_template_fn: Callable,
):
prompt = apply_chat_template_fn(
......@@ -76,16 +76,12 @@ def apply_chat_template_and_add_eos(
return prompt
def postprocess_inputs(hf_model: HfRunner, inputs: BatchEncoding, **kwargs):
return hf_model.model.prepare_inputs_for_generation(**inputs, **kwargs)
def _run_test(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
input_texts: List[str],
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
input_texts: list[str],
input_images: PromptImageInput,
embed_texts: List[bool],
embed_texts: list[bool],
model: str,
*,
dtype: str,
......@@ -119,14 +115,8 @@ def _run_test(
with hf_runner(model,
dtype=dtype,
auto_cls=Qwen2VLForConditionalGeneration) as hf_model:
hf_model.postprocess_inputs = partial(
postprocess_inputs,
hf_model,
cache_position=torch.arange(
0,
1, # 1 for batch size
requires_grad=False),
use_cache=False)
prompts = []
for text, image, embed_text in zip(input_texts, input_images,
embed_texts):
# dse requires non-standard input processing
......@@ -134,20 +124,34 @@ def _run_test(
messages = get_messages(image, text, embed_text)
prompt = apply_chat_template_and_add_eos(
messages, hf_model.processor.apply_chat_template)
inputs = hf_model.get_inputs(
prompts=[[prompt]],
images=[[image]],
)
with torch.no_grad():
prompts.append(prompt)
all_inputs = hf_model.get_inputs(
prompts=prompts,
images=input_images,
)
with torch.no_grad():
all_outputs = []
for inputs in all_inputs:
inputs = hf_model.model.prepare_inputs_for_generation(
**inputs,
cache_position=torch.arange(1), # 1 for batch size
use_cache=False,
)
outputs = hf_model.model(
**hf_model.wrap_device(inputs[0],
device=hf_model.model.device.type),
**hf_model.wrap_device(inputs),
return_dict=True,
output_hidden_states=True,
)
pooled_output = torch.nn.functional.normalize(
outputs.hidden_states[-1][0, -1], p=2, dim=-1)
hf_outputs.append(pooled_output.tolist())
pooled_output = F.normalize(outputs.hidden_states[-1][0, -1],
p=2,
dim=-1)
all_outputs.append(pooled_output.tolist())
hf_outputs = all_outputs
check_embeddings_close(
embeddings_0_lst=hf_outputs,
......
# SPDX-License-Identifier: Apache-2.0
from typing import List, Type
import os
import pytest
import torch.nn.functional as F
from transformers import AutoModelForVision2Seq
from transformers import AutoModelForImageTextToText
from vllm.platforms import current_platform
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ....utils import large_gpu_test, models_path_prefix
from ..utils import check_embeddings_close
# Llava Next embedding implementation is only supported by CUDA.
# If run on ROCm, hf_model.model.resize_token_embeddings will
# cause the following error:
# RuntimeError: Calling torch.linalg.cholesky on a CUDA tensor
# requires compiling PyTorch with MAGMA. Please use PyTorch
# built with MAGMA support.
# If run on CPU, hf_model.model.resize_token_embeddings will
# cause the following error:
# RuntimeError: Calling torch.linalg.cholesky on a CPU tensor
# requires compiling PyTorch with LAPACK. Please use PyTorch
# built with LAPACK support.
pytestmark = pytest.mark.skipif(
not current_platform.is_cuda(),
reason="Llava Next model uses op that is only supported in CUDA")
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501
HF_TEXT_PROMPTS = [
......@@ -36,9 +51,9 @@ MODELS = [os.path.join(models_path_prefix, "royokong/e5-v")]
def _run_test(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
input_texts: List[str],
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
input_texts: list[str],
input_images: PromptImageInput,
model: str,
*,
......@@ -56,7 +71,7 @@ def _run_test(
vllm_outputs = vllm_model.encode(input_texts, images=input_images)
with hf_runner(model, dtype=dtype,
auto_cls=AutoModelForVision2Seq) as hf_model:
auto_cls=AutoModelForImageTextToText) as hf_model:
# Patch the issue where generation_config.json is missing
hf_model.processor.patch_size = \
hf_model.model.config.vision_config.patch_size
......@@ -72,8 +87,7 @@ def _run_test(
for inputs in all_inputs:
# Based on: https://huggingface.co/royokong/e5-v
outputs = hf_model.model(
**hf_model.wrap_device(inputs,
device=hf_model.model.device.type),
**hf_model.wrap_device(inputs),
return_dict=True,
output_hidden_states=True,
)
......
# SPDX-License-Identifier: Apache-2.0
from typing import List, Type
import os
import pytest
import torch.nn.functional as F
......@@ -30,9 +28,9 @@ MODELS = [os.path.join(models_path_prefix, "TIGER-Lab/VLM2Vec-Full")]
def _run_test(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
input_texts: List[str],
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
input_texts: list[str],
input_images: PromptImageInput,
model: str,
*,
......@@ -56,8 +54,7 @@ def _run_test(
for inputs in all_inputs:
# Based on: https://github.com/TIGER-AI-Lab/VLM2Vec/blob/db3b951bccabba220c1f53ab46a734e50dd2fc08/src/model.py
outputs = hf_model.model(
**hf_model.wrap_device(inputs,
device=hf_model.model.device.type),
**hf_model.wrap_device(inputs),
return_dict=True,
output_hidden_states=True,
)
......
......@@ -10,7 +10,7 @@ import pytest
from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset
from ....utils import fork_new_process_for_each_test, multi_gpu_test
from ....utils import create_new_process_for_each_test, multi_gpu_test
PROMPTS = [
{
......@@ -119,7 +119,7 @@ def run_test(
assert output.outputs[0].text == expected
@fork_new_process_for_each_test
@create_new_process_for_each_test()
@pytest.mark.core_model
@pytest.mark.parametrize(
"model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"])
......
......@@ -4,7 +4,7 @@
Run `pytest tests/models/encoder_decoder/language/test_bart.py`.
"""
import os
from typing import List, Optional, Tuple, Type
from typing import Optional
import pytest
from transformers import AutoModelForSeq2SeqLM
......@@ -19,7 +19,7 @@ from ....utils import models_path_prefix
def vllm_to_hf_output(
vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]],
vllm_output: tuple[list[int], str, Optional[SampleLogprobs]],
decoder_prompt_type: DecoderPromptType,
):
"""Sanitize vllm output to be comparable with hf output."""
......@@ -33,9 +33,9 @@ def vllm_to_hf_output(
def run_test(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
decoder_prompt_type: DecoderPromptType,
model: str,
*,
......
# SPDX-License-Identifier: Apache-2.0
from functools import partial
from typing import List, Optional, Tuple, Type
from typing import Optional
import os
import pytest
from PIL import Image
from vllm.inputs.data import ExplicitEncoderDecoderPrompt
from vllm.inputs.data import ExplicitEncoderDecoderPrompt, TextPrompt
from vllm.multimodal.image import rescale_image_size
from vllm.sequence import SampleLogprobs
from ....conftest import HfRunner, VllmRunner
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
from ...utils import check_logprobs_close
from ....utils import models_path_prefix
Florence2Prompt = partial(ExplicitEncoderDecoderPrompt,
decoder_prompt=None,
mm_processor_kwargs=None)
MODELS = [os.path.join(models_path_prefix, "microsoft/Florence-2-base")]
# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
# Therefore, we borrow the BartTokenizer from the original Bart model
TOKENIZER = os.path.join(models_path_prefix, "facebook/bart-base")
PROMPTS = [
Florence2Prompt(encoder_prompt="<CAPTION>"),
Florence2Prompt(encoder_prompt="<DETAILED_CAPTION>"),
Florence2Prompt(encoder_prompt="<MORE_DETAILED_CAPTION>"),
Florence2Prompt(encoder_prompt="<CAPTION_TO_PHRASE_GROUNDING>"),
Florence2Prompt(encoder_prompt="<DENSE_REGION_CAPTION>"),
Florence2Prompt(encoder_prompt="<REGION_PROPOSAL>"),
Florence2Prompt(encoder_prompt="<OCR_WITH_REGION>"),
Florence2Prompt(encoder_prompt="<OCR>"),
Florence2Prompt(encoder_prompt="<OD>"),
]
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
"<CAPTION>", # special task token
"cherry_blossom":
"Describe in detail what is shown in the image.",
})
def get_hf_images_prompts(
prompts_: list[ExplicitEncoderDecoderPrompt[str, TextPrompt]],
) -> tuple[list[ExplicitEncoderDecoderPrompt[str, str]], list[Image.Image]]:
prompts, images = [], []
for prompt in prompts_:
encoder_prompt = prompt["encoder_prompt"]
prompts.append(
ExplicitEncoderDecoderPrompt(
encoder_prompt=encoder_prompt["prompt"],
decoder_prompt=None,
))
images.append(encoder_prompt["multi_modal_data"]["image"])
return prompts, images
def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
Optional[SampleLogprobs]], ):
"""Sanitize vllm output to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
hf_output_str = "</s><s>" + output_str + "</s>"
def hf_to_vllm_output(hf_output: tuple[list[int], str,
Optional[SampleLogprobs]]):
"""Sanitize hf output to be comparable with vllm output."""
output_ids, output_str, out_logprobs = hf_output
return output_ids, hf_output_str, out_logprobs
output_str = output_str.replace("</s>", "").replace("<s>", "")
output_ids = [ids for ids in output_ids if ids not in [0, 2]]
return output_ids, output_str, out_logprobs
def run_test(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
prompts: List[ExplicitEncoderDecoderPrompt],
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
inputs: list[list[ExplicitEncoderDecoderPrompt]],
model: str,
*,
dtype: str,
......@@ -58,46 +65,76 @@ def run_test(
distributed_executor_backend: Optional[str] = None,
) -> None:
with vllm_runner(model,
max_num_seqs=8,
tokenizer_name=TOKENIZER,
dtype=dtype,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True) as vllm_model:
vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
prompts, max_tokens, num_logprobs)
vllm_outputs_per_case = [
vllm_model.generate_encoder_decoder_greedy_logprobs(
prompts, max_tokens, num_logprobs=num_logprobs)
for prompts in inputs
]
hf_inputs = [get_hf_images_prompts(prompts) for prompts in inputs]
# Florence-2 processors require image inputs
dummy_image = Image.new(mode="RGB", size=(2, 2))
with hf_runner(model, dtype=dtype, skip_tokenizer_init=True) as hf_model:
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.language_model.lm_head
hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit(
prompts,
max_tokens,
num_logprobs,
images=[dummy_image] * len(prompts),
))
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=[
vllm_to_hf_output(vllm_output) for vllm_output in vllm_outputs
],
name_0="hf",
name_1="vllm",
)
hf_outputs_per_case = [
hf_model.generate_encoder_decoder_greedy_logprobs_limit(
prompts, max_tokens, num_logprobs=num_logprobs, images=images)
for prompts, images in hf_inputs
]
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
vllm_outputs_per_case):
check_logprobs_close(
outputs_0_lst=[hf_to_vllm_output(output) for output in hf_outputs],
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
@pytest.mark.core_model
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float", "bfloat16"])
@pytest.mark.parametrize(
"size_factors",
[
# No image
[],
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
],
)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, model, dtype, max_tokens,
num_logprobs) -> None:
def test_models(hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
image_assets: _ImageAssets, model: str,
size_factors: list[int], dtype: str, max_tokens: int,
num_logprobs: int) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_image = [[
ExplicitEncoderDecoderPrompt(
encoder_prompt=TextPrompt(
prompt=prompt,
multi_modal_data={"image": rescale_image_size(image, factor)}),
decoder_prompt=None,
) for factor in size_factors
] for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
run_test(
hf_runner,
vllm_runner,
PROMPTS,
inputs_per_image,
model,
dtype=dtype,
max_tokens=max_tokens,
......
# SPDX-License-Identifier: Apache-2.0
from typing import List, Optional, Tuple, Type, overload
from typing import Optional, overload
import os
import pytest
import torch
from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
BatchEncoding)
from transformers import AutoConfig, AutoModelForImageTextToText, AutoTokenizer
from vllm import LLM, SamplingParams
from vllm.attention.backends.flash_attn import FlashAttentionMetadata
......@@ -18,6 +17,7 @@ from vllm.sequence import SampleLogprobs
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
_ImageAssets)
from ....quantization.utils import is_quant_method_supported
from ....utils import large_gpu_test
from ...utils import check_logprobs_close
from ....utils import models_path_prefix
......@@ -66,7 +66,7 @@ prompt_data = {
}
def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
def vllm_to_hf_output(vllm_output: tuple[list[int], str,
Optional[SampleLogprobs]],
model: str):
"""Sanitize vllm output to be comparable with hf output."""
......@@ -93,9 +93,9 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
def _get_inputs(
image_assets: _ImageAssets,
*,
size_factors: Optional[List[float]] = None,
sizes: Optional[List[Tuple[int, int]]] = None,
) -> List[Tuple[List[str], PromptImageInput]]:
size_factors: Optional[list[float]] = None,
sizes: Optional[list[tuple[int, int]]] = None,
) -> list[tuple[list[str], PromptImageInput]]:
images = [asset.pil_image for asset in image_assets]
if size_factors is not None:
......@@ -125,12 +125,12 @@ def _get_inputs(
@overload
def run_test(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: _ImageAssets,
model: str,
*,
size_factors: List[float],
size_factors: list[float],
dtype: str,
max_tokens: int,
num_logprobs: int,
......@@ -142,12 +142,12 @@ def run_test(
@overload
def run_test(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: _ImageAssets,
model: str,
*,
sizes: List[Tuple[int, int]],
sizes: list[tuple[int, int]],
dtype: str,
max_tokens: int,
num_logprobs: int,
......@@ -158,13 +158,13 @@ def run_test(
def run_test(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: _ImageAssets,
model: str,
*,
size_factors: Optional[List[float]] = None,
sizes: Optional[List[Tuple[int, int]]] = None,
size_factors: Optional[list[float]] = None,
sizes: Optional[list[tuple[int, int]]] = None,
dtype: str,
max_tokens: int,
num_logprobs: int,
......@@ -185,9 +185,9 @@ def run_test(
def _run_test(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
inputs: List[Tuple[List[str], PromptImageInput]],
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
inputs: list[tuple[list[str], PromptImageInput]],
model: str,
*,
dtype: str,
......@@ -217,7 +217,6 @@ def _run_test(
max_num_seqs=2,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True,
limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT
}) as vllm_model:
vllm_outputs_per_image = [
......@@ -228,14 +227,10 @@ def _run_test(
for prompts, images in inputs
]
def process(hf_inputs: BatchEncoding, **kwargs):
return hf_inputs
with hf_runner(model,
dtype=dtype,
model_kwargs={"device_map": "auto"},
postprocess_inputs=process,
auto_cls=AutoModelForVision2Seq) as hf_model:
auto_cls=AutoModelForImageTextToText) as hf_model:
hf_outputs_per_image = [
hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens,
......@@ -399,6 +394,49 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model,
)
@large_gpu_test(min_gb=48)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["float16"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
def test_bnb_regression(
image_assets: _ImageAssets,
model: str,
dtype: str,
max_tokens: int,
):
stop_sign = image_assets[0].pil_image
prompts = [
{
"prompt": "<|begin_of_text|>The content of the image <|image|> is",
"multi_modal_data": {
"image": stop_sign
},
},
{
"prompt":
"The color of the sky is blue but sometimes it can also be",
},
]
# Test regression about QKVCrossParallelLinear
llm = LLM(
model=model,
dtype=dtype,
max_model_len=4096,
max_num_seqs=2,
quantization="bitsandbytes",
load_format="bitsandbytes",
)
sampling_params = SamplingParams(
temperature=0,
max_tokens=max_tokens,
)
outputs = llm.generate(prompts, sampling_params)
assert outputs
@large_gpu_test(min_gb=48)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
......@@ -443,7 +481,6 @@ def test_explicit_implicit_prompt(
max_model_len=4096,
max_num_seqs=2,
tensor_parallel_size=1,
enforce_eager=True,
)
sampling_params = SamplingParams(
temperature=0,
......@@ -475,14 +512,14 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,
max_model_len=4096,
max_num_seqs=2,
tensor_parallel_size=1,
enforce_eager=True,
limit_mm_per_prompt={"image":
_LIMIT_IMAGE_PER_PROMPT}) as vllm_model:
# Regression tests for https://github.com/vllm-project/vllm/issues/10648
# Number of image tags is greater than the number of images provided
prompt = "<|begin_of_text|><|image|><|image|> Compare the two images" # noqa: E501
# Number of groups of image tokens is greater than the number of images
# provided (the whitespace between the tags is necessary)
prompt = "<|begin_of_text|><|image|> <|image|> Compare the two images" # noqa: E501
image = stop_sign
with pytest.raises(ValueError):
vllm_model.generate_greedy_logprobs([prompt],
......
[[[1784, 3937, 6122, 1261, 7244, 10575, 28528, 1408, 1261, 32656, 11237, 1044, 7283, 2015, 1454, 1261, 38462, 4818, 1046, 2], "The image shows a black dog lying on a wooden floor, looking up with a curious expression.", [{"1784": {"logprob": -0.4740446209907532, "rank": 1, "decoded_token": "The"}, "1065": {"logprob": -1.0990445613861084, "rank": 2, "decoded_token": "A"}, "4380": {"logprob": -3.3490445613861084, "rank": 3, "decoded_token": "This"}, "1785": {"logprob": -5.0990447998046875, "rank": 4, "decoded_token": "In"}, "11745": {"logprob": -6.4740447998046875, "rank": 5, "decoded_token": "Here"}}, {"3937": {"logprob": -0.06349722295999527, "rank": 1, "decoded_token": " image"}, "7244": {"logprob": -2.813497304916382, "rank": 2, "decoded_token": " black"}, "16649": {"logprob": -7.563497066497803, "rank": 3, "decoded_token": " photo"}, "18390": {"logprob": -7.688497066497803, "rank": 4, "decoded_token": " photograph"}, "10575": {"logprob": -8.438497543334961, "rank": 5, "decoded_token": " dog"}}, {"6122": {"logprob": -0.25453490018844604, "rank": 1, "decoded_token": " shows"}, "6971": {"logprob": -1.8795349597930908, "rank": 2, "decoded_token": " features"}, "51948": {"logprob": -2.754534959793091, "rank": 3, "decoded_token": " depicts"}, "25981": {"logprob": -5.629534721374512, "rank": 4, "decoded_token": " displays"}, "1395": {"logprob": -6.129534721374512, "rank": 5, "decoded_token": " is"}}, {"1261": {"logprob": -0.0001245659514097497, "rank": 1, "decoded_token": " a"}, "1420": {"logprob": -9.00012493133545, "rank": 2, "decoded_token": " an"}, "1278": {"logprob": -14.25012493133545, "rank": 3, "decoded_token": " the"}, "7244": {"logprob": -14.87512493133545, "rank": 4, "decoded_token": " black"}, "1925": {"logprob": -16.125123977661133, "rank": 5, "decoded_token": " one"}}, {"7244": {"logprob": -0.009403933770954609, "rank": 1, "decoded_token": " black"}, "6231": {"logprob": -5.259403705596924, "rank": 2, "decoded_token": " close"}, "16450": {"logprob": -6.759403705596924, "rank": 3, "decoded_token": " sle"}, "8500": {"logprob": -7.009403705596924, "rank": 4, "decoded_token": " dark"}, "4329": {"logprob": -7.696903705596924, "rank": 5, "decoded_token": " large"}}, {"10575": {"logprob": -0.7522680163383484, "rank": 1, "decoded_token": " dog"}, "119075": {"logprob": -1.0022680759429932, "rank": 2, "decoded_token": " Labrador"}, "116572": {"logprob": -1.8772680759429932, "rank": 3, "decoded_token": " puppy"}, "8636": {"logprob": -5.627267837524414, "rank": 4, "decoded_token": " lab"}, "15812": {"logprob": -5.814767837524414, "rank": 5, "decoded_token": " Lab"}}, {"28528": {"logprob": -0.2941223084926605, "rank": 1, "decoded_token": " lying"}, "7283": {"logprob": -2.1691222190856934, "rank": 2, "decoded_token": " looking"}, "1454": {"logprob": -2.5441222190856934, "rank": 3, "decoded_token": " with"}, "60700": {"logprob": -3.2941222190856934, "rank": 4, "decoded_token": " laying"}, "18970": {"logprob": -4.794122219085693, "rank": 5, "decoded_token": " sitting"}}, {"1408": {"logprob": -0.3170951306819916, "rank": 1, "decoded_token": " on"}, "3151": {"logprob": -1.317095160484314, "rank": 2, "decoded_token": " down"}, "14038": {"logprob": -7.3170952796936035, "rank": 3, "decoded_token": " flat"}, "104248": {"logprob": -7.4420952796936035, "rank": 4, "decoded_token": " comfortably"}, "1321": {"logprob": -7.6920952796936035, "rank": 5, "decoded_token": " and"}}, {"1261": {"logprob": -0.08228635042905807, "rank": 1, "decoded_token": " a"}, "2246": {"logprob": -3.2072863578796387, "rank": 2, "decoded_token": " its"}, "32656": {"logprob": -3.3322863578796387, "rank": 3, "decoded_token": " wooden"}, "3977": {"logprob": -6.957286357879639, "rank": 4, "decoded_token": " top"}, "1278": {"logprob": -7.207286357879639, "rank": 5, "decoded_token": " the"}}, {"32656": {"logprob": -0.03605202957987785, "rank": 1, "decoded_token": " wooden"}, "3403": {"logprob": -3.9110519886016846, "rank": 2, "decoded_token": " text"}, "44130": {"logprob": -4.911052227020264, "rank": 3, "decoded_token": " rust"}, "12603": {"logprob": -6.036052227020264, "rank": 4, "decoded_token": " wood"}, "8500": {"logprob": -6.473552227020264, "rank": 5, "decoded_token": " dark"}}, {"11237": {"logprob": -0.6433407068252563, "rank": 1, "decoded_token": " floor"}, "4691": {"logprob": -0.7683407068252563, "rank": 2, "decoded_token": " surface"}, "1615": {"logprob": -5.268340587615967, "rank": 3, "decoded_token": " pl"}, "3403": {"logprob": -6.018340587615967, "rank": 4, "decoded_token": " text"}, "18645": {"logprob": -7.143340587615967, "rank": 5, "decoded_token": " flo"}}, {"1044": {"logprob": -0.6826052665710449, "rank": 1, "decoded_token": ","}, "1321": {"logprob": -1.682605266571045, "rank": 2, "decoded_token": " and"}, "7283": {"logprob": -1.807605266571045, "rank": 3, "decoded_token": " looking"}, "1046": {"logprob": -2.682605266571045, "rank": 4, "decoded_token": "."}, "1454": {"logprob": -3.182605266571045, "rank": 5, "decoded_token": " with"}}, {"7283": {"logprob": -0.07239976525306702, "rank": 1, "decoded_token": " looking"}, "11589": {"logprob": -3.197399854660034, "rank": 2, "decoded_token": " gaz"}, "35542": {"logprob": -3.822399854660034, "rank": 3, "decoded_token": " staring"}, "1454": {"logprob": -6.384899616241455, "rank": 4, "decoded_token": " with"}, "22116": {"logprob": -6.572399616241455, "rank": 5, "decoded_token": " facing"}}, {"2015": {"logprob": -0.9646494388580322, "rank": 2, "decoded_token": " up"}, "7655": {"logprob": -0.9646494388580322, "rank": 1, "decoded_token": " directly"}, "74606": {"logprob": -2.0896494388580322, "rank": 3, "decoded_token": " upwards"}, "40022": {"logprob": -3.0896494388580322, "rank": 4, "decoded_token": " upward"}, "1935": {"logprob": -4.152149200439453, "rank": 5, "decoded_token": " int"}}, {"1454": {"logprob": -0.8447978496551514, "rank": 1, "decoded_token": " with"}, "1513": {"logprob": -1.2197978496551514, "rank": 2, "decoded_token": " at"}, "41132": {"logprob": -2.2197978496551514, "rank": 3, "decoded_token": " attent"}, "1935": {"logprob": -2.9697978496551514, "rank": 4, "decoded_token": " int"}, "7655": {"logprob": -3.0947978496551514, "rank": 5, "decoded_token": " directly"}}, {"1261": {"logprob": -0.7162021994590759, "rank": 1, "decoded_token": " a"}, "1420": {"logprob": -1.3412022590637207, "rank": 2, "decoded_token": " an"}, "41132": {"logprob": -2.2162022590637207, "rank": 3, "decoded_token": " attent"}, "2246": {"logprob": -3.2162022590637207, "rank": 4, "decoded_token": " its"}, "38462": {"logprob": -3.9662022590637207, "rank": 5, "decoded_token": " curious"}}, {"38462": {"logprob": -0.7836517095565796, "rank": 1, "decoded_token": " curious"}, "26517": {"logprob": -1.8461517095565796, "rank": 2, "decoded_token": " calm"}, "26905": {"logprob": -2.533651828765869, "rank": 3, "decoded_token": " gentle"}, "11304": {"logprob": -3.408651828765869, "rank": 4, "decoded_token": " serious"}, "97680": {"logprob": -3.596151828765869, "rank": 5, "decoded_token": " thoughtful"}}, {"4818": {"logprob": -0.047154705971479416, "rank": 1, "decoded_token": " expression"}, "1321": {"logprob": -3.922154664993286, "rank": 2, "decoded_token": " and"}, "1505": {"logprob": -4.047154903411865, "rank": 3, "decoded_token": " or"}, "22131": {"logprob": -4.797154903411865, "rank": 4, "decoded_token": " gaze"}, "1044": {"logprob": -9.047154426574707, "rank": 5, "decoded_token": ","}}, {"1046": {"logprob": -0.0008031480247154832, "rank": 1, "decoded_token": "."}, "1408": {"logprob": -7.250802993774414, "rank": 2, "decoded_token": " on"}, "1321": {"logprob": -10.500802993774414, "rank": 3, "decoded_token": " and"}, "1338": {"logprob": -11.000802993774414, "rank": 4, "decoded_token": ".\n\n"}, "3016": {"logprob": -11.500802993774414, "rank": 5, "decoded_token": " while"}}, {"2": {"logprob": -0.0008517451351508498, "rank": 1, "decoded_token": " "}, "1032": {"logprob": -7.125851631164551, "rank": 2, "decoded_token": " "}, "1256": {"logprob": -10.00085163116455, "rank": 3, "decoded_token": " The"}}]], [[1049, 1046, 1349, 7244, 10575, 1395, 28528, 1408, 1261, 32656, 11237, 1044, 7283, 2015, 1513, 1278, 13424, 1626, 1050, 1046, 1349, 10726, 1290, 3719, 1307, 122203, 35463, 1454, 11223, 1321, 95746, 24765, 2425, 1261, 6133, 21283, 1046, 2], "1. A black dog is lying on a wooden floor, looking up at the camera.\n2. A scenic view of rugged mountains with green and rocky terrain under a clear sky.", [{"1049": {"logprob": -0.05050129443407059, "rank": 1, "decoded_token": "1"}, "11745": {"logprob": -3.5505013465881348, "rank": 2, "decoded_token": "Here"}, "69957": {"logprob": -4.175501346588135, "rank": 3, "decoded_token": "Sure"}, "117991": {"logprob": -6.175501346588135, "rank": 4, "decoded_token": "Certain"}, "1045": {"logprob": -6.550501346588135, "rank": 5, "decoded_token": "-"}}, {"1046": {"logprob": -5.364403477869928e-06, "rank": 1, "decoded_token": "."}, "1041": {"logprob": -12.500005722045898, "rank": 2, "decoded_token": ")"}, "1058": {"logprob": -13.875005722045898, "rank": 3, "decoded_token": ":"}, "1044": {"logprob": -15.687505722045898, "rank": 4, "decoded_token": ","}, "1045": {"logprob": -15.875005722045898, "rank": 5, "decoded_token": "-"}}, {"1349": {"logprob": -0.4890742003917694, "rank": 1, "decoded_token": " A"}, "1531": {"logprob": -1.1140742301940918, "rank": 2, "decoded_token": " The"}, "1603": {"logprob": -3.364074230194092, "rank": 3, "decoded_token": " **"}, "1656": {"logprob": -4.364074230194092, "rank": 4, "decoded_token": " In"}, "2409": {"logprob": -4.989074230194092, "rank": 5, "decoded_token": " This"}}, {"7244": {"logprob": -0.08685152232646942, "rank": 1, "decoded_token": " black"}, "6231": {"logprob": -3.4618515968322754, "rank": 2, "decoded_token": " close"}, "16450": {"logprob": -3.5868515968322754, "rank": 3, "decoded_token": " sle"}, "4329": {"logprob": -4.899351596832275, "rank": 4, "decoded_token": " large"}, "8500": {"logprob": -5.399351596832275, "rank": 5, "decoded_token": " dark"}}, {"10575": {"logprob": -0.20338763296604156, "rank": 1, "decoded_token": " dog"}, "116572": {"logprob": -1.8283876180648804, "rank": 2, "decoded_token": " puppy"}, "119075": {"logprob": -3.95338773727417, "rank": 3, "decoded_token": " Labrador"}, "28404": {"logprob": -6.95338773727417, "rank": 4, "decoded_token": " pup"}, "8636": {"logprob": -7.07838773727417, "rank": 5, "decoded_token": " lab"}}, {"1395": {"logprob": -0.532414972782135, "rank": 1, "decoded_token": " is"}, "22524": {"logprob": -1.7824149131774902, "rank": 2, "decoded_token": " lies"}, "1454": {"logprob": -2.1574149131774902, "rank": 3, "decoded_token": " with"}, "10637": {"logprob": -3.2824149131774902, "rank": 4, "decoded_token": " looks"}, "28528": {"logprob": -3.4074149131774902, "rank": 5, "decoded_token": " lying"}}, {"28528": {"logprob": -0.4258010685443878, "rank": 1, "decoded_token": " lying"}, "7283": {"logprob": -1.6758010387420654, "rank": 2, "decoded_token": " looking"}, "60700": {"logprob": -2.9258010387420654, "rank": 3, "decoded_token": " laying"}, "38235": {"logprob": -3.6758010387420654, "rank": 4, "decoded_token": " resting"}, "18970": {"logprob": -3.6758010387420654, "rank": 5, "decoded_token": " sitting"}}, {"1408": {"logprob": -0.3588743805885315, "rank": 1, "decoded_token": " on"}, "3151": {"logprob": -1.2338743209838867, "rank": 2, "decoded_token": " down"}, "41132": {"logprob": -6.358874320983887, "rank": 3, "decoded_token": " attent"}, "14038": {"logprob": -6.546374320983887, "rank": 4, "decoded_token": " flat"}, "1321": {"logprob": -6.733874320983887, "rank": 5, "decoded_token": " and"}}, {"1261": {"logprob": -0.07801607996225357, "rank": 1, "decoded_token": " a"}, "2246": {"logprob": -2.9530160427093506, "rank": 2, "decoded_token": " its"}, "32656": {"logprob": -4.20301628112793, "rank": 3, "decoded_token": " wooden"}, "1278": {"logprob": -5.20301628112793, "rank": 4, "decoded_token": " the"}, "3977": {"logprob": -6.57801628112793, "rank": 5, "decoded_token": " top"}}, {"32656": {"logprob": -0.06541638821363449, "rank": 1, "decoded_token": " wooden"}, "3403": {"logprob": -3.4404163360595703, "rank": 2, "decoded_token": " text"}, "44130": {"logprob": -3.9404163360595703, "rank": 3, "decoded_token": " rust"}, "17253": {"logprob": -5.81541633605957, "rank": 4, "decoded_token": " weather"}, "12603": {"logprob": -5.94041633605957, "rank": 5, "decoded_token": " wood"}}, {"11237": {"logprob": -0.4574064016342163, "rank": 1, "decoded_token": " floor"}, "4691": {"logprob": -1.0824064016342163, "rank": 2, "decoded_token": " surface"}, "1615": {"logprob": -4.082406520843506, "rank": 3, "decoded_token": " pl"}, "3403": {"logprob": -5.207406520843506, "rank": 4, "decoded_token": " text"}, "28984": {"logprob": -6.582406520843506, "rank": 5, "decoded_token": " deck"}}, {"1044": {"logprob": -0.9594833850860596, "rank": 1, "decoded_token": ","}, "7283": {"logprob": -1.2094833850860596, "rank": 2, "decoded_token": " looking"}, "1321": {"logprob": -2.2094833850860596, "rank": 3, "decoded_token": " and"}, "1454": {"logprob": -2.4594833850860596, "rank": 4, "decoded_token": " with"}, "1626": {"logprob": -2.5844833850860596, "rank": 5, "decoded_token": ".\n"}}, {"7283": {"logprob": -0.15972694754600525, "rank": 1, "decoded_token": " looking"}, "11589": {"logprob": -2.534726858139038, "rank": 2, "decoded_token": " gaz"}, "35542": {"logprob": -2.909726858139038, "rank": 3, "decoded_token": " staring"}, "22116": {"logprob": -6.034727096557617, "rank": 4, "decoded_token": " facing"}, "1454": {"logprob": -6.409727096557617, "rank": 5, "decoded_token": " with"}}, {"2015": {"logprob": -0.894250750541687, "rank": 1, "decoded_token": " up"}, "7655": {"logprob": -1.269250750541687, "rank": 2, "decoded_token": " directly"}, "74606": {"logprob": -1.769250750541687, "rank": 3, "decoded_token": " upwards"}, "40022": {"logprob": -2.6442508697509766, "rank": 4, "decoded_token": " upward"}, "1935": {"logprob": -4.081750869750977, "rank": 5, "decoded_token": " int"}}, {"1513": {"logprob": -0.5085363388061523, "rank": 1, "decoded_token": " at"}, "1454": {"logprob": -1.5085363388061523, "rank": 2, "decoded_token": " with"}, "1626": {"logprob": -2.6335363388061523, "rank": 3, "decoded_token": ".\n"}, "1935": {"logprob": -3.3835363388061523, "rank": 4, "decoded_token": " int"}, "41132": {"logprob": -3.6335363388061523, "rank": 5, "decoded_token": " attent"}}, {"1278": {"logprob": -0.0010482537327334285, "rank": 1, "decoded_token": " the"}, "4433": {"logprob": -7.0010480880737305, "rank": 2, "decoded_token": " something"}, "2246": {"logprob": -10.25104808807373, "rank": 3, "decoded_token": " its"}, "1261": {"logprob": -10.25104808807373, "rank": 4, "decoded_token": " a"}, "1636": {"logprob": -10.50104808807373, "rank": 5, "decoded_token": " you"}}, {"13424": {"logprob": -0.0003800861886702478, "rank": 1, "decoded_token": " camera"}, "56268": {"logprob": -8.250380516052246, "rank": 2, "decoded_token": " viewer"}, "68439": {"logprob": -9.250380516052246, "rank": 3, "decoded_token": " photographer"}, "2965": {"logprob": -12.375380516052246, "rank": 4, "decoded_token": " person"}, "37967": {"logprob": -12.500380516052246, "rank": 5, "decoded_token": " ceiling"}}, {"1626": {"logprob": -0.34197133779525757, "rank": 1, "decoded_token": ".\n"}, "1454": {"logprob": -1.4669713973999023, "rank": 2, "decoded_token": " with"}, "1046": {"logprob": -3.3419713973999023, "rank": 3, "decoded_token": "."}, "1338": {"logprob": -3.9669713973999023, "rank": 4, "decoded_token": ".\n\n"}, "1935": {"logprob": -5.966971397399902, "rank": 5, "decoded_token": " int"}}, {"1050": {"logprob": -0.002148107625544071, "rank": 1, "decoded_token": "2"}, "1256": {"logprob": -6.877148151397705, "rank": 2, "decoded_token": " "}, "1293": {"logprob": -7.127148151397705, "rank": 3, "decoded_token": " "}, "1032": {"logprob": -8.252147674560547, "rank": 4, "decoded_token": " "}, "1049": {"logprob": -10.752147674560547, "rank": 5, "decoded_token": "1"}}, {"1046": {"logprob": -7.510157047363464e-06, "rank": 1, "decoded_token": "."}, "3590": {"logprob": -13.437507629394531, "rank": 2, "decoded_token": ".A"}, "1626": {"logprob": -13.437507629394531, "rank": 3, "decoded_token": ".\n"}, "48426": {"logprob": -13.687507629394531, "rank": 4, "decoded_token": ".The"}, "1044": {"logprob": -14.062507629394531, "rank": 5, "decoded_token": ","}}, {"1349": {"logprob": -0.2843300700187683, "rank": 1, "decoded_token": " A"}, "11826": {"logprob": -2.034330129623413, "rank": 2, "decoded_token": " Maj"}, "113465": {"logprob": -3.534330129623413, "rank": 3, "decoded_token": " Rug"}, "22468": {"logprob": -4.409329891204834, "rank": 4, "decoded_token": " Several"}, "1531": {"logprob": -4.534329891204834, "rank": 5, "decoded_token": " The"}}, {"10726": {"logprob": -1.3984904289245605, "rank": 1, "decoded_token": " scen"}, "122203": {"logprob": -1.7734904289245605, "rank": 2, "decoded_token": " rugged"}, "61082": {"logprob": -1.7734904289245605, "rank": 3, "decoded_token": " panor"}, "15375": {"logprob": -2.5234904289245605, "rank": 4, "decoded_token": " vast"}, "13770": {"logprob": -2.6484904289245605, "rank": 5, "decoded_token": " maj"}}, {"1290": {"logprob": -3.099436753473128e-06, "rank": 1, "decoded_token": "ic"}, "2981": {"logprob": -13.56250286102295, "rank": 2, "decoded_token": "ically"}, "1702": {"logprob": -14.31250286102295, "rank": 3, "decoded_token": "ice"}, "4965": {"logprob": -16.625003814697266, "rank": 4, "decoded_token": "etic"}, "4336": {"logprob": -16.687503814697266, "rank": 5, "decoded_token": "icro"}}, {"3719": {"logprob": -0.1252945065498352, "rank": 1, "decoded_token": " view"}, "28035": {"logprob": -2.8752944469451904, "rank": 2, "decoded_token": " landscape"}, "24361": {"logprob": -3.2502944469451904, "rank": 3, "decoded_token": " mountain"}, "127945": {"logprob": -5.1252946853637695, "rank": 4, "decoded_token": " mountainous"}, "1044": {"logprob": -5.3752946853637695, "rank": 5, "decoded_token": ","}}, {"1307": {"logprob": -0.09058280289173126, "rank": 1, "decoded_token": " of"}, "89995": {"logprob": -3.465582847595215, "rank": 2, "decoded_token": " showc"}, "6122": {"logprob": -3.715582847595215, "rank": 3, "decoded_token": " shows"}, "6971": {"logprob": -4.590582847595215, "rank": 4, "decoded_token": " features"}, "66583": {"logprob": -5.090582847595215, "rank": 5, "decoded_token": " captures"}}, {"122203": {"logprob": -0.5323622226715088, "rank": 1, "decoded_token": " rugged"}, "1261": {"logprob": -2.032362222671509, "rank": 2, "decoded_token": " a"}, "6245": {"logprob": -2.532362222671509, "rank": 3, "decoded_token": " multiple"}, "127945": {"logprob": -3.157362222671509, "rank": 4, "decoded_token": " mountainous"}, "35463": {"logprob": -3.532362222671509, "rank": 5, "decoded_token": " mountains"}}, {"35463": {"logprob": -0.6520033478736877, "rank": 1, "decoded_token": " mountains"}, "1044": {"logprob": -1.027003288269043, "rank": 2, "decoded_token": ","}, "24361": {"logprob": -2.527003288269043, "rank": 3, "decoded_token": " mountain"}, "127945": {"logprob": -3.902003288269043, "rank": 4, "decoded_token": " mountainous"}, "11223": {"logprob": -4.652003288269043, "rank": 5, "decoded_token": " green"}}, {"1454": {"logprob": -0.39697548747062683, "rank": 1, "decoded_token": " with"}, "13875": {"logprob": -2.146975517272949, "rank": 2, "decoded_token": " covered"}, "1321": {"logprob": -2.271975517272949, "rank": 3, "decoded_token": " and"}, "2425": {"logprob": -3.459475517272949, "rank": 4, "decoded_token": " under"}, "47948": {"logprob": -4.459475517272949, "rank": 5, "decoded_token": " stretching"}}, {"11223": {"logprob": -1.3947651386260986, "rank": 1, "decoded_token": " green"}, "24880": {"logprob": -1.8947651386260986, "rank": 2, "decoded_token": " varying"}, "95746": {"logprob": -2.0822651386260986, "rank": 3, "decoded_token": " rocky"}, "1295": {"logprob": -3.0197651386260986, "rank": 4, "decoded_token": " l"}, "19546": {"logprob": -3.0822651386260986, "rank": 5, "decoded_token": " varied"}}, {"1321": {"logprob": -0.8649212121963501, "rank": 1, "decoded_token": " and"}, "61263": {"logprob": -1.73992121219635, "rank": 2, "decoded_token": " slopes"}, "47260": {"logprob": -1.86492121219635, "rank": 3, "decoded_token": " vegetation"}, "50373": {"logprob": -1.98992121219635, "rank": 4, "decoded_token": " patches"}, "23170": {"logprob": -3.4899210929870605, "rank": 5, "decoded_token": " grass"}}, {"95746": {"logprob": -0.21662631630897522, "rank": 1, "decoded_token": " rocky"}, "22980": {"logprob": -1.9666262865066528, "rank": 2, "decoded_token": " brown"}, "26549": {"logprob": -3.8416264057159424, "rank": 3, "decoded_token": " gray"}, "4266": {"logprob": -4.216626167297363, "rank": 4, "decoded_token": " bar"}, "34052": {"logprob": -4.966626167297363, "rank": 5, "decoded_token": " grey"}}, {"24765": {"logprob": -0.32041722536087036, "rank": 1, "decoded_token": " terrain"}, "57912": {"logprob": -1.8204171657562256, "rank": 2, "decoded_token": " terrains"}, "61263": {"logprob": -2.6954171657562256, "rank": 3, "decoded_token": " slopes"}, "84497": {"logprob": -3.9454171657562256, "rank": 4, "decoded_token": " landscapes"}, "17764": {"logprob": -4.695417404174805, "rank": 5, "decoded_token": " surfaces"}}, {"2425": {"logprob": -0.4664109945297241, "rank": 1, "decoded_token": " under"}, "1046": {"logprob": -1.4664109945297241, "rank": 2, "decoded_token": "."}, "1044": {"logprob": -3.4664111137390137, "rank": 3, "decoded_token": ","}, "22923": {"logprob": -3.9664111137390137, "rank": 4, "decoded_token": " extending"}, "47948": {"logprob": -4.091411113739014, "rank": 5, "decoded_token": " stretching"}}, {"1261": {"logprob": -0.015043734572827816, "rank": 1, "decoded_token": " a"}, "1420": {"logprob": -4.76504373550415, "rank": 2, "decoded_token": " an"}, "6133": {"logprob": -6.01504373550415, "rank": 3, "decoded_token": " clear"}, "1278": {"logprob": -6.26504373550415, "rank": 4, "decoded_token": " the"}, "16152": {"logprob": -7.26504373550415, "rank": 5, "decoded_token": " cloud"}}, {"6133": {"logprob": -0.7420746684074402, "rank": 1, "decoded_token": " clear"}, "18416": {"logprob": -1.492074728012085, "rank": 2, "decoded_token": " haz"}, "16152": {"logprob": -1.992074728012085, "rank": 3, "decoded_token": " cloud"}, "27254": {"logprob": -3.367074728012085, "rank": 4, "decoded_token": " partly"}, "4391": {"logprob": -3.617074728012085, "rank": 5, "decoded_token": " light"}}, {"21283": {"logprob": -0.007355513051152229, "rank": 1, "decoded_token": " sky"}, "10991": {"logprob": -5.257355690002441, "rank": 2, "decoded_token": " blue"}, "1044": {"logprob": -6.382355690002441, "rank": 3, "decoded_token": ","}, "1505": {"logprob": -8.257355690002441, "rank": 4, "decoded_token": " or"}, "3950": {"logprob": -10.132355690002441, "rank": 5, "decoded_token": " day"}}, {"1046": {"logprob": -0.01126158982515335, "rank": 1, "decoded_token": "."}, "1626": {"logprob": -4.636261463165283, "rank": 2, "decoded_token": ".\n"}, "1338": {"logprob": -7.761261463165283, "rank": 3, "decoded_token": ".\n\n"}, "1044": {"logprob": -7.761261463165283, "rank": 4, "decoded_token": ","}, "1395": {"logprob": -8.011261940002441, "rank": 5, "decoded_token": " is"}}, {"2": {"logprob": -0.00709608756005764, "rank": 1, "decoded_token": " "}, "1032": {"logprob": -5.007096290588379, "rank": 2, "decoded_token": " The"}, "1256": {"logprob": -8.132096290588379, "rank": 3, "decoded_token": " "}}]], [[1049, 1046, 1349, 7244, 10575, 1395, 28528, 1408, 1261, 32656, 11237, 1044, 7283, 2015, 1513, 1278, 13424, 1626, 1050, 1046, 1349, 122203, 24361, 28035, 1454, 11223, 1321, 95746, 24765, 2425, 1261, 6133, 21283, 1626, 1051, 1046, 1349, 2965, 1294, 1261, 4804, 4250, 12006, 4302, 48049, 4837, 1261, 29397, 1435, 22140, 21457, 22196, 1626, 1052, 1046, 1349, 53301, 59396, 3549, 1294, 1261, 12097, 1044, 121040, 1536, 11223, 23170, 1321, 17744, 34941, 16429, 2425, 1261, 10991, 21283, 1046, 2], "1. A black dog is lying on a wooden floor, looking up at the camera.\n2. A rugged mountain landscape with green and rocky terrain under a clear sky.\n3. A person in a red swimsuit walks along a beach as waves crash nearby.\n4. A winding gravel path in a park, bordered by green grass and blooming trees under a blue sky.", [{"1049": {"logprob": -0.17000193893909454, "rank": 1, "decoded_token": "1"}, "11745": {"logprob": -1.9200019836425781, "rank": 2, "decoded_token": "Here"}, "69957": {"logprob": -4.920001983642578, "rank": 3, "decoded_token": "Sure"}, "117991": {"logprob": -7.295001983642578, "rank": 4, "decoded_token": "Certain"}, "1784": {"logprob": -7.295001983642578, "rank": 5, "decoded_token": "The"}}, {"1046": {"logprob": -1.597391747054644e-05, "rank": 1, "decoded_token": "."}, "1041": {"logprob": -11.500016212463379, "rank": 2, "decoded_token": ")"}, "1058": {"logprob": -13.062516212463379, "rank": 3, "decoded_token": ":"}, "3590": {"logprob": -13.750016212463379, "rank": 4, "decoded_token": ".A"}, "48426": {"logprob": -14.312516212463379, "rank": 5, "decoded_token": ".The"}}, {"1349": {"logprob": -0.07567699253559113, "rank": 1, "decoded_token": " A"}, "1531": {"logprob": -3.075676918029785, "rank": 2, "decoded_token": " The"}, "1603": {"logprob": -3.950676918029785, "rank": 3, "decoded_token": " **"}, "2409": {"logprob": -6.075676918029785, "rank": 4, "decoded_token": " This"}, "8479": {"logprob": -6.575676918029785, "rank": 5, "decoded_token": " Black"}}, {"7244": {"logprob": -0.06906593590974808, "rank": 1, "decoded_token": " black"}, "16450": {"logprob": -3.694066047668457, "rank": 2, "decoded_token": " sle"}, "6231": {"logprob": -4.506566047668457, "rank": 3, "decoded_token": " close"}, "4329": {"logprob": -4.944066047668457, "rank": 4, "decoded_token": " large"}, "8500": {"logprob": -5.256566047668457, "rank": 5, "decoded_token": " dark"}}, {"10575": {"logprob": -0.11913803219795227, "rank": 1, "decoded_token": " dog"}, "116572": {"logprob": -2.24413800239563, "rank": 2, "decoded_token": " puppy"}, "119075": {"logprob": -5.494138240814209, "rank": 3, "decoded_token": " Labrador"}, "28404": {"logprob": -7.181638240814209, "rank": 4, "decoded_token": " pup"}, "8636": {"logprob": -7.869138240814209, "rank": 5, "decoded_token": " lab"}}, {"1395": {"logprob": -0.782707154750824, "rank": 1, "decoded_token": " is"}, "22524": {"logprob": -1.1577072143554688, "rank": 2, "decoded_token": " lies"}, "1454": {"logprob": -2.9077072143554688, "rank": 3, "decoded_token": " with"}, "10637": {"logprob": -3.0327072143554688, "rank": 4, "decoded_token": " looks"}, "28528": {"logprob": -3.5327072143554688, "rank": 5, "decoded_token": " lying"}}, {"28528": {"logprob": -0.3443163335323334, "rank": 1, "decoded_token": " lying"}, "7283": {"logprob": -2.094316244125366, "rank": 2, "decoded_token": " looking"}, "60700": {"logprob": -2.844316244125366, "rank": 3, "decoded_token": " laying"}, "38235": {"logprob": -3.344316244125366, "rank": 4, "decoded_token": " resting"}, "18970": {"logprob": -3.469316244125366, "rank": 5, "decoded_token": " sitting"}}, {"1408": {"logprob": -0.29093095660209656, "rank": 1, "decoded_token": " on"}, "3151": {"logprob": -1.415930986404419, "rank": 2, "decoded_token": " down"}, "41132": {"logprob": -6.16593074798584, "rank": 3, "decoded_token": " attent"}, "1321": {"logprob": -6.85343074798584, "rank": 4, "decoded_token": " and"}, "14038": {"logprob": -6.97843074798584, "rank": 5, "decoded_token": " flat"}}, {"1261": {"logprob": -0.05553353577852249, "rank": 1, "decoded_token": " a"}, "2246": {"logprob": -3.6805336475372314, "rank": 2, "decoded_token": " its"}, "32656": {"logprob": -3.8055336475372314, "rank": 3, "decoded_token": " wooden"}, "1278": {"logprob": -5.305533409118652, "rank": 4, "decoded_token": " the"}, "3977": {"logprob": -7.430533409118652, "rank": 5, "decoded_token": " top"}}, {"32656": {"logprob": -0.039505477994680405, "rank": 1, "decoded_token": " wooden"}, "3403": {"logprob": -3.9145054817199707, "rank": 2, "decoded_token": " text"}, "44130": {"logprob": -4.414505481719971, "rank": 3, "decoded_token": " rust"}, "12603": {"logprob": -5.914505481719971, "rank": 4, "decoded_token": " wood"}, "17253": {"logprob": -6.539505481719971, "rank": 5, "decoded_token": " weather"}}, {"11237": {"logprob": -0.373188853263855, "rank": 1, "decoded_token": " floor"}, "4691": {"logprob": -1.248188853263855, "rank": 2, "decoded_token": " surface"}, "1615": {"logprob": -4.2481889724731445, "rank": 3, "decoded_token": " pl"}, "3403": {"logprob": -5.6231889724731445, "rank": 4, "decoded_token": " text"}, "28984": {"logprob": -5.9981889724731445, "rank": 5, "decoded_token": " deck"}}, {"1044": {"logprob": -1.378434181213379, "rank": 3, "decoded_token": ","}, "7283": {"logprob": -1.378434181213379, "rank": 1, "decoded_token": " looking"}, "1626": {"logprob": -1.378434181213379, "rank": 2, "decoded_token": ".\n"}, "1321": {"logprob": -2.378434181213379, "rank": 4, "decoded_token": " and"}, "1454": {"logprob": -2.628434181213379, "rank": 5, "decoded_token": " with"}}, {"7283": {"logprob": -0.17630912363529205, "rank": 1, "decoded_token": " looking"}, "11589": {"logprob": -2.551309108734131, "rank": 2, "decoded_token": " gaz"}, "35542": {"logprob": -2.676309108734131, "rank": 3, "decoded_token": " staring"}, "22116": {"logprob": -6.238809108734131, "rank": 4, "decoded_token": " facing"}, "11735": {"logprob": -6.488809108734131, "rank": 5, "decoded_token": " giving"}}, {"2015": {"logprob": -0.8436563014984131, "rank": 1, "decoded_token": " up"}, "7655": {"logprob": -1.343656301498413, "rank": 2, "decoded_token": " directly"}, "74606": {"logprob": -1.718656301498413, "rank": 3, "decoded_token": " upwards"}, "40022": {"logprob": -2.593656301498413, "rank": 4, "decoded_token": " upward"}, "11521": {"logprob": -4.406156539916992, "rank": 5, "decoded_token": " straight"}}, {"1513": {"logprob": -0.45780688524246216, "rank": 1, "decoded_token": " at"}, "1626": {"logprob": -1.7078068256378174, "rank": 2, "decoded_token": ".\n"}, "1454": {"logprob": -2.3328068256378174, "rank": 3, "decoded_token": " with"}, "1935": {"logprob": -3.5828068256378174, "rank": 4, "decoded_token": " int"}, "41132": {"logprob": -3.9578068256378174, "rank": 5, "decoded_token": " attent"}}, {"1278": {"logprob": -0.0004164305282756686, "rank": 1, "decoded_token": " the"}, "4433": {"logprob": -8.00041675567627, "rank": 2, "decoded_token": " something"}, "1261": {"logprob": -10.50041675567627, "rank": 3, "decoded_token": " a"}, "2246": {"logprob": -10.87541675567627, "rank": 4, "decoded_token": " its"}, "1636": {"logprob": -11.37541675567627, "rank": 5, "decoded_token": " you"}}, {"13424": {"logprob": -0.000399033073335886, "rank": 1, "decoded_token": " camera"}, "56268": {"logprob": -8.125398635864258, "rank": 2, "decoded_token": " viewer"}, "68439": {"logprob": -9.500398635864258, "rank": 3, "decoded_token": " photographer"}, "37967": {"logprob": -12.000398635864258, "rank": 4, "decoded_token": " ceiling"}, "2965": {"logprob": -12.312898635864258, "rank": 5, "decoded_token": " person"}}, {"1626": {"logprob": -0.10298559814691544, "rank": 1, "decoded_token": ".\n"}, "1046": {"logprob": -2.9779856204986572, "rank": 2, "decoded_token": "."}, "1454": {"logprob": -3.2279856204986572, "rank": 3, "decoded_token": " with"}, "1338": {"logprob": -5.227985382080078, "rank": 4, "decoded_token": ".\n\n"}, "1935": {"logprob": -6.852985382080078, "rank": 5, "decoded_token": " int"}}, {"1050": {"logprob": -0.002897590398788452, "rank": 1, "decoded_token": "2"}, "1256": {"logprob": -6.5028977394104, "rank": 2, "decoded_token": " "}, "1293": {"logprob": -6.6278977394104, "rank": 3, "decoded_token": " "}, "1032": {"logprob": -9.877897262573242, "rank": 4, "decoded_token": " "}, "1009": {"logprob": -11.627897262573242, "rank": 5, "decoded_token": "\t"}}, {"1046": {"logprob": -1.5497195136049413e-06, "rank": 1, "decoded_token": "."}, "1044": {"logprob": -14.875001907348633, "rank": 2, "decoded_token": ","}, "3590": {"logprob": -15.000001907348633, "rank": 3, "decoded_token": ".A"}, "2247": {"logprob": -15.125001907348633, "rank": 4, "decoded_token": " ."}, "1058": {"logprob": -15.375001907348633, "rank": 5, "decoded_token": ":"}}, {"1349": {"logprob": -0.6107801198959351, "rank": 1, "decoded_token": " A"}, "11826": {"logprob": -1.360780119895935, "rank": 2, "decoded_token": " Maj"}, "113465": {"logprob": -2.3607802391052246, "rank": 3, "decoded_token": " Rug"}, "27260": {"logprob": -3.7357802391052246, "rank": 4, "decoded_token": " Mountain"}, "1531": {"logprob": -4.485780239105225, "rank": 5, "decoded_token": " The"}}, {"122203": {"logprob": -0.8547073602676392, "rank": 1, "decoded_token": " rugged"}, "15375": {"logprob": -2.1047072410583496, "rank": 2, "decoded_token": " vast"}, "10726": {"logprob": -2.1047072410583496, "rank": 3, "decoded_token": " scen"}, "61082": {"logprob": -2.6047072410583496, "rank": 4, "decoded_token": " panor"}, "2965": {"logprob": -3.2922072410583496, "rank": 5, "decoded_token": " person"}}, {"24361": {"logprob": -0.41217130422592163, "rank": 1, "decoded_token": " mountain"}, "1044": {"logprob": -1.6621713638305664, "rank": 2, "decoded_token": ","}, "127945": {"logprob": -2.6621713638305664, "rank": 3, "decoded_token": " mountainous"}, "28035": {"logprob": -3.5371713638305664, "rank": 4, "decoded_token": " landscape"}, "1321": {"logprob": -3.6621713638305664, "rank": 5, "decoded_token": " and"}}, {"28035": {"logprob": -0.6676621437072754, "rank": 1, "decoded_token": " landscape"}, "4521": {"logprob": -0.7926621437072754, "rank": 2, "decoded_token": " range"}, "24765": {"logprob": -4.542662143707275, "rank": 3, "decoded_token": " terrain"}, "13327": {"logprob": -5.167662143707275, "rank": 4, "decoded_token": " scene"}, "12248": {"logprob": -5.167662143707275, "rank": 5, "decoded_token": " peak"}}, {"1454": {"logprob": -0.31015345454216003, "rank": 1, "decoded_token": " with"}, "6971": {"logprob": -2.4351534843444824, "rank": 2, "decoded_token": " features"}, "94973": {"logprob": -3.3101534843444824, "rank": 3, "decoded_token": " stretches"}, "89995": {"logprob": -3.4351534843444824, "rank": 4, "decoded_token": " showc"}, "1395": {"logprob": -3.5601534843444824, "rank": 5, "decoded_token": " is"}}, {"11223": {"logprob": -1.547694206237793, "rank": 1, "decoded_token": " green"}, "95746": {"logprob": -1.922694206237793, "rank": 2, "decoded_token": " rocky"}, "27469": {"logprob": -2.172694206237793, "rank": 3, "decoded_token": " peaks"}, "6245": {"logprob": -2.297694206237793, "rank": 4, "decoded_token": " multiple"}, "47147": {"logprob": -2.360194206237793, "rank": 5, "decoded_token": " steep"}}, {"1321": {"logprob": -0.9617817401885986, "rank": 1, "decoded_token": " and"}, "61263": {"logprob": -1.3367817401885986, "rank": 2, "decoded_token": " slopes"}, "51187": {"logprob": -2.3367817401885986, "rank": 3, "decoded_token": " hills"}, "47260": {"logprob": -2.3367817401885986, "rank": 4, "decoded_token": " vegetation"}, "50373": {"logprob": -2.7117817401885986, "rank": 5, "decoded_token": " patches"}}, {"95746": {"logprob": -0.11686273664236069, "rank": 1, "decoded_token": " rocky"}, "22980": {"logprob": -2.7418627738952637, "rank": 2, "decoded_token": " brown"}, "4266": {"logprob": -3.8668627738952637, "rank": 3, "decoded_token": " bar"}, "26549": {"logprob": -4.491862773895264, "rank": 4, "decoded_token": " gray"}, "9091": {"logprob": -5.366862773895264, "rank": 5, "decoded_token": " rock"}}, {"24765": {"logprob": -0.22640009224414825, "rank": 1, "decoded_token": " terrain"}, "57912": {"logprob": -2.476400136947632, "rank": 2, "decoded_token": " terrains"}, "61263": {"logprob": -2.726400136947632, "rank": 3, "decoded_token": " slopes"}, "51187": {"logprob": -3.851400136947632, "rank": 4, "decoded_token": " hills"}, "27469": {"logprob": -3.976400136947632, "rank": 5, "decoded_token": " peaks"}}, {"2425": {"logprob": -0.7823817133903503, "rank": 1, "decoded_token": " under"}, "1626": {"logprob": -1.1573817729949951, "rank": 2, "decoded_token": ".\n"}, "94973": {"logprob": -2.657381772994995, "rank": 3, "decoded_token": " stretches"}, "1395": {"logprob": -2.782381772994995, "rank": 4, "decoded_token": " is"}, "7038": {"logprob": -3.532381772994995, "rank": 5, "decoded_token": " extends"}}, {"1261": {"logprob": -0.016132064163684845, "rank": 1, "decoded_token": " a"}, "6133": {"logprob": -5.39113187789917, "rank": 2, "decoded_token": " clear"}, "1420": {"logprob": -5.39113187789917, "rank": 3, "decoded_token": " an"}, "1278": {"logprob": -6.01613187789917, "rank": 4, "decoded_token": " the"}, "16152": {"logprob": -6.26613187789917, "rank": 5, "decoded_token": " cloud"}}, {"6133": {"logprob": -0.44541382789611816, "rank": 1, "decoded_token": " clear"}, "16152": {"logprob": -2.070413827896118, "rank": 2, "decoded_token": " cloud"}, "18416": {"logprob": -2.320413827896118, "rank": 3, "decoded_token": " haz"}, "27254": {"logprob": -3.195413827896118, "rank": 4, "decoded_token": " partly"}, "10991": {"logprob": -3.320413827896118, "rank": 5, "decoded_token": " blue"}}, {"21283": {"logprob": -0.003768961876630783, "rank": 1, "decoded_token": " sky"}, "10991": {"logprob": -5.7537689208984375, "rank": 2, "decoded_token": " blue"}, "1044": {"logprob": -7.6287689208984375, "rank": 3, "decoded_token": ","}, "1505": {"logprob": -10.753768920898438, "rank": 4, "decoded_token": " or"}, "3044": {"logprob": -11.128768920898438, "rank": 5, "decoded_token": " sk"}}, {"1626": {"logprob": -0.0008177988929674029, "rank": 1, "decoded_token": ".\n"}, "1046": {"logprob": -7.375817775726318, "rank": 2, "decoded_token": "."}, "1395": {"logprob": -9.750818252563477, "rank": 3, "decoded_token": " is"}, "1010": {"logprob": -10.125818252563477, "rank": 4, "decoded_token": "\n"}, "1044": {"logprob": -10.750818252563477, "rank": 5, "decoded_token": ","}}, {"1051": {"logprob": -0.00013457823661156, "rank": 1, "decoded_token": "3"}, "1052": {"logprob": -9.125134468078613, "rank": 2, "decoded_token": "4"}, "1256": {"logprob": -11.375134468078613, "rank": 3, "decoded_token": " "}, "1050": {"logprob": -11.875134468078613, "rank": 4, "decoded_token": "2"}, "1049": {"logprob": -13.000134468078613, "rank": 5, "decoded_token": "1"}}, {"1046": {"logprob": -7.152555099310121e-07, "rank": 1, "decoded_token": "."}, "3590": {"logprob": -14.875000953674316, "rank": 2, "decoded_token": ".A"}, "48426": {"logprob": -15.937500953674316, "rank": 3, "decoded_token": ".The"}, "1349": {"logprob": -17.0, "rank": 4, "decoded_token": " A"}, "1338": {"logprob": -17.3125, "rank": 5, "decoded_token": ".\n\n"}}, {"1349": {"logprob": -0.03193942829966545, "rank": 1, "decoded_token": " A"}, "10638": {"logprob": -4.406939506530762, "rank": 2, "decoded_token": " Two"}, "2048": {"logprob": -5.031939506530762, "rank": 3, "decoded_token": " An"}, "1488": {"logprob": -5.156939506530762, "rank": 4, "decoded_token": " W"}, "15035": {"logprob": -5.906939506530762, "rank": 5, "decoded_token": " People"}}, {"2965": {"logprob": -0.41655251383781433, "rank": 1, "decoded_token": " person"}, "92731": {"logprob": -1.5415525436401367, "rank": 2, "decoded_token": " lone"}, "79013": {"logprob": -2.7915525436401367, "rank": 3, "decoded_token": " solitary"}, "29397": {"logprob": -3.5415525436401367, "rank": 4, "decoded_token": " beach"}, "2169": {"logprob": -4.729052543640137, "rank": 5, "decoded_token": " ser"}}, {"1294": {"logprob": -0.9845026135444641, "rank": 1, "decoded_token": " in"}, "1395": {"logprob": -1.2345025539398193, "rank": 2, "decoded_token": " is"}, "48049": {"logprob": -1.8595025539398193, "rank": 3, "decoded_token": " walks"}, "23737": {"logprob": -2.2345025539398193, "rank": 4, "decoded_token": " stands"}, "1285": {"logprob": -2.8595025539398193, "rank": 5, "decoded_token": " w"}}, {"1261": {"logprob": -0.32012784481048584, "rank": 1, "decoded_token": " a"}, "4804": {"logprob": -1.3201278448104858, "rank": 2, "decoded_token": " red"}, "1420": {"logprob": -5.820127964019775, "rank": 3, "decoded_token": " an"}, "64031": {"logprob": -6.570127964019775, "rank": 4, "decoded_token": " swim"}, "18168": {"logprob": -6.695127964019775, "rank": 5, "decoded_token": " bright"}}, {"4804": {"logprob": -0.10999592393636703, "rank": 1, "decoded_token": " red"}, "1285": {"logprob": -2.3599958419799805, "rank": 2, "decoded_token": " w"}, "4250": {"logprob": -5.6099958419799805, "rank": 3, "decoded_token": " sw"}, "18168": {"logprob": -6.0474958419799805, "rank": 4, "decoded_token": " bright"}, "18258": {"logprob": -6.4224958419799805, "rank": 5, "decoded_token": " wet"}}, {"4250": {"logprob": -0.2469252496957779, "rank": 1, "decoded_token": " sw"}, "1285": {"logprob": -2.3719253540039062, "rank": 2, "decoded_token": " w"}, "64031": {"logprob": -2.7469253540039062, "rank": 3, "decoded_token": " swim"}, "17513": {"logprob": -3.2469253540039062, "rank": 4, "decoded_token": " suit"}, "75948": {"logprob": -4.371925354003906, "rank": 5, "decoded_token": " outfit"}}, {"12006": {"logprob": -5.722029527532868e-06, "rank": 1, "decoded_token": "ims"}, "25763": {"logprob": -12.750005722045898, "rank": 2, "decoded_token": "immer"}, "7552": {"logprob": -13.687505722045898, "rank": 3, "decoded_token": "imm"}, "2097": {"logprob": -16.6875057220459, "rank": 4, "decoded_token": "ins"}, "19523": {"logprob": -16.7500057220459, "rank": 5, "decoded_token": "imb"}}, {"4302": {"logprob": -1.8000440832111053e-05, "rank": 1, "decoded_token": "uit"}, "17513": {"logprob": -11.875018119812012, "rank": 2, "decoded_token": " suit"}, "8036": {"logprob": -13.250018119812012, "rank": 3, "decoded_token": "irt"}, "36953": {"logprob": -13.500018119812012, "rank": 4, "decoded_token": "uiten"}, "1276": {"logprob": -14.437518119812012, "rank": 5, "decoded_token": "it"}}, {"48049": {"logprob": -0.41766560077667236, "rank": 1, "decoded_token": " walks"}, "1395": {"logprob": -1.4176656007766724, "rank": 2, "decoded_token": " is"}, "19710": {"logprob": -2.792665481567383, "rank": 3, "decoded_token": " walking"}, "23737": {"logprob": -3.917665481567383, "rank": 4, "decoded_token": " stands"}, "1285": {"logprob": -4.292665481567383, "rank": 5, "decoded_token": " w"}}, {"4837": {"logprob": -0.002689199522137642, "rank": 1, "decoded_token": " along"}, "9412": {"logprob": -6.627689361572266, "rank": 2, "decoded_token": " alone"}, "6117": {"logprob": -7.377689361572266, "rank": 3, "decoded_token": " near"}, "1408": {"logprob": -8.002689361572266, "rank": 4, "decoded_token": " on"}, "2203": {"logprob": -8.377689361572266, "rank": 5, "decoded_token": " into"}}, {"1261": {"logprob": -0.38749611377716064, "rank": 1, "decoded_token": " a"}, "1278": {"logprob": -1.1374961137771606, "rank": 2, "decoded_token": " the"}, "1420": {"logprob": -7.387495994567871, "rank": 3, "decoded_token": " an"}, "100991": {"logprob": -13.949995994567871, "rank": 4, "decoded_token": " sandy"}, "18258": {"logprob": -14.512495994567871, "rank": 5, "decoded_token": " wet"}}, {"29397": {"logprob": -0.5292408466339111, "rank": 1, "decoded_token": " beach"}, "100991": {"logprob": -0.9042408466339111, "rank": 2, "decoded_token": " sandy"}, "1627": {"logprob": -6.029240608215332, "rank": 3, "decoded_token": " sh"}, "46422": {"logprob": -6.529240608215332, "rank": 4, "decoded_token": " shore"}, "2169": {"logprob": -7.779240608215332, "rank": 5, "decoded_token": " ser"}}, {"1435": {"logprob": -0.29965779185295105, "rank": 1, "decoded_token": " as"}, "1454": {"logprob": -1.6746578216552734, "rank": 2, "decoded_token": " with"}, "1513": {"logprob": -3.7996578216552734, "rank": 3, "decoded_token": " at"}, "3016": {"logprob": -3.7996578216552734, "rank": 4, "decoded_token": " while"}, "6117": {"logprob": -4.799657821655273, "rank": 5, "decoded_token": " near"}}, {"22140": {"logprob": -0.015346773900091648, "rank": 1, "decoded_token": " waves"}, "1261": {"logprob": -4.515347003936768, "rank": 2, "decoded_token": " a"}, "1278": {"logprob": -6.140347003936768, "rank": 3, "decoded_token": " the"}, "27208": {"logprob": -6.890347003936768, "rank": 4, "decoded_token": " ocean"}, "4329": {"logprob": -7.265347003936768, "rank": 5, "decoded_token": " large"}}, {"21457": {"logprob": -0.013234862126410007, "rank": 1, "decoded_token": " crash"}, "33168": {"logprob": -5.138235092163086, "rank": 2, "decoded_token": " gently"}, "10401": {"logprob": -5.950735092163086, "rank": 3, "decoded_token": " roll"}, "4323": {"logprob": -6.700735092163086, "rank": 4, "decoded_token": " break"}, "5125": {"logprob": -7.138235092163086, "rank": 5, "decoded_token": " approach"}}, {"22196": {"logprob": -0.060372594743967056, "rank": 1, "decoded_token": " nearby"}, "6117": {"logprob": -3.3103725910186768, "rank": 2, "decoded_token": " near"}, "1294": {"logprob": -4.435372829437256, "rank": 3, "decoded_token": " in"}, "25644": {"logprob": -6.310372829437256, "rank": 4, "decoded_token": " beside"}, "1321": {"logprob": -6.560372829437256, "rank": 5, "decoded_token": " and"}}, {"1626": {"logprob": -0.005290080793201923, "rank": 1, "decoded_token": ".\n"}, "1294": {"logprob": -6.5052900314331055, "rank": 2, "decoded_token": " in"}, "1044": {"logprob": -7.0052900314331055, "rank": 3, "decoded_token": ","}, "1321": {"logprob": -7.1302900314331055, "rank": 4, "decoded_token": " and"}, "1513": {"logprob": -7.2552900314331055, "rank": 5, "decoded_token": " at"}}, {"1052": {"logprob": -7.748573807475623e-06, "rank": 1, "decoded_token": "4"}, "1051": {"logprob": -12.562507629394531, "rank": 2, "decoded_token": "3"}, "1053": {"logprob": -13.125007629394531, "rank": 3, "decoded_token": "5"}, "1256": {"logprob": -14.125007629394531, "rank": 4, "decoded_token": " "}, "1049": {"logprob": -14.312507629394531, "rank": 5, "decoded_token": "1"}}, {"1046": {"logprob": -1.2993727978027891e-05, "rank": 1, "decoded_token": "."}, "1044": {"logprob": -12.62501335144043, "rank": 2, "decoded_token": ","}, "3590": {"logprob": -12.75001335144043, "rank": 3, "decoded_token": ".A"}, "1058": {"logprob": -13.00001335144043, "rank": 4, "decoded_token": ":"}, "2247": {"logprob": -13.37501335144043, "rank": 5, "decoded_token": " ."}}, {"1349": {"logprob": -0.00046957432641647756, "rank": 1, "decoded_token": " A"}, "2048": {"logprob": -8.250469207763672, "rank": 2, "decoded_token": " An"}, "1488": {"logprob": -10.125469207763672, "rank": 3, "decoded_token": " W"}, "2409": {"logprob": -10.375469207763672, "rank": 4, "decoded_token": " This"}, "12232": {"logprob": -10.500469207763672, "rank": 5, "decoded_token": " Gra"}}, {"53301": {"logprob": -0.35120296478271484, "rank": 1, "decoded_token": " winding"}, "59396": {"logprob": -1.8512029647827148, "rank": 2, "decoded_token": " gravel"}, "2169": {"logprob": -2.476202964782715, "rank": 3, "decoded_token": " ser"}, "54742": {"logprob": -3.851202964782715, "rank": 4, "decoded_token": " peaceful"}, "43536": {"logprob": -5.101202964782715, "rank": 5, "decoded_token": " curved"}}, {"59396": {"logprob": -0.2955280840396881, "rank": 1, "decoded_token": " gravel"}, "3549": {"logprob": -1.6705280542373657, "rank": 2, "decoded_token": " path"}, "14801": {"logprob": -2.7955281734466553, "rank": 3, "decoded_token": " pathway"}, "1044": {"logprob": -6.420527935028076, "rank": 4, "decoded_token": ","}, "18341": {"logprob": -6.670527935028076, "rank": 5, "decoded_token": " pathways"}}, {"3549": {"logprob": -0.03408379852771759, "rank": 1, "decoded_token": " path"}, "14801": {"logprob": -3.409083843231201, "rank": 2, "decoded_token": " pathway"}, "18341": {"logprob": -8.284083366394043, "rank": 3, "decoded_token": " pathways"}, "1505": {"logprob": -9.534083366394043, "rank": 4, "decoded_token": " or"}, "7368": {"logprob": -10.659083366394043, "rank": 5, "decoded_token": "path"}}, {"1294": {"logprob": -1.0857839584350586, "rank": 1, "decoded_token": " in"}, "13335": {"logprob": -1.4607839584350586, "rank": 2, "decoded_token": " leads"}, "2645": {"logprob": -1.9607839584350586, "rank": 3, "decoded_token": " through"}, "29817": {"logprob": -2.4607839584350586, "rank": 4, "decoded_token": " surrounded"}, "22416": {"logprob": -3.2107839584350586, "rank": 5, "decoded_token": " curves"}}, {"1261": {"logprob": -0.00011705666838679463, "rank": 1, "decoded_token": " a"}, "1420": {"logprob": -9.500117301940918, "rank": 2, "decoded_token": " an"}, "1278": {"logprob": -10.250117301940918, "rank": 3, "decoded_token": " the"}, "2549": {"logprob": -12.750117301940918, "rank": 4, "decoded_token": " what"}, "11223": {"logprob": -13.750117301940918, "rank": 5, "decoded_token": " green"}}, {"12097": {"logprob": -0.02791696786880493, "rank": 1, "decoded_token": " park"}, "2169": {"logprob": -4.65291690826416, "rank": 2, "decoded_token": " ser"}, "1295": {"logprob": -4.65291690826416, "rank": 3, "decoded_token": " l"}, "23170": {"logprob": -5.27791690826416, "rank": 4, "decoded_token": " grass"}, "26428": {"logprob": -6.52791690826416, "rank": 5, "decoded_token": " garden"}}, {"1044": {"logprob": -1.350893259048462, "rank": 1, "decoded_token": ","}, "1395": {"logprob": -1.600893259048462, "rank": 2, "decoded_token": " is"}, "29817": {"logprob": -2.350893259048462, "rank": 3, "decoded_token": " surrounded"}, "121313": {"logprob": -2.475893259048462, "rank": 4, "decoded_token": " flanked"}, "1454": {"logprob": -2.475893259048462, "rank": 5, "decoded_token": " with"}}, {"121040": {"logprob": -0.710591197013855, "rank": 1, "decoded_token": " bordered"}, "121313": {"logprob": -1.085591197013855, "rank": 2, "decoded_token": " flanked"}, "54410": {"logprob": -1.960591197013855, "rank": 3, "decoded_token": " lined"}, "29817": {"logprob": -3.8355913162231445, "rank": 4, "decoded_token": " surrounded"}, "1454": {"logprob": -5.8355913162231445, "rank": 5, "decoded_token": " with"}}, {"1536": {"logprob": -4.6491513785440475e-06, "rank": 1, "decoded_token": " by"}, "1454": {"logprob": -12.375004768371582, "rank": 2, "decoded_token": " with"}, "1408": {"logprob": -15.812504768371582, "rank": 3, "decoded_token": " on"}, "3326": {"logprob": -16.875003814697266, "rank": 4, "decoded_token": "by"}, "1295": {"logprob": -16.875003814697266, "rank": 5, "decoded_token": " l"}}, {"11223": {"logprob": -0.4314780533313751, "rank": 1, "decoded_token": " green"}, "1295": {"logprob": -1.4314780235290527, "rank": 2, "decoded_token": " l"}, "23170": {"logprob": -2.4314780235290527, "rank": 3, "decoded_token": " grass"}, "17744": {"logprob": -4.806478023529053, "rank": 4, "decoded_token": " blo"}, "95612": {"logprob": -5.181478023529053, "rank": 5, "decoded_token": " vibrant"}}, {"23170": {"logprob": -0.00035041390219703317, "rank": 1, "decoded_token": " grass"}, "69230": {"logprob": -8.125349998474121, "rank": 2, "decoded_token": " lawn"}, "128633": {"logprob": -10.750349998474121, "rank": 3, "decoded_token": " grasses"}, "87781": {"logprob": -11.437849998474121, "rank": 4, "decoded_token": "\u8349"}, "16429": {"logprob": -11.437849998474121, "rank": 5, "decoded_token": " trees"}}, {"1321": {"logprob": -0.0009494088008068502, "rank": 1, "decoded_token": " and"}, "1044": {"logprob": -7.125949382781982, "rank": 2, "decoded_token": ","}, "1454": {"logprob": -9.25094985961914, "rank": 3, "decoded_token": " with"}, "2425": {"logprob": -11.75094985961914, "rank": 4, "decoded_token": " under"}, "1046": {"logprob": -11.75094985961914, "rank": 5, "decoded_token": "."}}, {"17744": {"logprob": -0.21488544344902039, "rank": 1, "decoded_token": " blo"}, "105368": {"logprob": -1.8398854732513428, "rank": 2, "decoded_token": " bloss"}, "87833": {"logprob": -3.8398854732513428, "rank": 3, "decoded_token": " flowering"}, "16429": {"logprob": -4.464885234832764, "rank": 4, "decoded_token": " trees"}, "117207": {"logprob": -7.589885234832764, "rank": 5, "decoded_token": " bloom"}}, {"34941": {"logprob": -7.152555099310121e-07, "rank": 1, "decoded_token": "oming"}, "35974": {"logprob": -14.375000953674316, "rank": 2, "decoded_token": "omed"}, "6325": {"logprob": -16.5625, "rank": 3, "decoded_token": "oms"}, "11009": {"logprob": -17.625, "rank": 4, "decoded_token": "omy"}, "9457": {"logprob": -18.875, "rank": 5, "decoded_token": "ming"}}, {"16429": {"logprob": -0.002424398437142372, "rank": 1, "decoded_token": " trees"}, "103796": {"logprob": -6.627424240112305, "rank": 2, "decoded_token": " cherry"}, "32152": {"logprob": -7.377424240112305, "rank": 3, "decoded_token": " flowers"}, "29151": {"logprob": -9.314924240112305, "rank": 4, "decoded_token": " shr"}, "20370": {"logprob": -9.564924240112305, "rank": 5, "decoded_token": " fruit"}}, {"2425": {"logprob": -0.3792523741722107, "rank": 1, "decoded_token": " under"}, "1046": {"logprob": -1.3792524337768555, "rank": 2, "decoded_token": "."}, "3675": {"logprob": -2.8792524337768555, "rank": 3, "decoded_token": " against"}, "1044": {"logprob": -5.1292524337768555, "rank": 4, "decoded_token": ","}, "1454": {"logprob": -7.2542524337768555, "rank": 5, "decoded_token": " with"}}, {"1261": {"logprob": -0.0002315968304174021, "rank": 1, "decoded_token": " a"}, "1278": {"logprob": -8.875231742858887, "rank": 2, "decoded_token": " the"}, "10991": {"logprob": -9.875231742858887, "rank": 3, "decoded_token": " blue"}, "6133": {"logprob": -10.375231742858887, "rank": 4, "decoded_token": " clear"}, "1420": {"logprob": -12.250231742858887, "rank": 5, "decoded_token": " an"}}, {"10991": {"logprob": -0.6372600197792053, "rank": 1, "decoded_token": " blue"}, "6133": {"logprob": -0.7622600197792053, "rank": 2, "decoded_token": " clear"}, "18168": {"logprob": -5.3872599601745605, "rank": 3, "decoded_token": " bright"}, "105573": {"logprob": -10.012260437011719, "rank": 4, "decoded_token": " sunny"}, "15330": {"logprob": -11.512260437011719, "rank": 5, "decoded_token": " Blue"}}, {"21283": {"logprob": -6.12716976320371e-05, "rank": 1, "decoded_token": " sky"}, "1044": {"logprob": -9.87506103515625, "rank": 2, "decoded_token": ","}, "19673": {"logprob": -12.00006103515625, "rank": 3, "decoded_token": " Sky"}, "1321": {"logprob": -13.31256103515625, "rank": 4, "decoded_token": " and"}, "124968": {"logprob": -14.81256103515625, "rank": 5, "decoded_token": " skies"}}, {"1046": {"logprob": -0.00013982271775603294, "rank": 1, "decoded_token": "."}, "2": {"logprob": -9.500140190124512, "rank": 2, "decoded_token": ".\n"}, "1626": {"logprob": -10.000140190124512, "rank": 3, "decoded_token": ".\n\n"}, "1338": {"logprob": -11.750140190124512, "rank": 4, "decoded_token": " with"}}, {"2": {"logprob": -0.0004533693427219987, "rank": 1, "decoded_token": " "}, "1032": {"logprob": -7.750453472137451, "rank": 2, "decoded_token": " Each"}, "1256": {"logprob": -11.125452995300293, "rank": 3, "decoded_token": " This"}}]]]
\ No newline at end of file
[[[1784, 3937, 6122, 1261, 7244, 10575, 18970, 1408, 1261, 32656, 4691, 1046, 2], "The image shows a black dog sitting on a wooden surface.", [{"1784": {"logprob": -0.11685245484113693, "rank": 1, "decoded_token": "The"}, "4380": {"logprob": -2.3668525218963623, "rank": 2, "decoded_token": "This"}, "1049": {"logprob": -4.741852283477783, "rank": 3, "decoded_token": "1"}, "117991": {"logprob": -5.991852283477783, "rank": 4, "decoded_token": "Certain"}, "1785": {"logprob": -5.991852283477783, "rank": 5, "decoded_token": "In"}}, {"3937": {"logprob": -0.2591013014316559, "rank": 1, "decoded_token": " image"}, "2158": {"logprob": -1.5091012716293335, "rank": 2, "decoded_token": " first"}, "3977": {"logprob": -5.884101390838623, "rank": 3, "decoded_token": " top"}, "7244": {"logprob": -6.259101390838623, "rank": 4, "decoded_token": " black"}, "8061": {"logprob": -6.759101390838623, "rank": 5, "decoded_token": " images"}}, {"6122": {"logprob": -0.9660423994064331, "rank": 1, "decoded_token": " shows"}, "51948": {"logprob": -1.466042399406433, "rank": 2, "decoded_token": " depicts"}, "6971": {"logprob": -1.466042399406433, "rank": 3, "decoded_token": " features"}, "25981": {"logprob": -2.8410425186157227, "rank": 4, "decoded_token": " displays"}, "8688": {"logprob": -2.8410425186157227, "rank": 5, "decoded_token": " contains"}}, {"1261": {"logprob": -0.0030613720882683992, "rank": 1, "decoded_token": " a"}, "1420": {"logprob": -6.253061294555664, "rank": 2, "decoded_token": " an"}, "2295": {"logprob": -7.878061294555664, "rank": 3, "decoded_token": " two"}, "2342": {"logprob": -7.878061294555664, "rank": 4, "decoded_token": " only"}, "1278": {"logprob": -8.628061294555664, "rank": 5, "decoded_token": " the"}}, {"7244": {"logprob": -0.17649099230766296, "rank": 1, "decoded_token": " black"}, "6231": {"logprob": -2.3014910221099854, "rank": 2, "decoded_token": " close"}, "4249": {"logprob": -3.4264910221099854, "rank": 3, "decoded_token": " single"}, "4329": {"logprob": -5.113990783691406, "rank": 4, "decoded_token": " large"}, "10575": {"logprob": -5.176490783691406, "rank": 5, "decoded_token": " dog"}}, {"10575": {"logprob": -0.10929587483406067, "rank": 1, "decoded_token": " dog"}, "116572": {"logprob": -2.4842958450317383, "rank": 2, "decoded_token": " puppy"}, "119075": {"logprob": -4.109295845031738, "rank": 3, "decoded_token": " Labrador"}, "15812": {"logprob": -7.296795845031738, "rank": 4, "decoded_token": " Lab"}, "7990": {"logprob": -7.484295845031738, "rank": 5, "decoded_token": " cat"}}, {"18970": {"logprob": -0.830376148223877, "rank": 1, "decoded_token": " sitting"}, "1454": {"logprob": -1.580376148223877, "rank": 2, "decoded_token": " with"}, "28528": {"logprob": -1.955376148223877, "rank": 3, "decoded_token": " lying"}, "7283": {"logprob": -2.205376148223877, "rank": 4, "decoded_token": " looking"}, "15866": {"logprob": -3.017876148223877, "rank": 5, "decoded_token": " standing"}}, {"1408": {"logprob": -0.08554735779762268, "rank": 1, "decoded_token": " on"}, "1321": {"logprob": -3.71054744720459, "rank": 2, "decoded_token": " and"}, "3675": {"logprob": -3.96054744720459, "rank": 3, "decoded_token": " against"}, "41132": {"logprob": -4.71054744720459, "rank": 4, "decoded_token": " attent"}, "1454": {"logprob": -5.08554744720459, "rank": 5, "decoded_token": " with"}}, {"1261": {"logprob": -0.540847897529602, "rank": 1, "decoded_token": " a"}, "32656": {"logprob": -0.915847897529602, "rank": 2, "decoded_token": " wooden"}, "12603": {"logprob": -5.4158477783203125, "rank": 3, "decoded_token": " wood"}, "3977": {"logprob": -5.4158477783203125, "rank": 4, "decoded_token": " top"}, "17253": {"logprob": -6.2908477783203125, "rank": 5, "decoded_token": " weather"}}, {"32656": {"logprob": -0.025753861293196678, "rank": 1, "decoded_token": " wooden"}, "44130": {"logprob": -4.400753974914551, "rank": 2, "decoded_token": " rust"}, "12603": {"logprob": -5.275753974914551, "rank": 3, "decoded_token": " wood"}, "3403": {"logprob": -5.400753974914551, "rank": 4, "decoded_token": " text"}, "17253": {"logprob": -6.963253974914551, "rank": 5, "decoded_token": " weather"}}, {"4691": {"logprob": -0.7265751957893372, "rank": 1, "decoded_token": " surface"}, "11237": {"logprob": -0.8515751957893372, "rank": 2, "decoded_token": " floor"}, "7042": {"logprob": -2.6015751361846924, "rank": 3, "decoded_token": " background"}, "28984": {"logprob": -5.2265753746032715, "rank": 4, "decoded_token": " deck"}, "1615": {"logprob": -5.7265753746032715, "rank": 5, "decoded_token": " pl"}}, {"1046": {"logprob": -0.4868825674057007, "rank": 1, "decoded_token": "."}, "1044": {"logprob": -1.9868825674057007, "rank": 2, "decoded_token": ","}, "1321": {"logprob": -2.3618826866149902, "rank": 3, "decoded_token": " and"}, "1454": {"logprob": -2.6118826866149902, "rank": 4, "decoded_token": " with"}, "7283": {"logprob": -2.7368826866149902, "rank": 5, "decoded_token": " looking"}}, {"2": {"logprob": -0.0026643513701856136, "rank": 1, "decoded_token": "</s>"}, "1531": {"logprob": -6.502664566040039, "rank": 2, "decoded_token": " The"}, "1032": {"logprob": -6.877664566040039, "rank": 3, "decoded_token": " "}, "3730": {"logprob": -9.752664566040039, "rank": 4, "decoded_token": " There"}, "1256": {"logprob": -11.002664566040039, "rank": 5, "decoded_token": " "}}]], [[1049, 1046, 1349, 7244, 10575, 1454, 2327, 94766, 32961, 53048, 41132, 3923, 1408, 1261, 32656, 4691, 1626, 1050, 1046, 1349, 15375, 24361, 4521, 94973, 5669, 1278, 48932, 2425, 1261, 16152, 1121, 21283, 1046, 2], "1. A black dog with floppy ears sits attentively on a wooden surface.\n2. A vast mountain range stretches across the horizon under a cloudy sky.", [{"1049": {"logprob": -0.42824622988700867, "rank": 1, "decoded_token": "1"}, "1045": {"logprob": -1.553246259689331, "rank": 2, "decoded_token": "-"}, "1065": {"logprob": -2.428246259689331, "rank": 3, "decoded_token": "A"}, "1784": {"logprob": -4.053246021270752, "rank": 4, "decoded_token": "The"}, "69957": {"logprob": -4.428246021270752, "rank": 5, "decoded_token": "Sure"}}, {"1046": {"logprob": -1.811964830267243e-05, "rank": 1, "decoded_token": "."}, "1058": {"logprob": -11.875018119812012, "rank": 2, "decoded_token": ":"}, "3590": {"logprob": -12.250018119812012, "rank": 3, "decoded_token": ".A"}, "1065": {"logprob": -13.062518119812012, "rank": 4, "decoded_token": "A"}, "1041": {"logprob": -13.750018119812012, "rank": 5, "decoded_token": ")"}}, {"1349": {"logprob": -0.13647246360778809, "rank": 1, "decoded_token": " A"}, "1429": {"logprob": -2.386472463607788, "rank": 2, "decoded_token": " \""}, "1603": {"logprob": -3.886472463607788, "rank": 3, "decoded_token": " **"}, "11967": {"logprob": -5.011472702026367, "rank": 4, "decoded_token": " Image"}, "1531": {"logprob": -5.011472702026367, "rank": 5, "decoded_token": " The"}}, {"7244": {"logprob": -0.18561004102230072, "rank": 1, "decoded_token": " black"}, "38462": {"logprob": -3.185610055923462, "rank": 2, "decoded_token": " curious"}, "68076": {"logprob": -3.623110055923462, "rank": 3, "decoded_token": " cute"}, "4329": {"logprob": -3.935610055923462, "rank": 4, "decoded_token": " large"}, "74168": {"logprob": -4.373109817504883, "rank": 5, "decoded_token": " gloss"}}, {"10575": {"logprob": -0.17297746241092682, "rank": 1, "decoded_token": " dog"}, "116572": {"logprob": -2.1729774475097656, "rank": 2, "decoded_token": " puppy"}, "119075": {"logprob": -3.1729774475097656, "rank": 3, "decoded_token": " Labrador"}, "15812": {"logprob": -6.985477447509766, "rank": 4, "decoded_token": " Lab"}, "8636": {"logprob": -7.360477447509766, "rank": 5, "decoded_token": " lab"}}, {"1454": {"logprob": -0.5785807967185974, "rank": 1, "decoded_token": " with"}, "53048": {"logprob": -1.2660808563232422, "rank": 2, "decoded_token": " sits"}, "1395": {"logprob": -3.016080856323242, "rank": 3, "decoded_token": " is"}, "22524": {"logprob": -3.578580856323242, "rank": 4, "decoded_token": " lies"}, "18970": {"logprob": -3.703580856323242, "rank": 5, "decoded_token": " sitting"}}, {"2327": {"logprob": -1.2709298133850098, "rank": 1, "decoded_token": " fl"}, "1261": {"logprob": -1.3959298133850098, "rank": 2, "decoded_token": " a"}, "17300": {"logprob": -1.8959298133850098, "rank": 3, "decoded_token": " soul"}, "100089": {"logprob": -2.6459298133850098, "rank": 4, "decoded_token": " expressive"}, "6444": {"logprob": -3.1459298133850098, "rank": 5, "decoded_token": " soft"}}, {"94766": {"logprob": -0.002432247158139944, "rank": 1, "decoded_token": "oppy"}, "124603": {"logprob": -6.377432346343994, "rank": 2, "decoded_token": "uffy"}, "1484": {"logprob": -7.877432346343994, "rank": 3, "decoded_token": "op"}, "24897": {"logprob": -8.877431869506836, "rank": 4, "decoded_token": "appy"}, "102477": {"logprob": -9.752431869506836, "rank": 5, "decoded_token": "opping"}}, {"32961": {"logprob": -5.113947918289341e-05, "rank": 1, "decoded_token": " ears"}, "16962": {"logprob": -11.312551498413086, "rank": 2, "decoded_token": " ear"}, "5731": {"logprob": -11.750051498413086, "rank": 3, "decoded_token": " eyes"}, "3351": {"logprob": -12.000051498413086, "rank": 4, "decoded_token": " years"}, "42071": {"logprob": -13.000051498413086, "rank": 5, "decoded_token": " cheeks"}}, {"53048": {"logprob": -0.6131591200828552, "rank": 1, "decoded_token": " sits"}, "10637": {"logprob": -1.9881591796875, "rank": 2, "decoded_token": " looks"}, "1321": {"logprob": -2.4256591796875, "rank": 3, "decoded_token": " and"}, "1395": {"logprob": -2.6756591796875, "rank": 4, "decoded_token": " is"}, "18970": {"logprob": -3.0506591796875, "rank": 5, "decoded_token": " sitting"}}, {"41132": {"logprob": -0.36187249422073364, "rank": 1, "decoded_token": " attent"}, "1408": {"logprob": -2.361872434616089, "rank": 2, "decoded_token": " on"}, "106534": {"logprob": -2.424372434616089, "rank": 3, "decoded_token": " calmly"}, "12276": {"logprob": -2.611872434616089, "rank": 4, "decoded_token": " alert"}, "6482": {"logprob": -5.174372673034668, "rank": 5, "decoded_token": " patient"}}, {"3923": {"logprob": -8.451581379631534e-05, "rank": 1, "decoded_token": "ively"}, "1556": {"logprob": -9.50008487701416, "rank": 2, "decoded_token": "ive"}, "6655": {"logprob": -11.87508487701416, "rank": 3, "decoded_token": "atively"}, "3929": {"logprob": -14.00008487701416, "rank": 4, "decoded_token": "ently"}, "47885": {"logprob": -14.75008487701416, "rank": 5, "decoded_token": "edly"}}, {"1408": {"logprob": -0.058125678449869156, "rank": 1, "decoded_token": " on"}, "3675": {"logprob": -3.1831257343292236, "rank": 2, "decoded_token": " against"}, "1294": {"logprob": -4.9331254959106445, "rank": 3, "decoded_token": " in"}, "7283": {"logprob": -5.8081254959106445, "rank": 4, "decoded_token": " looking"}, "1044": {"logprob": -5.9331254959106445, "rank": 5, "decoded_token": ","}}, {"1261": {"logprob": -0.21029606461524963, "rank": 1, "decoded_token": " a"}, "32656": {"logprob": -1.7102960348129272, "rank": 2, "decoded_token": " wooden"}, "17253": {"logprob": -5.710296154022217, "rank": 3, "decoded_token": " weather"}, "44130": {"logprob": -6.085296154022217, "rank": 4, "decoded_token": " rust"}, "12603": {"logprob": -6.960296154022217, "rank": 5, "decoded_token": " wood"}}, {"32656": {"logprob": -0.08548421412706375, "rank": 1, "decoded_token": " wooden"}, "44130": {"logprob": -2.710484266281128, "rank": 2, "decoded_token": " rust"}, "17253": {"logprob": -4.710484027862549, "rank": 3, "decoded_token": " weather"}, "12603": {"logprob": -5.960484027862549, "rank": 4, "decoded_token": " wood"}, "3403": {"logprob": -5.960484027862549, "rank": 5, "decoded_token": " text"}}, {"4691": {"logprob": -0.7172377109527588, "rank": 1, "decoded_token": " surface"}, "11237": {"logprob": -0.8422377109527588, "rank": 2, "decoded_token": " floor"}, "7042": {"logprob": -2.842237710952759, "rank": 3, "decoded_token": " background"}, "28984": {"logprob": -4.21723747253418, "rank": 4, "decoded_token": " deck"}, "92504": {"logprob": -6.21723747253418, "rank": 5, "decoded_token": " backdrop"}}, {"1626": {"logprob": -0.12971943616867065, "rank": 1, "decoded_token": ".\n"}, "1044": {"logprob": -2.3797194957733154, "rank": 2, "decoded_token": ","}, "1046": {"logprob": -4.129719257354736, "rank": 3, "decoded_token": "."}, "1338": {"logprob": -5.129719257354736, "rank": 4, "decoded_token": ".\n\n"}, "7283": {"logprob": -5.504719257354736, "rank": 5, "decoded_token": " looking"}}, {"1050": {"logprob": -0.00015698630886618048, "rank": 1, "decoded_token": "2"}, "1256": {"logprob": -9.125157356262207, "rank": 2, "decoded_token": " "}, "1032": {"logprob": -10.875157356262207, "rank": 3, "decoded_token": " "}, "1293": {"logprob": -11.750157356262207, "rank": 4, "decoded_token": " "}, "1051": {"logprob": -12.125157356262207, "rank": 5, "decoded_token": "3"}}, {"1046": {"logprob": -6.6756979322235566e-06, "rank": 1, "decoded_token": "."}, "3590": {"logprob": -13.062506675720215, "rank": 2, "decoded_token": ".A"}, "1626": {"logprob": -13.187506675720215, "rank": 3, "decoded_token": ".\n"}, "1338": {"logprob": -14.750006675720215, "rank": 4, "decoded_token": ".\n\n"}, "1058": {"logprob": -14.937506675720215, "rank": 5, "decoded_token": ":"}}, {"1349": {"logprob": -0.5863217115402222, "rank": 1, "decoded_token": " A"}, "11826": {"logprob": -1.4613217115402222, "rank": 2, "decoded_token": " Maj"}, "37159": {"logprob": -2.2113218307495117, "rank": 3, "decoded_token": " Snow"}, "113465": {"logprob": -3.8988218307495117, "rank": 4, "decoded_token": " Rug"}, "1531": {"logprob": -3.9613218307495117, "rank": 5, "decoded_token": " The"}}, {"15375": {"logprob": -0.639299213886261, "rank": 1, "decoded_token": " vast"}, "37849": {"logprob": -2.014299154281616, "rank": 2, "decoded_token": " breat"}, "61082": {"logprob": -2.389299154281616, "rank": 3, "decoded_token": " panor"}, "10726": {"logprob": -3.139299154281616, "rank": 4, "decoded_token": " scen"}, "2169": {"logprob": -3.201799154281616, "rank": 5, "decoded_token": " ser"}}, {"24361": {"logprob": -0.702845573425293, "rank": 1, "decoded_token": " mountain"}, "127945": {"logprob": -1.952845573425293, "rank": 2, "decoded_token": " mountainous"}, "1044": {"logprob": -2.077845573425293, "rank": 3, "decoded_token": ","}, "4521": {"logprob": -2.327845573425293, "rank": 4, "decoded_token": " range"}, "28035": {"logprob": -2.452845573425293, "rank": 5, "decoded_token": " landscape"}}, {"4521": {"logprob": -0.07058162242174149, "rank": 1, "decoded_token": " range"}, "28035": {"logprob": -2.6955816745758057, "rank": 2, "decoded_token": " landscape"}, "37691": {"logprob": -8.320581436157227, "rank": 3, "decoded_token": " valley"}, "12248": {"logprob": -9.445581436157227, "rank": 4, "decoded_token": " peak"}, "13327": {"logprob": -9.695581436157227, "rank": 5, "decoded_token": " scene"}}, {"94973": {"logprob": -1.1164050102233887, "rank": 1, "decoded_token": " stretches"}, "1454": {"logprob": -1.1789050102233887, "rank": 2, "decoded_token": " with"}, "2425": {"logprob": -1.8664050102233887, "rank": 3, "decoded_token": " under"}, "1395": {"logprob": -2.5539050102233887, "rank": 4, "decoded_token": " is"}, "13875": {"logprob": -2.9914050102233887, "rank": 5, "decoded_token": " covered"}}, {"5669": {"logprob": -0.3286789357662201, "rank": 1, "decoded_token": " across"}, "1848": {"logprob": -2.078678846359253, "rank": 2, "decoded_token": " out"}, "2425": {"logprob": -2.328678846359253, "rank": 3, "decoded_token": " under"}, "2203": {"logprob": -3.328678846359253, "rank": 4, "decoded_token": " into"}, "8994": {"logprob": -4.766179084777832, "rank": 5, "decoded_token": " towards"}}, {"1278": {"logprob": -0.039004355669021606, "rank": 1, "decoded_token": " the"}, "1261": {"logprob": -3.289004325866699, "rank": 2, "decoded_token": " a"}, "1420": {"logprob": -7.414004325866699, "rank": 3, "decoded_token": " an"}, "2425": {"logprob": -9.0390043258667, "rank": 4, "decoded_token": " under"}, "1454": {"logprob": -9.2265043258667, "rank": 5, "decoded_token": " with"}}, {"48932": {"logprob": -0.2659883201122284, "rank": 1, "decoded_token": " horizon"}, "21283": {"logprob": -2.140988349914551, "rank": 2, "decoded_token": " sky"}, "3937": {"logprob": -3.015988349914551, "rank": 3, "decoded_token": " image"}, "28035": {"logprob": -3.515988349914551, "rank": 4, "decoded_token": " landscape"}, "3044": {"logprob": -4.265988349914551, "rank": 5, "decoded_token": " sk"}}, {"2425": {"logprob": -0.5356141328811646, "rank": 1, "decoded_token": " under"}, "1044": {"logprob": -1.5356141328811646, "rank": 2, "decoded_token": ","}, "1454": {"logprob": -1.7856141328811646, "rank": 3, "decoded_token": " with"}, "25136": {"logprob": -3.785614013671875, "rank": 4, "decoded_token": " beneath"}, "1408": {"logprob": -5.785614013671875, "rank": 5, "decoded_token": " on"}}, {"1261": {"logprob": -0.006081883795559406, "rank": 1, "decoded_token": " a"}, "1420": {"logprob": -5.506082057952881, "rank": 2, "decoded_token": " an"}, "16152": {"logprob": -7.631082057952881, "rank": 3, "decoded_token": " cloud"}, "6133": {"logprob": -7.881082057952881, "rank": 4, "decoded_token": " clear"}, "2136": {"logprob": -8.006081581115723, "rank": 5, "decoded_token": " over"}}, {"16152": {"logprob": -0.6749536991119385, "rank": 1, "decoded_token": " cloud"}, "6133": {"logprob": -1.4249536991119385, "rank": 2, "decoded_token": " clear"}, "18416": {"logprob": -2.8624536991119385, "rank": 3, "decoded_token": " haz"}, "27254": {"logprob": -2.9874536991119385, "rank": 4, "decoded_token": " partly"}, "4391": {"logprob": -3.2374536991119385, "rank": 5, "decoded_token": " light"}}, {"1121": {"logprob": -0.10860869288444519, "rank": 1, "decoded_token": "y"}, "4527": {"logprob": -2.9836087226867676, "rank": 2, "decoded_token": "less"}, "1286": {"logprob": -3.4836087226867676, "rank": 3, "decoded_token": "ed"}, "77187": {"logprob": -4.608608722686768, "rank": 4, "decoded_token": "-filled"}, "114525": {"logprob": -4.858608722686768, "rank": 5, "decoded_token": "-covered"}}, {"21283": {"logprob": -0.002785732736811042, "rank": 1, "decoded_token": " sky"}, "10991": {"logprob": -6.252785682678223, "rank": 2, "decoded_token": " blue"}, "1044": {"logprob": -7.627785682678223, "rank": 3, "decoded_token": ","}, "26549": {"logprob": -8.627785682678223, "rank": 4, "decoded_token": " gray"}, "34052": {"logprob": -9.377785682678223, "rank": 5, "decoded_token": " grey"}}, {"1046": {"logprob": -0.047878943383693695, "rank": 1, "decoded_token": "."}, "1044": {"logprob": -3.1728789806365967, "rank": 2, "decoded_token": ","}, "1454": {"logprob": -5.547878742218018, "rank": 3, "decoded_token": " with"}, "1338": {"logprob": -7.172878742218018, "rank": 4, "decoded_token": ".\n\n"}, "1294": {"logprob": -9.172879219055176, "rank": 5, "decoded_token": " in"}}, {"2": {"logprob": -1.3351351299206726e-05, "rank": 1, "decoded_token": "</s>"}, "1032": {"logprob": -11.25001335144043, "rank": 2, "decoded_token": " "}, "1256": {"logprob": -16.00001335144043, "rank": 3, "decoded_token": " "}, "1319": {"logprob": -17.25001335144043, "rank": 4, "decoded_token": " ("}, "1766": {"logprob": -18.50001335144043, "rank": 5, "decoded_token": " ["}}]], [[1049, 1046, 1349, 7244, 10575, 53048, 41132, 3923, 1408, 1261, 32656, 11237, 1626, 1050, 1046, 1349, 15375, 24361, 4521, 94973, 5669, 1278, 48932, 2425, 1261, 16152, 1121, 21283, 1626, 1051, 1046, 8342, 71284, 7377, 1394, 22140, 1294, 1278, 27208, 1513, 97558, 1626, 1052, 1046, 1349, 53301, 59396, 3549, 13335, 2645, 1261, 1295, 3506, 11223, 12097, 1046, 2], "1. A black dog sits attentively on a wooden floor.\n2. A vast mountain range stretches across the horizon under a cloudy sky.\n3. Surfers wait for waves in the ocean at sunset.\n4. A winding gravel path leads through a lush green park.", [{"1049": {"logprob": -0.05001257359981537, "rank": 1, "decoded_token": "1"}, "1045": {"logprob": -3.1750125885009766, "rank": 2, "decoded_token": "-"}, "69957": {"logprob": -5.925012588500977, "rank": 3, "decoded_token": "Sure"}, "11745": {"logprob": -6.425012588500977, "rank": 4, "decoded_token": "Here"}, "1065": {"logprob": -6.425012588500977, "rank": 5, "decoded_token": "A"}}, {"1046": {"logprob": -8.702239938429557e-06, "rank": 1, "decoded_token": "."}, "1058": {"logprob": -12.000008583068848, "rank": 2, "decoded_token": ":"}, "3590": {"logprob": -13.375008583068848, "rank": 3, "decoded_token": ".A"}, "1041": {"logprob": -14.750008583068848, "rank": 4, "decoded_token": ")"}, "1065": {"logprob": -15.687508583068848, "rank": 5, "decoded_token": "A"}}, {"1349": {"logprob": -0.14196155965328217, "rank": 1, "decoded_token": " A"}, "1429": {"logprob": -2.2669615745544434, "rank": 2, "decoded_token": " \""}, "1531": {"logprob": -4.516961574554443, "rank": 3, "decoded_token": " The"}, "11967": {"logprob": -4.516961574554443, "rank": 4, "decoded_token": " Image"}, "1603": {"logprob": -5.391961574554443, "rank": 5, "decoded_token": " **"}}, {"7244": {"logprob": -0.14889711141586304, "rank": 1, "decoded_token": " black"}, "68076": {"logprob": -3.398897171020508, "rank": 2, "decoded_token": " cute"}, "6231": {"logprob": -3.961397171020508, "rank": 3, "decoded_token": " close"}, "38462": {"logprob": -4.273897171020508, "rank": 4, "decoded_token": " curious"}, "4329": {"logprob": -4.398897171020508, "rank": 5, "decoded_token": " large"}}, {"10575": {"logprob": -0.12091328203678131, "rank": 1, "decoded_token": " dog"}, "116572": {"logprob": -2.37091326713562, "rank": 2, "decoded_token": " puppy"}, "119075": {"logprob": -3.99591326713562, "rank": 3, "decoded_token": " Labrador"}, "15812": {"logprob": -7.683413505554199, "rank": 4, "decoded_token": " Lab"}, "8636": {"logprob": -7.808413505554199, "rank": 5, "decoded_token": " lab"}}, {"53048": {"logprob": -0.8691943287849426, "rank": 1, "decoded_token": " sits"}, "1454": {"logprob": -1.1191942691802979, "rank": 2, "decoded_token": " with"}, "1395": {"logprob": -2.431694269180298, "rank": 3, "decoded_token": " is"}, "18970": {"logprob": -2.744194269180298, "rank": 4, "decoded_token": " sitting"}, "22524": {"logprob": -3.681694269180298, "rank": 5, "decoded_token": " lies"}}, {"41132": {"logprob": -0.5939557552337646, "rank": 1, "decoded_token": " attent"}, "106534": {"logprob": -1.2814557552337646, "rank": 2, "decoded_token": " calmly"}, "12276": {"logprob": -2.8439557552337646, "rank": 3, "decoded_token": " alert"}, "1408": {"logprob": -2.8439557552337646, "rank": 4, "decoded_token": " on"}, "6482": {"logprob": -4.968955993652344, "rank": 5, "decoded_token": " patient"}}, {"3923": {"logprob": -0.00010084597306558862, "rank": 1, "decoded_token": "ively"}, "1556": {"logprob": -9.500101089477539, "rank": 2, "decoded_token": "ive"}, "6655": {"logprob": -10.875101089477539, "rank": 3, "decoded_token": "atively"}, "3929": {"logprob": -13.000101089477539, "rank": 4, "decoded_token": "ently"}, "47885": {"logprob": -13.750101089477539, "rank": 5, "decoded_token": "edly"}}, {"1408": {"logprob": -0.056158196181058884, "rank": 1, "decoded_token": " on"}, "3675": {"logprob": -3.6811583042144775, "rank": 2, "decoded_token": " against"}, "1454": {"logprob": -4.306158065795898, "rank": 3, "decoded_token": " with"}, "1294": {"logprob": -5.181158065795898, "rank": 4, "decoded_token": " in"}, "7283": {"logprob": -5.431158065795898, "rank": 5, "decoded_token": " looking"}}, {"1261": {"logprob": -0.33056098222732544, "rank": 1, "decoded_token": " a"}, "32656": {"logprob": -1.3305609226226807, "rank": 2, "decoded_token": " wooden"}, "17253": {"logprob": -4.70556116104126, "rank": 3, "decoded_token": " weather"}, "44130": {"logprob": -5.83056116104126, "rank": 4, "decoded_token": " rust"}, "12603": {"logprob": -6.58056116104126, "rank": 5, "decoded_token": " wood"}}, {"32656": {"logprob": -0.07081110030412674, "rank": 1, "decoded_token": " wooden"}, "44130": {"logprob": -2.9458110332489014, "rank": 2, "decoded_token": " rust"}, "17253": {"logprob": -4.6958112716674805, "rank": 3, "decoded_token": " weather"}, "12603": {"logprob": -5.8208112716674805, "rank": 4, "decoded_token": " wood"}, "3403": {"logprob": -6.0708112716674805, "rank": 5, "decoded_token": " text"}}, {"11237": {"logprob": -0.6428436636924744, "rank": 1, "decoded_token": " floor"}, "4691": {"logprob": -1.0178437232971191, "rank": 2, "decoded_token": " surface"}, "7042": {"logprob": -2.642843723297119, "rank": 3, "decoded_token": " background"}, "28984": {"logprob": -3.517843723297119, "rank": 4, "decoded_token": " deck"}, "92504": {"logprob": -6.017843723297119, "rank": 5, "decoded_token": " backdrop"}}, {"1626": {"logprob": -0.7337945103645325, "rank": 1, "decoded_token": ".\n"}, "1044": {"logprob": -0.8587945103645325, "rank": 2, "decoded_token": ","}, "1454": {"logprob": -3.3587944507598877, "rank": 3, "decoded_token": " with"}, "7283": {"logprob": -3.6087944507598877, "rank": 4, "decoded_token": " looking"}, "1321": {"logprob": -4.108794689178467, "rank": 5, "decoded_token": " and"}}, {"1050": {"logprob": -1.0132738680113107e-05, "rank": 1, "decoded_token": "2"}, "1051": {"logprob": -11.75001049041748, "rank": 2, "decoded_token": "3"}, "1256": {"logprob": -14.00001049041748, "rank": 3, "decoded_token": " "}, "1049": {"logprob": -14.62501049041748, "rank": 4, "decoded_token": "1"}, "1032": {"logprob": -14.62501049041748, "rank": 5, "decoded_token": " "}}, {"1046": {"logprob": -2.861018856492592e-06, "rank": 1, "decoded_token": "."}, "3590": {"logprob": -13.43750286102295, "rank": 2, "decoded_token": ".A"}, "4700": {"logprob": -15.37500286102295, "rank": 3, "decoded_token": ".M"}, "1626": {"logprob": -15.37500286102295, "rank": 4, "decoded_token": ".\n"}, "3051": {"logprob": -15.87500286102295, "rank": 5, "decoded_token": ".S"}}, {"1349": {"logprob": -0.6794427633285522, "rank": 1, "decoded_token": " A"}, "11826": {"logprob": -1.9294427633285522, "rank": 2, "decoded_token": " Maj"}, "37159": {"logprob": -2.116942882537842, "rank": 3, "decoded_token": " Snow"}, "27260": {"logprob": -2.616942882537842, "rank": 4, "decoded_token": " Mountain"}, "113465": {"logprob": -2.866942882537842, "rank": 5, "decoded_token": " Rug"}}, {"15375": {"logprob": -0.9194075465202332, "rank": 1, "decoded_token": " vast"}, "10726": {"logprob": -2.294407606124878, "rank": 2, "decoded_token": " scen"}, "4521": {"logprob": -2.356907606124878, "rank": 3, "decoded_token": " range"}, "122203": {"logprob": -2.419407606124878, "rank": 4, "decoded_token": " rugged"}, "61082": {"logprob": -2.856907606124878, "rank": 5, "decoded_token": " panor"}}, {"24361": {"logprob": -0.5804797410964966, "rank": 1, "decoded_token": " mountain"}, "127945": {"logprob": -1.8304797410964966, "rank": 2, "decoded_token": " mountainous"}, "28035": {"logprob": -2.455479621887207, "rank": 3, "decoded_token": " landscape"}, "4521": {"logprob": -2.455479621887207, "rank": 4, "decoded_token": " range"}, "1044": {"logprob": -2.705479621887207, "rank": 5, "decoded_token": ","}}, {"4521": {"logprob": -0.0493546724319458, "rank": 1, "decoded_token": " range"}, "28035": {"logprob": -3.0493545532226562, "rank": 2, "decoded_token": " landscape"}, "37691": {"logprob": -8.424354553222656, "rank": 3, "decoded_token": " valley"}, "13327": {"logprob": -9.049354553222656, "rank": 4, "decoded_token": " scene"}, "3719": {"logprob": -9.799354553222656, "rank": 5, "decoded_token": " view"}}, {"94973": {"logprob": -0.6676871180534363, "rank": 1, "decoded_token": " stretches"}, "2425": {"logprob": -1.792687177658081, "rank": 2, "decoded_token": " under"}, "1395": {"logprob": -2.292687177658081, "rank": 3, "decoded_token": " is"}, "1454": {"logprob": -2.730187177658081, "rank": 4, "decoded_token": " with"}, "7038": {"logprob": -3.292687177658081, "rank": 5, "decoded_token": " extends"}}, {"5669": {"logprob": -0.4542117118835449, "rank": 1, "decoded_token": " across"}, "2425": {"logprob": -1.454211711883545, "rank": 2, "decoded_token": " under"}, "1848": {"logprob": -2.454211711883545, "rank": 3, "decoded_token": " out"}, "2203": {"logprob": -4.204211711883545, "rank": 4, "decoded_token": " into"}, "25136": {"logprob": -4.641711711883545, "rank": 5, "decoded_token": " beneath"}}, {"1278": {"logprob": -0.23009441792964935, "rank": 1, "decoded_token": " the"}, "1261": {"logprob": -1.6050944328308105, "rank": 2, "decoded_token": " a"}, "1420": {"logprob": -5.6050944328308105, "rank": 3, "decoded_token": " an"}, "2425": {"logprob": -7.2300944328308105, "rank": 4, "decoded_token": " under"}, "1454": {"logprob": -10.167593955993652, "rank": 5, "decoded_token": " with"}}, {"48932": {"logprob": -0.3072167932987213, "rank": 1, "decoded_token": " horizon"}, "21283": {"logprob": -1.932216763496399, "rank": 2, "decoded_token": " sky"}, "3937": {"logprob": -3.1822168827056885, "rank": 3, "decoded_token": " image"}, "28035": {"logprob": -3.6822168827056885, "rank": 4, "decoded_token": " landscape"}, "3044": {"logprob": -3.6822168827056885, "rank": 5, "decoded_token": " sk"}}, {"2425": {"logprob": -0.2914469838142395, "rank": 1, "decoded_token": " under"}, "1044": {"logprob": -2.4164469242095947, "rank": 2, "decoded_token": ","}, "1454": {"logprob": -2.5414469242095947, "rank": 3, "decoded_token": " with"}, "1626": {"logprob": -3.7914469242095947, "rank": 4, "decoded_token": ".\n"}, "1408": {"logprob": -3.7914469242095947, "rank": 5, "decoded_token": " on"}}, {"1261": {"logprob": -0.0460360012948513, "rank": 1, "decoded_token": " a"}, "1420": {"logprob": -3.9210360050201416, "rank": 2, "decoded_token": " an"}, "16152": {"logprob": -4.1085357666015625, "rank": 3, "decoded_token": " cloud"}, "2136": {"logprob": -6.1710357666015625, "rank": 4, "decoded_token": " over"}, "6133": {"logprob": -6.4210357666015625, "rank": 5, "decoded_token": " clear"}}, {"16152": {"logprob": -0.20367540419101715, "rank": 1, "decoded_token": " cloud"}, "6133": {"logprob": -2.8286755084991455, "rank": 2, "decoded_token": " clear"}, "27254": {"logprob": -3.5161755084991455, "rank": 3, "decoded_token": " partly"}, "18416": {"logprob": -3.8286755084991455, "rank": 4, "decoded_token": " haz"}, "4391": {"logprob": -4.328675270080566, "rank": 5, "decoded_token": " light"}}, {"1121": {"logprob": -0.05241352692246437, "rank": 1, "decoded_token": "y"}, "1286": {"logprob": -3.8024134635925293, "rank": 2, "decoded_token": "ed"}, "77187": {"logprob": -4.552413463592529, "rank": 3, "decoded_token": "-filled"}, "4527": {"logprob": -4.802413463592529, "rank": 4, "decoded_token": "less"}, "114525": {"logprob": -4.927413463592529, "rank": 5, "decoded_token": "-covered"}}, {"21283": {"logprob": -0.0003716255014296621, "rank": 1, "decoded_token": " sky"}, "10991": {"logprob": -8.750371932983398, "rank": 2, "decoded_token": " blue"}, "1044": {"logprob": -9.375371932983398, "rank": 3, "decoded_token": ","}, "26549": {"logprob": -10.375371932983398, "rank": 4, "decoded_token": " gray"}, "34052": {"logprob": -11.250371932983398, "rank": 5, "decoded_token": " grey"}}, {"1626": {"logprob": -0.00012730741582345217, "rank": 1, "decoded_token": ".\n"}, "1044": {"logprob": -9.500126838684082, "rank": 2, "decoded_token": ","}, "1046": {"logprob": -10.500126838684082, "rank": 3, "decoded_token": "."}, "1454": {"logprob": -10.875126838684082, "rank": 4, "decoded_token": " with"}, "1294": {"logprob": -13.250126838684082, "rank": 5, "decoded_token": " in"}}, {"1051": {"logprob": -3.2186455882765586e-06, "rank": 1, "decoded_token": "3"}, "1052": {"logprob": -12.75000286102295, "rank": 2, "decoded_token": "4"}, "1050": {"logprob": -15.00000286102295, "rank": 3, "decoded_token": "2"}, "1049": {"logprob": -16.937503814697266, "rank": 4, "decoded_token": "1"}, "1032": {"logprob": -17.875003814697266, "rank": 5, "decoded_token": " "}}, {"1046": {"logprob": -1.6689286894688848e-06, "rank": 1, "decoded_token": "."}, "3590": {"logprob": -14.687501907348633, "rank": 2, "decoded_token": ".A"}, "5226": {"logprob": -15.687501907348633, "rank": 3, "decoded_token": ".D"}, "6847": {"logprob": -15.812501907348633, "rank": 4, "decoded_token": ".T"}, "48426": {"logprob": -16.812501907348633, "rank": 5, "decoded_token": ".The"}}, {"8342": {"logprob": -0.5730464458465576, "rank": 1, "decoded_token": " Sur"}, "1349": {"logprob": -1.6980464458465576, "rank": 2, "decoded_token": " A"}, "22468": {"logprob": -2.5730464458465576, "rank": 3, "decoded_token": " Several"}, "1488": {"logprob": -2.6980464458465576, "rank": 4, "decoded_token": " W"}, "15035": {"logprob": -3.1980464458465576, "rank": 5, "decoded_token": " People"}}, {"71284": {"logprob": -0.0033258858602494, "rank": 1, "decoded_token": "fers"}, "1102": {"logprob": -5.878325939178467, "rank": 2, "decoded_token": "f"}, "1726": {"logprob": -7.628325939178467, "rank": 3, "decoded_token": "fer"}, "61888": {"logprob": -12.253325462341309, "rank": 4, "decoded_token": "fline"}, "2119": {"logprob": -13.003325462341309, "rank": 5, "decoded_token": "fter"}}, {"7377": {"logprob": -1.4996429681777954, "rank": 1, "decoded_token": " wait"}, "1584": {"logprob": -1.7496429681777954, "rank": 2, "decoded_token": " are"}, "88014": {"logprob": -1.9371429681777954, "rank": 3, "decoded_token": " paddle"}, "1294": {"logprob": -1.9371429681777954, "rank": 4, "decoded_token": " in"}, "24434": {"logprob": -2.187142848968506, "rank": 5, "decoded_token": " ride"}}, {"1394": {"logprob": -0.6126739382743835, "rank": 1, "decoded_token": " for"}, "1294": {"logprob": -0.9876739382743835, "rank": 2, "decoded_token": " in"}, "1408": {"logprob": -2.7376739978790283, "rank": 3, "decoded_token": " on"}, "6482": {"logprob": -4.425173759460449, "rank": 4, "decoded_token": " patient"}, "1321": {"logprob": -5.612673759460449, "rank": 5, "decoded_token": " and"}}, {"22140": {"logprob": -0.00729279313236475, "rank": 1, "decoded_token": " waves"}, "1278": {"logprob": -5.632292747497559, "rank": 2, "decoded_token": " the"}, "1261": {"logprob": -5.757292747497559, "rank": 3, "decoded_token": " a"}, "39460": {"logprob": -8.257292747497559, "rank": 4, "decoded_token": " incoming"}, "1321": {"logprob": -9.757292747497559, "rank": 5, "decoded_token": " and"}}, {"1294": {"logprob": -0.3071398138999939, "rank": 1, "decoded_token": " in"}, "1408": {"logprob": -2.1821398735046387, "rank": 2, "decoded_token": " on"}, "1513": {"logprob": -2.4321398735046387, "rank": 3, "decoded_token": " at"}, "3016": {"logprob": -3.6821398735046387, "rank": 4, "decoded_token": " while"}, "1435": {"logprob": -3.8071398735046387, "rank": 5, "decoded_token": " as"}}, {"1278": {"logprob": -0.004646694287657738, "rank": 1, "decoded_token": " the"}, "1261": {"logprob": -6.1921467781066895, "rank": 2, "decoded_token": " a"}, "1420": {"logprob": -6.9421467781066895, "rank": 3, "decoded_token": " an"}, "40466": {"logprob": -7.2546467781066895, "rank": 4, "decoded_token": " shallow"}, "26517": {"logprob": -7.8796467781066895, "rank": 5, "decoded_token": " calm"}}, {"27208": {"logprob": -0.0658877044916153, "rank": 1, "decoded_token": " ocean"}, "7786": {"logprob": -3.440887689590454, "rank": 2, "decoded_token": " distance"}, "5124": {"logprob": -5.253387928009033, "rank": 3, "decoded_token": " early"}, "26517": {"logprob": -5.315887928009033, "rank": 4, "decoded_token": " calm"}, "11196": {"logprob": -5.378387928009033, "rank": 5, "decoded_token": " sea"}}, {"1513": {"logprob": -1.1504861116409302, "rank": 1, "decoded_token": " at"}, "1435": {"logprob": -1.2754861116409302, "rank": 2, "decoded_token": " as"}, "3184": {"logprob": -1.4004861116409302, "rank": 3, "decoded_token": " during"}, "3016": {"logprob": -2.9004859924316406, "rank": 4, "decoded_token": " while"}, "6117": {"logprob": -3.1504859924316406, "rank": 5, "decoded_token": " near"}}, {"97558": {"logprob": -0.12151996046304703, "rank": 1, "decoded_token": " sunset"}, "11729": {"logprob": -2.8715200424194336, "rank": 2, "decoded_token": " sun"}, "1266": {"logprob": -3.4965200424194336, "rank": 3, "decoded_token": " d"}, "54507": {"logprob": -3.9965200424194336, "rank": 4, "decoded_token": " dawn"}, "1261": {"logprob": -5.121520042419434, "rank": 5, "decoded_token": " a"}}, {"1626": {"logprob": -0.3073118329048157, "rank": 1, "decoded_token": ".\n"}, "1044": {"logprob": -2.182311773300171, "rank": 2, "decoded_token": ","}, "3016": {"logprob": -2.557311773300171, "rank": 3, "decoded_token": " while"}, "1454": {"logprob": -3.432311773300171, "rank": 4, "decoded_token": " with"}, "6117": {"logprob": -4.05731201171875, "rank": 5, "decoded_token": " near"}}, {"1052": {"logprob": -3.3378546504536644e-06, "rank": 1, "decoded_token": "4"}, "1051": {"logprob": -13.25000286102295, "rank": 2, "decoded_token": "3"}, "1049": {"logprob": -13.93750286102295, "rank": 3, "decoded_token": "1"}, "1053": {"logprob": -14.43750286102295, "rank": 4, "decoded_token": "5"}, "1032": {"logprob": -16.687503814697266, "rank": 5, "decoded_token": " "}}, {"1046": {"logprob": -1.6689286894688848e-06, "rank": 1, "decoded_token": "."}, "3590": {"logprob": -13.500001907348633, "rank": 2, "decoded_token": ".A"}, "6847": {"logprob": -16.437501907348633, "rank": 3, "decoded_token": ".T"}, "1044": {"logprob": -17.312501907348633, "rank": 4, "decoded_token": ","}, "1349": {"logprob": -17.375001907348633, "rank": 5, "decoded_token": " A"}}, {"1349": {"logprob": -0.004292916506528854, "rank": 1, "decoded_token": " A"}, "2048": {"logprob": -5.629292964935303, "rank": 2, "decoded_token": " An"}, "10638": {"logprob": -7.879292964935303, "rank": 3, "decoded_token": " Two"}, "111463": {"logprob": -10.004292488098145, "rank": 4, "decoded_token": " Trees"}, "1531": {"logprob": -10.879292488098145, "rank": 5, "decoded_token": " The"}}, {"53301": {"logprob": -1.5473321676254272, "rank": 1, "decoded_token": " winding"}, "15192": {"logprob": -1.7348321676254272, "rank": 2, "decoded_token": " narrow"}, "47945": {"logprob": -2.109832286834717, "rank": 3, "decoded_token": " dirt"}, "2169": {"logprob": -2.609832286834717, "rank": 4, "decoded_token": " ser"}, "59396": {"logprob": -2.672332286834717, "rank": 5, "decoded_token": " gravel"}}, {"59396": {"logprob": -0.8954829573631287, "rank": 1, "decoded_token": " gravel"}, "3549": {"logprob": -1.1454830169677734, "rank": 2, "decoded_token": " path"}, "47945": {"logprob": -1.6454830169677734, "rank": 3, "decoded_token": " dirt"}, "14801": {"logprob": -3.2704830169677734, "rank": 4, "decoded_token": " pathway"}, "15551": {"logprob": -4.270483016967773, "rank": 5, "decoded_token": " stone"}}, {"3549": {"logprob": -0.02117946185171604, "rank": 1, "decoded_token": " path"}, "14801": {"logprob": -3.896179437637329, "rank": 2, "decoded_token": " pathway"}, "33659": {"logprob": -8.14617919921875, "rank": 3, "decoded_token": " trail"}, "9480": {"logprob": -9.64617919921875, "rank": 4, "decoded_token": " road"}, "7368": {"logprob": -9.64617919921875, "rank": 5, "decoded_token": "path"}}, {"13335": {"logprob": -0.18962937593460083, "rank": 1, "decoded_token": " leads"}, "39985": {"logprob": -2.752129316329956, "rank": 2, "decoded_token": " cuts"}, "1639": {"logprob": -3.877129316329956, "rank": 3, "decoded_token": " me"}, "11500": {"logprob": -3.939629316329956, "rank": 4, "decoded_token": " runs"}, "2645": {"logprob": -4.189629554748535, "rank": 5, "decoded_token": " through"}}, {"2645": {"logprob": -0.05349981039762497, "rank": 1, "decoded_token": " through"}, "8994": {"logprob": -4.053499698638916, "rank": 2, "decoded_token": " towards"}, "2396": {"logprob": -4.303499698638916, "rank": 3, "decoded_token": " between"}, "2203": {"logprob": -4.678499698638916, "rank": 4, "decoded_token": " into"}, "1317": {"logprob": -5.678499698638916, "rank": 5, "decoded_token": " to"}}, {"1261": {"logprob": -0.017386287450790405, "rank": 1, "decoded_token": " a"}, "11223": {"logprob": -4.892386436462402, "rank": 2, "decoded_token": " green"}, "1295": {"logprob": -5.017386436462402, "rank": 3, "decoded_token": " l"}, "23170": {"logprob": -6.642386436462402, "rank": 4, "decoded_token": " grass"}, "1420": {"logprob": -7.267386436462402, "rank": 5, "decoded_token": " an"}}, {"1295": {"logprob": -0.9453322887420654, "rank": 1, "decoded_token": " l"}, "11223": {"logprob": -1.3203322887420654, "rank": 2, "decoded_token": " green"}, "23170": {"logprob": -1.9453322887420654, "rank": 3, "decoded_token": " grass"}, "12097": {"logprob": -2.4453322887420654, "rank": 4, "decoded_token": " park"}, "26428": {"logprob": -3.3203322887420654, "rank": 5, "decoded_token": " garden"}}, {"3506": {"logprob": -6.556489552167477e-06, "rank": 1, "decoded_token": "ush"}, "1374": {"logprob": -12.000006675720215, "rank": 2, "decoded_token": "us"}, "90716": {"logprob": -15.625006675720215, "rank": 3, "decoded_token": "USH"}, "16938": {"logprob": -15.875006675720215, "rank": 4, "decoded_token": "usher"}, "13326": {"logprob": -17.1875057220459, "rank": 5, "decoded_token": "inden"}}, {"11223": {"logprob": -0.3668670654296875, "rank": 1, "decoded_token": " green"}, "1044": {"logprob": -1.3668670654296875, "rank": 2, "decoded_token": ","}, "26428": {"logprob": -3.4918670654296875, "rank": 3, "decoded_token": " garden"}, "12097": {"logprob": -4.1168670654296875, "rank": 4, "decoded_token": " park"}, "23170": {"logprob": -5.8668670654296875, "rank": 5, "decoded_token": " grass"}}, {"12097": {"logprob": -0.5530153512954712, "rank": 1, "decoded_token": " park"}, "3727": {"logprob": -2.0530152320861816, "rank": 2, "decoded_token": " field"}, "28035": {"logprob": -2.1780152320861816, "rank": 3, "decoded_token": " landscape"}, "26428": {"logprob": -2.3030152320861816, "rank": 4, "decoded_token": " garden"}, "4457": {"logprob": -2.8030152320861816, "rank": 5, "decoded_token": " area"}}, {"1046": {"logprob": -0.7924000024795532, "rank": 1, "decoded_token": "."}, "1454": {"logprob": -1.2924000024795532, "rank": 2, "decoded_token": " with"}, "8994": {"logprob": -2.7923998832702637, "rank": 3, "decoded_token": " towards"}, "54410": {"logprob": -3.5423998832702637, "rank": 4, "decoded_token": " lined"}, "2425": {"logprob": -3.5423998832702637, "rank": 5, "decoded_token": " under"}}, {"2": {"logprob": -1.9073468138230965e-06, "rank": 1, "decoded_token": "</s>"}, "1032": {"logprob": -13.250001907348633, "rank": 2, "decoded_token": " "}, "1256": {"logprob": -16.250001907348633, "rank": 3, "decoded_token": " "}, "1293": {"logprob": -19.000001907348633, "rank": 4, "decoded_token": " "}, "1319": {"logprob": -20.000001907348633, "rank": 5, "decoded_token": " ("}}]]]
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0
import copy
from functools import partial
from typing import Optional, Union
import numpy as np
import pytest
from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk,
UserMessage)
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from PIL import Image
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
from vllm.config import ModelConfig
from vllm.inputs import InputProcessingContext
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.processing import ProcessingCache
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
from vllm.multimodal.inputs import MultiModalInputs
from vllm.multimodal.processing import BaseMultiModalProcessor, ProcessingCache
from vllm.transformers_utils.tokenizer import (MistralTokenizer,
cached_tokenizer_from_config)
from ....multimodal.utils import random_audio, random_image, random_video
from ...registry import HF_EXAMPLE_MODELS
......@@ -21,6 +29,7 @@ def _test_processing_correctness(
hit_rate: float,
num_batches: int,
simplify_rate: float,
ignore_mm_keys: Optional[list[str]] = None,
):
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
model_info.check_available_online(on_fail="skip")
......@@ -29,8 +38,8 @@ def _test_processing_correctness(
model_config = ModelConfig(
model_id,
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
tokenizer=model_info.tokenizer or model_id,
tokenizer_mode=model_info.tokenizer_mode,
trust_remote_code=model_info.trust_remote_code,
seed=0,
dtype="float16",
......@@ -45,7 +54,7 @@ def _test_processing_correctness(
tokenizer=cached_tokenizer_from_config(model_config),
)
# Ensure that it can fit all of the data
cache = ProcessingCache(capacity=1 << 30)
cache = ProcessingCache(capacity_gb=2048)
processing_info = factories.info(ctx)
supported_mm_limits = processing_info.get_supported_mm_limits()
......@@ -82,14 +91,6 @@ def _test_processing_correctness(
partial(random_audio, rng, min_len=512, max_len=1024, sr=16000),
}
tokenizer_encode_kwargs = {}
if model_config.hf_config.model_type == "mllama":
# For Mllama, tokenizer will always add bos_token at the beginning of
# prompt by default, causing hf_processor outputs incorrect token ids.
# So we need use `add_special_tokens=False` here to leave bos_token
# to be added by the processor.
tokenizer_encode_kwargs = {"add_special_tokens": False}
for batch_idx in range(num_batches):
mm_data = {
k:
......@@ -112,37 +113,131 @@ def _test_processing_correctness(
elif len(mm_data[k]) == 1:
mm_data[k] = mm_data[k][0]
baseline_result = baseline_processor.apply(
prompt,
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
cached_result = cached_processor.apply(
prompt,
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
if isinstance(tokenizer, MistralTokenizer):
_test_processing_correctness_mistral(
model_config,
tokenizer,
prompt,
mm_data,
baseline_processor,
cached_processor,
batch_idx,
ignore_mm_keys=ignore_mm_keys,
)
else:
_test_processing_correctness_hf(
model_config,
tokenizer,
prompt,
mm_data,
baseline_processor,
cached_processor,
batch_idx,
ignore_mm_keys=ignore_mm_keys,
)
def _test_processing_correctness_hf(
model_config: ModelConfig,
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
prompt: str,
mm_data: MultiModalDataDict,
baseline_processor: BaseMultiModalProcessor,
cached_processor: BaseMultiModalProcessor,
batch_idx: int,
ignore_mm_keys: Optional[list[str]] = None,
):
if model_config.hf_config.model_type in ("mllama", "whisper", "ultravox"):
# For some multimodal models, tokenizer will always add bos_token
# at the beginning of prompt by default, causing hf_processor outputs
# incorrect token ids. So we need use `add_special_tokens=False` here
# to leave bos_token to be added by the processor.
token_prompt = tokenizer.encode(prompt, add_special_tokens=False)
else:
token_prompt = tokenizer.encode(prompt)
baseline_result = baseline_processor.apply(
prompt,
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
cached_result = cached_processor.apply(
prompt,
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
assert _inputs_equal(
baseline_result,
cached_result,
ignore_mm_keys,
), f"Failed ({batch_idx=}, {prompt=}, {mm_data=})"
assert baseline_result == cached_result, (
f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
baseline_tokenized_result = baseline_processor.apply(
token_prompt,
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
baseline_tokenized_result = baseline_processor.apply(
tokenizer.encode(prompt, **tokenizer_encode_kwargs),
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
assert _inputs_equal(
baseline_result,
baseline_tokenized_result,
ignore_mm_keys,
), f"Failed ({batch_idx=}, {prompt=}, {mm_data=})"
assert baseline_result == baseline_tokenized_result, (
f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
cached_tokenized_result = cached_processor.apply(
token_prompt,
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
cached_tokenized_result = cached_processor.apply(
tokenizer.encode(prompt, **tokenizer_encode_kwargs),
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
assert _inputs_equal(
cached_result,
cached_tokenized_result,
ignore_mm_keys,
), f"Failed ({batch_idx=}, {prompt=}, {mm_data=})"
assert cached_result == cached_tokenized_result, (
f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
def _test_processing_correctness_mistral(
model_config: ModelConfig,
tokenizer: MistralTokenizer,
prompt: str,
mm_data: MultiModalDataDict,
baseline_processor: BaseMultiModalProcessor,
cached_processor: BaseMultiModalProcessor,
batch_idx: int,
ignore_mm_keys: Optional[list[str]] = None,
):
images = mm_data.get("image", [])
if not isinstance(images, list):
images = [images]
request = ChatCompletionRequest(messages=[
UserMessage(content=[
TextChunk(text=prompt),
*(ImageChunk(image=image) for image in images),
]),
])
res = tokenizer.mistral.encode_chat_completion(request)
token_prompt = res.tokens
# Mistral chat outputs tokens directly, rather than text prompts
baseline_tokenized_result = baseline_processor.apply(
token_prompt,
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
cached_tokenized_result = cached_processor.apply(
token_prompt,
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
assert _inputs_equal(
baseline_tokenized_result,
cached_tokenized_result,
ignore_mm_keys,
), f"Failed ({batch_idx=}, {prompt=}, {mm_data=})"
# yapf: disable
......@@ -151,7 +246,9 @@ def _test_processing_correctness(
"Salesforce/blip2-opt-2.7b",
"facebook/chameleon-7b",
"deepseek-ai/deepseek-vl2-tiny",
"microsoft/Florence-2-base",
"adept/fuyu-8b",
"google/gemma-3-4b-it",
"THUDM/glm-4v-9b",
"h2oai/h2ovl-mississippi-800m",
"OpenGVLab/InternVL2-1B",
......@@ -162,6 +259,7 @@ def _test_processing_correctness(
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
"meta-llama/Llama-3.2-11B-Vision-Instruct",
"TIGER-Lab/Mantis-8B-siglip-llama3",
"mistralai/Pixtral-12B-2409",
"mistral-community/pixtral-12b",
"openbmb/MiniCPM-o-2_6",
"openbmb/MiniCPM-V-2_6",
......@@ -173,6 +271,9 @@ def _test_processing_correctness(
"Qwen/Qwen2.5-VL-3B-Instruct",
"Qwen/Qwen2-Audio-7B-Instruct",
"fixie-ai/ultravox-v0_5-llama-3_2-1b",
"openai/whisper-large-v3",
"google/paligemma-3b-mix-224",
"google/paligemma2-3b-ft-docci-448",
])
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32])
......@@ -184,16 +285,24 @@ def test_processing_correctness(
num_batches: int,
simplify_rate: float,
):
ignore_mm_keys = None
if 'ultravox' in model_id:
# In Ultravox, the audio_features can be different depending on padding
# The slight difference should not be a problem though, since
# attention_mask lets us ignore the difference.
ignore_mm_keys = ['audio_features']
_test_processing_correctness(
model_id,
hit_rate=hit_rate,
num_batches=num_batches,
simplify_rate=simplify_rate,
ignore_mm_keys=ignore_mm_keys,
)
# yapf: disable
@pytest.mark.parametrize("model_id", ["microsoft/Phi-3-vision-128k-instruct"])
@pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"])
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32])
@pytest.mark.parametrize("simplify_rate", [1.0])
......@@ -217,3 +326,40 @@ def test_processing_correctness_phi3v(
num_batches=num_batches,
simplify_rate=simplify_rate,
)
def _inputs_equal(
a: MultiModalInputs,
b: MultiModalInputs,
ignore_mm_keys: Optional[list[str]] = None,
):
return _drop_mm_kwargs_keys(a, ignore_mm_keys) == _drop_mm_kwargs_keys(
b, ignore_mm_keys)
def _drop_mm_kwargs_keys(
result: MultiModalInputs,
ignore_mm_keys: Optional[list[str]] = None,
) -> MultiModalInputs:
"""Drop specified keys from result['mm_kwargs'].
This is mainly to avoid doing exact match of audio_features in ultravox.
Args:
result: Result to drop keys from
ignore_mm_keys: List of keys to ignore, e.g. ['audio_features']
"""
if not ignore_mm_keys:
return result
if 'mm_kwargs' in result:
result = copy.deepcopy(result)
mm_kwargs = result['mm_kwargs']
for key in ignore_mm_keys:
mm_kwargs.pop(key, None)
for items in mm_kwargs._items_by_modality.values():
for item in items:
for key in ignore_mm_keys:
item.pop(key, None)
return result
# SPDX-License-Identifier: Apache-2.0
"""Tests for H2OVL's multimodal preprocessing kwargs."""
from typing import Mapping, Optional
from collections.abc import Mapping
from typing import Optional
import pytest
from PIL import Image
......@@ -95,14 +96,14 @@ def _run_check(
tokenizer = processor.info.get_tokenizer()
config = processor.info.get_hf_config()
prompt = "<image>" * len(images)
mm_data = {"image": images}
total_expected_num_patches = sum(
_get_expected_num_patches(config, image, len(images), min_num, max_num)
for image in images)
processed_inputs = processor.apply("<image>" * len(images), mm_data,
mm_processor_kwargs)
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
# Ensure we have the right number of placeholders per num_crops size
image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
......@@ -151,9 +152,7 @@ def test_processor_override(
}
ctx = build_model_context(
model_name=model_id,
tokenizer_name=model_id,
trust_remote_code=True,
model_id,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": len(size_factors)},
)
......
......@@ -11,10 +11,8 @@ from ....conftest import _ImageAssets
from ...utils import build_model_context
from ....utils import models_path_prefix
models = [os.path.join(models_path_prefix, "HuggingFaceM4/Idefics3-8B-Llama3")]
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("model_id", [os.path.join(models_path_prefix, "HuggingFaceM4/Idefics3-8B-Llama3")])
# yapf: disable
@pytest.mark.parametrize(
("mm_processor_kwargs", "expected_toks_per_img"),
......@@ -27,7 +25,7 @@ models = [os.path.join(models_path_prefix, "HuggingFaceM4/Idefics3-8B-Llama3")]
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
image_assets: _ImageAssets,
model: str,
model_id: str,
mm_processor_kwargs: dict[str, object],
expected_toks_per_img: int,
num_imgs: int,
......@@ -38,9 +36,7 @@ def test_processor_override(
# in this test and assume that the kwargs will be correctly expanded by
# the partial when calling the custom input processor.
ctx = build_model_context(
model_name=model,
tokenizer_name=model,
trust_remote_code=True,
model_id,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs},
)
......
# SPDX-License-Identifier: Apache-2.0
"""Tests for InternVL's multimodal preprocessing kwargs."""
from typing import Mapping, Optional
from collections.abc import Mapping
from typing import Optional
import os
import pytest
......@@ -57,14 +58,14 @@ def _run_check(
tokenizer = processor.info.get_tokenizer()
config = processor.info.get_hf_config()
prompt = "<image>" * len(images)
mm_data = {"image": images}
total_expected_num_patches = sum(
_get_expected_num_patches(config, image, len(images), min_num, max_num)
for image in images)
processed_inputs = processor.apply("<image>" * len(images), mm_data,
mm_processor_kwargs)
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
# Ensure we have the right number of placeholders per num_crops size
image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
......@@ -110,9 +111,7 @@ def test_processor_override(
}
ctx = build_model_context(
model_name=model_id,
tokenizer_name=model_id,
trust_remote_code=True,
model_id,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": len(size_factors)},
)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment