Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
88c83041
Unverified
Commit
88c83041
authored
May 02, 2025
by
Isotr0py
Committed by
GitHub
May 01, 2025
Browse files
[Model] Refactor Ovis2 to support original tokenizer (#17537)
Signed-off-by:
Isotr0py
<
2037008807@qq.com
>
parent
6768ff4a
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
102 additions
and
48 deletions
+102
-48
examples/offline_inference/vision_language.py
examples/offline_inference/vision_language.py
+0
-2
examples/offline_inference/vision_language_multi_image.py
examples/offline_inference/vision_language_multi_image.py
+0
-2
tests/models/registry.py
tests/models/registry.py
+0
-1
vllm/model_executor/models/ovis2.py
vllm/model_executor/models/ovis2.py
+66
-9
vllm/transformers_utils/processors/ovis2.py
vllm/transformers_utils/processors/ovis2.py
+36
-34
No files found.
examples/offline_inference/vision_language.py
View file @
88c83041
...
@@ -730,11 +730,9 @@ def run_ovis2(questions: list[str], modality: str) -> ModelRequestData:
...
@@ -730,11 +730,9 @@ def run_ovis2(questions: list[str], modality: str) -> ModelRequestData:
assert
modality
==
"image"
assert
modality
==
"image"
model_name
=
"AIDC-AI/Ovis2-1B"
model_name
=
"AIDC-AI/Ovis2-1B"
tokenizer
=
"Isotr0py/Ovis2-tokenizer"
engine_args
=
EngineArgs
(
engine_args
=
EngineArgs
(
model
=
model_name
,
model
=
model_name
,
tokenizer
=
tokenizer
,
max_model_len
=
4096
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
...
...
examples/offline_inference/vision_language_multi_image.py
View file @
88c83041
...
@@ -439,11 +439,9 @@ def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
...
@@ -439,11 +439,9 @@ def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
# Ovis2
# Ovis2
def
load_ovis2
(
question
:
str
,
image_urls
:
list
[
str
])
->
ModelRequestData
:
def
load_ovis2
(
question
:
str
,
image_urls
:
list
[
str
])
->
ModelRequestData
:
model_name
=
"AIDC-AI/Ovis2-1B"
model_name
=
"AIDC-AI/Ovis2-1B"
tokenizer
=
"Isotr0py/Ovis2-tokenizer"
engine_args
=
EngineArgs
(
engine_args
=
EngineArgs
(
model
=
model_name
,
model
=
model_name
,
tokenizer
=
tokenizer
,
max_model_len
=
8192
,
max_model_len
=
8192
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
...
...
tests/models/registry.py
View file @
88c83041
...
@@ -349,7 +349,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -349,7 +349,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
transformers_version_reason
=
"Use of deprecated imports which have been removed."
,
# noqa: E501
transformers_version_reason
=
"Use of deprecated imports which have been removed."
,
# noqa: E501
extras
=
{
"phi3.5"
:
"microsoft/Phi-3.5-vision-instruct"
}),
# noqa: E501
extras
=
{
"phi3.5"
:
"microsoft/Phi-3.5-vision-instruct"
}),
# noqa: E501
"Ovis2ForConditionalGeneration"
:
_HfExamplesInfo
(
"AIDC-AI/Ovis2-1B"
,
"Ovis2ForConditionalGeneration"
:
_HfExamplesInfo
(
"AIDC-AI/Ovis2-1B"
,
tokenizer
=
"Isotr0py/Ovis2-tokenizer"
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
hf_overrides
=
{
"architectures"
:
[
"Ovis2ForConditionalGeneration"
]}),
# noqa: E501
hf_overrides
=
{
"architectures"
:
[
"Ovis2ForConditionalGeneration"
]}),
# noqa: E501
"Phi4MMForCausalLM"
:
_HfExamplesInfo
(
"microsoft/Phi-4-multimodal-instruct"
,
"Phi4MMForCausalLM"
:
_HfExamplesInfo
(
"microsoft/Phi-4-multimodal-instruct"
,
...
...
vllm/model_executor/models/ovis2.py
View file @
88c83041
...
@@ -46,8 +46,7 @@ from .utils import merge_multimodal_embeddings
...
@@ -46,8 +46,7 @@ from .utils import merge_multimodal_embeddings
# Cannot find the following number from hf config.
# Cannot find the following number from hf config.
IMAGE_TOKEN
=
"<image>"
IMAGE_TOKEN
=
"<image>"
IMAGE_ATOM_TOKEN_ID
=
151666
IMAGE_PAD_TOKEN_ID
=
151655
IMAGE_PAD_TOKEN_ID
=
151672
NUMBER_OF_TOKEN_TO_RESERVE_FOR_SEGMENT
=
256
NUMBER_OF_TOKEN_TO_RESERVE_FOR_SEGMENT
=
256
...
@@ -59,6 +58,12 @@ class Ovis2ImagePatchInputs(TypedDict):
...
@@ -59,6 +58,12 @@ class Ovis2ImagePatchInputs(TypedDict):
`(batch_size * num_patches, patch_size_x * patch_size_y * num_channels)`
`(batch_size * num_patches, patch_size_x * patch_size_y * num_channels)`
"""
"""
inducator_tokens
:
torch
.
Tensor
"""
Shape:
`(batch_size * (num_patches + 1))`
"""
patches_per_image
:
List
[
int
]
patches_per_image
:
List
[
int
]
"""
"""
List of number of total patches for each image in the batch.
List of number of total patches for each image in the batch.
...
@@ -138,6 +143,21 @@ class Ovis2DummyInputsBuilder(BaseDummyInputsBuilder[Ovis2ProcessingInfo]):
...
@@ -138,6 +143,21 @@ class Ovis2DummyInputsBuilder(BaseDummyInputsBuilder[Ovis2ProcessingInfo]):
class
Ovis2MultiModalProcessor
(
BaseMultiModalProcessor
[
Ovis2ProcessingInfo
]):
class
Ovis2MultiModalProcessor
(
BaseMultiModalProcessor
[
Ovis2ProcessingInfo
]):
def
image_indicators_to_visual_tokens
(
self
,
image_indicators
:
list
[
int
],
)
->
list
[
int
]:
"""
Filter image indicators placeholders and convert them to corresponding
tokens in visual tokenizer.
For example, [-301, -300, -302, -300, -303, -300, -304, -300, -305]
should return [vocab_size-1, vocab_size-2, ..., vocab_size-5]
"""
hf_config
=
self
.
info
.
get_hf_config
()
vte_vocab_size
=
hf_config
.
visual_tokenizer_config
.
vocab_size
# -300 is image_atom token, filter them out
return
[
vte_vocab_size
+
x
+
300
for
x
in
image_indicators
if
x
<
-
300
]
def
_call_hf_processor
(
def
_call_hf_processor
(
self
,
self
,
prompt
:
str
,
prompt
:
str
,
...
@@ -156,6 +176,16 @@ class Ovis2MultiModalProcessor(BaseMultiModalProcessor[Ovis2ProcessingInfo]):
...
@@ -156,6 +176,16 @@ class Ovis2MultiModalProcessor(BaseMultiModalProcessor[Ovis2ProcessingInfo]):
mm_kwargs
=
mm_kwargs
,
mm_kwargs
=
mm_kwargs
,
)
)
hf_processor
=
self
.
info
.
get_hf_processor
()
image_indicators
=
[
hf_processor
.
construct_image_indicators
(
grid
)
for
grid
in
processed_outputs
[
"grids"
]
]
indicator_tokens
=
[
self
.
image_indicators_to_visual_tokens
(
indicator
)
for
indicator
in
image_indicators
]
processed_outputs
[
"indicator_tokens"
]
=
indicator_tokens
return
processed_outputs
return
processed_outputs
def
_apply_hf_processor_tokens_only
(
def
_apply_hf_processor_tokens_only
(
...
@@ -171,7 +201,8 @@ class Ovis2MultiModalProcessor(BaseMultiModalProcessor[Ovis2ProcessingInfo]):
...
@@ -171,7 +201,8 @@ class Ovis2MultiModalProcessor(BaseMultiModalProcessor[Ovis2ProcessingInfo]):
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
)
->
Mapping
[
str
,
MultiModalFieldConfig
]:
)
->
Mapping
[
str
,
MultiModalFieldConfig
]:
return
dict
(
pixel_values
=
MultiModalFieldConfig
.
batched
(
"image"
),
return
dict
(
pixel_values
=
MultiModalFieldConfig
.
batched
(
"image"
),
grids
=
MultiModalFieldConfig
.
batched
(
"image"
))
grids
=
MultiModalFieldConfig
.
batched
(
"image"
),
indicator_tokens
=
MultiModalFieldConfig
.
batched
(
"image"
))
def
_get_prompt_updates
(
def
_get_prompt_updates
(
self
,
self
,
...
@@ -230,20 +261,28 @@ class Ovis2ForConditionalGeneration(nn.Module, SupportsMultiModal):
...
@@ -230,20 +261,28 @@ class Ovis2ForConditionalGeneration(nn.Module, SupportsMultiModal):
def
_parse_and_validate_image_input
(
def
_parse_and_validate_image_input
(
self
,
**
kwargs
:
object
)
->
Optional
[
Ovis2ImagePatchInputs
]:
self
,
**
kwargs
:
object
)
->
Optional
[
Ovis2ImagePatchInputs
]:
pixel_values
=
kwargs
.
pop
(
"pixel_values"
,
None
)
pixel_values
=
kwargs
.
pop
(
"pixel_values"
,
None
)
if
pixel_values
is
None
:
indicator_tokens
=
kwargs
.
pop
(
"indicator_tokens"
,
None
)
if
pixel_values
is
None
and
indicator_tokens
is
None
:
return
None
return
None
if
pixel_values
is
not
None
:
if
pixel_values
is
not
None
and
indicator_tokens
is
not
None
:
if
not
isinstance
(
pixel_values
,
(
torch
.
Tensor
,
list
)):
if
not
isinstance
(
pixel_values
,
(
torch
.
Tensor
,
list
)):
raise
ValueError
(
"Incorrect type of pixel values. "
raise
ValueError
(
"Incorrect type of pixel values. "
f
"Got type:
{
type
(
pixel_values
)
}
"
)
f
"Got type:
{
type
(
pixel_values
)
}
"
)
if
not
isinstance
(
indicator_tokens
,
(
torch
.
Tensor
,
list
)):
raise
ValueError
(
"Incorrect type of indicator_tokens. "
f
"Got type:
{
type
(
pixel_values
)
}
"
)
return
Ovis2ImagePatchInputs
(
return
Ovis2ImagePatchInputs
(
type
=
"image_patches"
,
type
=
"image_patches"
,
flat_data
=
flatten_bn
(
flatten_bn
(
pixel_values
),
concat
=
True
),
flat_data
=
flatten_bn
(
flatten_bn
(
pixel_values
),
concat
=
True
),
patches_per_image
=
[
patches_per_image
=
[
x
.
shape
[
0
]
for
x
in
flatten_bn
(
pixel_values
)
x
.
shape
[
0
]
for
x
in
flatten_bn
(
pixel_values
)
],
],
indicator_tokens
=
flatten_bn
(
flatten_bn
(
indicator_tokens
),
concat
=
True
),
)
)
raise
AssertionError
(
"This line should be unreachable."
)
raise
AssertionError
(
"This line should be unreachable."
)
...
@@ -252,15 +291,33 @@ class Ovis2ForConditionalGeneration(nn.Module, SupportsMultiModal):
...
@@ -252,15 +291,33 @@ class Ovis2ForConditionalGeneration(nn.Module, SupportsMultiModal):
self
,
image_input
:
Ovis2ImagePatchInputs
)
->
MultiModalEmbeddings
:
self
,
image_input
:
Ovis2ImagePatchInputs
)
->
MultiModalEmbeddings
:
image_patches_flat
=
image_input
[
"flat_data"
]
image_patches_flat
=
image_input
[
"flat_data"
]
patches_per_image
=
image_input
[
"patches_per_image"
]
patches_per_image
=
image_input
[
"patches_per_image"
]
indicator_tokens
=
image_input
[
"indicator_tokens"
]
indicator_per_image
=
list
(
map
(
lambda
x
:
x
+
1
if
x
>
1
else
x
+
2
,
patches_per_image
))
target_dtype
=
self
.
visual_tokenizer
.
dtype
target_dtype
=
self
.
visual_tokenizer
.
dtype
visual_tokens
=
self
.
visual_tokenizer
(
visual_tokens
=
self
.
visual_tokenizer
(
image_patches_flat
.
to
(
target_dtype
))
image_patches_flat
.
to
(
target_dtype
))
visual_embeds
=
self
.
vte
(
visual_tokens
)
# 1:1 numeric eq.
visual_embeds
=
self
.
vte
(
visual_tokens
)
# 1:1 numeric eq.
return
tuple
(
indicator_embeds
=
self
.
vte
(
indicator_tokens
)
x
.
flatten
(
0
,
1
)
indicator_embeds_per_image
=
indicator_embeds
.
split
(
for
x
in
visual_embeds
.
split
(
patches_per_image
,
dim
=
0
))
indicator_per_image
)
visual_embeds_per_image
=
visual_embeds
.
split
(
patches_per_image
,
dim
=
0
)
vision_embeddings
=
[]
for
indicator
,
visual
in
zip
(
indicator_embeds_per_image
,
visual_embeds_per_image
):
vision_embeddings_per_image
=
[]
for
i
in
range
(
visual
.
shape
[
0
]):
vision_embeddings_per_image
.
append
(
torch
.
cat
([
indicator
[
i
:
i
+
1
],
visual
[
i
]],
dim
=
0
))
vision_embeddings_per_image
.
append
(
indicator
[
i
+
1
:])
vision_embeddings
.
append
(
torch
.
cat
(
vision_embeddings_per_image
,
dim
=
0
))
return
tuple
(
vision_embeddings
)
def
get_multimodal_embeddings
(
def
get_multimodal_embeddings
(
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
...
@@ -281,7 +338,7 @@ class Ovis2ForConditionalGeneration(nn.Module, SupportsMultiModal):
...
@@ -281,7 +338,7 @@ class Ovis2ForConditionalGeneration(nn.Module, SupportsMultiModal):
if
multimodal_embeddings
is
not
None
:
if
multimodal_embeddings
is
not
None
:
inputs_embeds
=
merge_multimodal_embeddings
(
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
,
inputs_embeds
,
multimodal_embeddings
,
input_ids
,
inputs_embeds
,
multimodal_embeddings
,
[
IMAGE_ATOM_TOKEN_ID
,
IMAGE_PAD_TOKEN_ID
])
[
IMAGE_PAD_TOKEN_ID
])
return
inputs_embeds
return
inputs_embeds
def
forward
(
def
forward
(
...
...
vllm/transformers_utils/processors/ovis2.py
View file @
88c83041
...
@@ -69,20 +69,21 @@ class OvisProcessor(ProcessorMixin):
...
@@ -69,20 +69,21 @@ class OvisProcessor(ProcessorMixin):
image_processor_class
=
"AutoImageProcessor"
image_processor_class
=
"AutoImageProcessor"
tokenizer_class
=
"Qwen2Tokenizer"
tokenizer_class
=
"Qwen2Tokenizer"
def
__init__
(
self
,
image_processor
=
None
,
tokenizer
=
None
,
chat_template
=
None
,
**
kwargs
):
def
__init__
(
self
,
image_processor
=
None
,
tokenizer
=
None
,
chat_template
=
None
,
image_pad_token
=
None
,
**
kwargs
):
self
.
image_token
=
"<
|
image
_pad|>"
if
not
hasattr
(
tokenizer
,
"image_token"
)
else
tokenizer
.
image_token
self
.
image_token
=
"<image
>"
self
.
video
_token
=
"<|
video
_pad|>"
if
not
hasattr
(
tokenizer
,
"video_token"
)
else
tokenizer
.
video
_token
self
.
image_pad
_token
=
"<|
image
_pad|>"
if
image_pad_token
is
None
else
image_pad
_token
super
().
__init__
(
image_processor
,
tokenizer
,
chat_template
=
chat_template
)
super
().
__init__
(
image_processor
,
tokenizer
,
chat_template
=
chat_template
)
self
.
image_pad_token_id
=
self
.
tokenizer
.
get_vocab
()[
self
.
image_pad_token
]
self
.
extra_special_tokens
=
{
self
.
extra_special_tokens
=
{
"image_token"
:
"<image>"
,
"image_token"
:
-
200
,
"image_atom"
:
"<image_atom>"
,
"image_atom"
:
-
300
,
"image_start"
:
"<img>"
,
"image_start"
:
-
301
,
"image_prefix"
:
"<pre>"
,
"image_prefix"
:
-
302
,
"image_col_sep"
:
"<col>"
,
"image_col_sep"
:
-
303
,
"image_row_sep"
:
"<row>"
,
"image_row_sep"
:
-
304
,
"image_end"
:
"</img>"
,
"image_end"
:
-
305
,
'image_pad'
:
'<
image_pad
>'
,
'image_pad'
:
self
.
image_pad
_token_id
,
}
}
def
__call__
(
def
__call__
(
...
@@ -157,58 +158,44 @@ class OvisProcessor(ProcessorMixin):
...
@@ -157,58 +158,44 @@ class OvisProcessor(ProcessorMixin):
if
not
isinstance
(
text
,
list
):
if
not
isinstance
(
text
,
list
):
text
=
[
text
]
text
=
[
text
]
tokenized_batched_text
=
self
.
tokenizer
.
batch_encode_plus
(
tokenized_batched_text
=
self
.
_tokenize_with_image_symbol
(
text
)
text
,
**
output_kwargs
[
"text_kwargs"
]
)
image_token_id
=
self
.
get_token_value
(
"image_token"
)
image_token_id
=
self
.
get_token_value
(
"image_token"
)
replaced_ids_list
=
[]
replaced_ids_list
=
[]
replaced_attn_mask_list
=
[]
idx
=
0
idx
=
0
for
ids_tensor
,
attn_mask
in
zip
(
tokenized_batched_text
[
'input_ids'
],
for
ids_tensor
in
tokenized_batched_text
:
tokenized_batched_text
[
'attention_mask'
]):
if
image_token_id
in
ids_tensor
and
"image_placeholders"
in
image_features
:
if
image_token_id
in
ids_tensor
and
"image_placeholders"
in
image_features
:
if
idx
<
len
(
image_features
[
"image_placeholders"
]):
if
idx
<
len
(
image_features
[
"image_placeholders"
]):
# Converts in list for ease of use
# Converts in list for ease of use
ids_list
=
ids_tensor
.
tolist
()
ids_list
=
ids_tensor
.
tolist
()
attn_list
=
attn_mask
.
tolist
()
new_ids
=
[]
new_ids
=
[]
new_attn
=
[]
# replace placeholders
# replace placeholders
for
i
,
token_id
in
enumerate
(
ids_list
):
for
i
,
token_id
in
enumerate
(
ids_list
):
if
token_id
==
image_token_id
:
if
token_id
==
image_token_id
:
placeholder_ids
=
image_features
[
"image_placeholders"
][
idx
]
placeholder_ids
=
image_features
[
"image_placeholders"
][
idx
]
new_ids
.
extend
(
placeholder_ids
)
new_ids
.
extend
(
placeholder_ids
)
new_attn
.
extend
([
1
]
*
len
(
placeholder_ids
))
idx
+=
1
idx
+=
1
else
:
else
:
new_ids
.
append
(
token_id
)
new_ids
.
append
(
token_id
)
new_attn
.
append
(
attn_list
[
i
])
# Converts back to tensors
# Converts back to tensors
ids_tensor
=
torch
.
tensor
(
new_ids
,
dtype
=
torch
.
long
)
ids_tensor
=
torch
.
tensor
(
new_ids
,
dtype
=
torch
.
long
)
attn_mask
=
torch
.
tensor
(
new_attn
,
dtype
=
torch
.
long
)
else
:
else
:
raise
RuntimeError
(
raise
RuntimeError
(
'Mismatch between the images you provided and the number of placeholder present in the text'
)
'Mismatch between the images you provided and the number of placeholder present in the text'
)
replaced_ids_list
.
append
(
ids_tensor
)
replaced_ids_list
.
append
(
ids_tensor
)
replaced_attn_mask_list
.
append
(
attn_mask
)
if
replaced_ids_list
:
if
replaced_ids_list
:
replaced_and_tokenized_ids
=
torch
.
stack
(
replaced_ids_list
)
replaced_and_tokenized_ids
=
torch
.
stack
(
replaced_ids_list
)
replaced_and_tokenized_attn_mask
=
torch
.
stack
(
replaced_attn_mask_list
)
else
:
else
:
replaced_and_tokenized_ids
=
torch
.
tensor
([],
dtype
=
torch
.
long
)
replaced_and_tokenized_ids
=
torch
.
tensor
([],
dtype
=
torch
.
long
)
replaced_and_tokenized_attn_mask
=
torch
.
tensor
([],
dtype
=
torch
.
long
)
# Create the output with text features
# Create the output with text features
output
=
BatchFeature
(
output
=
BatchFeature
(
data
=
{
data
=
{
"input_ids"
:
replaced_and_tokenized_ids
,
"input_ids"
:
replaced_and_tokenized_ids
,
"attention_mask"
:
replaced_and_tokenized_attn_mask
,
}
}
)
)
...
@@ -219,10 +206,22 @@ class OvisProcessor(ProcessorMixin):
...
@@ -219,10 +206,22 @@ class OvisProcessor(ProcessorMixin):
return
output
return
output
# If only images were provided
# If only images were provided
return
BatchFeature
(
data
=
image_features
)
return
BatchFeature
(
data
=
image_features
)
def
_tokenize_with_image_symbol
(
self
,
text_list
:
list
[
str
])
->
torch
.
LongTensor
:
batch_token_ids
=
[]
for
text
in
text_list
:
text_chunks
=
[
self
.
tokenizer
(
chunk
,
add_special_tokens
=
False
).
input_ids
for
chunk
in
text
.
split
(
self
.
image_token
)]
token_ids
=
[]
num_chuck
=
len
(
text_chunks
)
for
i
,
chunk
in
enumerate
(
text_chunks
):
token_ids
.
extend
(
chunk
)
if
i
<
num_chuck
-
1
:
token_ids
.
append
(
self
.
get_token_value
(
"image_token"
))
batch_token_ids
.
append
(
token_ids
)
return
torch
.
tensor
(
batch_token_ids
,
dtype
=
torch
.
long
)
def
get_image_size
(
self
):
def
get_image_size
(
self
):
height
=
self
.
image_processor
.
crop_size
[
"height"
]
height
=
self
.
image_processor
.
crop_size
[
"height"
]
...
@@ -230,10 +229,9 @@ class OvisProcessor(ProcessorMixin):
...
@@ -230,10 +229,9 @@ class OvisProcessor(ProcessorMixin):
return
height
,
width
return
height
,
width
def
get_token_value
(
self
,
tok
):
def
get_token_value
(
self
,
tok
):
return
self
.
tokenizer
.
get_vocab
()[
self
.
extra_special_tokens
[
tok
]]
return
self
.
extra_special_tokens
[
tok
]
def
construct_image_placeholders
(
self
,
grid
):
def
construct_image_indicators
(
self
,
grid
):
image_placeholders
=
[
self
.
get_token_value
(
'image_start'
),
image_placeholders
=
[
self
.
get_token_value
(
'image_start'
),
self
.
get_token_value
(
'image_atom'
),
self
.
get_token_value
(
'image_atom'
),
self
.
get_token_value
(
'image_prefix'
)]
self
.
get_token_value
(
'image_prefix'
)]
...
@@ -246,7 +244,11 @@ class OvisProcessor(ProcessorMixin):
...
@@ -246,7 +244,11 @@ class OvisProcessor(ProcessorMixin):
if
r
<
grid
[
0
]
-
1
:
if
r
<
grid
[
0
]
-
1
:
image_placeholders
.
append
(
self
.
get_token_value
(
'image_row_sep'
))
image_placeholders
.
append
(
self
.
get_token_value
(
'image_row_sep'
))
image_placeholders
.
append
(
self
.
get_token_value
(
'image_end'
))
image_placeholders
.
append
(
self
.
get_token_value
(
'image_end'
))
# return image_placeholders
return
image_placeholders
def
construct_image_placeholders
(
self
,
grid
):
image_placeholders
=
self
.
construct_image_indicators
(
grid
)
image_atom_token_id
=
self
.
get_token_value
(
'image_atom'
)
image_atom_token_id
=
self
.
get_token_value
(
'image_atom'
)
# Extract the padding token ID from tokenizer
# Extract the padding token ID from tokenizer
...
@@ -255,7 +257,7 @@ class OvisProcessor(ProcessorMixin):
...
@@ -255,7 +257,7 @@ class OvisProcessor(ProcessorMixin):
# Create a new list with padding tokens inserted
# Create a new list with padding tokens inserted
padded_placeholder_tokens
=
[]
padded_placeholder_tokens
=
[]
for
token
in
image_placeholders
:
for
token
in
image_placeholders
:
padded_placeholder_tokens
.
append
(
token
)
padded_placeholder_tokens
.
append
(
image_padding_
token
_id
)
if
token
==
image_atom_token_id
:
if
token
==
image_atom_token_id
:
# Add 255 padding tokens after each image atom token
# Add 255 padding tokens after each image atom token
padded_placeholder_tokens
.
extend
([
image_padding_token_id
]
*
255
)
padded_placeholder_tokens
.
extend
([
image_padding_token_id
]
*
255
)
...
@@ -394,4 +396,4 @@ class OvisProcessor(ProcessorMixin):
...
@@ -394,4 +396,4 @@ class OvisProcessor(ProcessorMixin):
return
names_from_processor
+
[
"second_per_grid_ts"
]
return
names_from_processor
+
[
"second_per_grid_ts"
]
AutoProcessor
.
register
(
"OvisProcessor"
,
OvisProcessor
)
AutoProcessor
.
register
(
"OvisProcessor"
,
OvisProcessor
)
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment