Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
00c3d68e
Unverified
Commit
00c3d68e
authored
Aug 13, 2024
by
Peter Salas
Committed by
GitHub
Aug 13, 2024
Browse files
[Frontend][Core] Add plumbing to support audio language models (#7446)
parent
e20233d3
Changes
24
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
64 additions
and
9 deletions
+64
-9
vllm/multimodal/registry.py
vllm/multimodal/registry.py
+2
-1
vllm/multimodal/utils.py
vllm/multimodal/utils.py
+56
-2
vllm/worker/model_runner.py
vllm/worker/model_runner.py
+4
-4
vllm/worker/xpu_model_runner.py
vllm/worker/xpu_model_runner.py
+2
-2
No files found.
vllm/multimodal/registry.py
View file @
00c3d68e
...
@@ -6,6 +6,7 @@ import torch
...
@@ -6,6 +6,7 @@ import torch
from
vllm.config
import
ModelConfig
from
vllm.config
import
ModelConfig
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
.audio
import
AudioPlugin
from
.base
import
(
MultiModalDataDict
,
MultiModalInputMapper
,
MultiModalInputs
,
from
.base
import
(
MultiModalDataDict
,
MultiModalInputMapper
,
MultiModalInputs
,
MultiModalPlugin
,
MultiModalTokensCalc
)
MultiModalPlugin
,
MultiModalTokensCalc
)
from
.image
import
ImagePlugin
from
.image
import
ImagePlugin
...
@@ -19,7 +20,7 @@ class MultiModalRegistry:
...
@@ -19,7 +20,7 @@ class MultiModalRegistry:
:class:`~vllm.multimodal.MultiModalPlugin` for each modality.
:class:`~vllm.multimodal.MultiModalPlugin` for each modality.
"""
"""
DEFAULT_PLUGINS
=
(
ImagePlugin
(),
)
DEFAULT_PLUGINS
=
(
ImagePlugin
(),
AudioPlugin
()
)
def
__init__
(
def
__init__
(
self
,
self
,
...
...
vllm/multimodal/utils.py
View file @
00c3d68e
import
base64
import
base64
from
io
import
BytesIO
from
io
import
BytesIO
from
typing
import
Union
from
typing
import
Tuple
,
Union
import
librosa
import
numpy
as
np
import
soundfile
from
PIL
import
Image
from
PIL
import
Image
from
vllm.connections
import
global_http_connection
from
vllm.connections
import
global_http_connection
from
vllm.envs
import
VLLM_IMAGE_FETCH_TIMEOUT
from
vllm.envs
import
VLLM_AUDIO_FETCH_TIMEOUT
,
VLLM_IMAGE_FETCH_TIMEOUT
from
vllm.multimodal.base
import
MultiModalDataDict
from
vllm.multimodal.base
import
MultiModalDataDict
...
@@ -63,11 +66,62 @@ async def async_fetch_image(image_url: str,
...
@@ -63,11 +66,62 @@ async def async_fetch_image(image_url: str,
return
image
.
convert
(
image_mode
)
return
image
.
convert
(
image_mode
)
def fetch_audio(audio_url: str) -> Tuple[np.ndarray, Union[int, float]]:
    """
    Load audio from a URL.

    Args:
        audio_url: Either a URL starting with ``http`` (downloaded through
            ``global_http_connection`` using ``VLLM_AUDIO_FETCH_TIMEOUT``)
            or a ``data:audio`` URL whose text after the first comma is
            base64-encoded audio.

    Returns:
        The value returned by ``librosa.load`` for the fetched bytes,
        called with ``sr=None``.

    Raises:
        ValueError: If ``audio_url`` starts with neither supported prefix,
            or if a ``data:audio`` URL has no comma separating its header
            from the payload.
    """
    if audio_url.startswith("http"):
        audio_bytes = global_http_connection.get_bytes(
            audio_url, timeout=VLLM_AUDIO_FETCH_TIMEOUT)
    elif audio_url.startswith("data:audio"):
        # partition() never raises, so a missing comma can be reported with
        # a clear message instead of an opaque tuple-unpacking error.
        _, separator, audio_base64 = audio_url.partition(",")
        if not separator:
            raise ValueError(
                "Invalid 'audio_url': A 'data:audio' URL must contain a "
                "',' separating the header from the base64 payload.")
        audio_bytes = base64.b64decode(audio_base64)
    else:
        raise ValueError("Invalid 'audio_url': A valid 'audio_url' must start "
                         "with either 'data:audio' or 'http'.")

    return librosa.load(BytesIO(audio_bytes), sr=None)
async def async_fetch_audio(
        audio_url: str) -> Tuple[np.ndarray, Union[int, float]]:
    """
    Asynchronously fetch audio from a URL.

    Args:
        audio_url: Either a URL starting with ``http`` (awaited through
            ``global_http_connection.async_get_bytes`` using
            ``VLLM_AUDIO_FETCH_TIMEOUT``) or a ``data:audio`` URL whose text
            after the first comma is base64-encoded audio.

    Returns:
        The value returned by ``librosa.load`` for the fetched bytes,
        called with ``sr=None``.

    Raises:
        ValueError: If ``audio_url`` starts with neither supported prefix,
            or if a ``data:audio`` URL has no comma separating its header
            from the payload.
    """
    if audio_url.startswith("http"):
        audio_bytes = await global_http_connection.async_get_bytes(
            audio_url, timeout=VLLM_AUDIO_FETCH_TIMEOUT)
    elif audio_url.startswith("data:audio"):
        # partition() never raises, so a missing comma can be reported with
        # a clear message instead of an opaque tuple-unpacking error.
        _, separator, audio_base64 = audio_url.partition(",")
        if not separator:
            raise ValueError(
                "Invalid 'audio_url': A 'data:audio' URL must contain a "
                "',' separating the header from the base64 payload.")
        audio_bytes = base64.b64decode(audio_base64)
    else:
        raise ValueError("Invalid 'audio_url': A valid 'audio_url' must start "
                         "with either 'data:audio' or 'http'.")

    return librosa.load(BytesIO(audio_bytes), sr=None)
async def async_get_and_parse_audio(audio_url: str) -> MultiModalDataDict:
    """
    Fetch audio from ``audio_url`` and package it as multi-modal data.

    Returns a dict with a single ``"audio"`` key holding the two values
    yielded by ``async_fetch_audio`` as a tuple.
    """
    fetched = await async_fetch_audio(audio_url)
    # Unpack to exactly two items, then re-pack as a plain tuple.
    waveform, sample_rate = fetched
    return {"audio": (waveform, sample_rate)}
async def async_get_and_parse_image(image_url: str) -> MultiModalDataDict:
    """
    Fetch the image at ``image_url`` and package it as multi-modal data.

    Returns a dict with a single ``"image"`` key holding the result of
    ``async_fetch_image``.
    """
    fetched_image = await async_fetch_image(image_url)
    return {"image": fetched_image}
def encode_audio_base64(
    audio: np.ndarray,
    sampling_rate: int,
) -> str:
    """Encode audio as a base64 string of in-memory WAV data."""
    wav_buffer = BytesIO()
    # soundfile writes the WAV container straight into the in-memory buffer.
    soundfile.write(wav_buffer, audio, sampling_rate, format="WAV")

    wav_bytes = wav_buffer.getvalue()
    encoded = base64.b64encode(wav_bytes)
    return encoded.decode('utf-8')
def
encode_image_base64
(
def
encode_image_base64
(
image
:
Image
.
Image
,
image
:
Image
.
Image
,
*
,
*
,
...
...
vllm/worker/model_runner.py
View file @
00c3d68e
...
@@ -40,7 +40,7 @@ from vllm.model_executor import SamplingMetadata, SamplingMetadataCache
...
@@ -40,7 +40,7 @@ from vllm.model_executor import SamplingMetadata, SamplingMetadataCache
from
vllm.model_executor.model_loader
import
get_model
from
vllm.model_executor.model_loader
import
get_model
from
vllm.model_executor.model_loader.tensorizer
import
TensorizerConfig
from
vllm.model_executor.model_loader.tensorizer
import
TensorizerConfig
from
vllm.model_executor.models.interfaces
import
(
supports_lora
,
from
vllm.model_executor.models.interfaces
import
(
supports_lora
,
supports_
vision
)
supports_
multimodal
)
from
vllm.model_executor.models.utils
import
set_cpu_offload_max_bytes
from
vllm.model_executor.models.utils
import
set_cpu_offload_max_bytes
from
vllm.multimodal
import
(
MULTIMODAL_REGISTRY
,
BatchedTensorInputs
,
from
vllm.multimodal
import
(
MULTIMODAL_REGISTRY
,
BatchedTensorInputs
,
MultiModalInputs
)
MultiModalInputs
)
...
@@ -900,9 +900,9 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
...
@@ -900,9 +900,9 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
if
self
.
lora_config
:
if
self
.
lora_config
:
assert
supports_lora
(
self
.
model
),
"Model does not support LoRA"
assert
supports_lora
(
self
.
model
),
"Model does not support LoRA"
assert
not
supports_
vision
(
assert
not
supports_
multimodal
(
self
.
model
self
.
model
),
"To be tested:
vision
language model with LoRA settings."
),
"To be tested:
multimodal
language model with LoRA settings."
self
.
lora_manager
=
LRUCacheWorkerLoRAManager
(
self
.
lora_manager
=
LRUCacheWorkerLoRAManager
(
self
.
scheduler_config
.
max_num_seqs
,
self
.
scheduler_config
.
max_num_seqs
,
...
@@ -1054,7 +1054,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
...
@@ -1054,7 +1054,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
# of images processed.
# of images processed.
model_config
=
self
.
model_config
model_config
=
self
.
model_config
if
supports_
vision
(
self
.
model
):
if
supports_
multimodal
(
self
.
model
):
max_mm_tokens
=
MULTIMODAL_REGISTRY
\
max_mm_tokens
=
MULTIMODAL_REGISTRY
\
.
get_max_multimodal_tokens
(
model_config
)
.
get_max_multimodal_tokens
(
model_config
)
max_num_seqs_orig
=
max_num_seqs
max_num_seqs_orig
=
max_num_seqs
...
...
vllm/worker/xpu_model_runner.py
View file @
00c3d68e
...
@@ -12,7 +12,7 @@ from vllm.distributed import broadcast_tensor_dict
...
@@ -12,7 +12,7 @@ from vllm.distributed import broadcast_tensor_dict
from
vllm.inputs
import
INPUT_REGISTRY
from
vllm.inputs
import
INPUT_REGISTRY
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.model_loader
import
get_model
from
vllm.model_executor.model_loader
import
get_model
from
vllm.model_executor.models.interfaces
import
supports_
vision
from
vllm.model_executor.models.interfaces
import
supports_
multimodal
from
vllm.multimodal
import
(
MULTIMODAL_REGISTRY
,
BatchedTensorInputs
,
from
vllm.multimodal
import
(
MULTIMODAL_REGISTRY
,
BatchedTensorInputs
,
MultiModalInputs
)
MultiModalInputs
)
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
...
@@ -165,7 +165,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
...
@@ -165,7 +165,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
# of images processed.
# of images processed.
model_config
=
self
.
model_config
model_config
=
self
.
model_config
if
supports_
vision
(
self
.
model
):
if
supports_
multimodal
(
self
.
model
):
max_mm_tokens
=
MULTIMODAL_REGISTRY
\
max_mm_tokens
=
MULTIMODAL_REGISTRY
\
.
get_max_multimodal_tokens
(
model_config
)
.
get_max_multimodal_tokens
(
model_config
)
max_num_seqs_orig
=
max_num_seqs
max_num_seqs_orig
=
max_num_seqs
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment