Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
66b809cc
Commit
66b809cc
authored
Feb 08, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.7.2' into v0.7.2-dev
parents
37b63c24
0408efc6
Changes
1000
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
154 additions
and
1 deletion
+154
-1
examples/offline_inference/embedding.py
examples/offline_inference/embedding.py
+2
-0
examples/offline_inference/encoder_decoder.py
examples/offline_inference/encoder_decoder.py
+1
-0
examples/offline_inference/florence2_inference.py
examples/offline_inference/florence2_inference.py
+1
-0
examples/offline_inference/gguf_inference.py
examples/offline_inference/gguf_inference.py
+2
-0
examples/offline_inference/llm_engine_example.py
examples/offline_inference/llm_engine_example.py
+2
-0
examples/offline_inference/lora_with_quantization_inference.py
...les/offline_inference/lora_with_quantization_inference.py
+1
-0
examples/offline_inference/mlpspeculator.py
examples/offline_inference/mlpspeculator.py
+3
-1
examples/offline_inference/multilora_inference.py
examples/offline_inference/multilora_inference.py
+1
-0
examples/offline_inference/neuron.py
examples/offline_inference/neuron.py
+2
-0
examples/offline_inference/neuron_int8_quantization.py
examples/offline_inference/neuron_int8_quantization.py
+2
-0
examples/offline_inference/pixtral.py
examples/offline_inference/pixtral.py
+2
-0
examples/offline_inference/prefix_caching.py
examples/offline_inference/prefix_caching.py
+2
-0
examples/offline_inference/profiling.py
examples/offline_inference/profiling.py
+2
-0
examples/offline_inference/profiling_tpu/profiling.py
examples/offline_inference/profiling_tpu/profiling.py
+2
-0
examples/offline_inference/ray_placement.py
examples/offline_inference/ray_placement.py
+121
-0
examples/offline_inference/rlhf.py
examples/offline_inference/rlhf.py
+1
-0
examples/offline_inference/save_sharded_state.py
examples/offline_inference/save_sharded_state.py
+1
-0
examples/offline_inference/scoring.py
examples/offline_inference/scoring.py
+2
-0
examples/offline_inference/simple_profiling.py
examples/offline_inference/simple_profiling.py
+2
-0
examples/offline_inference/structured_outputs.py
examples/offline_inference/structured_outputs.py
+2
-0
No files found.
Too many changes to show.
To preserve performance only
1000 of 1000+
files are displayed.
Plain diff
Email patch
examples/offline_inference/embedding.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
vllm
import
LLM
# Sample prompts.
...
...
examples/offline_inference/encoder_decoder.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
'''
Demonstrate prompting of text-to-text
encoder/decoder models, specifically BART
...
...
examples/offline_inference/florence2_inference.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
'''
Demonstrate prompting of text-to-text
encoder/decoder models, specifically Florence-2
...
...
examples/offline_inference/gguf_inference.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
huggingface_hub
import
hf_hub_download
from
vllm
import
LLM
,
SamplingParams
...
...
examples/offline_inference/llm_engine_example.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
argparse
from
typing
import
List
,
Tuple
...
...
examples/offline_inference/lora_with_quantization_inference.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to use LoRA with different quantization techniques
for offline inference.
...
...
examples/offline_inference/mlpspeculator.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
gc
import
time
from
typing
import
List
...
...
@@ -49,7 +51,7 @@ if __name__ == "__main__":
# Create an LLM with spec decoding
llm
=
LLM
(
model
=
"meta-llama/Llama-2-13b-chat-hf"
,
speculative_model
=
"ibm-
fms
/llama-13b-accelerator"
,
speculative_model
=
"ibm-
ai-platform
/llama-13b-accelerator"
,
)
print
(
"With speculation"
)
...
...
examples/offline_inference/multilora_inference.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to use the multi-LoRA functionality
for offline inference.
...
...
examples/offline_inference/neuron.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
vllm
import
LLM
,
SamplingParams
# Sample prompts.
...
...
examples/offline_inference/neuron_int8_quantization.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
os
from
vllm
import
LLM
,
SamplingParams
...
...
examples/offline_inference/pixtral.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
# ruff: noqa
import
argparse
...
...
examples/offline_inference/prefix_caching.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
vllm
import
LLM
,
SamplingParams
from
vllm.distributed
import
cleanup_dist_env_and_memory
...
...
examples/offline_inference/profiling.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
inspect
import
json
import
os
...
...
examples/offline_inference/profiling_tpu/profiling.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
argparse
import
dataclasses
import
os
...
...
examples/offline_inference/ray_placement.py
0 → 100644
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""
A simple demonstration of how to control
the placement of vLLM workers with Ray.
The key is to set VLLM_RAY_PER_WORKER_GPUS and
VLLM_RAY_BUNDLE_INDICES properly.
"""
import
os
import
ray
from
ray.util.placement_group
import
placement_group
from
ray.util.scheduling_strategies
import
PlacementGroupSchedulingStrategy
from
vllm
import
LLM
from
vllm.worker.worker
import
Worker
class MyWorker(Worker):
    """Worker subclass that can report the UUID of the device it uses."""

    def report_device_id(self) -> str:
        """Return the platform device UUID for this worker's device index."""
        from vllm.platforms import current_platform

        # Look the UUID up by the index of the device attached to this worker.
        device_index = self.device.index
        return current_platform.get_device_uuid(device_index)
class MyLLM(LLM):
    """LLM subclass that sets Ray placement environment variables first.

    Before delegating to ``LLM.__init__``, the constructor removes
    ``CUDA_VISIBLE_DEVICES`` from the process environment and sets
    ``VLLM_RAY_PER_WORKER_GPUS`` and ``VLLM_RAY_BUNDLE_INDICES``.

    Args:
        *args: Positional arguments forwarded unchanged to ``LLM.__init__``.
        bundle_indices: Placement-group bundle indices; written to
            ``VLLM_RAY_BUNDLE_INDICES`` as a comma-separated string.
        **kwargs: Keyword arguments forwarded unchanged to ``LLM.__init__``.
    """

    def __init__(self, *args, bundle_indices: list, **kwargs):
        # a hack to make the script work.
        # stop ray from manipulating CUDA_VISIBLE_DEVICES
        # at the top-level
        # Use pop() with a default rather than `del`, so that a process in
        # which the variable is not set does not fail with KeyError.
        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
        # every worker will use 0.4 GPU, so that we can schedule
        # 2 instances on the same GPUs.
        os.environ["VLLM_RAY_PER_WORKER_GPUS"] = "0.4"
        os.environ["VLLM_RAY_BUNDLE_INDICES"] = ",".join(
            map(str, bundle_indices))
        print(f"creating LLM with bundle_indices={bundle_indices}")
        super().__init__(*args, **kwargs)
class RayTrainingActor:
    """Minimal actor whose only job is to report the device it sees."""

    def report_device_id(self) -> str:
        """Return the platform device UUID of visible device index 0."""
        from vllm.platforms import current_platform

        # get_device_uuid takes an index into the *visible* devices.
        # ray will set CUDA_VISIBLE_DEVICES to the assigned GPUs, so the
        # first visible device is the one to report.
        first_visible_index = 0
        return current_platform.get_device_uuid(first_visible_index)
# ray manages 4 GPUs
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
ray.init()

# we want to co-locate vLLM instance and the training actor
# on the same set of GPUs.
# the placement plan is as follows:
# GPU 0 and 1: training actor 0, 1, and vLLM instance 0 (with TP=2)
# GPU 2 and 3: training actor 2, 3, and vLLM instance 1 (with TP=2)
pg = placement_group([{"GPU": 1, "CPU": 0}] * 4)
ray.get(pg.ready())
print(f"placement group has bundles {pg.bundle_specs=}")

training_actors = []
training_actor_device_ids = []
inference_engines = []
inference_engine_device_ids = []

# One training actor per bundle, each pinned to its own bundle index and
# requesting 0.4 of a GPU.
for bundle_index in range(4):
    actor_strategy = PlacementGroupSchedulingStrategy(
        placement_group=pg,
        placement_group_capture_child_tasks=True,
        placement_group_bundle_index=bundle_index,
    )
    remote_actor_cls = ray.remote(
        num_cpus=0,
        num_gpus=0.4,
        scheduling_strategy=actor_strategy,
    )(RayTrainingActor)
    training_actor = remote_actor_cls.remote()
    training_actors.append(training_actor)

    # Ask the freshly created actor which device it was given.
    device_id = ray.get(training_actor.report_device_id.remote())
    print(f"training actor {bundle_index} is on {device_id}")
    training_actor_device_ids.append(device_id)

# One vLLM engine per pair of bundles.
for bundle_indices in ([0, 1], [2, 3]):
    # IMPORTANT: when creating vLLM instances, we need to
    # make sure there are no GPU activities on the target GPUs,
    # otherwise, they will interfere with the vLLM memory profiling,
    # and cause unexpected behaviors.
    engine_strategy = PlacementGroupSchedulingStrategy(
        placement_group=pg,
        placement_group_capture_child_tasks=True,
    )
    remote_llm_cls = ray.remote(
        num_cpus=0,
        num_gpus=0,
        scheduling_strategy=engine_strategy,
    )(MyLLM)
    engine_kwargs = {
        "model": "facebook/opt-125m",
        "enforce_eager": True,
        "worker_cls": MyWorker,
        "tensor_parallel_size": 2,
        "distributed_executor_backend": "ray",
        "gpu_memory_utilization": 0.4,
        "bundle_indices": bundle_indices,
    }
    llm = remote_llm_cls.remote(**engine_kwargs)
    inference_engines.append(llm)
    # don't call any method on the inference engine here,
    # otherwise it will block until the vLLM instance is created.

# Every engine handle now exists; collect the device ids its workers report.
for i, llm in enumerate(inference_engines):
    reported_ids = ray.get(
        llm.collective_rpc.remote("report_device_id", args=()))
    inference_engine_device_ids.append(reported_ids)
    print(f"inference engine {i} is on {reported_ids}")

# check the placement
# the first two training actors should be
# on the same GPUs as the first inference engine
assert training_actor_device_ids[:2] == inference_engine_device_ids[0]
# the last two training actors should be
# on the same GPUs as the second inference engine
assert training_actor_device_ids[2:] == inference_engine_device_ids[1]
examples/offline_inference/rlhf.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""
a simple demonstration of RLHF with vLLM, inspired by
the OpenRLHF framework https://github.com/OpenRLHF/OpenRLHF .
...
...
examples/offline_inference/save_sharded_state.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""
Saves each worker's model state dict directly to a checkpoint, which enables a
fast load path for large tensor-parallel models where each worker only needs to
...
...
examples/offline_inference/scoring.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
vllm
import
LLM
# Sample prompts.
...
...
examples/offline_inference/simple_profiling.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
os
import
time
...
...
examples/offline_inference/structured_outputs.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
enum
import
Enum
from
pydantic
import
BaseModel
...
...
Prev
1
2
3
4
5
6
7
8
9
…
50
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment