Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
66b809cc
Commit
66b809cc
authored
Feb 08, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.7.2' into v0.7.2-dev
parents
37b63c24
0408efc6
Changes
1000
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
154 additions
and
1 deletion
+154
-1
examples/offline_inference/embedding.py
examples/offline_inference/embedding.py
+2
-0
examples/offline_inference/encoder_decoder.py
examples/offline_inference/encoder_decoder.py
+1
-0
examples/offline_inference/florence2_inference.py
examples/offline_inference/florence2_inference.py
+1
-0
examples/offline_inference/gguf_inference.py
examples/offline_inference/gguf_inference.py
+2
-0
examples/offline_inference/llm_engine_example.py
examples/offline_inference/llm_engine_example.py
+2
-0
examples/offline_inference/lora_with_quantization_inference.py
...les/offline_inference/lora_with_quantization_inference.py
+1
-0
examples/offline_inference/mlpspeculator.py
examples/offline_inference/mlpspeculator.py
+3
-1
examples/offline_inference/multilora_inference.py
examples/offline_inference/multilora_inference.py
+1
-0
examples/offline_inference/neuron.py
examples/offline_inference/neuron.py
+2
-0
examples/offline_inference/neuron_int8_quantization.py
examples/offline_inference/neuron_int8_quantization.py
+2
-0
examples/offline_inference/pixtral.py
examples/offline_inference/pixtral.py
+2
-0
examples/offline_inference/prefix_caching.py
examples/offline_inference/prefix_caching.py
+2
-0
examples/offline_inference/profiling.py
examples/offline_inference/profiling.py
+2
-0
examples/offline_inference/profiling_tpu/profiling.py
examples/offline_inference/profiling_tpu/profiling.py
+2
-0
examples/offline_inference/ray_placement.py
examples/offline_inference/ray_placement.py
+121
-0
examples/offline_inference/rlhf.py
examples/offline_inference/rlhf.py
+1
-0
examples/offline_inference/save_sharded_state.py
examples/offline_inference/save_sharded_state.py
+1
-0
examples/offline_inference/scoring.py
examples/offline_inference/scoring.py
+2
-0
examples/offline_inference/simple_profiling.py
examples/offline_inference/simple_profiling.py
+2
-0
examples/offline_inference/structured_outputs.py
examples/offline_inference/structured_outputs.py
+2
-0
No files found.
Too many changes to show.
To preserve performance only
1000 of 1000+
files are displayed.
Plain diff
Email patch
examples/offline_inference/embedding.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
vllm
import
LLM
from
vllm
import
LLM
# Sample prompts.
# Sample prompts.
...
...
examples/offline_inference/encoder_decoder.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
'''
'''
Demonstrate prompting of text-to-text
Demonstrate prompting of text-to-text
encoder/decoder models, specifically BART
encoder/decoder models, specifically BART
...
...
examples/offline_inference/florence2_inference.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
'''
'''
Demonstrate prompting of text-to-text
Demonstrate prompting of text-to-text
encoder/decoder models, specifically Florence-2
encoder/decoder models, specifically Florence-2
...
...
examples/offline_inference/gguf_inference.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
huggingface_hub
import
hf_hub_download
from
huggingface_hub
import
hf_hub_download
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
...
...
examples/offline_inference/llm_engine_example.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
argparse
import
argparse
from
typing
import
List
,
Tuple
from
typing
import
List
,
Tuple
...
...
examples/offline_inference/lora_with_quantization_inference.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""
"""
This example shows how to use LoRA with different quantization techniques
This example shows how to use LoRA with different quantization techniques
for offline inference.
for offline inference.
...
...
examples/offline_inference/mlpspeculator.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
gc
import
gc
import
time
import
time
from
typing
import
List
from
typing
import
List
...
@@ -49,7 +51,7 @@ if __name__ == "__main__":
...
@@ -49,7 +51,7 @@ if __name__ == "__main__":
# Create an LLM with spec decoding
# Create an LLM with spec decoding
llm
=
LLM
(
llm
=
LLM
(
model
=
"meta-llama/Llama-2-13b-chat-hf"
,
model
=
"meta-llama/Llama-2-13b-chat-hf"
,
speculative_model
=
"ibm-
fms
/llama-13b-accelerator"
,
speculative_model
=
"ibm-
ai-platform
/llama-13b-accelerator"
,
)
)
print
(
"With speculation"
)
print
(
"With speculation"
)
...
...
examples/offline_inference/multilora_inference.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""
"""
This example shows how to use the multi-LoRA functionality
This example shows how to use the multi-LoRA functionality
for offline inference.
for offline inference.
...
...
examples/offline_inference/neuron.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
# Sample prompts.
# Sample prompts.
...
...
examples/offline_inference/neuron_int8_quantization.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
os
import
os
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
...
...
examples/offline_inference/pixtral.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
# ruff: noqa
# ruff: noqa
import
argparse
import
argparse
...
...
examples/offline_inference/prefix_caching.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.distributed
import
cleanup_dist_env_and_memory
...
...
examples/offline_inference/profiling.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
inspect
import
inspect
import
json
import
json
import
os
import
os
...
...
examples/offline_inference/profiling_tpu/profiling.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
argparse
import
argparse
import
dataclasses
import
dataclasses
import
os
import
os
...
...
examples/offline_inference/ray_placement.py
0 → 100644
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""
a simple demonstration to show how to control
the placement of the vLLM workers with Ray.
The key is to set VLLM_RAY_PER_WORKER_GPUS and
VLLM_RAY_BUNDLE_INDICES properly.
"""
import
os
import
ray
from
ray.util.placement_group
import
placement_group
from
ray.util.scheduling_strategies
import
PlacementGroupSchedulingStrategy
from
vllm
import
LLM
from
vllm.worker.worker
import
Worker
class MyWorker(Worker):
    """Worker subclass that exposes the identity of its device.

    The extra method is invoked remotely (by name) so the driver can
    compare where vLLM workers and training actors were placed.
    """

    def report_device_id(self) -> str:
        """Return the platform's device UUID for ``self.device.index``."""
        # Imported inside the method, exactly as the original example does.
        from vllm.platforms import current_platform

        device_index = self.device.index
        return current_platform.get_device_uuid(device_index)
class MyLLM(LLM):
    """``LLM`` subclass that sets Ray placement variables before start-up.

    The constructor writes ``VLLM_RAY_PER_WORKER_GPUS`` and
    ``VLLM_RAY_BUNDLE_INDICES`` into the process environment and then
    forwards every other argument to ``LLM.__init__`` unchanged.
    """

    def __init__(self, *args, bundle_indices: list, **kwargs):
        """Configure the environment, then build the underlying ``LLM``.

        Args:
            *args: Positional arguments forwarded to ``LLM.__init__``.
            bundle_indices: Placement-group bundle indices for this
                instance; joined with commas into
                ``VLLM_RAY_BUNDLE_INDICES``.
            **kwargs: Keyword arguments forwarded to ``LLM.__init__``.
        """
        # A hack to make the script work: stop ray from manipulating
        # CUDA_VISIBLE_DEVICES at the top level.
        # ``pop`` with a default is used instead of ``del`` so that a
        # process in which the variable is not set does not fail with
        # KeyError before the engine is even created.
        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
        # Every worker will use 0.4 GPU, so that we can schedule
        # 2 instances on the same GPUs.
        os.environ["VLLM_RAY_PER_WORKER_GPUS"] = "0.4"
        os.environ["VLLM_RAY_BUNDLE_INDICES"] = ",".join(
            map(str, bundle_indices))
        print(f"creating LLM with bundle_indices={bundle_indices}")
        super().__init__(*args, **kwargs)
class RayTrainingActor:
    """Minimal actor standing in for a training process in this demo."""

    def report_device_id(self) -> str:
        """Return the platform's device UUID for visible device index 0."""
        from vllm.platforms import current_platform

        # The argument for get_device_uuid is the index of the GPU among
        # the visible devices. Ray will set CUDA_VISIBLE_DEVICES to the
        # assigned GPUs, so index 0 is the one to query here.
        visible_index = 0
        return current_platform.get_device_uuid(visible_index)
# Ray manages 4 GPUs.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
ray.init()

# We want to co-locate each vLLM instance with training actors on the
# same set of GPUs. The placement plan is as follows:
#   GPU 0 and 1: training actor 0, 1, and vLLM instance 0 (with TP=2)
#   GPU 2 and 3: training actor 2, 3, and vLLM instance 1 (with TP=2)
bundle = {"GPU": 1, "CPU": 0}
pg = placement_group([bundle] * 4)
ray.get(pg.ready())
print(f"placement group has bundles {pg.bundle_specs=}")

training_actors = []
training_actor_device_ids = []
inference_engines = []
inference_engine_device_ids = []

# One training actor per bundle; each one requests 0.4 of a GPU.
for bundle_index in range(4):
    actor_strategy = PlacementGroupSchedulingStrategy(
        placement_group=pg,
        placement_group_capture_child_tasks=True,
        placement_group_bundle_index=bundle_index,
    )
    remote_actor_cls = ray.remote(
        num_cpus=0,
        num_gpus=0.4,
        scheduling_strategy=actor_strategy,
    )(RayTrainingActor)
    training_actor = remote_actor_cls.remote()
    training_actors.append(training_actor)

    # Blocks until the actor is up and has reported its device.
    device_id = ray.get(training_actor.report_device_id.remote())
    print(f"training actor {bundle_index} is on {device_id}")
    training_actor_device_ids.append(device_id)

# One vLLM instance per pair of bundles.
for bundle_indices in [[0, 1], [2, 3]]:
    # IMPORTANT: when creating vLLM instances, we need to
    # make sure there are no GPU activities on the target GPUs,
    # otherwise, they will interfere with the vLLM memory profiling,
    # and cause unexpected behaviors.
    engine_strategy = PlacementGroupSchedulingStrategy(
        placement_group=pg,
        placement_group_capture_child_tasks=True,
    )
    remote_llm_cls = ray.remote(
        num_cpus=0,
        num_gpus=0,
        scheduling_strategy=engine_strategy,
    )(MyLLM)
    llm = remote_llm_cls.remote(
        model="facebook/opt-125m",
        enforce_eager=True,
        worker_cls=MyWorker,
        tensor_parallel_size=2,
        distributed_executor_backend="ray",
        gpu_memory_utilization=0.4,
        bundle_indices=bundle_indices,
    )
    inference_engines.append(llm)
    # Don't call any method on the inference engine here,
    # otherwise it will block until the vLLM instance is created.

# Now that every engine has been requested, collect the device ids
# reported by each engine's workers.
for i, llm in enumerate(inference_engines):
    reported = ray.get(
        llm.collective_rpc.remote("report_device_id", args=tuple()))
    inference_engine_device_ids.append(reported)
    print(f"inference engine {i} is on {inference_engine_device_ids[-1]}")

# Check the placement.
# The first two training actors should be
# on the same GPUs as the first inference engine.
assert training_actor_device_ids[:2] == inference_engine_device_ids[0]
# The last two training actors should be
# on the same GPUs as the second inference engine.
assert training_actor_device_ids[2:] == inference_engine_device_ids[1]
examples/offline_inference/rlhf.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""
"""
a simple demonstration of RLHF with vLLM, inspired by
a simple demonstration of RLHF with vLLM, inspired by
the OpenRLHF framework https://github.com/OpenRLHF/OpenRLHF .
the OpenRLHF framework https://github.com/OpenRLHF/OpenRLHF .
...
...
examples/offline_inference/save_sharded_state.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""
"""
Saves each worker's model state dict directly to a checkpoint, which enables a
Saves each worker's model state dict directly to a checkpoint, which enables a
fast load path for large tensor-parallel models where each worker only needs to
fast load path for large tensor-parallel models where each worker only needs to
...
...
examples/offline_inference/scoring.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
vllm
import
LLM
from
vllm
import
LLM
# Sample prompts.
# Sample prompts.
...
...
examples/offline_inference/simple_profiling.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
os
import
os
import
time
import
time
...
...
examples/offline_inference/structured_outputs.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
enum
import
Enum
from
enum
import
Enum
from
pydantic
import
BaseModel
from
pydantic
import
BaseModel
...
...
Prev
1
2
3
4
5
6
7
8
9
…
50
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment