Unverified commit 82e2339b, authored by Cyrus Leung, committed by GitHub
Browse files

[Doc] Move examples and further reorganize user guide (#18666)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 9553fdb4
......@@ -28,7 +28,7 @@ https://github.com/coreweave/tensorizer
To serialize a model, install vLLM from source, then run something
like this from the root level of this repository:
python examples/other/tensorize_vllm_model.py \
python examples/others/tensorize_vllm_model.py \
--model facebook/opt-125m \
serialize \
--serialized-directory s3://my-bucket \
......@@ -48,7 +48,7 @@ providing a `--keyfile` argument.
To deserialize a model, you can run something like this from the root
level of this repository:
python examples/other/tensorize_vllm_model.py \
python examples/others/tensorize_vllm_model.py \
--model EleutherAI/gpt-j-6B \
--dtype float16 \
deserialize \
......@@ -66,11 +66,11 @@ shard's rank. Sharded models serialized with this script will be named as
model-rank-%03d.tensors
For more information on the available arguments for serializing, run
`python -m examples.other.tensorize_vllm_model serialize --help`.
`python -m examples.others.tensorize_vllm_model serialize --help`.
Or for deserializing:
`python examples/other/tensorize_vllm_model.py deserialize --help`.
`python examples/others/tensorize_vllm_model.py deserialize --help`.
Once a model is serialized, tensorizer can be invoked with the `LLM` class
directly to load models:
......@@ -91,7 +91,7 @@ TensorizerConfig arguments desired.
In order to see all of the available arguments usable to configure
loading with tensorizer that are given to `TensorizerConfig`, run:
`python examples/other/tensorize_vllm_model.py deserialize --help`
`python examples/others/tensorize_vllm_model.py deserialize --help`
under the `tensorizer options` section. These can also be used for
deserialization in this example script, although `--tensorizer-uri` and
......
......@@ -62,11 +62,6 @@ ignore_patterns = [
[tool.ruff]
# Allow lines to be as long as 80.
line-length = 80
exclude = [
# External file, leaving license intact
"examples/other/fp8/quantizer/quantize.py",
"vllm/vllm_flash_attn/flash_attn_interface.pyi"
]
[tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"]
......
......@@ -41,7 +41,7 @@ compressed-tensors == 0.9.4 # required for compressed-tensors
depyf==0.18.0 # required for profiling and debugging with compilation config
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
watchfiles # required for http server to monitor the updates of TLS files
python-json-logger # Used by logging as per examples/other/logging_configuration.md
python-json-logger # Used by logging as per examples/others/logging_configuration.md
scipy # Required for phi-4-multimodal-instruct
ninja # Required for xgrammar, rocm, tpu, xpu
opentelemetry-sdk>=1.26.0 # vllm.tracing
......
......@@ -207,7 +207,7 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
try:
result = subprocess.run([
sys.executable,
f"{VLLM_PATH}/examples/other/tensorize_vllm_model.py", "--model",
f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py", "--model",
MODEL_PATH, "--lora-path", lora_path, "--tensor-parallel-size",
str(tp_size), "serialize", "--serialized-directory",
str(tmp_path), "--suffix", suffix
......
......@@ -251,7 +251,7 @@ class TensorizerArgs:
encryption_keyfile: File path to a binary file containing a
binary key to use for decryption. `None` (the default) means
no decryption. See the example script in
examples/other/tensorize_vllm_model.py.
examples/others/tensorize_vllm_model.py.
s3_access_key_id: The access key for the S3 bucket. Can also be set via
the S3_ACCESS_KEY_ID environment variable.
s3_secret_access_key: The secret access key for the S3 bucket. Can also
......@@ -469,7 +469,7 @@ def tensorizer_weights_iterator(
"loading on vLLM, as tensorizer is forced to load to CPU. "
"Consider deserializing a vLLM model instead for faster "
"load times. See the "
"examples/other/tensorize_vllm_model.py example script "
"examples/others/tensorize_vllm_model.py example script "
"for serializing vLLM models.")
deserializer_args = tensorizer_args.deserializer_params
......
......@@ -48,7 +48,7 @@ class TensorizerLoader(BaseModelLoader):
"""Load a serialized model with tensorizer to the CPU.
This is only necessary when the model isn't vLLM-tensorized (see
examples/other/tensorize_vllm_model.py) This should still
examples/others/tensorize_vllm_model.py) This should still
be faster than default HuggingFace loading, but will be slower than
loading a vLLM-tensorized model.
"""
......@@ -68,7 +68,7 @@ class TensorizerLoader(BaseModelLoader):
"""Load a serialized model with tensorizer.
Expects a vLLM-tensorized model. See the
examples/other/tensorize_vllm_model.py example script
examples/others/tensorize_vllm_model.py example script
for serializing vLLM models."""
device_config = vllm_config.device_config
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment