[Doc] Move examples and further reorganize user guide (#18666)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

[Doc] Move examples and further reorganize user guide (#18666)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
82e2339b · Cyrus Leung · GitHub · 9553fdb4 · 82e2339b · 82e2339b
Unverified commit 82e2339b, authored May 26, 2025 by Cyrus Leung; committed by GitHub on May 26, 2025
7 changed files
--- a/examples/other/logging_configuration.md
+++ b/examples/other/logging_configuration.md
--- a/examples/other/tensorize_vllm_model.py
+++ b/examples/other/tensorize_vllm_model.py
@@ -28,7 +28,7 @@ https://github.com/coreweave/tensorizer
 To serialize a model, install vLLM from source, then run something 
 like this from the root level of this repository:

-python examples/other/tensorize_vllm_model.py \
+python examples/others/tensorize_vllm_model.py \
   --model facebook/opt-125m \
   serialize \
   --serialized-directory s3://my-bucket \
@@ -48,7 +48,7 @@ providing a `--keyfile` argument.
 To deserialize a model, you can run something like this from the root 
 level of this repository:

-python examples/other/tensorize_vllm_model.py \
+python examples/others/tensorize_vllm_model.py \
   --model EleutherAI/gpt-j-6B \
   --dtype float16 \
   deserialize \
@@ -66,11 +66,11 @@ shard's rank. Sharded models serialized with this script will be named as
 model-rank-%03d.tensors

 For more information on the available arguments for serializing, run 
-`python -m examples.other.tensorize_vllm_model serialize --help`.
+`python -m examples.others.tensorize_vllm_model serialize --help`.

 Or for deserializing:

-`python examples/other/tensorize_vllm_model.py deserialize --help`.
+`python examples/others/tensorize_vllm_model.py deserialize --help`.

 Once a model is serialized, tensorizer can be invoked with the `LLM` class 
 directly to load models:
@@ -91,7 +91,7 @@ TensorizerConfig arguments desired.
 In order to see all of the available arguments usable to configure 
 loading with tensorizer that are given to `TensorizerConfig`, run:

-`python examples/other/tensorize_vllm_model.py deserialize --help`
+`python examples/others/tensorize_vllm_model.py deserialize --help`

 under the `tensorizer options` section. These can also be used for
 deserialization in this example script, although `--tensorizer-uri` and

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -62,11 +62,6 @@ ignore_patterns = [
 [tool.ruff]
 # Allow lines to be as long as 80.
 line-length = 80
-exclude = [
-    # External file, leaving license intact
-    "examples/other/fp8/quantizer/quantize.py",
-    "vllm/vllm_flash_attn/flash_attn_interface.pyi"
-]

 [tool.ruff.lint.per-file-ignores]
 "vllm/third_party/**" = ["ALL"]

--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -41,7 +41,7 @@ compressed-tensors == 0.9.4 # required for compressed-tensors
 depyf==0.18.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files
-python-json-logger # Used by logging as per examples/other/logging_configuration.md
+python-json-logger # Used by logging as per examples/others/logging_configuration.md
 scipy # Required for phi-4-multimodal-instruct
 ninja # Required for xgrammar, rocm, tpu, xpu
 opentelemetry-sdk>=1.26.0  # vllm.tracing

--- a/tests/lora/test_llama_tp.py
+++ b/tests/lora/test_llama_tp.py
@@ -207,7 +207,7 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
    try:
        result = subprocess.run([
            sys.executable,
-            f"{VLLM_PATH}/examples/other/tensorize_vllm_model.py", "--model",
+            f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py", "--model",
            MODEL_PATH, "--lora-path", lora_path, "--tensor-parallel-size",
            str(tp_size), "serialize", "--serialized-directory",
            str(tmp_path), "--suffix", suffix

--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -251,7 +251,7 @@ class TensorizerArgs:
      encryption_keyfile: File path to a binary file containing a  
          binary key to use for decryption. `None` (the default) means 
          no decryption. See the example script in 
-          examples/other/tensorize_vllm_model.py. 
+          examples/others/tensorize_vllm_model.py. 
      s3_access_key_id: The access key for the S3 bucket. Can also be set via
          the S3_ACCESS_KEY_ID environment variable.
      s3_secret_access_key: The secret access key for the S3 bucket. Can also
@@ -469,7 +469,7 @@ def tensorizer_weights_iterator(
                   "loading on vLLM, as tensorizer is forced to load to CPU. "
                   "Consider deserializing a vLLM model instead for faster "
                   "load times. See the "
-                   "examples/other/tensorize_vllm_model.py example script "
+                   "examples/others/tensorize_vllm_model.py example script "
                   "for serializing vLLM models.")

    deserializer_args = tensorizer_args.deserializer_params

--- a/vllm/model_executor/model_loader/tensorizer_loader.py
+++ b/vllm/model_executor/model_loader/tensorizer_loader.py
@@ -48,7 +48,7 @@ class TensorizerLoader(BaseModelLoader):
        """Load a serialized model with tensorizer to the CPU.

        This is only necessary when the model isn't vLLM-tensorized (see
-        examples/other/tensorize_vllm_model.py) This should still
+        examples/others/tensorize_vllm_model.py). This should still
        be faster than default HuggingFace loading, but will be slower than
        loading a vLLM-tensorized model.
        """
@@ -68,7 +68,7 @@ class TensorizerLoader(BaseModelLoader):
        """Load a serialized model with tensorizer.

        Expects a vLLM-tensorized model. See the
-        examples/other/tensorize_vllm_model.py example script
+        examples/others/tensorize_vllm_model.py example script
        for serializing vLLM models."""

        device_config = vllm_config.device_config