[Doc] Move examples into categories (#11840)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

[Doc] Move examples into categories (#11840)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
aba8d6ee · Harry Mellor · GitHub · 2a0596bc · aba8d6ee · aba8d6ee
Unverified Commit aba8d6ee authored Jan 08, 2025 by Harry Mellor Committed by GitHub Jan 08, 2025
20 changed files
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -30,7 +30,7 @@ function cpu_tests() {
  # offline inference
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
    set -e
-    python3 examples/offline_inference.py"
+    python3 examples/offline_inference/offline_inference.py"
  # Run basic model test
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "

--- a/.buildkite/run-gh200-test.sh
+++ b/.buildkite/run-gh200-test.sh
@@ -24,5 +24,5 @@ remove_docker_container
 # Run the image and test offline inference
 docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
-    python3 examples/offline_inference.py
+    python3 examples/offline_inference/offline_inference.py
 '
--- a/.buildkite/run-hpu-test.sh
+++ b/.buildkite/run-hpu-test.sh
@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
 remove_docker_container
 # Run the image and launch offline inference
-docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py
+docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/offline_inference.py
\ No newline at end of file
--- a/.buildkite/run-neuron-test.sh
+++ b/.buildkite/run-neuron-test.sh
@@ -51,4 +51,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
       -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
       --name "${container_name}" \
       ${image_name} \
-       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference_neuron.py"
+       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/offline_inference_neuron.py"
--- a/.buildkite/run-openvino-test.sh
+++ b/.buildkite/run-openvino-test.sh
@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
 remove_docker_container
 # Run the image and launch offline inference
-docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py
+docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/offline_inference.py
--- a/.buildkite/run-tpu-test.sh
+++ b/.buildkite/run-tpu-test.sh
@@ -14,4 +14,4 @@ remove_docker_container
 # For HF_TOKEN.
 source /etc/environment
 # Run a simple end-to-end example.
-docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
+docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference/offline_inference_tpu.py"
--- a/.buildkite/run-xpu-test.sh
+++ b/.buildkite/run-xpu-test.sh
@@ -14,6 +14,6 @@ remove_docker_container
 # Run the image and test offline inference/tensor parallel
 docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
-    python3 examples/offline_inference.py
+    python3 examples/offline_inference/offline_inference.py
-    python3 examples/offline_inference_cli.py -tp 2
+    python3 examples/offline_inference/offline_inference_cli.py -tp 2
 '
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -187,19 +187,19 @@ steps:
  - examples/
  commands:
    - pip install tensorizer # for tensorizer test
-    - python3 offline_inference.py
+    - python3 offline_inference/offline_inference.py
-    - python3 cpu_offload.py
+    - python3 offline_inference/cpu_offload.py
-    - python3 offline_inference_chat.py
+    - python3 offline_inference/offline_inference_chat.py
-    - python3 offline_inference_with_prefix.py
+    - python3 offline_inference/offline_inference_with_prefix.py
-    - python3 llm_engine_example.py
+    - python3 offline_inference/llm_engine_example.py
-    - python3 offline_inference_vision_language.py
+    - python3 offline_inference/offline_inference_vision_language.py
-    - python3 offline_inference_vision_language_multi_image.py
+    - python3 offline_inference/offline_inference_vision_language_multi_image.py
-    - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+    - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-    - python3 offline_inference_encoder_decoder.py
+    - python3 offline_inference/offline_inference_encoder_decoder.py
-    - python3 offline_inference_classification.py
+    - python3 offline_inference/offline_inference_classification.py
-    - python3 offline_inference_embedding.py
+    - python3 offline_inference/offline_inference_embedding.py
-    - python3 offline_inference_scoring.py
+    - python3 offline_inference/offline_inference_scoring.py
-    - python3 offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2
+    - python3 offline_inference/offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2
 - label: Prefix Caching Test # 9min
  mirror_hardwares: [amd]

--- a/.github/workflows/lint-and-deploy.yaml
+++ b/.github/workflows/lint-and-deploy.yaml
@@ -27,7 +27,7 @@ jobs:
          version: v3.10.1
      - name: Run chart-testing (lint)
-        run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/chart-helm --charts examples/chart-helm
+        run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/online_serving/chart-helm --charts examples/online_serving/chart-helm
      - name: Setup minio
        run: |
@@ -64,7 +64,7 @@ jobs:
        run: |
          export AWS_ACCESS_KEY_ID=minioadmin
          export AWS_SECRET_ACCESS_KEY=minioadmin
-          helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
+          helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
      - name: curl test
        run: |

--- a/Dockerfile
+++ b/Dockerfile
@@ -250,7 +250,7 @@ ENV VLLM_USAGE_SOURCE production-docker-image
 # define sagemaker first, so it is not default from `docker build`
 FROM vllm-openai-base AS vllm-sagemaker
-COPY examples/sagemaker-entrypoint.sh .
+COPY examples/online_serving/sagemaker-entrypoint.sh .
 RUN chmod +x sagemaker-entrypoint.sh
 ENTRYPOINT ["./sagemaker-entrypoint.sh"]

--- a/docs/source/contributing/profiling/profiling_index.md
+++ b/docs/source/contributing/profiling/profiling_index.md
@@ -26,7 +26,7 @@ Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the serve
 ### Offline Inference
-Refer to <gh-file:examples/offline_inference_with_profiler.py> for an example.
+Refer to <gh-file:examples/offline_inference/offline_inference_with_profiler.py> for an example.
 ### OpenAI Server

--- a/docs/source/deployment/frameworks/skypilot.md
+++ b/docs/source/deployment/frameworks/skypilot.md
@@ -61,7 +61,7 @@ run: |
  echo 'Starting gradio server...'
  git clone https://github.com/vllm-project/vllm.git || true
-  python vllm/examples/gradio_openai_chatbot_webserver.py \
+  python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
    -m $MODEL_NAME \
    --port 8811 \
    --model-url http://localhost:8081/v1 \
@@ -321,7 +321,7 @@ run: |
  echo 'Starting gradio server...'
  git clone https://github.com/vllm-project/vllm.git || true
-  python vllm/examples/gradio_openai_chatbot_webserver.py \
+  python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
    -m $MODEL_NAME \
    --port 8811 \
    --model-url http://$ENDPOINT/v1 \

--- a/docs/source/features/disagg_prefill.md
+++ b/docs/source/features/disagg_prefill.md
@@ -21,7 +21,7 @@ Disaggregated prefill DOES NOT improve throughput.
 ## Usage example
-Please refer to `examples/disaggregated_prefill.sh` for the example usage of disaggregated prefilling.
+Please refer to `examples/online_serving/disaggregated_prefill.sh` for the example usage of disaggregated prefilling.
 ## Benchmarks

--- a/docs/source/features/lora.md
+++ b/docs/source/features/lora.md
@@ -47,7 +47,7 @@ outputs = llm.generate(
 )
 ```
-Check out <gh-file:examples/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
+Check out <gh-file:examples/offline_inference/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
 ## Serving LoRA Adapters

--- a/docs/source/features/quantization/auto_awq.md
+++ b/docs/source/features/quantization/auto_awq.md
@@ -47,7 +47,7 @@ print(f'Model is quantized and saved at "{quant_path}"')
 To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command:
 ```console
-$ python examples/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq
+$ python examples/offline_inference/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq
 ```
 AWQ models are also supported directly through the LLM entrypoint:

--- a/docs/source/features/quantization/fp8_e4m3_kvcache.md
+++ b/docs/source/features/quantization/fp8_e4m3_kvcache.md
@@ -28,7 +28,7 @@ Here is an example of how to enable this feature:
 ```python
 # two float8_e4m3fn kv cache scaling factor files are provided under tests/fp8_kv, please refer to
-# https://github.com/vllm-project/vllm/blob/main/examples/fp8/README.md to generate kv_cache_scales.json of your own.
+# https://github.com/vllm-project/vllm/blob/main/examples/other/fp8/README.md to generate kv_cache_scales.json of your own.
 from vllm import LLM, SamplingParams
 sampling_params = SamplingParams(temperature=1.3, top_p=0.8)

--- a/docs/source/features/structured_outputs.md
+++ b/docs/source/features/structured_outputs.md
@@ -131,7 +131,7 @@ completion = client.chat.completions.create(
 print(completion.choices[0].message.content)
 ```
-Full example: <gh-file:examples/openai_chat_completion_structured_outputs.py>
+Full example: <gh-file:examples/online_serving/openai_chat_completion_structured_outputs.py>
 ## Experimental Automatic Parsing (OpenAI API)
@@ -257,4 +257,4 @@ outputs = llm.generate(
 print(outputs[0].outputs[0].text)
 ```
-Full example: <gh-file:examples/offline_inference_structured_outputs.py>
+Full example: <gh-file:examples/offline_inference/offline_inference_structured_outputs.py>
--- a/docs/source/generate_examples.py
+++ b/docs/source/generate_examples.py
@@ -12,6 +12,7 @@ EXAMPLE_DOC_DIR = ROOT_DIR / "docs/source/getting_started/examples"
 def fix_case(text: str) -> str:
    subs = {
        "api": "API",
+        "Cli": "CLI",
        "cpu": "CPU",
        "llm": "LLM",
        "tpu": "TPU",
@@ -58,7 +59,7 @@ class Index:
        content = f"# {self.title}\n\n{self.description}\n\n"
        content += "```{toctree}\n"
        content += f":caption: {self.caption}\n:maxdepth: {self.maxdepth}\n"
-        content += "\n".join(sorted(self.documents)) + "\n```\n"
+        content += "\n".join(self.documents) + "\n```\n"
        return content
@@ -131,11 +132,14 @@ class Example:
            ROOT_DIR)
        content = f"Source <gh-file:{self.path.relative_to(ROOT_DIR)}>.\n\n"
-        if self.main_file.suffix == ".py":
-            content += f"# {self.title}\n\n"
        include = "include" if self.main_file.suffix == ".md" else \
            "literalinclude"
-        content += f":::{{{include}}} {make_relative(self.main_file)}\n:::\n\n"
+        if include == "literalinclude":
+            content += f"# {self.title}\n\n"
+        content += f":::{{{include}}} {make_relative(self.main_file)}\n"
+        if include == "literalinclude":
+            content += f":language: {self.main_file.suffix[1:]}\n"
+        content += ":::\n\n"
        if not self.other_files:
            return content
@@ -163,14 +167,16 @@ def generate_examples():
        description=
        "A collection of examples demonstrating usage of vLLM.\nAll documented examples are autogenerated using <gh-file:docs/source/generate_examples.py> from examples found in <gh-file:examples>.",  # noqa: E501
        caption="Examples",
-        maxdepth=1)  # TODO change to 2 when examples start being categorised
+        maxdepth=2)
+    # Category indices stored in reverse order because they are inserted into
+    # examples_index.documents at index 0 in order
    category_indices = {
-        "offline_inference":
+        "other":
        Index(
-            path=EXAMPLE_DOC_DIR / "examples_offline_inference_index.md",
+            path=EXAMPLE_DOC_DIR / "examples_other_index.md",
-            title="Offline Inference",
+            title="Other",
            description=
-            "Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches.",  # noqa: E501
+            "Other examples that don't strongly fit into the online or offline serving categories.",  # noqa: E501
            caption="Examples",
        ),
        "online_serving":
@@ -181,31 +187,30 @@ def generate_examples():
            "Online serving examples demonstrate how to use vLLM in an online setting, where the model is queried for predictions in real-time.",  # noqa: E501
            caption="Examples",
        ),
-        "other":
+        "offline_inference":
        Index(
-            path=EXAMPLE_DOC_DIR / "examples_other_index.md",
+            path=EXAMPLE_DOC_DIR / "examples_offline_inference_index.md",
-            title="Other",
+            title="Offline Inference",
            description=
-            "Other examples that don't strongly fit into the online or offline serving categories.",  # noqa: E501
+            "Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches.",  # noqa: E501
            caption="Examples",
        ),
    }
    examples = []
+    glob_patterns = ["*.py", "*.md", "*.sh"]
    # Find categorised examples
    for category in category_indices:
        category_dir = EXAMPLE_DIR / category
-        py = category_dir.glob("*.py")
+        globs = [category_dir.glob(pattern) for pattern in glob_patterns]
-        md = category_dir.glob("*.md")
+        for path in itertools.chain(*globs):
-        for path in itertools.chain(py, md):
            examples.append(Example(path, category))
        # Find examples in subdirectories
        for path in category_dir.glob("*/*.md"):
            examples.append(Example(path.parent, category))
    # Find uncategorised examples
-    py = EXAMPLE_DIR.glob("*.py")
+    globs = [EXAMPLE_DIR.glob(pattern) for pattern in glob_patterns]
-    md = EXAMPLE_DIR.glob("*.md")
+    for path in itertools.chain(*globs):
-    for path in itertools.chain(py, md):
        examples.append(Example(path))
    # Find examples in subdirectories
    for path in EXAMPLE_DIR.glob("*/*.md"):
@@ -215,7 +220,7 @@ def generate_examples():
        examples.append(Example(path.parent))
    # Generate the example documentation
-    for example in examples:
+    for example in sorted(examples, key=lambda e: e.path.stem):
        doc_path = EXAMPLE_DOC_DIR / f"{example.path.stem}.md"
        with open(doc_path, "w+") as f:
            f.write(example.generate())

--- a/docs/source/getting_started/installation/cpu-x86.md
+++ b/docs/source/getting_started/installation/cpu-x86.md
@@ -95,7 +95,7 @@ $ VLLM_TARGET_DEVICE=cpu python setup.py install
 $ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
 $ find / -name *libtcmalloc* # find the dynamic link library path
 $ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
-$ python examples/offline_inference.py # run vLLM
+$ python examples/offline_inference/offline_inference.py # run vLLM
 ```
 - When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP:
@@ -132,7 +132,7 @@ CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE    MAXMHZ   MINMHZ      MHZ
 # On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15
 $ export VLLM_CPU_OMP_THREADS_BIND=0-7
-$ python examples/offline_inference.py
+$ python examples/offline_inference/offline_inference.py
 ```
 - If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access.

--- a/docs/source/getting_started/installation/xpu.md
+++ b/docs/source/getting_started/installation/xpu.md
@@ -71,4 +71,4 @@ $      --pipeline-parallel-size=2 \
 $      -tp=8
 ```
-By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the <gh-file:examples/run_cluster.sh> helper script.
+By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the <gh-file:examples/online_serving/run_cluster.sh> helper script.