Unverified commit aba8d6ee, authored by Harry Mellor and committed by GitHub
Browse files

[Doc] Move examples into categories (#11840)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 2a0596bc
...@@ -30,7 +30,7 @@ function cpu_tests() { ...@@ -30,7 +30,7 @@ function cpu_tests() {
# offline inference # offline inference
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c " docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
set -e set -e
python3 examples/offline_inference.py" python3 examples/offline_inference/offline_inference.py"
# Run basic model test # Run basic model test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
......
...@@ -24,5 +24,5 @@ remove_docker_container ...@@ -24,5 +24,5 @@ remove_docker_container
# Run the image and test offline inference # Run the image and test offline inference
docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/offline_inference.py python3 examples/offline_inference/offline_inference.py
' '
...@@ -13,4 +13,4 @@ trap remove_docker_container EXIT ...@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
remove_docker_container remove_docker_container
# Run the image and launch offline inference # Run the image and launch offline inference
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/offline_inference.py
\ No newline at end of file \ No newline at end of file
...@@ -51,4 +51,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \ ...@@ -51,4 +51,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \ -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
--name "${container_name}" \ --name "${container_name}" \
${image_name} \ ${image_name} \
/bin/bash -c "python3 /workspace/vllm/examples/offline_inference_neuron.py" /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/offline_inference_neuron.py"
...@@ -13,4 +13,4 @@ trap remove_docker_container EXIT ...@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
remove_docker_container remove_docker_container
# Run the image and launch offline inference # Run the image and launch offline inference
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/offline_inference.py
...@@ -14,4 +14,4 @@ remove_docker_container ...@@ -14,4 +14,4 @@ remove_docker_container
# For HF_TOKEN. # For HF_TOKEN.
source /etc/environment source /etc/environment
# Run a simple end-to-end example. # Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py" docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference/offline_inference_tpu.py"
...@@ -14,6 +14,6 @@ remove_docker_container ...@@ -14,6 +14,6 @@ remove_docker_container
# Run the image and test offline inference/tensor parallel # Run the image and test offline inference/tensor parallel
docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c ' docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
python3 examples/offline_inference.py python3 examples/offline_inference/offline_inference.py
python3 examples/offline_inference_cli.py -tp 2 python3 examples/offline_inference/offline_inference_cli.py -tp 2
' '
...@@ -187,19 +187,19 @@ steps: ...@@ -187,19 +187,19 @@ steps:
- examples/ - examples/
commands: commands:
- pip install tensorizer # for tensorizer test - pip install tensorizer # for tensorizer test
- python3 offline_inference.py - python3 offline_inference/offline_inference.py
- python3 cpu_offload.py - python3 offline_inference/cpu_offload.py
- python3 offline_inference_chat.py - python3 offline_inference/offline_inference_chat.py
- python3 offline_inference_with_prefix.py - python3 offline_inference/offline_inference_with_prefix.py
- python3 llm_engine_example.py - python3 offline_inference/llm_engine_example.py
- python3 offline_inference_vision_language.py - python3 offline_inference/offline_inference_vision_language.py
- python3 offline_inference_vision_language_multi_image.py - python3 offline_inference/offline_inference_vision_language_multi_image.py
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference_encoder_decoder.py - python3 offline_inference/offline_inference_encoder_decoder.py
- python3 offline_inference_classification.py - python3 offline_inference/offline_inference_classification.py
- python3 offline_inference_embedding.py - python3 offline_inference/offline_inference_embedding.py
- python3 offline_inference_scoring.py - python3 offline_inference/offline_inference_scoring.py
- python3 offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2 - python3 offline_inference/offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2
- label: Prefix Caching Test # 9min - label: Prefix Caching Test # 9min
mirror_hardwares: [amd] mirror_hardwares: [amd]
......
...@@ -27,7 +27,7 @@ jobs: ...@@ -27,7 +27,7 @@ jobs:
version: v3.10.1 version: v3.10.1
- name: Run chart-testing (lint) - name: Run chart-testing (lint)
run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/chart-helm --charts examples/chart-helm run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/online_serving/chart-helm --charts examples/online_serving/chart-helm
- name: Setup minio - name: Setup minio
run: | run: |
...@@ -64,7 +64,7 @@ jobs: ...@@ -64,7 +64,7 @@ jobs:
run: | run: |
export AWS_ACCESS_KEY_ID=minioadmin export AWS_ACCESS_KEY_ID=minioadmin
export AWS_SECRET_ACCESS_KEY=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin
helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
- name: curl test - name: curl test
run: | run: |
......
...@@ -250,7 +250,7 @@ ENV VLLM_USAGE_SOURCE production-docker-image ...@@ -250,7 +250,7 @@ ENV VLLM_USAGE_SOURCE production-docker-image
# define sagemaker first, so it is not default from `docker build` # define sagemaker first, so it is not default from `docker build`
FROM vllm-openai-base AS vllm-sagemaker FROM vllm-openai-base AS vllm-sagemaker
COPY examples/sagemaker-entrypoint.sh . COPY examples/online_serving/sagemaker-entrypoint.sh .
RUN chmod +x sagemaker-entrypoint.sh RUN chmod +x sagemaker-entrypoint.sh
ENTRYPOINT ["./sagemaker-entrypoint.sh"] ENTRYPOINT ["./sagemaker-entrypoint.sh"]
......
...@@ -26,7 +26,7 @@ Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the serve ...@@ -26,7 +26,7 @@ Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the serve
### Offline Inference ### Offline Inference
Refer to <gh-file:examples/offline_inference_with_profiler.py> for an example. Refer to <gh-file:examples/offline_inference/offline_inference_with_profiler.py> for an example.
### OpenAI Server ### OpenAI Server
......
...@@ -61,7 +61,7 @@ run: | ...@@ -61,7 +61,7 @@ run: |
echo 'Starting gradio server...' echo 'Starting gradio server...'
git clone https://github.com/vllm-project/vllm.git || true git clone https://github.com/vllm-project/vllm.git || true
python vllm/examples/gradio_openai_chatbot_webserver.py \ python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
-m $MODEL_NAME \ -m $MODEL_NAME \
--port 8811 \ --port 8811 \
--model-url http://localhost:8081/v1 \ --model-url http://localhost:8081/v1 \
...@@ -321,7 +321,7 @@ run: | ...@@ -321,7 +321,7 @@ run: |
echo 'Starting gradio server...' echo 'Starting gradio server...'
git clone https://github.com/vllm-project/vllm.git || true git clone https://github.com/vllm-project/vllm.git || true
python vllm/examples/gradio_openai_chatbot_webserver.py \ python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
-m $MODEL_NAME \ -m $MODEL_NAME \
--port 8811 \ --port 8811 \
--model-url http://$ENDPOINT/v1 \ --model-url http://$ENDPOINT/v1 \
......
...@@ -21,7 +21,7 @@ Disaggregated prefill DOES NOT improve throughput. ...@@ -21,7 +21,7 @@ Disaggregated prefill DOES NOT improve throughput.
## Usage example ## Usage example
Please refer to `examples/disaggregated_prefill.sh` for the example usage of disaggregated prefilling. Please refer to `examples/online_serving/disaggregated_prefill.sh` for the example usage of disaggregated prefilling.
## Benchmarks ## Benchmarks
......
...@@ -47,7 +47,7 @@ outputs = llm.generate( ...@@ -47,7 +47,7 @@ outputs = llm.generate(
) )
``` ```
Check out <gh-file:examples/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. Check out <gh-file:examples/offline_inference/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
## Serving LoRA Adapters ## Serving LoRA Adapters
......
...@@ -47,7 +47,7 @@ print(f'Model is quantized and saved at "{quant_path}"') ...@@ -47,7 +47,7 @@ print(f'Model is quantized and saved at "{quant_path}"')
To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command: To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command:
```console ```console
$ python examples/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq $ python examples/offline_inference/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq
``` ```
AWQ models are also supported directly through the LLM entrypoint: AWQ models are also supported directly through the LLM entrypoint:
......
...@@ -28,7 +28,7 @@ Here is an example of how to enable this feature: ...@@ -28,7 +28,7 @@ Here is an example of how to enable this feature:
```python ```python
# two float8_e4m3fn kv cache scaling factor files are provided under tests/fp8_kv, please refer to # two float8_e4m3fn kv cache scaling factor files are provided under tests/fp8_kv, please refer to
# https://github.com/vllm-project/vllm/blob/main/examples/fp8/README.md to generate kv_cache_scales.json of your own. # https://github.com/vllm-project/vllm/blob/main/examples/other/fp8/README.md to generate kv_cache_scales.json of your own.
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
sampling_params = SamplingParams(temperature=1.3, top_p=0.8) sampling_params = SamplingParams(temperature=1.3, top_p=0.8)
......
...@@ -131,7 +131,7 @@ completion = client.chat.completions.create( ...@@ -131,7 +131,7 @@ completion = client.chat.completions.create(
print(completion.choices[0].message.content) print(completion.choices[0].message.content)
``` ```
Full example: <gh-file:examples/openai_chat_completion_structured_outputs.py> Full example: <gh-file:examples/online_serving/openai_chat_completion_structured_outputs.py>
## Experimental Automatic Parsing (OpenAI API) ## Experimental Automatic Parsing (OpenAI API)
...@@ -257,4 +257,4 @@ outputs = llm.generate( ...@@ -257,4 +257,4 @@ outputs = llm.generate(
print(outputs[0].outputs[0].text) print(outputs[0].outputs[0].text)
``` ```
Full example: <gh-file:examples/offline_inference_structured_outputs.py> Full example: <gh-file:examples/offline_inference/offline_inference_structured_outputs.py>
...@@ -12,6 +12,7 @@ EXAMPLE_DOC_DIR = ROOT_DIR / "docs/source/getting_started/examples" ...@@ -12,6 +12,7 @@ EXAMPLE_DOC_DIR = ROOT_DIR / "docs/source/getting_started/examples"
def fix_case(text: str) -> str: def fix_case(text: str) -> str:
subs = { subs = {
"api": "API", "api": "API",
"Cli": "CLI",
"cpu": "CPU", "cpu": "CPU",
"llm": "LLM", "llm": "LLM",
"tpu": "TPU", "tpu": "TPU",
...@@ -58,7 +59,7 @@ class Index: ...@@ -58,7 +59,7 @@ class Index:
content = f"# {self.title}\n\n{self.description}\n\n" content = f"# {self.title}\n\n{self.description}\n\n"
content += "```{toctree}\n" content += "```{toctree}\n"
content += f":caption: {self.caption}\n:maxdepth: {self.maxdepth}\n" content += f":caption: {self.caption}\n:maxdepth: {self.maxdepth}\n"
content += "\n".join(sorted(self.documents)) + "\n```\n" content += "\n".join(self.documents) + "\n```\n"
return content return content
...@@ -131,11 +132,14 @@ class Example: ...@@ -131,11 +132,14 @@ class Example:
ROOT_DIR) ROOT_DIR)
content = f"Source <gh-file:{self.path.relative_to(ROOT_DIR)}>.\n\n" content = f"Source <gh-file:{self.path.relative_to(ROOT_DIR)}>.\n\n"
if self.main_file.suffix == ".py":
content += f"# {self.title}\n\n"
include = "include" if self.main_file.suffix == ".md" else \ include = "include" if self.main_file.suffix == ".md" else \
"literalinclude" "literalinclude"
content += f":::{{{include}}} {make_relative(self.main_file)}\n:::\n\n" if include == "literalinclude":
content += f"# {self.title}\n\n"
content += f":::{{{include}}} {make_relative(self.main_file)}\n"
if include == "literalinclude":
content += f":language: {self.main_file.suffix[1:]}\n"
content += ":::\n\n"
if not self.other_files: if not self.other_files:
return content return content
...@@ -163,14 +167,16 @@ def generate_examples(): ...@@ -163,14 +167,16 @@ def generate_examples():
description= description=
"A collection of examples demonstrating usage of vLLM.\nAll documented examples are autogenerated using <gh-file:docs/source/generate_examples.py> from examples found in <gh-file:examples>.", # noqa: E501 "A collection of examples demonstrating usage of vLLM.\nAll documented examples are autogenerated using <gh-file:docs/source/generate_examples.py> from examples found in <gh-file:examples>.", # noqa: E501
caption="Examples", caption="Examples",
maxdepth=1) # TODO change to 2 when examples start being categorised maxdepth=2)
# Category indices stored in reverse order because they are inserted into
# examples_index.documents at index 0 in order
category_indices = { category_indices = {
"offline_inference": "other":
Index( Index(
path=EXAMPLE_DOC_DIR / "examples_offline_inference_index.md", path=EXAMPLE_DOC_DIR / "examples_other_index.md",
title="Offline Inference", title="Other",
description= description=
"Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches.", # noqa: E501 "Other examples that don't strongly fit into the online or offline serving categories.", # noqa: E501
caption="Examples", caption="Examples",
), ),
"online_serving": "online_serving":
...@@ -181,31 +187,30 @@ def generate_examples(): ...@@ -181,31 +187,30 @@ def generate_examples():
"Online serving examples demonstrate how to use vLLM in an online setting, where the model is queried for predictions in real-time.", # noqa: E501 "Online serving examples demonstrate how to use vLLM in an online setting, where the model is queried for predictions in real-time.", # noqa: E501
caption="Examples", caption="Examples",
), ),
"other": "offline_inference":
Index( Index(
path=EXAMPLE_DOC_DIR / "examples_other_index.md", path=EXAMPLE_DOC_DIR / "examples_offline_inference_index.md",
title="Other", title="Offline Inference",
description= description=
"Other examples that don't strongly fit into the online or offline serving categories.", # noqa: E501 "Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches.", # noqa: E501
caption="Examples", caption="Examples",
), ),
} }
examples = [] examples = []
glob_patterns = ["*.py", "*.md", "*.sh"]
# Find categorised examples # Find categorised examples
for category in category_indices: for category in category_indices:
category_dir = EXAMPLE_DIR / category category_dir = EXAMPLE_DIR / category
py = category_dir.glob("*.py") globs = [category_dir.glob(pattern) for pattern in glob_patterns]
md = category_dir.glob("*.md") for path in itertools.chain(*globs):
for path in itertools.chain(py, md):
examples.append(Example(path, category)) examples.append(Example(path, category))
# Find examples in subdirectories # Find examples in subdirectories
for path in category_dir.glob("*/*.md"): for path in category_dir.glob("*/*.md"):
examples.append(Example(path.parent, category)) examples.append(Example(path.parent, category))
# Find uncategorised examples # Find uncategorised examples
py = EXAMPLE_DIR.glob("*.py") globs = [EXAMPLE_DIR.glob(pattern) for pattern in glob_patterns]
md = EXAMPLE_DIR.glob("*.md") for path in itertools.chain(*globs):
for path in itertools.chain(py, md):
examples.append(Example(path)) examples.append(Example(path))
# Find examples in subdirectories # Find examples in subdirectories
for path in EXAMPLE_DIR.glob("*/*.md"): for path in EXAMPLE_DIR.glob("*/*.md"):
...@@ -215,7 +220,7 @@ def generate_examples(): ...@@ -215,7 +220,7 @@ def generate_examples():
examples.append(Example(path.parent)) examples.append(Example(path.parent))
# Generate the example documentation # Generate the example documentation
for example in examples: for example in sorted(examples, key=lambda e: e.path.stem):
doc_path = EXAMPLE_DOC_DIR / f"{example.path.stem}.md" doc_path = EXAMPLE_DOC_DIR / f"{example.path.stem}.md"
with open(doc_path, "w+") as f: with open(doc_path, "w+") as f:
f.write(example.generate()) f.write(example.generate())
......
...@@ -95,7 +95,7 @@ $ VLLM_TARGET_DEVICE=cpu python setup.py install ...@@ -95,7 +95,7 @@ $ VLLM_TARGET_DEVICE=cpu python setup.py install
$ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library $ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
$ find / -name *libtcmalloc* # find the dynamic link library path $ find / -name *libtcmalloc* # find the dynamic link library path
$ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD $ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
$ python examples/offline_inference.py # run vLLM $ python examples/offline_inference/offline_inference.py # run vLLM
``` ```
- When using online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserve CPUs 30 and 31 for the framework and use CPUs 0-29 for OpenMP:
...@@ -132,7 +132,7 @@ CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ ...@@ -132,7 +132,7 @@ CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ
# On this platform, it is recommended to only bind OpenMP threads on logical CPU cores 0-7 or 8-15 # On this platform, it is recommended to only bind OpenMP threads on logical CPU cores 0-7 or 8-15
$ export VLLM_CPU_OMP_THREADS_BIND=0-7 $ export VLLM_CPU_OMP_THREADS_BIND=0-7
$ python examples/offline_inference.py $ python examples/offline_inference/offline_inference.py
``` ```
- If using the vLLM CPU backend on a multi-socket machine with NUMA, be sure to set the CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross-NUMA-node memory access.
......
...@@ -71,4 +71,4 @@ $ --pipeline-parallel-size=2 \ ...@@ -71,4 +71,4 @@ $ --pipeline-parallel-size=2 \
$ -tp=8 $ -tp=8
``` ```
By default, a Ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equal to `parallel_config.world_size`. We recommend properly starting a Ray cluster before execution, referring to the <gh-file:examples/run_cluster.sh> helper script. By default, a Ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equal to `parallel_config.world_size`. We recommend properly starting a Ray cluster before execution, referring to the <gh-file:examples/online_serving/run_cluster.sh> helper script.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment