删除多个示例文件，并在README中添加新的示例和配置文件以支持最新功能。

0ac94a70 · laibao · 6f676d33 · 0ac94a70 · 0ac94a70 · 0ac94a70
Commit 0ac94a70 authored Aug 16, 2025 by laibao
20 changed files
--- a/examples/offline_inference_with_profiler.py
+++ b/examples/offline_inference_with_profiler.py
--- a/examples/offline_inference/spec_decode.py
+++ b/examples/offline_inference/spec_decode.py
--- a/examples/offline_inference/structured_outputs.py
+++ b/examples/offline_inference/structured_outputs.py
--- a/examples/offline_inference/torchrun_example.py
+++ b/examples/offline_inference/torchrun_example.py
--- a/examples/offline_inference/tpu.py
+++ b/examples/offline_inference/tpu.py
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
--- a/examples/offline_inference/vision_language_embedding.py
+++ b/examples/offline_inference/vision_language_embedding.py
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
--- a/examples/offline_inference_audio_language.py
+++ b/examples/offline_inference_audio_language.py
--- a/examples/offline_inference_chat.py
+++ b/examples/offline_inference_chat.py
--- a/examples/offline_inference_encoder_decoder.py
+++ b/examples/offline_inference_encoder_decoder.py
--- a/examples/offline_inference_neuron_int8_quantization.py
+++ b/examples/offline_inference_neuron_int8_quantization.py
--- a/examples/offline_inference_tpu.py
+++ b/examples/offline_inference_tpu.py
-from vllm import LLM, SamplingParams
-prompts = [
-    "A robot may not injure a human being",
-    "It is only with the heart that one can see rightly;",
-    "The greatest glory in living lies not in never falling,",
-]
-answers = [
-    " or, through inaction, allow a human being to come to harm.",
-    " what is essential is invisible to the eye.",
-    " but in rising every time we fall.",
-]
-N = 1
-# Currently, top-p sampling is disabled. `top_p` should be 1.0.
-sampling_params = SamplingParams(temperature=0.7,
-                                 top_p=1.0,
-                                 n=N,
-                                 max_tokens=16)
-# Set `enforce_eager=True` to avoid ahead-of-time compilation.
-# In real workloads, `enforce_eager` should be `False`.
-llm = LLM(model="google/gemma-2b", enforce_eager=True)
-outputs = llm.generate(prompts, sampling_params)
-for output, answer in zip(outputs, answers):
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-    assert generated_text.startswith(answer)
--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
--- a/examples/offline_inference_vision_language_multi_image.py
+++ b/examples/offline_inference_vision_language_multi_image.py
--- a/examples/offline_streaming_inference_chat_demo.py
+++ b/examples/offline_streaming_inference_chat_demo.py
--- a/examples/online_serving/api_client.py
+++ b/examples/online_serving/api_client.py
--- a/examples/online_serving/chart-helm/.helmignore
+++ b/examples/online_serving/chart-helm/.helmignore
+*.png
+.git/
+ct.yaml
+lintconf.yaml
+values.schema.json
+/workflows
\ No newline at end of file
--- a/examples/online_serving/chart-helm/Chart.yaml
+++ b/examples/online_serving/chart-helm/Chart.yaml
--- a/examples/online_serving/chart-helm/README.md
+++ b/examples/online_serving/chart-helm/README.md