update vllm0.6.2

8228a79e · laibao · d77d1901 · 8228a79e · d77d1901 · d77d1901
Commit 8228a79e authored Dec 17, 2024 by laibao
20 changed files
--- a/examples/offline_inference_chat.py
+++ b/examples/offline_inference_chat.py
--- a/examples/offline_inference_distributed.py
+++ b/examples/offline_inference_distributed.py
--- a/examples/offline_inference_embedding.py
+++ b/examples/offline_inference_embedding.py
-from vllm import LLM
-
-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-
-# Create an LLM.
-model = LLM(model="intfloat/e5-mistral-7b-instruct", enforce_eager=True)
-# Generate embedding. The output is a list of EmbeddingRequestOutputs.
-outputs = model.encode(prompts)
-# Print the outputs.
-for output in outputs:
-    print(output.outputs.embedding)  # list of 4096 floats
--- a/examples/offline_inference_encoder_decoder.py
+++ b/examples/offline_inference_encoder_decoder.py
--- a/examples/offline_inference_mlpspeculator.py
+++ b/examples/offline_inference_mlpspeculator.py
--- a/examples/offline_inference_neuron.py
+++ b/examples/offline_inference_neuron.py
+import os
+
 from vllm import LLM, SamplingParams

+# Creates XLA HLO graphs for all the context length buckets.
+os.environ['NEURON_CONTEXT_LENGTH_BUCKETS'] = "128,512,1024,2048"
+# Creates XLA HLO graphs for all the token gen buckets.
+os.environ['NEURON_TOKEN_GEN_BUCKETS'] = "128,512,1024,2048"
+# Quantizes neuron model weights to int8.
+# The default config for quantization is int8 dtype.
+os.environ['NEURON_QUANT_DTYPE'] = "s8"
+
 # Sample prompts.
 prompts = [
    "Hello, my name is",
@@ -19,12 +29,16 @@ llm = LLM(
    # Currently, this is a known limitation in continuous batching support
    # in transformers-neuronx.
    # TODO(liangfu): Support paged-attention in transformers-neuronx.
-    max_model_len=128,
-    block_size=128,
+    max_model_len=2048,
+    block_size=2048,
    # The device can be automatically detected when AWS Neuron SDK is installed.
    # The device argument can be either unspecified for automated detection,
    # or explicitly assigned.
    device="neuron",
+    quantization="neuron_quant",
+    override_neuron_config={
+        "cast_logits_dtype": "bfloat16",
+    },
    tensor_parallel_size=2)
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.

--- a/examples/offline_inference_openai.md
+++ b/examples/offline_inference_openai.md
--- a/examples/offline_inference_pixtral.py
+++ b/examples/offline_inference_pixtral.py
--- a/examples/offline_inference_tpu.py
+++ b/examples/offline_inference_tpu.py
--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
--- a/examples/offline_inference_vision_language_multi_image.py
+++ b/examples/offline_inference_vision_language_multi_image.py
--- a/examples/offline_inference_with_prefix.py
+++ b/examples/offline_inference_with_prefix.py
--- a/examples/offline_inference_with_profiler.py
+++ b/examples/offline_inference_with_profiler.py
--- a/examples/offline_streaming_inference_chat_demo.py
+++ b/examples/offline_streaming_inference_chat_demo.py
--- a/examples/openai_audio_api_client.py
+++ b/examples/openai_audio_api_client.py
--- a/examples/openai_chat_completion_client.py
+++ b/examples/openai_chat_completion_client.py
--- a/examples/openai_chat_completion_client_with_tools.py
+++ b/examples/openai_chat_completion_client_with_tools.py
--- a/examples/openai_completion_client.py
+++ b/examples/openai_completion_client.py
--- a/examples/openai_embedding_client.py
+++ b/examples/openai_embedding_client.py
--- a/examples/openai_example_batch.jsonl
+++ b/examples/openai_example_batch.jsonl
-{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
-{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}