init qwen3

007858cc · zhuwenwen · 007858cc · 007858cc · 007858cc · 007858cc
Commit 007858cc authored May 06, 2025 by zhuwenwen
20 changed files
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
--- a/benchmarks/kernels/benchmark_shapes.py
+++ b/benchmarks/kernels/benchmark_shapes.py
+WEIGHT_SHAPES = {
+    "ideal": [[4 * 256 * 32, 256 * 32]],
+    "mistralai/Mistral-7B-v0.1/TP1": [
+        [4096, 6144],
+        [4096, 4096],
+        [4096, 28672],
+        [14336, 4096],
+    ],
+    "mistralai/Mistral-7B-v0.1/TP2": [
+        [4096, 3072],
+        [2048, 4096],
+        [4096, 14336],
+        [7168, 4096],
+    ],
+    "mistralai/Mistral-7B-v0.1/TP4": [
+        [4096, 1536],
+        [1024, 4096],
+        [4096, 7168],
+        [3584, 4096],
+    ],
+    "meta-llama/Llama-2-7b-hf/TP1": [
+        [4096, 12288],
+        [4096, 4096],
+        [4096, 22016],
+        [11008, 4096],
+    ],
+    "meta-llama/Llama-2-7b-hf/TP2": [
+        [4096, 6144],
+        [2048, 4096],
+        [4096, 11008],
+        [5504, 4096],
+    ],
+    "meta-llama/Llama-2-7b-hf/TP4": [
+        [4096, 3072],
+        [1024, 4096],
+        [4096, 5504],
+        [2752, 4096],
+    ],
+    "meta-llama/Llama-2-13b-hf/TP1": [
+        [5120, 15360],
+        [5120, 5120],
+        [5120, 27648],
+        [13824, 5120],
+    ],
+    "meta-llama/Llama-2-13b-hf/TP2": [
+        [5120, 7680],
+        [2560, 5120],
+        [5120, 13824],
+        [6912, 5120],
+    ],
+    "meta-llama/Llama-2-13b-hf/TP4": [
+        [5120, 3840],
+        [1280, 5120],
+        [5120, 6912],
+        [3456, 5120],
+    ],
+    "meta-llama/Llama-2-70b-hf/TP1": [
+        [8192, 10240],
+        [8192, 8192],
+        [8192, 57344],
+        [28672, 8192],
+    ],
+    "meta-llama/Llama-2-70b-hf/TP2": [
+        [8192, 5120],
+        [4096, 8192],
+        [8192, 28672],
+        [14336, 8192],
+    ],
+    "meta-llama/Llama-2-70b-hf/TP4": [
+        [8192, 2560],
+        [2048, 8192],
+        [8192, 14336],
+        [7168, 8192],
+    ],
+}
--- a/benchmarks/kernels/graph_machete_bench.py
+++ b/benchmarks/kernels/graph_machete_bench.py
--- a/benchmarks/kernels/requirements.txt
+++ b/benchmarks/kernels/requirements.txt
+pandas
\ No newline at end of file
--- a/benchmarks/kernels/weight_shapes.py
+++ b/benchmarks/kernels/weight_shapes.py
--- a/benchmarks/launch_tgi_server.sh
+++ b/benchmarks/launch_tgi_server.sh
+#!/bin/bash
+
+PORT=8000
+MODEL=$1
+TOKENS=$2
+
+docker run -e HF_TOKEN=$HF_TOKEN --gpus all --shm-size 1g -p $PORT:80 \
+           -v $PWD/data:/data \
+           ghcr.io/huggingface/text-generation-inference:2.2.0 \
+           --model-id $MODEL \
+           --sharded false  \
+           --max-input-length 1024 \
+           --max-total-tokens 2048 \
+           --max-best-of 5 \
+           --max-concurrent-requests 5000 \
+           --max-batch-total-tokens $TOKENS
--- a/benchmarks/overheads/benchmark_hashing.py
+++ b/benchmarks/overheads/benchmark_hashing.py
--- a/benchmarks/sonnet.txt
+++ b/benchmarks/sonnet.txt
--- a/doc/qwen3.jpg
+++ b/doc/qwen3.jpg
--- a/doc/qwen3.png
+++ b/doc/qwen3.png
--- a/examples/cpu_offload.py
+++ b/examples/cpu_offload.py
+from vllm import LLM, SamplingParams
+
+# Sample prompts.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+# Create a sampling params object.
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+# Create an LLM.
+llm = LLM(model="meta-llama/Llama-2-13b-chat-hf", cpu_offload_gb=10)
+# Generate texts from the prompts. The output is a list of RequestOutput objects
+# that contain the prompt, generated text, and other information.
+outputs = llm.generate(prompts, sampling_params)
+# Print the outputs.
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
--- a/examples/gguf_inference.py
+++ b/examples/gguf_inference.py
--- a/examples/gradio_openai_chatbot_webserver.py
+++ b/examples/gradio_openai_chatbot_webserver.py
--- a/examples/gradio_webserver.py
+++ b/examples/gradio_webserver.py
--- a/examples/medusa/README.md
+++ b/examples/medusa/README.md
--- a/examples/medusa/medusa_benchmark_throughput.py
+++ b/examples/medusa/medusa_benchmark_throughput.py
--- a/examples/medusa/medusa_weight_converter.py
+++ b/examples/medusa/medusa_weight_converter.py
--- a/examples/offline_chat_with_tools.py
+++ b/examples/offline_chat_with_tools.py
--- a/examples/offline_inference.py
+++ b/examples/offline_inference.py
--- a/examples/offline_inference_audio_language.py
+++ b/examples/offline_inference_audio_language.py