Unverified commit f17aec0d, authored by Reid, committed by GitHub
Browse files

[doc] Fold long code blocks to improve readability (#19926)


Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
parent 493c2753
...@@ -448,27 +448,29 @@ elements of the entire head for all context tokens. However, overall, ...@@ -448,27 +448,29 @@ elements of the entire head for all context tokens. However, overall,
all results for output have been calculated but are just stored in all results for output have been calculated but are just stored in
different thread register memory. different thread register memory.
```cpp ??? Code
float* out_smem = reinterpret_cast<float*>(shared_mem);
for (int i = NUM_WARPS; i > 1; i /= 2) {
// Upper warps write to shared memory.
...
float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
...
dst[row_idx] = accs[i];
}
// Lower warps update the output. ```cpp
const float* src = &out_smem[warp_idx * HEAD_SIZE]; float* out_smem = reinterpret_cast<float*>(shared_mem);
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { for (int i = NUM_WARPS; i > 1; i /= 2) {
// Upper warps write to shared memory.
... ...
accs[i] += src[row_idx]; float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
...
dst[row_idx] = accs[i];
}
// Lower warps update the output.
const float* src = &out_smem[warp_idx * HEAD_SIZE];
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
...
accs[i] += src[row_idx];
}
// Write out the accs.
} }
```
// Write out the accs.
}
```
## Output ## Output
......
...@@ -13,28 +13,30 @@ Plugins are user-registered code that vLLM executes. Given vLLM's architecture ( ...@@ -13,28 +13,30 @@ Plugins are user-registered code that vLLM executes. Given vLLM's architecture (
vLLM's plugin system uses the standard Python `entry_points` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin: vLLM's plugin system uses the standard Python `entry_points` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin:
```python ??? Code
# inside `setup.py` file
from setuptools import setup ```python
# inside `setup.py` file
setup(name='vllm_add_dummy_model', from setuptools import setup
version='0.1',
packages=['vllm_add_dummy_model'], setup(name='vllm_add_dummy_model',
entry_points={ version='0.1',
'vllm.general_plugins': packages=['vllm_add_dummy_model'],
["register_dummy_model = vllm_add_dummy_model:register"] entry_points={
}) 'vllm.general_plugins':
["register_dummy_model = vllm_add_dummy_model:register"]
# inside `vllm_add_dummy_model.py` file })
def register():
from vllm import ModelRegistry # inside `vllm_add_dummy_model.py` file
def register():
if "MyLlava" not in ModelRegistry.get_supported_archs(): from vllm import ModelRegistry
ModelRegistry.register_model(
"MyLlava", if "MyLlava" not in ModelRegistry.get_supported_archs():
"vllm_add_dummy_model.my_llava:MyLlava", ModelRegistry.register_model(
) "MyLlava",
``` "vllm_add_dummy_model.my_llava:MyLlava",
)
```
For more information on adding entry points to your package, please check the [official documentation](https://setuptools.pypa.io/en/latest/userguide/entry_point.html). For more information on adding entry points to your package, please check the [official documentation](https://setuptools.pypa.io/en/latest/userguide/entry_point.html).
......
...@@ -29,24 +29,26 @@ We can now submit the prompts and call `llm.generate` with the `lora_request` pa ...@@ -29,24 +29,26 @@ We can now submit the prompts and call `llm.generate` with the `lora_request` pa
of `LoRARequest` is a human identifiable name, the second parameter is a globally unique ID for the adapter and of `LoRARequest` is a human identifiable name, the second parameter is a globally unique ID for the adapter and
the third parameter is the path to the LoRA adapter. the third parameter is the path to the LoRA adapter.
```python ??? Code
sampling_params = SamplingParams(
temperature=0, ```python
max_tokens=256, sampling_params = SamplingParams(
stop=["[/assistant]"] temperature=0,
) max_tokens=256,
stop=["[/assistant]"]
prompts = [ )
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", prompts = [
] "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",
outputs = llm.generate( ]
prompts,
sampling_params, outputs = llm.generate(
lora_request=LoRARequest("sql_adapter", 1, sql_lora_path) prompts,
) sampling_params,
``` lora_request=LoRARequest("sql_adapter", 1, sql_lora_path)
)
```
Check out <gh-file:examples/offline_inference/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. Check out <gh-file:examples/offline_inference/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
...@@ -68,24 +70,26 @@ The server entrypoint accepts all other LoRA configuration parameters (`max_lora ...@@ -68,24 +70,26 @@ The server entrypoint accepts all other LoRA configuration parameters (`max_lora
etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along
with its base model (if `jq` is not installed, you can follow [this guide](https://jqlang.org/download/) to install it): with its base model (if `jq` is not installed, you can follow [this guide](https://jqlang.org/download/) to install it):
```bash ??? Command
curl localhost:8000/v1/models | jq .
{ ```bash
"object": "list", curl localhost:8000/v1/models | jq .
"data": [ {
{ "object": "list",
"id": "meta-llama/Llama-2-7b-hf", "data": [
"object": "model", {
... "id": "meta-llama/Llama-2-7b-hf",
}, "object": "model",
{ ...
"id": "sql-lora", },
"object": "model", {
... "id": "sql-lora",
} "object": "model",
] ...
} }
``` ]
}
```
Requests can specify the LoRA adapter as if it were any other model via the `model` request parameter. The requests will be Requests can specify the LoRA adapter as if it were any other model via the `model` request parameter. The requests will be
processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other
...@@ -168,36 +172,36 @@ Alternatively, follow these example steps to implement your own plugin: ...@@ -168,36 +172,36 @@ Alternatively, follow these example steps to implement your own plugin:
1. Implement the LoRAResolver interface. 1. Implement the LoRAResolver interface.
Example of a simple S3 LoRAResolver implementation: ??? Example of a simple S3 LoRAResolver implementation
```python ```python
import os import os
import s3fs import s3fs
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.lora.resolver import LoRAResolver from vllm.lora.resolver import LoRAResolver
class S3LoRAResolver(LoRAResolver): class S3LoRAResolver(LoRAResolver):
def __init__(self): def __init__(self):
self.s3 = s3fs.S3FileSystem() self.s3 = s3fs.S3FileSystem()
self.s3_path_format = os.getenv("S3_PATH_TEMPLATE") self.s3_path_format = os.getenv("S3_PATH_TEMPLATE")
self.local_path_format = os.getenv("LOCAL_PATH_TEMPLATE") self.local_path_format = os.getenv("LOCAL_PATH_TEMPLATE")
async def resolve_lora(self, base_model_name, lora_name): async def resolve_lora(self, base_model_name, lora_name):
s3_path = self.s3_path_format.format(base_model_name=base_model_name, lora_name=lora_name) s3_path = self.s3_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
local_path = self.local_path_format.format(base_model_name=base_model_name, lora_name=lora_name) local_path = self.local_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
# Download the LoRA from S3 to the local path # Download the LoRA from S3 to the local path
await self.s3._get( await self.s3._get(
s3_path, local_path, recursive=True, maxdepth=1 s3_path, local_path, recursive=True, maxdepth=1
) )
lora_request = LoRARequest( lora_request = LoRARequest(
lora_name=lora_name, lora_name=lora_name,
lora_path=local_path, lora_path=local_path,
lora_int_id=abs(hash(lora_name)) lora_int_id=abs(hash(lora_name))
) )
return lora_request return lora_request
``` ```
2. Register `LoRAResolver` plugin. 2. Register `LoRAResolver` plugin.
...@@ -234,38 +238,40 @@ The new format of `--lora-modules` is mainly to support the display of parent mo ...@@ -234,38 +238,40 @@ The new format of `--lora-modules` is mainly to support the display of parent mo
- The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter. - The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter.
- The `root` field points to the artifact location of the LoRA adapter. - The `root` field points to the artifact location of the LoRA adapter.
```bash ??? Command output
$ curl http://localhost:8000/v1/models
```bash
{ $ curl http://localhost:8000/v1/models
"object": "list",
"data": [ {
{ "object": "list",
"id": "meta-llama/Llama-2-7b-hf", "data": [
"object": "model",
"created": 1715644056,
"owned_by": "vllm",
"root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/",
"parent": null,
"permission": [
{ {
..... "id": "meta-llama/Llama-2-7b-hf",
} "object": "model",
] "created": 1715644056,
}, "owned_by": "vllm",
{ "root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/",
"id": "sql-lora", "parent": null,
"object": "model", "permission": [
"created": 1715644056, {
"owned_by": "vllm", .....
"root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/", }
"parent": meta-llama/Llama-2-7b-hf, ]
"permission": [ },
{ {
.... "id": "sql-lora",
"object": "model",
"created": 1715644056,
"owned_by": "vllm",
"root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/",
"parent": meta-llama/Llama-2-7b-hf,
"permission": [
{
....
}
]
} }
] ]
} }
] ```
}
```
This diff is collapsed.
...@@ -15,29 +15,31 @@ pip install autoawq ...@@ -15,29 +15,31 @@ pip install autoawq
After installing AutoAWQ, you are ready to quantize a model. Please refer to the [AutoAWQ documentation](https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization) for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`: After installing AutoAWQ, you are ready to quantize a model. Please refer to the [AutoAWQ documentation](https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization) for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`:
```python ??? Code
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
model_path = 'mistralai/Mistral-7B-Instruct-v0.2' ```python
quant_path = 'mistral-instruct-v0.2-awq' from awq import AutoAWQForCausalLM
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } from transformers import AutoTokenizer
# Load model model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
model = AutoAWQForCausalLM.from_pretrained( quant_path = 'mistral-instruct-v0.2-awq'
model_path, **{"low_cpu_mem_usage": True, "use_cache": False} quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Quantize # Load model
model.quantize(tokenizer, quant_config=quant_config) model = AutoAWQForCausalLM.from_pretrained(
model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Save quantized model # Quantize
model.save_quantized(quant_path) model.quantize(tokenizer, quant_config=quant_config)
tokenizer.save_pretrained(quant_path)
print(f'Model is quantized and saved at "{quant_path}"') # Save quantized model
``` model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)
print(f'Model is quantized and saved at "{quant_path}"')
```
To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command: To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command:
...@@ -49,27 +51,29 @@ python examples/offline_inference/llm_engine_example.py \ ...@@ -49,27 +51,29 @@ python examples/offline_inference/llm_engine_example.py \
AWQ models are also supported directly through the LLM entrypoint: AWQ models are also supported directly through the LLM entrypoint:
```python ??? Code
from vllm import LLM, SamplingParams
```python
# Sample prompts. from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is", # Sample prompts.
"The president of the United States is", prompts = [
"The capital of France is", "Hello, my name is",
"The future of AI is", "The president of the United States is",
] "The capital of France is",
# Create a sampling params object. "The future of AI is",
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) ]
# Create a sampling params object.
# Create an LLM. sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ")
# Generate texts from the prompts. The output is a list of RequestOutput objects # Create an LLM.
# that contain the prompt, generated text, and other information. llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ")
outputs = llm.generate(prompts, sampling_params) # Generate texts from the prompts. The output is a list of RequestOutput objects
# Print the outputs. # that contain the prompt, generated text, and other information.
for output in outputs: outputs = llm.generate(prompts, sampling_params)
prompt = output.prompt # Print the outputs.
generated_text = output.outputs[0].text for output in outputs:
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") prompt = output.prompt
``` generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
...@@ -43,17 +43,19 @@ llm = LLM( ...@@ -43,17 +43,19 @@ llm = LLM(
## Read gptq format checkpoint ## Read gptq format checkpoint
```python ??? Code
from vllm import LLM
import torch ```python
from vllm import LLM
# "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint. import torch
model_id = "hxbgsyxh/llama-13b-4bit-g-1"
llm = LLM( # "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint.
model=model_id, model_id = "hxbgsyxh/llama-13b-4bit-g-1"
dtype=torch.float16, llm = LLM(
trust_remote_code=True, model=model_id,
quantization="bitblas", dtype=torch.float16,
max_model_len=1024 trust_remote_code=True,
) quantization="bitblas",
``` max_model_len=1024
)
```
...@@ -58,22 +58,24 @@ For FP8 quantization, we can recover accuracy with simple RTN quantization. We r ...@@ -58,22 +58,24 @@ For FP8 quantization, we can recover accuracy with simple RTN quantization. We r
Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow. Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow.
```python ??? Code
from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
# Configure the simple PTQ quantization ```python
recipe = QuantizationModifier( from llmcompressor.transformers import oneshot
targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]) from llmcompressor.modifiers.quantization import QuantizationModifier
# Apply the quantization algorithm. # Configure the simple PTQ quantization
oneshot(model=model, recipe=recipe) recipe = QuantizationModifier(
targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
# Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic # Apply the quantization algorithm.
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" oneshot(model=model, recipe=recipe)
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR) # Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic
``` SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
```
### 3. Evaluating Accuracy ### 3. Evaluating Accuracy
......
...@@ -41,42 +41,44 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ ...@@ -41,42 +41,44 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
You can also use the GGUF model directly through the LLM entrypoint: You can also use the GGUF model directly through the LLM entrypoint:
```python ??? Code
from vllm import LLM, SamplingParams
```python
# In this script, we demonstrate how to pass input to the chat method: from vllm import LLM, SamplingParams
conversation = [
{ # In this script, we demonstrate how to pass input to the chat method:
"role": "system", conversation = [
"content": "You are a helpful assistant" {
}, "role": "system",
{ "content": "You are a helpful assistant"
"role": "user", },
"content": "Hello" {
}, "role": "user",
{ "content": "Hello"
"role": "assistant", },
"content": "Hello! How can I assist you today?" {
}, "role": "assistant",
{ "content": "Hello! How can I assist you today?"
"role": "user", },
"content": "Write an essay about the importance of higher education.", {
}, "role": "user",
] "content": "Write an essay about the importance of higher education.",
},
# Create a sampling params object. ]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create a sampling params object.
# Create an LLM. sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0") # Create an LLM.
# Generate texts from the prompts. The output is a list of RequestOutput objects llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
# that contain the prompt, generated text, and other information. tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
outputs = llm.chat(conversation, sampling_params) # Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
# Print the outputs. outputs = llm.chat(conversation, sampling_params)
for output in outputs:
prompt = output.prompt # Print the outputs.
generated_text = output.outputs[0].text for output in outputs:
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") prompt = output.prompt
``` generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
...@@ -31,28 +31,30 @@ After installing GPTQModel, you are ready to quantize a model. Please refer to t ...@@ -31,28 +31,30 @@ After installing GPTQModel, you are ready to quantize a model. Please refer to t
Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`: Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
```python ??? Code
from datasets import load_dataset
from gptqmodel import GPTQModel, QuantizeConfig
model_id = "meta-llama/Llama-3.2-1B-Instruct" ```python
quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit" from datasets import load_dataset
from gptqmodel import GPTQModel, QuantizeConfig
calibration_dataset = load_dataset( model_id = "meta-llama/Llama-3.2-1B-Instruct"
"allenai/c4", quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit"
data_files="en/c4-train.00001-of-01024.json.gz",
split="train"
).select(range(1024))["text"]
quant_config = QuantizeConfig(bits=4, group_size=128) calibration_dataset = load_dataset(
"allenai/c4",
data_files="en/c4-train.00001-of-01024.json.gz",
split="train"
).select(range(1024))["text"]
model = GPTQModel.load(model_id, quant_config) quant_config = QuantizeConfig(bits=4, group_size=128)
# increase `batch_size` to match gpu/vram specs to speed up quantization model = GPTQModel.load(model_id, quant_config)
model.quantize(calibration_dataset, batch_size=2)
model.save(quant_path) # increase `batch_size` to match gpu/vram specs to speed up quantization
``` model.quantize(calibration_dataset, batch_size=2)
model.save(quant_path)
```
## Running a quantized model with vLLM ## Running a quantized model with vLLM
...@@ -67,32 +69,34 @@ python examples/offline_inference/llm_engine_example.py \ ...@@ -67,32 +69,34 @@ python examples/offline_inference/llm_engine_example.py \
GPTQModel quantized models are also supported directly through the LLM entrypoint: GPTQModel quantized models are also supported directly through the LLM entrypoint:
```python ??? Code
from vllm import LLM, SamplingParams
```python
# Sample prompts. from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is", # Sample prompts.
"The president of the United States is", prompts = [
"The capital of France is", "Hello, my name is",
"The future of AI is", "The president of the United States is",
] "The capital of France is",
"The future of AI is",
# Create a sampling params object. ]
sampling_params = SamplingParams(temperature=0.6, top_p=0.9)
# Create a sampling params object.
# Create an LLM. sampling_params = SamplingParams(temperature=0.6, top_p=0.9)
llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2")
# Create an LLM.
# Generate texts from the prompts. The output is a list of RequestOutput objects llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2")
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params) # Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
# Print the outputs. outputs = llm.generate(prompts, sampling_params)
print("-"*50)
for output in outputs: # Print the outputs.
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-"*50) print("-"*50)
``` for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-"*50)
```
...@@ -53,51 +53,55 @@ When quantizing weights to INT4, you need sample data to estimate the weight upd ...@@ -53,51 +53,55 @@ When quantizing weights to INT4, you need sample data to estimate the weight upd
It's best to use calibration data that closely matches your deployment data. It's best to use calibration data that closely matches your deployment data.
For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`: For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:
```python ??? Code
from datasets import load_dataset
NUM_CALIBRATION_SAMPLES = 512 ```python
MAX_SEQUENCE_LENGTH = 2048 from datasets import load_dataset
# Load and preprocess the dataset NUM_CALIBRATION_SAMPLES = 512
ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft") MAX_SEQUENCE_LENGTH = 2048
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
def preprocess(example): # Load and preprocess the dataset
return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)} ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
ds = ds.map(preprocess) ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
def tokenize(sample): def preprocess(example):
return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False) return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
ds = ds.map(tokenize, remove_columns=ds.column_names) ds = ds.map(preprocess)
```
def tokenize(sample):
return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
ds = ds.map(tokenize, remove_columns=ds.column_names)
```
### 3. Applying Quantization ### 3. Applying Quantization
Now, apply the quantization algorithms: Now, apply the quantization algorithms:
```python ??? Code
from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
# Configure the quantization algorithms
recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
# Apply quantization
oneshot(
model=model,
dataset=ds,
recipe=recipe,
max_seq_length=MAX_SEQUENCE_LENGTH,
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)
# Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128 ```python
SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128" from llmcompressor.transformers import oneshot
model.save_pretrained(SAVE_DIR, save_compressed=True) from llmcompressor.modifiers.quantization import GPTQModifier
tokenizer.save_pretrained(SAVE_DIR) from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
```
# Configure the quantization algorithms
recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
# Apply quantization
oneshot(
model=model,
dataset=ds,
recipe=recipe,
max_seq_length=MAX_SEQUENCE_LENGTH,
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)
# Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128
SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
```
This process creates a W4A16 model with weights quantized to 4-bit integers. This process creates a W4A16 model with weights quantized to 4-bit integers.
...@@ -137,34 +141,36 @@ $ lm_eval --model vllm \ ...@@ -137,34 +141,36 @@ $ lm_eval --model vllm \
The following is an example of an expanded quantization recipe you can tune to your own use case: The following is an example of an expanded quantization recipe you can tune to your own use case:
```python ??? Code
from compressed_tensors.quantization import (
QuantizationArgs, ```python
QuantizationScheme, from compressed_tensors.quantization import (
QuantizationStrategy, QuantizationArgs,
QuantizationType, QuantizationScheme,
) QuantizationStrategy,
recipe = GPTQModifier( QuantizationType,
targets="Linear", )
config_groups={ recipe = GPTQModifier(
"config_group": QuantizationScheme( targets="Linear",
targets=["Linear"], config_groups={
weights=QuantizationArgs( "config_group": QuantizationScheme(
num_bits=4, targets=["Linear"],
type=QuantizationType.INT, weights=QuantizationArgs(
strategy=QuantizationStrategy.GROUP, num_bits=4,
group_size=128, type=QuantizationType.INT,
symmetric=True, strategy=QuantizationStrategy.GROUP,
dynamic=False, group_size=128,
actorder="weight", symmetric=True,
dynamic=False,
actorder="weight",
),
), ),
), },
}, ignore=["lm_head"],
ignore=["lm_head"], update_size=NUM_CALIBRATION_SAMPLES,
update_size=NUM_CALIBRATION_SAMPLES, dampening_frac=0.01
dampening_frac=0.01 )
) ```
```
## Troubleshooting and Support ## Troubleshooting and Support
......
...@@ -54,54 +54,60 @@ When quantizing activations to INT8, you need sample data to estimate the activa ...@@ -54,54 +54,60 @@ When quantizing activations to INT8, you need sample data to estimate the activa
It's best to use calibration data that closely matches your deployment data. It's best to use calibration data that closely matches your deployment data.
For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`: For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:
```python ??? Code
from datasets import load_dataset
NUM_CALIBRATION_SAMPLES = 512 ```python
MAX_SEQUENCE_LENGTH = 2048 from datasets import load_dataset
# Load and preprocess the dataset NUM_CALIBRATION_SAMPLES = 512
ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft") MAX_SEQUENCE_LENGTH = 2048
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
def preprocess(example): # Load and preprocess the dataset
return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)} ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
ds = ds.map(preprocess) ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
def tokenize(sample): def preprocess(example):
return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False) return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
ds = ds.map(tokenize, remove_columns=ds.column_names) ds = ds.map(preprocess)
```
def tokenize(sample):
return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
ds = ds.map(tokenize, remove_columns=ds.column_names)
```
</details>
### 3. Applying Quantization ### 3. Applying Quantization
Now, apply the quantization algorithms: Now, apply the quantization algorithms:
```python ??? Code
from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier ```python
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
# Configure the quantization algorithms from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
recipe = [
SmoothQuantModifier(smoothing_strength=0.8), # Configure the quantization algorithms
GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]), recipe = [
] SmoothQuantModifier(smoothing_strength=0.8),
GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
# Apply quantization ]
oneshot(
model=model, # Apply quantization
dataset=ds, oneshot(
recipe=recipe, model=model,
max_seq_length=MAX_SEQUENCE_LENGTH, dataset=ds,
num_calibration_samples=NUM_CALIBRATION_SAMPLES, recipe=recipe,
) max_seq_length=MAX_SEQUENCE_LENGTH,
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
# Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token )
SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
model.save_pretrained(SAVE_DIR, save_compressed=True) # Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token
tokenizer.save_pretrained(SAVE_DIR) SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
``` model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
```
This process creates a W8A8 model with weights and activations quantized to 8-bit integers. This process creates a W8A8 model with weights and activations quantized to 8-bit integers.
......
...@@ -14,24 +14,26 @@ You can quantize HuggingFace models using the example scripts provided in the Te ...@@ -14,24 +14,26 @@ You can quantize HuggingFace models using the example scripts provided in the Te
Below is an example showing how to quantize a model using modelopt's PTQ API: Below is an example showing how to quantize a model using modelopt's PTQ API:
```python ??? Code
import modelopt.torch.quantization as mtq
from transformers import AutoModelForCausalLM
# Load the model from HuggingFace ```python
model = AutoModelForCausalLM.from_pretrained("<path_or_model_id>") import modelopt.torch.quantization as mtq
from transformers import AutoModelForCausalLM
# Select the quantization config, for example, FP8 # Load the model from HuggingFace
config = mtq.FP8_DEFAULT_CFG model = AutoModelForCausalLM.from_pretrained("<path_or_model_id>")
# Define a forward loop function for calibration # Select the quantization config, for example, FP8
def forward_loop(model): config = mtq.FP8_DEFAULT_CFG
for data in calib_set:
model(data)
# PTQ with in-place replacement of quantized modules # Define a forward loop function for calibration
model = mtq.quantize(model, config, forward_loop) def forward_loop(model):
``` for data in calib_set:
model(data)
# PTQ with in-place replacement of quantized modules
model = mtq.quantize(model, config, forward_loop)
```
After the model is quantized, you can export it to a quantized checkpoint using the export API:
...@@ -48,31 +50,33 @@ with torch.inference_mode(): ...@@ -48,31 +50,33 @@ with torch.inference_mode():
The quantized checkpoint can then be deployed with vLLM. As an example, the following code shows how to deploy `nvidia/Llama-3.1-8B-Instruct-FP8`, which is the FP8 quantized checkpoint derived from `meta-llama/Llama-3.1-8B-Instruct`, using vLLM:
```python ??? Code
from vllm import LLM, SamplingParams
def main(): ```python
from vllm import LLM, SamplingParams
model_id = "nvidia/Llama-3.1-8B-Instruct-FP8" def main():
# Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)
sampling_params = SamplingParams(temperature=0.8, top_p=0.9) model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
# Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)
prompts = [ sampling_params = SamplingParams(temperature=0.8, top_p=0.9)
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
outputs = llm.generate(prompts, sampling_params) prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
for output in outputs: outputs = llm.generate(prompts, sampling_params)
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
if __name__ == "__main__": for output in outputs:
main() prompt = output.prompt
``` generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
if __name__ == "__main__":
main()
```
This diff is collapsed.
...@@ -15,26 +15,28 @@ pip install \ ...@@ -15,26 +15,28 @@ pip install \
## Quantizing HuggingFace Models
You can quantize your own huggingface model with torchao, e.g. [transformers](https://huggingface.co/docs/transformers/main/en/quantization/torchao) and [diffusers](https://huggingface.co/docs/diffusers/en/quantization/torchao), and save the checkpoint to huggingface hub like [this](https://huggingface.co/jerryzh168/llama3-8b-int8wo) with the following example code:
```Python ??? Code
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer ```Python
from torchao.quantization import Int8WeightOnlyConfig import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
model_name = "meta-llama/Meta-Llama-3-8B" from torchao.quantization import Int8WeightOnlyConfig
quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
quantized_model = AutoModelForCausalLM.from_pretrained( model_name = "meta-llama/Meta-Llama-3-8B"
model_name, quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
torch_dtype="auto", quantized_model = AutoModelForCausalLM.from_pretrained(
device_map="auto", model_name,
quantization_config=quantization_config torch_dtype="auto",
) device_map="auto",
tokenizer = AutoTokenizer.from_pretrained(model_name) quantization_config=quantization_config
input_text = "What are we having for dinner?" )
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") tokenizer = AutoTokenizer.from_pretrained(model_name)
input_text = "What are we having for dinner?"
hub_repo = # YOUR HUB REPO ID input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
tokenizer.push_to_hub(hub_repo)
quantized_model.push_to_hub(hub_repo, safe_serialization=False) hub_repo = # YOUR HUB REPO ID
``` tokenizer.push_to_hub(hub_repo)
quantized_model.push_to_hub(hub_repo, safe_serialization=False)
```
Alternatively, you can use the [TorchAO Quantization space](https://huggingface.co/spaces/medmekk/TorchAO_Quantization) for quantizing models with a simple UI.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment