Unverified commit 62566979, authored by Cyrus Leung, committed by GitHub
Browse files

[Doc] ruff format remaining Python examples (#26795)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 71557a5f
...@@ -22,13 +22,15 @@ After installing AutoAWQ, you are ready to quantize a model. Please refer to the ...@@ -22,13 +22,15 @@ After installing AutoAWQ, you are ready to quantize a model. Please refer to the
from awq import AutoAWQForCausalLM from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer from transformers import AutoTokenizer
model_path = 'mistralai/Mistral-7B-Instruct-v0.2' model_path = "mistralai/Mistral-7B-Instruct-v0.2"
quant_path = 'mistral-instruct-v0.2-awq' quant_path = "mistral-instruct-v0.2-awq"
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}
# Load model # Load model
model = AutoAWQForCausalLM.from_pretrained( model = AutoAWQForCausalLM.from_pretrained(
model_path, **{"low_cpu_mem_usage": True, "use_cache": False} model_path,
low_cpu_mem_usage=True,
use_cache=False,
) )
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
......
...@@ -34,7 +34,7 @@ llm = LLM( ...@@ -34,7 +34,7 @@ llm = LLM(
model=model_id, model=model_id,
dtype=torch.bfloat16, dtype=torch.bfloat16,
trust_remote_code=True, trust_remote_code=True,
quantization="bitblas" quantization="bitblas",
) )
``` ```
...@@ -53,6 +53,6 @@ llm = LLM( ...@@ -53,6 +53,6 @@ llm = LLM(
dtype=torch.float16, dtype=torch.float16,
trust_remote_code=True, trust_remote_code=True,
quantization="bitblas", quantization="bitblas",
max_model_len=1024 max_model_len=1024,
) )
``` ```
...@@ -27,7 +27,7 @@ model_id = "unsloth/tinyllama-bnb-4bit" ...@@ -27,7 +27,7 @@ model_id = "unsloth/tinyllama-bnb-4bit"
llm = LLM( llm = LLM(
model=model_id, model=model_id,
dtype=torch.bfloat16, dtype=torch.bfloat16,
trust_remote_code=True trust_remote_code=True,
) )
``` ```
...@@ -43,7 +43,7 @@ llm = LLM( ...@@ -43,7 +43,7 @@ llm = LLM(
model=model_id, model=model_id,
dtype=torch.bfloat16, dtype=torch.bfloat16,
trust_remote_code=True, trust_remote_code=True,
quantization="bitsandbytes" quantization="bitsandbytes",
) )
``` ```
......
...@@ -41,7 +41,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM ...@@ -41,7 +41,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained( model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto", MODEL_ID,
device_map="auto",
torch_dtype="auto",
) )
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
``` ```
...@@ -63,7 +65,10 @@ Since simple RTN does not require data for weight quantization and the activatio ...@@ -63,7 +65,10 @@ Since simple RTN does not require data for weight quantization and the activatio
# Configure the simple PTQ quantization # Configure the simple PTQ quantization
recipe = QuantizationModifier( recipe = QuantizationModifier(
targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]) targets="Linear",
scheme="FP8_DYNAMIC",
ignore=["lm_head"],
)
# Apply the quantization algorithm. # Apply the quantization algorithm.
oneshot(model=model, recipe=recipe) oneshot(model=model, recipe=recipe)
......
...@@ -47,15 +47,15 @@ You can also use the GGUF model directly through the LLM entrypoint: ...@@ -47,15 +47,15 @@ You can also use the GGUF model directly through the LLM entrypoint:
conversation = [ conversation = [
{ {
"role": "system", "role": "system",
"content": "You are a helpful assistant" "content": "You are a helpful assistant",
}, },
{ {
"role": "user", "role": "user",
"content": "Hello" "content": "Hello",
}, },
{ {
"role": "assistant", "role": "assistant",
"content": "Hello! How can I assist you today?" "content": "Hello! How can I assist you today?",
}, },
{ {
"role": "user", "role": "user",
...@@ -67,8 +67,10 @@ You can also use the GGUF model directly through the LLM entrypoint: ...@@ -67,8 +67,10 @@ You can also use the GGUF model directly through the LLM entrypoint:
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM. # Create an LLM.
llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", llm = LLM(
tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0") model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
)
# Generate texts from the prompts. The output is a list of RequestOutput objects # Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information. # that contain the prompt, generated text, and other information.
outputs = llm.chat(conversation, sampling_params) outputs = llm.chat(conversation, sampling_params)
......
...@@ -40,7 +40,7 @@ Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`: ...@@ -40,7 +40,7 @@ Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
calibration_dataset = load_dataset( calibration_dataset = load_dataset(
"allenai/c4", "allenai/c4",
data_files="en/c4-train.00001-of-01024.json.gz", data_files="en/c4-train.00001-of-01024.json.gz",
split="train" split="train",
).select(range(1024))["text"] ).select(range(1024))["text"]
quant_config = QuantizeConfig(bits=4, group_size=128) quant_config = QuantizeConfig(bits=4, group_size=128)
......
...@@ -39,7 +39,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM ...@@ -39,7 +39,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained( model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto", MODEL_ID,
device_map="auto",
torch_dtype="auto",
) )
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
``` ```
...@@ -166,7 +168,7 @@ The following is an example of an expanded quantization recipe you can tune to y ...@@ -166,7 +168,7 @@ The following is an example of an expanded quantization recipe you can tune to y
}, },
ignore=["lm_head"], ignore=["lm_head"],
update_size=NUM_CALIBRATION_SAMPLES, update_size=NUM_CALIBRATION_SAMPLES,
dampening_frac=0.01 dampening_frac=0.01,
) )
``` ```
......
...@@ -44,7 +44,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM ...@@ -44,7 +44,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained( model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto", MODEL_ID,
device_map="auto",
torch_dtype="auto",
) )
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
``` ```
......
...@@ -56,9 +56,9 @@ The quantized checkpoint can then be deployed with vLLM. As an example, the foll ...@@ -56,9 +56,9 @@ The quantized checkpoint can then be deployed with vLLM. As an example, the foll
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
def main(): def main():
model_id = "nvidia/Llama-3.1-8B-Instruct-FP8" model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
# Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
# Ensure you specify quantization="modelopt" when loading the modelopt checkpoint
llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True) llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)
sampling_params = SamplingParams(temperature=0.8, top_p=0.9) sampling_params = SamplingParams(temperature=0.8, top_p=0.9)
......
...@@ -41,9 +41,11 @@ Here is an example of how to enable FP8 quantization: ...@@ -41,9 +41,11 @@ Here is an example of how to enable FP8 quantization:
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
sampling_params = SamplingParams(temperature=0.7, top_p=0.8) sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", llm = LLM(
kv_cache_dtype="fp8", model="meta-llama/Llama-2-7b-chat-hf",
calculate_kv_scales=True) kv_cache_dtype="fp8",
calculate_kv_scales=True,
)
prompt = "London is the capital of" prompt = "London is the capital of"
out = llm.generate(prompt, sampling_params)[0].outputs[0].text out = llm.generate(prompt, sampling_params)[0].outputs[0].text
print(out) print(out)
......
...@@ -48,7 +48,9 @@ to fetch model and tokenizer. ...@@ -48,7 +48,9 @@ to fetch model and tokenizer.
MAX_SEQ_LEN = 512 MAX_SEQ_LEN = 512
model = AutoModelForCausalLM.from_pretrained( model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto", MODEL_ID,
device_map="auto",
torch_dtype="auto",
) )
model.eval() model.eval()
...@@ -75,10 +77,18 @@ to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calib ...@@ -75,10 +77,18 @@ to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calib
dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation") dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
text_data = dataset["text"][:NUM_CALIBRATION_DATA] text_data = dataset["text"][:NUM_CALIBRATION_DATA]
tokenized_outputs = tokenizer(text_data, return_tensors="pt", tokenized_outputs = tokenizer(
padding=True, truncation=True, max_length=MAX_SEQ_LEN) text_data,
calib_dataloader = DataLoader(tokenized_outputs['input_ids'], return_tensors="pt",
batch_size=BATCH_SIZE, drop_last=True) padding=True,
truncation=True,
max_length=MAX_SEQ_LEN,
)
calib_dataloader = DataLoader(
tokenized_outputs['input_ids'],
batch_size=BATCH_SIZE,
drop_last=True,
)
``` ```
### 3. Set the Quantization Configuration ### 3. Set the Quantization Configuration
...@@ -103,26 +113,32 @@ kv-cache and the quantization algorithm is AutoSmoothQuant. ...@@ -103,26 +113,32 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
load_quant_algo_config_from_file) load_quant_algo_config_from_file)
# Define fp8/per-tensor/static spec. # Define fp8/per-tensor/static spec.
FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max", FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(
is_dynamic=False).to_quantization_spec() observer_method="min_max",
is_dynamic=False,
).to_quantization_spec()
# Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC. # Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC, global_quant_config = QuantizationConfig(
weight=FP8_PER_TENSOR_SPEC) input_tensors=FP8_PER_TENSOR_SPEC,
weight=FP8_PER_TENSOR_SPEC,
)
# Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC. # Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"] kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
kv_cache_quant_config = {name : kv_cache_quant_config = {
QuantizationConfig(input_tensors=global_quant_config.input_tensors, name: QuantizationConfig(
weight=global_quant_config.weight, input_tensors=global_quant_config.input_tensors,
output_tensors=KV_CACHE_SPEC) weight=global_quant_config.weight,
for name in kv_cache_layer_names_for_llama} output_tensors=KV_CACHE_SPEC,
)
for name in kv_cache_layer_names_for_llama
}
layer_quant_config = kv_cache_quant_config.copy() layer_quant_config = kv_cache_quant_config.copy()
# Define algorithm config by config file. # Define algorithm config by config file.
LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = "examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json"
'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json'
algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE) algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)
EXCLUDE_LAYERS = ["lm_head"] EXCLUDE_LAYERS = ["lm_head"]
...@@ -131,7 +147,8 @@ kv-cache and the quantization algorithm is AutoSmoothQuant. ...@@ -131,7 +147,8 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
layer_quant_config=layer_quant_config, layer_quant_config=layer_quant_config,
kv_cache_quant_config=kv_cache_quant_config, kv_cache_quant_config=kv_cache_quant_config,
exclude=EXCLUDE_LAYERS, exclude=EXCLUDE_LAYERS,
algo_config=algo_config) algo_config=algo_config,
)
``` ```
### 4. Quantize the Model and Export ### 4. Quantize the Model and Export
...@@ -165,8 +182,11 @@ for more exporting format details. ...@@ -165,8 +182,11 @@ for more exporting format details.
EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant" EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR) exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
with torch.no_grad(): with torch.no_grad():
exporter.export_safetensors_model(freezed_model, exporter.export_safetensors_model(
quant_config=quant_config, tokenizer=tokenizer) freezed_model,
quant_config=quant_config,
tokenizer=tokenizer,
)
``` ```
### 5. Evaluation in vLLM ### 5. Evaluation in vLLM
...@@ -189,8 +209,11 @@ Now, you can load and run the Quark quantized model directly through the LLM ent ...@@ -189,8 +209,11 @@ Now, you can load and run the Quark quantized model directly through the LLM ent
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM. # Create an LLM.
llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant", llm = LLM(
kv_cache_dtype='fp8',quantization='quark') model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
kv_cache_dtype="fp8",
quantization="quark",
)
# Generate texts from the prompts. The output is a list of RequestOutput objects # Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information. # that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
......
...@@ -194,8 +194,10 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep ...@@ -194,8 +194,10 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep
api_key=openai_api_key, api_key=openai_api_key,
base_url=openai_api_base, base_url=openai_api_base,
) )
completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct", completion = client.completions.create(
prompt="San Francisco is a") model="Qwen/Qwen2.5-1.5B-Instruct",
prompt="San Francisco is a",
)
print("Completion result:", completion) print("Completion result:", completion)
``` ```
...@@ -239,7 +241,7 @@ Alternatively, you can use the `openai` Python package: ...@@ -239,7 +241,7 @@ Alternatively, you can use the `openai` Python package:
messages=[ messages=[
{"role": "system", "content": "You are a helpful assistant."}, {"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Tell me a joke."}, {"role": "user", "content": "Tell me a joke."},
] ],
) )
print("Chat response:", chat_response) print("Chat response:", chat_response)
``` ```
......
...@@ -60,7 +60,7 @@ from vllm import LLM ...@@ -60,7 +60,7 @@ from vllm import LLM
llm = LLM( llm = LLM(
"s3://my-bucket/vllm/facebook/opt-125m/v1", "s3://my-bucket/vllm/facebook/opt-125m/v1",
load_format="tensorizer", load_format="tensorizer",
enable_lora=True enable_lora=True,
) )
``` ```
...@@ -97,6 +97,6 @@ llm = LLM( ...@@ -97,6 +97,6 @@ llm = LLM(
"s3://my-bucket/vllm/facebook/opt-125m/v1", "s3://my-bucket/vllm/facebook/opt-125m/v1",
load_format="tensorizer", load_format="tensorizer",
enable_lora=True, enable_lora=True,
model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}} model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}},
) )
``` ```
...@@ -98,15 +98,15 @@ and automatically applies the model's [chat template](https://huggingface.co/doc ...@@ -98,15 +98,15 @@ and automatically applies the model's [chat template](https://huggingface.co/doc
conversation = [ conversation = [
{ {
"role": "system", "role": "system",
"content": "You are a helpful assistant" "content": "You are a helpful assistant",
}, },
{ {
"role": "user", "role": "user",
"content": "Hello" "content": "Hello",
}, },
{ {
"role": "assistant", "role": "assistant",
"content": "Hello! How can I assist you today?" "content": "Hello! How can I assist you today?",
}, },
{ {
"role": "user", "role": "user",
......
...@@ -130,8 +130,10 @@ It is designed for embedding models and cross-encoder models. Embedding models u ...@@ -130,8 +130,10 @@ It is designed for embedding models and cross-encoder models. Embedding models u
from vllm import LLM from vllm import LLM
llm = LLM(model="BAAI/bge-reranker-v2-m3", runner="pooling") llm = LLM(model="BAAI/bge-reranker-v2-m3", runner="pooling")
(output,) = llm.score("What is the capital of France?", (output,) = llm.score(
"The capital of Brazil is Brasilia.") "What is the capital of France?",
"The capital of Brazil is Brasilia.",
)
score = output.outputs.score score = output.outputs.score
print(f"Score: {score}") print(f"Score: {score}")
...@@ -209,7 +211,7 @@ For models that support Matryoshka Embeddings but not recognized by vLLM, please ...@@ -209,7 +211,7 @@ For models that support Matryoshka Embeddings but not recognized by vLLM, please
Here is an example to serve a model with Matryoshka Embeddings enabled. Here is an example to serve a model with Matryoshka Embeddings enabled.
```text ```bash
vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf-overrides '{"matryoshka_dimensions":[256]}' vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf-overrides '{"matryoshka_dimensions":[256]}'
``` ```
...@@ -220,11 +222,15 @@ You can change the output dimensions of embedding models that support Matryoshka ...@@ -220,11 +222,15 @@ You can change the output dimensions of embedding models that support Matryoshka
```python ```python
from vllm import LLM, PoolingParams from vllm import LLM, PoolingParams
llm = LLM(model="jinaai/jina-embeddings-v3", llm = LLM(
runner="pooling", model="jinaai/jina-embeddings-v3",
trust_remote_code=True) runner="pooling",
outputs = llm.embed(["Follow the white rabbit."], trust_remote_code=True,
pooling_params=PoolingParams(dimensions=32)) )
outputs = llm.embed(
["Follow the white rabbit."],
pooling_params=PoolingParams(dimensions=32),
)
print(outputs[0].outputs) print(outputs[0].outputs)
``` ```
...@@ -234,13 +240,13 @@ A code example can be found here: <gh-file:examples/offline_inference/pooling/em ...@@ -234,13 +240,13 @@ A code example can be found here: <gh-file:examples/offline_inference/pooling/em
Use the following command to start vllm server. Use the following command to start vllm server.
```text ```bash
vllm serve jinaai/jina-embeddings-v3 --trust-remote-code vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
``` ```
You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter. You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter.
```text ```bash
curl http://127.0.0.1:8000/v1/embeddings \ curl http://127.0.0.1:8000/v1/embeddings \
-H 'accept: application/json' \ -H 'accept: application/json' \
-H 'Content-Type: application/json' \ -H 'Content-Type: application/json' \
......
...@@ -278,8 +278,8 @@ https_proxy=http://your.proxy.server:port vllm serve <model_name> ...@@ -278,8 +278,8 @@ https_proxy=http://your.proxy.server:port vllm serve <model_name>
```python ```python
import os import os
os.environ['http_proxy'] = 'http://your.proxy.server:port' os.environ["http_proxy"] = "http://your.proxy.server:port"
os.environ['https_proxy'] = 'http://your.proxy.server:port' os.environ["https_proxy"] = "http://your.proxy.server:port"
``` ```
### ModelScope ### ModelScope
......
...@@ -243,10 +243,10 @@ try: ...@@ -243,10 +243,10 @@ try:
"remote_engine_id": None, # Will be populated by vLLM "remote_engine_id": None, # Will be populated by vLLM
"remote_block_ids": None, # Will be populated by vLLM "remote_block_ids": None, # Will be populated by vLLM
"remote_host": None, # Will be populated by vLLM "remote_host": None, # Will be populated by vLLM
"remote_port": None # Will be populated by vLLM "remote_port": None, # Will be populated by vLLM
} }
}, },
extra_headers={"X-Request-Id": request_id} extra_headers={"X-Request-Id": request_id},
) )
print("-" * 50) print("-" * 50)
...@@ -262,7 +262,7 @@ try: ...@@ -262,7 +262,7 @@ try:
extra_body={ extra_body={
"kv_transfer_params": prefill_response.kv_transfer_params # Pass KV cache info "kv_transfer_params": prefill_response.kv_transfer_params # Pass KV cache info
}, },
extra_headers={"X-Request-Id": request_id} # Same request ID extra_headers={"X-Request-Id": request_id}, # Same request ID
) )
print("-" * 50) print("-" * 50)
......
...@@ -15,13 +15,15 @@ To run inference on a single or multiple GPUs, use `VLLM` class from `langchain` ...@@ -15,13 +15,15 @@ To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`
```python ```python
from langchain_community.llms import VLLM from langchain_community.llms import VLLM
llm = VLLM(model="mosaicml/mpt-7b", llm = VLLM(
trust_remote_code=True, # mandatory for hf models model="mosaicml/mpt-7b",
max_new_tokens=128, trust_remote_code=True, # mandatory for hf models
top_k=10, max_new_tokens=128,
top_p=0.95, top_k=10,
temperature=0.8, top_p=0.95,
# tensor_parallel_size=... # for distributed inference temperature=0.8,
# for distributed inference
# tensor_parallel_size=...,
) )
print(llm("What is the capital of France ?")) print(llm("What is the capital of France ?"))
......
...@@ -24,8 +24,8 @@ To call the server, in your preferred text editor, create a script that uses an ...@@ -24,8 +24,8 @@ To call the server, in your preferred text editor, create a script that uses an
completion = client.chat.completions.create( completion = client.chat.completions.create(
model="NousResearch/Meta-Llama-3-8B-Instruct", model="NousResearch/Meta-Llama-3-8B-Instruct",
messages=[ messages=[
{"role": "user", "content": "Hello!"} {"role": "user", "content": "Hello!"},
] ],
) )
print(completion.choices[0].message) print(completion.choices[0].message)
...@@ -101,8 +101,13 @@ both a `type` and a `text` field. An example is provided below: ...@@ -101,8 +101,13 @@ both a `type` and a `text` field. An example is provided below:
completion = client.chat.completions.create( completion = client.chat.completions.create(
model="NousResearch/Meta-Llama-3-8B-Instruct", model="NousResearch/Meta-Llama-3-8B-Instruct",
messages=[ messages=[
{"role": "user", "content": [{"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"}]} {
] "role": "user",
"content": [
{"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"},
],
},
],
) )
``` ```
...@@ -130,11 +135,11 @@ Or directly merge them into the JSON payload if you are using HTTP call directly ...@@ -130,11 +135,11 @@ Or directly merge them into the JSON payload if you are using HTTP call directly
completion = client.chat.completions.create( completion = client.chat.completions.create(
model="NousResearch/Meta-Llama-3-8B-Instruct", model="NousResearch/Meta-Llama-3-8B-Instruct",
messages=[ messages=[
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"},
], ],
extra_body={ extra_body={
"structured_outputs": {"choice": ["positive", "negative"]} "structured_outputs": {"choice": ["positive", "negative"]},
} },
) )
``` ```
...@@ -149,11 +154,11 @@ with `--enable-request-id-headers`. ...@@ -149,11 +154,11 @@ with `--enable-request-id-headers`.
completion = client.chat.completions.create( completion = client.chat.completions.create(
model="NousResearch/Meta-Llama-3-8B-Instruct", model="NousResearch/Meta-Llama-3-8B-Instruct",
messages=[ messages=[
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"},
], ],
extra_headers={ extra_headers={
"x-request-id": "sentiment-classification-00001", "x-request-id": "sentiment-classification-00001",
} },
) )
print(completion._request_id) print(completion._request_id)
...@@ -162,7 +167,7 @@ with `--enable-request-id-headers`. ...@@ -162,7 +167,7 @@ with `--enable-request-id-headers`.
prompt="A robot may not injure a human being", prompt="A robot may not injure a human being",
extra_headers={ extra_headers={
"x-request-id": "completion-test", "x-request-id": "completion-test",
} },
) )
print(completion._request_id) print(completion._request_id)
``` ```
...@@ -403,7 +408,7 @@ The Transcriptions API supports uploading audio files in various formats includi ...@@ -403,7 +408,7 @@ The Transcriptions API supports uploading audio files in various formats includi
model="openai/whisper-large-v3-turbo", model="openai/whisper-large-v3-turbo",
file=audio_file, file=audio_file,
language="en", language="en",
response_format="verbose_json" response_format="verbose_json",
) )
print(transcription.text) print(transcription.text)
...@@ -812,22 +817,22 @@ You can pass multi-modal inputs to scoring models by passing `content` including ...@@ -812,22 +817,22 @@ You can pass multi-modal inputs to scoring models by passing `content` including
"model": "jinaai/jina-reranker-m0", "model": "jinaai/jina-reranker-m0",
"text_1": "slm markdown", "text_1": "slm markdown",
"text_2": { "text_2": {
"content": [ "content": [
{ {
"type": "image_url", "type": "image_url",
"image_url": { "image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png" "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
}, },
}, },
{ {
"type": "image_url", "type": "image_url",
"image_url": { "image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png" "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
}, },
}, },
] ],
}
}, },
},
) )
response.raise_for_status() response.raise_for_status()
response_json = response.json() response_json = response.json()
......
...@@ -152,7 +152,9 @@ def generate_presigned_url(s3_client, client_method, method_parameters, expires_ ...@@ -152,7 +152,9 @@ def generate_presigned_url(s3_client, client_method, method_parameters, expires_
""" """
try: try:
url = s3_client.generate_presigned_url( url = s3_client.generate_presigned_url(
ClientMethod=client_method, Params=method_parameters, ExpiresIn=expires_in ClientMethod=client_method,
Params=method_parameters,
ExpiresIn=expires_in,
) )
except ClientError: except ClientError:
raise raise
...@@ -161,10 +163,16 @@ def generate_presigned_url(s3_client, client_method, method_parameters, expires_ ...@@ -161,10 +163,16 @@ def generate_presigned_url(s3_client, client_method, method_parameters, expires_
s3_client = boto3.client("s3") s3_client = boto3.client("s3")
input_url = generate_presigned_url( input_url = generate_presigned_url(
s3_client, "get_object", {"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"}, 3600 s3_client,
"get_object",
{"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"},
expires_in=3600,
) )
output_url = generate_presigned_url( output_url = generate_presigned_url(
s3_client, "put_object", {"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"}, 3600 s3_client,
"put_object",
{"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"},
expires_in=3600,
) )
print(f"{input_url=}") print(f"{input_url=}")
print(f"{output_url=}") print(f"{output_url=}")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment