Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
62566979
Unverified
Commit
62566979
authored
Oct 15, 2025
by
Cyrus Leung
Committed by
GitHub
Oct 15, 2025
Browse files
[Doc] ruff format remaining Python examples (#26795)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
71557a5f
Changes
21
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
165 additions
and
104 deletions
+165
-104
docs/features/quantization/auto_awq.md
docs/features/quantization/auto_awq.md
+6
-4
docs/features/quantization/bitblas.md
docs/features/quantization/bitblas.md
+2
-2
docs/features/quantization/bnb.md
docs/features/quantization/bnb.md
+2
-2
docs/features/quantization/fp8.md
docs/features/quantization/fp8.md
+7
-2
docs/features/quantization/gguf.md
docs/features/quantization/gguf.md
+7
-5
docs/features/quantization/gptqmodel.md
docs/features/quantization/gptqmodel.md
+1
-1
docs/features/quantization/int4.md
docs/features/quantization/int4.md
+4
-2
docs/features/quantization/int8.md
docs/features/quantization/int8.md
+3
-1
docs/features/quantization/modelopt.md
docs/features/quantization/modelopt.md
+2
-2
docs/features/quantization/quantized_kvcache.md
docs/features/quantization/quantized_kvcache.md
+5
-3
docs/features/quantization/quark.md
docs/features/quantization/quark.md
+44
-21
docs/getting_started/quickstart.md
docs/getting_started/quickstart.md
+5
-3
docs/models/extensions/tensorizer.md
docs/models/extensions/tensorizer.md
+2
-2
docs/models/generative_models.md
docs/models/generative_models.md
+3
-3
docs/models/pooling_models.md
docs/models/pooling_models.md
+16
-10
docs/models/supported_models.md
docs/models/supported_models.md
+2
-2
docs/serving/expert_parallel_deployment.md
docs/serving/expert_parallel_deployment.md
+3
-3
docs/serving/integrations/langchain.md
docs/serving/integrations/langchain.md
+9
-7
docs/serving/openai_compatible_server.md
docs/serving/openai_compatible_server.md
+31
-26
examples/offline_inference/openai_batch/README.md
examples/offline_inference/openai_batch/README.md
+11
-3
No files found.
docs/features/quantization/auto_awq.md
View file @
62566979
...
...
@@ -22,13 +22,15 @@ After installing AutoAWQ, you are ready to quantize a model. Please refer to the
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
model_path =
'
mistralai/Mistral-7B-Instruct-v0.2
'
quant_path =
'
mistral-instruct-v0.2-awq
'
quant_config = {
"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"
}
model_path =
"
mistralai/Mistral-7B-Instruct-v0.2
"
quant_path =
"
mistral-instruct-v0.2-awq
"
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}
# Load model
model = AutoAWQForCausalLM.from_pretrained(
model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
model_path,
low_cpu_mem_usage=True,
use_cache=False,
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
...
...
docs/features/quantization/bitblas.md
View file @
62566979
...
...
@@ -34,7 +34,7 @@ llm = LLM(
model
=
model_id
,
dtype
=
torch
.
bfloat16
,
trust_remote_code
=
True
,
quantization
=
"bitblas"
quantization
=
"bitblas"
,
)
```
...
...
@@ -53,6 +53,6 @@ llm = LLM(
dtype=torch.float16,
trust_remote_code=True,
quantization="bitblas",
max_model_len=1024
max_model_len=1024
,
)
```
docs/features/quantization/bnb.md
View file @
62566979
...
...
@@ -27,7 +27,7 @@ model_id = "unsloth/tinyllama-bnb-4bit"
llm
=
LLM
(
model
=
model_id
,
dtype
=
torch
.
bfloat16
,
trust_remote_code
=
True
trust_remote_code
=
True
,
)
```
...
...
@@ -43,7 +43,7 @@ llm = LLM(
model
=
model_id
,
dtype
=
torch
.
bfloat16
,
trust_remote_code
=
True
,
quantization
=
"bitsandbytes"
quantization
=
"bitsandbytes"
,
)
```
...
...
docs/features/quantization/fp8.md
View file @
62566979
...
...
@@ -41,7 +41,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_ID
=
"meta-llama/Meta-Llama-3-8B-Instruct"
model
=
AutoModelForCausalLM
.
from_pretrained
(
MODEL_ID
,
device_map
=
"auto"
,
torch_dtype
=
"auto"
,
MODEL_ID
,
device_map
=
"auto"
,
torch_dtype
=
"auto"
,
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
MODEL_ID
)
```
...
...
@@ -63,7 +65,10 @@ Since simple RTN does not require data for weight quantization and the activatio
# Configure the simple PTQ quantization
recipe = QuantizationModifier(
targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
targets="Linear",
scheme="FP8_DYNAMIC",
ignore=["lm_head"],
)
# Apply the quantization algorithm.
oneshot(model=model, recipe=recipe)
...
...
docs/features/quantization/gguf.md
View file @
62566979
...
...
@@ -47,15 +47,15 @@ You can also use the GGUF model directly through the LLM entrypoint:
conversation = [
{
"role": "system",
"content": "You are a helpful assistant"
"content": "You are a helpful assistant"
,
},
{
"role": "user",
"content": "Hello"
"content": "Hello"
,
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
"content": "Hello! How can I assist you today?"
,
},
{
"role": "user",
...
...
@@ -67,8 +67,10 @@ You can also use the GGUF model directly through the LLM entrypoint:
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM.
llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
llm = LLM(
model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.chat(conversation, sampling_params)
...
...
docs/features/quantization/gptqmodel.md
View file @
62566979
...
...
@@ -40,7 +40,7 @@ Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
calibration_dataset = load_dataset(
"allenai/c4",
data_files="en/c4-train.00001-of-01024.json.gz",
split="train"
split="train"
,
).select(range(1024))["text"]
quant_config = QuantizeConfig(bits=4, group_size=128)
...
...
docs/features/quantization/int4.md
View file @
62566979
...
...
@@ -39,7 +39,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_ID
=
"meta-llama/Meta-Llama-3-8B-Instruct"
model
=
AutoModelForCausalLM
.
from_pretrained
(
MODEL_ID
,
device_map
=
"auto"
,
torch_dtype
=
"auto"
,
MODEL_ID
,
device_map
=
"auto"
,
torch_dtype
=
"auto"
,
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
MODEL_ID
)
```
...
...
@@ -166,7 +168,7 @@ The following is an example of an expanded quantization recipe you can tune to y
},
ignore=["lm_head"],
update_size=NUM_CALIBRATION_SAMPLES,
dampening_frac=0.01
dampening_frac=0.01
,
)
```
...
...
docs/features/quantization/int8.md
View file @
62566979
...
...
@@ -44,7 +44,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_ID
=
"meta-llama/Meta-Llama-3-8B-Instruct"
model
=
AutoModelForCausalLM
.
from_pretrained
(
MODEL_ID
,
device_map
=
"auto"
,
torch_dtype
=
"auto"
,
MODEL_ID
,
device_map
=
"auto"
,
torch_dtype
=
"auto"
,
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
MODEL_ID
)
```
...
...
docs/features/quantization/modelopt.md
View file @
62566979
...
...
@@ -56,9 +56,9 @@ The quantized checkpoint can then be deployed with vLLM. As an example, the foll
from vllm import LLM, SamplingParams
def main():
model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
# Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
# Ensure you specify quantization="modelopt" when loading the modelopt checkpoint
llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)
sampling_params = SamplingParams(temperature=0.8, top_p=0.9)
...
...
docs/features/quantization/quantized_kvcache.md
View file @
62566979
...
...
@@ -41,9 +41,11 @@ Here is an example of how to enable FP8 quantization:
from vllm import LLM, SamplingParams
sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
kv_cache_dtype="fp8",
calculate_kv_scales=True)
llm = LLM(
model="meta-llama/Llama-2-7b-chat-hf",
kv_cache_dtype="fp8",
calculate_kv_scales=True,
)
prompt = "London is the capital of"
out = llm.generate(prompt, sampling_params)[0].outputs[0].text
print(out)
...
...
docs/features/quantization/quark.md
View file @
62566979
...
...
@@ -48,7 +48,9 @@ to fetch model and tokenizer.
MAX_SEQ_LEN = 512
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto",
MODEL_ID,
device_map="auto",
torch_dtype="auto",
)
model.eval()
...
...
@@ -75,10 +77,18 @@ to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calib
dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
text_data = dataset["text"][:NUM_CALIBRATION_DATA]
tokenized_outputs = tokenizer(text_data, return_tensors="pt",
padding=True, truncation=True, max_length=MAX_SEQ_LEN)
calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
batch_size=BATCH_SIZE, drop_last=True)
tokenized_outputs = tokenizer(
text_data,
return_tensors="pt",
padding=True,
truncation=True,
max_length=MAX_SEQ_LEN,
)
calib_dataloader = DataLoader(
tokenized_outputs['input_ids'],
batch_size=BATCH_SIZE,
drop_last=True,
)
```
### 3. Set the Quantization Configuration
...
...
@@ -103,26 +113,32 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
load_quant_algo_config_from_file)
# Define fp8/per-tensor/static spec.
FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
is_dynamic=False).to_quantization_spec()
FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(
observer_method="min_max",
is_dynamic=False,
).to_quantization_spec()
# Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
weight=FP8_PER_TENSOR_SPEC)
global_quant_config = QuantizationConfig(
input_tensors=FP8_PER_TENSOR_SPEC,
weight=FP8_PER_TENSOR_SPEC,
)
# Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
kv_cache_quant_config = {name :
QuantizationConfig(input_tensors=global_quant_config.input_tensors,
weight=global_quant_config.weight,
output_tensors=KV_CACHE_SPEC)
for name in kv_cache_layer_names_for_llama}
kv_cache_quant_config = {
name: QuantizationConfig(
input_tensors=global_quant_config.input_tensors,
weight=global_quant_config.weight,
output_tensors=KV_CACHE_SPEC,
)
for name in kv_cache_layer_names_for_llama
}
layer_quant_config = kv_cache_quant_config.copy()
# Define algorithm config by config file.
LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE =
'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json'
LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = "examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json"
algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)
EXCLUDE_LAYERS = ["lm_head"]
...
...
@@ -131,7 +147,8 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
layer_quant_config=layer_quant_config,
kv_cache_quant_config=kv_cache_quant_config,
exclude=EXCLUDE_LAYERS,
algo_config=algo_config)
algo_config=algo_config,
)
```
### 4. Quantize the Model and Export
...
...
@@ -165,8 +182,11 @@ for more exporting format details.
EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
with torch.no_grad():
exporter.export_safetensors_model(freezed_model,
quant_config=quant_config, tokenizer=tokenizer)
exporter.export_safetensors_model(
freezed_model,
quant_config=quant_config,
tokenizer=tokenizer,
)
```
### 5. Evaluation in vLLM
...
...
@@ -189,8 +209,11 @@ Now, you can load and run the Quark quantized model directly through the LLM ent
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM.
llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
kv_cache_dtype='fp8',quantization='quark')
llm = LLM(
model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
kv_cache_dtype="fp8",
quantization="quark",
)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
...
...
docs/getting_started/quickstart.md
View file @
62566979
...
...
@@ -194,8 +194,10 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep
api_key=openai_api_key,
base_url=openai_api_base,
)
completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct",
prompt="San Francisco is a")
completion = client.completions.create(
model="Qwen/Qwen2.5-1.5B-Instruct",
prompt="San Francisco is a",
)
print("Completion result:", completion)
```
...
...
@@ -239,7 +241,7 @@ Alternatively, you can use the `openai` Python package:
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Tell me a joke."},
]
]
,
)
print("Chat response:", chat_response)
```
...
...
docs/models/extensions/tensorizer.md
View file @
62566979
...
...
@@ -60,7 +60,7 @@ from vllm import LLM
llm
=
LLM
(
"s3://my-bucket/vllm/facebook/opt-125m/v1"
,
load_format
=
"tensorizer"
,
enable_lora
=
True
enable_lora
=
True
,
)
```
...
...
@@ -97,6 +97,6 @@ llm = LLM(
"s3://my-bucket/vllm/facebook/opt-125m/v1"
,
load_format
=
"tensorizer"
,
enable_lora
=
True
,
model_loader_extra_config
=
{
"deserialization_kwargs"
:
{
"num_readers"
:
2
}}
model_loader_extra_config
=
{
"deserialization_kwargs"
:
{
"num_readers"
:
2
}}
,
)
```
docs/models/generative_models.md
View file @
62566979
...
...
@@ -98,15 +98,15 @@ and automatically applies the model's [chat template](https://huggingface.co/doc
conversation = [
{
"role": "system",
"content": "You are a helpful assistant"
"content": "You are a helpful assistant"
,
},
{
"role": "user",
"content": "Hello"
"content": "Hello"
,
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
"content": "Hello! How can I assist you today?"
,
},
{
"role": "user",
...
...
docs/models/pooling_models.md
View file @
62566979
...
...
@@ -130,8 +130,10 @@ It is designed for embedding models and cross-encoder models. Embedding models u
from
vllm
import
LLM
llm
=
LLM
(
model
=
"BAAI/bge-reranker-v2-m3"
,
runner
=
"pooling"
)
(
output
,)
=
llm
.
score
(
"What is the capital of France?"
,
"The capital of Brazil is Brasilia."
)
(
output
,)
=
llm
.
score
(
"What is the capital of France?"
,
"The capital of Brazil is Brasilia."
,
)
score
=
output
.
outputs
.
score
print
(
f
"Score:
{
score
}
"
)
...
...
@@ -209,7 +211,7 @@ For models that support Matryoshka Embeddings but not recognized by vLLM, please
Here is an example to serve a model with Matryoshka Embeddings enabled.
```
text
```
bash
vllm serve Snowflake/snowflake-arctic-embed-m-v1.5
--hf-overrides
'{"matryoshka_dimensions":[256]}'
```
...
...
@@ -220,11 +222,15 @@ You can change the output dimensions of embedding models that support Matryoshka
```
python
from
vllm
import
LLM
,
PoolingParams
llm
=
LLM
(
model
=
"jinaai/jina-embeddings-v3"
,
runner
=
"pooling"
,
trust_remote_code
=
True
)
outputs
=
llm
.
embed
([
"Follow the white rabbit."
],
pooling_params
=
PoolingParams
(
dimensions
=
32
))
llm
=
LLM
(
model
=
"jinaai/jina-embeddings-v3"
,
runner
=
"pooling"
,
trust_remote_code
=
True
,
)
outputs
=
llm
.
embed
(
[
"Follow the white rabbit."
],
pooling_params
=
PoolingParams
(
dimensions
=
32
),
)
print
(
outputs
[
0
].
outputs
)
```
...
...
@@ -234,13 +240,13 @@ A code example can be found here: <gh-file:examples/offline_inference/pooling/em
Use the following command to start vllm server.
```
text
```
bash
vllm serve jinaai/jina-embeddings-v3
--trust-remote-code
```
You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter.
```
text
```
bash
curl http://127.0.0.1:8000/v1/embeddings
\
-H
'accept: application/json'
\
-H
'Content-Type: application/json'
\
...
...
docs/models/supported_models.md
View file @
62566979
...
...
@@ -278,8 +278,8 @@ https_proxy=http://your.proxy.server:port vllm serve <model_name>
```
python
import
os
os
.
environ
[
'
http_proxy
'
]
=
'
http://your.proxy.server:port
'
os
.
environ
[
'
https_proxy
'
]
=
'
http://your.proxy.server:port
'
os
.
environ
[
"
http_proxy
"
]
=
"
http://your.proxy.server:port
"
os
.
environ
[
"
https_proxy
"
]
=
"
http://your.proxy.server:port
"
```
### ModelScope
...
...
docs/serving/expert_parallel_deployment.md
View file @
62566979
...
...
@@ -243,10 +243,10 @@ try:
"remote_engine_id"
:
None
,
# Will be populated by vLLM
"remote_block_ids"
:
None
,
# Will be populated by vLLM
"remote_host"
:
None
,
# Will be populated by vLLM
"remote_port"
:
None
# Will be populated by vLLM
"remote_port"
:
None
,
# Will be populated by vLLM
}
},
extra_headers
=
{
"X-Request-Id"
:
request_id
}
extra_headers
=
{
"X-Request-Id"
:
request_id
}
,
)
print
(
"-"
*
50
)
...
...
@@ -262,7 +262,7 @@ try:
extra_body
=
{
"kv_transfer_params"
:
prefill_response
.
kv_transfer_params
# Pass KV cache info
},
extra_headers
=
{
"X-Request-Id"
:
request_id
}
# Same request ID
extra_headers
=
{
"X-Request-Id"
:
request_id
}
,
# Same request ID
)
print
(
"-"
*
50
)
...
...
docs/serving/integrations/langchain.md
View file @
62566979
...
...
@@ -15,13 +15,15 @@ To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`
```python
from langchain_community.llms import VLLM
llm = VLLM(model="mosaicml/mpt-7b",
trust_remote_code=True, # mandatory for hf models
max_new_tokens=128,
top_k=10,
top_p=0.95,
temperature=0.8,
# tensor_parallel_size=... # for distributed inference
llm = VLLM(
model="mosaicml/mpt-7b",
trust_remote_code=True, # mandatory for hf models
max_new_tokens=128,
top_k=10,
top_p=0.95,
temperature=0.8,
# for distributed inference
# tensor_parallel_size=...,
)
print(llm("What is the capital of France ?"))
...
...
docs/serving/openai_compatible_server.md
View file @
62566979
...
...
@@ -24,8 +24,8 @@ To call the server, in your preferred text editor, create a script that uses an
completion = client.chat.completions.create(
model="NousResearch/Meta-Llama-3-8B-Instruct",
messages=[
{"role": "user", "content": "Hello!"}
]
{"role": "user", "content": "Hello!"}
,
]
,
)
print(completion.choices[0].message)
...
...
@@ -101,8 +101,13 @@ both a `type` and a `text` field. An example is provided below:
completion
=
client
.
chat
.
completions
.
create
(
model
=
"NousResearch/Meta-Llama-3-8B-Instruct"
,
messages
=
[
{
"role"
:
"user"
,
"content"
:
[{
"type"
:
"text"
,
"text"
:
"Classify this sentiment: vLLM is wonderful!"
}]}
]
{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"text"
,
"text"
:
"Classify this sentiment: vLLM is wonderful!"
},
],
},
],
)
```
...
...
@@ -130,11 +135,11 @@ Or directly merge them into the JSON payload if you are using HTTP call directly
completion
=
client
.
chat
.
completions
.
create
(
model
=
"NousResearch/Meta-Llama-3-8B-Instruct"
,
messages
=
[
{
"role"
:
"user"
,
"content"
:
"Classify this sentiment: vLLM is wonderful!"
}
{
"role"
:
"user"
,
"content"
:
"Classify this sentiment: vLLM is wonderful!"
}
,
],
extra_body
=
{
"structured_outputs"
:
{
"choice"
:
[
"positive"
,
"negative"
]}
}
"structured_outputs"
:
{
"choice"
:
[
"positive"
,
"negative"
]}
,
}
,
)
```
...
...
@@ -149,11 +154,11 @@ with `--enable-request-id-headers`.
completion = client.chat.completions.create(
model="NousResearch/Meta-Llama-3-8B-Instruct",
messages=[
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
,
],
extra_headers={
"x-request-id": "sentiment-classification-00001",
}
}
,
)
print(completion._request_id)
...
...
@@ -162,7 +167,7 @@ with `--enable-request-id-headers`.
prompt="A robot may not injure a human being",
extra_headers={
"x-request-id": "completion-test",
}
}
,
)
print(completion._request_id)
```
...
...
@@ -403,7 +408,7 @@ The Transcriptions API supports uploading audio files in various formats includi
model="openai/whisper-large-v3-turbo",
file=audio_file,
language="en",
response_format="verbose_json"
response_format="verbose_json"
,
)
print(transcription.text)
...
...
@@ -812,22 +817,22 @@ You can pass multi-modal inputs to scoring models by passing `content` including
"model": "jinaai/jina-reranker-m0",
"text_1": "slm markdown",
"text_2": {
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
},
},
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
},
},
]
}
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
},
},
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
},
},
],
},
},
)
response.raise_for_status()
response_json = response.json()
...
...
examples/offline_inference/openai_batch/README.md
View file @
62566979
...
...
@@ -152,7 +152,9 @@ def generate_presigned_url(s3_client, client_method, method_parameters, expires_
"""
try
:
url
=
s3_client
.
generate_presigned_url
(
ClientMethod
=
client_method
,
Params
=
method_parameters
,
ExpiresIn
=
expires_in
ClientMethod
=
client_method
,
Params
=
method_parameters
,
ExpiresIn
=
expires_in
,
)
except
ClientError
:
raise
...
...
@@ -161,10 +163,16 @@ def generate_presigned_url(s3_client, client_method, method_parameters, expires_
s3_client
=
boto3
.
client
(
"s3"
)
input_url
=
generate_presigned_url
(
s3_client
,
"get_object"
,
{
"Bucket"
:
"MY_BUCKET"
,
"Key"
:
"MY_INPUT_FILE.jsonl"
},
3600
s3_client
,
"get_object"
,
{
"Bucket"
:
"MY_BUCKET"
,
"Key"
:
"MY_INPUT_FILE.jsonl"
},
expires_in
=
3600
,
)
output_url
=
generate_presigned_url
(
s3_client
,
"put_object"
,
{
"Bucket"
:
"MY_BUCKET"
,
"Key"
:
"MY_OUTPUT_FILE.jsonl"
},
3600
s3_client
,
"put_object"
,
{
"Bucket"
:
"MY_BUCKET"
,
"Key"
:
"MY_OUTPUT_FILE.jsonl"
},
expires_in
=
3600
,
)
print
(
f
"
{
input_url
=
}
"
)
print
(
f
"
{
output_url
=
}
"
)
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment