# infer_vllm.py
import argparse

import torch
from vllm import LLM


parser = argparse.ArgumentParser(description="Embed queries and documents with vLLM.")
parser.add_argument("--model_name_or_path", type=str, default="Qwen/Qwen3-Embedding-0.6B")
args = parser.parse_args()
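# Example invocation (the model id below is the script's default; a local
# checkpoint directory passed to --model_name_or_path works as well):
#   python infer_vllm.py --model_name_or_path Qwen/Qwen3-Embedding-0.6B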

def get_detailed_instruct(task_description: str, query: str) -> str:
    # Qwen3-Embedding expects instructed queries in exactly this format.
    return f'Instruct: {task_description}\nQuery:{query}'
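# e.g. get_detailed_instruct(task, 'Explain gravity') produces:
#   "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery:Explain gravity"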

# Each query must come with a one-sentence instruction that describes the task
task = 'Given a web search query, retrieve relevant passages that answer the query'

queries = [
    get_detailed_instruct(task, 'What is the capital of China?'),
    get_detailed_instruct(task, 'Explain gravity')
]
# No need to add instruction for retrieval documents
documents = [
    "The capital of China is Beijing.",
    "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun."
]

input_texts = queries + documents
print("input_texts:", input_texts)

# Initialize the model in embedding mode; task="embed" makes vLLM run the model
# as a pooling model that returns embeddings instead of generated text.
model = LLM(model=args.model_name_or_path, task="embed")

outputs = model.embed(input_texts)
embeddings = torch.tensor([o.outputs.embedding for o in outputs])
# The first two rows are the queries, the rest are the documents, so the matmul
# yields a (num_queries, num_documents) similarity matrix.
scores = (embeddings[:2] @ embeddings[2:].T)
print("scores:", scores.tolist())
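# vLLM's "embed" task L2-normalizes the pooled embeddings by default, so the dot
# products above already behave as cosine similarities. For vectors that may
# arrive unnormalized (e.g. from another backend), one way to normalize
# explicitly is sketched below; on already-normalized vectors it is a no-op and
# reproduces the same scores.
import torch.nn.functional as F

normalized = F.normalize(embeddings, p=2, dim=-1)
cosine_scores = normalized[:2] @ normalized[2:].T
print("cosine scores:", cosine_scores.tolist())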