# enforce_eager=True is set here to keep VRAM usage down in the lora-test CI;
# without it, the lora-test fails with a CUDA OOM.
llm = vllm.LLM(
    MODEL_PATH,
    max_model_len=1024,
    enable_lora=True,
    max_loras=2,
    enforce_eager=True,
    enable_chunked_prefill=True,
)
expected_lora_output=[
"SELECT catalog_publisher, COUNT(*) as num_catalogs FROM catalogs GROUP BY catalog_publisher ORDER BY num_catalogs DESC LIMIT 1;",# noqa: E501
"SELECT trip.id FROM trip JOIN station ON trip.start_station_id = station.id WHERE station.dock_count = (SELECT MAX(dock_count) FROM station);",# noqa: E501
"SELECT COUNT(*) FROM marine_species WHERE location = 'Southern Ocean';",# noqa: E501
# Instruction text asking for a 'Yes'/'No' prediction on whether passage B
# contains an answer to query A.
PROMPT = (
    "Given a query A and a passage B, determine whether the passage "
    "contains an answer to the query by providing a prediction of "
    "either 'Yes' or 'No'."
)
...
...
@@ -119,22 +126,9 @@ class GemmaMtebEncoder(VllmMtebEncoder):
# Wrap the image prompt in <|im_start|>/<|im_end|> system and user turns,
# then open the assistant turn (the result ends right after "assistant\n").
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501