Commit 7a985548 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.9.0' into v0.9.0-ori

parents 45d3785c dc1440cf
...@@ -8,12 +8,12 @@ import zipfile ...@@ -8,12 +8,12 @@ import zipfile
# Note that we have 400 MiB quota, please use it wisely. # Note that we have 400 MiB quota, please use it wisely.
# See https://github.com/pypi/support/issues/3792 . # See https://github.com/pypi/support/issues/3792 .
# Please also sync the value with the one in Dockerfile. # Please also sync the value with the one in Dockerfile.
VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400)) VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
def print_top_10_largest_files(zip_file): def print_top_10_largest_files(zip_file):
"""Print the top 10 largest files in the given zip file.""" """Print the top 10 largest files in the given zip file."""
with zipfile.ZipFile(zip_file, 'r') as z: with zipfile.ZipFile(zip_file, "r") as z:
file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()] file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
file_sizes.sort(key=lambda x: x[1], reverse=True) file_sizes.sort(key=lambda x: x[1], reverse=True)
for f, size in file_sizes[:10]: for f, size in file_sizes[:10]:
...@@ -28,14 +28,18 @@ def check_wheel_size(directory): ...@@ -28,14 +28,18 @@ def check_wheel_size(directory):
wheel_path = os.path.join(root, file_name) wheel_path = os.path.join(root, file_name)
wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024) wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
if wheel_size_mb > VLLM_MAX_SIZE_MB: if wheel_size_mb > VLLM_MAX_SIZE_MB:
print(f"Not allowed: Wheel {wheel_path} is larger " print(
f"({wheel_size_mb:.2f} MB) than the limit " f"Not allowed: Wheel {wheel_path} is larger "
f"({VLLM_MAX_SIZE_MB} MB).") f"({wheel_size_mb:.2f} MB) than the limit "
f"({VLLM_MAX_SIZE_MB} MB)."
)
print_top_10_largest_files(wheel_path) print_top_10_largest_files(wheel_path)
return 1 return 1
else: else:
print(f"Wheel {wheel_path} is within the allowed size " print(
f"({wheel_size_mb:.2f} MB).") f"Wheel {wheel_path} is within the allowed size "
f"({wheel_size_mb:.2f} MB)."
)
return 0 return 0
...@@ -45,4 +49,4 @@ if __name__ == "__main__": ...@@ -45,4 +49,4 @@ if __name__ == "__main__":
sys.exit(1) sys.exit(1)
directory = sys.argv[1] directory = sys.argv[1]
sys.exit(check_wheel_size(directory)) sys.exit(check_wheel_size(directory))
\ No newline at end of file
...@@ -22,5 +22,5 @@ with open("index.html", "w") as f: ...@@ -22,5 +22,5 @@ with open("index.html", "w") as f:
print(f"Generated index.html for {args.wheel}") print(f"Generated index.html for {args.wheel}")
# cloudfront requires escaping the '+' character # cloudfront requires escaping the '+' character
f.write( f.write(
template.format(wheel=filename, template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B"))
wheel_html_escaped=filename.replace("+", "%2B"))) )
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1
model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.335
- name: "exact_match,flexible-extract"
value: 0.323
limit: 1319
num_fewshot: 5
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1
model_name: "Qwen/Qwen2.5-1.5B-Instruct"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.54
- name: "exact_match,flexible-extract"
value: 0.59
limit: 1319
num_fewshot: 5
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.47
- name: "exact_match,flexible-extract"
value: 0.64
limit: 1319
num_fewshot: 5
...@@ -3,3 +3,4 @@ Meta-Llama-3-70B-Instruct.yaml ...@@ -3,3 +3,4 @@ Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml Mixtral-8x7B-Instruct-v0.1.yaml
Qwen2-57B-A14-Instruct.yaml Qwen2-57B-A14-Instruct.yaml
DeepSeek-V2-Lite-Chat.yaml DeepSeek-V2-Lite-Chat.yaml
Meta-Llama-3-8B-QQQ.yaml
Meta-Llama-3-8B-Instruct.yaml Qwen2.5-1.5B-Instruct.yaml
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
Qwen1.5-MoE-W4A16-compressed-tensors.yaml Qwen1.5-MoE-W4A16-compressed-tensors.yaml
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
Qwen2-1.5B-Instruct-FP8W8.yaml
Meta-Llama-3-8B-QQQ.yaml
# SPDX-License-Identifier: Apache-2.0
from pathlib import Path
import pytest
def pytest_addoption(parser):
parser.addoption(
"--config-list-file",
action="store",
help="Path to the file listing model config YAMLs (one per line)",
)
parser.addoption(
"--tp-size",
action="store",
default="1",
help="Tensor parallel size to use for evaluation",
)
@pytest.fixture(scope="session")
def config_list_file(pytestconfig, config_dir):
rel_path = pytestconfig.getoption("--config-list-file")
return config_dir / rel_path
@pytest.fixture(scope="session")
def tp_size(pytestconfig):
return pytestconfig.getoption("--tp-size")
def pytest_generate_tests(metafunc):
if "config_filename" in metafunc.fixturenames:
rel_path = metafunc.config.getoption("--config-list-file")
config_list_file = Path(rel_path).resolve()
config_dir = config_list_file.parent
with open(config_list_file, encoding="utf-8") as f:
configs = [
config_dir / line.strip()
for line in f
if line.strip() and not line.startswith("#")
]
metafunc.parametrize("config_filename", configs)
#!/bin/bash
usage() {
echo``
echo "Runs lm eval harness on GSM8k using vllm and compares to "
echo "precomputed baseline (measured by HF transformers.)"
echo
echo "usage: ${0} <options>"
echo
echo " -c - path to the test data config (e.g. configs/small-models.txt)"
echo " -t - tensor parallel size"
echo
}
SUCCESS=0
while getopts "c:t:" OPT; do
case ${OPT} in
c )
CONFIG="$OPTARG"
;;
t )
TP_SIZE="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
# Parse list of configs.
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"
for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
do
LOCAL_SUCCESS=0
echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="
export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
export LM_EVAL_TP_SIZE=$TP_SIZE
pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
if [[ $LOCAL_SUCCESS == 0 ]]; then
echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
else
echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
fi
SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
done
if [ "${SUCCESS}" -eq "0" ]; then
exit 0
else
exit 1
fi
...@@ -3,67 +3,52 @@ ...@@ -3,67 +3,52 @@
LM eval harness on model to compare vs HF baseline computed offline. LM eval harness on model to compare vs HF baseline computed offline.
Configs are found in configs/$MODEL.yaml Configs are found in configs/$MODEL.yaml
* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml pytest -s -v test_lm_eval_correctness.py \
* export LM_EVAL_TP_SIZE=4 --config-list-file=configs/models-small.txt \
* pytest -s test_lm_eval_correctness.py --tp-size=1
""" """
import os
from pathlib import Path
import lm_eval import lm_eval
import numpy import numpy as np
import pytest
import yaml import yaml
RTOL = 0.08 RTOL = 0.08
TEST_DATA_FILE = os.environ.get(
"LM_EVAL_TEST_DATA_FILE",
".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
def launch_lm_eval(eval_config):
trust_remote_code = eval_config.get('trust_remote_code', False)
model_args = f"pretrained={eval_config['model_name']}," \
f"tensor_parallel_size={TP_SIZE}," \
f"add_bos_token=true," \
f"trust_remote_code={trust_remote_code}"
def launch_lm_eval(eval_config, tp_size):
trust_remote_code = eval_config.get("trust_remote_code", False)
model_args = (
f"pretrained={eval_config['model_name']},"
f"tensor_parallel_size={tp_size},"
f"enforce_eager=true,"
f"add_bos_token=true,"
f"trust_remote_code={trust_remote_code}"
)
results = lm_eval.simple_evaluate( results = lm_eval.simple_evaluate(
model="vllm", model="vllm",
model_args=model_args, model_args=model_args,
tasks=[task["name"] for task in eval_config["tasks"]], tasks=[task["name"] for task in eval_config["tasks"]],
num_fewshot=eval_config["num_fewshot"], num_fewshot=eval_config["num_fewshot"],
limit=eval_config["limit"], limit=eval_config["limit"],
batch_size="auto") batch_size="auto",
)
return results return results
def test_lm_eval_correctness(): def test_lm_eval_correctness_param(config_filename, tp_size):
eval_config = yaml.safe_load( eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
if eval_config[
"model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform": #noqa: E501
pytest.skip("FBGEMM is currently failing on main.")
# Launch eval requests. results = launch_lm_eval(eval_config, tp_size)
results = launch_lm_eval(eval_config)
# Confirm scores match ground truth.
success = True success = True
for task in eval_config["tasks"]: for task in eval_config["tasks"]:
for metric in task["metrics"]: for metric in task["metrics"]:
ground_truth = metric["value"] ground_truth = metric["value"]
measured_value = results["results"][task["name"]][metric["name"]] measured_value = results["results"][task["name"]][metric["name"]]
print(f'{task["name"]} | {metric["name"]}: ' print(
f'ground_truth={ground_truth} | measured={measured_value}') f"{task['name']} | {metric['name']}: "
success = success and numpy.isclose( f"ground_truth={ground_truth} | measured={measured_value}"
ground_truth, measured_value, rtol=RTOL) )
success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
# Assert at the end, print all scores even on failure for debugging.
assert success assert success
...@@ -65,18 +65,18 @@ def read_markdown(file): ...@@ -65,18 +65,18 @@ def read_markdown(file):
def results_to_json(latency, throughput, serving): def results_to_json(latency, throughput, serving):
return json.dumps({ return json.dumps(
'latency': latency.to_dict(), {
'throughput': throughput.to_dict(), "latency": latency.to_dict(),
'serving': serving.to_dict() "throughput": throughput.to_dict(),
}) "serving": serving.to_dict(),
}
)
if __name__ == "__main__": if __name__ == "__main__":
# collect results # collect results
for test_file in results_folder.glob("*.json"): for test_file in results_folder.glob("*.json"):
with open(test_file) as f: with open(test_file) as f:
raw_result = json.loads(f.read()) raw_result = json.loads(f.read())
...@@ -120,7 +120,8 @@ if __name__ == "__main__": ...@@ -120,7 +120,8 @@ if __name__ == "__main__":
for perc in [10, 25, 50, 75, 90, 99]: for perc in [10, 25, 50, 75, 90, 99]:
# Multiply 1000 to convert the time unit from s to ms # Multiply 1000 to convert the time unit from s to ms
raw_result.update( raw_result.update(
{f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}) {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}
)
raw_result["avg_latency"] = raw_result["avg_latency"] * 1000 raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
# add the result to raw_result # add the result to raw_result
...@@ -153,26 +154,27 @@ if __name__ == "__main__": ...@@ -153,26 +154,27 @@ if __name__ == "__main__":
serving_results = pd.DataFrame.from_dict(serving_results) serving_results = pd.DataFrame.from_dict(serving_results)
throughput_results = pd.DataFrame.from_dict(throughput_results) throughput_results = pd.DataFrame.from_dict(throughput_results)
raw_results_json = results_to_json(latency_results, throughput_results, raw_results_json = results_to_json(
serving_results) latency_results, throughput_results, serving_results
)
# remapping the key, for visualization purpose # remapping the key, for visualization purpose
if not latency_results.empty: if not latency_results.empty:
latency_results = latency_results[list( latency_results = latency_results[list(latency_column_mapping.keys())].rename(
latency_column_mapping.keys())].rename( columns=latency_column_mapping
columns=latency_column_mapping) )
if not serving_results.empty: if not serving_results.empty:
serving_results = serving_results[list( serving_results = serving_results[list(serving_column_mapping.keys())].rename(
serving_column_mapping.keys())].rename( columns=serving_column_mapping
columns=serving_column_mapping) )
if not throughput_results.empty: if not throughput_results.empty:
throughput_results = throughput_results[list( throughput_results = throughput_results[
throughput_results_column_mapping.keys())].rename( list(throughput_results_column_mapping.keys())
columns=throughput_results_column_mapping) ].rename(columns=throughput_results_column_mapping)
processed_results_json = results_to_json(latency_results, processed_results_json = results_to_json(
throughput_results, latency_results, throughput_results, serving_results
serving_results) )
for df in [latency_results, serving_results, throughput_results]: for df in [latency_results, serving_results, throughput_results]:
if df.empty: if df.empty:
...@@ -184,38 +186,39 @@ if __name__ == "__main__": ...@@ -184,38 +186,39 @@ if __name__ == "__main__":
# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...", # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
# we want to turn it into "8xGPUTYPE" # we want to turn it into "8xGPUTYPE"
df["GPU"] = df["GPU"].apply( df["GPU"] = df["GPU"].apply(
lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}") lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
)
# get markdown tables # get markdown tables
latency_md_table = tabulate(latency_results, latency_md_table = tabulate(
headers='keys', latency_results, headers="keys", tablefmt="pipe", showindex=False
tablefmt='pipe', )
showindex=False) serving_md_table = tabulate(
serving_md_table = tabulate(serving_results, serving_results, headers="keys", tablefmt="pipe", showindex=False
headers='keys', )
tablefmt='pipe', throughput_md_table = tabulate(
showindex=False) throughput_results, headers="keys", tablefmt="pipe", showindex=False
throughput_md_table = tabulate(throughput_results, )
headers='keys',
tablefmt='pipe',
showindex=False)
# document the result # document the result
with open(results_folder / "benchmark_results.md", "w") as f: with open(results_folder / "benchmark_results.md", "w") as f:
results = read_markdown(
results = read_markdown("../.buildkite/nightly-benchmarks/" + "../.buildkite/nightly-benchmarks/"
"performance-benchmarks-descriptions.md") + "performance-benchmarks-descriptions.md"
)
results = results.format( results = results.format(
latency_tests_markdown_table=latency_md_table, latency_tests_markdown_table=latency_md_table,
throughput_tests_markdown_table=throughput_md_table, throughput_tests_markdown_table=throughput_md_table,
serving_tests_markdown_table=serving_md_table, serving_tests_markdown_table=serving_md_table,
benchmarking_results_in_json_string=processed_results_json) benchmarking_results_in_json_string=processed_results_json,
)
f.write(results) f.write(results)
# document benchmarking results in json # document benchmarking results in json
with open(results_folder / "benchmark_results.json", "w") as f: with open(results_folder / "benchmark_results.json", "w") as f:
results = (
results = latency_results.to_dict( latency_results.to_dict(orient="records")
orient='records') + throughput_results.to_dict( + throughput_results.to_dict(orient="records")
orient='records') + serving_results.to_dict(orient='records') + serving_results.to_dict(orient="records")
)
f.write(json.dumps(results)) f.write(json.dumps(results))
...@@ -14,15 +14,12 @@ def main(model, cachedir): ...@@ -14,15 +14,12 @@ def main(model, cachedir):
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Download and save Hugging Face tokenizer") description="Download and save Hugging Face tokenizer"
parser.add_argument("--model", )
type=str, parser.add_argument("--model", type=str, required=True, help="Name of the model")
required=True, parser.add_argument(
help="Name of the model") "--cachedir", type=str, required=True, help="Directory to save the tokenizer"
parser.add_argument("--cachedir", )
type=str,
required=True,
help="Directory to save the tokenizer")
args = parser.parse_args() args = parser.parse_args()
main(args.model, args.cachedir) main(args.model, args.cachedir)
...@@ -11,33 +11,33 @@ from tabulate import tabulate ...@@ -11,33 +11,33 @@ from tabulate import tabulate
def parse_arguments(): def parse_arguments():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description= description="Parse command line arguments for summary-nightly-results script."
'Parse command line arguments for summary-nightly-results script.') )
parser.add_argument('--results-folder', parser.add_argument(
type=str, "--results-folder",
required=True, type=str,
help='The folder where the results are stored.') required=True,
parser.add_argument('--description', help="The folder where the results are stored.",
type=str, )
required=True, parser.add_argument(
help='Description of the results.') "--description", type=str, required=True, help="Description of the results."
)
args = parser.parse_args() args = parser.parse_args()
return args return args
def get_perf(df, method, model, metric): def get_perf(df, method, model, metric):
means = [] means = []
for qps in [2, 4, 8, 16, "inf"]: for qps in [2, 4, 8, 16, "inf"]:
target = df['Test name'].str.contains(model) target = df["Test name"].str.contains(model)
target = target & df['Engine'].str.contains(method) target = target & df["Engine"].str.contains(method)
target = target & df['Test name'].str.contains("qps_" + str(qps)) target = target & df["Test name"].str.contains("qps_" + str(qps))
filtered_df = df[target] filtered_df = df[target]
if filtered_df.empty: if filtered_df.empty:
means.append(0.) means.append(0.0)
else: else:
means.append(filtered_df[metric].values[0]) means.append(filtered_df[metric].values[0])
...@@ -45,7 +45,6 @@ def get_perf(df, method, model, metric): ...@@ -45,7 +45,6 @@ def get_perf(df, method, model, metric):
def get_perf_w_std(df, method, model, metric): def get_perf_w_std(df, method, model, metric):
if metric in ["TTFT", "ITL"]: if metric in ["TTFT", "ITL"]:
mean = get_perf(df, method, model, "Mean " + metric + " (ms)") mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
mean = mean.tolist() mean = mean.tolist()
...@@ -60,7 +59,8 @@ def get_perf_w_std(df, method, model, metric): ...@@ -60,7 +59,8 @@ def get_perf_w_std(df, method, model, metric):
else: else:
assert metric == "Tput" assert metric == "Tput"
mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf( mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
df, method, model, "Output Tput (tok/s)") df, method, model, "Output Tput (tok/s)"
)
mean = mean.tolist() mean = mean.tolist()
std = None std = None
...@@ -80,18 +80,17 @@ def main(args): ...@@ -80,18 +80,17 @@ def main(args):
# generate markdown table # generate markdown table
df = pd.DataFrame.from_dict(results) df = pd.DataFrame.from_dict(results)
md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False) md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)
with open(args.description) as f: with open(args.description) as f:
description = f.read() description = f.read()
description = description.format( description = description.format(nightly_results_benchmarking_table=md_table)
nightly_results_benchmarking_table=md_table)
with open("nightly_results.md", "w") as f: with open("nightly_results.md", "w") as f:
f.write(description) f.write(description)
if __name__ == '__main__': if __name__ == "__main__":
args = parse_arguments() args = parse_arguments()
main(args) main(args)
...@@ -34,10 +34,8 @@ serving_column_mapping = { ...@@ -34,10 +34,8 @@ serving_column_mapping = {
} }
if __name__ == "__main__": if __name__ == "__main__":
# collect results # collect results
for test_file in results_folder.glob("*.json"): for test_file in results_folder.glob("*.json"):
with open(test_file) as f: with open(test_file) as f:
raw_result = json.loads(f.read()) raw_result = json.loads(f.read())
...@@ -56,17 +54,16 @@ if __name__ == "__main__": ...@@ -56,17 +54,16 @@ if __name__ == "__main__":
serving_results = pd.DataFrame.from_dict(serving_results) serving_results = pd.DataFrame.from_dict(serving_results)
if not serving_results.empty: if not serving_results.empty:
serving_results = serving_results[list( serving_results = serving_results[list(serving_column_mapping.keys())].rename(
serving_column_mapping.keys())].rename( columns=serving_column_mapping
columns=serving_column_mapping) )
serving_md_table_with_headers = tabulate(serving_results, serving_md_table_with_headers = tabulate(
headers='keys', serving_results, headers="keys", tablefmt="pipe", showindex=False
tablefmt='pipe', )
showindex=False)
# remove the first line of header # remove the first line of header
serving_md_table_lines = serving_md_table_with_headers.split('\n') serving_md_table_lines = serving_md_table_with_headers.split("\n")
serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:]) serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])
prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE") prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
...@@ -76,10 +73,9 @@ if __name__ == "__main__": ...@@ -76,10 +73,9 @@ if __name__ == "__main__":
# document results with header. # document results with header.
# for those who wants to reproduce our benchmark. # for those who wants to reproduce our benchmark.
f.write(serving_md_table_with_headers) f.write(serving_md_table_with_headers)
f.write('\n') f.write("\n")
# document benchmarking results in json # document benchmarking results in json
with open(results_folder / f"{prefix}_nightly_results.json", "w") as f: with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
results = serving_results.to_dict(orient="records")
results = serving_results.to_dict(orient='records')
f.write(json.dumps(results)) f.write(json.dumps(results))
# This local pyproject file is part of the migration from yapf to ruff format.
# It uses the same core rules as the main pyproject.toml file, but with the
# following differences:
# - ruff line length is overridden to 88
# - deprecated typing ignores (UP006, UP035) have been removed
[tool.ruff]
line-length = 88
exclude = [
# External file, leaving license intact
"examples/other/fp8/quantizer/quantize.py",
"vllm/vllm_flash_attn/flash_attn_interface.pyi"
]
[tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]
[tool.ruff.lint]
select = [
# pycodestyle
"E",
# Pyflakes
"F",
# pyupgrade
"UP",
# flake8-bugbear
"B",
# flake8-simplify
"SIM",
# isort
"I",
# flake8-logging-format
"G",
]
ignore = [
# star imports
"F405", "F403",
# lambda expression assignment
"E731",
# Loop control variable not used within loop body
"B007",
# f-string format
"UP032",
# Can remove once 3.10+ is the minimum Python version
"UP007",
]
[tool.ruff.format]
docstring-code-format = true
steps: steps:
- label: "Build wheel - CUDA 12.4" - label: "Build wheel - CUDA 12.8"
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh" - "bash .buildkite/scripts/upload-wheels.sh"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
- label: "Build wheel - CUDA 12.1" - label: "Build wheel - CUDA 12.6"
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh" - "bash .buildkite/scripts/upload-wheels.sh"
...@@ -31,7 +31,7 @@ steps: ...@@ -31,7 +31,7 @@ steps:
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh" - "bash .buildkite/scripts/upload-wheels.sh"
...@@ -48,7 +48,7 @@ steps: ...@@ -48,7 +48,7 @@ steps:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Build and publish TPU release image" - label: "Build and publish TPU release image"
...@@ -57,6 +57,8 @@ steps: ...@@ -57,6 +57,8 @@ steps:
agents: agents:
queue: tpu_queue_postmerge queue: tpu_queue_postmerge
commands: commands:
- "yes | docker system prune -a"
- "git fetch --all"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
- "docker push vllm/vllm-tpu:nightly" - "docker push vllm/vllm-tpu:nightly"
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT" - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
......
...@@ -3,6 +3,9 @@ ...@@ -3,6 +3,9 @@
# This script runs test inside the corresponding ROCm docker container. # This script runs test inside the corresponding ROCm docker container.
set -o pipefail set -o pipefail
# Export Python path
export PYTHONPATH=".."
# Print ROCm version # Print ROCm version
echo "--- Confirming Clean Initial State" echo "--- Confirming Clean Initial State"
while true; do while true; do
...@@ -74,38 +77,69 @@ HF_MOUNT="/root/.cache/huggingface" ...@@ -74,38 +77,69 @@ HF_MOUNT="/root/.cache/huggingface"
commands=$@ commands=$@
echo "Commands:$commands" echo "Commands:$commands"
if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
fi
if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi
if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
fi
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
fi
#ignore certain kernels tests #ignore certain kernels tests
if [[ $commands == *" kernels "* ]]; then if [[ $commands == *" kernels/core"* ]]; then
commands="${commands} \ commands="${commands} \
--ignore=kernels/test_attention_selector.py \ --ignore=kernels/core/test_fused_quant_layernorm.py \
--ignore=kernels/test_blocksparse_attention.py \ --ignore=kernels/core/test_permute_cols.py"
--ignore=kernels/test_causal_conv1d.py \ fi
--ignore=kernels/test_cutlass.py \
--ignore=kernels/test_encoder_decoder_attn.py \ if [[ $commands == *" kernels/attention"* ]]; then
--ignore=kernels/test_flash_attn.py \ commands="${commands} \
--ignore=kernels/test_flashinfer.py \ --ignore=kernels/attention/stest_attention_selector.py \
--ignore=kernels/test_int8_quant.py \ --ignore=kernels/attention/test_blocksparse_attention.py \
--ignore=kernels/test_machete_gemm.py \ --ignore=kernels/attention/test_encoder_decoder_attn.py \
--ignore=kernels/test_mamba_ssm.py \ --ignore=kernels/attention/test_attention_selector.py \
--ignore=kernels/test_marlin_gemm.py \ --ignore=kernels/attention/test_flash_attn.py \
--ignore=kernels/test_moe.py \ --ignore=kernels/attention/test_flashinfer.py \
--ignore=kernels/test_prefix_prefill.py \ --ignore=kernels/attention/test_prefix_prefill.py \
--ignore=kernels/test_rand.py \ --ignore=kernels/attention/test_cascade_flash_attn.py \
--ignore=kernels/test_sampler.py \ --ignore=kernels/attention/test_mha_attn.py \
--ignore=kernels/test_cascade_flash_attn.py \ --ignore=kernels/attention/test_lightning_attn.py \
--ignore=kernels/test_mamba_mixer2.py \ --ignore=kernels/attention/test_attention.py"
--ignore=kernels/test_aqlm.py \ fi
--ignore=kernels/test_machete_mm.py \
--ignore=kernels/test_mha_attn.py \ if [[ $commands == *" kernels/quantization"* ]]; then
--ignore=kernels/test_block_fp8.py \ commands="${commands} \
--ignore=kernels/test_cutlass_moe.py \ --ignore=kernels/quantization/test_int8_quant.py \
--ignore=kernels/test_mamba_ssm_ssd.py \ --ignore=kernels/quantization/test_aqlm.py \
--ignore=kernels/test_attention.py \ --ignore=kernels/quantization/test_machete_mm.py \
--ignore=kernels/test_block_int8.py \ --ignore=kernels/quantization/test_block_fp8.py \
--ignore=kernels/test_fused_quant_layernorm.py \ --ignore=kernels/quantization/test_block_int8.py \
--ignore=kernels/test_int8_kernel.py \ --ignore=kernels/quantization/test_marlin_gemm.py \
--ignore=kernels/test_triton_moe_ptpc_fp8.py \ --ignore=kernels/quantization/test_cutlass_scaled_mm.py \
--ignore=kernels/test_permute_cols.py" --ignore=kernels/quantization/test_int8_kernel.py"
fi
if [[ $commands == *" kernels/mamba"* ]]; then
commands="${commands} \
--ignore=kernels/mamba/test_mamba_mixer2.py \
--ignore=kernels/mamba/test_causal_conv1d.py \
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
fi
if [[ $commands == *" kernels/moe"* ]]; then
commands="${commands} \
--ignore=kernels/moe/test_moe.py \
--ignore=kernels/moe/test_cutlass_moe.py \
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
fi fi
#ignore certain Entrypoints/openai tests #ignore certain Entrypoints/openai tests
...@@ -147,6 +181,8 @@ fi ...@@ -147,6 +181,8 @@ fi
PARALLEL_JOB_COUNT=8 PARALLEL_JOB_COUNT=8
MYPYTHONPATH=".."
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then if [[ $commands == *"--shard-id="* ]]; then
# assign job count as the number of shards used # assign job count as the number of shards used
...@@ -167,6 +203,7 @@ if [[ $commands == *"--shard-id="* ]]; then ...@@ -167,6 +203,7 @@ if [[ $commands == *"--shard-id="* ]]; then
-e AWS_SECRET_ACCESS_KEY \ -e AWS_SECRET_ACCESS_KEY \
-v "${HF_CACHE}:${HF_MOUNT}" \ -v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \ -e "HF_HOME=${HF_MOUNT}" \
-e "PYTHONPATH=${MYPYTHONPATH}" \
--name "${container_name}_${GPU}" \ --name "${container_name}_${GPU}" \
"${image_name}" \ "${image_name}" \
/bin/bash -c "${commands_gpu}" \ /bin/bash -c "${commands_gpu}" \
...@@ -197,6 +234,7 @@ else ...@@ -197,6 +234,7 @@ else
-e AWS_SECRET_ACCESS_KEY \ -e AWS_SECRET_ACCESS_KEY \
-v "${HF_CACHE}:${HF_MOUNT}" \ -v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \ -e "HF_HOME=${HF_MOUNT}" \
-e "PYTHONPATH=${MYPYTHONPATH}" \
--name "${container_name}" \ --name "${container_name}" \
"${image_name}" \ "${image_name}" \
/bin/bash -c "${commands}" /bin/bash -c "${commands}"
......
...@@ -32,9 +32,12 @@ function cpu_tests() { ...@@ -32,9 +32,12 @@ function cpu_tests() {
set -e set -e
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
pip install sentence-transformers datamodel_code_generator pip install sentence-transformers datamodel_code_generator
pytest -v -s tests/models/embedding/language/test_cls_models.py::test_classification_models[float-jason9693/Qwen2.5-1.5B-apeach] pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
pytest -v -s tests/models/embedding/language/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5] pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
pytest -v -s tests/models/encoder_decoder/language -m cpu_model" pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]"
} }
# All of CPU tests are expected to be finished less than 40 mins. # All of CPU tests are expected to be finished less than 40 mins.
......
#!/bin/bash #!/bin/bash
set -xue set -xu
# Build the docker image. # Build the docker image.
docker build -f docker/Dockerfile.tpu -t vllm-tpu . docker build -f docker/Dockerfile.tpu -t vllm-tpu .
...@@ -24,31 +24,80 @@ docker run --privileged --net host --shm-size=16G -it \ ...@@ -24,31 +24,80 @@ docker run --privileged --net host --shm-size=16G -it \
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \ && export VLLM_XLA_CHECK_RECOMPILATION=1 \
&& echo HARDWARE \ && echo HARDWARE \
&& tpu-info \ && tpu-info \
&& echo TEST_0 \ && { \
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_perf.py \ echo TEST_0: Running test_perf.py; \
&& echo TEST_1 \ python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \
&& pytest -v -s /workspace/vllm/tests/tpu/test_compilation.py \ echo TEST_0_EXIT_CODE: \$?; \
&& echo TEST_2 \ } & \
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \ { \
&& echo TEST_3 \ echo TEST_1: Running test_compilation.py; \
&& pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \ python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \
&& echo TEST_4 \ echo TEST_1_EXIT_CODE: \$?; \
&& pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \ } & \
&& echo TEST_5 \ { \
&& python3 /workspace/vllm/examples/offline_inference/tpu.py \ echo TEST_2: Running test_basic.py; \
&& echo TEST_6 \ python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \
&& pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py \ echo TEST_2_EXIT_CODE: \$?; \
&& echo TEST_7 \ } & \
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py \ { \
&& echo TEST_8 \ echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py \ python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
&& echo TEST_9 \ echo TEST_3_EXIT_CODE: \$?; \
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py \ } & \
&& echo TEST_10 \ { \
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py \ echo TEST_4: Running test_quantization_accuracy.py; \
&& echo TEST_11 \ python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \
&& pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py" \ echo TEST_4_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_5: Running examples/offline_inference/tpu.py; \
python3 /workspace/vllm/examples/offline_inference/tpu.py; \
echo TEST_5_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_6: Running test_tpu_model_runner.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \
echo TEST_6_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_7: Running test_sampler.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \
echo TEST_7_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_8: Running test_topk_topp_sampler.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \
echo TEST_8_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_9: Running test_multimodal.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \
echo TEST_9_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_10: Running test_pallas.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \
echo TEST_10_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_11: Running test_struct_output_generate.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \
echo TEST_11_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_12: Running test_moe_pallas.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \
echo TEST_12_EXIT_CODE: \$?; \
} & \
# Disable the TPU LoRA tests until the feature is activated
# & { \
# echo TEST_13: Running test_moe_pallas.py; \
# python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/; \
# echo TEST_13_EXIT_CODE: \$?; \
# } & \
wait \
&& echo 'All tests have attempted to run. Check logs for individual test statuses and exit codes.' \
"
# TODO: This test fails because it uses RANDOM_SEED sampling # TODO: This test fails because it uses RANDOM_SEED sampling
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ # && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
...@@ -50,11 +50,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" ...@@ -50,11 +50,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
if [[ $normal_wheel == *"cu118"* ]]; then if [[ $normal_wheel == *"cu118"* ]]; then
# if $normal_wheel matches cu118, do not upload the index.html # if $normal_wheel matches cu118, do not upload the index.html
echo "Skipping index files for cu118 wheels" echo "Skipping index files for cu118 wheels"
elif [[ $normal_wheel == *"cu121"* ]]; then elif [[ $normal_wheel == *"cu126"* ]]; then
# if $normal_wheel matches cu121, do not upload the index.html # if $normal_wheel matches cu126, do not upload the index.html
echo "Skipping index files for cu121 wheels" echo "Skipping index files for cu126 wheels"
else else
# only upload index.html for cu124 wheels (default wheels) # only upload index.html for cu128 wheels (default wheels)
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html" aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html" aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
fi fi
...@@ -66,12 +66,13 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/" ...@@ -66,12 +66,13 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
if [[ $normal_wheel == *"cu118"* ]]; then if [[ $normal_wheel == *"cu118"* ]]; then
# if $normal_wheel matches cu118, do not upload the index.html # if $normal_wheel matches cu118, do not upload the index.html
echo "Skipping index files for cu118 wheels" echo "Skipping index files for cu118 wheels"
elif [[ $normal_wheel == *"cu121"* ]]; then elif [[ $normal_wheel == *"cu126"* ]]; then
# if $normal_wheel matches cu121, do not upload the index.html # if $normal_wheel matches cu126, do not upload the index.html
echo "Skipping index files for cu121 wheels" echo "Skipping index files for cu126 wheels"
else else
# only upload index.html for cu124 wheels (default wheels) # only upload index.html for cu128 wheels (default wheels)
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html" aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
fi fi
aws s3 cp "$wheel" "s3://vllm-wheels/$version/" aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
\ No newline at end of file aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment