#!/usr/bin/env python3
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Top-level benchmarking script that automatically discovers and runs all benchmarks in the ./benches
directory, organizing outputs into model-specific subfolders.
"""

import argparse
import logging
import sys
import uuid

from framework.benchmark_config import BenchmarkConfig, generate_all_configs, generate_main_configs
from framework.benchmark_runner import BenchmarkRunner


if __name__ == "__main__":
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--output-dir", type=str, default=None, help="Output dir for benchmark results")
    parser.add_argument("--log-level", type=str, choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO")

    parser.add_argument("--model-id", type=str, help="Specific model ID to benchmark (if supported by benchmarks)")

    parser.add_argument("--warmup", "-w", type=int, default=3, help="Number of warmup iterations")
    parser.add_argument("--iterations", "-i", type=int, default=10, help="Number of measurement iterations")

    parser.add_argument("--batch-size", "-b", type=int, nargs="+", help="Batch size")
    parser.add_argument("--sequence-length", "-s", type=int, nargs="+", help="Sequence length")
    parser.add_argument("--num-tokens-to-generate", "-n", type=int, nargs="+", help="Number of tokens to generate")

    parser.add_argument("--cross-generate", action="store_true", help="Cross-generate all combinations of configs")
    parser.add_argument("--num-tokens-to-profile", "-p", type=int, default=0, help="Number of tokens to profile")

    parser.add_argument("--branch-name", type=str, help="Git branch name")
    parser.add_argument("--commit-id", type=str, help="Git commit ID (if not provided, will auto-detect from git)")
    parser.add_argument("--commit-message", type=str, help="Git commit message")

    parser.add_argument(
        "--no-gpu-monitoring", action="store_true", help="Disables GPU monitoring during benchmark runs"
    )
    parser.add_argument(
        "--push-result-to-dataset",
        type=str,
        default=None,
        help="Name of the dataset to push results to. If not provided, results are not pushed to the Hub.",
    )
    args = parser.parse_args()

    # Setup logging
    benchmark_run_uuid = str(uuid.uuid4())[:8]
    numeric_level = getattr(logging, args.log_level.upper())

    handlers = [logging.StreamHandler(sys.stdout)]
    logging.basicConfig(
        level=numeric_level, format="[%(levelname)s - %(asctime)s] %(name)s: %(message)s", handlers=handlers
    )

    logger = logging.getLogger("benchmark_v2")
    logger.info("Starting benchmark discovery and execution")
    logger.info(f"Benchmark run UUID: {benchmark_run_uuid}")
    logger.info(f"Output directory: {args.output_dir}")

    # Error out if one of the arguments is not provided (argparse stores unspecified list arguments as None)
    if any(arg is None for arg in (args.batch_size, args.sequence_length, args.num_tokens_to_generate)):
        raise ValueError(
            "The arguments --batch-size, --sequence-length and --num-tokens-to-generate are all required"
        )

    # If there is only one (batch_size, sequence_length, num_tokens_to_generate) triplet, we benchmark across configs
    elif len(args.batch_size) * len(args.sequence_length) * len(args.num_tokens_to_generate) == 1:
        if args.cross_generate:
            benchmark_configs = generate_all_configs(
                warmup_iterations=args.warmup,
                measurement_iterations=args.iterations,
                batch_size=args.batch_size[0],
                sequence_length=args.sequence_length[0],
                num_tokens_to_generate=args.num_tokens_to_generate[0],
                gpu_monitoring=not args.no_gpu_monitoring,
            )
        else:
            benchmark_configs = generate_main_configs(
                warmup_iterations=args.warmup,
                measurement_iterations=args.iterations,
                batch_size=args.batch_size[0],
                sequence_length=args.sequence_length[0],
                num_tokens_to_generate=args.num_tokens_to_generate[0],
            )

    # Otherwise, we benchmark across all combinations of the provided dimensions
    else:
        main_config = generate_main_configs(
            warmup_iterations=args.warmup,
            measurement_iterations=args.iterations,
            batch_size=args.batch_size[0],
            sequence_length=args.sequence_length[0],
            num_tokens_to_generate=args.num_tokens_to_generate[0],
        )[0]

        # Derive one config per (num_tokens_to_generate, sequence_length, batch_size) combination
        benchmark_configs = []
        for num_tokens_to_generate in args.num_tokens_to_generate:
            for sequence_length in args.sequence_length:
                for batch_size in args.batch_size:
                    cfg_dict = main_config.to_dict()
                    cfg_dict["batch_size"] = batch_size
                    cfg_dict["sequence_length"] = sequence_length
                    cfg_dict["num_tokens_to_generate"] = num_tokens_to_generate
                    cfg_dict.pop("name")  # do not reuse the main config's name for the derived configs
                    benchmark_configs.append(BenchmarkConfig.from_dict(cfg_dict))

    # Run all benchmarks and collect the results
    runner = BenchmarkRunner(
        logger,
        args.output_dir,
        args.branch_name,
        args.commit_id,
        args.commit_message,
    )
    timestamp, results = runner.run_benchmarks(
        args.model_id,
        benchmark_configs,
        args.num_tokens_to_profile,
        pretty_print_summary=True,
    )

    # Optionally push the results to a dataset on the Hub
    dataset_id = args.push_result_to_dataset
    if dataset_id is not None and len(results) > 0:
        runner.push_results_to_hub(
            dataset_id,
            results,
            timestamp,
        )
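# Example invocations (a sketch only; the script filename, model id and dataset name below are placeholders,
# not values taken from this repository):
#
#   Benchmark a single (batch_size, sequence_length, num_tokens_to_generate) triplet over the main configs:
#       python run_benchmarks.py --model-id <org/model> -b 32 -s 128 -n 128
#
#   Same triplet, but cross-generate every config combination and disable GPU monitoring:
#       python run_benchmarks.py --model-id <org/model> -b 32 -s 128 -n 128 --cross-generate --no-gpu-monitoring
#
#   Sweep all combinations of the given dimensions and push the results to a Hub dataset:
#       python run_benchmarks.py --model-id <org/model> -b 1 8 32 -s 128 2048 -n 128 \
#           --push-result-to-dataset <org/dataset>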