# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Evaluation configuration. Each top-level key below is one setting; `null`
# means "not set" and the per-key comment says what is used in that case.

# --- Output -----------------------------------------------------------------
output_dir: "./results"

# --- Model and dataset ------------------------------------------------------
model: "meta-llama/Meta-Llama-3.1-8B-Instruct"
dataset: "ruler"  # see DATASET_REGISTRY in evaluate_registry.py
# Subdirectory of the dataset (if applicable) else leave "null"
data_dir: "4096"

# --- Press configuration ----------------------------------------------------
press_name: "knorm"  # see PRESS_REGISTRY in evaluate_registry.py
compression_ratio: 0.5  # Compression ratio for the press (0.0 to 1.0)
# For ThinKPress and ComposedPress (0.0 to 1.0)
key_channel_compression_ratio: null
threshold: null  # For DMSPress

# --- Evaluation settings ----------------------------------------------------
# Fraction of dataset to evaluate (0.0 to 1.0), for quick testing
fraction: 1.0
# Maximum new tokens to generate (null = use dataset default)
max_new_tokens: null
# Maximum context length (null = use model maximum)
max_context_length: null
# Whether to include question in context for query-aware compression
query_aware: false
# Depth (int or list of ints) percentage of the needle in the haystack
# (0 to 100), only for needle_in_haystack dataset
needle_depth: null

# --- Runtime ----------------------------------------------------------------
# Device to use (null = auto-detect, "cuda:0", "cpu", etc.)
device: null
# Whether to use FP8 quantization (FineGrainedFP8Config() from transformers)
fp8: false

# You can add any model kwargs here.
model_kwargs:
  attn_implementation: null
  dtype: "auto"