{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "In this notebook, we showcase how to improve retrieval performance using per-layer compression." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import torch\n", "from transformers import pipeline\n", "\n", "from kvpress import (\n", " ExpectedAttentionPress,\n", " KnormPress,\n", " ObservedAttentionPress,\n", " RandomPress,\n", " SnapKVPress,\n", " StreamingLLMPress,\n", " PerLayerCompressionPress,\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Load the pipeline and data" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "14ee6cc96fce42cfb6e75b2964fbda04", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading checkpoint shards: 0%| | 0/2 [00:00