per_layer_compression_demo.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In this notebook, we showcase how to improve retrieval performance using per-layer compression."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import datasets\n",
    "import numpy as np\n",
    "import torch\n",
    "from transformers import pipeline\n",
    "\n",
    "from kvpress import (\n",
    "    ExpectedAttentionPress,\n",
    "    KnormPress,\n",
    "    ObservedAttentionPress,\n",
    "    PerLayerCompressionPress,\n",
    "    RandomPress,\n",
    "    SnapKVPress,\n",
    "    StreamingLLMPress,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load the pipeline and data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "14ee6cc96fce42cfb6e75b2964fbda04",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Build the kvpress text-generation pipeline on the first CUDA device.\n",
    "# Flash attention is requested explicitly: the per-layer compression wrapper\n",
    "# used later in this notebook warns that it only works with flash attention.\n",
    "device = \"cuda:0\"\n",
    "ckpt = \"microsoft/Phi-3.5-mini-instruct\"\n",
    "attn_implementation = \"flash_attention_2\"\n",
    "\n",
    "pipe = pipeline(\n",
    "    \"kv-press-text-generation\",\n",
    "    model=ckpt,\n",
    "    device=device,\n",
    "    dtype=\"auto\",\n",
    "    model_kwargs={\"attn_implementation\": attn_implementation},\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the test split of the `simonjegou/ruler` dataset (config `4096`) as a DataFrame.\n",
    "# `datasets` is imported in the import cell at the top of the notebook.\n",
    "ruler_test = datasets.load_dataset(\"simonjegou/ruler\", \"4096\")[\"test\"].to_pandas()\n",
    "\n",
    "# Keep only the rows of the `niah_single_3` task; later cells read samples from `df`.\n",
    "df = ruler_test.loc[ruler_test[\"task\"] == \"niah_single_3\"].reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Use the pipeline with a press"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pick a press and a compression ratio; you can re-run the following cells with a different press\n",
    "compression_ratio = 0.3\n",
    "press = ExpectedAttentionPress(compression_ratio)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.\n",
      "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Question:   What is the special magic uuid for amused-quart mentioned in the provided text? \n",
      "Answer:     1ff49b78-8946-4e85-b59c-de66bacfb3d0\n",
      "Prediction: The special magic uuid for amused-quart mentioned in the text is: 1ff49b78-8946-4e85-b63d-a7e3c0a1c\n",
      "Correctly predicted: False\n"
     ]
    }
   ],
   "source": [
    "# Evaluate the press on a single sample of the filtered dataset\n",
    "idx = 0\n",
    "sample = df.iloc[idx]\n",
    "context, question = sample[\"context\"], sample[\"question\"]\n",
    "true_answer = sample[\"answer\"][0]  # first entry of the sample's answer field\n",
    "\n",
    "pipe_output = pipe(context, question=question, press=press)\n",
    "pred_answer = pipe_output[\"answer\"]\n",
    "\n",
    "# The prediction counts as correct when it contains the reference answer verbatim\n",
    "is_correct = true_answer in pred_answer\n",
    "\n",
    "print(f\"Question:   {question}\")\n",
    "print(f\"Answer:     {true_answer}\")\n",
    "print(f\"Prediction: {pred_answer}\")\n",
    "print(f\"Correctly predicted: {is_correct}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Apply per-layer-compression with the same overall compression ratio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.3028125\n"
     ]
    }
   ],
   "source": [
    "# Per-layer compression ratios (32 entries): some layers are compressed more, others less.\n",
    "# Their mean, printed below, is approximately the overall ratio used for the original press.\n",
    "\n",
    "PHI_35_COMPRESSION_RATIOS = [\n",
    "    0.37, 0.3, 0.37, 0.37, 0.37, 0.37, 0.07, 0.37,\n",
    "    0.29, 0.37, 0.36, 0.13, 0.37, 0.0, 0.37, 0.37,\n",
    "    0.37, 0.36, 0.28, 0.0, 0.09, 0.37, 0.37, 0.37,\n",
    "    0.37, 0.37, 0.37, 0.04, 0.37, 0.37, 0.37, 0.37,\n",
    "]\n",
    "print(np.mean(PHI_35_COMPRESSION_RATIOS))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Per layer compression wrapper is an experimental feature and only works with flash attention. Please make sure that the model uses flash attention.\n"
     ]
    }
   ],
   "source": [
    "# Wrap the same kind of press as before, passing the list of per-layer ratios\n",
    "press_per_layer = PerLayerCompressionPress(\n",
    "    ExpectedAttentionPress(compression_ratio),\n",
    "    PHI_35_COMPRESSION_RATIOS,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Question:   What is the special magic uuid for amused-quart mentioned in the provided text? \n",
      "Answer:     1ff49b78-8946-4e85-b59c-de66bacfb3d0\n",
      "Prediction: The special magic uuid mentioned in the text for amused-quart is: 1ff49b78-8946-4e85-b59c-de66bacfb3d0\n",
      "Correctly predicted: True\n"
     ]
    }
   ],
   "source": [
    "# Re-run the same context and question, this time with the per-layer press\n",
    "pipe_output = pipe(context, question=question, press=press_per_layer)\n",
    "pred_answer = pipe_output[\"answer\"]\n",
    "\n",
    "# The prediction counts as correct when it contains the reference answer verbatim\n",
    "is_correct = true_answer in pred_answer\n",
    "\n",
    "print(f\"Question:   {question}\")\n",
    "print(f\"Answer:     {true_answer}\")\n",
    "print(f\"Prediction: {pred_answer}\")\n",
    "print(f\"Correctly predicted: {is_correct}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}