// This should reach 91.5% accuracy.
#[cfg(any(feature = "mkl", feature = "mkl-dynamic"))]
extern crate intel_mkl_src;
#[cfg(feature = "accelerate")]
extern crate accelerate_src;
use clap::{Parser, ValueEnum};
use rand::prelude::*;
use candle::{DType, Result, Tensor, D};
use candle_nn::{loss, ops, Conv2d, Linear, Module, ModuleT, Optimizer, VarBuilder, VarMap};
const IMAGE_DIM: usize = 784;
const LABELS: usize = 10;
fn linear_z(in_dim: usize, out_dim: usize, vs: VarBuilder) -> Result<Linear> {
let ws = vs.get_with_hints((out_dim, in_dim), "weight", candle_nn::init::ZERO)?;
let bs = vs.get_with_hints(out_dim, "bias", candle_nn::init::ZERO)?;
Ok(Linear::new(ws, Some(bs)))
}
trait Model: Sized {
fn new(vs: VarBuilder) -> Result<Self>;
fn forward(&self, xs: &Tensor) -> Result<Tensor>;
}
struct LinearModel {
linear: Linear,
}
impl Model for LinearModel {
fn new(vs: VarBuilder) -> Result<Self> {
let linear = linear_z(IMAGE_DIM, LABELS, vs)?;
Ok(Self { linear })
}
fn forward(&self, xs: &Tensor) -> Result<Tensor> {
self.linear.forward(xs)
}
}
struct Mlp {
ln1: Linear,
ln2: Linear,
}
impl Model for Mlp {
fn new(vs: VarBuilder) -> Result<Self> {
let ln1 = candle_nn::linear(IMAGE_DIM, 100, vs.pp("ln1"))?;
let ln2 = candle_nn::linear(100, LABELS, vs.pp("ln2"))?;
Ok(Self { ln1, ln2 })
}
fn forward(&self, xs: &Tensor) -> Result<Tensor> {
let xs = self.ln1.forward(xs)?;
let xs = xs.relu()?;
self.ln2.forward(&xs)
}
}
#[derive(Debug)]
struct ConvNet {
conv1: Conv2d,
conv2: Conv2d,
fc1: Linear,
fc2: Linear,
dropout: candle_nn::Dropout,
}
impl ConvNet {
fn new(vs: VarBuilder) -> Result<Self> {
let conv1 = candle_nn::conv2d(1, 32, 5, Default::default(), vs.pp("c1"))?;
let conv2 = candle_nn::conv2d(32, 64, 5, Default::default(), vs.pp("c2"))?;
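// After two 5x5 convolutions (no padding) and two 2x2 max-pools, the 28x28
// input is reduced to 64 feature maps of size 4x4, i.e. 1024 features.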
let fc1 = candle_nn::linear(1024, 1024, vs.pp("fc1"))?;
let fc2 = candle_nn::linear(1024, LABELS, vs.pp("fc2"))?;
let dropout = candle_nn::Dropout::new(0.5);
Ok(Self {
conv1,
conv2,
fc1,
fc2,
dropout,
})
}
fn forward(&self, xs: &Tensor, train: bool) -> Result<Tensor> {
let (b_sz, _img_dim) = xs.dims2()?;
let xs = xs
.reshape((b_sz, 1, 28, 28))?
.apply(&self.conv1)?
.max_pool2d(2)?
.apply(&self.conv2)?
.max_pool2d(2)?
.flatten_from(1)?
.apply(&self.fc1)?
.relu()?;
self.dropout.forward_t(&xs, train)?.apply(&self.fc2)
}
}
struct TrainingArgs {
learning_rate: f64,
load: Option<String>,
save: Option<String>,
epochs: usize,
}
fn training_loop_cnn(
m: candle_datasets::vision::Dataset,
args: &TrainingArgs,
) -> anyhow::Result<()> {
const BSIZE: usize = 64;
let dev = candle::Device::cuda_if_available(0)?;
let train_labels = m.train_labels;
let train_images = m.train_images.to_device(&dev)?;
let train_labels = train_labels.to_dtype(DType::U32)?.to_device(&dev)?;
let mut varmap = VarMap::new();
let vs = VarBuilder::from_varmap(&varmap, DType::F32, &dev);
let model = ConvNet::new(vs.clone())?;
if let Some(load) = &args.load {
println!("loading weights from {load}");
varmap.load(load)?
}
let adamw_params = candle_nn::ParamsAdamW {
lr: args.learning_rate,
..Default::default()
};
let mut opt = candle_nn::AdamW::new(varmap.all_vars(), adamw_params)?;
let test_images = m.test_images.to_device(&dev)?;
let test_labels = m.test_labels.to_dtype(DType::U32)?.to_device(&dev)?;
let n_batches = train_images.dim(0)? / BSIZE;
let mut batch_idxs = (0..n_batches).collect::<Vec<usize>>();
for epoch in 1..args.epochs {
let mut sum_loss = 0f32;
batch_idxs.shuffle(&mut thread_rng());
for batch_idx in batch_idxs.iter() {
let train_images = train_images.narrow(0, batch_idx * BSIZE, BSIZE)?;
let train_labels = train_labels.narrow(0, batch_idx * BSIZE, BSIZE)?;
let logits = model.forward(&train_images, true)?;
let log_sm = ops::log_softmax(&logits, D::Minus1)?;
let loss = loss::nll(&log_sm, &train_labels)?;
opt.backward_step(&loss)?;
sum_loss += loss.to_vec0::<f32>()?;
}
let avg_loss = sum_loss / n_batches as f32;
let test_logits = model.forward(&test_images, false)?;
let sum_ok = test_logits
.argmax(D::Minus1)?
.eq(&test_labels)?
.to_dtype(DType::F32)?
.sum_all()?
.to_scalar::<f32>()?;
let test_accuracy = sum_ok / test_labels.dims1()? as f32;
println!(
"{epoch:4} train loss {:8.5} test acc: {:5.2}%",
avg_loss,
100. * test_accuracy
);
}
if let Some(save) = &args.save {
println!("saving trained weights in {save}");
varmap.save(save)?
}
Ok(())
}
fn training_loop<M: Model>(
m: candle_datasets::vision::Dataset,
args: &TrainingArgs,
) -> anyhow::Result<()> {
let dev = candle::Device::cuda_if_available(0)?;
let train_labels = m.train_labels;
let train_images = m.train_images.to_device(&dev)?;
let train_labels = train_labels.to_dtype(DType::U32)?.to_device(&dev)?;
let mut varmap = VarMap::new();
let vs = VarBuilder::from_varmap(&varmap, DType::F32, &dev);
let model = M::new(vs.clone())?;
if let Some(load) = &args.load {
println!("loading weights from {load}");
varmap.load(load)?
}
let mut sgd = candle_nn::SGD::new(varmap.all_vars(), args.learning_rate)?;
let test_images = m.test_images.to_device(&dev)?;
let test_labels = m.test_labels.to_dtype(DType::U32)?.to_device(&dev)?;
for epoch in 1..args.epochs {
let logits = model.forward(&train_images)?;
let log_sm = ops::log_softmax(&logits, D::Minus1)?;
let loss = loss::nll(&log_sm, &train_labels)?;
sgd.backward_step(&loss)?;
let test_logits = model.forward(&test_images)?;
let sum_ok = test_logits
.argmax(D::Minus1)?
.eq(&test_labels)?
.to_dtype(DType::F32)?
.sum_all()?
.to_scalar::<f32>()?;
let test_accuracy = sum_ok / test_labels.dims1()? as f32;
println!(
"{epoch:4} train loss: {:8.5} test acc: {:5.2}%",
loss.to_scalar::<f32>()?,
100. * test_accuracy
);
}
if let Some(save) = &args.save {
println!("saving trained weights in {save}");
varmap.save(save)?
}
Ok(())
}
#[derive(ValueEnum, Clone)]
enum WhichModel {
Linear,
Mlp,
Cnn,
}
#[derive(Parser)]
struct Args {
#[clap(value_enum, default_value_t = WhichModel::Linear)]
model: WhichModel,
#[arg(long)]
learning_rate: Option<f64>,
#[arg(long, default_value_t = 200)]
epochs: usize,
/// The file to save the trained weights to, in safetensors format.
#[arg(long)]
save: Option<String>,
/// The file to load the trained weights from, in safetensors format.
#[arg(long)]
load: Option<String>,
/// The directory to load the dataset from, in ubyte format.
#[arg(long)]
local_mnist: Option<String>,
}
pub fn main() -> anyhow::Result<()> {
let args = Args::parse();
// Load the dataset
let m = if let Some(directory) = args.local_mnist {
candle_datasets::vision::mnist::load_dir(directory)?
} else {
candle_datasets::vision::mnist::load()?
};
println!("train-images: {:?}", m.train_images.shape());
println!("train-labels: {:?}", m.train_labels.shape());
println!("test-images: {:?}", m.test_images.shape());
println!("test-labels: {:?}", m.test_labels.shape());
let default_learning_rate = match args.model {
WhichModel::Linear => 1.,
WhichModel::Mlp => 0.05,
WhichModel::Cnn => 0.001,
};
let training_args = TrainingArgs {
epochs: args.epochs,
learning_rate: args.learning_rate.unwrap_or(default_learning_rate),
load: args.load,
save: args.save,
};
match args.model {
WhichModel::Linear => training_loop::<LinearModel>(m, &training_args),
WhichModel::Mlp => training_loop::<Mlp>(m, &training_args),
WhichModel::Cnn => training_loop_cnn(m, &training_args),
}
}
# candle-mobileone
[MobileOne: An Improved One millisecond Mobile Backbone](https://arxiv.org/abs/2206.04040).
This candle implementation uses a pre-trained MobileOne network for inference. The
classification head has been trained on the ImageNet dataset and returns the
probabilities for the top-5 classes.
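The reported percentages come from applying a softmax over the network's 1000 ImageNet logits and keeping the five largest entries; a minimal sketch of that post-processing step, assuming a logits tensor of shape `(1, 1000)` as produced by the example (the `top5` helper itself is illustrative only):
```rust
use candle::{D, IndexOp, Result, Tensor};

/// Turn raw classifier logits of shape (1, 1000) into the top-5
/// (class index, probability) pairs.
fn top5(logits: &Tensor) -> Result<Vec<(usize, f32)>> {
    // Softmax over the class dimension, then drop the batch dimension.
    let prs = candle_nn::ops::softmax(logits, D::Minus1)?
        .i(0)?
        .to_vec1::<f32>()?;
    // Sort class indices by decreasing probability and keep the first five.
    let mut prs: Vec<_> = prs.into_iter().enumerate().collect();
    prs.sort_by(|(_, p1), (_, p2)| p2.total_cmp(p1));
    Ok(prs.into_iter().take(5).collect())
}
```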
## Running an example
```
$ cargo run --example mobileone --release -- --image candle-examples/examples/yolo-v8/assets/bike.jpg --which s2
loaded image Tensor[dims 3, 224, 224; f32]
model built
mountain bike, all-terrain bike, off-roader: 79.33%
bicycle-built-for-two, tandem bicycle, tandem: 15.32%
crash helmet : 2.58%
unicycle, monocycle : 1.70%
alp : 0.21%
```
#[cfg(feature = "mkl")]
extern crate intel_mkl_src;
#[cfg(feature = "accelerate")]
extern crate accelerate_src;
use clap::{Parser, ValueEnum};
use candle::{DType, IndexOp, D};
use candle_nn::{Module, VarBuilder};
use candle_transformers::models::mobileone;
#[derive(Clone, Copy, Debug, ValueEnum)]
enum Which {
S0,
S1,
S2,
S3,
S4,
}
impl Which {
fn model_filename(&self) -> String {
let name = match self {
Self::S0 => "s0",
Self::S1 => "s1",
Self::S2 => "s2",
Self::S3 => "s3",
Self::S4 => "s4",
};
format!("timm/mobileone_{}.apple_in1k", name)
}
fn config(&self) -> mobileone::Config {
match self {
Self::S0 => mobileone::Config::s0(),
Self::S1 => mobileone::Config::s1(),
Self::S2 => mobileone::Config::s2(),
Self::S3 => mobileone::Config::s3(),
Self::S4 => mobileone::Config::s4(),
}
}
}
#[derive(Parser)]
struct Args {
#[arg(long)]
model: Option<String>,
#[arg(long)]
image: String,
/// Run on CPU rather than on GPU.
#[arg(long)]
cpu: bool,
#[arg(value_enum, long, default_value_t=Which::S0)]
which: Which,
}
pub fn main() -> anyhow::Result<()> {
let args = Args::parse();
let device = candle_examples::device(args.cpu)?;
let image = candle_examples::imagenet::load_image224(args.image)?.to_device(&device)?;
println!("loaded image {image:?}");
let model_file = match args.model {
None => {
let model_name = args.which.model_filename();
let api = hf_hub::api::sync::Api::new()?;
let api = api.model(model_name);
api.get("model.safetensors")?
}
Some(model) => model.into(),
};
let vb = unsafe { VarBuilder::from_mmaped_safetensors(&[model_file], DType::F32, &device)? };
let model = mobileone::mobileone(&args.which.config(), 1000, vb)?;
println!("model built");
let logits = model.forward(&image.unsqueeze(0)?)?;
let prs = candle_nn::ops::softmax(&logits, D::Minus1)?
.i(0)?
.to_vec1::<f32>()?;
let mut prs = prs.iter().enumerate().collect::<Vec<_>>();
prs.sort_by(|(_, p1), (_, p2)| p2.total_cmp(p1));
for &(category_idx, pr) in prs.iter().take(5) {
println!(
"{:24}: {:.2}%",
candle_examples::imagenet::CLASSES[category_idx],
100. * pr
);
}
Ok(())
}
# candle-moondream
[Moondream](https://github.com/vikhyat/moondream) is a computer-vision model that can answer real-world questions about images. It's tiny by today's standards, with only 1.6B parameters; that enables it to run on a variety of devices, including mobile phones and edge devices.
## Running some examples
First download an example image
```bash
$ wget https://raw.githubusercontent.com/vikhyat/moondream/main/assets/demo-1.jpg
```
<img src="https://raw.githubusercontent.com/vikhyat/moondream/main/assets/demo-1.jpg" width="200">
Now you can run Moondream from the `candle-examples` crate:
```bash
$ cargo run --example moondream --release -- --prompt "What is the girl eating?" --image "./demo-1.jpg"
avx: false, neon: true, simd128: false, f16c: false
temp: 0.00 repeat-penalty: 1.00 repeat-last-n: 64
retrieved the files in 3.395583ms
Running on CPU, to run on GPU(metal), build this example with `--features metal`
loaded the model in 5.485493792s
loaded and encoded the image Tensor[dims 3, 378, 378; f32] in 4.801396417s
starting the inference loop
The girl is eating a hamburger.<
9 tokens generated (0.68 token/s)
```
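Internally, the example wraps the question in a fixed question/answer template before tokenizing it; a minimal sketch of that formatting step, mirroring the `format!` call in the example source:
```rust
// Build the prompt exactly as the example does before tokenizing it.
fn build_prompt(question: &str) -> String {
    format!("\n\nQuestion: {question}\n\nAnswer:")
}
```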
#[cfg(feature = "mkl")]
extern crate intel_mkl_src;
#[cfg(feature = "accelerate")]
extern crate accelerate_src;
use anyhow::{Error as E, Result};
use clap::Parser;
use candle::{DType, Device, Tensor};
use candle_nn::VarBuilder;
use candle_transformers::{
generation::LogitsProcessor,
models::{moondream, quantized_moondream},
};
use tokenizers::Tokenizer;
enum Model {
Moondream(moondream::Model),
Quantized(quantized_moondream::Model),
}
struct TextGeneration {
model: Model,
device: Device,
tokenizer: Tokenizer,
logits_processor: LogitsProcessor,
repeat_penalty: f32,
repeat_last_n: usize,
verbose_prompt: bool,
}
impl TextGeneration {
#[allow(clippy::too_many_arguments)]
fn new(
model: Model,
tokenizer: Tokenizer,
seed: u64,
temp: Option<f64>,
top_p: Option<f64>,
repeat_penalty: f32,
repeat_last_n: usize,
verbose_prompt: bool,
device: &Device,
) -> Self {
let logits_processor = LogitsProcessor::new(seed, temp, top_p);
Self {
model,
tokenizer,
logits_processor,
repeat_penalty,
repeat_last_n,
verbose_prompt,
device: device.clone(),
}
}
fn run(&mut self, prompt: &str, image_embeds: &Tensor, sample_len: usize) -> Result<()> {
use std::io::Write;
println!("starting the inference loop");
let tokens = self.tokenizer.encode(prompt, true).map_err(E::msg)?;
if tokens.is_empty() {
anyhow::bail!("Empty prompts are not supported in the Moondream model.")
}
if self.verbose_prompt {
for (token, id) in tokens.get_tokens().iter().zip(tokens.get_ids().iter()) {
let token = token.replace('▁', " ").replace("<0x0A>", "\n");
println!("{id:7} -> '{token}'");
}
}
let mut tokens = tokens.get_ids().to_vec();
let mut generated_tokens = 0usize;
// The Moondream tokenizer uses "<|endoftext|>" as both the bos_token and the eos_token.
// https://huggingface.co/vikhyatk/moondream2/blob/main/special_tokens_map.json
let special_token = match self.tokenizer.get_vocab(true).get("<|endoftext|>") {
Some(token) => *token,
None => anyhow::bail!("cannot find the special token"),
};
let (bos_token, eos_token) = (special_token, special_token);
let start_gen = std::time::Instant::now();
let mut load_t = std::time::Duration::from_secs_f64(0f64);
for index in 0..sample_len {
let context_size = if index > 0 { 1 } else { tokens.len() };
let ctxt = &tokens[tokens.len().saturating_sub(context_size)..];
let input = Tensor::new(ctxt, &self.device)?.unsqueeze(0)?;
let logits = if index > 0 {
match self.model {
Model::Moondream(ref mut model) => model.text_model.forward(&input)?,
Model::Quantized(ref mut model) => model.text_model.forward(&input)?,
}
} else {
let bos_token = Tensor::new(&[bos_token], &self.device)?.unsqueeze(0)?;
let logits = match self.model {
Model::Moondream(ref mut model) => {
model
.text_model
.forward_with_img(&bos_token, &input, image_embeds)?
}
Model::Quantized(ref mut model) => {
model
.text_model
.forward_with_img(&bos_token, &input, image_embeds)?
}
};
load_t = start_gen.elapsed();
println!("load_t: {:?}", load_t);
logits
};
let logits = logits.squeeze(0)?.to_dtype(DType::F32)?;
let logits = if self.repeat_penalty == 1. {
logits
} else {
let start_at = tokens.len().saturating_sub(self.repeat_last_n);
candle_transformers::utils::apply_repeat_penalty(
&logits,
self.repeat_penalty,
&tokens[start_at..],
)?
};
let next_token = self.logits_processor.sample(&logits)?;
tokens.push(next_token);
generated_tokens += 1;
if next_token == eos_token || tokens.ends_with(&[27, 10619, 29] /* <END> */) {
break;
}
let token = self.tokenizer.decode(&[next_token], true).map_err(E::msg)?;
print!("{token}");
std::io::stdout().flush()?;
}
let dt = start_gen.elapsed() - load_t;
println!(
"\ngenerated in {} seconds\n{generated_tokens} tokens generated ({:.2} token/s)",
dt.as_secs_f64(),
(generated_tokens - 1) as f64 / dt.as_secs_f64()
);
Ok(())
}
}
#[derive(Parser)]
struct Args {
/// Run on CPU rather than on GPU.
#[arg(long)]
cpu: bool,
/// Enable tracing (generates a trace-timestamp.json file).
#[arg(long)]
tracing: bool,
/// Display the tokens for the specified prompt.
#[arg(long)]
verbose_prompt: bool,
#[arg(long)]
prompt: String,
#[arg(long)]
image: String,
/// The temperature used to generate samples.
#[arg(long)]
temperature: Option<f64>,
/// Nucleus sampling probability cutoff.
#[arg(long)]
top_p: Option<f64>,
/// The seed to use when generating random samples.
#[arg(long, default_value_t = 0)]
seed: u64,
#[arg(long, default_value_t = 5000)]
sample_len: usize,
/// Penalty to be applied for repeating tokens, 1. means no penalty.
#[arg(long, default_value_t = 1.0)]
repeat_penalty: f32,
/// The context size to consider for the repeat penalty.
#[arg(long, default_value_t = 64)]
repeat_last_n: usize,
#[arg(long)]
model_id: Option<String>,
#[arg(long, default_value = "main")]
revision: String,
#[arg(long)]
quantized: bool,
/// Use f16 precision for all the computations rather than f32.
#[arg(long)]
f16: bool,
#[arg(long)]
model_file: Option<String>,
#[arg(long)]
tokenizer_file: Option<String>,
}
/// Loads an image from disk using the image crate; this returns a tensor with shape
/// (3, 378, 378).
pub fn load_image<P: AsRef<std::path::Path>>(p: P) -> candle::Result<Tensor> {
let img = image::io::Reader::open(p)?
.decode()
.map_err(candle::Error::wrap)?
.resize_to_fill(378, 378, image::imageops::FilterType::Triangle); // Adjusted to 378x378
let img = img.to_rgb8();
let data = img.into_raw();
let data = Tensor::from_vec(data, (378, 378, 3), &Device::Cpu)?.permute((2, 0, 1))?;
let mean = Tensor::new(&[0.5f32, 0.5, 0.5], &Device::Cpu)?.reshape((3, 1, 1))?;
let std = Tensor::new(&[0.5f32, 0.5, 0.5], &Device::Cpu)?.reshape((3, 1, 1))?;
(data.to_dtype(candle::DType::F32)? / 255.)?
.broadcast_sub(&mean)?
.broadcast_div(&std)
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
use tracing_chrome::ChromeLayerBuilder;
use tracing_subscriber::prelude::*;
let args = Args::parse();
let _guard = if args.tracing {
let (chrome_layer, guard) = ChromeLayerBuilder::new().build();
tracing_subscriber::registry().with(chrome_layer).init();
Some(guard)
} else {
None
};
println!(
"avx: {}, neon: {}, simd128: {}, f16c: {}",
candle::utils::with_avx(),
candle::utils::with_neon(),
candle::utils::with_simd128(),
candle::utils::with_f16c()
);
println!(
"temp: {:.2} repeat-penalty: {:.2} repeat-last-n: {}",
args.temperature.unwrap_or(0.),
args.repeat_penalty,
args.repeat_last_n
);
let start = std::time::Instant::now();
let api = hf_hub::api::tokio::Api::new()?;
let model_id = match args.model_id {
Some(model_id) => model_id.to_string(),
None => {
if args.quantized {
"santiagomed/candle-moondream".to_string()
} else {
"vikhyatk/moondream2".to_string()
}
}
};
let repo = api.repo(hf_hub::Repo::with_revision(
model_id,
hf_hub::RepoType::Model,
args.revision,
));
let model_file = match args.model_file {
Some(m) => m.into(),
None => {
if args.quantized {
repo.get("model-q4_0.gguf").await?
} else {
repo.get("model.safetensors").await?
}
}
};
let tokenizer = match args.tokenizer_file {
Some(m) => m.into(),
None => repo.get("tokenizer.json").await?,
};
println!("retrieved the files in {:?}", start.elapsed());
let tokenizer = Tokenizer::from_file(tokenizer).map_err(E::msg)?;
let start = std::time::Instant::now();
let device = candle_examples::device(args.cpu)?;
let config = moondream::Config::v2();
let dtype = if args.quantized {
if args.f16 {
anyhow::bail!("Quantized model does not support f16");
}
DType::F32
} else if device.is_cuda() || args.f16 {
DType::F16
} else {
DType::F32
};
let model = if args.quantized {
let vb = candle_transformers::quantized_var_builder::VarBuilder::from_gguf(
&model_file,
&device,
)?;
let model = quantized_moondream::Model::new(&config, vb)?;
Model::Quantized(model)
} else {
let vb = unsafe { VarBuilder::from_mmaped_safetensors(&[model_file], dtype, &device)? };
let model = moondream::Model::new(&config, vb)?;
Model::Moondream(model)
};
println!("loaded the model in {:?}", start.elapsed());
let start = std::time::Instant::now();
let image = load_image(args.image)?
.to_device(&device)?
.to_dtype(dtype)?;
let image_embeds = image.unsqueeze(0)?;
let image_embeds = match model {
Model::Moondream(ref m) => image_embeds.apply(m.vision_encoder())?,
Model::Quantized(ref m) => image_embeds.apply(m.vision_encoder())?,
};
println!(
"loaded and encoded the image {image:?} in {:?}",
start.elapsed()
);
let prompt = format!("\n\nQuestion: {0}\n\nAnswer:", args.prompt);
let mut pipeline = TextGeneration::new(
model,
tokenizer,
args.seed,
args.temperature,
args.top_p,
args.repeat_penalty,
args.repeat_last_n,
args.verbose_prompt,
&device,
);
pipeline.run(&prompt, &image_embeds, args.sample_len)?;
Ok(())
}
#![allow(dead_code)]
// https://huggingface.co/facebook/musicgen-small/tree/main
// https://github.com/huggingface/transformers/blob/cd4584e3c809bb9e1392ccd3fe38b40daba5519a/src/transformers/models/musicgen/modeling_musicgen.py
// TODO: Add an offline mode.
// TODO: Add a KV cache.
#[cfg(any(feature = "mkl", feature = "mkl-dynamic"))]
extern crate intel_mkl_src;
#[cfg(feature = "accelerate")]
extern crate accelerate_src;
mod musicgen_model;
use musicgen_model::{GenConfig, MusicgenForConditionalGeneration};
use anyhow::{Error as E, Result};
use candle::{DType, Tensor};
use candle_nn::VarBuilder;
use clap::Parser;
use hf_hub::{api::sync::Api, Repo, RepoType};
const DTYPE: DType = DType::F32;
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
/// Run on CPU rather than on GPU.
#[arg(long)]
cpu: bool,
/// The model weight file, in safetensors format.
#[arg(long)]
model: Option<String>,
/// The tokenizer config.
#[arg(long)]
tokenizer: Option<String>,
#[arg(
long,
default_value = "90s rock song with loud guitars and heavy drums"
)]
prompt: String,
}
fn main() -> Result<()> {
use tokenizers::Tokenizer;
let args = Args::parse();
let device = candle_examples::device(args.cpu)?;
let tokenizer = match args.tokenizer {
Some(tokenizer) => std::path::PathBuf::from(tokenizer),
None => Api::new()?
.model("facebook/musicgen-small".to_string())
.get("tokenizer.json")?,
};
let mut tokenizer = Tokenizer::from_file(tokenizer).map_err(E::msg)?;
let tokenizer = tokenizer
.with_padding(None)
.with_truncation(None)
.map_err(E::msg)?;
let model = match args.model {
Some(model) => std::path::PathBuf::from(model),
None => Api::new()?
.repo(Repo::with_revision(
"facebook/musicgen-small".to_string(),
RepoType::Model,
"refs/pr/13".to_string(),
))
.get("model.safetensors")?,
};
let vb = unsafe { VarBuilder::from_mmaped_safetensors(&[model], DTYPE, &device)? };
let config = GenConfig::small();
let mut model = MusicgenForConditionalGeneration::load(vb, config)?;
let tokens = tokenizer
.encode(args.prompt.as_str(), true)
.map_err(E::msg)?
.get_ids()
.to_vec();
println!("tokens: {tokens:?}");
let tokens = Tensor::new(tokens.as_slice(), &device)?.unsqueeze(0)?;
println!("{tokens:?}");
let embeds = model.text_encoder.forward(&tokens)?;
println!("{embeds}");
Ok(())
}
use candle::{DType, Device, Result, Tensor, D};
use candle_nn::{
embedding, layer_norm, linear_no_bias, Activation, Embedding, LayerNorm, Linear, Module,
VarBuilder,
};
use candle_transformers::models::{encodec, t5};
// https://github.com/huggingface/transformers/blob/cd4584e3c809bb9e1392ccd3fe38b40daba5519a/src/transformers/models/musicgen/configuration_musicgen.py#L83
#[derive(Debug, Clone, PartialEq)]
pub struct Config {
vocab_size: usize,
max_position_embeddings: usize,
num_hidden_layers: usize,
ffn_dim: usize,
num_attention_heads: usize,
layerdrop: f64,
use_cache: bool,
activation_function: Activation,
hidden_size: usize,
dropout: f64,
attention_dropout: f64,
activation_dropout: f64,
initializer_factor: f64,
scale_embedding: bool,
num_codebooks: usize,
pad_token_id: usize,
bos_token_id: usize,
eos_token_id: Option<usize>,
tie_word_embeddings: bool,
}
impl Default for Config {
fn default() -> Self {
Self {
vocab_size: 2048,
max_position_embeddings: 2048,
num_hidden_layers: 24,
ffn_dim: 4096,
num_attention_heads: 16,
layerdrop: 0.0,
use_cache: true,
activation_function: Activation::Gelu,
hidden_size: 1024,
dropout: 0.1,
attention_dropout: 0.0,
activation_dropout: 0.0,
initializer_factor: 0.02,
scale_embedding: false,
num_codebooks: 4,
pad_token_id: 2048,
bos_token_id: 2048,
eos_token_id: None,
tie_word_embeddings: false,
}
}
}
impl Config {
fn musicgen_small() -> Self {
Self {
vocab_size: 2048,
max_position_embeddings: 2048,
num_hidden_layers: 24,
ffn_dim: 4096,
num_attention_heads: 16,
layerdrop: 0.0,
use_cache: true,
activation_function: Activation::Gelu,
hidden_size: 1024,
dropout: 0.1,
attention_dropout: 0.0,
activation_dropout: 0.0,
initializer_factor: 0.02,
scale_embedding: false,
num_codebooks: 4,
pad_token_id: 2048,
bos_token_id: 2048,
eos_token_id: None,
tie_word_embeddings: false,
}
}
}
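// Build fixed sinusoidal position embeddings: each position is encoded with
// cosines over the first half of the channels and sines over the second half,
// using geometrically decreasing frequencies 10000^(-i / (half_dim - 1)).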
fn get_embedding(num_embeddings: usize, embedding_dim: usize) -> Result<Tensor> {
let half_dim = embedding_dim / 2;
let emb = f64::ln(10000.) / (half_dim - 1) as f64;
let xs: Vec<_> = (0..num_embeddings).map(|v| v as f32).collect();
let xs = Tensor::from_vec(xs, (num_embeddings, 1), &Device::Cpu)?;
let ys: Vec<_> = (0..half_dim)
.map(|v| f64::exp(v as f64 * -emb) as f32)
.collect();
let ys = Tensor::from_vec(ys, (1, half_dim), &Device::Cpu)?;
let shape = (num_embeddings, half_dim);
let emb = (xs.broadcast_as(shape)? * ys.broadcast_as(shape)?)?;
let emb =
Tensor::cat(&[&emb.cos()?, &emb.sin()?], 1)?.reshape((num_embeddings, 2 * half_dim))?;
let emb = if embedding_dim % 2 == 1 {
let zeros = Tensor::zeros((num_embeddings, 1), DType::F32, &Device::Cpu)?;
Tensor::cat(&[&emb, &zeros], 1)?
} else {
emb
};
Ok(emb)
}
#[derive(Debug)]
struct MusicgenSinusoidalPositionalEmbedding {
num_positions: usize,
embedding_dim: usize,
weights: Tensor,
}
impl MusicgenSinusoidalPositionalEmbedding {
fn load(_vb: VarBuilder, cfg: &Config) -> Result<Self> {
let num_positions = cfg.max_position_embeddings;
let embedding_dim = cfg.hidden_size;
let weights = get_embedding(num_positions, embedding_dim)?;
Ok(Self {
num_positions,
embedding_dim,
weights,
})
}
fn forward(&mut self, input_ids: &Tensor) -> Result<Tensor> {
let (_b_sz, _codebooks, seq_len) = input_ids.dims3()?;
if seq_len > self.weights.dim(0)? {
self.weights = get_embedding(seq_len, self.embedding_dim)?
}
self.weights.narrow(0, 0, seq_len)
}
}
#[derive(Debug)]
struct MusicgenAttention {
scaling: f64,
is_decoder: bool,
num_heads: usize,
head_dim: usize,
k_proj: Linear,
v_proj: Linear,
q_proj: Linear,
out_proj: Linear,
}
impl MusicgenAttention {
fn load(vb: VarBuilder, cfg: &Config) -> Result<Self> {
let h = cfg.hidden_size;
let num_heads = cfg.num_attention_heads;
let head_dim = h / num_heads;
let k_proj = linear_no_bias(h, h, vb.pp("k_proj"))?;
let v_proj = linear_no_bias(h, h, vb.pp("v_proj"))?;
let q_proj = linear_no_bias(h, h, vb.pp("q_proj"))?;
let out_proj = linear_no_bias(h, h, vb.pp("out_proj"))?;
Ok(Self {
scaling: 1. / (head_dim as f64).sqrt(),
is_decoder: true,
num_heads,
head_dim,
k_proj,
v_proj,
q_proj,
out_proj,
})
}
fn forward(
&mut self,
xs: &Tensor,
kv_states: Option<&Tensor>,
attention_mask: &Tensor,
) -> Result<Tensor> {
let (b_sz, tgt_len, _) = xs.dims3()?;
let query_states = (self.q_proj.forward(xs)? * self.scaling)?;
let kv_states = kv_states.unwrap_or(xs);
let key_states = self.k_proj.forward(kv_states)?;
let value_states = self.v_proj.forward(kv_states)?;
let tgt = (b_sz, tgt_len, self.num_heads, self.head_dim);
let query_states = query_states.reshape(tgt)?.transpose(1, 2)?.contiguous()?;
let key_states = key_states.reshape(tgt)?.transpose(1, 2)?.contiguous()?;
let value_states = value_states.reshape(tgt)?.transpose(1, 2)?.contiguous()?;
let src_len = key_states.dim(1)?;
let attn_weights = query_states.matmul(&key_states.transpose(1, 2)?)?;
let attn_weights = attn_weights
.reshape((b_sz, self.num_heads, tgt_len, src_len))?
.broadcast_add(attention_mask)?;
let attn_weights = candle_nn::ops::softmax(&attn_weights, D::Minus1)?;
// TODO: layer_head_mask?
let attn_output = attn_weights
.matmul(&value_states)?
.reshape((b_sz, self.num_heads, tgt_len, self.head_dim))?
.transpose(1, 2)?
.reshape((b_sz, tgt_len, self.num_heads * self.head_dim))?;
let attn_output = self.out_proj.forward(&attn_output)?;
Ok(attn_output)
}
}
#[derive(Debug)]
struct MusicgenDecoderLayer {
self_attn: MusicgenAttention,
self_attn_layer_norm: LayerNorm,
encoder_attn: MusicgenAttention,
encoder_attn_layer_norm: LayerNorm,
fc1: Linear,
fc2: Linear,
final_layer_norm: LayerNorm,
activation_fn: Activation,
}
impl MusicgenDecoderLayer {
fn load(vb: VarBuilder, cfg: &Config) -> Result<Self> {
let h = cfg.hidden_size;
let self_attn = MusicgenAttention::load(vb.pp("self_attn"), cfg)?;
let self_attn_layer_norm = layer_norm(h, 1e-5, vb.pp("self_attn_layer_norm"))?;
let encoder_attn = MusicgenAttention::load(vb.pp("encoder_attn"), cfg)?;
let encoder_attn_layer_norm = layer_norm(h, 1e-5, vb.pp("encoder_attn_layer_norm"))?;
let fc1 = linear_no_bias(h, cfg.ffn_dim, vb.pp("fc1"))?;
let fc2 = linear_no_bias(cfg.ffn_dim, h, vb.pp("fc2"))?;
let final_layer_norm = layer_norm(h, 1e-5, vb.pp("final_layer_norm"))?;
Ok(Self {
self_attn,
self_attn_layer_norm,
encoder_attn,
encoder_attn_layer_norm,
fc1,
fc2,
final_layer_norm,
activation_fn: cfg.activation_function,
})
}
fn forward(
&mut self,
xs: &Tensor,
attention_mask: &Tensor,
encoder_hidden_states: Option<&Tensor>,
) -> Result<Tensor> {
let residual = xs.clone();
let xs = self.self_attn_layer_norm.forward(xs)?;
let xs = self.self_attn.forward(&xs, None, attention_mask)?;
let mut xs = (xs + residual)?;
if let Some(encoder_hidden_states) = &encoder_hidden_states {
let residual = xs.clone();
let encoder_attention_mask = attention_mask.clone(); // TODO
xs = self.encoder_attn.forward(
&xs,
Some(encoder_hidden_states),
&encoder_attention_mask,
)?;
xs = (xs + residual)?
}
let residual = xs.clone();
let xs = self.final_layer_norm.forward(&xs)?;
let xs = self.fc1.forward(&xs)?;
let xs = self.activation_fn.forward(&xs)?;
let xs = self.fc2.forward(&xs)?;
let xs = (xs + residual)?;
Ok(xs)
}
}
#[derive(Debug)]
struct MusicgenDecoder {
embed_tokens: Vec<Embedding>,
embed_positions: MusicgenSinusoidalPositionalEmbedding,
layers: Vec<MusicgenDecoderLayer>,
layer_norm: LayerNorm,
embed_scale: f64,
num_codebooks: usize,
d_model: usize,
}
impl MusicgenDecoder {
fn load(vb: VarBuilder, cfg: &Config) -> Result<Self> {
let h = cfg.hidden_size;
let embed_scale = if cfg.scale_embedding {
(h as f64).sqrt()
} else {
1.
};
let embed_dim = cfg.vocab_size + 1;
let embed_tokens = (0..cfg.num_codebooks)
.map(|i| embedding(embed_dim, h, vb.pp(&format!("embed_tokens.{i}"))))
.collect::<Result<Vec<_>>>()?;
let embed_positions = MusicgenSinusoidalPositionalEmbedding::load(vb.clone(), cfg)?;
let layers = (0..cfg.num_hidden_layers)
.map(|i| MusicgenDecoderLayer::load(vb.pp(&format!("layers.{i}")), cfg))
.collect::<Result<Vec<_>>>()?;
let layer_norm = layer_norm(h, 1e-5, vb.pp("layer_norm"))?;
Ok(Self {
embed_tokens,
embed_positions,
layers,
layer_norm,
embed_scale,
num_codebooks: cfg.num_codebooks,
d_model: cfg.hidden_size,
})
}
fn prepare_decoder_attention_mask(&self, _b_sz: usize, _seq_len: usize) -> Result<Tensor> {
todo!()
}
fn forward(&mut self, input_ids: &Tensor) -> Result<Tensor> {
let dev = input_ids.device();
let (b_sz_times_codebooks, seq_len) = input_ids.dims2()?;
let b_sz = b_sz_times_codebooks / self.num_codebooks;
let input = input_ids.reshape((b_sz, self.num_codebooks, seq_len))?;
let mut inputs_embeds = Tensor::zeros((b_sz, seq_len, self.d_model), DType::F32, dev)?;
for (idx, codebook) in self.embed_tokens.iter().enumerate() {
let inp = input.narrow(1, idx, 1)?.squeeze(1)?;
inputs_embeds = (inputs_embeds + codebook.forward(&inp)?)?
}
let inputs_embeds = inputs_embeds;
let positions = self.embed_positions.forward(&input)?.to_device(dev)?;
let mut xs = inputs_embeds.broadcast_add(&positions)?;
let attention_mask = self.prepare_decoder_attention_mask(b_sz, seq_len)?;
for decoder_layer in self.layers.iter_mut() {
xs = decoder_layer.forward(&xs, &attention_mask, None)?;
}
let xs = self.layer_norm.forward(&xs)?;
Ok(xs)
}
}
#[derive(Debug)]
pub struct MusicgenForCausalLM {
decoder: MusicgenDecoder,
lm_heads: Vec<Linear>,
num_codebooks: usize,
vocab_size: usize,
}
impl MusicgenForCausalLM {
pub fn load(vb: VarBuilder, cfg: &Config) -> Result<Self> {
let h = cfg.hidden_size;
let decoder = MusicgenDecoder::load(vb.pp("model.decoder"), cfg)?;
let lm_heads = (0..cfg.num_codebooks)
.map(|i| linear_no_bias(h, cfg.vocab_size, vb.pp(&format!("lm_heads.{i}"))))
.collect::<Result<Vec<_>>>()?;
Ok(Self {
decoder,
lm_heads,
num_codebooks: cfg.num_codebooks,
vocab_size: cfg.vocab_size,
})
}
pub fn forward(&mut self, input_ids: &Tensor) -> Result<Tensor> {
let (b_sz, seq_len) = input_ids.dims2()?;
let hidden_states = self.decoder.forward(input_ids)?;
let lm_logits = self
.lm_heads
.iter()
.map(|h| h.forward(&hidden_states))
.collect::<Result<Vec<_>>>()?;
let lm_logits = Tensor::stack(&lm_logits, 1)?.reshape((
b_sz * self.num_codebooks,
seq_len,
self.vocab_size,
))?;
Ok(lm_logits)
}
}
#[derive(Debug)]
pub struct MusicgenForConditionalGeneration {
pub text_encoder: t5::T5EncoderModel,
pub audio_encoder: encodec::Model,
pub decoder: MusicgenForCausalLM,
cfg: GenConfig,
}
#[derive(Debug, Clone, PartialEq)]
pub struct GenConfig {
musicgen: Config,
t5: t5::Config,
encodec: encodec::Config,
}
impl GenConfig {
pub fn small() -> Self {
// https://huggingface.co/facebook/musicgen-small/blob/495da4ad086b3416a27c6187f9239f9fd96f3962/config.json#L6
let encodec = encodec::Config {
audio_channels: 1,
chunk_length_s: None,
codebook_dim: Some(128),
codebook_size: 2048,
compress: 2,
dilation_growth_rate: 2,
hidden_size: 128,
kernel_size: 7,
last_kernel_size: 7,
norm_type: encodec::NormType::WeightNorm,
normalize: false,
num_filters: 64,
num_lstm_layers: 2,
num_residual_layers: 1,
overlap: None,
// This should be Reflect and not Replicate but Reflect does not work yet.
pad_mode: encodec::PadMode::Replicate,
residual_kernel_size: 3,
sampling_rate: 32_000,
target_bandwidths: vec![2.2],
trim_right_ratio: 1.0,
upsampling_ratios: vec![8, 5, 4, 4],
use_causal_conv: false,
use_conv_shortcut: false,
};
Self {
musicgen: Config::musicgen_small(),
t5: t5::Config::musicgen_small(),
encodec,
}
}
}
impl MusicgenForConditionalGeneration {
pub fn config(&self) -> &GenConfig {
&self.cfg
}
pub fn load(vb: VarBuilder, cfg: GenConfig) -> Result<Self> {
let text_encoder = t5::T5EncoderModel::load(vb.pp("text_encoder"), &cfg.t5)?;
let audio_encoder = encodec::Model::new(&cfg.encodec, vb.pp("audio_encoder"))?;
let decoder = MusicgenForCausalLM::load(vb.pp("decoder"), &cfg.musicgen)?;
Ok(Self {
text_encoder,
audio_encoder,
decoder,
cfg,
})
}
}
## Using ONNX models in Candle
This example demonstrates how to run [ONNX](https://github.com/onnx/onnx)-based models in Candle.
It contains small variants of two models, [SqueezeNet](https://arxiv.org/pdf/1602.07360.pdf) (the default) and [EfficientNet](https://arxiv.org/pdf/1905.11946.pdf).
You can run the examples with the following commands:
```bash
cargo run --example onnx --features=onnx --release -- --image candle-examples/examples/yolo-v8/assets/bike.jpg
```
Use the `--which` flag to explicitly specify which network to use, e.g.
```bash
$ cargo run --example onnx --features=onnx --release -- --which squeeze-net --image candle-examples/examples/yolo-v8/assets/bike.jpg
Finished release [optimized] target(s) in 0.21s
Running `target/release/examples/onnx --which squeeze-net --image candle-examples/examples/yolo-v8/assets/bike.jpg`
loaded image Tensor[dims 3, 224, 224; f32]
unicycle, monocycle : 83.23%
ballplayer, baseball player : 3.68%
bearskin, busby, shako : 1.54%
military uniform : 0.78%
cowboy hat, ten-gallon hat : 0.76%
```
```bash
$ cargo run --example onnx --features=onnx --release -- --which efficient-net --image candle-examples/examples/yolo-v8/assets/bike.jpg
Finished release [optimized] target(s) in 0.20s
Running `target/release/examples/onnx --which efficient-net --image candle-examples/examples/yolo-v8/assets/bike.jpg`
loaded image Tensor[dims 224, 224, 3; f32]
bicycle-built-for-two, tandem bicycle, tandem : 99.16%
mountain bike, all-terrain bike, off-roader : 0.60%
unicycle, monocycle : 0.17%
crash helmet : 0.02%
alp : 0.02%
```
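Under the hood, the example parses the `.onnx` file with `candle_onnx::read_file` and evaluates the graph with `candle_onnx::simple_eval`, passing a map from graph input names to tensors. A minimal sketch of that flow, assuming a single-input, single-output graph and a hypothetical `model.onnx` path:
```rust
use std::collections::HashMap;

use anyhow::Result;
use candle::Tensor;

fn run_onnx(input: Tensor) -> Result<Tensor> {
    // Parse the ONNX protobuf into a candle-onnx model description.
    let model = candle_onnx::read_file("model.onnx")?;
    let graph = model.graph.as_ref().unwrap();
    // simple_eval expects a map from graph input names to tensors.
    let mut inputs = HashMap::new();
    inputs.insert(graph.input[0].name.to_string(), input);
    let mut outputs = candle_onnx::simple_eval(&model, inputs)?;
    // The outputs are keyed by the graph's output names.
    Ok(outputs.remove(&graph.output[0].name).unwrap())
}
```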
#[cfg(feature = "mkl")]
extern crate intel_mkl_src;
#[cfg(feature = "accelerate")]
extern crate accelerate_src;
use candle::{IndexOp, D};
use clap::{Parser, ValueEnum};
#[derive(Clone, Copy, Debug, ValueEnum)]
enum Which {
SqueezeNet,
EfficientNet,
}
#[derive(Parser)]
struct Args {
#[arg(long)]
image: String,
#[arg(long)]
model: Option<String>,
/// The model to be used.
#[arg(value_enum, long, default_value_t = Which::SqueezeNet)]
which: Which,
}
pub fn main() -> anyhow::Result<()> {
let args = Args::parse();
let image = candle_examples::imagenet::load_image224(args.image)?;
let image = match args.which {
Which::SqueezeNet => image,
Which::EfficientNet => image.permute((1, 2, 0))?,
};
println!("loaded image {image:?}");
let model = match args.model {
Some(model) => std::path::PathBuf::from(model),
None => match args.which {
Which::SqueezeNet => hf_hub::api::sync::Api::new()?
.model("lmz/candle-onnx".into())
.get("squeezenet1.1-7.onnx")?,
Which::EfficientNet => hf_hub::api::sync::Api::new()?
.model("onnx/EfficientNet-Lite4".into())
.get("efficientnet-lite4-11.onnx")?,
},
};
let model = candle_onnx::read_file(model)?;
let graph = model.graph.as_ref().unwrap();
let mut inputs = std::collections::HashMap::new();
inputs.insert(graph.input[0].name.to_string(), image.unsqueeze(0)?);
let mut outputs = candle_onnx::simple_eval(&model, inputs)?;
let output = outputs.remove(&graph.output[0].name).unwrap();
let prs = match args.which {
Which::SqueezeNet => candle_nn::ops::softmax(&output, D::Minus1)?,
Which::EfficientNet => output,
};
let prs = prs.i(0)?.to_vec1::<f32>()?;
// Sort the predictions and take the top 5
let mut top: Vec<_> = prs.iter().enumerate().collect();
top.sort_by(|a, b| b.1.partial_cmp(a.1).unwrap());
let top = top.into_iter().take(5).collect::<Vec<_>>();
// Print the top predictions
for &(i, p) in &top {
println!(
"{:50}: {:.2}%",
candle_examples::imagenet::CLASSES[i],
p * 100.0
);
}
Ok(())
}
use anyhow::Result;
use candle::{Device, Tensor};
use clap::{Parser, Subcommand};
#[derive(Subcommand, Debug, Clone)]
enum Command {
Print {
#[arg(long)]
file: String,
},
SimpleEval {
#[arg(long)]
file: String,
},
}
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
pub struct Args {
#[command(subcommand)]
command: Command,
}
pub fn main() -> Result<()> {
let args = Args::parse();
match args.command {
Command::Print { file } => {
let model = candle_onnx::read_file(file)?;
println!("{model:?}");
let graph = model.graph.unwrap();
for node in graph.node.iter() {
println!("{node:?}");
}
}
Command::SimpleEval { file } => {
let model = candle_onnx::read_file(file)?;
let graph = model.graph.as_ref().unwrap();
let constants: std::collections::HashSet<_> =
graph.initializer.iter().map(|i| i.name.as_str()).collect();
let mut inputs = std::collections::HashMap::new();
for input in graph.input.iter() {
use candle_onnx::onnx::tensor_proto::DataType;
if constants.contains(input.name.as_str()) {
continue;
}
let type_ = input.r#type.as_ref().expect("no type for input");
let type_ = type_.value.as_ref().expect("no type.value for input");
let value = match type_ {
candle_onnx::onnx::type_proto::Value::TensorType(tt) => {
let dt = match DataType::try_from(tt.elem_type) {
Ok(dt) => match candle_onnx::dtype(dt) {
Some(dt) => dt,
None => {
anyhow::bail!(
"unsupported 'value' data-type {dt:?} for {}",
input.name
)
}
},
type_ => anyhow::bail!("unsupported input type {type_:?}"),
};
let shape = tt.shape.as_ref().expect("no tensortype.shape for input");
let dims = shape
.dim
.iter()
.map(|dim| match dim.value.as_ref().expect("no dim value") {
candle_onnx::onnx::tensor_shape_proto::dimension::Value::DimValue(v) => Ok(*v as usize),
candle_onnx::onnx::tensor_shape_proto::dimension::Value::DimParam(_) => Ok(42),
})
.collect::<Result<Vec<usize>>>()?;
Tensor::zeros(dims, dt, &Device::Cpu)?
}
type_ => anyhow::bail!("unsupported input type {type_:?}"),
};
println!("input {}: {value:?}", input.name);
inputs.insert(input.name.clone(), value);
}
let outputs = candle_onnx::simple_eval(&model, inputs)?;
for (name, value) in outputs.iter() {
println!("output {name}: {value:?}")
}
}
}
Ok(())
}
# candle-phi: 1.3b and 2.7b LLMs with state-of-the-art performance for <10b models.
[Phi-1.5](https://huggingface.co/microsoft/phi-1_5) and
[Phi-2](https://huggingface.co/microsoft/phi-2) are language models using
only 1.3 and 2.7 billion parameters, yet with state-of-the-art performance compared to
models with up to 10 billion parameters.
The candle implementation provides both the standard version and a quantized variant.
## Running some examples
For the v2 version.
```bash
$ cargo run --example phi --release -- --model 2 \
--prompt "A skier slides down a frictionless slope of height 40m and length 80m. What's the skier speed at the bottom?"
A skier slides down a frictionless slope of height 40m and length 80m. What's the skier speed at the bottom?
Solution:
The potential energy of the skier is converted into kinetic energy as it slides down the slope. The formula for potential energy is mgh, where m is mass, g is acceleration due to gravity (9.8 m/s^2), and h is height. Since there's no friction, all the potential energy is converted into kinetic energy at the bottom of the slope. The formula for kinetic energy is 1/2mv^2, where v is velocity. We can equate these two formulas:
mgh = 1/2mv^2
Solving for v, we get:
v = sqrt(2gh)
Substituting the given values, we get:
v = sqrt(2*9.8*40) = 28 m/s
Therefore, the skier speed at the bottom of the slope is 28 m/s.
```
For the v1.5 version.
```bash
$ cargo run --example phi --release -- --prompt "def print_prime(n): "
def print_prime(n):
print("Printing prime numbers")
for i in range(2, n+1):
if is_prime(i):
print(i)
def is_prime(n):
if n <= 1:
return False
for i in range(2, int(math.sqrt(n))+1):
if n % i == 0:
return False
return True
$ cargo run --example phi --release -- \
--prompt "Explain how to find the median of an array and write the corresponding python function.\nAnswer:" \
--quantized --sample-len 200
Explain how to find the median of an array and write the corresponding python function.
Answer: The median is the middle value in an array. If the array has an even number of elements, the median is the average of the two middle values.
def median(arr):
arr.sort()
n = len(arr)
if n % 2 == 0:
return (arr[n//2 - 1] + arr[n//2]) / 2
else:
return arr[n//2]
```
This example also supports the [Puffin Phi v2 model](https://huggingface.co/teknium/Puffin-Phi-v2) for human interaction.
```
$ cargo run --example phi --release -- \
--prompt "USER: What would you do on a sunny day in Paris?\nASSISTANT:" \
--sample-len 200 --model puffin-phi-v2 --quantized
USER: What would you do on a sunny day in Paris?
ASSISTANT: On a sunny day in Paris, you could visit the Musée du Louvre to admire the famous
painting "Mona Lisa" by Leonardo da Vinci. You might also want to stroll along the Champs-Élysées
and enjoy the beautiful architecture of the buildings around you. Don't forget to stop by a café
for a cup of coffee and to soak up the sun!"
```
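At each decoding step, the example optionally applies a repeat penalty over the last `repeat-last-n` tokens before sampling the next token with the `LogitsProcessor`; a minimal sketch of that step (the function name and shapes are illustrative):
```rust
use anyhow::Result;
use candle::{DType, Tensor};
use candle_transformers::generation::LogitsProcessor;

fn sample_next_token(
    logits: &Tensor,  // logits for the last position, shape (vocab_size,)
    tokens: &[u32],   // tokens generated so far
    logits_processor: &mut LogitsProcessor,
    repeat_penalty: f32,
    repeat_last_n: usize,
) -> Result<u32> {
    let logits = logits.to_dtype(DType::F32)?;
    let logits = if repeat_penalty == 1. {
        logits
    } else {
        // Penalize tokens that already appeared in the recent context.
        let start_at = tokens.len().saturating_sub(repeat_last_n);
        candle_transformers::utils::apply_repeat_penalty(
            &logits,
            repeat_penalty,
            &tokens[start_at..],
        )?
    };
    Ok(logits_processor.sample(&logits)?)
}
```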
#[cfg(any(feature = "mkl", feature = "mkl-dynamic"))]
extern crate intel_mkl_src;
#[cfg(feature = "accelerate")]
extern crate accelerate_src;
use anyhow::{Error as E, Result};
use clap::{Parser, ValueEnum};
use candle_transformers::models::mixformer::{Config, MixFormerSequentialForCausalLM as MixFormer};
use candle_transformers::models::phi::{Config as PhiConfig, Model as Phi};
use candle_transformers::models::quantized_mixformer::MixFormerSequentialForCausalLM as QMixFormer;
use candle::{DType, Device, Tensor};
use candle_nn::VarBuilder;
use candle_transformers::generation::LogitsProcessor;
use hf_hub::{api::sync::Api, Repo, RepoType};
use tokenizers::Tokenizer;
enum Model {
MixFormer(MixFormer),
Phi(Phi),
Quantized(QMixFormer),
}
struct TextGeneration {
model: Model,
device: Device,
tokenizer: Tokenizer,
logits_processor: LogitsProcessor,
repeat_penalty: f32,
repeat_last_n: usize,
verbose_prompt: bool,
}
impl TextGeneration {
#[allow(clippy::too_many_arguments)]
fn new(
model: Model,
tokenizer: Tokenizer,
seed: u64,
temp: Option<f64>,
top_p: Option<f64>,
repeat_penalty: f32,
repeat_last_n: usize,
verbose_prompt: bool,
device: &Device,
) -> Self {
let logits_processor = LogitsProcessor::new(seed, temp, top_p);
Self {
model,
tokenizer,
logits_processor,
repeat_penalty,
repeat_last_n,
verbose_prompt,
device: device.clone(),
}
}
fn run(&mut self, prompt: &str, sample_len: usize) -> Result<()> {
use std::io::Write;
println!("starting the inference loop");
let tokens = self.tokenizer.encode(prompt, true).map_err(E::msg)?;
if tokens.is_empty() {
anyhow::bail!("Empty prompts are not supported in the phi model.")
}
if self.verbose_prompt {
for (token, id) in tokens.get_tokens().iter().zip(tokens.get_ids().iter()) {
let token = token.replace('▁', " ").replace("<0x0A>", "\n");
println!("{id:7} -> '{token}'");
}
}
let mut tokens = tokens.get_ids().to_vec();
let mut generated_tokens = 0usize;
let eos_token = match self.tokenizer.get_vocab(true).get("<|endoftext|>") {
Some(token) => *token,
None => anyhow::bail!("cannot find the endoftext token"),
};
print!("{prompt}");
std::io::stdout().flush()?;
let start_gen = std::time::Instant::now();
for index in 0..sample_len {
let context_size = if index > 0 { 1 } else { tokens.len() };
let ctxt = &tokens[tokens.len().saturating_sub(context_size)..];
let input = Tensor::new(ctxt, &self.device)?.unsqueeze(0)?;
let logits = match &mut self.model {
Model::MixFormer(m) => m.forward(&input)?,
Model::Phi(m) => m.forward(&input)?,
Model::Quantized(m) => m.forward(&input)?,
};
let logits = logits.squeeze(0)?.to_dtype(DType::F32)?;
let logits = if self.repeat_penalty == 1. {
logits
} else {
let start_at = tokens.len().saturating_sub(self.repeat_last_n);
candle_transformers::utils::apply_repeat_penalty(
&logits,
self.repeat_penalty,
&tokens[start_at..],
)?
};
let next_token = self.logits_processor.sample(&logits)?;
tokens.push(next_token);
generated_tokens += 1;
if next_token == eos_token {
break;
}
let token = self.tokenizer.decode(&[next_token], true).map_err(E::msg)?;
print!("{token}");
std::io::stdout().flush()?;
}
let dt = start_gen.elapsed();
println!(
"\n{generated_tokens} tokens generated ({:.2} token/s)",
generated_tokens as f64 / dt.as_secs_f64(),
);
Ok(())
}
}
#[derive(Clone, Copy, Debug, ValueEnum, PartialEq, Eq)]
enum WhichModel {
#[value(name = "1")]
V1,
#[value(name = "1.5")]
V1_5,
#[value(name = "2")]
V2,
#[value(name = "2-old")]
V2Old,
PuffinPhiV2,
PhiHermes,
}
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
/// Run on CPU rather than on GPU.
#[arg(long)]
cpu: bool,
/// Enable tracing (generates a trace-timestamp.json file).
#[arg(long)]
tracing: bool,
/// Display the tokens for the specified prompt.
#[arg(long)]
verbose_prompt: bool,
#[arg(long)]
prompt: Option<String>,
#[arg(long)]
mmlu_dir: Option<String>,
/// The temperature used to generate samples.
#[arg(long)]
temperature: Option<f64>,
/// Nucleus sampling probability cutoff.
#[arg(long)]
top_p: Option<f64>,
/// The seed to use when generating random samples.
#[arg(long, default_value_t = 299792458)]
seed: u64,
/// The length of the sample to generate (in tokens).
#[arg(long, short = 'n', default_value_t = 5000)]
sample_len: usize,
#[arg(long)]
model_id: Option<String>,
#[arg(long, default_value = "2")]
model: WhichModel,
#[arg(long)]
revision: Option<String>,
#[arg(long)]
weight_file: Option<String>,
#[arg(long)]
tokenizer: Option<String>,
#[arg(long)]
quantized: bool,
/// Penalty to be applied for repeating tokens, 1. means no penalty.
#[arg(long, default_value_t = 1.1)]
repeat_penalty: f32,
/// The context size to consider for the repeat penalty.
#[arg(long, default_value_t = 64)]
repeat_last_n: usize,
}
fn main() -> Result<()> {
use tracing_chrome::ChromeLayerBuilder;
use tracing_subscriber::prelude::*;
let args = Args::parse();
let _guard = if args.tracing {
let (chrome_layer, guard) = ChromeLayerBuilder::new().build();
tracing_subscriber::registry().with(chrome_layer).init();
Some(guard)
} else {
None
};
println!(
"avx: {}, neon: {}, simd128: {}, f16c: {}",
candle::utils::with_avx(),
candle::utils::with_neon(),
candle::utils::with_simd128(),
candle::utils::with_f16c()
);
println!(
"temp: {:.2} repeat-penalty: {:.2} repeat-last-n: {}",
args.temperature.unwrap_or(0.),
args.repeat_penalty,
args.repeat_last_n
);
let start = std::time::Instant::now();
let api = Api::new()?;
let model_id = match args.model_id {
Some(model_id) => model_id.to_string(),
None => {
if args.quantized {
"lmz/candle-quantized-phi".to_string()
} else {
match args.model {
WhichModel::V1 => "microsoft/phi-1".to_string(),
WhichModel::V1_5 => "microsoft/phi-1_5".to_string(),
WhichModel::V2 | WhichModel::V2Old => "microsoft/phi-2".to_string(),
WhichModel::PuffinPhiV2 | WhichModel::PhiHermes => {
"lmz/candle-quantized-phi".to_string()
}
}
}
}
};
let revision = match args.revision {
Some(rev) => rev.to_string(),
None => {
if args.quantized {
"main".to_string()
} else {
match args.model {
WhichModel::V1 => "refs/pr/8".to_string(),
WhichModel::V1_5 => "refs/pr/73".to_string(),
WhichModel::V2Old => "834565c23f9b28b96ccbeabe614dd906b6db551a".to_string(),
WhichModel::V2 | WhichModel::PuffinPhiV2 | WhichModel::PhiHermes => {
"main".to_string()
}
}
}
}
};
let repo = api.repo(Repo::with_revision(model_id, RepoType::Model, revision));
let tokenizer_filename = match args.tokenizer {
Some(file) => std::path::PathBuf::from(file),
None => match args.model {
WhichModel::V1 | WhichModel::V1_5 | WhichModel::V2 | WhichModel::V2Old => {
repo.get("tokenizer.json")?
}
WhichModel::PuffinPhiV2 | WhichModel::PhiHermes => {
repo.get("tokenizer-puffin-phi-v2.json")?
}
},
};
let filenames = match args.weight_file {
Some(weight_file) => vec![std::path::PathBuf::from(weight_file)],
None => {
if args.quantized {
match args.model {
WhichModel::V1 => vec![repo.get("model-v1-q4k.gguf")?],
WhichModel::V1_5 => vec![repo.get("model-q4k.gguf")?],
WhichModel::V2 | WhichModel::V2Old => vec![repo.get("model-v2-q4k.gguf")?],
WhichModel::PuffinPhiV2 => vec![repo.get("model-puffin-phi-v2-q4k.gguf")?],
WhichModel::PhiHermes => vec![repo.get("model-phi-hermes-1_3B-q4k.gguf")?],
}
} else {
match args.model {
WhichModel::V1 | WhichModel::V1_5 => vec![repo.get("model.safetensors")?],
WhichModel::V2 | WhichModel::V2Old => candle_examples::hub_load_safetensors(
&repo,
"model.safetensors.index.json",
)?,
WhichModel::PuffinPhiV2 => vec![repo.get("model-puffin-phi-v2.safetensors")?],
WhichModel::PhiHermes => vec![repo.get("model-phi-hermes-1_3B.safetensors")?],
}
}
}
};
println!("retrieved the files in {:?}", start.elapsed());
let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?;
let start = std::time::Instant::now();
let config = || match args.model {
WhichModel::V1 => Config::v1(),
WhichModel::V1_5 => Config::v1_5(),
WhichModel::V2 | WhichModel::V2Old => Config::v2(),
WhichModel::PuffinPhiV2 => Config::puffin_phi_v2(),
WhichModel::PhiHermes => Config::phi_hermes_1_3b(),
};
let device = candle_examples::device(args.cpu)?;
let model = if args.quantized {
let config = config();
let vb = candle_transformers::quantized_var_builder::VarBuilder::from_gguf(
&filenames[0],
&device,
)?;
let model = match args.model {
WhichModel::V2 | WhichModel::V2Old => QMixFormer::new_v2(&config, vb)?,
_ => QMixFormer::new(&config, vb)?,
};
Model::Quantized(model)
} else {
let vb = unsafe { VarBuilder::from_mmaped_safetensors(&filenames, DType::F32, &device)? };
match args.model {
WhichModel::V1 | WhichModel::V1_5 | WhichModel::V2 => {
let config_filename = repo.get("config.json")?;
let config = std::fs::read_to_string(config_filename)?;
let config: PhiConfig = serde_json::from_str(&config)?;
let phi = Phi::new(&config, vb)?;
Model::Phi(phi)
}
WhichModel::V2Old => {
let config = config();
Model::MixFormer(MixFormer::new_v2(&config, vb)?)
}
WhichModel::PhiHermes | WhichModel::PuffinPhiV2 => {
let config = config();
Model::MixFormer(MixFormer::new(&config, vb)?)
}
}
};
println!("loaded the model in {:?}", start.elapsed());
match (args.prompt, args.mmlu_dir) {
(None, None) | (Some(_), Some(_)) => {
anyhow::bail!("exactly one of --prompt and --mmlu-dir must be specified")
}
(Some(prompt), None) => {
let mut pipeline = TextGeneration::new(
model,
tokenizer,
args.seed,
args.temperature,
args.top_p,
args.repeat_penalty,
args.repeat_last_n,
args.verbose_prompt,
&device,
);
pipeline.run(&prompt, args.sample_len)?;
}
(None, Some(mmlu_dir)) => mmlu(model, tokenizer, &device, mmlu_dir)?,
}
Ok(())
}
fn mmlu<P: AsRef<std::path::Path>>(
mut model: Model,
tokenizer: Tokenizer,
device: &Device,
mmlu_dir: P,
) -> anyhow::Result<()> {
for dir_entry in mmlu_dir.as_ref().read_dir()?.flatten() {
let dir_entry = dir_entry.path();
let theme = match dir_entry.file_stem().and_then(|v| v.to_str()) {
None => "".to_string(),
Some(v) => match v.strip_suffix("_test") {
None => v.replace('_', " "),
Some(v) => v.replace('_', " "),
},
};
if dir_entry.extension().as_ref().and_then(|v| v.to_str()) != Some("csv") {
continue;
}
println!("reading {dir_entry:?}");
let dir_entry = std::fs::File::open(dir_entry)?;
let mut reader = csv::ReaderBuilder::new()
.has_headers(false)
.from_reader(dir_entry);
let token_a = tokenizer.token_to_id("A").unwrap();
let token_b = tokenizer.token_to_id("B").unwrap();
let token_c = tokenizer.token_to_id("C").unwrap();
let token_d = tokenizer.token_to_id("D").unwrap();
for row in reader.records() {
let row = match row {
Err(_) => continue,
Ok(row) => row,
};
if row.len() < 5 {
continue;
}
let question = row.get(0).unwrap();
let answer_a = row.get(1).unwrap();
let answer_b = row.get(2).unwrap();
let answer_c = row.get(3).unwrap();
let answer_d = row.get(4).unwrap();
let answer = row.get(5).unwrap();
let prompt = format!(
"{} {theme}.\n{question}\nA. {answer_a}\nB. {answer_b}\nC. {answer_c}\nD. {answer_d}\nAnswer:\n",
"The following are multiple choice questions (with answers) about"
);
let tokens = tokenizer.encode(prompt.as_str(), true).map_err(E::msg)?;
let tokens = tokens.get_ids().to_vec();
let input = Tensor::new(tokens, device)?.unsqueeze(0)?;
let logits = match &mut model {
Model::MixFormer(m) => {
m.clear_kv_cache();
m.forward(&input)?
}
Model::Phi(m) => {
m.clear_kv_cache();
m.forward(&input)?
}
Model::Quantized(m) => {
m.clear_kv_cache();
m.forward(&input)?
}
};
let logits = logits.squeeze(0)?.to_dtype(DType::F32)?;
let logits_v: Vec<f32> = logits.to_vec1()?;
let pr_a = logits_v[token_a as usize];
let pr_b = logits_v[token_b as usize];
let pr_c = logits_v[token_c as usize];
let pr_d = logits_v[token_d as usize];
let model_answer = if pr_a > pr_b && pr_a > pr_c && pr_a > pr_d {
"A"
} else if pr_b > pr_c && pr_b > pr_d {
"B"
} else if pr_c > pr_d {
"C"
} else {
"D"
};
println!("{prompt}\n -> {model_answer} vs {answer}");
}
}
Ok(())
}
# candle-quantized-t5
## Seq2Seq example
This example uses a quantized version of the T5 model.
```bash
$ cargo run --example quantized-t5 --release -- --prompt "translate to German: A beautiful candle."
...
Eine schöne Kerze.
```
## Generating Quantized weight files
The weight file is automatically retrieved from the hub. It is also possible to
generate quantized weight files from the original safetensors file by using the
`tensor-tools` command line utility via:
```bash
$ cargo run --bin tensor-tools --release -- quantize --quantization q6k PATH/TO/T5/model.safetensors /tmp/model.gguf
```
## Using custom models
To use a different model, specify the `model-id`.
For example, for text editing, you can use quantized [CoEdit models](https://huggingface.co/jbochi/candle-coedit-quantized).
```bash
$ cargo run --example quantized-t5 --release -- \
--model-id "jbochi/candle-coedit-quantized" \
--prompt "Make this text coherent: Their flight is weak. They run quickly through the tree canopy." \
--temperature 0
...
Although their flight is weak, they run quickly through the tree canopy.
```
By default, it will look for `model.gguf` and `config.json`, but you can specify a custom local or remote `weight-file` and `config-file`:
```bash
cargo run --example quantized-t5 --release -- \
--model-id "jbochi/candle-coedit-quantized" \
--weight-file "model-xl.gguf" \
--config-file "config-xl.json" \
--prompt "Rewrite to make this easier to understand: Note that a storm surge is what forecasters consider a hurricane's most treacherous aspect." \
--temperature 0
...
Note that a storm surge is what forecasters consider a hurricane's most dangerous part.
```
### [MADLAD-400](https://arxiv.org/abs/2309.04662)
MADLAD-400 is a series of multilingual machine translation T5 models trained on 250 billion tokens covering over 450 languages using publicly available data. These models are competitive with significantly larger models.
```bash
cargo run --example quantized-t5 --release -- \
--model-id "jbochi/madlad400-3b-mt" --weight-file "model-q4k.gguf" \
--prompt "<2de> How are you, my friend?" \
--temperature 0
...
Wie geht es dir, mein Freund?
```
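Whichever weight file is used, the example builds the model through the gguf-aware `VarBuilder` from `candle_transformers::models::quantized_t5`; a minimal sketch of that loading step (the `model.gguf` path is an assumption):
```rust
use anyhow::Result;
use candle::Device;
use candle_transformers::models::quantized_t5 as t5;

fn load_model(config: &t5::Config) -> Result<t5::T5ForConditionalGeneration> {
    let device = Device::Cpu;
    // Load the quantized gguf weights and build the model from them.
    let vb = t5::VarBuilder::from_gguf("model.gguf", &device)?;
    Ok(t5::T5ForConditionalGeneration::load(vb, config)?)
}
```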
#[cfg(any(feature = "mkl", feature = "mkl-dynamic"))]
extern crate intel_mkl_src;
#[cfg(feature = "accelerate")]
extern crate accelerate_src;
use std::io::Write;
use std::path::PathBuf;
use candle_transformers::models::quantized_t5 as t5;
use anyhow::{Error as E, Result};
use candle::{Device, Tensor};
use candle_transformers::generation::LogitsProcessor;
use clap::{Parser, ValueEnum};
use hf_hub::{api::sync::Api, api::sync::ApiRepo, Repo, RepoType};
use tokenizers::Tokenizer;
#[derive(Clone, Debug, Copy, ValueEnum)]
enum Which {
T5Small,
FlanT5Small,
FlanT5Base,
FlanT5Large,
FlanT5Xl,
FlanT5Xxl,
}
#[derive(Parser, Debug, Clone)]
#[command(author, version, about, long_about = None)]
struct Args {
/// Enable tracing (generates a trace-timestamp.json file).
#[arg(long)]
tracing: bool,
/// The model repository to use on the HuggingFace hub.
#[arg(long)]
model_id: Option<String>,
#[arg(long)]
revision: Option<String>,
#[arg(long)]
weight_file: Option<String>,
#[arg(long)]
config_file: Option<String>,
/// Disable the key-value cache during decoding.
#[arg(long, default_value = "false")]
disable_cache: bool,
/// The prompt to be used for generation.
#[arg(long)]
prompt: String,
/// The temperature used to generate samples.
#[arg(long, default_value_t = 0.8)]
temperature: f64,
/// Nucleus sampling probability cutoff.
#[arg(long)]
top_p: Option<f64>,
/// Penalty to be applied for repeating tokens, 1. means no penalty.
#[arg(long, default_value_t = 1.1)]
repeat_penalty: f32,
/// The context size to consider for the repeat penalty.
#[arg(long, default_value_t = 64)]
repeat_last_n: usize,
/// The model size to use.
#[arg(long, default_value = "t5-small")]
which: Which,
}
struct T5ModelBuilder {
device: Device,
config: t5::Config,
weights_filename: PathBuf,
}
impl T5ModelBuilder {
pub fn load(args: &Args) -> Result<(Self, Tokenizer)> {
let device = Device::Cpu;
let default_model = "lmz/candle-quantized-t5".to_string();
let (model_id, revision) = match (args.model_id.to_owned(), args.revision.to_owned()) {
(Some(model_id), Some(revision)) => (model_id, revision),
(Some(model_id), None) => (model_id, "main".to_string()),
(None, Some(revision)) => (default_model, revision),
(None, None) => (default_model, "main".to_string()),
};
let repo = Repo::with_revision(model_id, RepoType::Model, revision);
let api = Api::new()?;
let api = api.repo(repo);
let config_filename = match &args.config_file {
Some(filename) => Self::get_local_or_remote_file(filename, &api)?,
None => match args.which {
Which::T5Small => api.get("config.json")?,
Which::FlanT5Small => api.get("config-flan-t5-small.json")?,
Which::FlanT5Base => api.get("config-flan-t5-base.json")?,
Which::FlanT5Large => api.get("config-flan-t5-large.json")?,
Which::FlanT5Xl => api.get("config-flan-t5-xl.json")?,
Which::FlanT5Xxl => api.get("config-flan-t5-xxl.json")?,
},
};
let tokenizer_filename = api.get("tokenizer.json")?;
let weights_filename = match &args.weight_file {
Some(filename) => Self::get_local_or_remote_file(filename, &api)?,
None => match args.which {
Which::T5Small => api.get("model.gguf")?,
Which::FlanT5Small => api.get("model-flan-t5-small.gguf")?,
Which::FlanT5Base => api.get("model-flan-t5-base.gguf")?,
Which::FlanT5Large => api.get("model-flan-t5-large.gguf")?,
Which::FlanT5Xl => api.get("model-flan-t5-xl.gguf")?,
Which::FlanT5Xxl => api.get("model-flan-t5-xxl.gguf")?,
},
};
let config = std::fs::read_to_string(config_filename)?;
let mut config: t5::Config = serde_json::from_str(&config)?;
config.use_cache = !args.disable_cache;
let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?;
Ok((
Self {
device,
config,
weights_filename,
},
tokenizer,
))
}
pub fn build_model(&self) -> Result<t5::T5ForConditionalGeneration> {
let device = Device::Cpu;
let vb = t5::VarBuilder::from_gguf(&self.weights_filename, &device)?;
Ok(t5::T5ForConditionalGeneration::load(vb, &self.config)?)
}
fn get_local_or_remote_file(filename: &str, api: &ApiRepo) -> Result<PathBuf> {
let local_filename = std::path::PathBuf::from(filename);
if local_filename.exists() {
Ok(local_filename)
} else {
Ok(api.get(filename)?)
}
}
}
fn main() -> Result<()> {
use tracing_chrome::ChromeLayerBuilder;
use tracing_subscriber::prelude::*;
let args = Args::parse();
let _guard = if args.tracing {
let (chrome_layer, guard) = ChromeLayerBuilder::new().build();
tracing_subscriber::registry().with(chrome_layer).init();
Some(guard)
} else {
None
};
let (builder, mut tokenizer) = T5ModelBuilder::load(&args)?;
let device = &builder.device;
let tokenizer = tokenizer
.with_padding(None)
.with_truncation(None)
.map_err(E::msg)?;
let tokens = tokenizer
.encode(args.prompt, true)
.map_err(E::msg)?
.get_ids()
.to_vec();
let input_token_ids = Tensor::new(&tokens[..], device)?.unsqueeze(0)?;
let mut model = builder.build_model()?;
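// Seed the decoder with the start token, falling back to the padding token id when none is set.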
let mut output_token_ids = [builder
.config
.decoder_start_token_id
.unwrap_or(builder.config.pad_token_id) as u32]
.to_vec();
let temperature = if args.temperature <= 0. {
None
} else {
Some(args.temperature)
};
let mut logits_processor = LogitsProcessor::new(299792458, temperature, args.top_p);
let encoder_output = model.encode(&input_token_ids)?;
let start = std::time::Instant::now();
for index in 0.. {
if output_token_ids.len() > 512 {
break;
}
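// With the KV cache enabled, only the last generated token is fed back after the first step.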
let decoder_token_ids = if index == 0 || !builder.config.use_cache {
Tensor::new(output_token_ids.as_slice(), device)?.unsqueeze(0)?
} else {
let last_token = *output_token_ids.last().unwrap();
Tensor::new(&[last_token], device)?.unsqueeze(0)?
};
let logits = model
.decode(&decoder_token_ids, &encoder_output)?
.squeeze(0)?;
let logits = if args.repeat_penalty == 1. {
logits
} else {
let start_at = output_token_ids.len().saturating_sub(args.repeat_last_n);
candle_transformers::utils::apply_repeat_penalty(
&logits,
args.repeat_penalty,
&output_token_ids[start_at..],
)?
};
let next_token_id = logits_processor.sample(&logits)?;
if next_token_id as usize == builder.config.eos_token_id {
break;
}
output_token_ids.push(next_token_id);
if let Some(text) = tokenizer.id_to_token(next_token_id) {
let text = text.replace('▁', " ").replace("<0x0A>", "\n");
print!("{text}");
std::io::stdout().flush()?;
}
}
let dt = start.elapsed();
println!(
"\n{} tokens generated ({:.2} token/s)\n",
output_token_ids.len(),
output_token_ids.len() as f64 / dt.as_secs_f64(),
);
Ok(())
}
# candle-quantized-llama: Fast Inference of quantized LLaMA models
This example provides a quantized LLaMA model similar to
[llama.cpp](https://github.com/ggerganov/llama.cpp). This is based on candle's
built-in quantization methods. Supported features include:
- 2-bit, 3-bit, 4-bit, 5-bit, 6-bit and 8-bit integer quantization support.
- SIMD optimizations on Apple Silicon and x86.
- Support for the `gguf` and `ggml` file formats.
The weights are automatically downloaded for you from the [HuggingFace
Hub](https://huggingface.co/) on the first run. There are various command line
flags to use local files instead; run with `--help` to learn about them.
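As a rough illustration of the `gguf` support, the tensor metadata of a downloaded file can be inspected with candle's `gguf_file` API before any weights are loaded. This is a sketch rather than part of the example; the path argument is a placeholder:
```rust
// Sketch: open a local gguf file and report its tensors, mirroring the
// size summary this example prints at start-up.
use candle::quantized::gguf_file;

fn inspect_gguf(path: &str) -> anyhow::Result<()> {
    let mut file = std::fs::File::open(path)?;
    let content = gguf_file::Content::read(&mut file)?;
    let mut total_bytes = 0usize;
    for (name, info) in content.tensor_infos.iter() {
        let elems = info.shape.elem_count();
        // Quantized dtypes store blocks of elements, hence the block_size division.
        total_bytes += elems * info.ggml_dtype.type_size() / info.ggml_dtype.block_size();
        println!("{name}: {:?}", info.shape);
    }
    println!(
        "{} tensors, ~{:.2}GB",
        content.tensor_infos.len(),
        total_bytes as f64 / 1e9
    );
    Ok(())
}
```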
![Axiom of Choice](./assets/aoc.gif)
## Running some examples
```bash
cargo run --example quantized --release -- --prompt "The best thing about coding in rust is "
> avx: true, neon: false, simd128: false, f16c: true
> temp: 0.80 repeat-penalty: 1.10 repeat-last-n: 64
> loaded 291 tensors (3.79GB) in 2.17s
> params: HParams { n_vocab: 32000, n_embd: 4096, n_mult: 256, n_head: 32, n_layer: 32, n_rot: 128, ftype: 2 }
> The best thing about coding in rust is 1.) that I don’t need to worry about memory leaks, 2.) speed and 3.) my program will compile even on old machines.
```
Using the mixtral sparse mixture of expert model:
```bash
$ cargo run --example quantized --release -- --which mixtral --prompt "Lebesgue's integral is superior to Riemann's because "
> avx: true, neon: false, simd128: false, f16c: true
> temp: 0.80 repeat-penalty: 1.10 repeat-last-n: 64
> loaded 995 tensors (26.44GB) in 0.03s
Lebesgue's integral is superior to Riemann's because 1. it is defined for a wider class of functions, those which are absolutely integrable; 2. the definition does not involve limits in two variables---one being computed before the other (which makes some computations more difficult); and 3. interchange of order of integration is easier to establish than with Riemann's integral. On the other hand, Lebesgue's integral applies only for bounded functions defined on finite intervals; it does not provide numerical values for improper integrals. The latter are best evaluated using Cauchy's limit definition.
The reason $f(x) = x^2$ is discontinuous at the ends of its interval of definition, and Riemann's integral requires continuity on the whole of an open interval containing it (see our earlier post), sine no such function exists with this property, is that the endpoints are infinite in measure for Lebesgue's integral.
```
## Command-line flags
Run with `--help` to see all options.
- `--which`: specify the model to use, e.g. `7b`, `13b-chat`, `7b-code`.
- `--prompt interactive`: interactive mode where multiple prompts can be
entered.
- `--model mymodelfile.gguf`: use a local model file rather than getting one
from the hub.
#[cfg(any(feature = "mkl", feature = "mkl-dynamic"))]
extern crate intel_mkl_src;
#[cfg(feature = "accelerate")]
extern crate accelerate_src;
use clap::{Parser, ValueEnum};
use std::io::Write;
use tokenizers::Tokenizer;
use candle::quantized::{ggml_file, gguf_file};
use candle::Tensor;
use candle_transformers::generation::{LogitsProcessor, Sampling};
use candle_examples::token_output_stream::TokenOutputStream;
use candle_transformers::models::quantized_llama as model;
use model::ModelWeights;
const DEFAULT_PROMPT: &str = "My favorite theorem is ";
#[derive(Debug)]
enum Prompt {
Interactive,
Chat,
One(String),
}
#[derive(Clone, Debug, Copy, PartialEq, Eq, ValueEnum)]
enum Which {
#[value(name = "7b")]
L7b,
#[value(name = "13b")]
L13b,
#[value(name = "70b")]
L70b,
#[value(name = "7b-chat")]
L7bChat,
#[value(name = "13b-chat")]
L13bChat,
#[value(name = "70b-chat")]
L70bChat,
#[value(name = "7b-code")]
L7bCode,
#[value(name = "13b-code")]
L13bCode,
#[value(name = "32b-code")]
L34bCode,
#[value(name = "7b-leo")]
Leo7b,
#[value(name = "13b-leo")]
Leo13b,
#[value(name = "7b-mistral")]
Mistral7b,
#[value(name = "7b-mistral-instruct")]
Mistral7bInstruct,
#[value(name = "7b-mistral-instruct-v0.2")]
Mistral7bInstructV02,
#[value(name = "7b-zephyr-a")]
Zephyr7bAlpha,
#[value(name = "7b-zephyr-b")]
Zephyr7bBeta,
#[value(name = "7b-open-chat-3.5")]
OpenChat35,
#[value(name = "7b-starling-a")]
Starling7bAlpha,
#[value(name = "mixtral")]
Mixtral,
#[value(name = "mixtral-instruct")]
MixtralInstruct,
}
impl Which {
fn is_mistral(&self) -> bool {
match self {
Self::L7b
| Self::L13b
| Self::L70b
| Self::L7bChat
| Self::L13bChat
| Self::L70bChat
| Self::L7bCode
| Self::L13bCode
| Self::L34bCode
| Self::Leo7b
| Self::Leo13b => false,
// Zephyr and OpenChat are fine tuned versions of mistral and should be treated in the
// same way. Starling is a fine tuned version of OpenChat.
Self::OpenChat35
| Self::Starling7bAlpha
| Self::Zephyr7bAlpha
| Self::Zephyr7bBeta
| Self::Mixtral
| Self::MixtralInstruct
| Self::Mistral7b
| Self::Mistral7bInstruct
| Self::Mistral7bInstructV02 => true,
}
}
fn is_zephyr(&self) -> bool {
match self {
Self::L7b
| Self::L13b
| Self::L70b
| Self::L7bChat
| Self::L13bChat
| Self::L70bChat
| Self::L7bCode
| Self::L13bCode
| Self::L34bCode
| Self::Leo7b
| Self::Leo13b
| Self::Mixtral
| Self::MixtralInstruct
| Self::Mistral7b
| Self::Mistral7bInstruct
| Self::Mistral7bInstructV02
| Self::OpenChat35
| Self::Starling7bAlpha => false,
Self::Zephyr7bAlpha | Self::Zephyr7bBeta => true,
}
}
fn is_open_chat(&self) -> bool {
match self {
Self::L7b
| Self::L13b
| Self::L70b
| Self::L7bChat
| Self::L13bChat
| Self::L70bChat
| Self::L7bCode
| Self::L13bCode
| Self::L34bCode
| Self::Leo7b
| Self::Leo13b
| Self::Mixtral
| Self::MixtralInstruct
| Self::Mistral7b
| Self::Mistral7bInstruct
| Self::Mistral7bInstructV02
| Self::Zephyr7bAlpha
| Self::Zephyr7bBeta => false,
Self::OpenChat35 | Self::Starling7bAlpha => true,
}
}
fn tokenizer_repo(&self) -> &'static str {
match self {
Which::L7b
| Which::L13b
| Which::L70b
| Which::L7bChat
| Which::L13bChat
| Which::L70bChat
| Which::L7bCode
| Which::L13bCode
| Which::L34bCode => "hf-internal-testing/llama-tokenizer",
Which::Leo7b => "LeoLM/leo-hessianai-7b",
Which::Leo13b => "LeoLM/leo-hessianai-13b",
Which::Mixtral => "mistralai/Mixtral-8x7B-v0.1",
Which::MixtralInstruct => "mistralai/Mixtral-8x7B-Instruct-v0.1",
Which::Mistral7b
| Which::Mistral7bInstruct
| Which::Mistral7bInstructV02
| Which::Zephyr7bAlpha
| Which::Zephyr7bBeta => "mistralai/Mistral-7B-v0.1",
Which::OpenChat35 => "openchat/openchat_3.5",
Which::Starling7bAlpha => "berkeley-nest/Starling-LM-7B-alpha",
}
}
}
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
/// GGML/GGUF file to load, typically a .bin/.gguf file generated by the quantize command from llama.cpp
#[arg(long)]
model: Option<String>,
/// The initial prompt, use 'interactive' for entering multiple prompts in an interactive way
/// and 'chat' for an interactive model where history of previous prompts and generated tokens
/// is preserved.
#[arg(long)]
prompt: Option<String>,
/// The length of the sample to generate (in tokens).
#[arg(short = 'n', long, default_value_t = 1000)]
sample_len: usize,
/// The tokenizer config in json format.
#[arg(long)]
tokenizer: Option<String>,
/// The temperature used to generate samples, use 0 for greedy sampling.
#[arg(long, default_value_t = 0.8)]
temperature: f64,
/// Nucleus sampling probability cutoff.
#[arg(long)]
top_p: Option<f64>,
/// Only sample among the top K samples.
#[arg(long)]
top_k: Option<usize>,
/// The seed to use when generating random samples.
#[arg(long, default_value_t = 299792458)]
seed: u64,
/// Enable tracing (generates a trace-timestamp.json file).
#[arg(long)]
tracing: bool,
/// Display the tokens for the specified prompt.
#[arg(long)]
verbose_prompt: bool,
/// Process prompt elements separately.
#[arg(long)]
split_prompt: bool,
/// Run on CPU rather than GPU even if a GPU is available.
#[arg(long)]
cpu: bool,
/// Penalty to be applied for repeating tokens, 1. means no penalty.
#[arg(long, default_value_t = 1.1)]
repeat_penalty: f32,
/// The context size to consider for the repeat penalty.
#[arg(long, default_value_t = 64)]
repeat_last_n: usize,
/// The model size to use.
#[arg(long, default_value = "7b")]
which: Which,
/// Group-Query Attention, use 8 for the 70B version of LLaMAv2.
#[arg(long)]
gqa: Option<usize>,
/// Use the slower dmmv cuda kernel.
#[arg(long)]
force_dmmv: bool,
}
impl Args {
fn tokenizer(&self) -> anyhow::Result<Tokenizer> {
let tokenizer_path = match &self.tokenizer {
Some(config) => std::path::PathBuf::from(config),
None => {
let api = hf_hub::api::sync::Api::new()?;
let repo = self.which.tokenizer_repo();
let api = api.model(repo.to_string());
api.get("tokenizer.json")?
}
};
Tokenizer::from_file(tokenizer_path).map_err(anyhow::Error::msg)
}
fn model(&self) -> anyhow::Result<std::path::PathBuf> {
let model_path = match &self.model {
Some(config) => std::path::PathBuf::from(config),
None => {
let (repo, filename) = match self.which {
Which::L7b => ("TheBloke/Llama-2-7B-GGML", "llama-2-7b.ggmlv3.q4_0.bin"),
Which::L13b => ("TheBloke/Llama-2-13B-GGML", "llama-2-13b.ggmlv3.q4_0.bin"),
Which::L70b => ("TheBloke/Llama-2-70B-GGML", "llama-2-70b.ggmlv3.q4_0.bin"),
Which::L7bChat => (
"TheBloke/Llama-2-7B-Chat-GGML",
"llama-2-7b-chat.ggmlv3.q4_0.bin",
),
Which::L13bChat => (
"TheBloke/Llama-2-13B-Chat-GGML",
"llama-2-13b-chat.ggmlv3.q4_0.bin",
),
Which::L70bChat => (
"TheBloke/Llama-2-70B-Chat-GGML",
"llama-2-70b-chat.ggmlv3.q4_0.bin",
),
Which::L7bCode => ("TheBloke/CodeLlama-7B-GGUF", "codellama-7b.Q8_0.gguf"),
Which::L13bCode => ("TheBloke/CodeLlama-13B-GGUF", "codellama-13b.Q8_0.gguf"),
Which::L34bCode => ("TheBloke/CodeLlama-34B-GGUF", "codellama-34b.Q8_0.gguf"),
Which::Leo7b => (
"TheBloke/leo-hessianai-7B-GGUF",
"leo-hessianai-7b.Q4_K_M.gguf",
),
Which::Leo13b => (
"TheBloke/leo-hessianai-13B-GGUF",
"leo-hessianai-13b.Q4_K_M.gguf",
),
Which::Mixtral => (
"TheBloke/Mixtral-8x7B-v0.1-GGUF",
"mixtral-8x7b-v0.1.Q4_K_M.gguf",
),
Which::MixtralInstruct => (
"TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF",
"mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf",
),
Which::Mistral7b => (
"TheBloke/Mistral-7B-v0.1-GGUF",
"mistral-7b-v0.1.Q4_K_S.gguf",
),
Which::Mistral7bInstruct => (
"TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
"mistral-7b-instruct-v0.1.Q4_K_S.gguf",
),
Which::Mistral7bInstructV02 => (
"TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
"mistral-7b-instruct-v0.2.Q4_K_S.gguf",
),
Which::Zephyr7bAlpha => (
"TheBloke/zephyr-7B-alpha-GGUF",
"zephyr-7b-alpha.Q4_K_M.gguf",
),
Which::Zephyr7bBeta => {
("TheBloke/zephyr-7B-beta-GGUF", "zephyr-7b-beta.Q4_K_M.gguf")
}
Which::OpenChat35 => ("TheBloke/openchat_3.5-GGUF", "openchat_3.5.Q4_K_M.gguf"),
Which::Starling7bAlpha => (
"TheBloke/Starling-LM-7B-alpha-GGUF",
"starling-lm-7b-alpha.Q4_K_M.gguf",
),
};
let api = hf_hub::api::sync::Api::new()?;
let api = api.model(repo.to_string());
api.get(filename)?
}
};
Ok(model_path)
}
}
fn format_size(size_in_bytes: usize) -> String {
if size_in_bytes < 1_000 {
format!("{}B", size_in_bytes)
} else if size_in_bytes < 1_000_000 {
format!("{:.2}KB", size_in_bytes as f64 / 1e3)
} else if size_in_bytes < 1_000_000_000 {
format!("{:.2}MB", size_in_bytes as f64 / 1e6)
} else {
format!("{:.2}GB", size_in_bytes as f64 / 1e9)
}
}
fn main() -> anyhow::Result<()> {
use tracing_chrome::ChromeLayerBuilder;
use tracing_subscriber::prelude::*;
let args = Args::parse();
#[cfg(feature = "cuda")]
candle::quantized::cuda::set_force_dmmv(args.force_dmmv);
let _guard = if args.tracing {
let (chrome_layer, guard) = ChromeLayerBuilder::new().build();
tracing_subscriber::registry().with(chrome_layer).init();
Some(guard)
} else {
None
};
println!(
"avx: {}, neon: {}, simd128: {}, f16c: {}",
candle::utils::with_avx(),
candle::utils::with_neon(),
candle::utils::with_simd128(),
candle::utils::with_f16c()
);
println!(
"temp: {:.2} repeat-penalty: {:.2} repeat-last-n: {}",
args.temperature, args.repeat_penalty, args.repeat_last_n
);
let model_path = args.model()?;
let mut file = std::fs::File::open(&model_path)?;
let start = std::time::Instant::now();
let device = candle_examples::device(args.cpu)?;
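// Dispatch on the file extension: gguf files carry the metadata needed to build the
// model, while ggml/bin files need extra hints such as the gqa value below.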
let mut model = match model_path.extension().and_then(|v| v.to_str()) {
Some("gguf") => {
let model = gguf_file::Content::read(&mut file).map_err(|e| e.with_path(model_path))?;
let mut total_size_in_bytes = 0;
for (_, tensor) in model.tensor_infos.iter() {
let elem_count = tensor.shape.elem_count();
total_size_in_bytes +=
elem_count * tensor.ggml_dtype.type_size() / tensor.ggml_dtype.block_size();
}
println!(
"loaded {:?} tensors ({}) in {:.2}s",
model.tensor_infos.len(),
&format_size(total_size_in_bytes),
start.elapsed().as_secs_f32(),
);
ModelWeights::from_gguf(model, &mut file, &device)?
}
Some("ggml" | "bin") | Some(_) | None => {
let model = ggml_file::Content::read(&mut file, &device)
.map_err(|e| e.with_path(model_path))?;
let mut total_size_in_bytes = 0;
for (_, tensor) in model.tensors.iter() {
let elem_count = tensor.shape().elem_count();
total_size_in_bytes +=
elem_count * tensor.dtype().type_size() / tensor.dtype().block_size();
}
println!(
"loaded {:?} tensors ({}) in {:.2}s",
model.tensors.len(),
&format_size(total_size_in_bytes),
start.elapsed().as_secs_f32(),
);
println!("params: {:?}", model.hparams);
let default_gqa = match args.which {
Which::L7b
| Which::L13b
| Which::L7bChat
| Which::L13bChat
| Which::L7bCode
| Which::L13bCode
| Which::L34bCode
| Which::Leo7b
| Which::Leo13b => 1,
Which::Mixtral
| Which::MixtralInstruct
| Which::Mistral7b
| Which::Mistral7bInstruct
| Which::Mistral7bInstructV02
| Which::Zephyr7bAlpha
| Which::Zephyr7bBeta
| Which::L70b
| Which::L70bChat
| Which::OpenChat35
| Which::Starling7bAlpha => 8,
};
ModelWeights::from_ggml(model, args.gqa.unwrap_or(default_gqa))?
}
};
println!("model built");
let tokenizer = args.tokenizer()?;
let mut tos = TokenOutputStream::new(tokenizer);
let prompt = match args.prompt.as_deref() {
Some("chat") => Prompt::Chat,
Some("interactive") => Prompt::Interactive,
Some(s) => Prompt::One(s.to_string()),
None => Prompt::One(DEFAULT_PROMPT.to_string()),
};
let mut pre_prompt_tokens = vec![];
for prompt_index in 0.. {
let prompt_str = match &prompt {
Prompt::One(prompt) => prompt.clone(),
Prompt::Interactive | Prompt::Chat => {
let is_interactive = matches!(prompt, Prompt::Interactive);
print!("> ");
std::io::stdout().flush()?;
let mut prompt = String::new();
std::io::stdin().read_line(&mut prompt)?;
if prompt.ends_with('\n') {
prompt.pop();
if prompt.ends_with('\r') {
prompt.pop();
}
}
if args.which.is_open_chat() {
format!("GPT4 Correct User: {prompt}<|end_of_turn|>GPT4 Correct Assistant:")
} else if args.which.is_zephyr() {
if prompt_index == 0 || is_interactive {
format!("<|system|>\n</s>\n<|user|>\n{prompt}</s>\n<|assistant|>",)
} else {
format!("<|user|>\n{prompt}</s>\n<|assistant|>")
}
} else if args.which.is_mistral() {
format!("[INST] {prompt} [/INST]")
} else {
prompt
}
}
};
print!("{}", &prompt_str);
let tokens = tos
.tokenizer()
.encode(prompt_str, true)
.map_err(anyhow::Error::msg)?;
if args.verbose_prompt {
for (token, id) in tokens.get_tokens().iter().zip(tokens.get_ids().iter()) {
let token = token.replace('▁', " ").replace("<0x0A>", "\n");
println!("{id:7} -> '{token}'");
}
}
let prompt_tokens = [&pre_prompt_tokens, tokens.get_ids()].concat();
let to_sample = args.sample_len.saturating_sub(1);
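// Truncate the prompt if needed so that prompt plus sampled tokens fit within MAX_SEQ_LEN.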
let prompt_tokens = if prompt_tokens.len() + to_sample > model::MAX_SEQ_LEN - 10 {
let to_remove = prompt_tokens.len() + to_sample + 10 - model::MAX_SEQ_LEN;
prompt_tokens[prompt_tokens.len().saturating_sub(to_remove)..].to_vec()
} else {
prompt_tokens
};
let mut all_tokens = vec![];
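// A non-positive temperature means greedy (argmax) sampling; otherwise top-k and/or
// top-p are applied as requested.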
let mut logits_processor = {
let temperature = args.temperature;
let sampling = if temperature <= 0. {
Sampling::ArgMax
} else {
match (args.top_k, args.top_p) {
(None, None) => Sampling::All { temperature },
(Some(k), None) => Sampling::TopK { k, temperature },
(None, Some(p)) => Sampling::TopP { p, temperature },
(Some(k), Some(p)) => Sampling::TopKThenTopP { k, p, temperature },
}
};
LogitsProcessor::from_sampling(args.seed, sampling)
};
let start_prompt_processing = std::time::Instant::now();
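// Process the whole prompt in a single forward pass, or token by token when
// --split-prompt is set.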
let mut next_token = if !args.split_prompt {
let input = Tensor::new(prompt_tokens.as_slice(), &device)?.unsqueeze(0)?;
let logits = model.forward(&input, 0)?;
let logits = logits.squeeze(0)?;
logits_processor.sample(&logits)?
} else {
let mut next_token = 0;
for (pos, token) in prompt_tokens.iter().enumerate() {
let input = Tensor::new(&[*token], &device)?.unsqueeze(0)?;
let logits = model.forward(&input, pos)?;
let logits = logits.squeeze(0)?;
next_token = logits_processor.sample(&logits)?
}
next_token
};
let prompt_dt = start_prompt_processing.elapsed();
all_tokens.push(next_token);
if let Some(t) = tos.next_token(next_token)? {
print!("{t}");
std::io::stdout().flush()?;
}
let eos_token = if args.which.is_open_chat() {
"<|end_of_turn|>"
} else {
"</s>"
};
let eos_token = *tos.tokenizer().get_vocab(true).get(eos_token).unwrap();
let start_post_prompt = std::time::Instant::now();
let mut sampled = 0;
for index in 0..to_sample {
let input = Tensor::new(&[next_token], &device)?.unsqueeze(0)?;
let logits = model.forward(&input, prompt_tokens.len() + index)?;
let logits = logits.squeeze(0)?;
let logits = if args.repeat_penalty == 1. {
logits
} else {
let start_at = all_tokens.len().saturating_sub(args.repeat_last_n);
candle_transformers::utils::apply_repeat_penalty(
&logits,
args.repeat_penalty,
&all_tokens[start_at..],
)?
};
next_token = logits_processor.sample(&logits)?;
all_tokens.push(next_token);
if let Some(t) = tos.next_token(next_token)? {
print!("{t}");
std::io::stdout().flush()?;
}
sampled += 1;
if next_token == eos_token {
break;
};
}
if let Some(rest) = tos.decode_rest().map_err(candle::Error::msg)? {
print!("{rest}");
}
std::io::stdout().flush()?;
let dt = start_post_prompt.elapsed();
println!(
"\n\n{:4} prompt tokens processed: {:.2} token/s",
prompt_tokens.len(),
prompt_tokens.len() as f64 / prompt_dt.as_secs_f64(),
);
println!(
"{sampled:4} tokens generated: {:.2} token/s",
sampled as f64 / dt.as_secs_f64(),
);
match prompt {
Prompt::One(_) => break,
Prompt::Interactive => {}
Prompt::Chat => {
pre_prompt_tokens = [prompt_tokens.as_slice(), all_tokens.as_slice()].concat()
}
}
}
Ok(())
}
# candle-qwen: large language model series from Alibaba Cloud
Qwen 1.5 is a series of large language models that provide strong performance
on English and Chinese.
- [Blog post](https://qwenlm.github.io/blog/qwen1.5/) introducing Qwen1.5.
- [Model card](https://huggingface.co/Qwen/Qwen1.5-0.5B) on the HuggingFace Hub.
- [Blog post](https://qwenlm.github.io/blog/qwen-moe/) for the
mixture-of-experts (MoE) variant.
## Running the example
```bash
$ cargo run --example qwen --release -- --prompt "Hello there "
```
Various model sizes are available via the `--model` argument, including the MoE
variant.
```bash
$ cargo run --example qwen --release -- --model moe-a2.7b --prompt 'def print_prime(n: int): '
def print_prime(n: int): # n is the number of primes to be printed
for i in range(2, n + 1):
if all(i % j != 0 for j in range(2, i)):
print(i)
```
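The base and MoE checkpoints share the same generation interface; the dispatch used by the example boils down to the small enum below (a sketch distilled from the full source that follows):
```rust
// Sketch of the dispatch used by this example: base and MoE variants share
// the same forward signature, so the generation loop is identical for both.
use candle::Tensor;
use candle_transformers::models::qwen2::Model as ModelBase;
use candle_transformers::models::qwen2_moe::Model as ModelMoe;

enum Model {
    Base(ModelBase),
    Moe(ModelMoe),
}

impl Model {
    fn forward(&mut self, xs: &Tensor, offset: usize) -> candle::Result<Tensor> {
        match self {
            Self::Base(m) => m.forward(xs, offset),
            Self::Moe(m) => m.forward(xs, offset),
        }
    }
}
```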
#[cfg(feature = "mkl")]
extern crate intel_mkl_src;
#[cfg(feature = "accelerate")]
extern crate accelerate_src;
use anyhow::{Error as E, Result};
use clap::Parser;
use candle_transformers::models::qwen2::{Config as ConfigBase, Model as ModelBase};
use candle_transformers::models::qwen2_moe::{Config as ConfigMoe, Model as ModelMoe};
use candle::{DType, Device, Tensor};
use candle_examples::token_output_stream::TokenOutputStream;
use candle_nn::VarBuilder;
use candle_transformers::generation::LogitsProcessor;
use hf_hub::{api::sync::Api, Repo, RepoType};
use tokenizers::Tokenizer;
enum Model {
Base(ModelBase),
Moe(ModelMoe),
}
impl Model {
fn forward(&mut self, xs: &Tensor, s: usize) -> candle::Result<Tensor> {
match self {
Self::Moe(ref mut m) => m.forward(xs, s),
Self::Base(ref mut m) => m.forward(xs, s),
}
}
}
struct TextGeneration {
model: Model,
device: Device,
tokenizer: TokenOutputStream,
logits_processor: LogitsProcessor,
repeat_penalty: f32,
repeat_last_n: usize,
}
impl TextGeneration {
#[allow(clippy::too_many_arguments)]
fn new(
model: Model,
tokenizer: Tokenizer,
seed: u64,
temp: Option<f64>,
top_p: Option<f64>,
repeat_penalty: f32,
repeat_last_n: usize,
device: &Device,
) -> Self {
let logits_processor = LogitsProcessor::new(seed, temp, top_p);
Self {
model,
tokenizer: TokenOutputStream::new(tokenizer),
logits_processor,
repeat_penalty,
repeat_last_n,
device: device.clone(),
}
}
fn run(&mut self, prompt: &str, sample_len: usize) -> Result<()> {
use std::io::Write;
self.tokenizer.clear();
let mut tokens = self
.tokenizer
.tokenizer()
.encode(prompt, true)
.map_err(E::msg)?
.get_ids()
.to_vec();
for &t in tokens.iter() {
if let Some(t) = self.tokenizer.next_token(t)? {
print!("{t}")
}
}
std::io::stdout().flush()?;
let mut generated_tokens = 0usize;
let eos_token = match self.tokenizer.get_token("<|endoftext|>") {
Some(token) => token,
None => anyhow::bail!("cannot find the <|endoftext|> token"),
};
let start_gen = std::time::Instant::now();
for index in 0..sample_len {
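// After the first step only the newly generated token is fed in; earlier positions
// are covered by the model's KV cache.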
let context_size = if index > 0 { 1 } else { tokens.len() };
let start_pos = tokens.len().saturating_sub(context_size);
let ctxt = &tokens[start_pos..];
let input = Tensor::new(ctxt, &self.device)?.unsqueeze(0)?;
let logits = self.model.forward(&input, start_pos)?;
let logits = logits.squeeze(0)?.squeeze(0)?.to_dtype(DType::F32)?;
let logits = if self.repeat_penalty == 1. {
logits
} else {
let start_at = tokens.len().saturating_sub(self.repeat_last_n);
candle_transformers::utils::apply_repeat_penalty(
&logits,
self.repeat_penalty,
&tokens[start_at..],
)?
};
let next_token = self.logits_processor.sample(&logits)?;
tokens.push(next_token);
generated_tokens += 1;
if next_token == eos_token {
break;
}
if let Some(t) = self.tokenizer.next_token(next_token)? {
print!("{t}");
std::io::stdout().flush()?;
}
}
let dt = start_gen.elapsed();
if let Some(rest) = self.tokenizer.decode_rest().map_err(E::msg)? {
print!("{rest}");
}
std::io::stdout().flush()?;
println!(
"\n{generated_tokens} tokens generated ({:.2} token/s)",
generated_tokens as f64 / dt.as_secs_f64(),
);
Ok(())
}
}
#[derive(Clone, Copy, Debug, clap::ValueEnum, PartialEq, Eq)]
enum WhichModel {
#[value(name = "0.5b")]
W0_5b,
#[value(name = "1.8b")]
W1_8b,
#[value(name = "4b")]
W4b,
#[value(name = "7b")]
W7b,
#[value(name = "14b")]
W14b,
#[value(name = "72b")]
W72b,
#[value(name = "moe-a2.7b")]
MoeA27b,
}
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
/// Run on CPU rather than on GPU.
#[arg(long)]
cpu: bool,
/// Enable tracing (generates a trace-timestamp.json file).
#[arg(long)]
tracing: bool,
#[arg(long)]
use_flash_attn: bool,
#[arg(long)]
prompt: String,
/// The temperature used to generate samples.
#[arg(long)]
temperature: Option<f64>,
/// Nucleus sampling probability cutoff.
#[arg(long)]
top_p: Option<f64>,
/// The seed to use when generating random samples.
#[arg(long, default_value_t = 299792458)]
seed: u64,
/// The length of the sample to generate (in tokens).
#[arg(long, short = 'n', default_value_t = 10000)]
sample_len: usize,
#[arg(long)]
model_id: Option<String>,
#[arg(long, default_value = "main")]
revision: String,
#[arg(long)]
tokenizer_file: Option<String>,
#[arg(long)]
weight_files: Option<String>,
/// Penalty to be applied for repeating tokens, 1. means no penalty.
#[arg(long, default_value_t = 1.1)]
repeat_penalty: f32,
/// The context size to consider for the repeat penalty.
#[arg(long, default_value_t = 64)]
repeat_last_n: usize,
#[arg(long, default_value = "0.5b")]
model: WhichModel,
}
fn main() -> Result<()> {
use tracing_chrome::ChromeLayerBuilder;
use tracing_subscriber::prelude::*;
let args = Args::parse();
let _guard = if args.tracing {
let (chrome_layer, guard) = ChromeLayerBuilder::new().build();
tracing_subscriber::registry().with(chrome_layer).init();
Some(guard)
} else {
None
};
println!(
"avx: {}, neon: {}, simd128: {}, f16c: {}",
candle::utils::with_avx(),
candle::utils::with_neon(),
candle::utils::with_simd128(),
candle::utils::with_f16c()
);
println!(
"temp: {:.2} repeat-penalty: {:.2} repeat-last-n: {}",
args.temperature.unwrap_or(0.),
args.repeat_penalty,
args.repeat_last_n
);
let start = std::time::Instant::now();
let api = Api::new()?;
let model_id = match args.model_id {
Some(model_id) => model_id,
None => {
let size = match args.model {
WhichModel::W0_5b => "0.5B",
WhichModel::W1_8b => "1.8B",
WhichModel::W4b => "4B",
WhichModel::W7b => "7B",
WhichModel::W14b => "14B",
WhichModel::W72b => "72B",
WhichModel::MoeA27b => "MoE-A2.7B",
};
format!("Qwen/Qwen1.5-{size}")
}
};
let repo = api.repo(Repo::with_revision(
model_id,
RepoType::Model,
args.revision,
));
let tokenizer_filename = match args.tokenizer_file {
Some(file) => std::path::PathBuf::from(file),
None => repo.get("tokenizer.json")?,
};
let filenames = match args.weight_files {
Some(files) => files
.split(',')
.map(std::path::PathBuf::from)
.collect::<Vec<_>>(),
None => match args.model {
WhichModel::W0_5b | WhichModel::W1_8b => vec![repo.get("model.safetensors")?],
WhichModel::W4b
| WhichModel::W7b
| WhichModel::W14b
| WhichModel::W72b
| WhichModel::MoeA27b => {
candle_examples::hub_load_safetensors(&repo, "model.safetensors.index.json")?
}
},
};
println!("retrieved the files in {:?}", start.elapsed());
let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?;
let start = std::time::Instant::now();
let config_file = repo.get("config.json")?;
let device = candle_examples::device(args.cpu)?;
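// Prefer bf16 on CUDA and fall back to f32 on CPU.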
let dtype = if device.is_cuda() {
DType::BF16
} else {
DType::F32
};
let vb = unsafe { VarBuilder::from_mmaped_safetensors(&filenames, dtype, &device)? };
let model = match args.model {
WhichModel::MoeA27b => {
let config: ConfigMoe = serde_json::from_slice(&std::fs::read(config_file)?)?;
Model::Moe(ModelMoe::new(&config, vb)?)
}
_ => {
let config: ConfigBase = serde_json::from_slice(&std::fs::read(config_file)?)?;
Model::Base(ModelBase::new(&config, vb)?)
}
};
println!("loaded the model in {:?}", start.elapsed());
let mut pipeline = TextGeneration::new(
model,
tokenizer,
args.seed,
args.temperature,
args.top_p,
args.repeat_penalty,
args.repeat_last_n,
&device,
);
pipeline.run(&args.prompt, args.sample_len)?;
Ok(())
}
# candle-reinforcement-learning
Reinforcement Learning examples for candle.
This has been tested with `gymnasium` version `0.29.1`. You can install the
Python package with:
```bash
pip install "gymnasium[accept-rom-license]"
```
In order to run the examples, use the following commands. Note the additional
`--package` flag to ensure that there is no conflict with the `candle-pyo3`
crate.
For the Policy Gradient example:
```bash
cargo run --example reinforcement-learning --features=pyo3 --package candle-examples -- pg
```
For the Deep Deterministic Policy Gradient example:
```bash
cargo run --example reinforcement-learning --features=pyo3 --package candle-examples -- ddpg
```