Initial commit

25d2752f · yongshk · 25d2752f · 25d2752f · 25d2752f · 25d2752f
Commit 25d2752f authored May 29, 2025 by yongshk
20 changed files
--- a/candle-examples/examples/custom-ops/cuda_kernels.rs
+++ b/candle-examples/examples/custom-ops/cuda_kernels.rs
+pub const LAYERNORM_KERNELS: &str = include_str!(concat!(env!("OUT_DIR"), "/layernorm_kernels.ptx"));
--- a/candle-examples/examples/custom-ops/kernels/layernorm_kernels.cu
+++ b/candle-examples/examples/custom-ops/kernels/layernorm_kernels.cu
+#include <stdint.h>
+#include "reduction_utils.cuh"
+template <typename scalar_t>
+__device__ void
+rms_norm_kernel(scalar_t *__restrict__ out,         // [num_tokens, hidden_size]
+                const scalar_t *__restrict__ input, // [num_tokens, hidden_size]
+                const float epsilon, const uint32_t num_tokens,
+                const uint32_t hidden_size) {
+  __shared__ float s_variance;
+  float variance = 0.0f;
+  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
+    const float x = (float)input[blockIdx.x * hidden_size + idx];
+    variance += x * x;
+  }
+  variance = blockReduceSum<float>(variance);
+  if (threadIdx.x == 0) {
+    s_variance = rsqrtf(variance / hidden_size + epsilon);
+  }
+  __syncthreads();
+  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
+    float x = (float)input[blockIdx.x * hidden_size + idx];
+    out[blockIdx.x * hidden_size + idx] = ((scalar_t)(x * s_variance));
+  }
+}
+extern "C" __global__ void rms_f32(
+    float *__restrict__ out,         // [num_tokens, hidden_size]
+    const float *__restrict__ input, // [num_tokens, hidden_size]
+    const float epsilon, const uint32_t num_tokens,
+    const uint32_t hidden_size) {
+  rms_norm_kernel(out, input, epsilon, num_tokens, hidden_size);
+}
--- a/candle-examples/examples/custom-ops/kernels/reduction_utils.cuh
+++ b/candle-examples/examples/custom-ops/kernels/reduction_utils.cuh
+/*
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/reduce_kernel_utils.cuh
+ * Copyright (c) 2023, The vLLM team.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+template <typename T> __inline__ __device__ T warpReduceSum(T val) {
+#pragma unroll
+  for (int mask = 16; mask > 0; mask >>= 1)
+    val += __shfl_xor_sync(0xffffffff, val, mask, 32);
+  return val;
+}
+/* Calculate the sum of all elements in a block */
+template <typename T> __inline__ __device__ T blockReduceSum(T val) {
+  static __shared__ T shared[32];
+  int lane = threadIdx.x & 0x1f;
+  int wid = threadIdx.x >> 5;
+  val = warpReduceSum<T>(val);
+  if (lane == 0)
+    shared[wid] = val;
+  __syncthreads();
+  // Modify from blockDim.x << 5 to blockDim.x / 32. to prevent
+  // blockDim.x is not divided by 32
+  val = (threadIdx.x < (blockDim.x / 32.f)) ? shared[lane] : (T)(0.0f);
+  val = warpReduceSum<T>(val);
+  return val;
+}
--- a/candle-examples/examples/custom-ops/main.rs
+++ b/candle-examples/examples/custom-ops/main.rs
+// This example illustrates how to implement custom operations. These operations can provide their
+// own forward pass (CPU and GPU versions) as well as their backward pass.
+//
+// In this example we add the RMS normalization operation and implement it for f32.
+#[cfg(any(feature = "mkl", feature = "mkl-dynamic"))]
+extern crate intel_mkl_src;
+#[rustfmt::skip]
+#[cfg(feature = "cuda")]
+mod cuda_kernels;
+use clap::Parser;
+use candle::{CpuStorage, CustomOp1, Layout, Result, Shape, Tensor};
+#[derive(Parser, Debug)]
+#[command(author, version, about, long_about = None)]
+struct Args {
+    /// Run on CPU rather than on GPU.
+    #[arg(long)]
+    cpu: bool,
+}
+struct LayerNorm {
+    eps: f32,
+}
+impl CustomOp1 for LayerNorm {
+    fn name(&self) -> &'static str {
+        "layer-norm"
+    }
+    fn cpu_fwd(&self, storage: &CpuStorage, layout: &Layout) -> Result<(CpuStorage, Shape)> {
+        let (dim1, dim2) = layout.shape().dims2()?;
+        let slice = storage.as_slice::<f32>()?;
+        let src = match layout.contiguous_offsets() {
+            None => candle::bail!("input has to be contiguous"),
+            Some((o1, o2)) => &slice[o1..o2],
+        };
+        let mut dst = Vec::with_capacity(dim1 * dim2);
+        for idx1 in 0..dim1 {
+            let src = &src[idx1 * dim2..(idx1 + 1) * dim2];
+            let variance = src.iter().map(|x| x * x).sum::<f32>();
+            let s_variance = 1f32 / (variance / dim2 as f32 + self.eps).sqrt();
+            dst.extend(src.iter().map(|x| x * s_variance))
+        }
+        let storage = candle::WithDType::to_cpu_storage_owned(dst);
+        Ok((storage, layout.shape().clone()))
+    }
+    #[cfg(feature = "cuda")]
+    fn cuda_fwd(
+        &self,
+        storage: &candle::CudaStorage,
+        layout: &Layout,
+    ) -> Result<(candle::CudaStorage, Shape)> {
+        use candle::backend::BackendStorage;
+        use candle::cuda_backend::cudarc::driver::{LaunchAsync, LaunchConfig};
+        use candle::cuda_backend::WrapErr;
+        let (d1, d2) = layout.shape().dims2()?;
+        let d1 = d1 as u32;
+        let d2 = d2 as u32;
+        let dev = storage.device().clone();
+        let slice = storage.as_cuda_slice::<f32>()?;
+        let slice = match layout.contiguous_offsets() {
+            None => candle::bail!("input has to be contiguous"),
+            Some((o1, o2)) => slice.slice(o1..o2),
+        };
+        let elem_count = layout.shape().elem_count();
+        let dst = unsafe { dev.alloc::<f32>(elem_count) }.w()?;
+        let func = dev.get_or_load_func("rms_f32", cuda_kernels::LAYERNORM_KERNELS)?;
+        let params = (&dst, &slice, self.eps, d1, d2);
+        let cfg = LaunchConfig {
+            grid_dim: (d1, 1, 1),
+            block_dim: (d2, 1, 1),
+            shared_mem_bytes: 0,
+        };
+        unsafe { func.launch(cfg, params) }.w()?;
+        let dst = candle::CudaStorage::wrap_cuda_slice(dst, dev);
+        Ok((dst, layout.shape().clone()))
+    }
+}
+fn main() -> anyhow::Result<()> {
+    let args = Args::parse();
+    let device = candle_examples::device(args.cpu)?;
+    let t = Tensor::arange(0f32, 14f32, &device)?.reshape((2, 7))?;
+    println!("{t}");
+    let t = t.apply_op1(LayerNorm { eps: 1e-5 })?;
+    println!("{t}");
+    Ok(())
+}
--- a/candle-examples/examples/dinov2/README.md
+++ b/candle-examples/examples/dinov2/README.md
+# candle-dinov2
+[DINOv2](https://github.com/facebookresearch/dinov2) is a computer vision model.
+In this example, it is used as an ImageNet classifier: the model returns the
+probability for the image to belong to each of the 1000 ImageNet categories.
+## Running some example
+```bash
+cargo run --example dinov2 --release -- --image candle-examples/examples/yolo-v8/assets/bike.jpg
+> mountain bike, all-terrain bike, off-roader: 43.67%
+> bicycle-built-for-two, tandem bicycle, tandem: 33.20%
+> crash helmet            : 13.23%
+> unicycle, monocycle     : 2.44%
+> maillot                 : 2.42%
+```
+![Leading group, Giro d'Italia 2021](../yolo-v8/assets/bike.jpg)
--- a/candle-examples/examples/dinov2/main.rs
+++ b/candle-examples/examples/dinov2/main.rs
+//! DINOv2: Learning Robust Visual Features without Supervision
+//! https://github.com/facebookresearch/dinov2
+#[cfg(any(feature = "mkl", feature = "mkl-dynamic"))]
+extern crate intel_mkl_src;
+#[cfg(feature = "accelerate")]
+extern crate accelerate_src;
+use clap::Parser;
+use candle::{DType, IndexOp, D};
+use candle_nn::{Module, VarBuilder};
+use candle_transformers::models::dinov2;
+#[derive(Parser)]
+struct Args {
+    #[arg(long)]
+    model: Option<String>,
+    #[arg(long)]
+    image: String,
+    /// Run on CPU rather than on GPU.
+    #[arg(long)]
+    cpu: bool,
+}
+pub fn main() -> anyhow::Result<()> {
+    let args = Args::parse();
+    let device = candle_examples::device(args.cpu)?;
+    let image = candle_examples::imagenet::load_image224(args.image)?.to_device(&device)?;
+    println!("loaded image {image:?}");
+    let model_file = match args.model {
+        None => {
+            let api = hf_hub::api::sync::Api::new()?;
+            let api = api.model("lmz/candle-dino-v2".into());
+            api.get("dinov2_vits14.safetensors")?
+        }
+        Some(model) => model.into(),
+    };
+    let vb = unsafe { VarBuilder::from_mmaped_safetensors(&[model_file], DType::F32, &device)? };
+    let model = dinov2::vit_small(vb)?;
+    println!("model built");
+    let logits = model.forward(&image.unsqueeze(0)?)?;
+    let prs = candle_nn::ops::softmax(&logits, D::Minus1)?
+        .i(0)?
+        .to_vec1::<f32>()?;
+    let mut prs = prs.iter().enumerate().collect::<Vec<_>>();
+    prs.sort_by(|(_, p1), (_, p2)| p2.total_cmp(p1));
+    for &(category_idx, pr) in prs.iter().take(5) {
+        println!(
+            "{:24}: {:.2}%",
+            candle_examples::imagenet::CLASSES[category_idx],
+            100. * pr
+        );
+    }
+    Ok(())
+}
--- a/candle-examples/examples/distilbert/README.md
+++ b/candle-examples/examples/distilbert/README.md
+# candle-distilbert
+DistilBert is a distiled version of the Bert model.
+## Sentence embeddings
+DistilBert is used to compute the sentence embeddings for a prompt. The model weights
+are downloaded from the hub on the first run.
+```bash
+cargo run --example distilbert --release -- --prompt "Here is a test sentence"
+> [[[ 0.5109,  0.1280, -0.2635, ...,  0.3462, -1.0434,  0.1441],
+>   [ 0.1735,  0.0818, -0.5549, ...,  0.3472, -0.8264, -0.0244],
+>   [ 0.0702, -0.1311, -0.4914, ...,  0.3483, -0.6194,  0.1829],
+>   ...
+>   [ 0.2993, -0.0106, -0.4640, ...,  0.2844, -0.6732,  0.0042],
+>   [ 0.1066, -0.0081, -0.4299, ...,  0.3435, -0.7729,  0.0190],
+>   [ 0.8903,  0.2055, -0.2541, ...,  0.3208, -0.6585,  0.0586]]]
+> Tensor[[1, 7, 768], f32]
+```
--- a/candle-examples/examples/distilbert/main.rs
+++ b/candle-examples/examples/distilbert/main.rs
+#[cfg(feature = "mkl")]
+extern crate intel_mkl_src;
+#[cfg(feature = "accelerate")]
+extern crate accelerate_src;
+use candle_transformers::models::distilbert::{Config, DistilBertModel, DTYPE};
+use anyhow::{Error as E, Result};
+use candle::{Device, Tensor};
+use candle_nn::VarBuilder;
+use clap::Parser;
+use hf_hub::{api::sync::Api, Repo, RepoType};
+use tokenizers::Tokenizer;
+#[derive(Parser, Debug)]
+#[command(author, version, about, long_about = None)]
+struct Args {
+    /// Run on CPU rather than on GPU.
+    #[arg(long)]
+    cpu: bool,
+    /// Enable tracing (generates a trace-timestamp.json file).
+    #[arg(long)]
+    tracing: bool,
+    /// The model to use, check out available models: https://huggingface.co/models?library=sentence-transformers&sort=trending
+    #[arg(long)]
+    model_id: Option<String>,
+    #[arg(long)]
+    revision: Option<String>,
+    /// When set, compute embeddings for this prompt.
+    #[arg(long)]
+    prompt: String,
+    /// Use the pytorch weights rather than the safetensors ones
+    #[arg(long)]
+    use_pth: bool,
+    /// The number of times to run the prompt.
+    #[arg(long, default_value = "1")]
+    n: usize,
+    /// L2 normalization for embeddings.
+    #[arg(long, default_value = "true")]
+    normalize_embeddings: bool,
+}
+impl Args {
+    fn build_model_and_tokenizer(&self) -> Result<(DistilBertModel, Tokenizer)> {
+        let device = candle_examples::device(self.cpu)?;
+        let default_model = "distilbert-base-uncased".to_string();
+        let default_revision = "main".to_string();
+        let (model_id, revision) = match (self.model_id.to_owned(), self.revision.to_owned()) {
+            (Some(model_id), Some(revision)) => (model_id, revision),
+            (Some(model_id), None) => (model_id, "main".to_string()),
+            (None, Some(revision)) => (default_model, revision),
+            (None, None) => (default_model, default_revision),
+        };
+        let repo = Repo::with_revision(model_id, RepoType::Model, revision);
+        let (config_filename, tokenizer_filename, weights_filename) = {
+            let api = Api::new()?;
+            let api = api.repo(repo);
+            let config = api.get("config.json")?;
+            let tokenizer = api.get("tokenizer.json")?;
+            let weights = if self.use_pth {
+                api.get("pytorch_model.bin")?
+            } else {
+                api.get("model.safetensors")?
+            };
+            (config, tokenizer, weights)
+        };
+        let config = std::fs::read_to_string(config_filename)?;
+        let config: Config = serde_json::from_str(&config)?;
+        let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?;
+        let vb = if self.use_pth {
+            VarBuilder::from_pth(&weights_filename, DTYPE, &device)?
+        } else {
+            unsafe { VarBuilder::from_mmaped_safetensors(&[weights_filename], DTYPE, &device)? }
+        };
+        let model = DistilBertModel::load(vb, &config)?;
+        Ok((model, tokenizer))
+    }
+}
+fn get_mask(size: usize, device: &Device) -> Tensor {
+    let mask: Vec<_> = (0..size)
+        .flat_map(|i| (0..size).map(move |j| u8::from(j > i)))
+        .collect();
+    Tensor::from_slice(&mask, (size, size), device).unwrap()
+}
+fn main() -> Result<()> {
+    use tracing_chrome::ChromeLayerBuilder;
+    use tracing_subscriber::prelude::*;
+    let args = Args::parse();
+    let _guard = if args.tracing {
+        println!("tracing...");
+        let (chrome_layer, guard) = ChromeLayerBuilder::new().build();
+        tracing_subscriber::registry().with(chrome_layer).init();
+        Some(guard)
+    } else {
+        None
+    };
+    let (model, mut tokenizer) = args.build_model_and_tokenizer()?;
+    let device = &model.device;
+    let tokenizer = tokenizer
+        .with_padding(None)
+        .with_truncation(None)
+        .map_err(E::msg)?;
+    let tokens = tokenizer
+        .encode(args.prompt, true)
+        .map_err(E::msg)?
+        .get_ids()
+        .to_vec();
+    let token_ids = Tensor::new(&tokens[..], device)?.unsqueeze(0)?;
+    let mask = get_mask(tokens.len(), device);
+    println!("token_ids: {:?}", token_ids.to_vec2::<u32>());
+    println!("mask: {:?}", mask.to_vec2::<u8>());
+    let ys = model.forward(&token_ids, &mask)?;
+    println!("{ys}");
+    Ok(())
+}
+pub fn normalize_l2(v: &Tensor) -> Result<Tensor> {
+    Ok(v.broadcast_div(&v.sqr()?.sum_keepdim(1)?.sqrt()?)?)
+}
--- a/candle-examples/examples/efficientnet/main.rs
+++ b/candle-examples/examples/efficientnet/main.rs
+//! EfficientNet implementation.
+//!
+//! https://arxiv.org/abs/1905.11946
+#[cfg(any(feature = "mkl", feature = "mkl-dynamic"))]
+extern crate intel_mkl_src;
+#[cfg(feature = "accelerate")]
+extern crate accelerate_src;
+use candle::{DType, IndexOp, D};
+use candle_nn::{Module, VarBuilder};
+use candle_transformers::models::efficientnet::{EfficientNet, MBConvConfig};
+use clap::{Parser, ValueEnum};
+#[derive(Clone, Copy, Debug, ValueEnum)]
+enum Which {
+    B0,
+    B1,
+    B2,
+    B3,
+    B4,
+    B5,
+    B6,
+    B7,
+}
+#[derive(Parser)]
+struct Args {
+    #[arg(long)]
+    model: Option<String>,
+    #[arg(long)]
+    image: String,
+    /// Run on CPU rather than on GPU.
+    #[arg(long)]
+    cpu: bool,
+    /// Variant of the model to use.
+    #[arg(value_enum, long, default_value_t = Which::B2)]
+    which: Which,
+}
+pub fn main() -> anyhow::Result<()> {
+    let args = Args::parse();
+    let device = candle_examples::device(args.cpu)?;
+    let image = candle_examples::imagenet::load_image224(args.image)?.to_device(&device)?;
+    println!("loaded image {image:?}");
+    let model_file = match args.model {
+        None => {
+            let api = hf_hub::api::sync::Api::new()?;
+            let api = api.model("lmz/candle-efficientnet".into());
+            let filename = match args.which {
+                Which::B0 => "efficientnet-b0.safetensors",
+                Which::B1 => "efficientnet-b1.safetensors",
+                Which::B2 => "efficientnet-b2.safetensors",
+                Which::B3 => "efficientnet-b3.safetensors",
+                Which::B4 => "efficientnet-b4.safetensors",
+                Which::B5 => "efficientnet-b5.safetensors",
+                Which::B6 => "efficientnet-b6.safetensors",
+                Which::B7 => "efficientnet-b7.safetensors",
+            };
+            api.get(filename)?
+        }
+        Some(model) => model.into(),
+    };
+    let vb = unsafe { VarBuilder::from_mmaped_safetensors(&[model_file], DType::F32, &device)? };
+    let cfg = match args.which {
+        Which::B0 => MBConvConfig::b0(),
+        Which::B1 => MBConvConfig::b1(),
+        Which::B2 => MBConvConfig::b2(),
+        Which::B3 => MBConvConfig::b3(),
+        Which::B4 => MBConvConfig::b4(),
+        Which::B5 => MBConvConfig::b5(),
+        Which::B6 => MBConvConfig::b6(),
+        Which::B7 => MBConvConfig::b7(),
+    };
+    let model = EfficientNet::new(vb, cfg, candle_examples::imagenet::CLASS_COUNT as usize)?;
+    println!("model built");
+    let logits = model.forward(&image.unsqueeze(0)?)?;
+    let prs = candle_nn::ops::softmax(&logits, D::Minus1)?
+        .i(0)?
+        .to_vec1::<f32>()?;
+    let mut prs = prs.iter().enumerate().collect::<Vec<_>>();
+    prs.sort_by(|(_, p1), (_, p2)| p2.total_cmp(p1));
+    for &(category_idx, pr) in prs.iter().take(5) {
+        println!(
+            "{:24}: {:.2}%",
+            candle_examples::imagenet::CLASSES[category_idx],
+            100. * pr
+        );
+    }
+    Ok(())
+}
--- a/candle-examples/examples/efficientvit/README.md
+++ b/candle-examples/examples/efficientvit/README.md
+# candle-efficientvit
+[EfﬁcientViT: Memory Efﬁcient Vision Transformer with Cascaded Group Attention](https://arxiv.org/abs/2305.07027).
+This candle implementation uses a pre-trained EfficientViT (from Microsoft Research Asia) network for inference.
+The classification head has been trained on the ImageNet dataset and returns the probabilities for the top-5 classes.
+## Running an example
+```
+$ cargo run --example efficientvit --release -- --image candle-examples/examples/yolo-v8/assets/bike.jpg --which m1
+loaded image Tensor[dims 3, 224, 224; f32]
+model built
+mountain bike, all-terrain bike, off-roader: 69.80%
+unicycle, monocycle     : 13.03%
+bicycle-built-for-two, tandem bicycle, tandem: 9.28%
+crash helmet            : 2.25%
+alp                     : 0.46%
+```
--- a/candle-examples/examples/efficientvit/main.rs
+++ b/candle-examples/examples/efficientvit/main.rs
+#[cfg(feature = "mkl")]
+extern crate intel_mkl_src;
+#[cfg(feature = "accelerate")]
+extern crate accelerate_src;
+use clap::{Parser, ValueEnum};
+use candle::{DType, IndexOp, D};
+use candle_nn::{Module, VarBuilder};
+use candle_transformers::models::efficientvit;
+#[derive(Clone, Copy, Debug, ValueEnum)]
+enum Which {
+    M0,
+    M1,
+    M2,
+    M3,
+    M4,
+    M5,
+}
+impl Which {
+    fn model_filename(&self) -> String {
+        let name = match self {
+            Self::M0 => "m0",
+            Self::M1 => "m1",
+            Self::M2 => "m2",
+            Self::M3 => "m3",
+            Self::M4 => "m4",
+            Self::M5 => "m5",
+        };
+        format!("timm/efficientvit_{}.r224_in1k", name)
+    }
+    fn config(&self) -> efficientvit::Config {
+        match self {
+            Self::M0 => efficientvit::Config::m0(),
+            Self::M1 => efficientvit::Config::m1(),
+            Self::M2 => efficientvit::Config::m2(),
+            Self::M3 => efficientvit::Config::m3(),
+            Self::M4 => efficientvit::Config::m4(),
+            Self::M5 => efficientvit::Config::m5(),
+        }
+    }
+}
+#[derive(Parser)]
+struct Args {
+    #[arg(long)]
+    model: Option<String>,
+    #[arg(long)]
+    image: String,
+    /// Run on CPU rather than on GPU.
+    #[arg(long)]
+    cpu: bool,
+    #[arg(value_enum, long, default_value_t=Which::M0)]
+    which: Which,
+}
+pub fn main() -> anyhow::Result<()> {
+    let args = Args::parse();
+    let device = candle_examples::device(args.cpu)?;
+    let image = candle_examples::imagenet::load_image224(args.image)?.to_device(&device)?;
+    println!("loaded image {image:?}");
+    let model_file = match args.model {
+        None => {
+            let model_name = args.which.model_filename();
+            let api = hf_hub::api::sync::Api::new()?;
+            let api = api.model(model_name);
+            api.get("model.safetensors")?
+        }
+        Some(model) => model.into(),
+    };
+    let vb = unsafe { VarBuilder::from_mmaped_safetensors(&[model_file], DType::F32, &device)? };
+    let model = efficientvit::efficientvit(&args.which.config(), 1000, vb)?;
+    println!("model built");
+    let logits = model.forward(&image.unsqueeze(0)?)?;
+    let prs = candle_nn::ops::softmax(&logits, D::Minus1)?
+        .i(0)?
+        .to_vec1::<f32>()?;
+    let mut prs = prs.iter().enumerate().collect::<Vec<_>>();
+    prs.sort_by(|(_, p1), (_, p2)| p2.total_cmp(p1));
+    for &(category_idx, pr) in prs.iter().take(5) {
+        println!(
+            "{:24}: {:.2}%",
+            candle_examples::imagenet::CLASSES[category_idx],
+            100. * pr
+        );
+    }
+    Ok(())
+}
--- a/candle-examples/examples/encodec/README.md
+++ b/candle-examples/examples/encodec/README.md
+# candle-endocec
+[EnCodec](https://huggingface.co/facebook/encodec_24khz) is a high-quality audio
+compression model using an encoder/decoder architecture with residual vector
+quantization.
+## Running one example
+```bash
+cargo run --example encodec --features symphonia --release -- code-to-audio \
+    candle-examples/examples/encodec/jfk-codes.safetensors \
+    jfk.wav
+```
+This decodes the EnCodec tokens stored in `jfk-codes.safetensors` and generates
+an output wav file containing the audio data.
+Instead of `code-to-audio` one can use:
+- `audio-to-audio in.mp3 out.wav`: encodes the input audio file then decodes it to a wav file.
+- `audio-to-code in.mp3 out.safetensors`: generates a safetensors file
+  containing EnCodec tokens for the input audio file.
+If the audio output file name is set to `-`, the audio content directly gets
+played on default audio output device. If the audio input file is set to `-`, the audio
+gets recorded from the default audio input.
--- a/candle-examples/examples/encodec/audio_io.rs
+++ b/candle-examples/examples/encodec/audio_io.rs
+#![allow(unused)]
+use anyhow::{Context, Result};
+use std::sync::{Arc, Mutex};
+pub const SAMPLE_RATE: usize = 24_000;
+pub(crate) struct AudioOutputData_ {
+    resampled_data: std::collections::VecDeque<f32>,
+    resampler: rubato::FastFixedIn<f32>,
+    output_buffer: Vec<f32>,
+    input_buffer: Vec<f32>,
+    input_len: usize,
+}
+impl AudioOutputData_ {
+    pub(crate) fn new(input_sample_rate: usize, output_sample_rate: usize) -> Result<Self> {
+        use rubato::Resampler;
+        let resampled_data = std::collections::VecDeque::with_capacity(output_sample_rate * 10);
+        let resample_ratio = output_sample_rate as f64 / input_sample_rate as f64;
+        let resampler = rubato::FastFixedIn::new(
+            resample_ratio,
+            f64::max(resample_ratio, 1.0),
+            rubato::PolynomialDegree::Septic,
+            1024,
+            1,
+        )?;
+        let input_buffer = resampler.input_buffer_allocate(true).remove(0);
+        let output_buffer = resampler.output_buffer_allocate(true).remove(0);
+        Ok(Self {
+            resampled_data,
+            resampler,
+            input_buffer,
+            output_buffer,
+            input_len: 0,
+        })
+    }
+    pub fn reset(&mut self) {
+        use rubato::Resampler;
+        self.output_buffer.fill(0.);
+        self.input_buffer.fill(0.);
+        self.resampler.reset();
+        self.resampled_data.clear();
+    }
+    pub(crate) fn take_all(&mut self) -> Vec<f32> {
+        let mut data = Vec::with_capacity(self.resampled_data.len());
+        while let Some(elem) = self.resampled_data.pop_back() {
+            data.push(elem);
+        }
+        data
+    }
+    pub(crate) fn is_empty(&self) -> bool {
+        self.resampled_data.is_empty()
+    }
+    // Assumes that the input buffer is large enough.
+    fn push_input_buffer(&mut self, samples: &[f32]) {
+        self.input_buffer[self.input_len..self.input_len + samples.len()].copy_from_slice(samples);
+        self.input_len += samples.len()
+    }
+    pub(crate) fn push_samples(&mut self, samples: &[f32]) -> Result<()> {
+        use rubato::Resampler;
+        let mut pos_in = 0;
+        loop {
+            let rem = self.input_buffer.len() - self.input_len;
+            let pos_end = usize::min(pos_in + rem, samples.len());
+            self.push_input_buffer(&samples[pos_in..pos_end]);
+            pos_in = pos_end;
+            if self.input_len < self.input_buffer.len() {
+                break;
+            }
+            let (_, out_len) = self.resampler.process_into_buffer(
+                &[&self.input_buffer],
+                &mut [&mut self.output_buffer],
+                None,
+            )?;
+            for &elem in self.output_buffer[..out_len].iter() {
+                self.resampled_data.push_front(elem)
+            }
+            self.input_len = 0;
+        }
+        Ok(())
+    }
+}
+type AudioOutputData = Arc<Mutex<AudioOutputData_>>;
+pub(crate) fn setup_output_stream() -> Result<(cpal::Stream, AudioOutputData)> {
+    use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
+    println!("Setup audio output stream!");
+    let host = cpal::default_host();
+    let device = host
+        .default_output_device()
+        .context("no output device available")?;
+    let mut supported_configs_range = device.supported_output_configs()?;
+    let config_range = match supported_configs_range.find(|c| c.channels() == 1) {
+        // On macOS, it's commonly the case that there are only stereo outputs.
+        None => device
+            .supported_output_configs()?
+            .next()
+            .context("no audio output available")?,
+        Some(config_range) => config_range,
+    };
+    let sample_rate = cpal::SampleRate(SAMPLE_RATE as u32).clamp(
+        config_range.min_sample_rate(),
+        config_range.max_sample_rate(),
+    );
+    let config: cpal::StreamConfig = config_range.with_sample_rate(sample_rate).into();
+    let channels = config.channels as usize;
+    println!(
+        "cpal device: {} {} {config:?}",
+        device.name().unwrap_or_else(|_| "unk".to_string()),
+        config.sample_rate.0
+    );
+    let audio_data = Arc::new(Mutex::new(AudioOutputData_::new(
+        SAMPLE_RATE,
+        config.sample_rate.0 as usize,
+    )?));
+    let ad = audio_data.clone();
+    let stream = device.build_output_stream(
+        &config,
+        move |data: &mut [f32], _: &cpal::OutputCallbackInfo| {
+            data.fill(0.);
+            let mut ad = ad.lock().unwrap();
+            let mut last_elem = 0f32;
+            for (idx, elem) in data.iter_mut().enumerate() {
+                if idx % channels == 0 {
+                    match ad.resampled_data.pop_back() {
+                        None => break,
+                        Some(v) => {
+                            last_elem = v;
+                            *elem = v
+                        }
+                    }
+                } else {
+                    *elem = last_elem
+                }
+            }
+        },
+        move |err| eprintln!("cpal error: {err}"),
+        None, // None=blocking, Some(Duration)=timeout
+    )?;
+    stream.play()?;
+    Ok((stream, audio_data))
+}
+pub(crate) fn setup_input_stream() -> Result<(cpal::Stream, AudioOutputData)> {
+    use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
+    println!("Setup audio input stream!");
+    let host = cpal::default_host();
+    let device = host
+        .default_input_device()
+        .context("no input device available")?;
+    let mut supported_configs_range = device.supported_input_configs()?;
+    let config_range = supported_configs_range
+        .find(|c| c.channels() == 1)
+        .context("no audio input available")?;
+    let sample_rate = cpal::SampleRate(SAMPLE_RATE as u32).clamp(
+        config_range.min_sample_rate(),
+        config_range.max_sample_rate(),
+    );
+    let config: cpal::StreamConfig = config_range.with_sample_rate(sample_rate).into();
+    println!(
+        "cpal device: {} {} {config:?}",
+        device.name().unwrap_or_else(|_| "unk".to_string()),
+        config.sample_rate.0
+    );
+    let audio_data = Arc::new(Mutex::new(AudioOutputData_::new(
+        config.sample_rate.0 as usize,
+        SAMPLE_RATE,
+    )?));
+    let ad = audio_data.clone();
+    let stream = device.build_input_stream(
+        &config,
+        move |data: &[f32], _: &cpal::InputCallbackInfo| {
+            let mut ad = ad.lock().unwrap();
+            if let Err(err) = ad.push_samples(data) {
+                eprintln!("error processing audio input {err:?}")
+            }
+        },
+        move |err| eprintln!("cpal error: {err}"),
+        None, // None=blocking, Some(Duration)=timeout
+    )?;
+    stream.play()?;
+    Ok((stream, audio_data))
+}
+fn conv<T>(samples: &mut Vec<f32>, data: std::borrow::Cow<symphonia::core::audio::AudioBuffer<T>>)
+where
+    T: symphonia::core::sample::Sample,
+    f32: symphonia::core::conv::FromSample<T>,
+{
+    use symphonia::core::audio::Signal;
+    use symphonia::core::conv::FromSample;
+    samples.extend(data.chan(0).iter().map(|v| f32::from_sample(*v)))
+}
+pub(crate) fn pcm_decode<P: AsRef<std::path::Path>>(path: P) -> Result<(Vec<f32>, u32)> {
+    use symphonia::core::audio::{AudioBufferRef, Signal};
+    let src = std::fs::File::open(path)?;
+    let mss = symphonia::core::io::MediaSourceStream::new(Box::new(src), Default::default());
+    let hint = symphonia::core::probe::Hint::new();
+    let meta_opts: symphonia::core::meta::MetadataOptions = Default::default();
+    let fmt_opts: symphonia::core::formats::FormatOptions = Default::default();
+    let probed = symphonia::default::get_probe().format(&hint, mss, &fmt_opts, &meta_opts)?;
+    let mut format = probed.format;
+    let track = format
+        .tracks()
+        .iter()
+        .find(|t| t.codec_params.codec != symphonia::core::codecs::CODEC_TYPE_NULL)
+        .expect("no supported audio tracks");
+    let mut decoder = symphonia::default::get_codecs()
+        .make(&track.codec_params, &Default::default())
+        .expect("unsupported codec");
+    let track_id = track.id;
+    let sample_rate = track.codec_params.sample_rate.unwrap_or(0);
+    let mut pcm_data = Vec::new();
+    while let Ok(packet) = format.next_packet() {
+        while !format.metadata().is_latest() {
+            format.metadata().pop();
+        }
+        if packet.track_id() != track_id {
+            continue;
+        }
+        match decoder.decode(&packet)? {
+            AudioBufferRef::F32(buf) => pcm_data.extend(buf.chan(0)),
+            AudioBufferRef::U8(data) => conv(&mut pcm_data, data),
+            AudioBufferRef::U16(data) => conv(&mut pcm_data, data),
+            AudioBufferRef::U24(data) => conv(&mut pcm_data, data),
+            AudioBufferRef::U32(data) => conv(&mut pcm_data, data),
+            AudioBufferRef::S8(data) => conv(&mut pcm_data, data),
+            AudioBufferRef::S16(data) => conv(&mut pcm_data, data),
+            AudioBufferRef::S24(data) => conv(&mut pcm_data, data),
+            AudioBufferRef::S32(data) => conv(&mut pcm_data, data),
+            AudioBufferRef::F64(data) => conv(&mut pcm_data, data),
+        }
+    }
+    Ok((pcm_data, sample_rate))
+}
+pub(crate) fn resample(pcm_in: &[f32], sr_in: usize, sr_out: usize) -> Result<Vec<f32>> {
+    use rubato::Resampler;
+    let mut pcm_out =
+        Vec::with_capacity((pcm_in.len() as f64 * sr_out as f64 / sr_in as f64) as usize + 1024);
+    let mut resampler = rubato::FftFixedInOut::<f32>::new(sr_in, sr_out, 1024, 1)?;
+    let mut output_buffer = resampler.output_buffer_allocate(true);
+    let mut pos_in = 0;
+    while pos_in + resampler.input_frames_next() < pcm_in.len() {
+        let (in_len, out_len) =
+            resampler.process_into_buffer(&[&pcm_in[pos_in..]], &mut output_buffer, None)?;
+        pos_in += in_len;
+        pcm_out.extend_from_slice(&output_buffer[0][..out_len]);
+    }
+    if pos_in < pcm_in.len() {
+        let (_in_len, out_len) = resampler.process_partial_into_buffer(
+            Some(&[&pcm_in[pos_in..]]),
+            &mut output_buffer,
+            None,
+        )?;
+        pcm_out.extend_from_slice(&output_buffer[0][..out_len]);
+    }
+    Ok(pcm_out)
+}
--- a/candle-examples/examples/encodec/jfk-codes.safetensors
+++ b/candle-examples/examples/encodec/jfk-codes.safetensors
--- a/candle-examples/examples/encodec/main.rs
+++ b/candle-examples/examples/encodec/main.rs
+#[cfg(feature = "mkl")]
+extern crate intel_mkl_src;
+#[cfg(feature = "accelerate")]
+extern crate accelerate_src;
+use anyhow::Result;
+use candle::{DType, IndexOp, Tensor};
+use candle_nn::VarBuilder;
+use candle_transformers::models::encodec::{Config, Model};
+use clap::{Parser, ValueEnum};
+use hf_hub::api::sync::Api;
+mod audio_io;
+#[derive(Clone, Debug, Copy, PartialEq, Eq, ValueEnum)]
+enum Action {
+    AudioToAudio,
+    AudioToCode,
+    CodeToAudio,
+}
+#[derive(Parser, Debug)]
+#[command(author, version, about, long_about = None)]
+struct Args {
+    /// The action to be performed, specifies the format for the input and output data.
+    action: Action,
+    /// The input file, either an audio file or some encodec tokens stored as safetensors.
+    in_file: String,
+    /// The output file, either a wave audio file or some encodec tokens stored as safetensors.
+    out_file: String,
+    /// Run on CPU rather than on GPU.
+    #[arg(long)]
+    cpu: bool,
+    /// The model weight file, in safetensor format.
+    #[arg(long)]
+    model: Option<String>,
+}
+fn main() -> Result<()> {
+    let args = Args::parse();
+    let device = candle_examples::device(args.cpu)?;
+    let model = match args.model {
+        Some(model) => std::path::PathBuf::from(model),
+        None => Api::new()?
+            .model("facebook/encodec_24khz".to_string())
+            .get("model.safetensors")?,
+    };
+    let vb = unsafe { VarBuilder::from_mmaped_safetensors(&[model], DType::F32, &device)? };
+    let config = Config::default();
+    let model = Model::new(&config, vb)?;
+    let codes = match args.action {
+        Action::CodeToAudio => {
+            let codes = candle::safetensors::load(args.in_file, &device)?;
+            codes.get("codes").expect("no codes in input file").clone()
+        }
+        Action::AudioToCode | Action::AudioToAudio => {
+            let pcm = if args.in_file == "-" {
+                println!(">>>> RECORDING AUDIO, PRESS ENTER ONCE DONE <<<<");
+                let (stream, input_audio) = audio_io::setup_input_stream()?;
+                let mut pcms = vec![];
+                let stdin = std::thread::spawn(|| {
+                    let mut s = String::new();
+                    std::io::stdin().read_line(&mut s)
+                });
+                while !stdin.is_finished() {
+                    let input = input_audio.lock().unwrap().take_all();
+                    if input.is_empty() {
+                        std::thread::sleep(std::time::Duration::from_millis(100));
+                        continue;
+                    }
+                    pcms.push(input)
+                }
+                drop(stream);
+                pcms.concat()
+            } else {
+                let (pcm, sample_rate) = audio_io::pcm_decode(args.in_file)?;
+                if sample_rate != 24_000 {
+                    println!("WARNING: encodec uses a 24khz sample rate, input uses {sample_rate}, resampling...");
+                    audio_io::resample(&pcm, sample_rate as usize, 24_000)?
+                } else {
+                    pcm
+                }
+            };
+            let pcm_len = pcm.len();
+            let pcm = Tensor::from_vec(pcm, (1, 1, pcm_len), &device)?;
+            println!("input pcm shape: {:?}", pcm.shape());
+            model.encode(&pcm)?
+        }
+    };
+    println!("codes shape: {:?}", codes.shape());
+    match args.action {
+        Action::AudioToCode => {
+            codes.save_safetensors("codes", &args.out_file)?;
+        }
+        Action::AudioToAudio | Action::CodeToAudio => {
+            let pcm = model.decode(&codes)?;
+            println!("output pcm shape: {:?}", pcm.shape());
+            let pcm = pcm.i(0)?.i(0)?;
+            let pcm = candle_examples::audio::normalize_loudness(&pcm, 24_000, true)?;
+            let pcm = pcm.to_vec1::<f32>()?;
+            if args.out_file == "-" {
+                let (stream, ad) = audio_io::setup_output_stream()?;
+                {
+                    let mut ad = ad.lock().unwrap();
+                    ad.push_samples(&pcm)?;
+                }
+                loop {
+                    let ad = ad.lock().unwrap();
+                    if ad.is_empty() {
+                        break;
+                    }
+                    // That's very weird, calling thread::sleep here triggers the stream to stop
+                    // playing (the callback doesn't seem to be called anymore).
+                    // std::thread::sleep(std::time::Duration::from_millis(100));
+                }
+                drop(stream)
+            } else {
+                let mut output = std::fs::File::create(&args.out_file)?;
+                candle_examples::wav::write_pcm_as_wav(&mut output, &pcm, 24_000)?;
+            }
+        }
+    }
+    Ok(())
+}
--- a/candle-examples/examples/falcon/README.md
+++ b/candle-examples/examples/falcon/README.md
+# candle-falcon
+Falcon is a general large language model.
--- a/candle-examples/examples/falcon/main.rs
+++ b/candle-examples/examples/falcon/main.rs
+// TODO: Add an offline mode.
+#[cfg(feature = "accelerate")]
+extern crate accelerate_src;
+#[cfg(any(feature = "mkl", feature = "mkl-dynamic"))]
+extern crate intel_mkl_src;
+use anyhow::{Error as E, Result};
+use candle::{DType, Device, Tensor};
+use candle_nn::VarBuilder;
+use candle_transformers::generation::LogitsProcessor;
+use clap::Parser;
+use hf_hub::{api::sync::Api, Repo, RepoType};
+use tokenizers::Tokenizer;
+use candle_transformers::models::falcon::{Config, Falcon};
+struct TextGeneration {
+    model: Falcon,
+    device: Device,
+    tokenizer: Tokenizer,
+    logits_processor: LogitsProcessor,
+    repeat_penalty: f32,
+    repeat_last_n: usize,
+}
+struct GenerationOptions {
+    temp: Option<f64>,
+    top_p: Option<f64>,
+    repeat_penalty: f32,
+    repeat_last_n: usize,
+}
+impl TextGeneration {
+    fn new(
+        model: Falcon,
+        tokenizer: Tokenizer,
+        generation_options: GenerationOptions,
+        seed: u64,
+        device: &Device,
+    ) -> Self {
+        let logits_processor =
+            LogitsProcessor::new(seed, generation_options.temp, generation_options.top_p);
+        let repeat_penalty = generation_options.repeat_penalty;
+        let repeat_last_n = generation_options.repeat_last_n;
+        Self {
+            model,
+            tokenizer,
+            logits_processor,
+            device: device.clone(),
+            repeat_penalty,
+            repeat_last_n,
+        }
+    }
+    fn run(&mut self, prompt: &str, sample_len: usize) -> Result<()> {
+        println!("starting the inference loop");
+        let mut tokens = self
+            .tokenizer
+            .encode(prompt, true)
+            .map_err(E::msg)?
+            .get_ids()
+            .to_vec();
+        let mut new_tokens = vec![];
+        let start_gen = std::time::Instant::now();
+        for index in 0..sample_len {
+            let start_gen = std::time::Instant::now();
+            let context_size = if self.model.config().use_cache && index > 0 {
+                1
+            } else {
+                tokens.len()
+            };
+            let ctxt = &tokens[tokens.len().saturating_sub(context_size)..];
+            let input = Tensor::new(ctxt, &self.device)?.unsqueeze(0)?;
+            let logits = self.model.forward(&input)?;
+            let logits = logits.squeeze(0)?.to_dtype(DType::F32)?;
+            let logits = if self.repeat_penalty == 1. {
+                logits
+            } else {
+                let start_at = tokens.len().saturating_sub(self.repeat_last_n);
+                candle_transformers::utils::apply_repeat_penalty(
+                    &logits,
+                    self.repeat_penalty,
+                    &tokens[start_at..],
+                )?
+            };
+            let next_token = self.logits_processor.sample(&logits)?;
+            tokens.push(next_token);
+            new_tokens.push(next_token);
+            println!("> {:?}", start_gen.elapsed());
+            println!(
+                "{} token: {} '{}'",
+                index + 1,
+                next_token,
+                self.tokenizer.decode(&[next_token], true).map_err(E::msg)?
+            );
+        }
+        let dt = start_gen.elapsed();
+        println!(
+            "{sample_len} tokens generated ({} token/s)\n----\n{}\n----",
+            sample_len as f64 / dt.as_secs_f64(),
+            self.tokenizer.decode(&new_tokens, true).map_err(E::msg)?
+        );
+        Ok(())
+    }
+}
+#[derive(Parser, Debug)]
+#[command(author, version, about, long_about = None)]
+struct Args {
+    /// Run on CPU rather than on GPU.
+    #[arg(long)]
+    cpu: bool,
+    #[arg(long)]
+    prompt: String,
+    /// Use f32 computations rather than bf16.
+    #[arg(long)]
+    use_f32: bool,
+    /// The temperature used to generate samples.
+    #[arg(long)]
+    temperature: Option<f64>,
+    /// Nucleus sampling probability cutoff.
+    #[arg(long)]
+    top_p: Option<f64>,
+    /// The seed to use when generating random samples.
+    #[arg(long, default_value_t = 299792458)]
+    seed: u64,
+    /// The length of the sample to generate (in tokens).
+    #[arg(long, default_value_t = 100)]
+    sample_len: usize,
+    #[arg(long, default_value = "tiiuae/falcon-7b")]
+    model_id: String,
+    #[arg(long, default_value = "refs/pr/43")]
+    revision: String,
+    /// Penalty to be applied for repeating tokens, 1. means no penalty.
+    #[arg(long, default_value_t = 1.0)]
+    repeat_penalty: f32,
+    /// The context size to consider for the repeat penalty.
+    #[arg(long, default_value_t = 64)]
+    repeat_last_n: usize,
+}
+fn main() -> Result<()> {
+    let args = Args::parse();
+    let device = candle_examples::device(args.cpu)?;
+    let start = std::time::Instant::now();
+    let api = Api::new()?;
+    let repo = api.repo(Repo::with_revision(
+        args.model_id,
+        RepoType::Model,
+        args.revision,
+    ));
+    let tokenizer_filename = repo.get("tokenizer.json")?;
+    let filenames = candle_examples::hub_load_safetensors(&repo, "model.safetensors.index.json")?;
+    println!("retrieved the files in {:?}", start.elapsed());
+    let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?;
+    let start = std::time::Instant::now();
+    let dtype = if args.use_f32 {
+        DType::F32
+    } else {
+        DType::BF16
+    };
+    let vb = unsafe { VarBuilder::from_mmaped_safetensors(&filenames, dtype, &device)? };
+    let config = Config::falcon7b();
+    config.validate()?;
+    let model = Falcon::load(vb, config)?;
+    println!("loaded the model in {:?}", start.elapsed());
+    let generation_options = GenerationOptions {
+        temp: args.temperature,
+        top_p: args.top_p,
+        repeat_penalty: args.repeat_penalty,
+        repeat_last_n: args.repeat_last_n,
+    };
+    let mut pipeline =
+        TextGeneration::new(model, tokenizer, generation_options, args.seed, &device);
+    pipeline.run(&args.prompt, args.sample_len)?;
+    Ok(())
+}
--- a/candle-examples/examples/gemma/README.md
+++ b/candle-examples/examples/gemma/README.md
+# candle-gemma: 2b and 7b LLMs from Google DeepMind
+[Gemma](https://ai.google.dev/gemma/docs) is a collection of lightweight open
+models published by Google Deepmind with a 2b and a 7b variant.
+In order to use the example below, you have to accept the license on the
+[HuggingFace Hub Gemma repo](https://huggingface.co/google/gemma-7b) and set up
+your access token via the [HuggingFace cli login
+command](https://huggingface.co/docs/huggingface_hub/guides/cli#huggingface-cli-login).
+## Running the example
+```bash
+$ cargo run --example gemma --release -- --prompt "fn count_primes(max_n: usize)"
+fn count_primes(max_n: usize) -> usize {
+    let mut primes = vec![true; max_n];
+    for i in 2..=max_n {
+        if primes[i] {
+            for j in i * i..max_n {
+                primes[j] = false;
+             }
+         }
+    }
+    primes.len()
+}
+```
--- a/candle-examples/examples/gemma/main.rs
+++ b/candle-examples/examples/gemma/main.rs
+#[cfg(feature = "mkl")]
+extern crate intel_mkl_src;
+#[cfg(feature = "accelerate")]
+extern crate accelerate_src;
+use anyhow::{Error as E, Result};
+use clap::Parser;
+use candle_transformers::models::gemma::{Config, Model};
+use candle::{DType, Device, Tensor};
+use candle_examples::token_output_stream::TokenOutputStream;
+use candle_nn::VarBuilder;
+use candle_transformers::generation::LogitsProcessor;
+use hf_hub::{api::sync::Api, Repo, RepoType};
+use tokenizers::Tokenizer;
+#[derive(Clone, Debug, Copy, PartialEq, Eq, clap::ValueEnum)]
+enum Which {
+    #[value(name = "2b")]
+    Base2B,
+    #[value(name = "7b")]
+    Base7B,
+    #[value(name = "2b-it")]
+    Instruct2B,
+    #[value(name = "7b-it")]
+    Instruct7B,
+    #[value(name = "1.1-2b-it")]
+    InstructV1_1_2B,
+    #[value(name = "1.1-7b-it")]
+    InstructV1_1_7B,
+}
+struct TextGeneration {
+    model: Model,
+    device: Device,
+    tokenizer: TokenOutputStream,
+    logits_processor: LogitsProcessor,
+    repeat_penalty: f32,
+    repeat_last_n: usize,
+}
+impl TextGeneration {
+    #[allow(clippy::too_many_arguments)]
+    fn new(
+        model: Model,
+        tokenizer: Tokenizer,
+        seed: u64,
+        temp: Option<f64>,
+        top_p: Option<f64>,
+        repeat_penalty: f32,
+        repeat_last_n: usize,
+        device: &Device,
+    ) -> Self {
+        let logits_processor = LogitsProcessor::new(seed, temp, top_p);
+        Self {
+            model,
+            tokenizer: TokenOutputStream::new(tokenizer),
+            logits_processor,
+            repeat_penalty,
+            repeat_last_n,
+            device: device.clone(),
+        }
+    }
+    fn run(&mut self, prompt: &str, sample_len: usize) -> Result<()> {
+        use std::io::Write;
+        self.tokenizer.clear();
+        let mut tokens = self
+            .tokenizer
+            .tokenizer()
+            .encode(prompt, true)
+            .map_err(E::msg)?
+            .get_ids()
+            .to_vec();
+        for &t in tokens.iter() {
+            if let Some(t) = self.tokenizer.next_token(t)? {
+                print!("{t}")
+            }
+        }
+        std::io::stdout().flush()?;
+        let mut generated_tokens = 0usize;
+        let eos_token = match self.tokenizer.get_token("<eos>") {
+            Some(token) => token,
+            None => anyhow::bail!("cannot find the <eos> token"),
+        };
+        let start_gen = std::time::Instant::now();
+        for index in 0..sample_len {
+            let context_size = if index > 0 { 1 } else { tokens.len() };
+            let start_pos = tokens.len().saturating_sub(context_size);
+            let ctxt = &tokens[start_pos..];
+            let input = Tensor::new(ctxt, &self.device)?.unsqueeze(0)?;
+            let logits = self.model.forward(&input, start_pos)?;
+            let logits = logits.squeeze(0)?.squeeze(0)?.to_dtype(DType::F32)?;
+            let logits = if self.repeat_penalty == 1. {
+                logits
+            } else {
+                let start_at = tokens.len().saturating_sub(self.repeat_last_n);
+                candle_transformers::utils::apply_repeat_penalty(
+                    &logits,
+                    self.repeat_penalty,
+                    &tokens[start_at..],
+                )?
+            };
+            let next_token = self.logits_processor.sample(&logits)?;
+            tokens.push(next_token);
+            generated_tokens += 1;
+            if next_token == eos_token {
+                break;
+            }
+            if let Some(t) = self.tokenizer.next_token(next_token)? {
+                print!("{t}");
+                std::io::stdout().flush()?;
+            }
+        }
+        let dt = start_gen.elapsed();
+        if let Some(rest) = self.tokenizer.decode_rest().map_err(E::msg)? {
+            print!("{rest}");
+        }
+        std::io::stdout().flush()?;
+        println!(
+            "\n{generated_tokens} tokens generated ({:.2} token/s)",
+            generated_tokens as f64 / dt.as_secs_f64(),
+        );
+        Ok(())
+    }
+}
+#[derive(Parser, Debug)]
+#[command(author, version, about, long_about = None)]
+struct Args {
+    /// Run on CPU rather than on GPU.
+    #[arg(long)]
+    cpu: bool,
+    /// Enable tracing (generates a trace-timestamp.json file).
+    #[arg(long)]
+    tracing: bool,
+    #[arg(long)]
+    prompt: String,
+    /// The temperature used to generate samples.
+    #[arg(long)]
+    temperature: Option<f64>,
+    /// Nucleus sampling probability cutoff.
+    #[arg(long)]
+    top_p: Option<f64>,
+    /// The seed to use when generating random samples.
+    #[arg(long, default_value_t = 299792458)]
+    seed: u64,
+    /// The length of the sample to generate (in tokens).
+    #[arg(long, short = 'n', default_value_t = 10000)]
+    sample_len: usize,
+    #[arg(long)]
+    model_id: Option<String>,
+    #[arg(long, default_value = "main")]
+    revision: String,
+    #[arg(long)]
+    tokenizer_file: Option<String>,
+    #[arg(long)]
+    config_file: Option<String>,
+    #[arg(long)]
+    weight_files: Option<String>,
+    /// Penalty to be applied for repeating tokens, 1. means no penalty.
+    #[arg(long, default_value_t = 1.1)]
+    repeat_penalty: f32,
+    /// The context size to consider for the repeat penalty.
+    #[arg(long, default_value_t = 64)]
+    repeat_last_n: usize,
+    /// The model to use.
+    #[arg(long, default_value = "2b")]
+    which: Which,
+}
+fn main() -> Result<()> {
+    use tracing_chrome::ChromeLayerBuilder;
+    use tracing_subscriber::prelude::*;
+    let args = Args::parse();
+    let _guard = if args.tracing {
+        let (chrome_layer, guard) = ChromeLayerBuilder::new().build();
+        tracing_subscriber::registry().with(chrome_layer).init();
+        Some(guard)
+    } else {
+        None
+    };
+    println!(
+        "avx: {}, neon: {}, simd128: {}, f16c: {}",
+        candle::utils::with_avx(),
+        candle::utils::with_neon(),
+        candle::utils::with_simd128(),
+        candle::utils::with_f16c()
+    );
+    println!(
+        "temp: {:.2} repeat-penalty: {:.2} repeat-last-n: {}",
+        args.temperature.unwrap_or(0.),
+        args.repeat_penalty,
+        args.repeat_last_n
+    );
+    let start = std::time::Instant::now();
+    let api = Api::new()?;
+    let model_id = match &args.model_id {
+        Some(model_id) => model_id.to_string(),
+        None => match args.which {
+            Which::InstructV1_1_2B => "google/gemma-1.1-2b-it".to_string(),
+            Which::InstructV1_1_7B => "google/gemma-1.1-7b-it".to_string(),
+            Which::Base2B => "google/gemma-2b".to_string(),
+            Which::Base7B => "google/gemma-7b".to_string(),
+            Which::Instruct2B => "google/gemma-2b-it".to_string(),
+            Which::Instruct7B => "google/gemma-7b-it".to_string(),
+        },
+    };
+    let repo = api.repo(Repo::with_revision(
+        model_id,
+        RepoType::Model,
+        args.revision,
+    ));
+    let tokenizer_filename = match args.tokenizer_file {
+        Some(file) => std::path::PathBuf::from(file),
+        None => repo.get("tokenizer.json")?,
+    };
+    let config_filename = match args.config_file {
+        Some(file) => std::path::PathBuf::from(file),
+        None => repo.get("config.json")?,
+    };
+    let filenames = match args.weight_files {
+        Some(files) => files
+            .split(',')
+            .map(std::path::PathBuf::from)
+            .collect::<Vec<_>>(),
+        None => candle_examples::hub_load_safetensors(&repo, "model.safetensors.index.json")?,
+    };
+    println!("retrieved the files in {:?}", start.elapsed());
+    let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?;
+    let config: Config = serde_json::from_reader(std::fs::File::open(config_filename)?)?;
+    let start = std::time::Instant::now();
+    let device = candle_examples::device(args.cpu)?;
+    let dtype = if device.is_cuda() {
+        DType::BF16
+    } else {
+        DType::F32
+    };
+    let vb = unsafe { VarBuilder::from_mmaped_safetensors(&filenames, dtype, &device)? };
+    let model = Model::new(&config, vb)?;
+    println!("loaded the model in {:?}", start.elapsed());
+    let mut pipeline = TextGeneration::new(
+        model,
+        tokenizer,
+        args.seed,
+        args.temperature,
+        args.top_p,
+        args.repeat_penalty,
+        args.repeat_last_n,
+        &device,
+    );
+    pipeline.run(&args.prompt, args.sample_len)?;
+    Ok(())
+}
--- a/candle-examples/examples/jina-bert/README.md
+++ b/candle-examples/examples/jina-bert/README.md
+# candle-jina-bert
+Jina-Bert is a general large language model with a context size of 8192, [model
+card](https://huggingface.co/jinaai/jina-embeddings-v2-base-en). In this example
+it can be used for two different tasks:
+- Compute sentence embeddings for a prompt.
+- Compute similarities between a set of sentences.
+## Sentence embeddings
+Jina-Bert is used to compute the sentence embeddings for a prompt. The model weights
+are downloaded from the hub on the first run.
+```bash
+cargo run --example jina-bert --release -- --prompt "Here is a test sentence"
+> [[[ 0.1595, -0.9885,  0.6494, ...,  0.3003, -0.6901, -1.2355],
+>   [ 0.0374, -0.1798,  1.3359, ...,  0.6731,  0.2133, -1.6807],
+>   [ 0.1700, -0.8534,  0.8924, ..., -0.1785, -0.0727, -1.5087],
+>   ...
+>   [-0.3113, -1.3665,  0.2027, ..., -0.2519,  0.1711, -1.5811],
+>   [ 0.0907, -1.0492,  0.5382, ...,  0.0242, -0.7077, -1.0830],
+>   [ 0.0369, -0.6343,  0.6105, ...,  0.0671,  0.3778, -1.1505]]]
+> Tensor[[1, 7, 768], f32]
+```
+## Similarities
+In this example, Jina-Bert is used to compute the sentence embeddings for a set of
+sentences (hardcoded in the examples). Then cosine similarities are computed for
+each sentence pair and they are reported by decreasing values, hence the first
+reported pair contains the two sentences that have the highest similarity score.
+The sentence embeddings are computed using average pooling through all the
+sentence tokens, including some potential padding.
+```bash
+cargo run --example jina-bert --release
+> score: 0.94 'The new movie is awesome' 'The new movie is so great'
+> score: 0.81 'The cat sits outside' 'The cat plays in the garden'
+> score: 0.78 'I love pasta' 'Do you like pizza?'
+> score: 0.68 'I love pasta' 'The new movie is awesome'
+> score: 0.67 'A man is playing guitar' 'A woman watches TV'
+```