"megatron/training/__init__.py" did not exist on "c2ea914f7df6e74262375bffb0e9b112ca612a00"
Commit 25d2752f authored by yongshk's avatar yongshk
Browse files

Initial commit

parents
//! Support for the GGUF file format.
//!
//! Spec: https://github.com/philpax/ggml/blob/gguf-spec/docs/gguf.md
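//!
//! A minimal usage sketch (marked `ignore`; the helper name and the metadata key
//! shown are illustrative, and the reader/device are assumed to be supplied by the
//! caller):
//!
//! ```ignore
//! // Hypothetical helper: read one tensor and a metadata value from a GGUF reader.
//! fn load_tensor<R: std::io::Read + std::io::Seek>(
//!     reader: &mut R,
//!     name: &str,
//!     device: &Device,
//! ) -> Result<QTensor> {
//!     let content = Content::read(reader)?;
//!     if let Some(arch) = content.metadata.get("general.architecture") {
//!         println!("architecture: {}", arch.to_string()?);
//!     }
//!     content.tensor(reader, name, device)
//! }
//! ```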
use super::{GgmlDType, QTensor};
use crate::{Device, Result};
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use std::collections::HashMap;
pub const DEFAULT_ALIGNMENT: u64 = 32;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Magic {
Gguf,
}
impl TryFrom<u32> for Magic {
type Error = crate::Error;
fn try_from(value: u32) -> Result<Self> {
let magic = match value {
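// b"GGUF" interpreted as a little-endian u32, plus its byte-swapped counterpart.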
0x46554747 | 0x47475546 => Self::Gguf,
_ => crate::bail!("unknown magic 0x{value:08x}"),
};
Ok(magic)
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum VersionedMagic {
GgufV1,
GgufV2,
GgufV3,
}
impl VersionedMagic {
fn read<R: std::io::Read>(reader: &mut R) -> Result<Self> {
let magic = reader.read_u32::<LittleEndian>()?;
let magic = Magic::try_from(magic)?;
let version = reader.read_u32::<LittleEndian>()?;
let versioned_magic = match (magic, version) {
(Magic::Gguf, 1) => Self::GgufV1,
(Magic::Gguf, 2) => Self::GgufV2,
(Magic::Gguf, 3) => Self::GgufV3,
_ => crate::bail!("gguf: unsupported magic/version {magic:?}/{version}"),
};
Ok(versioned_magic)
}
}
#[derive(Debug)]
pub struct TensorInfo {
pub ggml_dtype: GgmlDType,
pub shape: crate::Shape,
pub offset: u64,
}
impl TensorInfo {
pub fn read<R: std::io::Seek + std::io::Read>(
&self,
reader: &mut R,
tensor_data_offset: u64,
device: &Device,
) -> Result<QTensor> {
let tensor_elems = self.shape.elem_count();
let block_size = self.ggml_dtype.block_size();
if tensor_elems % block_size != 0 {
crate::bail!(
"the number of elements {tensor_elems} is not divisible by the block size {block_size}"
)
}
let size_in_bytes = tensor_elems / block_size * self.ggml_dtype.type_size();
let mut raw_data = vec![0u8; size_in_bytes];
reader.seek(std::io::SeekFrom::Start(tensor_data_offset + self.offset))?;
reader.read_exact(&mut raw_data)?;
super::ggml_file::qtensor_from_ggml(
self.ggml_dtype,
&raw_data,
self.shape.dims().to_vec(),
device,
)
}
}
#[derive(Debug)]
pub struct Content {
pub magic: VersionedMagic,
pub metadata: HashMap<String, Value>,
pub tensor_infos: HashMap<String, TensorInfo>,
pub tensor_data_offset: u64,
}
fn read_string<R: std::io::Read>(reader: &mut R, magic: &VersionedMagic) -> Result<String> {
let len = match magic {
VersionedMagic::GgufV1 => reader.read_u32::<LittleEndian>()? as usize,
VersionedMagic::GgufV2 | VersionedMagic::GgufV3 => {
reader.read_u64::<LittleEndian>()? as usize
}
};
let mut v = vec![0u8; len];
reader.read_exact(&mut v)?;
// GGUF strings are supposed to be non-null-terminated, but trailing null bytes do occur in practice, so strip them.
while let Some(0) = v.last() {
v.pop();
}
// GGUF strings are UTF-8 encoded, but invalid sequences occur in the wild, so decode lossily.
Ok(String::from_utf8_lossy(&v).into_owned())
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ValueType {
// The value is an 8-bit unsigned integer.
U8,
// The value is an 8-bit signed integer.
I8,
// The value is a 16-bit unsigned little-endian integer.
U16,
// The value is a 16-bit signed little-endian integer.
I16,
// The value is a 32-bit unsigned little-endian integer.
U32,
// The value is a 32-bit signed little-endian integer.
I32,
// The value is a 64-bit unsigned little-endian integer.
U64,
// The value is a 64-bit signed little-endian integer.
I64,
// The value is a 32-bit IEEE754 floating point number.
F32,
// The value is a 64-bit IEEE754 floating point number.
F64,
// The value is a boolean.
// 1-byte value where 0 is false and 1 is true.
// Anything else is invalid, and should be treated as either the model being invalid or the reader being buggy.
Bool,
// The value is a UTF-8 non-null-terminated string, with length prepended.
String,
// The value is an array of other values, with the length and type prepended.
//
// Arrays can be nested, and the length of the array is the number of elements in the array, not the number of bytes.
Array,
}
#[derive(Debug, Clone)]
pub enum Value {
U8(u8),
I8(i8),
U16(u16),
I16(i16),
U32(u32),
I32(i32),
U64(u64),
I64(i64),
F32(f32),
F64(f64),
Bool(bool),
String(String),
Array(Vec<Value>),
}
impl Value {
pub fn value_type(&self) -> ValueType {
match self {
Self::U8(_) => ValueType::U8,
Self::I8(_) => ValueType::I8,
Self::U16(_) => ValueType::U16,
Self::I16(_) => ValueType::I16,
Self::U32(_) => ValueType::U32,
Self::I32(_) => ValueType::I32,
Self::U64(_) => ValueType::U64,
Self::I64(_) => ValueType::I64,
Self::F32(_) => ValueType::F32,
Self::F64(_) => ValueType::F64,
Self::Bool(_) => ValueType::Bool,
Self::String(_) => ValueType::String,
Self::Array(_) => ValueType::Array,
}
}
pub fn to_u8(&self) -> Result<u8> {
match self {
Self::U8(v) => Ok(*v),
v => crate::bail!("not a u8 {v:?}"),
}
}
pub fn to_i8(&self) -> Result<i8> {
match self {
Self::I8(v) => Ok(*v),
v => crate::bail!("not a i8 {v:?}"),
}
}
pub fn to_u16(&self) -> Result<u16> {
match self {
Self::U16(v) => Ok(*v),
v => crate::bail!("not a u16 {v:?}"),
}
}
pub fn to_i16(&self) -> Result<i16> {
match self {
Self::I16(v) => Ok(*v),
v => crate::bail!("not a i16 {v:?}"),
}
}
pub fn to_u32(&self) -> Result<u32> {
match self {
Self::U32(v) => Ok(*v),
v => crate::bail!("not a u32 {v:?}"),
}
}
pub fn to_i32(&self) -> Result<i32> {
match self {
Self::I32(v) => Ok(*v),
v => crate::bail!("not a i32 {v:?}"),
}
}
pub fn to_u64(&self) -> Result<u64> {
match self {
Self::U64(v) => Ok(*v),
v => crate::bail!("not a u64 {v:?}"),
}
}
pub fn to_i64(&self) -> Result<i64> {
match self {
Self::I64(v) => Ok(*v),
v => crate::bail!("not a i64 {v:?}"),
}
}
pub fn to_f32(&self) -> Result<f32> {
match self {
Self::F32(v) => Ok(*v),
v => crate::bail!("not a f32 {v:?}"),
}
}
pub fn to_f64(&self) -> Result<f64> {
match self {
Self::F64(v) => Ok(*v),
v => crate::bail!("not a f64 {v:?}"),
}
}
pub fn to_bool(&self) -> Result<bool> {
match self {
Self::Bool(v) => Ok(*v),
v => crate::bail!("not a bool {v:?}"),
}
}
pub fn to_vec(&self) -> Result<&Vec<Value>> {
match self {
Self::Array(v) => Ok(v),
v => crate::bail!("not a vec {v:?}"),
}
}
pub fn to_string(&self) -> Result<&String> {
match self {
Self::String(v) => Ok(v),
v => crate::bail!("not a string {v:?}"),
}
}
fn read<R: std::io::Read>(
reader: &mut R,
value_type: ValueType,
magic: &VersionedMagic,
) -> Result<Self> {
let v = match value_type {
ValueType::U8 => Self::U8(reader.read_u8()?),
ValueType::I8 => Self::I8(reader.read_i8()?),
ValueType::U16 => Self::U16(reader.read_u16::<LittleEndian>()?),
ValueType::I16 => Self::I16(reader.read_i16::<LittleEndian>()?),
ValueType::U32 => Self::U32(reader.read_u32::<LittleEndian>()?),
ValueType::I32 => Self::I32(reader.read_i32::<LittleEndian>()?),
ValueType::U64 => Self::U64(reader.read_u64::<LittleEndian>()?),
ValueType::I64 => Self::I64(reader.read_i64::<LittleEndian>()?),
ValueType::F32 => Self::F32(reader.read_f32::<LittleEndian>()?),
ValueType::F64 => Self::F64(reader.read_f64::<LittleEndian>()?),
ValueType::Bool => match reader.read_u8()? {
0 => Self::Bool(false),
1 => Self::Bool(true),
b => crate::bail!("unexpected bool value {b}"),
},
ValueType::String => Self::String(read_string(reader, magic)?),
ValueType::Array => {
let value_type = reader.read_u32::<LittleEndian>()?;
let value_type = ValueType::from_u32(value_type)?;
let len = match magic {
VersionedMagic::GgufV1 => reader.read_u32::<LittleEndian>()? as usize,
VersionedMagic::GgufV2 | VersionedMagic::GgufV3 => {
reader.read_u64::<LittleEndian>()? as usize
}
};
let mut vs = Vec::with_capacity(len);
for _ in 0..len {
vs.push(Value::read(reader, value_type, magic)?)
}
Self::Array(vs)
}
};
Ok(v)
}
fn write<W: std::io::Write>(&self, w: &mut W) -> Result<()> {
match self {
&Self::U8(v) => w.write_u8(v)?,
&Self::I8(v) => w.write_i8(v)?,
&Self::U16(v) => w.write_u16::<LittleEndian>(v)?,
&Self::I16(v) => w.write_i16::<LittleEndian>(v)?,
&Self::U32(v) => w.write_u32::<LittleEndian>(v)?,
&Self::I32(v) => w.write_i32::<LittleEndian>(v)?,
&Self::U64(v) => w.write_u64::<LittleEndian>(v)?,
&Self::I64(v) => w.write_i64::<LittleEndian>(v)?,
&Self::F32(v) => w.write_f32::<LittleEndian>(v)?,
&Self::F64(v) => w.write_f64::<LittleEndian>(v)?,
&Self::Bool(v) => w.write_u8(u8::from(v))?,
Self::String(v) => write_string(w, v.as_str())?,
Self::Array(v) => {
// The `Value` type does not enforce that all the values in an Array have the same
// type.
let value_type = if v.is_empty() {
// Doesn't matter, the array is empty.
ValueType::U32
} else {
let value_type: std::collections::HashSet<_> =
v.iter().map(|elem| elem.value_type()).collect();
if value_type.len() != 1 {
crate::bail!("multiple value-types in the same array {value_type:?}")
}
value_type.into_iter().next().unwrap()
};
w.write_u32::<LittleEndian>(value_type.to_u32())?;
w.write_u64::<LittleEndian>(v.len() as u64)?;
for elem in v.iter() {
elem.write(w)?
}
}
}
Ok(())
}
}
impl ValueType {
fn from_u32(v: u32) -> Result<Self> {
let v = match v {
0 => Self::U8,
1 => Self::I8,
2 => Self::U16,
3 => Self::I16,
4 => Self::U32,
5 => Self::I32,
6 => Self::F32,
7 => Self::Bool,
8 => Self::String,
9 => Self::Array,
10 => Self::U64,
11 => Self::I64,
12 => Self::F64,
v => crate::bail!("unrecognized value-type {v:#08x}"),
};
Ok(v)
}
fn to_u32(self) -> u32 {
match self {
Self::U8 => 0,
Self::I8 => 1,
Self::U16 => 2,
Self::I16 => 3,
Self::U32 => 4,
Self::I32 => 5,
Self::F32 => 6,
Self::Bool => 7,
Self::String => 8,
Self::Array => 9,
Self::U64 => 10,
Self::I64 => 11,
Self::F64 => 12,
}
}
}
impl Content {
pub fn read<R: std::io::Seek + std::io::Read>(reader: &mut R) -> Result<Self> {
let magic = VersionedMagic::read(reader)?;
let tensor_count = match magic {
VersionedMagic::GgufV1 => reader.read_u32::<LittleEndian>()? as usize,
VersionedMagic::GgufV2 | VersionedMagic::GgufV3 => {
reader.read_u64::<LittleEndian>()? as usize
}
};
let metadata_kv_count = match magic {
VersionedMagic::GgufV1 => reader.read_u32::<LittleEndian>()? as usize,
VersionedMagic::GgufV2 | VersionedMagic::GgufV3 => {
reader.read_u64::<LittleEndian>()? as usize
}
};
let mut metadata = HashMap::new();
for _idx in 0..metadata_kv_count {
let key = read_string(reader, &magic)?;
let value_type = reader.read_u32::<LittleEndian>()?;
let value_type = ValueType::from_u32(value_type)?;
let value = Value::read(reader, value_type, &magic)?;
metadata.insert(key, value);
}
let mut tensor_infos = HashMap::new();
for _idx in 0..tensor_count {
let tensor_name = read_string(reader, &magic)?;
let n_dimensions = reader.read_u32::<LittleEndian>()?;
let mut dimensions: Vec<usize> = match magic {
VersionedMagic::GgufV1 => {
let mut dimensions = vec![0; n_dimensions as usize];
reader.read_u32_into::<LittleEndian>(&mut dimensions)?;
dimensions.into_iter().map(|c| c as usize).collect()
}
VersionedMagic::GgufV2 | VersionedMagic::GgufV3 => {
let mut dimensions = vec![0; n_dimensions as usize];
reader.read_u64_into::<LittleEndian>(&mut dimensions)?;
dimensions.into_iter().map(|c| c as usize).collect()
}
};
dimensions.reverse();
let ggml_dtype = reader.read_u32::<LittleEndian>()?;
let ggml_dtype = GgmlDType::from_u32(ggml_dtype)?;
let offset = reader.read_u64::<LittleEndian>()?;
tensor_infos.insert(
tensor_name,
TensorInfo {
shape: crate::Shape::from(dimensions),
offset,
ggml_dtype,
},
);
}
let position = reader.stream_position()?;
let alignment = match metadata.get("general.alignment") {
Some(Value::U8(v)) => *v as u64,
Some(Value::U16(v)) => *v as u64,
Some(Value::U32(v)) => *v as u64,
Some(Value::I8(v)) if *v >= 0 => *v as u64,
Some(Value::I16(v)) if *v >= 0 => *v as u64,
Some(Value::I32(v)) if *v >= 0 => *v as u64,
_ => DEFAULT_ALIGNMENT,
};
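// Round the current stream position up to the next multiple of the alignment;
// this is where the tensor data section starts.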
let tensor_data_offset = (position + alignment - 1) / alignment * alignment;
Ok(Self {
magic,
metadata,
tensor_infos,
tensor_data_offset,
})
}
pub fn tensor<R: std::io::Seek + std::io::Read>(
&self,
reader: &mut R,
name: &str,
device: &Device,
) -> Result<QTensor> {
let tensor_info = match self.tensor_infos.get(name) {
Some(tensor_info) => tensor_info,
None => crate::bail!("cannot find tensor info for {name}"),
};
tensor_info.read(reader, self.tensor_data_offset, device)
}
}
fn write_string<W: std::io::Write>(w: &mut W, str: &str) -> Result<()> {
let bytes = str.as_bytes();
w.write_u64::<LittleEndian>(bytes.len() as u64)?;
w.write_all(bytes)?;
Ok(())
}
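/// Writes a GGUF v2 file containing the given metadata and tensors.
///
/// A minimal sketch (marked `ignore`; it assumes the caller already has quantized
/// `QTensor` values in a `tensors: &[(&str, &QTensor)]` slice built elsewhere):
///
/// ```ignore
/// let name = Value::String("my-model".to_string());
/// let metadata = [("general.name", &name)];
/// let mut file = std::fs::File::create("out.gguf")?;
/// write(&mut file, &metadata, tensors)?;
/// ```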
pub fn write<W: std::io::Seek + std::io::Write>(
w: &mut W,
metadata: &[(&str, &Value)],
tensors: &[(&str, &QTensor)],
) -> Result<()> {
w.write_u32::<LittleEndian>(0x46554747)?;
w.write_u32::<LittleEndian>(2)?; // version 2.
w.write_u64::<LittleEndian>(tensors.len() as u64)?;
w.write_u64::<LittleEndian>(metadata.len() as u64)?;
for (name, value) in metadata.iter() {
write_string(w, name)?;
w.write_u32::<LittleEndian>(value.value_type().to_u32())?;
value.write(w)?;
}
let mut offset = 0usize;
let mut offsets = Vec::with_capacity(tensors.len());
for (name, tensor) in tensors.iter() {
write_string(w, name)?;
let dims = tensor.shape().dims();
w.write_u32::<LittleEndian>(dims.len() as u32)?;
for &dim in dims.iter().rev() {
w.write_u64::<LittleEndian>(dim as u64)?;
}
w.write_u32::<LittleEndian>(tensor.dtype().to_u32())?;
w.write_u64::<LittleEndian>(offset as u64)?;
offsets.push(offset);
let size_in_bytes = tensor.storage_size_in_bytes();
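// Pad each tensor so that the next one starts on a 32-byte boundary.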
let padding = 31 - (31 + size_in_bytes) % 32;
offset += size_in_bytes + padding;
}
let pos = w.stream_position()? as usize;
let padding = 31 - (31 + pos) % 32;
w.write_all(&vec![0u8; padding])?;
let tensor_start_pos = w.stream_position()? as usize;
for (offset, (_name, tensor)) in offsets.iter().zip(tensors.iter()) {
let pos = w.stream_position()? as usize;
if tensor_start_pos + offset != pos {
crate::bail!(
"internal error, unexpected current position {tensor_start_pos} {offset} {pos}"
)
}
let data = tensor.data()?;
let size_in_bytes = data.len();
w.write_all(&data)?;
let padding = 31 - (31 + size_in_bytes) % 32;
w.write_all(&vec![0u8; padding])?;
}
Ok(())
}
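// k-quants: block-quantized tensor formats, their (de)quantization routines, and dot-product kernels.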
use super::utils::{
get_scale_min_k4, group_for_dequantization, group_for_quantization, make_q3_quants,
make_qkx1_quants, make_qx_quants, nearest_int,
};
use super::GgmlDType;
use crate::Result;
use byteorder::{ByteOrder, LittleEndian};
use half::f16;
use rayon::prelude::*;
// Default to the QK_K super-block size of 256 rather than 64.
pub const QK_K: usize = 256;
pub const K_SCALE_SIZE: usize = 12;
pub const QK4_0: usize = 32;
pub const QK4_1: usize = 32;
pub const QK5_0: usize = 32;
pub const QK5_1: usize = 32;
pub const QK8_0: usize = 32;
pub const QK8_1: usize = 32;
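/// Common interface implemented by all the quantized block formats in this module.
///
/// A minimal quantize/dequantize round-trip sketch (marked `ignore`; the input length
/// must be a multiple of `BLCK_SIZE`, which is 32 for `BlockQ8_0`):
///
/// ```ignore
/// let xs: Vec<f32> = (0..32).map(|v| v as f32 / 10.0).collect();
/// let mut blocks = vec![BlockQ8_0::zeros(); xs.len() / BlockQ8_0::BLCK_SIZE];
/// BlockQ8_0::from_float(&xs, &mut blocks)?;
/// let mut ys = vec![0f32; xs.len()];
/// BlockQ8_0::to_float(&blocks, &mut ys)?; // ys now approximates xs
/// ```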
pub trait GgmlType: Sized + Clone + Send + Sync {
const DTYPE: GgmlDType;
const BLCK_SIZE: usize;
type VecDotType: GgmlType;
// This is only safe for plain-old-data types (floats, ints, and structs made of them) where an all-zero bit pattern is a valid value.
fn zeros() -> Self {
unsafe { std::mem::MaybeUninit::zeroed().assume_init() }
}
fn to_float(xs: &[Self], ys: &mut [f32]) -> Result<()>;
fn from_float(xs: &[f32], ys: &mut [Self]) -> Result<()>;
/// Dot product used as a building block for quantized mat-mul.
/// n is the number of elements to be considered.
fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32>;
/// Generic implementation of the dot product without SIMD optimizations.
fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32>;
}
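// Q4_0: blocks of 32 weights stored as 4-bit values packed two per byte in `qs`,
// dequantized as `y = d * (q - 8)` with `d` the per-block f16 scale.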
#[derive(Debug, Clone, PartialEq)]
#[repr(C)]
pub struct BlockQ4_0 {
pub(crate) d: f16,
pub(crate) qs: [u8; QK4_0 / 2],
}
const _: () = assert!(std::mem::size_of::<BlockQ4_0>() == 18);
#[derive(Debug, Clone, PartialEq)]
#[repr(C)]
pub struct BlockQ4_1 {
pub(crate) d: f16,
pub(crate) m: f16,
pub(crate) qs: [u8; QK4_1 / 2],
}
const _: () = assert!(std::mem::size_of::<BlockQ4_1>() == 20);
#[derive(Debug, Clone, PartialEq)]
#[repr(C)]
pub struct BlockQ5_0 {
pub(crate) d: f16,
pub(crate) qh: [u8; 4],
pub(crate) qs: [u8; QK5_0 / 2],
}
const _: () = assert!(std::mem::size_of::<BlockQ5_0>() == 22);
#[derive(Debug, Clone, PartialEq)]
#[repr(C)]
pub struct BlockQ5_1 {
pub(crate) d: f16,
pub(crate) m: f16,
pub(crate) qh: [u8; 4],
pub(crate) qs: [u8; QK5_1 / 2],
}
const _: () = assert!(std::mem::size_of::<BlockQ5_1>() == 24);
#[derive(Debug, Clone, PartialEq)]
#[repr(C)]
pub struct BlockQ8_0 {
pub(crate) d: f16,
pub(crate) qs: [i8; QK8_0],
}
const _: () = assert!(std::mem::size_of::<BlockQ8_0>() == 34);
#[derive(Debug, Clone, PartialEq)]
#[repr(C)]
pub struct BlockQ8_1 {
pub(crate) d: f16,
pub(crate) s: f16,
pub(crate) qs: [i8; QK8_1],
}
const _: () = assert!(std::mem::size_of::<BlockQ8_1>() == 36);
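// The K-quant formats below operate on super-blocks of QK_K = 256 weights.
// Q2K: 2 bits per weight in `qs`, with 16 sub-blocks of 16 weights, each carrying
// a 4-bit scale and a 4-bit min packed into `scales` and scaled by `d` / `dmin`.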
#[derive(Debug, Clone, PartialEq)]
#[repr(C)]
pub struct BlockQ2K {
pub(crate) scales: [u8; QK_K / 16],
pub(crate) qs: [u8; QK_K / 4],
pub(crate) d: f16,
pub(crate) dmin: f16,
}
const _: () = assert!(QK_K / 16 + QK_K / 4 + 2 * 2 == std::mem::size_of::<BlockQ2K>());
#[derive(Debug, Clone, PartialEq)]
#[repr(C)]
pub struct BlockQ3K {
pub(crate) hmask: [u8; QK_K / 8],
pub(crate) qs: [u8; QK_K / 4],
pub(crate) scales: [u8; 12],
pub(crate) d: f16,
}
const _: () = assert!(QK_K / 8 + QK_K / 4 + 12 + 2 == std::mem::size_of::<BlockQ3K>());
#[derive(Debug, Clone, PartialEq)]
// https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/k_quants.h#L82
#[repr(C)]
pub struct BlockQ4K {
pub(crate) d: f16,
pub(crate) dmin: f16,
pub(crate) scales: [u8; K_SCALE_SIZE],
pub(crate) qs: [u8; QK_K / 2],
}
const _: () = assert!(QK_K / 2 + K_SCALE_SIZE + 2 * 2 == std::mem::size_of::<BlockQ4K>());
#[derive(Debug, Clone, PartialEq)]
#[repr(C)]
pub struct BlockQ5K {
pub(crate) d: f16,
pub(crate) dmin: f16,
pub(crate) scales: [u8; K_SCALE_SIZE],
pub(crate) qh: [u8; QK_K / 8],
pub(crate) qs: [u8; QK_K / 2],
}
const _: () =
assert!(QK_K / 8 + QK_K / 2 + 2 * 2 + K_SCALE_SIZE == std::mem::size_of::<BlockQ5K>());
#[derive(Debug, Clone, PartialEq)]
#[repr(C)]
pub struct BlockQ6K {
pub(crate) ql: [u8; QK_K / 2],
pub(crate) qh: [u8; QK_K / 4],
pub(crate) scales: [i8; QK_K / 16],
pub(crate) d: f16,
}
const _: () = assert!(3 * QK_K / 4 + QK_K / 16 + 2 == std::mem::size_of::<BlockQ6K>());
#[derive(Debug, Clone, PartialEq)]
#[repr(C)]
pub struct BlockQ8K {
pub(crate) d: f32,
pub(crate) qs: [i8; QK_K],
pub(crate) bsums: [i16; QK_K / 16],
}
const _: () = assert!(4 + QK_K + QK_K / 16 * 2 == std::mem::size_of::<BlockQ8K>());
impl GgmlType for BlockQ4_0 {
const DTYPE: GgmlDType = GgmlDType::Q4_0;
const BLCK_SIZE: usize = QK4_0;
type VecDotType = BlockQ8_0;
// https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/ggml.c#L1525
fn to_float(xs: &[Self], ys: &mut [f32]) -> Result<()> {
let k = ys.len();
let qk = Self::BLCK_SIZE;
if k % qk != 0 {
crate::bail!("dequantize_row_q4_0: {k} is not divisible by {qk}")
}
let nb = k / qk;
for i in 0..nb {
let d = xs[i].d.to_f32();
for j in 0..(qk / 2) {
let x0 = (xs[i].qs[j] & 0x0F) as i16 - 8;
let x1 = (xs[i].qs[j] >> 4) as i16 - 8;
ys[i * qk + j] = (x0 as f32) * d;
ys[i * qk + j + qk / 2] = (x1 as f32) * d;
}
}
Ok(())
}
fn from_float(xs: &[f32], ys: &mut [Self]) -> Result<()> {
// quantize_row_q4_0
let qk = Self::BLCK_SIZE;
let k = xs.len();
if k % qk != 0 {
crate::bail!("{k} is not divisible by {}", qk);
};
let nb = k / qk;
if ys.len() != nb {
crate::bail!("size mismatch {} {} {}", xs.len(), ys.len(), qk,)
}
for (i, ys) in ys.iter_mut().enumerate() {
let mut amax = 0f32;
let mut max = 0f32;
let xs = &xs[i * qk..(i + 1) * qk];
for &x in xs.iter() {
if amax < x.abs() {
amax = x.abs();
max = x;
}
}
let d = max / -8.0;
let id = if d != 0f32 { 1. / d } else { 0. };
ys.d = f16::from_f32(d);
for (j, q) in ys.qs.iter_mut().enumerate() {
let x0 = xs[j] * id;
let x1 = xs[qk / 2 + j] * id;
let xi0 = u8::min(15, (x0 + 8.5) as u8);
let xi1 = u8::min(15, (x1 + 8.5) as u8);
*q = xi0 | (xi1 << 4)
}
}
Ok(())
}
// https://github.com/ggerganov/llama.cpp/blob/b5ffb2849d23afe73647f68eec7b68187af09be6/ggml.c#L2361C10-L2361C122
#[allow(unreachable_code)]
fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
#[cfg(target_feature = "avx")]
return super::avx::vec_dot_q4_0_q8_0(n, xs, ys);
#[cfg(target_feature = "neon")]
return super::neon::vec_dot_q4_0_q8_0(n, xs, ys);
#[cfg(target_feature = "simd128")]
return super::simd128::vec_dot_q4_0_q8_0(n, xs, ys);
Self::vec_dot_unopt(n, xs, ys)
}
fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
let qk = QK8_0;
if n % QK8_0 != 0 {
crate::bail!("vec_dot_q4_0_q8_0: {n} is not divisible by {qk}")
}
// Generic implementation.
let mut sumf = 0f32;
for (xs, ys) in xs.iter().zip(ys.iter()) {
let mut sum_i = 0;
for j in 0..qk / 2 {
let v0 = (xs.qs[j] & 0x0F) as i32 - 8;
let v1 = (xs.qs[j] >> 4) as i32 - 8;
sum_i += v0 * ys.qs[j] as i32 + v1 * ys.qs[j + qk / 2] as i32
}
sumf += sum_i as f32 * f16::to_f32(xs.d) * f16::to_f32(ys.d)
}
Ok(sumf)
}
}
impl GgmlType for BlockQ4_1 {
const DTYPE: GgmlDType = GgmlDType::Q4_1;
const BLCK_SIZE: usize = QK4_1;
type VecDotType = BlockQ8_1;
fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
Self::vec_dot_unopt(n, xs, ys)
}
fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
// ggml_vec_dot_q4_1_q8_1
let qk = QK8_1;
if n % qk != 0 {
crate::bail!("vec_dot_q4_1_q8_1: {n} is not divisible by {qk}")
}
let nb = n / qk;
if nb % 2 != 0 {
crate::bail!("vec_dot_q4_1_q8_1: {n}, nb is not divisible by 2")
}
// Generic implementation.
let mut sumf = 0f32;
for (xs, ys) in xs.iter().zip(ys.iter()) {
let mut sumi = 0i32;
for j in 0..qk / 2 {
let v0 = xs.qs[j] as i32 & 0x0F;
let v1 = xs.qs[j] as i32 >> 4;
sumi += (v0 * ys.qs[j] as i32) + (v1 * ys.qs[j + qk / 2] as i32);
}
sumf += sumi as f32 * f16::to_f32(xs.d) * f16::to_f32(ys.d)
+ f16::to_f32(xs.m) * f16::to_f32(ys.s)
}
Ok(sumf)
}
fn from_float(xs: &[f32], ys: &mut [Self]) -> Result<()> {
// quantize_row_q4_1
let qk = Self::BLCK_SIZE;
if ys.len() * qk != xs.len() {
crate::bail!("size mismatch {} {} {}", xs.len(), ys.len(), qk,)
}
for (i, ys) in ys.iter_mut().enumerate() {
let xs = &xs[i * qk..(i + 1) * qk];
let mut min = f32::INFINITY;
let mut max = f32::NEG_INFINITY;
for &x in xs.iter() {
min = f32::min(x, min);
max = f32::max(x, max);
}
let d = (max - min) / ((1 << 4) - 1) as f32;
let id = if d != 0f32 { 1. / d } else { 0. };
ys.d = f16::from_f32(d);
ys.m = f16::from_f32(min);
for (j, q) in ys.qs.iter_mut().take(qk / 2).enumerate() {
let x0 = (xs[j] - min) * id;
let x1 = (xs[qk / 2 + j] - min) * id;
let xi0 = u8::min(15, (x0 + 0.5) as u8);
let xi1 = u8::min(15, (x1 + 0.5) as u8);
*q = xi0 | (xi1 << 4);
}
}
Ok(())
}
// https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/ggml.c#L1545
fn to_float(xs: &[Self], ys: &mut [f32]) -> Result<()> {
let k = ys.len();
if k % QK4_1 != 0 {
crate::bail!("dequantize_row_q4_1: {k} is not divisible by {QK4_1}");
}
let nb = k / QK4_1;
for i in 0..nb {
let d = xs[i].d.to_f32();
let m = xs[i].m.to_f32();
for j in 0..(QK4_1 / 2) {
let x0 = xs[i].qs[j] & 0x0F;
let x1 = xs[i].qs[j] >> 4;
ys[i * QK4_1 + j] = (x0 as f32) * d + m;
ys[i * QK4_1 + j + QK4_1 / 2] = (x1 as f32) * d + m;
}
}
Ok(())
}
}
impl GgmlType for BlockQ5_0 {
const DTYPE: GgmlDType = GgmlDType::Q5_0;
const BLCK_SIZE: usize = QK5_0;
type VecDotType = BlockQ8_0;
fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
let qk = Self::BLCK_SIZE;
if n % Self::BLCK_SIZE != 0 {
crate::bail!("vec_dot_q5_0_q8_0: {n} is not divisible by {qk}")
}
let nb = n / qk;
if nb % 2 != 0 {
crate::bail!("vec_dot_q5_0_q8_0: {n}, nb is not divisible by 2")
}
Self::vec_dot_unopt(n, xs, ys)
}
fn vec_dot_unopt(_n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
// Generic implementation.
let mut sumf = 0f32;
for (xs, ys) in xs.iter().zip(ys.iter()) {
let qh = LittleEndian::read_u32(&xs.qh);
let mut sumi = 0i32;
for j in 0..Self::BLCK_SIZE / 2 {
let xh_0 = (((qh & (1u32 << j)) >> j) << 4) as u8;
let xh_1 = ((qh & (1u32 << (j + 16))) >> (j + 12)) as u8;
let x0 = ((xs.qs[j] & 0x0F) as i32 | xh_0 as i32) - 16;
let x1 = ((xs.qs[j] >> 4) as i32 | xh_1 as i32) - 16;
sumi += (x0 * ys.qs[j] as i32) + (x1 * ys.qs[j + Self::BLCK_SIZE / 2] as i32);
}
sumf += sumi as f32 * f16::to_f32(xs.d) * f16::to_f32(ys.d)
}
Ok(sumf)
}
fn from_float(xs: &[f32], ys: &mut [Self]) -> Result<()> {
// quantize_row_q5_0
let k = xs.len();
if ys.len() * Self::BLCK_SIZE != k {
crate::bail!("size mismatch {k} {} {}", ys.len(), Self::BLCK_SIZE)
}
for (i, ys) in ys.iter_mut().enumerate() {
let xs = &xs[i * Self::BLCK_SIZE..(i + 1) * Self::BLCK_SIZE];
let mut amax = 0f32;
let mut max = 0f32;
for &x in xs.iter() {
if amax < x.abs() {
amax = x.abs();
max = x;
}
}
let d = max / -16.;
let id = if d != 0f32 { 1. / d } else { 0. };
ys.d = f16::from_f32(d);
let mut qh = 0u32;
for j in 0..Self::BLCK_SIZE / 2 {
let x0 = xs[j] * id;
let x1 = xs[j + Self::BLCK_SIZE / 2] * id;
let xi0 = ((x0 + 16.5) as i8).min(31) as u8;
let xi1 = ((x1 + 16.5) as i8).min(31) as u8;
ys.qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
qh |= ((xi0 as u32 & 0x10) >> 4) << j;
qh |= ((xi1 as u32 & 0x10) >> 4) << (j + Self::BLCK_SIZE / 2);
}
LittleEndian::write_u32(&mut ys.qh, qh)
}
Ok(())
}
// https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/ggml.c#L1566
fn to_float(xs: &[Self], ys: &mut [f32]) -> Result<()> {
let k = ys.len();
if k % QK5_0 != 0 {
crate::bail!("dequantize_row_q5_0: {k} is not divisible by {QK5_0}");
}
let nb = k / QK5_0;
for i in 0..nb {
let d = xs[i].d.to_f32();
let qh: u32 = LittleEndian::read_u32(&xs[i].qh);
for j in 0..(QK5_0 / 2) {
let xh_0 = (((qh >> j) << 4) & 0x10) as u8;
let xh_1 = ((qh >> (j + 12)) & 0x10) as u8;
let x0 = ((xs[i].qs[j] & 0x0F) | xh_0) as i32 - 16;
let x1 = ((xs[i].qs[j] >> 4) | xh_1) as i32 - 16;
ys[i * QK5_0 + j] = (x0 as f32) * d;
ys[i * QK5_0 + j + QK5_0 / 2] = (x1 as f32) * d;
}
}
Ok(())
}
}
impl GgmlType for BlockQ5_1 {
const DTYPE: GgmlDType = GgmlDType::Q5_1;
const BLCK_SIZE: usize = QK5_1;
type VecDotType = BlockQ8_1;
fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
Self::vec_dot_unopt(n, xs, ys)
}
fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
let qk = Self::BLCK_SIZE;
if n % Self::BLCK_SIZE != 0 {
crate::bail!("vec_dot_q5_1_q8_1: {n} is not divisible by {qk}")
}
let nb = n / qk;
if nb % 2 != 0 {
crate::bail!("vec_dot_q5_1_q8_1: {n}, nb is not divisible by 2")
}
// Generic implementation.
let mut sumf = 0f32;
for (xs, ys) in xs.iter().zip(ys.iter()) {
let qh = LittleEndian::read_u32(&xs.qh);
let mut sumi = 0i32;
for j in 0..Self::BLCK_SIZE / 2 {
let xh_0 = ((qh >> j) << 4) & 0x10;
let xh_1 = (qh >> (j + 12)) & 0x10;
let x0 = (xs.qs[j] as i32 & 0xF) | xh_0 as i32;
let x1 = (xs.qs[j] as i32 >> 4) | xh_1 as i32;
sumi += (x0 * ys.qs[j] as i32) + (x1 * ys.qs[j + Self::BLCK_SIZE / 2] as i32);
}
sumf += sumi as f32 * f16::to_f32(xs.d) * f16::to_f32(ys.d)
+ f16::to_f32(xs.m) * f16::to_f32(ys.s)
}
Ok(sumf)
}
fn from_float(xs: &[f32], ys: &mut [Self]) -> Result<()> {
// quantize_row_q5_1
let qk = Self::BLCK_SIZE;
if ys.len() * qk != xs.len() {
crate::bail!("size mismatch {} {} {}", xs.len(), ys.len(), qk,)
}
for (i, ys) in ys.iter_mut().enumerate() {
let xs = &xs[i * qk..(i + 1) * qk];
let mut min = f32::INFINITY;
let mut max = f32::NEG_INFINITY;
for &x in xs.iter() {
min = f32::min(x, min);
max = f32::max(x, max);
}
let d = (max - min) / ((1 << 5) - 1) as f32;
let id = if d != 0f32 { 1. / d } else { 0. };
ys.d = f16::from_f32(d);
ys.m = f16::from_f32(min);
let mut qh = 0u32;
for (j, q) in ys.qs.iter_mut().take(qk / 2).enumerate() {
let x0 = (xs[j] - min) * id;
let x1 = (xs[qk / 2 + j] - min) * id;
let xi0 = (x0 + 0.5) as u8;
let xi1 = (x1 + 0.5) as u8;
*q = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
// Get the 5th bit and store it in qh at the right position.
qh |= ((xi0 as u32 & 0x10) >> 4) << j;
qh |= ((xi1 as u32 & 0x10) >> 4) << (j + qk / 2);
}
LittleEndian::write_u32(&mut ys.qh, qh);
}
Ok(())
}
// https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/ggml.c#L1592
fn to_float(xs: &[Self], ys: &mut [f32]) -> Result<()> {
let k = ys.len();
if k % QK5_1 != 0 {
crate::bail!("dequantize_row_q5_1: {k} is not divisible by {QK5_1}");
}
let nb = k / QK5_1;
for i in 0..nb {
let d = xs[i].d.to_f32();
let m = xs[i].m.to_f32();
let qh: u32 = LittleEndian::read_u32(&xs[i].qh);
for j in 0..(QK5_1 / 2) {
let xh_0 = (((qh >> j) << 4) & 0x10) as u8;
let xh_1 = ((qh >> (j + 12)) & 0x10) as u8;
let x0 = (xs[i].qs[j] & 0x0F) | xh_0;
let x1 = (xs[i].qs[j] >> 4) | xh_1;
ys[i * QK5_1 + j] = (x0 as f32) * d + m;
ys[i * QK5_1 + j + QK5_1 / 2] = (x1 as f32) * d + m;
}
}
Ok(())
}
}
impl GgmlType for BlockQ8_0 {
const DTYPE: GgmlDType = GgmlDType::Q8_0;
const BLCK_SIZE: usize = QK8_0;
type VecDotType = BlockQ8_0;
// https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/ggml.c#L1619
fn to_float(xs: &[Self], ys: &mut [f32]) -> Result<()> {
let k = ys.len();
if k % QK8_0 != 0 {
crate::bail!("dequantize_row_q8_0: {k} is not divisible by {QK8_0}");
}
let nb = k / QK8_0;
for i in 0..nb {
let d = xs[i].d.to_f32();
for j in 0..QK8_0 {
ys[i * QK8_0 + j] = xs[i].qs[j] as f32 * d;
}
}
Ok(())
}
fn from_float(xs: &[f32], ys: &mut [Self]) -> Result<()> {
// quantize_row_q8_0
let k = xs.len();
if k % Self::BLCK_SIZE != 0 {
crate::bail!("{k} is not divisible by {}", Self::BLCK_SIZE);
};
let nb = k / Self::BLCK_SIZE;
if ys.len() != nb {
crate::bail!(
"size mismatch {} {} {}",
xs.len(),
ys.len(),
Self::BLCK_SIZE
)
}
for (i, ys) in ys.iter_mut().enumerate() {
let mut amax = 0f32;
let xs = &xs[i * Self::BLCK_SIZE..(i + 1) * Self::BLCK_SIZE];
for &x in xs.iter() {
amax = amax.max(x.abs())
}
let d = amax / ((1 << 7) - 1) as f32;
let id = if d != 0f32 { 1. / d } else { 0. };
ys.d = f16::from_f32(d);
for (y, &x) in ys.qs.iter_mut().zip(xs.iter()) {
*y = f32::round(x * id) as i8
}
}
Ok(())
}
#[allow(unreachable_code)]
fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
#[cfg(target_feature = "avx")]
return super::avx::vec_dot_q8_0_q8_0(n, xs, ys);
#[cfg(target_feature = "neon")]
return super::neon::vec_dot_q8_0_q8_0(n, xs, ys);
#[cfg(target_feature = "simd128")]
return super::simd128::vec_dot_q8_0_q8_0(n, xs, ys);
Self::vec_dot_unopt(n, xs, ys)
}
fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
let qk = QK8_0;
if n % QK8_0 != 0 {
crate::bail!("vec_dot_q8_0_q8_0: {n} is not divisible by {qk}")
}
// Generic implementation.
let mut sumf = 0f32;
for (xs, ys) in xs.iter().zip(ys.iter()) {
let sum_i = xs
.qs
.iter()
.zip(ys.qs.iter())
.map(|(&x, &y)| x as i32 * y as i32)
.sum::<i32>();
sumf += sum_i as f32 * f16::to_f32(xs.d) * f16::to_f32(ys.d)
}
Ok(sumf)
}
}
impl GgmlType for BlockQ8_1 {
const DTYPE: GgmlDType = GgmlDType::Q8_1;
const BLCK_SIZE: usize = QK8_1;
type VecDotType = BlockQ8_1;
fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
Self::vec_dot_unopt(n, xs, ys)
}
fn vec_dot_unopt(_n: usize, _xs: &[Self], _ys: &[Self::VecDotType]) -> Result<f32> {
unimplemented!("no support for vec-dot on Q8_1")
}
fn from_float(xs: &[f32], ys: &mut [Self]) -> Result<()> {
// quantize_row_q8_1
let k = xs.len();
if ys.len() * Self::BLCK_SIZE != k {
crate::bail!("size mismatch {k} {} {}", ys.len(), Self::BLCK_SIZE)
}
for (i, ys) in ys.iter_mut().enumerate() {
let mut amax = 0f32;
let xs = &xs[i * Self::BLCK_SIZE..(i + 1) * Self::BLCK_SIZE];
for &x in xs.iter() {
amax = amax.max(x.abs())
}
let d = amax / ((1 << 7) - 1) as f32;
let id = if d != 0f32 { 1. / d } else { 0. };
ys.d = f16::from_f32(d);
let mut sum = 0i32;
for j in 0..Self::BLCK_SIZE / 2 {
let v0 = xs[j] * id;
let v1 = xs[j + Self::BLCK_SIZE / 2] * id;
ys.qs[j] = f32::round(v0) as i8;
ys.qs[j + Self::BLCK_SIZE / 2] = f32::round(v1) as i8;
sum += ys.qs[j] as i32 + ys.qs[j + Self::BLCK_SIZE / 2] as i32;
}
ys.s = f16::from_f32(sum as f32) * ys.d;
}
Ok(())
}
fn to_float(_xs: &[Self], _ys: &mut [f32]) -> Result<()> {
unimplemented!("no support for vec-dot on Q8_1")
}
}
impl GgmlType for BlockQ2K {
const DTYPE: GgmlDType = GgmlDType::Q2K;
const BLCK_SIZE: usize = QK_K;
type VecDotType = BlockQ8K;
#[allow(unreachable_code)]
fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
#[cfg(target_feature = "avx")]
return super::avx::vec_dot_q2k_q8k(n, xs, ys);
#[cfg(target_feature = "neon")]
return super::neon::vec_dot_q2k_q8k(n, xs, ys);
#[cfg(target_feature = "simd128")]
return super::simd128::vec_dot_q2k_q8k(n, xs, ys);
Self::vec_dot_unopt(n, xs, ys)
}
fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
if n % QK_K != 0 {
crate::bail!("vec_dot_q2k_q8k: {n} is not divisible by {QK_K}")
}
let mut sumf = 0.0;
for (x, y) in xs.iter().zip(ys.iter()) {
let mut q2: &[_] = &x.qs;
let mut q8: &[_] = &y.qs;
let sc = &x.scales;
let mut summs = 0;
for (bsum, scale) in y.bsums.iter().zip(sc) {
summs += *bsum as i32 * ((scale >> 4) as i32);
}
let dall = y.d * x.d.to_f32();
let dmin = y.d * x.dmin.to_f32();
let mut isum = 0;
let mut is = 0;
for _ in 0..(QK_K / 128) {
let mut shift = 0;
for _ in 0..4 {
let d = (sc[is] & 0xF) as i32;
is += 1;
let mut isuml = 0;
for l in 0..16 {
isuml += q8[l] as i32 * (((q2[l] >> shift) & 3) as i32);
}
isum += d * isuml;
let d = (sc[is] & 0xF) as i32;
is += 1;
isuml = 0;
for l in 16..32 {
isuml += q8[l] as i32 * (((q2[l] >> shift) & 3) as i32);
}
isum += d * isuml;
shift += 2;
// adjust the indexing
q8 = &q8[32..];
}
// adjust the indexing
q2 = &q2[32..];
}
sumf += dall * isum as f32 - dmin * summs as f32;
}
Ok(sumf)
}
// https://github.com/ggerganov/llama.cpp/blob/8183159cf3def112f6d1fe94815fce70e1bffa12/k_quants.c#L279
fn from_float(xs: &[f32], ys: &mut [Self]) -> Result<()> {
const Q4SCALE: f32 = 15.0;
for (block, x) in group_for_quantization(xs, ys)? {
// Calculate scales and mins.
let mut mins: [f32; QK_K / 16] = [0.0; QK_K / 16];
let mut scales: [f32; QK_K / 16] = [0.0; QK_K / 16];
for (j, x_scale_slice) in x.chunks(16).enumerate() {
(scales[j], mins[j]) = make_qkx1_quants(3, 5, x_scale_slice);
}
// get max scale and max min and ensure they are >= 0.0
let max_scale = scales.iter().fold(0.0, |max, &val| val.max(max));
let max_min = mins.iter().fold(0.0, |max, &val| val.max(max));
if max_scale > 0.0 {
let iscale = Q4SCALE / max_scale;
for (j, scale) in scales.iter().enumerate().take(QK_K / 16) {
block.scales[j] = nearest_int(iscale * scale) as u8;
}
block.d = f16::from_f32(max_scale / Q4SCALE);
} else {
for j in 0..QK_K / 16 {
block.scales[j] = 0;
}
block.d = f16::from_f32(0.0);
}
if max_min > 0.0 {
let iscale = Q4SCALE / max_min;
for (j, scale) in block.scales.iter_mut().enumerate() {
let l = nearest_int(iscale * mins[j]) as u8;
*scale |= l << 4;
}
block.dmin = f16::from_f32(max_min / Q4SCALE);
} else {
block.dmin = f16::from_f32(0.0);
}
let mut big_l: [u8; QK_K] = [0; QK_K];
for j in 0..QK_K / 16 {
let d = block.d.to_f32() * (block.scales[j] & 0xF) as f32;
if d == 0.0 {
continue;
}
let dm = block.dmin.to_f32() * (block.scales[j] >> 4) as f32;
for ii in 0..16 {
let ll = nearest_int((x[16 * j + ii] + dm) / d).clamp(0, 3);
big_l[16 * j + ii] = ll as u8;
}
}
for j in (0..QK_K).step_by(128) {
for ll in 0..32 {
block.qs[j / 4 + ll] = big_l[j + ll]
| (big_l[j + ll + 32] << 2)
| (big_l[j + ll + 64] << 4)
| (big_l[j + ll + 96] << 6);
}
}
}
Ok(())
}
// https://github.com/ggerganov/llama.cpp/blob/8183159cf3def112f6d1fe94815fce70e1bffa12/k_quants.c#L354
fn to_float(xs: &[Self], ys: &mut [f32]) -> Result<()> {
for (block, y) in group_for_dequantization(xs, ys)? {
let d = block.d.to_f32();
let min = block.dmin.to_f32();
let mut is = 0;
for (y_block, qs) in y.chunks_exact_mut(128).zip(block.qs.chunks_exact(32)) {
// Step by 32 over q.
let mut shift = 0;
let mut y_block_index = 0;
for _j in 0..4 {
let sc = block.scales[is];
is += 1;
let dl = d * (sc & 0xF) as f32;
let ml = min * (sc >> 4) as f32;
for q in &qs[..16] {
let y = dl * ((q >> shift) & 3) as f32 - ml;
y_block[y_block_index] = y;
y_block_index += 1;
}
let sc = block.scales[is];
is += 1;
let dl = d * (sc & 0xF) as f32;
let ml = min * (sc >> 4) as f32;
for q in &qs[16..] {
let y = dl * ((q >> shift) & 3) as f32 - ml;
y_block[y_block_index] = y;
y_block_index += 1;
}
shift += 2;
}
}
}
Ok(())
}
}
impl GgmlType for BlockQ3K {
const DTYPE: GgmlDType = GgmlDType::Q3K;
const BLCK_SIZE: usize = QK_K;
type VecDotType = BlockQ8K;
#[allow(unreachable_code)]
fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
#[cfg(target_feature = "avx")]
return super::avx::vec_dot_q3k_q8k(n, xs, ys);
#[cfg(target_feature = "neon")]
return super::neon::vec_dot_q3k_q8k(n, xs, ys);
Self::vec_dot_unopt(n, xs, ys)
}
fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
if n % QK_K != 0 {
crate::bail!("vec_dot_q3k_q8k: {n} is not divisible by {QK_K}")
}
const KMASK1: u32 = 0x03030303;
const KMASK2: u32 = 0x0f0f0f0f;
let mut aux8: [i8; QK_K] = [0; QK_K];
let mut aux16: [i16; 8] = [0; 8];
let mut sums: [f32; 8] = [0.0; 8];
let mut aux32: [i32; 8] = [0; 8];
let mut auxs: [u32; 4] = [0; 4];
for (x, y) in xs.iter().zip(ys.iter()) {
let mut q3: &[u8] = &x.qs;
let hmask: &[u8] = &x.hmask;
let mut q8: &[i8] = &y.qs;
aux32.fill(0);
let mut a = &mut aux8[..];
let mut m = 1;
// Like the GGML original, this is written this way to enable the compiler to vectorize it.
for _ in 0..QK_K / 128 {
a.iter_mut()
.take(32)
.zip(q3)
.for_each(|(a_val, q3_val)| *a_val = (q3_val & 3) as i8);
a.iter_mut()
.take(32)
.zip(hmask)
.for_each(|(a_val, hmask_val)| {
*a_val -= if hmask_val & m != 0 { 0 } else { 4 }
});
a = &mut a[32..];
m <<= 1;
a.iter_mut()
.take(32)
.zip(q3)
.for_each(|(a_val, q3_val)| *a_val = ((q3_val >> 2) & 3) as i8);
a.iter_mut()
.take(32)
.zip(hmask)
.for_each(|(a_val, hmask_val)| {
*a_val -= if hmask_val & m != 0 { 0 } else { 4 }
});
a = &mut a[32..];
m <<= 1;
a.iter_mut()
.take(32)
.zip(q3)
.for_each(|(a_val, q3_val)| *a_val = ((q3_val >> 4) & 3) as i8);
a.iter_mut()
.take(32)
.zip(hmask)
.for_each(|(a_val, hmask_val)| {
*a_val -= if hmask_val & m != 0 { 0 } else { 4 }
});
a = &mut a[32..];
m <<= 1;
a.iter_mut()
.take(32)
.zip(q3)
.for_each(|(a_val, q3_val)| *a_val = ((q3_val >> 6) & 3) as i8);
a.iter_mut()
.take(32)
.zip(hmask)
.for_each(|(a_val, hmask_val)| {
*a_val -= if hmask_val & m != 0 { 0 } else { 4 }
});
a = &mut a[32..];
m <<= 1;
q3 = &q3[32..];
}
a = &mut aux8[..];
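// Unpack the 12 bytes of packed 6-bit scales into 16 signed scale values
// (same layout as reconstructed in to_float below).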
LittleEndian::read_u32_into(&x.scales, &mut auxs[0..3]);
let tmp = auxs[2];
auxs[2] = ((auxs[0] >> 4) & KMASK2) | (((tmp >> 4) & KMASK1) << 4);
auxs[3] = ((auxs[1] >> 4) & KMASK2) | (((tmp >> 6) & KMASK1) << 4);
auxs[0] = (auxs[0] & KMASK2) | (((tmp) & KMASK1) << 4);
auxs[1] = (auxs[1] & KMASK2) | (((tmp >> 2) & KMASK1) << 4);
for aux in auxs {
for scale in aux.to_le_bytes() {
let scale = i8::from_be_bytes([scale]);
for l in 0..8 {
aux16[l] = q8[l] as i16 * a[l] as i16;
}
for l in 0..8 {
aux32[l] += (scale as i32 - 32) * aux16[l] as i32;
}
q8 = &q8[8..];
a = &mut a[8..];
for l in 0..8 {
aux16[l] = q8[l] as i16 * a[l] as i16;
}
for l in 0..8 {
aux32[l] += (scale as i32 - 32) * aux16[l] as i32;
}
q8 = &q8[8..];
a = &mut a[8..];
}
}
let d = x.d.to_f32() * y.d;
for l in 0..8 {
sums[l] += d * aux32[l] as f32;
}
}
Ok(sums.iter().sum())
}
fn from_float(xs: &[f32], ys: &mut [Self]) -> Result<()> {
for (block, x) in group_for_quantization(xs, ys)? {
let mut scales: [f32; QK_K / 16] = [0.0; QK_K / 16];
for (j, x_scale_slice) in x.chunks_exact(16).enumerate() {
scales[j] = make_q3_quants(x_scale_slice, 4, true);
}
// Get max scale by absolute value.
let mut max_scale: f32 = 0.0;
for &scale in scales.iter() {
if scale.abs() > max_scale.abs() {
max_scale = scale;
}
}
block.scales.fill(0);
if max_scale != 0.0 {
let iscale = -32.0 / max_scale;
for (j, scale) in scales.iter().enumerate() {
let l_val = nearest_int(iscale * scale);
let l_val = l_val.clamp(-32, 31) + 32;
if j < 8 {
block.scales[j] = (l_val & 0xF) as u8;
} else {
block.scales[j - 8] |= ((l_val & 0xF) << 4) as u8;
}
let l_val = l_val >> 4;
block.scales[j % 4 + 8] |= (l_val << (2 * (j / 4))) as u8;
}
block.d = f16::from_f32(1.0 / iscale);
} else {
block.d = f16::from_f32(0.0);
}
let mut l: [i8; QK_K] = [0; QK_K];
for j in 0..QK_K / 16 {
let sc = if j < 8 {
block.scales[j] & 0xF
} else {
block.scales[j - 8] >> 4
};
let sc = (sc | (((block.scales[8 + j % 4] >> (2 * (j / 4))) & 3) << 4)) as i8 - 32;
let d = block.d.to_f32() * sc as f32;
if d != 0.0 {
for ii in 0..16 {
let l_val = nearest_int(x[16 * j + ii] / d);
l[16 * j + ii] = (l_val.clamp(-4, 3) + 4) as i8;
}
}
}
block.hmask.fill(0);
let mut m = 0;
let mut hm = 1;
for ll in l.iter_mut() {
if *ll > 3 {
block.hmask[m] |= hm;
*ll -= 4;
}
m += 1;
if m == QK_K / 8 {
m = 0;
hm <<= 1;
}
}
for j in (0..QK_K).step_by(128) {
for l_val in 0..32 {
block.qs[j / 4 + l_val] = (l[j + l_val]
| (l[j + l_val + 32] << 2)
| (l[j + l_val + 64] << 4)
| (l[j + l_val + 96] << 6))
as u8;
}
}
}
Ok(())
}
// https://github.com/ggerganov/llama.cpp/blob/8183159cf3def112f6d1fe94815fce70e1bffa12/k_quants.c#L533
fn to_float(xs: &[Self], ys: &mut [f32]) -> Result<()> {
const KMASK1: u32 = 0x03030303;
const KMASK2: u32 = 0x0f0f0f0f;
for (block, y) in group_for_dequantization(xs, ys)? {
// Reconstruct the scales.
let mut aux = [0; 4];
LittleEndian::read_u32_into(&block.scales, &mut aux[0..3]);
let tmp = aux[2];
aux[2] = ((aux[0] >> 4) & KMASK2) | (((tmp >> 4) & KMASK1) << 4);
aux[3] = ((aux[1] >> 4) & KMASK2) | (((tmp >> 6) & KMASK1) << 4);
aux[0] = (aux[0] & KMASK2) | (((tmp) & KMASK1) << 4);
aux[1] = (aux[1] & KMASK2) | (((tmp >> 2) & KMASK1) << 4);
// Transfer the scales into an i8 array.
let scales: &mut [i8] =
unsafe { std::slice::from_raw_parts_mut(aux.as_mut_ptr() as *mut i8, 16) };
let d_all = block.d.to_f32();
let mut m = 1;
let mut is = 0;
// Dequantize both 128-element halves of the super-block.
// Each half uses 32 qs bytes, and every 16 elements share one scale.
for (y, qs) in y.chunks_exact_mut(128).zip(block.qs.chunks_exact(32)) {
let mut shift = 0;
for shift_scoped_y in y.chunks_exact_mut(32) {
for (scale_index, scale_scoped_y) in
shift_scoped_y.chunks_exact_mut(16).enumerate()
{
let dl = d_all * (scales[is] as f32 - 32.0);
for (i, inner_y) in scale_scoped_y.iter_mut().enumerate() {
let new_y = dl
* (((qs[i + 16 * scale_index] >> shift) & 3) as i8
- if (block.hmask[i + 16 * scale_index] & m) == 0 {
4
} else {
0
}) as f32;
*inner_y = new_y;
}
// 16-element sub-block finished => advance the scale index.
is += 1;
}
// 32-element block finished => increase shift and m.
shift += 2;
m <<= 1;
}
}
}
Ok(())
}
}
impl GgmlType for BlockQ4K {
const DTYPE: GgmlDType = GgmlDType::Q4K;
const BLCK_SIZE: usize = QK_K;
type VecDotType = BlockQ8K;
#[allow(unreachable_code)]
fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
#[cfg(target_feature = "avx")]
return super::avx::vec_dot_q4k_q8k(n, xs, ys);
#[cfg(target_feature = "neon")]
return super::neon::vec_dot_q4k_q8k(n, xs, ys);
#[cfg(target_feature = "simd128")]
return super::simd128::vec_dot_q4k_q8k(n, xs, ys);
Self::vec_dot_unopt(n, xs, ys)
}
fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
if n % QK_K != 0 {
crate::bail!("vec_dot_q4k_q8k: {n} is not divisible by {QK_K}")
}
const KMASK1: u32 = 0x3f3f3f3f;
const KMASK2: u32 = 0x0f0f0f0f;
const KMASK3: u32 = 0x03030303;
let mut utmp: [u32; 4] = [0; 4];
let mut scales: [u8; 8] = [0; 8];
let mut mins: [u8; 8] = [0; 8];
let mut aux8: [i8; QK_K] = [0; QK_K];
let mut aux16: [i16; 8] = [0; 8];
let mut sums: [f32; 8] = [0.0; 8];
let mut aux32: [i32; 8] = [0; 8];
let mut sumf = 0.0;
for (y, x) in ys.iter().zip(xs.iter()) {
let q4 = &x.qs;
let q8 = &y.qs;
aux32.fill(0);
let mut a = &mut aux8[..];
let mut q4 = &q4[..];
for _ in 0..QK_K / 64 {
for l in 0..32 {
a[l] = (q4[l] & 0xF) as i8;
}
a = &mut a[32..];
for l in 0..32 {
a[l] = (q4[l] >> 4) as i8;
}
a = &mut a[32..];
q4 = &q4[32..];
}
LittleEndian::read_u32_into(&x.scales, &mut utmp[0..3]);
utmp[3] = ((utmp[2] >> 4) & KMASK2) | (((utmp[1] >> 6) & KMASK3) << 4);
let uaux = utmp[1] & KMASK1;
utmp[1] = (utmp[2] & KMASK2) | (((utmp[0] >> 6) & KMASK3) << 4);
utmp[2] = uaux;
utmp[0] &= KMASK1;
// Extract scales and mins.
LittleEndian::write_u32_into(&utmp[0..2], &mut scales);
LittleEndian::write_u32_into(&utmp[2..4], &mut mins);
let mut sumi = 0;
for j in 0..QK_K / 16 {
sumi += y.bsums[j] as i32 * mins[j / 2] as i32;
}
let mut a = &mut aux8[..];
let mut q8 = &q8[..];
for scale in scales {
let scale = scale as i32;
for _ in 0..4 {
for l in 0..8 {
aux16[l] = q8[l] as i16 * a[l] as i16;
}
for l in 0..8 {
aux32[l] += scale * aux16[l] as i32;
}
q8 = &q8[8..];
a = &mut a[8..];
}
}
let d = x.d.to_f32() * y.d;
for l in 0..8 {
sums[l] += d * aux32[l] as f32;
}
let dmin = x.dmin.to_f32() * y.d;
sumf -= dmin * sumi as f32;
}
Ok(sumf + sums.iter().sum::<f32>())
}
fn from_float(xs: &[f32], ys: &mut [Self]) -> Result<()> {
for (block, x) in group_for_quantization(xs, ys)? {
let mut mins: [f32; QK_K / 32] = [0.0; QK_K / 32];
let mut scales: [f32; QK_K / 32] = [0.0; QK_K / 32];
for (j, x_scale_slice) in x.chunks_exact(32).enumerate() {
(scales[j], mins[j]) = make_qkx1_quants(15, 5, x_scale_slice);
}
// get max scale and max min and ensure they are >= 0.0
let max_scale = scales.iter().fold(0.0, |max, &val| val.max(max));
let max_min = mins.iter().fold(0.0, |max, &val| val.max(max));
let inv_scale = if max_scale > 0.0 {
63.0 / max_scale
} else {
0.0
};
let inv_min = if max_min > 0.0 { 63.0 / max_min } else { 0.0 };
for j in 0..QK_K / 32 {
let ls = nearest_int(inv_scale * scales[j]).min(63) as u8;
let lm = nearest_int(inv_min * mins[j]).min(63) as u8;
if j < 4 {
block.scales[j] = ls;
block.scales[j + 4] = lm;
} else {
block.scales[j + 4] = (ls & 0xF) | ((lm & 0xF) << 4);
block.scales[j - 4] |= (ls >> 4) << 6;
block.scales[j] |= (lm >> 4) << 6;
}
}
block.d = f16::from_f32(max_scale / 63.0);
block.dmin = f16::from_f32(max_min / 63.0);
let mut l: [u8; QK_K] = [0; QK_K];
for j in 0..QK_K / 32 {
let (sc, m) = get_scale_min_k4(j, &block.scales);
let d = block.d.to_f32() * sc as f32;
if d != 0.0 {
let dm = block.dmin.to_f32() * m as f32;
for ii in 0..32 {
let l_val = nearest_int((x[32 * j + ii] + dm) / d);
l[32 * j + ii] = l_val.clamp(0, 15) as u8;
}
}
}
let q = &mut block.qs;
for j in (0..QK_K).step_by(64) {
for l_val in 0..32 {
let offset_index = (j / 64) * 32 + l_val;
q[offset_index] = l[j + l_val] | (l[j + l_val + 32] << 4);
}
}
}
Ok(())
}
// https://github.com/ggerganov/llama.cpp/blob/8183159cf3def112f6d1fe94815fce70e1bffa12/k_quants.c#L735
fn to_float(xs: &[Self], ys: &mut [f32]) -> Result<()> {
for (block, y) in group_for_dequantization(xs, ys)? {
let d = block.d.to_f32();
let min = block.dmin.to_f32();
let q = &block.qs;
let mut is = 0;
let mut ys_index = 0;
for j in (0..QK_K).step_by(64) {
let q = &q[j / 2..j / 2 + 32];
let (sc, m) = get_scale_min_k4(is, &block.scales);
let d1 = d * sc as f32;
let m1 = min * m as f32;
let (sc, m) = get_scale_min_k4(is + 1, &block.scales);
let d2 = d * sc as f32;
let m2 = min * m as f32;
for q in q {
y[ys_index] = d1 * (q & 0xF) as f32 - m1;
ys_index += 1;
}
for q in q {
y[ys_index] = d2 * (q >> 4) as f32 - m2;
ys_index += 1;
}
is += 2;
}
}
Ok(())
}
}
// https://github.com/ggerganov/llama.cpp/blob/8183159cf3def112f6d1fe94815fce70e1bffa12/k_quants.c#L928
impl GgmlType for BlockQ5K {
const DTYPE: GgmlDType = GgmlDType::Q5K;
const BLCK_SIZE: usize = QK_K;
type VecDotType = BlockQ8K;
#[allow(unreachable_code)]
fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
#[cfg(target_feature = "avx")]
return super::avx::vec_dot_q5k_q8k(n, xs, ys);
#[cfg(target_feature = "neon")]
return super::neon::vec_dot_q5k_q8k(n, xs, ys);
Self::vec_dot_unopt(n, xs, ys)
}
fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
if n % QK_K != 0 {
crate::bail!("vec_dot_q5k_q8k: {n} is not divisible by {QK_K}")
}
const KMASK1: u32 = 0x3f3f3f3f;
const KMASK2: u32 = 0x0f0f0f0f;
const KMASK3: u32 = 0x03030303;
let mut utmp: [u32; 4] = [0; 4];
let mut scales: [u8; 8] = [0; 8];
let mut mins: [u8; 8] = [0; 8];
let mut aux8: [i8; QK_K] = [0; QK_K];
let mut aux16: [i16; 8] = [0; 8];
let mut sums: [f32; 8] = [0.0; 8];
let mut aux32: [i32; 8] = [0; 8];
let mut sumf = 0.0;
for (y, x) in ys.iter().zip(xs.iter()) {
let q5 = &x.qs;
let hm = &x.qh;
let q8 = &y.qs;
aux32.fill(0);
let mut a = &mut aux8[..];
let mut q5 = &q5[..];
let mut m = 1u8;
for _ in 0..QK_K / 64 {
for l in 0..32 {
a[l] = (q5[l] & 0xF) as i8;
a[l] += if hm[l] & m != 0 { 16 } else { 0 };
}
a = &mut a[32..];
m <<= 1;
for l in 0..32 {
a[l] = (q5[l] >> 4) as i8;
a[l] += if hm[l] & m != 0 { 16 } else { 0 };
}
a = &mut a[32..];
m <<= 1;
q5 = &q5[32..];
}
LittleEndian::read_u32_into(&x.scales, &mut utmp[0..3]);
utmp[3] = ((utmp[2] >> 4) & KMASK2) | (((utmp[1] >> 6) & KMASK3) << 4);
let uaux = utmp[1] & KMASK1;
utmp[1] = (utmp[2] & KMASK2) | (((utmp[0] >> 6) & KMASK3) << 4);
utmp[2] = uaux;
utmp[0] &= KMASK1;
// Extract scales and mins.
LittleEndian::write_u32_into(&utmp[0..2], &mut scales);
LittleEndian::write_u32_into(&utmp[2..4], &mut mins);
let mut sumi = 0;
for j in 0..QK_K / 16 {
sumi += y.bsums[j] as i32 * mins[j / 2] as i32;
}
let mut a = &mut aux8[..];
let mut q8 = &q8[..];
for scale in scales {
let scale = scale as i32;
for _ in 0..4 {
for l in 0..8 {
aux16[l] = q8[l] as i16 * a[l] as i16;
}
for l in 0..8 {
aux32[l] += scale * aux16[l] as i32;
}
q8 = &q8[8..];
a = &mut a[8..];
}
}
let d = x.d.to_f32() * y.d;
for l in 0..8 {
sums[l] += d * aux32[l] as f32;
}
let dmin = x.dmin.to_f32() * y.d;
sumf -= dmin * sumi as f32;
}
Ok(sumf + sums.iter().sum::<f32>())
}
// https://github.com/ggerganov/llama.cpp/blob/8183159cf3def112f6d1fe94815fce70e1bffa12/k_quants.c#L793
fn from_float(xs: &[f32], ys: &mut [Self]) -> Result<()> {
for (block, x) in group_for_quantization(xs, ys)? {
let mut mins: [f32; QK_K / 32] = [0.0; QK_K / 32];
let mut scales: [f32; QK_K / 32] = [0.0; QK_K / 32];
for (j, x_scale_slice) in x.chunks_exact(32).enumerate() {
(scales[j], mins[j]) = make_qkx1_quants(31, 5, x_scale_slice);
}
// get max scale and max min and ensure they are >= 0.0
let max_scale = scales.iter().fold(0.0, |max, &val| val.max(max));
let max_min = mins.iter().fold(0.0, |max, &val| val.max(max));
let inv_scale = if max_scale > 0.0 {
63.0 / max_scale
} else {
0.0
};
let inv_min = if max_min > 0.0 { 63.0 / max_min } else { 0.0 };
for j in 0..QK_K / 32 {
let ls = nearest_int(inv_scale * scales[j]).min(63) as u8;
let lm = nearest_int(inv_min * mins[j]).min(63) as u8;
if j < 4 {
block.scales[j] = ls;
block.scales[j + 4] = lm;
} else {
block.scales[j + 4] = (ls & 0xF) | ((lm & 0xF) << 4);
block.scales[j - 4] |= (ls >> 4) << 6;
block.scales[j] |= (lm >> 4) << 6;
}
}
block.d = f16::from_f32(max_scale / 63.0);
block.dmin = f16::from_f32(max_min / 63.0);
let mut l: [u8; QK_K] = [0; QK_K];
for j in 0..QK_K / 32 {
let (sc, m) = get_scale_min_k4(j, &block.scales);
let d = block.d.to_f32() * sc as f32;
if d == 0.0 {
continue;
}
let dm = block.dmin.to_f32() * m as f32;
for ii in 0..32 {
let ll = nearest_int((x[32 * j + ii] + dm) / d);
l[32 * j + ii] = ll.clamp(0, 31) as u8;
}
}
let qh = &mut block.qh;
let ql = &mut block.qs;
qh.fill(0);
let mut m1 = 1;
let mut m2 = 2;
for n in (0..QK_K).step_by(64) {
let offset = (n / 64) * 32;
for j in 0..32 {
let mut l1 = l[n + j];
if l1 > 15 {
l1 -= 16;
qh[j] |= m1;
}
let mut l2 = l[n + j + 32];
if l2 > 15 {
l2 -= 16;
qh[j] |= m2;
}
ql[offset + j] = l1 | (l2 << 4);
}
m1 <<= 2;
m2 <<= 2;
}
}
Ok(())
}
// https://github.com/ggerganov/llama.cpp/blob/8183159cf3def112f6d1fe94815fce70e1bffa12/k_quants.c#L928
fn to_float(xs: &[Self], ys: &mut [f32]) -> Result<()> {
for (block, y) in group_for_dequantization(xs, ys)? {
let d = block.d.to_f32();
let min = block.dmin.to_f32();
let ql = &block.qs;
let qh = &block.qh;
let mut is = 0;
let mut u1 = 1;
let mut u2 = 2;
let mut ys_index = 0;
for j in (0..QK_K).step_by(64) {
let ql = &ql[j / 2..j / 2 + 32];
let (sc, m) = get_scale_min_k4(is, &block.scales);
let d1 = d * sc as f32;
let m1 = min * m as f32;
let (sc, m) = get_scale_min_k4(is + 1, &block.scales);
let d2 = d * sc as f32;
let m2 = min * m as f32;
for (ql, qh) in ql.iter().zip(qh) {
let to_add = if qh & u1 != 0 { 16f32 } else { 0f32 };
y[ys_index] = d1 * ((ql & 0xF) as f32 + to_add) - m1;
ys_index += 1;
}
for (ql, qh) in ql.iter().zip(qh) {
let to_add = if qh & u2 != 0 { 16f32 } else { 0f32 };
y[ys_index] = d2 * ((ql >> 4) as f32 + to_add) - m2;
ys_index += 1;
}
is += 2;
u1 <<= 2;
u2 <<= 2;
}
}
Ok(())
}
}
impl GgmlType for BlockQ6K {
const DTYPE: GgmlDType = GgmlDType::Q6K;
const BLCK_SIZE: usize = QK_K;
type VecDotType = BlockQ8K;
#[allow(unreachable_code)]
fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
#[cfg(target_feature = "avx")]
return super::avx::vec_dot_q6k_q8k(n, xs, ys);
#[cfg(target_feature = "neon")]
return super::neon::vec_dot_q6k_q8k(n, xs, ys);
#[cfg(target_feature = "simd128")]
return super::simd128::vec_dot_q6k_q8k(n, xs, ys);
Self::vec_dot_unopt(n, xs, ys)
}
fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
if n % QK_K != 0 {
crate::bail!("vec_dot_q6k_q8k: {n} is not divisible by {QK_K}")
}
let mut aux8 = [0i8; QK_K];
let mut aux16 = [0i16; 8];
let mut sums = [0f32; 8];
let mut aux32 = [0f32; 8];
for (x, y) in xs.iter().zip(ys.iter()) {
let q4 = &x.ql;
let qh = &x.qh;
let q8 = &y.qs;
aux32.fill(0f32);
for j in (0..QK_K).step_by(128) {
let aux8 = &mut aux8[j..];
let q4 = &q4[j / 2..];
let qh = &qh[j / 4..];
for l in 0..32 {
aux8[l] = (((q4[l] & 0xF) | ((qh[l] & 3) << 4)) as i32 - 32) as i8;
aux8[l + 32] =
(((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) as i32 - 32) as i8;
aux8[l + 64] = (((q4[l] >> 4) | (((qh[l] >> 4) & 3) << 4)) as i32 - 32) as i8;
aux8[l + 96] =
(((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) as i32 - 32) as i8;
}
}
for (j, &scale) in x.scales.iter().enumerate() {
let scale = scale as f32;
let q8 = &q8[16 * j..];
let aux8 = &aux8[16 * j..];
for l in 0..8 {
aux16[l] = q8[l] as i16 * aux8[l] as i16;
}
for l in 0..8 {
aux32[l] += scale * aux16[l] as f32
}
let q8 = &q8[8..];
let aux8 = &aux8[8..];
for l in 0..8 {
aux16[l] = q8[l] as i16 * aux8[l] as i16;
}
for l in 0..8 {
aux32[l] += scale * aux16[l] as f32
}
}
let d = x.d.to_f32() * y.d;
for (sum, &a) in sums.iter_mut().zip(aux32.iter()) {
*sum += a * d;
}
}
Ok(sums.iter().sum())
}
fn from_float(xs: &[f32], ys: &mut [Self]) -> Result<()> {
if xs.len() != ys.len() * Self::BLCK_SIZE {
crate::bail!(
"quantize_row_q6k: size mismatch {} {} {}",
xs.len(),
ys.len(),
Self::BLCK_SIZE
)
}
let mut l = [0i8; QK_K];
let mut scales = [0f32; QK_K / 16];
let mut x = xs.as_ptr();
let l = l.as_mut_ptr();
unsafe {
for y in ys.iter_mut() {
let mut max_scale = 0f32;
let mut max_abs_scale = 0f32;
for (ib, scale_) in scales.iter_mut().enumerate() {
let scale = make_qx_quants(16, 32, x.add(16 * ib), l.add(16 * ib), 1);
*scale_ = scale;
let abs_scale = scale.abs();
if abs_scale > max_abs_scale {
max_abs_scale = abs_scale;
max_scale = scale
}
}
let iscale = -128f32 / max_scale;
y.d = f16::from_f32(1.0 / iscale);
for (y_scale, scale) in y.scales.iter_mut().zip(scales.iter()) {
*y_scale = nearest_int(iscale * scale).min(127) as i8
}
for (j, &y_scale) in y.scales.iter().enumerate() {
let d = y.d.to_f32() * y_scale as f32;
if d == 0. {
continue;
}
for ii in 0..16 {
let ll = nearest_int(*x.add(16 * j + ii) / d).clamp(-32, 31);
*l.add(16 * j + ii) = (ll + 32) as i8
}
}
let mut ql = y.ql.as_mut_ptr();
let mut qh = y.qh.as_mut_ptr();
for j in (0..QK_K).step_by(128) {
for l_idx in 0..32 {
let q1 = *l.add(j + l_idx) & 0xF;
let q2 = *l.add(j + l_idx + 32) & 0xF;
let q3 = *l.add(j + l_idx + 64) & 0xF;
let q4 = *l.add(j + l_idx + 96) & 0xF;
*ql.add(l_idx) = (q1 | (q3 << 4)) as u8;
*ql.add(l_idx + 32) = (q2 | (q4 << 4)) as u8;
*qh.add(l_idx) = ((*l.add(j + l_idx) >> 4)
| ((*l.add(j + l_idx + 32) >> 4) << 2)
| ((*l.add(j + l_idx + 64) >> 4) << 4)
| ((*l.add(j + l_idx + 96) >> 4) << 6))
as u8;
}
ql = ql.add(64);
qh = qh.add(32);
}
x = x.add(QK_K)
}
}
Ok(())
}
// https://github.com/ggerganov/llama.cpp/blob/8183159cf3def112f6d1fe94815fce70e1bffa12/k_quants.c#L1067
fn to_float(xs: &[Self], ys: &mut [f32]) -> Result<()> {
let k = ys.len();
if k % QK_K != 0 {
crate::bail!("dequantize_row_q6k: {k} is not divisible by {QK_K}")
}
for (idx_x, x) in xs.iter().enumerate() {
let d = x.d.to_f32();
let ql = &x.ql;
let qh = &x.qh;
let sc = &x.scales;
for n in (0..QK_K).step_by(128) {
let idx = n / 128;
let ys = &mut ys[idx_x * QK_K + n..];
let sc = &sc[8 * idx..];
let ql = &ql[64 * idx..];
let qh = &qh[32 * idx..];
for l in 0..32 {
let is = l / 16;
let q1 = ((ql[l] & 0xF) | ((qh[l] & 3) << 4)) as i8 - 32;
let q2 = ((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) as i8 - 32;
let q3 = ((ql[l] >> 4) | (((qh[l] >> 4) & 3) << 4)) as i8 - 32;
let q4 = ((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) as i8 - 32;
ys[l] = d * sc[is] as f32 * q1 as f32;
ys[l + 32] = d * sc[is + 2] as f32 * q2 as f32;
ys[l + 64] = d * sc[is + 4] as f32 * q3 as f32;
ys[l + 96] = d * sc[is + 6] as f32 * q4 as f32;
}
}
}
Ok(())
}
}
impl GgmlType for BlockQ8K {
const DTYPE: GgmlDType = GgmlDType::Q8K;
const BLCK_SIZE: usize = QK_K;
type VecDotType = BlockQ8K;
#[allow(unreachable_code)]
fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
#[cfg(target_feature = "avx")]
return super::avx::vec_dot_q8k_q8k(n, xs, ys);
#[cfg(target_feature = "neon")]
return super::neon::vec_dot_q8k_q8k(n, xs, ys);
#[cfg(target_feature = "simd128")]
return super::simd128::vec_dot_q8k_q8k(n, xs, ys);
Self::vec_dot_unopt(n, xs, ys)
}
fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
let qk = QK_K;
if n % QK_K != 0 {
crate::bail!("vec_dot_q8k_q8k: {n} is not divisible by {qk}")
}
// Generic implementation.
let mut sumf = 0f32;
for (xs, ys) in xs.iter().zip(ys.iter()) {
let sum_i = xs
.qs
.iter()
.zip(ys.qs.iter())
.map(|(&x, &y)| x as i32 * y as i32)
.sum::<i32>();
sumf += sum_i as f32 * xs.d * ys.d
}
Ok(sumf)
}
fn from_float(xs: &[f32], ys: &mut [Self]) -> Result<()> {
let k = xs.len();
if k % QK_K != 0 {
crate::bail!("quantize_row_q8k: {k} is not divisible by {QK_K}")
}
for (i, y) in ys.iter_mut().enumerate() {
let mut max = 0f32;
let mut amax = 0f32;
let xs = &xs[i * QK_K..(i + 1) * QK_K];
for &x in xs.iter() {
if amax < x.abs() {
amax = x.abs();
max = x;
}
}
if amax == 0f32 {
y.d = 0f32;
y.qs.fill(0)
} else {
let iscale = -128f32 / max;
for (j, q) in y.qs.iter_mut().enumerate() {
// ggml uses nearest_int with bit magic here, maybe we want the same
// but we would have to test and benchmark it.
let v = (iscale * xs[j]).round();
*q = v.min(127.) as i8
}
for j in 0..QK_K / 16 {
let mut sum = 0i32;
for ii in 0..16 {
sum += y.qs[j * 16 + ii] as i32
}
y.bsums[j] = sum as i16
}
y.d = 1.0 / iscale
}
}
Ok(())
}
fn to_float(xs: &[Self], ys: &mut [f32]) -> Result<()> {
let k = ys.len();
if k % QK_K != 0 {
crate::bail!("dequantize_row_q8k: {k} is not divisible by {QK_K}")
}
for (i, x) in xs.iter().enumerate() {
for (j, &q) in x.qs.iter().enumerate() {
ys[i * QK_K + j] = x.d * q as f32
}
}
Ok(())
}
}
// https://github.com/ggerganov/llama.cpp/blob/b5ffb2849d23afe73647f68eec7b68187af09be6/ggml.c#L10605
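// Layout convention: `lhs` is a dense row-major (m, k) f32 matrix while `rhs_t`
// holds the quantized weights stored transposed, i.e. (n, k) in blocks of
// `T::BLCK_SIZE`. Each lhs row is first quantized to `T::VecDotType` (e.g. Q8K for
// the k-quants) and every output element dst[row][col] is then a `vec_dot` of the
// quantized lhs row with the matching rhs_t row; output rows are filled in
// parallel via rayon.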
pub fn matmul<T: GgmlType>(
mkn: (usize, usize, usize),
lhs: &[f32],
rhs_t: &[T],
dst: &mut [f32],
) -> Result<()> {
let (m, k, n) = mkn;
if m * k != lhs.len() {
crate::bail!("unexpected lhs length {} {mkn:?}", lhs.len());
}
let k_in_lhs_blocks = (k + T::BLCK_SIZE - 1) / T::BLCK_SIZE;
let k_in_rhs_blocks = (k + T::VecDotType::BLCK_SIZE - 1) / T::VecDotType::BLCK_SIZE;
// TODO: Do not make this copy if the DotType is f32.
// TODO: Pre-allocate this.
let mut lhs_b = vec![T::VecDotType::zeros(); m * k_in_lhs_blocks];
for row_idx in 0..m {
let lhs_b = &mut lhs_b[row_idx * k_in_lhs_blocks..(row_idx + 1) * k_in_lhs_blocks];
let lhs = &lhs[row_idx * k..(row_idx + 1) * k];
T::VecDotType::from_float(lhs, lhs_b)?
}
let lhs_b = lhs_b.as_slice();
for row_idx in 0..m {
let lhs_row = &lhs_b[row_idx * k_in_lhs_blocks..(row_idx + 1) * k_in_lhs_blocks];
let dst_row = &mut dst[row_idx * n..(row_idx + 1) * n];
let result: Result<Vec<_>> = dst_row
.into_par_iter()
.enumerate()
.with_min_len(128)
.with_max_len(512)
.map(|(col_idx, dst)| {
let rhs_col = &rhs_t[col_idx * k_in_rhs_blocks..(col_idx + 1) * k_in_rhs_blocks];
T::vec_dot(k, rhs_col, lhs_row).map(|value| *dst = value)
})
.collect();
result?;
}
Ok(())
}
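// Illustrative sketch: exercises `matmul` through the plain f32 implementation
// below (BLCK_SIZE = 1), where `rhs_t` is just the transposed weight matrix and
// the computation reduces to an ordinary f32 matmul.
#[cfg(test)]
mod matmul_f32_sketch {
    #[test]
    fn identity_rhs() -> crate::Result<()> {
        // lhs is (2, 2) row-major, rhs_t is the (2, 2) identity stored as (n, k).
        let lhs = [1f32, 2., 3., 4.];
        let rhs_t = [1f32, 0., 0., 1.];
        let mut dst = vec![0f32; 4];
        super::matmul::<f32>((2, 2, 2), &lhs, &rhs_t, &mut dst)?;
        assert_eq!(dst, vec![1., 2., 3., 4.]);
        Ok(())
    }
}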
impl GgmlType for f32 {
const DTYPE: GgmlDType = GgmlDType::F32;
const BLCK_SIZE: usize = 1;
type VecDotType = f32;
fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
Self::vec_dot_unopt(n, xs, ys)
}
fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
if xs.len() < n {
crate::bail!("size mismatch {} < {n}", xs.len())
}
if ys.len() < n {
crate::bail!("size mismatch {} < {n}", ys.len())
}
let mut res = 0f32;
unsafe { crate::cpu::vec_dot_f32(xs.as_ptr(), ys.as_ptr(), &mut res, n) };
Ok(res)
}
fn from_float(xs: &[f32], ys: &mut [Self]) -> Result<()> {
if xs.len() != ys.len() {
crate::bail!("size mismatch {} {}", xs.len(), ys.len());
}
ys.copy_from_slice(xs);
Ok(())
}
fn to_float(xs: &[Self], ys: &mut [f32]) -> Result<()> {
if xs.len() != ys.len() {
crate::bail!("size mismatch {} {}", xs.len(), ys.len());
}
ys.copy_from_slice(xs);
Ok(())
}
}
impl GgmlType for f16 {
const DTYPE: GgmlDType = GgmlDType::F16;
const BLCK_SIZE: usize = 1;
type VecDotType = f16;
fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
Self::vec_dot_unopt(n, xs, ys)
}
fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
if xs.len() < n {
crate::bail!("size mismatch {} < {n}", xs.len())
}
if ys.len() < n {
crate::bail!("size mismatch {} < {n}", ys.len())
}
let mut res = 0f32;
unsafe { crate::cpu::vec_dot_f16(xs.as_ptr(), ys.as_ptr(), &mut res, n) };
Ok(res)
}
fn from_float(xs: &[f32], ys: &mut [Self]) -> Result<()> {
if xs.len() != ys.len() {
crate::bail!("size mismatch {} {}", xs.len(), ys.len());
}
// TODO: vectorize
for (x, y) in xs.iter().zip(ys.iter_mut()) {
*y = f16::from_f32(*x)
}
Ok(())
}
fn to_float(xs: &[Self], ys: &mut [f32]) -> Result<()> {
if xs.len() != ys.len() {
crate::bail!("size mismatch {} {}", xs.len(), ys.len());
}
// TODO: vectorize
for (x, y) in xs.iter().zip(ys.iter_mut()) {
*y = x.to_f32()
}
Ok(())
}
}
use super::{GgmlDType, QStorage};
use crate::backend::BackendStorage;
use crate::{DType, MetalDevice, MetalStorage, Result, Shape};
use metal::Buffer;
use std::sync::Arc;
pub struct QMetalStorage {
dtype: GgmlDType,
device: MetalDevice,
buffer: Arc<Buffer>,
}
impl QMetalStorage {
pub fn zeros(device: &MetalDevice, elem_count: usize, dtype: GgmlDType) -> Result<Self> {
let size = elem_count * dtype.type_size() / dtype.block_size();
let buffer = device.allocate_zeros(size)?;
Ok(Self {
buffer,
device: device.clone(),
dtype,
})
}
pub fn dtype(&self) -> GgmlDType {
self.dtype
}
pub fn device(&self) -> &MetalDevice {
&self.device
}
pub fn buffer(&self) -> &Buffer {
&self.buffer
}
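    // Dequantization copies the quantized buffer back to the host through a blit
    // into a managed buffer, runs the CPU `to_float` kernels for the matching block
    // type, and uploads the resulting f32 values into a fresh Metal buffer.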
pub fn dequantize(&self, elem_count: usize) -> Result<MetalStorage> {
use crate::quantized::k_quants::GgmlType;
let buffer = self.device.new_buffer_managed(self.buffer.length())?;
let command_buffer = self.device.command_buffer()?;
command_buffer.set_label("to_cpu");
let blit = command_buffer.new_blit_command_encoder();
blit.set_label("blit_to_cpu");
blit.copy_from_buffer(&self.buffer, 0, &buffer, 0, self.buffer.length());
blit.end_encoding();
self.device.wait_until_completed()?;
let mut out = vec![0.0; elem_count];
let block_len = elem_count / self.dtype.block_size();
match self.dtype {
GgmlDType::F32 => {
let vec: Vec<f32> = read_to_vec(&buffer, block_len);
f32::to_float(&vec, &mut out)?;
}
GgmlDType::F16 => {
let vec: Vec<half::f16> = read_to_vec(&buffer, block_len);
half::f16::to_float(&vec, &mut out)?;
}
GgmlDType::Q4_0 => {
let vec: Vec<crate::quantized::BlockQ4_0> = read_to_vec(&buffer, block_len);
crate::quantized::BlockQ4_0::to_float(&vec, &mut out)?;
}
GgmlDType::Q4_1 => {
let vec: Vec<crate::quantized::BlockQ4_1> = read_to_vec(&buffer, block_len);
crate::quantized::BlockQ4_1::to_float(&vec, &mut out)?;
}
GgmlDType::Q5_0 => {
let vec: Vec<crate::quantized::BlockQ5_0> = read_to_vec(&buffer, block_len);
crate::quantized::BlockQ5_0::to_float(&vec, &mut out)?;
}
GgmlDType::Q5_1 => {
let vec: Vec<crate::quantized::BlockQ5_1> = read_to_vec(&buffer, block_len);
crate::quantized::BlockQ5_1::to_float(&vec, &mut out)?;
}
GgmlDType::Q8_0 => {
let vec: Vec<crate::quantized::BlockQ8_0> = read_to_vec(&buffer, block_len);
crate::quantized::BlockQ8_0::to_float(&vec, &mut out)?;
}
GgmlDType::Q8_1 => {
let vec: Vec<crate::quantized::BlockQ8_1> = read_to_vec(&buffer, block_len);
crate::quantized::BlockQ8_1::to_float(&vec, &mut out)?;
}
GgmlDType::Q2K => {
let vec: Vec<crate::quantized::BlockQ2K> = read_to_vec(&buffer, block_len);
crate::quantized::BlockQ2K::to_float(&vec, &mut out)?;
}
GgmlDType::Q3K => {
let vec: Vec<crate::quantized::BlockQ3K> = read_to_vec(&buffer, block_len);
crate::quantized::BlockQ3K::to_float(&vec, &mut out)?;
}
GgmlDType::Q4K => {
let vec: Vec<crate::quantized::BlockQ4K> = read_to_vec(&buffer, block_len);
crate::quantized::BlockQ4K::to_float(&vec, &mut out)?;
}
GgmlDType::Q5K => {
let vec: Vec<crate::quantized::BlockQ5K> = read_to_vec(&buffer, block_len);
crate::quantized::BlockQ5K::to_float(&vec, &mut out)?;
}
GgmlDType::Q6K => {
let vec: Vec<crate::quantized::BlockQ6K> = read_to_vec(&buffer, block_len);
crate::quantized::BlockQ6K::to_float(&vec, &mut out)?;
}
GgmlDType::Q8K => {
let vec: Vec<crate::quantized::BlockQ8K> = read_to_vec(&buffer, block_len);
crate::quantized::BlockQ8K::to_float(&vec, &mut out)?;
}
}
let buffer = self.device.new_buffer_with_data(&out)?;
Ok(MetalStorage::new(
buffer,
self.device.clone(),
elem_count,
DType::F32,
))
}
pub fn quantize(&mut self, src: &MetalStorage) -> Result<()> {
// Quantization only happens on CPU for now.
let src = src.to_cpu::<f32>()?;
let elem_count = src.len();
let src = crate::Storage::Cpu(crate::CpuStorage::F32(src));
let mut qcpu_storage = crate::Device::Cpu.qzeros(elem_count, self.dtype)?;
qcpu_storage.quantize(&src)?;
let buffer = self.device.new_buffer_with_data(&qcpu_storage.data()?)?;
self.buffer = buffer;
Ok(())
}
pub fn storage_size_in_bytes(&self) -> usize {
self.buffer.length() as usize
}
pub fn fwd(
&self,
self_shape: &Shape,
storage: &MetalStorage,
layout: &crate::Layout,
) -> Result<(MetalStorage, Shape)> {
use crate::MetalError;
if !layout.is_contiguous() {
crate::bail!("input tensor is not contiguous {layout:?}")
}
let src_shape = layout.shape();
// self is transposed so n is first then k.
if src_shape.rank() < 2 {
crate::bail!("input tensor has only one dimension {layout:?}")
}
let (n, k) = self_shape.dims2()?;
let mut dst_shape = src_shape.dims().to_vec();
// We always use a single batch dimension and stack all the tensors in the batch on the
// second dimension as the implementation in candle-metal-kernels doesn't handle batch
// properly.
let (b, m) = match dst_shape.len() {
3 => (1, dst_shape[0] * dst_shape[1]),
2 => (1, dst_shape[0]),
n => crate::bail!("Invalid rank {n} for quantized matmul metal"),
};
let last_k = dst_shape.pop().unwrap();
if last_k != k {
crate::bail!("input tensor {layout:?} incompatible with {:?}", self_shape)
}
dst_shape.push(n);
let dst_shape = Shape::from(dst_shape);
let device = storage.device().clone();
let dst = device.new_buffer(dst_shape.elem_count(), DType::F32, "qmatmul")?;
let command_buffer = device.command_buffer()?;
candle_metal_kernels::call_quantized_matmul_t(
device.device(),
&command_buffer,
device.kernels(),
self.dtype.into(),
(b, m, n, k),
storage.buffer(),
layout.start_offset() * storage.dtype().size_in_bytes(),
&self.buffer,
&dst,
)
.map_err(MetalError::from)?;
let dst_storage = crate::MetalStorage::new(dst, device, dst_shape.elem_count(), DType::F32);
Ok((dst_storage, dst_shape))
}
}
pub fn load_quantized<T: super::GgmlType + Send + Sync + 'static>(
device: &MetalDevice,
data: &[T],
) -> Result<QStorage> {
let buffer = device.new_buffer_with_data(data)?;
let device = device.clone();
Ok(QStorage::Metal(QMetalStorage {
dtype: T::DTYPE,
device,
buffer,
}))
}
fn read_to_vec<T: Clone>(buffer: &Buffer, n: usize) -> Vec<T> {
let ptr = buffer.contents() as *const T;
assert!(!ptr.is_null());
let slice = unsafe { std::slice::from_raw_parts(ptr, n) };
slice.to_vec()
}
impl From<GgmlDType> for candle_metal_kernels::GgmlDType {
fn from(value: GgmlDType) -> Self {
match value {
GgmlDType::Q4_0 => candle_metal_kernels::GgmlDType::Q4_0,
GgmlDType::Q4_1 => candle_metal_kernels::GgmlDType::Q4_1,
GgmlDType::Q5_0 => candle_metal_kernels::GgmlDType::Q5_0,
GgmlDType::Q5_1 => candle_metal_kernels::GgmlDType::Q5_1,
GgmlDType::Q8_0 => candle_metal_kernels::GgmlDType::Q8_0,
GgmlDType::Q8_1 => candle_metal_kernels::GgmlDType::Q8_1,
GgmlDType::Q2K => candle_metal_kernels::GgmlDType::Q2K,
GgmlDType::Q3K => candle_metal_kernels::GgmlDType::Q3K,
GgmlDType::Q4K => candle_metal_kernels::GgmlDType::Q4K,
GgmlDType::Q5K => candle_metal_kernels::GgmlDType::Q5K,
GgmlDType::Q6K => candle_metal_kernels::GgmlDType::Q6K,
GgmlDType::Q8K => candle_metal_kernels::GgmlDType::Q8K,
GgmlDType::F16 => candle_metal_kernels::GgmlDType::F16,
GgmlDType::F32 => candle_metal_kernels::GgmlDType::F32,
}
}
}
use crate::{CpuStorage, Device, Result, Shape, Storage, Tensor};
use k_quants::*;
use std::borrow::Cow;
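// The SIMD backends (avx / neon / simd128) are selected at compile time from the
// target features, while the cuda and metal backends are gated on cargo features
// and fall back to the dummy implementations when disabled.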
#[cfg(target_feature = "avx")]
pub mod avx;
mod dummy_cuda;
mod dummy_metal;
pub mod ggml_file;
pub mod gguf_file;
pub mod k_quants;
#[cfg(feature = "metal")]
pub mod metal;
#[cfg(not(feature = "metal"))]
mod metal {
pub use super::dummy_metal::*;
}
#[cfg(feature = "cuda")]
pub mod cuda;
#[cfg(not(feature = "cuda"))]
mod cuda {
pub use super::dummy_cuda::*;
}
#[cfg(target_feature = "neon")]
pub mod neon;
#[cfg(target_feature = "simd128")]
pub mod simd128;
pub mod utils;
use half::f16;
pub use k_quants::GgmlType;
pub struct QTensor {
storage: QStorage,
shape: Shape,
}
impl Device {
fn qzeros(&self, elem_count: usize, dtype: GgmlDType) -> Result<QStorage> {
match self {
Device::Cpu => {
let storage = dtype.cpu_zeros(elem_count);
Ok(QStorage::Cpu(storage))
}
Device::Metal(metal) => {
let storage = metal::QMetalStorage::zeros(metal, elem_count, dtype)?;
Ok(QStorage::Metal(storage))
}
Device::Cuda(cuda) => {
let storage = cuda::QCudaStorage::zeros(cuda, elem_count, dtype)?;
Ok(QStorage::Cuda(storage))
}
}
}
}
pub enum QStorage {
Cpu(Box<dyn QuantizedType>),
Metal(metal::QMetalStorage),
Cuda(cuda::QCudaStorage),
}
impl QStorage {
fn block_size(&self) -> usize {
match self {
QStorage::Cpu(storage) => storage.block_size(),
QStorage::Metal(storage) => storage.dtype().block_size(),
QStorage::Cuda(storage) => storage.dtype().block_size(),
}
}
fn dtype(&self) -> GgmlDType {
match self {
QStorage::Cpu(storage) => storage.dtype(),
QStorage::Metal(storage) => storage.dtype(),
QStorage::Cuda(storage) => storage.dtype(),
}
}
fn device(&self) -> Device {
match self {
QStorage::Cpu(_storage) => Device::Cpu,
QStorage::Metal(storage) => Device::Metal(storage.device().clone()),
QStorage::Cuda(storage) => Device::Cuda(storage.device().clone()),
}
}
fn size_in_bytes(&self) -> usize {
match self {
QStorage::Cpu(storage) => storage.storage_size_in_bytes(),
QStorage::Metal(storage) => storage.storage_size_in_bytes(),
QStorage::Cuda(storage) => storage.storage_size_in_bytes(),
}
}
fn quantize(&mut self, src: &Storage) -> Result<()> {
match (self, src) {
(QStorage::Cpu(storage), Storage::Cpu(src)) => {
storage.from_float(src.as_slice::<f32>()?)?;
}
(QStorage::Metal(storage), Storage::Metal(src)) => storage.quantize(src)?,
(QStorage::Cuda(storage), Storage::Cuda(src)) => storage.quantize(src)?,
_ => crate::bail!("Invalid dequantize storage locations do not match"),
}
Ok(())
}
fn dequantize(&self, elem_count: usize) -> Result<Storage> {
match self {
QStorage::Cpu(storage) => Ok(Storage::Cpu(storage.dequantize(elem_count)?)),
QStorage::Metal(storage) => Ok(Storage::Metal(storage.dequantize(elem_count)?)),
QStorage::Cuda(storage) => Ok(Storage::Cuda(storage.dequantize(elem_count)?)),
}
}
fn data(&self) -> Result<Cow<[u8]>> {
match self {
QStorage::Cpu(storage) => {
let data_ptr = storage.as_ptr();
let size_in_bytes = storage.storage_size_in_bytes();
let data = unsafe { std::slice::from_raw_parts(data_ptr, size_in_bytes) };
Ok(Cow::from(data))
}
QStorage::Metal(_) | QStorage::Cuda(_) => {
crate::bail!("not implemented");
}
}
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum GgmlDType {
F32,
F16,
Q4_0,
Q4_1,
Q5_0,
Q5_1,
Q8_0,
Q8_1,
Q2K,
Q3K,
Q4K,
Q5K,
Q6K,
Q8K,
}
impl GgmlDType {
pub(crate) fn from_u32(u: u32) -> Result<Self> {
let dtype = match u {
0 => Self::F32,
1 => Self::F16,
2 => Self::Q4_0,
3 => Self::Q4_1,
6 => Self::Q5_0,
7 => Self::Q5_1,
8 => Self::Q8_0,
9 => Self::Q8_1,
10 => Self::Q2K,
11 => Self::Q3K,
12 => Self::Q4K,
13 => Self::Q5K,
14 => Self::Q6K,
15 => Self::Q8K,
_ => crate::bail!("unknown dtype for tensor {u}"),
};
Ok(dtype)
}
pub(crate) fn to_u32(self) -> u32 {
match self {
Self::F32 => 0,
Self::F16 => 1,
Self::Q4_0 => 2,
Self::Q4_1 => 3,
Self::Q5_0 => 6,
Self::Q5_1 => 7,
Self::Q8_0 => 8,
Self::Q8_1 => 9,
Self::Q2K => 10,
Self::Q3K => 11,
Self::Q4K => 12,
Self::Q5K => 13,
Self::Q6K => 14,
Self::Q8K => 15,
}
}
    /// Zero-initialized CPU storage covering `elem_count` elements, grouped into blocks of this dtype.
pub fn cpu_zeros(&self, elem_count: usize) -> Box<dyn QuantizedType> {
match self {
Self::F32 => Box::new(vec![f32::zeros(); elem_count]),
Self::F16 => Box::new(vec![f16::zeros(); elem_count]),
Self::Q4_0 => Box::new(vec![BlockQ4_0::zeros(); elem_count / BlockQ4_0::BLCK_SIZE]),
Self::Q4_1 => Box::new(vec![BlockQ4_1::zeros(); elem_count / BlockQ4_1::BLCK_SIZE]),
Self::Q5_0 => Box::new(vec![BlockQ5_0::zeros(); elem_count / BlockQ5_0::BLCK_SIZE]),
Self::Q5_1 => Box::new(vec![BlockQ5_1::zeros(); elem_count / BlockQ5_1::BLCK_SIZE]),
Self::Q8_0 => Box::new(vec![BlockQ8_0::zeros(); elem_count / BlockQ8_0::BLCK_SIZE]),
Self::Q8_1 => Box::new(vec![BlockQ8_1::zeros(); elem_count / BlockQ8_1::BLCK_SIZE]),
Self::Q2K => Box::new(vec![BlockQ2K::zeros(); elem_count / BlockQ2K::BLCK_SIZE]),
Self::Q3K => Box::new(vec![BlockQ3K::zeros(); elem_count / BlockQ3K::BLCK_SIZE]),
Self::Q4K => Box::new(vec![BlockQ4K::zeros(); elem_count / BlockQ4K::BLCK_SIZE]),
Self::Q5K => Box::new(vec![BlockQ5K::zeros(); elem_count / BlockQ5K::BLCK_SIZE]),
Self::Q6K => Box::new(vec![BlockQ6K::zeros(); elem_count / BlockQ6K::BLCK_SIZE]),
Self::Q8K => Box::new(vec![BlockQ8K::zeros(); elem_count / BlockQ8K::BLCK_SIZE]),
}
}
    /// The size in bytes of a single block of this dtype.
pub fn type_size(&self) -> usize {
use k_quants::*;
match self {
Self::F32 => 4,
Self::F16 => 2,
Self::Q4_0 => std::mem::size_of::<BlockQ4_0>(),
Self::Q4_1 => std::mem::size_of::<BlockQ4_1>(),
Self::Q5_0 => std::mem::size_of::<BlockQ5_0>(),
Self::Q5_1 => std::mem::size_of::<BlockQ5_1>(),
// https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/ggml.c#L932
Self::Q8_0 => std::mem::size_of::<BlockQ8_0>(),
Self::Q8_1 => std::mem::size_of::<BlockQ8_1>(),
Self::Q2K => std::mem::size_of::<BlockQ2K>(),
Self::Q3K => std::mem::size_of::<BlockQ3K>(),
Self::Q4K => std::mem::size_of::<BlockQ4K>(),
Self::Q5K => std::mem::size_of::<BlockQ5K>(),
Self::Q6K => std::mem::size_of::<BlockQ6K>(),
Self::Q8K => std::mem::size_of::<BlockQ8K>(),
}
}
/// The block size, i.e. the number of elements stored in each block.
pub fn block_size(&self) -> usize {
match self {
Self::F32 => 1,
Self::F16 => 1,
Self::Q4_0 => k_quants::QK4_0,
Self::Q4_1 => k_quants::QK4_1,
Self::Q5_0 => k_quants::QK5_0,
Self::Q5_1 => k_quants::QK5_1,
Self::Q8_0 => k_quants::QK8_0,
Self::Q8_1 => k_quants::QK8_1,
Self::Q2K | Self::Q3K | Self::Q4K | Self::Q5K | Self::Q6K | Self::Q8K => k_quants::QK_K,
}
}
}
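// The storage footprint of a quantized tensor follows from the two methods above:
// elem_count / block_size() blocks, each serialized as one type_size()-byte
// struct, i.e. elem_count * type_size() / block_size() bytes in total.
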
// A version of GgmlType without `vec_dot` so that it can be dyn boxed.
pub trait QuantizedType: Send + Sync {
fn dtype(&self) -> GgmlDType;
fn matmul_t(&self, mkn: (usize, usize, usize), lhs: &[f32], dst: &mut [f32]) -> Result<()>;
fn dequantize(&self, elem_count: usize) -> Result<CpuStorage>;
fn storage_size_in_bytes(&self) -> usize;
fn as_ptr(&self) -> *const u8;
fn block_size(&self) -> usize;
#[allow(clippy::wrong_self_convention)]
fn from_float(&mut self, xs: &[f32]) -> Result<()>;
fn size(&self) -> usize;
}
impl<T: k_quants::GgmlType + Send + Sync> QuantizedType for Vec<T> {
fn matmul_t(&self, mkn: (usize, usize, usize), lhs: &[f32], dst: &mut [f32]) -> Result<()> {
k_quants::matmul(mkn, lhs, self.as_slice(), dst)
}
fn size(&self) -> usize {
self.len() * core::mem::size_of::<T>()
}
fn from_float(&mut self, xs: &[f32]) -> Result<()> {
T::from_float(xs, self)
}
fn dtype(&self) -> GgmlDType {
T::DTYPE
}
fn block_size(&self) -> usize {
T::BLCK_SIZE
}
fn dequantize(&self, elem_count: usize) -> Result<CpuStorage> {
let mut ys = vec![0.0f32; elem_count];
T::to_float(self.as_slice(), &mut ys)?;
Ok(CpuStorage::F32(ys))
}
fn storage_size_in_bytes(&self) -> usize {
self.len() * std::mem::size_of::<T>()
}
fn as_ptr(&self) -> *const u8 {
self.as_ptr() as *const u8
}
}
impl std::fmt::Debug for QTensor {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "QTensor[{:?}; {:?}]", self.shape, self.dtype())
}
}
fn check_shape(shape: &Shape, block_size: usize) -> Result<()> {
let dims = shape.dims();
if dims.is_empty() {
crate::bail!("scalar tensor cannot be quantized {shape:?}")
}
if dims[dims.len() - 1] % block_size != 0 {
crate::bail!(
"quantized tensor must have their last dim divisible by block size {shape:?} {}",
block_size
)
}
Ok(())
}
impl QTensor {
pub fn new<S: Into<Shape>>(storage: QStorage, shape: S) -> Result<Self> {
let shape = shape.into();
check_shape(&shape, storage.block_size())?;
Ok(Self { storage, shape })
}
pub fn quantize(src: &Tensor, dtype: GgmlDType) -> Result<Self> {
let shape = src.shape();
let block_size = dtype.block_size();
check_shape(shape, block_size)?;
let src = src.to_dtype(crate::DType::F32)?.flatten_all()?;
let elem_count = shape.elem_count();
if elem_count % block_size != 0 {
crate::bail!(
"tensor size ({shape:?}) is not divisible by block size {}",
block_size
)
}
let mut storage = src.device().qzeros(elem_count, dtype)?;
storage.quantize(&src.storage())?;
Ok(Self {
storage,
shape: shape.clone(),
})
}
pub fn dtype(&self) -> GgmlDType {
self.storage.dtype()
}
pub fn device(&self) -> Device {
self.storage.device()
}
pub fn rank(&self) -> usize {
self.shape.rank()
}
pub fn shape(&self) -> &Shape {
&self.shape
}
pub fn dequantize(&self, device: &Device) -> Result<Tensor> {
let storage = self.storage.dequantize(self.shape.elem_count())?;
let none = crate::op::BackpropOp::none();
let is_variable = false;
crate::tensor::from_storage(storage, self.shape.clone(), none, is_variable)
.to_device(device)
}
pub fn storage_size_in_bytes(&self) -> usize {
self.storage.size_in_bytes()
}
pub fn data(&self) -> Result<Cow<'_, [u8]>> {
self.storage.data()
}
}
#[derive(Clone, Debug)]
pub enum QMatMul {
QTensor(std::sync::Arc<QTensor>),
Tensor(Tensor),
}
thread_local! {
static DEQUANTIZE_ALL: bool = {
match std::env::var("CANDLE_DEQUANTIZE_ALL") {
Ok(s) => {
!s.is_empty() && s != "0"
},
Err(_) => false,
}
}
}
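// Setting the CANDLE_DEQUANTIZE_ALL environment variable to any non-empty value
// other than "0" makes `QMatMul::from_arc` below eagerly dequantize every weight
// into an f32 tensor instead of keeping the quantized representation.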
impl QMatMul {
pub fn from_arc(qtensor: std::sync::Arc<QTensor>) -> Result<Self> {
let dequantize = match qtensor.dtype() {
GgmlDType::F32 | GgmlDType::F16 => true,
_ => DEQUANTIZE_ALL.with(|b| *b),
};
let t = if dequantize {
let tensor = qtensor.dequantize(&qtensor.device())?;
Self::Tensor(tensor)
} else {
Self::QTensor(qtensor)
};
Ok(t)
}
pub fn from_qtensor(qtensor: QTensor) -> Result<Self> {
Self::from_arc(std::sync::Arc::new(qtensor))
}
}
impl crate::CustomOp1 for QTensor {
fn name(&self) -> &'static str {
"qmatmul"
}
fn cpu_fwd(
&self,
storage: &crate::CpuStorage,
layout: &crate::Layout,
) -> Result<(crate::CpuStorage, Shape)> {
if !layout.is_contiguous() {
crate::bail!("input tensor is not contiguous {layout:?}")
}
let src_shape = layout.shape();
// self is transposed so n is first then k.
let (n, k) = self.shape.dims2()?;
if src_shape.rank() < 2 {
crate::bail!("input tensor has only one dimension {layout:?}")
}
let mut dst_shape = src_shape.dims().to_vec();
let last_k = dst_shape.pop().unwrap();
if last_k != k {
crate::bail!("input tensor {layout:?} incompatible with {:?}", self.shape)
}
dst_shape.push(n);
let dst_shape = Shape::from(dst_shape);
#[allow(clippy::infallible_destructuring_match)]
let self_storage = match &self.storage {
QStorage::Cpu(storage) => storage,
QStorage::Metal(_) | QStorage::Cuda(_) => crate::bail!("Invalid storage"),
};
let slice = storage.as_slice::<f32>()?;
let slice = &slice[layout.start_offset()..layout.start_offset() + src_shape.elem_count()];
let mut dst_storage = vec![0f32; dst_shape.elem_count()];
self_storage.matmul_t((dst_shape.elem_count() / n, k, n), slice, &mut dst_storage)?;
Ok((crate::CpuStorage::F32(dst_storage), dst_shape))
}
fn metal_fwd(
&self,
storage: &crate::MetalStorage,
layout: &crate::Layout,
) -> Result<(crate::MetalStorage, Shape)> {
let self_storage = match &self.storage {
QStorage::Metal(metal) => metal,
_ => unreachable!("Cannot call metal matmul on non metal QTensor"),
};
self_storage.fwd(&self.shape, storage, layout)
}
fn cuda_fwd(
&self,
storage: &crate::CudaStorage,
layout: &crate::Layout,
) -> Result<(crate::CudaStorage, Shape)> {
let self_storage = match &self.storage {
QStorage::Cuda(cuda) => cuda,
_ => unreachable!("Cannot call cuda matmul on non cuda QTensor"),
};
self_storage.fwd(&self.shape, storage, layout)
}
}
impl crate::Module for QMatMul {
fn forward(&self, xs: &Tensor) -> Result<Tensor> {
match self {
Self::QTensor(t) => xs.apply_op1_no_bwd(t.as_ref()),
Self::Tensor(w) => {
let w = match *xs.dims() {
[b1, b2, _, _] => w.broadcast_left((b1, b2))?.t()?,
[bsize, _, _] => w.broadcast_left(bsize)?.t()?,
_ => w.t()?,
};
xs.matmul(&w)
}
}
}
}
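// Illustrative sketch (using only the public APIs defined above): quantize an f32
// weight matrix, wrap it in a `QMatMul`, and apply it to an input whose last
// dimension matches the weight's `k` dimension.
#[cfg(test)]
mod qmatmul_sketch {
    use super::*;
    use crate::{DType, Device, Module, Tensor};

    #[test]
    fn quantize_and_matmul() -> crate::Result<()> {
        let dev = Device::Cpu;
        let k = GgmlDType::Q4K.block_size();
        // Weights are stored transposed: (n, k) with n = 8.
        let ws = Tensor::zeros((8, k), DType::F32, &dev)?;
        let xs = Tensor::zeros((4, k), DType::F32, &dev)?;
        let qws = QTensor::quantize(&ws, GgmlDType::Q4K)?;
        let mm = QMatMul::from_qtensor(qws)?;
        let ys = mm.forward(&xs)?;
        assert_eq!(ys.dims(), &[4, 8]);
        Ok(())
    }
}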
use super::k_quants::{
BlockQ2K, BlockQ3K, BlockQ4K, BlockQ4_0, BlockQ5K, BlockQ6K, BlockQ8K, BlockQ8_0, QK8_0, QK_K,
};
use crate::Result;
use byteorder::{ByteOrder, LittleEndian};
#[allow(unused_imports)]
#[cfg(target_arch = "arm")]
use core::arch::arm::*;
#[allow(unused_imports)]
#[cfg(target_arch = "aarch64")]
use core::arch::aarch64::*;
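// Portable stand-in for the Arm `sdot` dot-product instruction: widen the signed
// 8-bit lanes with `vmull_s8`, then pairwise-add the 16-bit products down to four
// 32-bit lanes.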
#[inline(always)]
unsafe fn vdotq_s32(a: int8x16_t, b: int8x16_t) -> int32x4_t {
// TODO: dotprod
let p0 = vmull_s8(vget_low_s8(a), vget_low_s8(b));
let p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1))
}
#[inline(always)]
pub(crate) fn vec_dot_q4_0_q8_0(n: usize, xs: &[BlockQ4_0], ys: &[BlockQ8_0]) -> Result<f32> {
let qk = QK8_0;
let nb = n / qk;
if n % QK8_0 != 0 {
crate::bail!("vec_dot_q4_0_q8_0: {n} is not divisible by {qk}")
}
unsafe {
let mut sumv0 = vdupq_n_f32(0.0f32);
for i in 0..nb {
let x0 = &xs[i];
let y0 = &ys[i];
let m4b = vdupq_n_u8(0x0F);
let s8b = vdupq_n_s8(0x8);
let v0_0 = vld1q_u8(x0.qs.as_ptr());
// 4-bit -> 8-bit
let v0_0l = vreinterpretq_s8_u8(vandq_u8(v0_0, m4b));
let v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
// sub 8
let v0_0ls = vsubq_s8(v0_0l, s8b);
let v0_0hs = vsubq_s8(v0_0h, s8b);
// load y
let v1_0l = vld1q_s8(y0.qs.as_ptr());
let v1_0h = vld1q_s8(y0.qs.as_ptr().add(16));
let pl0 = vdotq_s32(v0_0ls, v1_0l);
let ph0 = vdotq_s32(v0_0hs, v1_0h);
sumv0 = vmlaq_n_f32(
sumv0,
vcvtq_f32_s32(vaddq_s32(pl0, ph0)),
x0.d.to_f32() * y0.d.to_f32(),
);
}
Ok(vaddvq_f32(sumv0))
}
}
#[inline(always)]
pub(crate) fn vec_dot_q8_0_q8_0(n: usize, xs: &[BlockQ8_0], ys: &[BlockQ8_0]) -> Result<f32> {
let qk = QK8_0;
if n % QK8_0 != 0 {
crate::bail!("vec_dot_q8_0_q8_0: {n} is not divisible by {qk}")
}
let nb = n / QK8_0;
unsafe {
let mut sumv0 = vdupq_n_f32(0.0f32);
for i in 0..nb {
let x0 = &xs[i];
let y0 = &ys[i];
let x0_0 = vld1q_s8(x0.qs.as_ptr());
let x0_1 = vld1q_s8(x0.qs.as_ptr().add(16));
// load y
let y0_0 = vld1q_s8(y0.qs.as_ptr());
let y0_1 = vld1q_s8(y0.qs.as_ptr().add(16));
let p0 = vdotq_s32(x0_0, y0_0);
let p1 = vdotq_s32(x0_1, y0_1);
sumv0 = vmlaq_n_f32(
sumv0,
vcvtq_f32_s32(vaddq_s32(p0, p1)),
x0.d.to_f32() * y0.d.to_f32(),
);
}
Ok(vaddvq_f32(sumv0))
}
}
#[inline(always)]
pub(crate) fn vec_dot_q8k_q8k(n: usize, xs: &[BlockQ8K], ys: &[BlockQ8K]) -> Result<f32> {
let qk = QK_K;
if n % QK_K != 0 {
crate::bail!("vec_dot_q8k_q8k: {n} is not divisible by {qk}")
}
let mut sumf = 0f32;
for (xs, ys) in xs.iter().zip(ys.iter()) {
unsafe {
let mut sum_i = vdupq_n_s32(0);
let scale = xs.d * ys.d;
let xs = xs.qs.as_ptr();
let ys = ys.qs.as_ptr();
for i in (0..QK_K).step_by(16) {
let xs = vld1q_s8(xs.add(i));
let ys = vld1q_s8(ys.add(i));
let xy = vdotq_s32(xs, ys);
sum_i = vaddq_s32(sum_i, xy)
}
sumf += vaddvq_s32(sum_i) as f32 * scale
}
}
Ok(sumf)
}
#[inline(always)]
pub(crate) fn vec_dot_q6k_q8k(n: usize, xs: &[BlockQ6K], ys: &[BlockQ8K]) -> Result<f32> {
if n % QK_K != 0 {
crate::bail!("vec_dot_q6k_q8k: {n} is not divisible by {QK_K}")
}
let mut sum = 0f32;
unsafe {
let m4b = vdupq_n_u8(0xF);
let mone = vdupq_n_u8(3);
for (x, y) in xs.iter().zip(ys.iter()) {
let d_all = x.d.to_f32();
let mut q6 = x.ql.as_ptr();
let mut qh = x.qh.as_ptr();
let mut q8 = y.qs.as_ptr();
let mut scale = x.scales.as_ptr();
let q8sums = vld1q_s16_x2(y.bsums.as_ptr());
let scales = vld1q_s8(scale);
let q6scales = int16x8x2_t(
vmovl_s8(vget_low_s8(scales)),
vmovl_s8(vget_high_s8(scales)),
);
let prod = vaddq_s32(
vaddq_s32(
vmull_s16(vget_low_s16(q8sums.0), vget_low_s16(q6scales.0)),
vmull_s16(vget_high_s16(q8sums.0), vget_high_s16(q6scales.0)),
),
vaddq_s32(
vmull_s16(vget_low_s16(q8sums.1), vget_low_s16(q6scales.1)),
vmull_s16(vget_high_s16(q8sums.1), vget_high_s16(q6scales.1)),
),
);
let isum_mins = vaddvq_s32(prod);
let mut isum = 0i32;
for _j in 0..QK_K / 128 {
let qhbits = vld1q_u8_x2(qh);
qh = qh.add(32);
let q6bits = vld1q_u8_x4(q6);
q6 = q6.add(64);
let q8bytes = vld1q_s8_x4(q8);
q8 = q8.add(64);
let q6h_0 = vshlq_n_u8(vandq_u8(mone, qhbits.0), 4);
let q6h_1 = vshlq_n_u8(vandq_u8(mone, qhbits.1), 4);
let shifted = vshrq_n_u8(qhbits.0, 2);
let q6h_2 = vshlq_n_u8(vandq_u8(mone, shifted), 4);
let shifted = vshrq_n_u8(qhbits.1, 2);
let q6h_3 = vshlq_n_u8(vandq_u8(mone, shifted), 4);
let q6bytes_0 = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.0, m4b), q6h_0));
let q6bytes_1 = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.1, m4b), q6h_1));
let q6bytes_2 = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.2, m4b), q6h_2));
let q6bytes_3 = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.3, m4b), q6h_3));
let p0 = vdotq_s32(q6bytes_0, q8bytes.0);
let p1 = vdotq_s32(q6bytes_1, q8bytes.1);
let (scale0, scale1) = (*scale as i32, *scale.add(1) as i32);
isum += vaddvq_s32(p0) * scale0 + vaddvq_s32(p1) * scale1;
scale = scale.add(2);
let p2 = vdotq_s32(q6bytes_2, q8bytes.2);
let p3 = vdotq_s32(q6bytes_3, q8bytes.3);
let (scale0, scale1) = (*scale as i32, *scale.add(1) as i32);
isum += vaddvq_s32(p2) * scale0 + vaddvq_s32(p3) * scale1;
scale = scale.add(2);
let q8bytes = vld1q_s8_x4(q8);
q8 = q8.add(64);
let shifted = vshrq_n_u8(qhbits.0, 4);
let q6h_0 = vshlq_n_u8(vandq_u8(mone, shifted), 4);
let shifted = vshrq_n_u8(qhbits.1, 4);
let q6h_1 = vshlq_n_u8(vandq_u8(mone, shifted), 4);
let shifted = vshrq_n_u8(qhbits.0, 6);
let q6h_2 = vshlq_n_u8(vandq_u8(mone, shifted), 4);
let shifted = vshrq_n_u8(qhbits.1, 6);
let q6h_3 = vshlq_n_u8(vandq_u8(mone, shifted), 4);
let q6bytes_0 = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.0, 4), q6h_0));
let q6bytes_1 = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.1, 4), q6h_1));
let q6bytes_2 = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.2, 4), q6h_2));
let q6bytes_3 = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.3, 4), q6h_3));
let p0 = vdotq_s32(q6bytes_0, q8bytes.0);
let p1 = vdotq_s32(q6bytes_1, q8bytes.1);
let (scale0, scale1) = (*scale as i32, *scale.add(1) as i32);
isum += vaddvq_s32(p0) * scale0 + vaddvq_s32(p1) * scale1;
scale = scale.add(2);
let p2 = vdotq_s32(q6bytes_2, q8bytes.2);
let p3 = vdotq_s32(q6bytes_3, q8bytes.3);
let (scale0, scale1) = (*scale as i32, *scale.add(1) as i32);
isum += vaddvq_s32(p2) * scale0 + vaddvq_s32(p3) * scale1;
scale = scale.add(2);
}
sum += d_all * y.d * ((isum - 32 * isum_mins) as f32);
}
}
Ok(sum)
}
#[inline(always)]
pub(crate) fn vec_dot_q5k_q8k(n: usize, xs: &[BlockQ5K], ys: &[BlockQ8K]) -> Result<f32> {
if n % QK_K != 0 {
crate::bail!("vec_dot_q5k_q8k: {n} is not divisible by {QK_K}")
}
let mut sumf = 0f32;
let mut utmp = [0u32; 4];
const KMASK1: u32 = 0x3f3f3f3f;
const KMASK2: u32 = 0x0f0f0f0f;
const KMASK3: u32 = 0x03030303;
unsafe {
let m4b = vdupq_n_u8(0xF);
let mone = vdupq_n_u8(1);
let mtwo = vdupq_n_u8(2);
for (x, y) in xs.iter().zip(ys.iter()) {
let d = y.d * x.d.to_f32();
let dmin = y.d * x.dmin.to_f32();
let q8sums = vpaddq_s16(
vld1q_s16(y.bsums.as_ptr()),
vld1q_s16(y.bsums.as_ptr().add(8)),
);
LittleEndian::read_u32_into(&x.scales, &mut utmp[0..3]);
utmp[3] = ((utmp[2] >> 4) & KMASK2) | (((utmp[1] >> 6) & KMASK3) << 4);
let uaux = utmp[1] & KMASK1;
utmp[1] = (utmp[2] & KMASK2) | (((utmp[0] >> 6) & KMASK3) << 4);
utmp[2] = uaux;
utmp[0] &= KMASK1;
let mins8 = vld1_u8((utmp.as_ptr() as *const u8).add(8));
let mins = vreinterpretq_s16_u16(vmovl_u8(mins8));
let prod = vaddq_s32(
vmull_s16(vget_low_s16(q8sums), vget_low_s16(mins)),
vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins)),
);
let sumi_mins = vaddvq_s32(prod);
let mut scales = utmp.as_ptr() as *const u8;
let mut q5 = x.qs.as_ptr();
let mut q8 = y.qs.as_ptr();
let mut qhbits = vld1q_u8_x2(x.qh.as_ptr());
let mut sumi = 0i32;
for _j in 0..QK_K / 64 {
let q5bits = vld1q_u8_x2(q5);
q5 = q5.add(32);
let q8bytes = vld1q_s8_x4(q8);
q8 = q8.add(64);
let q5h_0 = vshlq_n_u8(vandq_u8(mone, qhbits.0), 4);
let q5h_1 = vshlq_n_u8(vandq_u8(mone, qhbits.1), 4);
let q5h_2 = vshlq_n_u8(vandq_u8(mtwo, qhbits.0), 3);
let q5h_3 = vshlq_n_u8(vandq_u8(mtwo, qhbits.1), 3);
qhbits.0 = vshrq_n_u8(qhbits.0, 2);
qhbits.1 = vshrq_n_u8(qhbits.1, 2);
let q5bytes_0 = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q5bits.0, m4b), q5h_0));
let q5bytes_1 = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q5bits.1, m4b), q5h_1));
let q5bytes_2 = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.0, 4), q5h_2));
let q5bytes_3 = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.1, 4), q5h_3));
let p0 = vdotq_s32(q5bytes_0, q8bytes.0);
let p1 = vdotq_s32(q5bytes_1, q8bytes.1);
sumi += vaddvq_s32(vaddq_s32(p0, p1)) * *scales as i32;
scales = scales.add(1);
let p2 = vdotq_s32(q5bytes_2, q8bytes.2);
let p3 = vdotq_s32(q5bytes_3, q8bytes.3);
sumi += vaddvq_s32(vaddq_s32(p2, p3)) * *scales as i32;
scales = scales.add(1);
}
sumf += d * sumi as f32 - dmin * sumi_mins as f32;
}
}
Ok(sumf)
}
#[inline(always)]
pub(crate) fn vec_dot_q4k_q8k(n: usize, xs: &[BlockQ4K], ys: &[BlockQ8K]) -> Result<f32> {
if n % QK_K != 0 {
crate::bail!("vec_dot_q4k_q8k: {n} is not divisible by {QK_K}")
}
let mut sumf = 0f32;
let mut utmp = [0u32; 4];
let mut scales = [0u8; 16];
const KMASK1: u32 = 0x3f3f3f3f;
const KMASK2: u32 = 0x0f0f0f0f;
const KMASK3: u32 = 0x03030303;
unsafe {
let m4b = vdupq_n_u8(0xF);
for (x, y) in xs.iter().zip(ys.iter()) {
let d = y.d * x.d.to_f32();
let dmin = y.d * x.dmin.to_f32();
let q8sums = vpaddq_s16(
vld1q_s16(y.bsums.as_ptr()),
vld1q_s16(y.bsums.as_ptr().add(8)),
);
LittleEndian::read_u32_into(&x.scales, &mut utmp[0..3]);
let mins8 = vld1_u32(
[
utmp[1] & KMASK1,
((utmp[2] >> 4) & KMASK2) | (((utmp[1] >> 6) & KMASK3) << 4),
]
.as_ptr(),
);
utmp[1] = (utmp[2] & KMASK2) | (((utmp[0] >> 6) & KMASK3) << 4);
utmp[0] &= KMASK1;
let mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins8)));
let prod = vaddq_s32(
vmull_s16(vget_low_s16(q8sums), vget_low_s16(mins)),
vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins)),
);
sumf -= dmin * vaddvq_s32(prod) as f32;
LittleEndian::write_u32_into(&utmp, &mut scales);
let mut q4 = x.qs.as_ptr();
let mut q8 = y.qs.as_ptr();
let mut sumi1 = 0i32;
let mut sumi2 = 0i32;
for j in 0..QK_K / 64 {
let q4bits = vld1q_u8_x2(q4);
q4 = q4.add(32);
let q8bytes = vld1q_s8_x2(q8);
q8 = q8.add(32);
let q4bytes = int8x16x2_t(
vreinterpretq_s8_u8(vandq_u8(q4bits.0, m4b)),
vreinterpretq_s8_u8(vandq_u8(q4bits.1, m4b)),
);
let p0 = vdotq_s32(q4bytes.0, q8bytes.0);
let p1 = vdotq_s32(q4bytes.1, q8bytes.1);
sumi1 += vaddvq_s32(vaddq_s32(p0, p1)) * scales[2 * j] as i32;
let q8bytes = vld1q_s8_x2(q8);
q8 = q8.add(32);
let q4bytes = int8x16x2_t(
vreinterpretq_s8_u8(vshrq_n_u8(q4bits.0, 4)),
vreinterpretq_s8_u8(vshrq_n_u8(q4bits.1, 4)),
);
let p2 = vdotq_s32(q4bytes.0, q8bytes.0);
let p3 = vdotq_s32(q4bytes.1, q8bytes.1);
sumi2 += vaddvq_s32(vaddq_s32(p2, p3)) * scales[2 * j + 1] as i32;
}
sumf += d * (sumi1 + sumi2) as f32;
}
}
Ok(sumf)
}
#[inline(always)]
pub(crate) fn vec_dot_q3k_q8k(n: usize, xs: &[BlockQ3K], ys: &[BlockQ8K]) -> Result<f32> {
if n % QK_K != 0 {
crate::bail!("vec_dot_q3k_q8k: {n} is not divisible by {QK_K}")
}
let mut sumf = 0f32;
let mut utmp = [0u32; 4];
let mut aux = [0u32; 3];
const KMASK1: u32 = 0x03030303;
const KMASK2: u32 = 0x0f0f0f0f;
unsafe {
let m3b = vdupq_n_u8(0x3);
let m0 = vdupq_n_u8(1);
let m1 = vshlq_n_u8(m0, 1);
let m2 = vshlq_n_u8(m0, 2);
let m3 = vshlq_n_u8(m0, 3);
for (x, y) in xs.iter().zip(ys.iter()) {
let d = y.d * x.d.to_f32();
let mut q3 = x.qs.as_ptr();
let qh = x.hmask.as_ptr();
let mut q8 = y.qs.as_ptr();
let mut qhbits = vld1q_u8_x2(qh);
let mut isum = 0i32;
// Set up scales
LittleEndian::read_u32_into(&x.scales, &mut aux);
utmp[3] = ((aux[1] >> 4) & KMASK2) | (((aux[2] >> 6) & KMASK1) << 4);
utmp[2] = ((aux[0] >> 4) & KMASK2) | (((aux[2] >> 4) & KMASK1) << 4);
utmp[1] = (aux[1] & KMASK2) | (((aux[2] >> 2) & KMASK1) << 4);
utmp[0] = (aux[0] & KMASK2) | ((aux[2] & KMASK1) << 4);
let mut scale = utmp.as_mut_ptr() as *mut i8;
for j in 0..16 {
*scale.add(j) -= 32i8
}
for j in 0..QK_K / 128 {
let q3bits = vld1q_u8_x2(q3);
q3 = q3.add(32);
let q8bytes_1 = vld1q_s8_x4(q8);
q8 = q8.add(64);
let q8bytes_2 = vld1q_s8_x4(q8);
q8 = q8.add(64);
let q3h_0 = vshlq_n_u8(vbicq_u8(m0, qhbits.0), 2);
let q3h_1 = vshlq_n_u8(vbicq_u8(m0, qhbits.1), 2);
let q3h_2 = vshlq_n_u8(vbicq_u8(m1, qhbits.0), 1);
let q3h_3 = vshlq_n_u8(vbicq_u8(m1, qhbits.1), 1);
let q3bytes_0 = vsubq_s8(
vreinterpretq_s8_u8(vandq_u8(q3bits.0, m3b)),
vreinterpretq_s8_u8(q3h_0),
);
let q3bytes_1 = vsubq_s8(
vreinterpretq_s8_u8(vandq_u8(q3bits.1, m3b)),
vreinterpretq_s8_u8(q3h_1),
);
let q3bytes_2 = vsubq_s8(
vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.0, 2), m3b)),
vreinterpretq_s8_u8(q3h_2),
);
let q3bytes_3 = vsubq_s8(
vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.1, 2), m3b)),
vreinterpretq_s8_u8(q3h_3),
);
let p0 = vdotq_s32(q3bytes_0, q8bytes_1.0);
let p1 = vdotq_s32(q3bytes_1, q8bytes_1.1);
let p2 = vdotq_s32(q3bytes_2, q8bytes_1.2);
let p3 = vdotq_s32(q3bytes_3, q8bytes_1.3);
isum += vaddvq_s32(p0) * *scale as i32
+ vaddvq_s32(p1) * *scale.add(1) as i32
+ vaddvq_s32(p2) * *scale.add(2) as i32
+ vaddvq_s32(p3) * *scale.add(3) as i32;
scale = scale.add(4);
let q3h_0 = vbicq_u8(m2, qhbits.0);
let q3h_1 = vbicq_u8(m2, qhbits.1);
let q3h_2 = vshrq_n_u8(vbicq_u8(m3, qhbits.0), 1);
let q3h_3 = vshrq_n_u8(vbicq_u8(m3, qhbits.1), 1);
let q3bytes_0 = vsubq_s8(
vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.0, 4), m3b)),
vreinterpretq_s8_u8(q3h_0),
);
let q3bytes_1 = vsubq_s8(
vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.1, 4), m3b)),
vreinterpretq_s8_u8(q3h_1),
);
let q3bytes_2 = vsubq_s8(
vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.0, 6), m3b)),
vreinterpretq_s8_u8(q3h_2),
);
let q3bytes_3 = vsubq_s8(
vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.1, 6), m3b)),
vreinterpretq_s8_u8(q3h_3),
);
let p0 = vdotq_s32(q3bytes_0, q8bytes_2.0);
let p1 = vdotq_s32(q3bytes_1, q8bytes_2.1);
let p2 = vdotq_s32(q3bytes_2, q8bytes_2.2);
let p3 = vdotq_s32(q3bytes_3, q8bytes_2.3);
isum += vaddvq_s32(p0) * *scale as i32
+ vaddvq_s32(p1) * *scale.add(1) as i32
+ vaddvq_s32(p2) * *scale.add(2) as i32
+ vaddvq_s32(p3) * *scale.add(3) as i32;
scale = scale.add(4);
if j == 0 {
qhbits.0 = vshrq_n_u8(qhbits.0, 4);
qhbits.1 = vshrq_n_u8(qhbits.1, 4);
}
}
sumf += d * isum as f32;
}
}
Ok(sumf)
}
#[inline(always)]
pub(crate) fn vec_dot_q2k_q8k(n: usize, xs: &[BlockQ2K], ys: &[BlockQ8K]) -> Result<f32> {
if n % QK_K != 0 {
crate::bail!("vec_dot_q2k_q8k: {n} is not divisible by {QK_K}")
}
let mut sumf = 0f32;
let mut aux = [0u8; 16];
unsafe {
let m3 = vdupq_n_u8(0x3);
let m4 = vdupq_n_u8(0xF);
for (x, y) in xs.iter().zip(ys.iter()) {
let d = y.d * x.d.to_f32();
let dmin = -y.d * x.dmin.to_f32();
let mut q2 = x.qs.as_ptr();
let mut q8 = y.qs.as_ptr();
let sc = x.scales.as_ptr();
let mins_and_scales = vld1q_u8(sc);
let scales = vandq_u8(mins_and_scales, m4);
vst1q_u8(aux.as_mut_ptr(), scales);
let mins = vshrq_n_u8(mins_and_scales, 4);
let q8sums = vld1q_s16_x2(y.bsums.as_ptr());
let mins16 = int16x8x2_t(
vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))),
vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins))),
);
let s0 = vaddq_s32(
vmull_s16(vget_low_s16(mins16.0), vget_low_s16(q8sums.0)),
vmull_s16(vget_high_s16(mins16.0), vget_high_s16(q8sums.0)),
);
let s1 = vaddq_s32(
vmull_s16(vget_low_s16(mins16.1), vget_low_s16(q8sums.1)),
vmull_s16(vget_high_s16(mins16.1), vget_high_s16(q8sums.1)),
);
sumf += dmin * vaddvq_s32(vaddq_s32(s0, s1)) as f32;
let mut isum = 0i32;
let mut is = 0usize;
// TODO: dotprod
for _j in 0..QK_K / 128 {
let q2bits = vld1q_u8_x2(q2);
q2 = q2.add(32);
let q8bytes = vld1q_s8_x2(q8);
q8 = q8.add(32);
let mut q2bytes = int8x16x2_t(
vreinterpretq_s8_u8(vandq_u8(q2bits.0, m3)),
vreinterpretq_s8_u8(vandq_u8(q2bits.1, m3)),
);
isum += multiply_accum_with_scale(&aux, is, 0, q2bytes, q8bytes);
let q8bytes = vld1q_s8_x2(q8);
q8 = q8.add(32);
q2bytes.0 = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.0, 2), m3));
q2bytes.1 = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.1, 2), m3));
isum += multiply_accum_with_scale(&aux, is, 2, q2bytes, q8bytes);
let q8bytes = vld1q_s8_x2(q8);
q8 = q8.add(32);
q2bytes.0 = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.0, 4), m3));
q2bytes.1 = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.1, 4), m3));
isum += multiply_accum_with_scale(&aux, is, 4, q2bytes, q8bytes);
let q8bytes = vld1q_s8_x2(q8);
q8 = q8.add(32);
q2bytes.0 = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.0, 6), m3));
q2bytes.1 = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.1, 6), m3));
isum += multiply_accum_with_scale(&aux, is, 6, q2bytes, q8bytes);
is += 8;
}
sumf += d * isum as f32;
}
}
Ok(sumf)
}
#[inline(always)]
unsafe fn multiply_accum_with_scale(
aux: &[u8; 16],
is: usize,
index: usize,
q2bytes: int8x16x2_t,
q8bytes: int8x16x2_t,
) -> i32 {
let p1 = vdotq_s32(q2bytes.0, q8bytes.0);
let p2 = vdotq_s32(q2bytes.1, q8bytes.1);
vaddvq_s32(p1) * aux[is + index] as i32 + vaddvq_s32(p2) * aux[is + 1 + index] as i32
}
use super::k_quants::{BlockQ2K, BlockQ4K, BlockQ4_0, BlockQ6K, BlockQ8K, BlockQ8_0, QK8_0, QK_K};
use crate::Result;
use byteorder::{ByteOrder, LittleEndian};
use half::f16;
use core::arch::wasm32::*;
#[inline(always)]
pub(crate) fn vec_dot_q4_0_q8_0(n: usize, xs: &[BlockQ4_0], ys: &[BlockQ8_0]) -> Result<f32> {
let qk = QK8_0;
if n % QK8_0 != 0 {
crate::bail!("vec_dot_q4_0_q8_0: {n} is not divisible by {qk}")
}
unsafe {
let mut acc = f32x4_splat(0.0f32);
for (x, y) in xs.iter().zip(ys.iter()) {
let x1234 = v128_load(x.qs.as_ptr() as *const v128);
let x12 = v128_and(x1234, u8x16_splat(0x0F));
let x12 = i8x16_sub(x12, i8x16_splat(8));
let x34 = u8x16_shr(x1234, 4);
let x34 = i8x16_sub(x34, i8x16_splat(8));
let x1 = i16x8_extend_low_i8x16(x12);
let y1 = i16x8_load_extend_i8x8(y.qs.as_ptr());
let sum_xy = i32x4_dot_i16x8(x1, y1);
let x2 = i16x8_extend_high_i8x16(x12);
let y2 = i16x8_load_extend_i8x8(y.qs.as_ptr().add(8));
let sum_xy = i32x4_add(sum_xy, i32x4_dot_i16x8(x2, y2));
let x3 = i16x8_extend_low_i8x16(x34);
let y3 = i16x8_load_extend_i8x8(y.qs.as_ptr().add(16));
let sum_xy = i32x4_add(sum_xy, i32x4_dot_i16x8(x3, y3));
let x4 = i16x8_extend_high_i8x16(x34);
let y4 = i16x8_load_extend_i8x8(y.qs.as_ptr().add(24));
let sum_xy = i32x4_add(sum_xy, i32x4_dot_i16x8(x4, y4));
let sum_xy = f32x4_convert_i32x4(sum_xy);
// f32x4_relaxed_madd is nightly only.
let d = f32x4_splat(f16::to_f32(x.d) * f16::to_f32(y.d));
let scaled = f32x4_mul(sum_xy, d);
acc = f32x4_add(acc, scaled)
}
let res = f32x4_extract_lane::<0>(acc)
+ f32x4_extract_lane::<1>(acc)
+ f32x4_extract_lane::<2>(acc)
+ f32x4_extract_lane::<3>(acc);
Ok(res)
}
}
#[inline(always)]
pub(crate) fn vec_dot_q8_0_q8_0(n: usize, xs: &[BlockQ8_0], ys: &[BlockQ8_0]) -> Result<f32> {
let qk = QK8_0;
if n % QK8_0 != 0 {
crate::bail!("vec_dot_q8_0_q8_0: {n} is not divisible by {qk}")
}
unsafe {
let mut acc = f32x4_splat(0.0f32);
for (x, y) in xs.iter().zip(ys.iter()) {
let x1 = i16x8_load_extend_i8x8(x.qs.as_ptr());
let y1 = i16x8_load_extend_i8x8(y.qs.as_ptr());
let sum_xy = i32x4_dot_i16x8(x1, y1);
let x2 = i16x8_load_extend_i8x8(x.qs.as_ptr().add(8));
let y2 = i16x8_load_extend_i8x8(y.qs.as_ptr().add(8));
let sum_xy = i32x4_add(sum_xy, i32x4_dot_i16x8(x2, y2));
let x3 = i16x8_load_extend_i8x8(x.qs.as_ptr().add(16));
let y3 = i16x8_load_extend_i8x8(y.qs.as_ptr().add(16));
let sum_xy = i32x4_add(sum_xy, i32x4_dot_i16x8(x3, y3));
let x4 = i16x8_load_extend_i8x8(x.qs.as_ptr().add(24));
let y4 = i16x8_load_extend_i8x8(y.qs.as_ptr().add(24));
let sum_xy = i32x4_add(sum_xy, i32x4_dot_i16x8(x4, y4));
let sum_xy = f32x4_convert_i32x4(sum_xy);
// f32x4_relaxed_madd is nightly only.
let d = f32x4_splat(f16::to_f32(x.d) * f16::to_f32(y.d));
let scaled = f32x4_mul(sum_xy, d);
acc = f32x4_add(acc, scaled)
}
let res = f32x4_extract_lane::<0>(acc)
+ f32x4_extract_lane::<1>(acc)
+ f32x4_extract_lane::<2>(acc)
+ f32x4_extract_lane::<3>(acc);
Ok(res)
}
}
#[inline(always)]
pub(crate) fn vec_dot_q2k_q8k(n: usize, xs: &[BlockQ2K], ys: &[BlockQ8K]) -> Result<f32> {
if n % QK_K != 0 {
crate::bail!("vec_dot_q2k_q8k: {n} is not divisible by {QK_K}")
}
unsafe {
let mut sumf = f32x4_splat(0f32);
for (x, y) in xs.iter().zip(ys.iter()) {
let mut q2: &[_] = &x.qs;
let mut q8: &[_] = &y.qs;
let sc = &x.scales;
let mut summs = i32x4_splat(0);
for i in (0..(QK_K / 16)).step_by(4) {
let bsums = i32x4_load_extend_i16x4(y.bsums.as_ptr().add(i));
let scales = i32x4_shr(
i32x4(
sc[i] as i32,
sc[i + 1] as i32,
sc[i + 2] as i32,
sc[i + 3] as i32,
),
4,
);
summs = i32x4_add(summs, i32x4_mul(bsums, scales))
}
let summs = f32x4_convert_i32x4(summs);
let dall = y.d * x.d.to_f32();
let dmin = y.d * x.dmin.to_f32();
let mut isum = i32x4_splat(0);
let mut is = 0;
for _ in 0..(QK_K / 128) {
let mut shift = 0;
for _ in 0..4 {
let d = (sc[is] & 0xF) as i32;
is += 1;
let mut isuml = i16x8_splat(0);
for l in (0..16).step_by(8) {
let q8 = i16x8_load_extend_i8x8(q8.as_ptr().add(l));
let q2 = i16x8_load_extend_u8x8(q2.as_ptr().add(l));
let q2 = v128_and(i16x8_shr(q2, shift), i16x8_splat(3));
isuml = i16x8_add(isuml, i16x8_mul(q2, q8))
}
let dd = i32x4_splat(d);
isum = i32x4_add(isum, i32x4_mul(i32x4_extend_low_i16x8(isuml), dd));
isum = i32x4_add(isum, i32x4_mul(i32x4_extend_high_i16x8(isuml), dd));
let d = (sc[is] & 0xF) as i32;
is += 1;
let mut isuml = i16x8_splat(0);
for l in (16..32).step_by(8) {
let q8 = i16x8_load_extend_i8x8(q8.as_ptr().add(l));
let q2 = i16x8_load_extend_u8x8(q2.as_ptr().add(l));
let q2 = v128_and(i16x8_shr(q2, shift), i16x8_splat(3));
isuml = i16x8_add(isuml, i16x8_mul(q2, q8))
}
let dd = i32x4_splat(d);
isum = i32x4_add(isum, i32x4_mul(i32x4_extend_low_i16x8(isuml), dd));
isum = i32x4_add(isum, i32x4_mul(i32x4_extend_high_i16x8(isuml), dd));
shift += 2;
// adjust the indexing
q8 = &q8[32..];
}
// adjust the indexing
q2 = &q2[32..];
}
let isum = f32x4_convert_i32x4(isum);
sumf = f32x4_add(
sumf,
f32x4_sub(
f32x4_mul(isum, f32x4_splat(dall)),
f32x4_mul(summs, f32x4_splat(dmin)),
),
);
}
let sumf = f32x4_extract_lane::<0>(sumf)
+ f32x4_extract_lane::<1>(sumf)
+ f32x4_extract_lane::<2>(sumf)
+ f32x4_extract_lane::<3>(sumf);
Ok(sumf)
}
}
#[inline(always)]
pub(crate) fn vec_dot_q4k_q8k(n: usize, xs: &[BlockQ4K], ys: &[BlockQ8K]) -> Result<f32> {
if n % QK_K != 0 {
crate::bail!("vec_dot_q4k_q8k: {n} is not divisible by {QK_K}")
}
const KMASK1: u32 = 0x3f3f3f3f;
const KMASK2: u32 = 0x0f0f0f0f;
const KMASK3: u32 = 0x03030303;
let mut utmp: [u32; 4] = [0; 4];
let mut scales: [u8; 8] = [0; 8];
let mut mins: [u8; 8] = [0; 8];
let mut aux8: [u8; QK_K] = [0; QK_K];
let mut sums = f32x4_splat(0f32);
unsafe {
for (y, x) in ys.iter().zip(xs.iter()) {
let q4 = &x.qs;
let q8 = &y.qs;
for j in 0..QK_K / 64 {
let q4_1 = v128_load(q4.as_ptr().add(32 * j) as *const v128);
let q4_2 = v128_load(q4.as_ptr().add(32 * j + 16) as *const v128);
v128_store(
aux8.as_mut_ptr().add(64 * j) as *mut v128,
v128_and(q4_1, u8x16_splat(0x0F)),
);
v128_store(
aux8.as_mut_ptr().add(64 * j + 16) as *mut v128,
v128_and(q4_2, u8x16_splat(0x0F)),
);
v128_store(
aux8.as_mut_ptr().add(64 * j + 32) as *mut v128,
u8x16_shr(q4_1, 4),
);
v128_store(
aux8.as_mut_ptr().add(64 * j + 48) as *mut v128,
u8x16_shr(q4_2, 4),
);
}
LittleEndian::read_u32_into(&x.scales, &mut utmp[0..3]);
utmp[3] = ((utmp[2] >> 4) & KMASK2) | (((utmp[1] >> 6) & KMASK3) << 4);
let uaux = utmp[1] & KMASK1;
utmp[1] = (utmp[2] & KMASK2) | (((utmp[0] >> 6) & KMASK3) << 4);
utmp[2] = uaux;
utmp[0] &= KMASK1;
            // Extract the scales and the mins.
LittleEndian::write_u32_into(&utmp[0..2], &mut scales);
LittleEndian::write_u32_into(&utmp[2..4], &mut mins);
let mut sumi = i32x4_splat(0);
for j in (0..QK_K / 16).step_by(4) {
let bsums = i32x4_load_extend_i16x4(y.bsums.as_ptr().add(j));
let (m1, m2) = (mins[j / 2] as i32, mins[j / 2 + 1] as i32);
let mins = i32x4(m1, m1, m2, m2);
sumi = i32x4_add(sumi, i32x4_mul(bsums, mins));
}
let mut aux32 = i32x4_splat(0i32);
for (scale_i, scale) in scales.iter().enumerate() {
let scale = i32x4_splat(*scale as i32);
for j in 0..4 {
let i = 32 * scale_i + 8 * j;
let q8 = i16x8_load_extend_i8x8(q8.as_ptr().add(i));
let aux8 = i16x8_load_extend_u8x8(aux8.as_ptr().add(i));
let aux16 = i16x8_mul(q8, aux8);
aux32 = i32x4_add(aux32, i32x4_mul(scale, i32x4_extend_low_i16x8(aux16)));
aux32 = i32x4_add(aux32, i32x4_mul(scale, i32x4_extend_high_i16x8(aux16)));
}
}
let aux32 = f32x4_convert_i32x4(aux32);
let d = f32x4_splat(x.d.to_f32() * y.d);
sums = f32x4_add(sums, f32x4_mul(aux32, d));
let dmin = x.dmin.to_f32() * y.d;
let dmin = f32x4_splat(dmin);
let sumi = f32x4_convert_i32x4(sumi);
sums = f32x4_sub(sums, f32x4_mul(sumi, dmin));
}
let sums = f32x4_extract_lane::<0>(sums)
+ f32x4_extract_lane::<1>(sums)
+ f32x4_extract_lane::<2>(sums)
+ f32x4_extract_lane::<3>(sums);
Ok(sums)
}
}
#[inline(always)]
pub(crate) fn vec_dot_q6k_q8k(n: usize, xs: &[BlockQ6K], ys: &[BlockQ8K]) -> Result<f32> {
if n % QK_K != 0 {
crate::bail!("vec_dot_q6k_q8k: {n} is not divisible by {QK_K}")
}
let mut aux8 = [0i8; QK_K];
unsafe {
let mut sums = f32x4_splat(0f32);
for (x, y) in xs.iter().zip(ys.iter()) {
let q4 = &x.ql;
let qh = &x.qh;
let q8 = &y.qs;
let mut aux32 = f32x4_splat(0f32);
for j in (0..QK_K).step_by(128) {
let aux8 = aux8.as_mut_ptr().add(j);
let q4 = &q4.as_ptr().add(j / 2);
let qh = &qh.as_ptr().add(j / 4);
for l in (0..32).step_by(16) {
// aux8[l] = (((q4[l] & 0xF) | ((qh[l] & 3) << 4)) as i32 - 32) as i8;
let a8 = v128_or(
v128_and(v128_load(q4.add(l) as *const v128), u8x16_splat(0xF)),
u8x16_shl(
v128_and(v128_load(qh.add(l) as *const v128), u8x16_splat(3)),
4,
),
);
let a8_low = i16x8_sub(i16x8_extend_low_u8x16(a8), i16x8_splat(32));
let a8_high = i16x8_sub(i16x8_extend_high_u8x16(a8), i16x8_splat(32));
v128_store(
aux8.add(l) as *mut v128,
i8x16_narrow_i16x8(a8_low, a8_high),
);
// aux8[l + 32] =
// (((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) as i32 - 32) as i8;
let a8 = v128_or(
v128_and(v128_load(q4.add(l + 32) as *const v128), u8x16_splat(0xF)),
u8x16_shl(
v128_and(
u8x16_shr(v128_load(qh.add(l) as *const v128), 2),
u8x16_splat(3),
),
4,
),
);
let a8_low = i16x8_sub(i16x8_extend_low_u8x16(a8), i16x8_splat(32));
let a8_high = i16x8_sub(i16x8_extend_high_u8x16(a8), i16x8_splat(32));
v128_store(
aux8.add(l + 32) as *mut v128,
i8x16_narrow_i16x8(a8_low, a8_high),
);
// aux8[l + 64] = (((q4[l] >> 4) | (((qh[l] >> 4) & 3) << 4)) as i32 - 32) as i8;
let a8 = v128_or(
u8x16_shr(v128_load(q4.add(l) as *const v128), 4),
u8x16_shl(
v128_and(
u8x16_shr(v128_load(qh.add(l) as *const v128), 4),
u8x16_splat(3),
),
4,
),
);
let a8_low = i16x8_sub(i16x8_extend_low_u8x16(a8), i16x8_splat(32));
let a8_high = i16x8_sub(i16x8_extend_high_u8x16(a8), i16x8_splat(32));
v128_store(
aux8.add(l + 64) as *mut v128,
i8x16_narrow_i16x8(a8_low, a8_high),
);
// aux8[l + 96] =
// (((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) as i32 - 32) as i8;
let a8 = v128_or(
u8x16_shr(v128_load(q4.add(l + 32) as *const v128), 4),
u8x16_shl(
v128_and(
u8x16_shr(v128_load(qh.add(l) as *const v128), 6),
u8x16_splat(3),
),
4,
),
);
let a8_low = i16x8_sub(i16x8_extend_low_u8x16(a8), i16x8_splat(32));
let a8_high = i16x8_sub(i16x8_extend_high_u8x16(a8), i16x8_splat(32));
v128_store(
aux8.add(l + 96) as *mut v128,
i8x16_narrow_i16x8(a8_low, a8_high),
);
}
}
for (j, &scale) in x.scales.iter().enumerate() {
let scale = f32x4_splat(scale as f32);
for offset in [0, 8] {
let aux16 = i16x8_mul(
i16x8_load_extend_i8x8(q8.as_ptr().add(16 * j + offset)),
i16x8_load_extend_i8x8(aux8.as_ptr().add(16 * j + offset)),
);
aux32 = f32x4_add(
aux32,
f32x4_mul(f32x4_convert_i32x4(i32x4_extend_low_i16x8(aux16)), scale),
);
aux32 = f32x4_add(
aux32,
f32x4_mul(f32x4_convert_i32x4(i32x4_extend_high_i16x8(aux16)), scale),
);
}
}
let d = f32x4_splat(x.d.to_f32() * y.d);
sums = f32x4_add(sums, f32x4_mul(aux32, d));
}
let sums = f32x4_extract_lane::<0>(sums)
+ f32x4_extract_lane::<1>(sums)
+ f32x4_extract_lane::<2>(sums)
+ f32x4_extract_lane::<3>(sums);
Ok(sums)
}
}
#[inline(always)]
pub(crate) fn vec_dot_q8k_q8k(n: usize, xs: &[BlockQ8K], ys: &[BlockQ8K]) -> Result<f32> {
let qk = QK_K;
if n % QK_K != 0 {
crate::bail!("vec_dot_q8k_q8k: {n} is not divisible by {qk}")
}
unsafe {
let mut acc = f32x4_splat(0.0f32);
for (xs, ys) in xs.iter().zip(ys.iter()) {
let x_qs = xs.qs.as_ptr();
let y_qs = ys.qs.as_ptr();
let mut sumi = i32x4_splat(0);
for j in (0..QK_K).step_by(8) {
let xs = i16x8_load_extend_i8x8(x_qs.add(j));
let ys = i16x8_load_extend_i8x8(y_qs.add(j));
let sum_xy = i32x4_dot_i16x8(xs, ys);
sumi = i32x4_add(sumi, sum_xy)
}
let d = f32x4_splat(xs.d * ys.d);
acc = f32x4_add(acc, f32x4_mul(f32x4_convert_i32x4(sumi), d))
}
let res = f32x4_extract_lane::<0>(acc)
+ f32x4_extract_lane::<1>(acc)
+ f32x4_extract_lane::<2>(acc)
+ f32x4_extract_lane::<3>(acc);
Ok(res)
}
}
use crate::Result;
pub(super) fn nearest_int(v: f32) -> i32 {
v.round() as i32
}
/// Validates that the input and output are the right size and returns a vector pairing each
/// `T::BLCK_SIZE`-long input region of `xs` with its corresponding output block in `ys`.
pub(super) fn group_for_quantization<'a, 'b, T: super::k_quants::GgmlType>(
xs: &'b [f32],
ys: &'a mut [T],
) -> Result<Vec<(&'a mut T, &'b [f32])>> {
let block_size = T::BLCK_SIZE;
let dtype = T::DTYPE;
let expected_blocks = xs.len() / block_size;
let actual_blocks = ys.len();
// Validate that the input is the right size
if expected_blocks != actual_blocks {
crate::bail!("quantize {dtype:?}: expected {expected_blocks} blocks but only {actual_blocks} were provided!")
}
Ok(ys.iter_mut().zip(xs.chunks_exact(block_size)).collect())
}
/// Validates that the input and output are the right size and returns a vector pairing each
/// input block of `xs` with its corresponding `T::BLCK_SIZE`-long output region in `ys`.
pub(super) fn group_for_dequantization<'a, 'b, T: super::k_quants::GgmlType>(
xs: &'a [T],
ys: &'b mut [f32],
) -> Result<Vec<(&'a T, &'b mut [f32])>> {
let block_size = T::BLCK_SIZE;
let dtype = T::DTYPE;
let actual_output_len = ys.len();
let expected_output_len = xs.len() * block_size;
// Validate that the output is the right size
if expected_output_len != actual_output_len {
crate::bail!("dequantize {dtype:?}: ys (len = {actual_output_len}) does not match the expected length of {expected_output_len}!")
}
// Zip the blocks and outputs together
Ok(xs.iter().zip(ys.chunks_exact_mut(block_size)).collect())
}
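// Unpacks the 6-bit (scale, min) pairs that the Q4K / Q5K blocks store in their
// 12-byte `scales` field: the first four pairs live in the low 6 bits of bytes
// 0..8, the remaining four are split between the low nibbles of bytes 8..12 and
// the spare high bits of bytes 0..8.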
pub(super) fn get_scale_min_k4(j: usize, q: &[u8]) -> (u8, u8) {
if j < 4 {
let d = q[j] & 63;
let m = q[j + 4] & 63;
(d, m)
} else {
let d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4);
let m = (q[j + 4] >> 4) | ((q[j] >> 6) << 4);
(d, m)
}
}
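// Searches for a scale such that x[i] ~= scale * l[i] with integer codes l in
// [-nmax, nmax - 1]. For a fixed set of codes the (optionally x^2-weighted)
// least-squares scale is sum(w*x*l) / sum(w*l*l), which is what the sumlx / suml2
// updates below compute; the remaining passes perturb individual codes and the
// initial scale to reduce the weighted error further.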
pub(super) unsafe fn make_qx_quants(
n: usize,
nmax: i32,
x: *const f32,
ls: *mut i8,
rmse_type: i32,
) -> f32 {
let mut max = 0f32;
let mut amax = 0f32;
for i in 0..n {
let x = *x.add(i);
let ax = x.abs();
if ax > amax {
amax = ax;
max = x;
}
}
if amax == 0. {
// all zero
for i in 0..n {
*ls.add(i) = 0;
}
return 0.;
}
let mut iscale = -(nmax as f32) / max;
if rmse_type == 0 {
for i in 0..n {
let x = *x.add(i);
let l = nearest_int(iscale * x);
*ls.add(i) = (nmax + l.clamp(-nmax, nmax - 1)) as i8;
}
return 1.0 / iscale;
}
let weight_type = rmse_type % 2;
let mut sumlx = 0f32;
let mut suml2 = 0f32;
for i in 0..n {
let x = *x.add(i);
let l = nearest_int(iscale * x);
let l = l.clamp(-nmax, nmax - 1);
*ls.add(i) = (l + nmax) as i8;
let w = if weight_type == 1 { x * x } else { 1.0 };
let l = l as f32;
sumlx += w * x * l;
suml2 += w * l * l;
}
let mut scale = sumlx / suml2;
let mut best = scale * sumlx;
for _itry in 0..3 {
let iscale = 1.0 / scale;
let mut slx = 0f32;
let mut sl2 = 0f32;
let mut changed = false;
for i in 0..n {
let x = *x.add(i);
let l = nearest_int(iscale * x);
let l = l.clamp(-nmax, nmax - 1);
if l + nmax != *ls.add(i) as i32 {
changed = true;
}
let w = if weight_type == 1 { x * x } else { 1f32 };
let l = l as f32;
slx += w * x * l;
sl2 += w * l * l;
}
if !changed || sl2 == 0.0 || slx * slx <= best * sl2 {
break;
}
for i in 0..n {
let x = *x.add(i);
let l = nearest_int(iscale * x);
*ls.add(i) = (nmax + l.clamp(-nmax, nmax - 1)) as i8;
}
sumlx = slx;
suml2 = sl2;
scale = sumlx / suml2;
best = scale * sumlx;
}
for _itry in 0..5 {
let mut n_changed = 0;
for i in 0..n {
let x = *x.add(i);
let w = if weight_type == 1 { x * x } else { 1. };
let l = *ls.add(i) as i32 - nmax;
let mut slx = sumlx - w * x * l as f32;
if slx > 0. {
let mut sl2 = suml2 - w * l as f32 * l as f32;
let new_l = nearest_int(x * sl2 / slx);
let new_l = new_l.clamp(-nmax, nmax - 1);
if new_l != l {
slx += w * x * new_l as f32;
sl2 += w * new_l as f32 * new_l as f32;
if sl2 > 0. && slx * slx * suml2 > sumlx * sumlx * sl2 {
*ls.add(i) = (nmax + new_l) as i8;
sumlx = slx;
suml2 = sl2;
scale = sumlx / suml2;
best = scale * sumlx;
n_changed += 1;
}
}
}
}
if n_changed == 0 {
break;
}
}
if rmse_type < 3 {
return scale;
}
for is in -4..4 {
if is == 0 {
continue;
}
iscale = -(nmax as f32 + 0.1f32 * is as f32) / max;
let mut sumlx = 0.;
let mut suml2 = 0.;
for i in 0..n {
let x = *x.add(i);
let l = nearest_int(iscale * x);
let l = l.clamp(-nmax, nmax - 1);
let w = if weight_type == 1 { x * x } else { 1. };
let l = l as f32;
sumlx += w * x * l;
suml2 += w * l * l;
}
if suml2 > 0. && sumlx * sumlx > best * suml2 {
for i in 0..n {
let x = *x.add(i);
let l = nearest_int(iscale * x);
*ls.add(i) = (nmax + l.clamp(-nmax, nmax - 1)) as i8;
}
scale = sumlx / suml2;
best = scale * sumlx;
}
}
scale
}
// https://github.com/ggerganov/llama.cpp/blob/8183159cf3def112f6d1fe94815fce70e1bffa12/k_quants.c#L224
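// Asymmetric quantization helper: finds `scale` and `min` so that `x[i] ~ scale * l[i] + min`
// with `l[i]` in `[0, nmax]`, iterating at most `ntry` times. Returns `(scale, -min)`.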
pub(super) fn make_qkx1_quants(nmax: i32, ntry: usize, x: &[f32]) -> (f32, f32) {
let n = x.len();
let mut l = vec![0; n];
// Get min/max
let min = *x
.iter()
.take(n)
.min_by(|a, b| a.total_cmp(b))
.unwrap_or(&x[0]);
let max = *x.iter().max_by(|a, b| a.total_cmp(b)).unwrap_or(&x[0]);
// If min == max, all values are the same => nothing to do here
if max == min {
return (0.0, 0.0);
}
// Ensure min <= 0.0
let mut min = min.min(0.);
// Compute scale and inverse scale
let mut iscale = nmax as f32 / (max - min);
let mut scale = 1.0 / iscale;
for _ in 0..ntry {
let mut sumlx = 0.0;
let mut suml2 = 0;
let mut did_change = false;
for (i, value) in x.iter().enumerate().take(n) {
let li = nearest_int(iscale * (value - min)).clamp(0, nmax);
let clamped_li = li as u8;
if clamped_li != l[i] {
l[i] = clamped_li;
did_change = true;
}
sumlx += (value - min) * li as f32;
suml2 += li * li;
}
scale = sumlx / suml2 as f32;
let sum: f32 = x
.iter()
.take(n)
.zip(l.iter().take(n))
.map(|(xi, &li)| xi - scale * li as f32)
.sum();
min = sum / n as f32;
if min > 0.0 {
min = 0.0;
}
iscale = 1.0 / scale;
if !did_change {
break;
}
}
(scale, -min)
}
// https://github.com/ggerganov/llama.cpp/blob/8183159cf3def112f6d1fe94815fce70e1bffa12/k_quants.c#L165
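// Symmetric quantization helper: finds a scale so that `x[i] ~ scale * l[i]` with `l[i]` clamped
// to `[-nmax, nmax - 1]`; when `do_rmse` is set the levels are iteratively refined to minimize
// the x^2-weighted squared error. Only the scale is returned.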
pub(super) fn make_q3_quants(x: &[f32], nmax: i32, do_rmse: bool) -> f32 {
let n = x.len();
let mut l = vec![0i8; n];
let mut max = 0.0;
let mut amax = 0.0;
for &xi in x.iter().take(n) {
let ax = xi.abs();
if ax > amax {
amax = ax;
max = xi;
}
}
if amax == 0.0 {
return 0.0;
}
let iscale = -(nmax as f32) / max;
if do_rmse {
let mut sumlx = 0.0;
let mut suml2 = 0.0;
for i in 0..n {
let li = (iscale * x[i]).round() as i32;
let li = li.clamp(-nmax, nmax - 1);
l[i] = li as i8;
let w = x[i] * x[i];
sumlx += w * x[i] * li as f32;
suml2 += w * (li * li) as f32;
}
for _ in 0..5 {
let mut n_changed = 0;
for i in 0..n {
let w = x[i] * x[i];
let mut slx = sumlx - w * x[i] * l[i] as f32;
if slx > 0.0 {
let mut sl2 = suml2 - w * (l[i] as i32 * l[i] as i32) as f32;
let mut new_l = (x[i] * sl2 / slx).round() as i32;
new_l = new_l.clamp(-nmax, nmax - 1);
if new_l != l[i] as i32 {
slx += w * x[i] * new_l as f32;
sl2 += w * (new_l * new_l) as f32;
if sl2 > 0.0 && slx * slx * suml2 > sumlx * sumlx * sl2 {
l[i] = new_l as i8;
sumlx = slx;
suml2 = sl2;
n_changed += 1;
}
}
}
}
if n_changed == 0 {
break;
}
}
for li in l.iter_mut() {
*li += nmax as i8;
}
return sumlx / suml2;
}
for i in 0..n {
let li = (iscale * x[i]).round() as i32;
l[i] = (li.clamp(-nmax, nmax - 1) + nmax) as i8;
}
1.0 / iscale
}
use crate::{DType, Device, Error, Result, Tensor, WithDType};
use safetensors::tensor as st;
use safetensors::tensor::SafeTensors;
use std::borrow::Cow;
use std::collections::HashMap;
use std::path::Path;
impl From<DType> for st::Dtype {
fn from(value: DType) -> Self {
match value {
DType::U8 => st::Dtype::U8,
DType::U32 => st::Dtype::U32,
DType::I64 => st::Dtype::I64,
DType::BF16 => st::Dtype::BF16,
DType::F16 => st::Dtype::F16,
DType::F32 => st::Dtype::F32,
DType::F64 => st::Dtype::F64,
}
}
}
impl TryFrom<st::Dtype> for DType {
type Error = Error;
fn try_from(value: st::Dtype) -> Result<Self> {
match value {
st::Dtype::U8 => Ok(DType::U8),
st::Dtype::U32 => Ok(DType::U32),
st::Dtype::I64 => Ok(DType::I64),
st::Dtype::BF16 => Ok(DType::BF16),
st::Dtype::F16 => Ok(DType::F16),
st::Dtype::F32 => Ok(DType::F32),
st::Dtype::F64 => Ok(DType::F64),
dtype => Err(Error::UnsupportedSafeTensorDtype(dtype)),
}
}
}
impl st::View for Tensor {
fn dtype(&self) -> st::Dtype {
self.dtype().into()
}
fn shape(&self) -> &[usize] {
self.shape().dims()
}
fn data(&self) -> Cow<[u8]> {
// This copies data from GPU to CPU.
// TODO: Avoid the unwrap here.
Cow::Owned(convert_back(self).unwrap())
}
fn data_len(&self) -> usize {
let n: usize = self.shape().elem_count();
let bytes_per_element = self.dtype().size_in_bytes();
n * bytes_per_element
}
}
impl st::View for &Tensor {
fn dtype(&self) -> st::Dtype {
(*self).dtype().into()
}
fn shape(&self) -> &[usize] {
self.dims()
}
fn data(&self) -> Cow<[u8]> {
// This copies data from GPU to CPU.
// TODO: Avoid the unwrap here.
Cow::Owned(convert_back(self).unwrap())
}
fn data_len(&self) -> usize {
let n: usize = self.dims().iter().product();
let bytes_per_element = (*self).dtype().size_in_bytes();
n * bytes_per_element
}
}
impl Tensor {
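/// Saves this tensor under the key `name` in a safetensors file at `filename`.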
pub fn save_safetensors<P: AsRef<Path>>(&self, name: &str, filename: P) -> Result<()> {
let data = [(name, self.clone())];
Ok(st::serialize_to_file(data, &None, filename.as_ref())?)
}
}
fn convert_slice<T: WithDType>(data: &[u8], shape: &[usize], device: &Device) -> Result<Tensor> {
let size_in_bytes = T::DTYPE.size_in_bytes();
let elem_count = data.len() / size_in_bytes;
if (data.as_ptr() as usize) % size_in_bytes == 0 {
// SAFETY: This is safe because we just checked that the data pointer is correctly aligned.
let data: &[T] =
unsafe { std::slice::from_raw_parts(data.as_ptr() as *const T, elem_count) };
Tensor::from_slice(data, shape, device)
} else {
// XXX: We need to specify `T` here, otherwise the compiler will infer u8 because of the
// following cast, making this vector too small to fit the full f16/f32/f64 weights and
// resulting in out-of-bounds accesses.
let mut c: Vec<T> = Vec::with_capacity(elem_count);
// SAFETY: We just created c, so the allocated memory is necessarily
// contiguous and non overlapping with the view's data.
// We're downgrading the `c` pointer from T to u8, which removes alignment
// constraints.
unsafe {
std::ptr::copy_nonoverlapping(data.as_ptr(), c.as_mut_ptr() as *mut u8, data.len());
c.set_len(elem_count)
}
Tensor::from_slice(&c, shape, device)
}
}
fn convert_slice_with_cast<T: Sized + Copy, U: WithDType, F: Fn(T) -> Result<U>>(
data: &[u8],
shape: &[usize],
device: &Device,
conv: F,
) -> Result<Tensor> {
let size_in_bytes = std::mem::size_of::<T>();
let elem_count = data.len() / size_in_bytes;
if (data.as_ptr() as usize) % size_in_bytes == 0 {
// SAFETY: This is safe because we just checked that the data pointer is correctly aligned.
let data: &[T] =
unsafe { std::slice::from_raw_parts(data.as_ptr() as *const T, elem_count) };
let data = data.iter().map(|t| conv(*t)).collect::<Result<Vec<_>>>()?;
Tensor::from_vec(data, shape, device)
} else {
// XXX: We need to specify `T` here, otherwise the compiler will infer u8 because of the
// following cast, making this vector too small to fit the full f16/f32/f64 weights and
// resulting in out-of-bounds accesses.
let mut c: Vec<T> = Vec::with_capacity(elem_count);
// SAFETY: We just created c, so the allocated memory is necessarily
// contiguous and non overlapping with the view's data.
// We're downgrading the `c` pointer from T to u8, which removes alignment
// constraints.
unsafe {
std::ptr::copy_nonoverlapping(data.as_ptr(), c.as_mut_ptr() as *mut u8, data.len());
c.set_len(elem_count)
}
let c = c.into_iter().map(conv).collect::<Result<Vec<_>>>()?;
Tensor::from_vec(c, shape, device)
}
}
fn convert_with_cast_<T: Sized + Copy, U: WithDType, F: Fn(T) -> Result<U>>(
view: &st::TensorView<'_>,
device: &Device,
conv: F,
) -> Result<Tensor> {
convert_slice_with_cast::<T, U, F>(view.data(), view.shape(), device, conv)
}
fn convert_<T: WithDType>(view: &st::TensorView<'_>, device: &Device) -> Result<Tensor> {
convert_slice::<T>(view.data(), view.shape(), device)
}
fn convert_back_<T: WithDType>(mut vs: Vec<T>) -> Vec<u8> {
let size_in_bytes = T::DTYPE.size_in_bytes();
let length = vs.len() * size_in_bytes;
let capacity = vs.capacity() * size_in_bytes;
let ptr = vs.as_mut_ptr() as *mut u8;
// Don't run the destructor for Vec<T>
std::mem::forget(vs);
// SAFETY:
//
// Every T is larger than u8 and u8 has an alignment of 1, so there is no issue regarding
// alignment. This re-interprets the Vec<T> as a Vec<u8>.
unsafe { Vec::from_raw_parts(ptr, length, capacity) }
}
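/// Types from which a [`Tensor`] can be loaded onto a given device.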
pub trait Load {
fn load(&self, device: &Device) -> Result<Tensor>;
}
impl<'a> Load for st::TensorView<'a> {
fn load(&self, device: &Device) -> Result<Tensor> {
convert(self, device)
}
}
impl Tensor {
pub fn from_raw_buffer(
data: &[u8],
dtype: DType,
shape: &[usize],
device: &Device,
) -> Result<Self> {
match dtype {
DType::U8 => convert_slice::<u8>(data, shape, device),
DType::U32 => convert_slice::<u32>(data, shape, device),
DType::I64 => convert_slice::<i64>(data, shape, device),
DType::BF16 => convert_slice::<half::bf16>(data, shape, device),
DType::F16 => convert_slice::<half::f16>(data, shape, device),
DType::F32 => convert_slice::<f32>(data, shape, device),
DType::F64 => convert_slice::<f64>(data, shape, device),
}
}
}
fn convert(view: &st::TensorView<'_>, device: &Device) -> Result<Tensor> {
match view.dtype() {
st::Dtype::U8 => convert_::<u8>(view, device),
st::Dtype::U16 => {
let conv = |x| Ok(u32::from(x));
convert_with_cast_::<u16, u32, _>(view, device, conv)
}
st::Dtype::U32 => convert_::<u32>(view, device),
st::Dtype::I32 => {
let conv = |x| Ok(i64::from(x));
convert_with_cast_::<i32, i64, _>(view, device, conv)
}
st::Dtype::I64 => convert_::<i64>(view, device),
st::Dtype::BF16 => convert_::<half::bf16>(view, device),
st::Dtype::F16 => convert_::<half::f16>(view, device),
st::Dtype::F32 => convert_::<f32>(view, device),
st::Dtype::F64 => convert_::<f64>(view, device),
dtype => Err(Error::UnsupportedSafeTensorDtype(dtype)),
}
}
fn convert_back(tensor: &Tensor) -> Result<Vec<u8>> {
// TODO: This makes an unnecessary copy when the tensor is on the cpu.
let tensor = tensor.flatten_all()?;
match tensor.dtype() {
DType::U8 => Ok(convert_back_::<u8>(tensor.to_vec1()?)),
DType::U32 => Ok(convert_back_::<u32>(tensor.to_vec1()?)),
DType::I64 => Ok(convert_back_::<i64>(tensor.to_vec1()?)),
DType::F16 => Ok(convert_back_::<half::f16>(tensor.to_vec1()?)),
DType::BF16 => Ok(convert_back_::<half::bf16>(tensor.to_vec1()?)),
DType::F32 => Ok(convert_back_::<f32>(tensor.to_vec1()?)),
DType::F64 => Ok(convert_back_::<f64>(tensor.to_vec1()?)),
}
}
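/// Loads all the tensors stored in a safetensors file onto the given device, keyed by tensor
/// name. A minimal sketch, assuming this module is exposed as `candle_core::safetensors` and
/// using a placeholder path:
///
/// ```no_run
/// # use candle_core::Device;
/// let tensors = candle_core::safetensors::load("model.safetensors", &Device::Cpu)?;
/// # Ok::<(), candle_core::Error>(())
/// ```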
pub fn load<P: AsRef<Path>>(filename: P, device: &Device) -> Result<HashMap<String, Tensor>> {
let data = std::fs::read(filename.as_ref())?;
load_buffer(&data[..], device)
}
pub fn load_buffer(data: &[u8], device: &Device) -> Result<HashMap<String, Tensor>> {
let st = safetensors::SafeTensors::deserialize(data)?;
st.tensors()
.into_iter()
.map(|(name, view)| Ok((name, view.load(device)?)))
.collect()
}
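/// Saves the given map of named tensors to a safetensors file at `filename`.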
pub fn save<K: AsRef<str> + Ord + std::fmt::Display, P: AsRef<Path>>(
tensors: &HashMap<K, Tensor>,
filename: P,
) -> Result<()> {
Ok(st::serialize_to_file(tensors, &None, filename.as_ref())?)
}
#[derive(yoke::Yokeable)]
struct SafeTensors_<'a>(SafeTensors<'a>);
pub struct MmapedSafetensors {
safetensors: Vec<yoke::Yoke<SafeTensors_<'static>, memmap2::Mmap>>,
routing: Option<HashMap<String, usize>>,
}
impl MmapedSafetensors {
/// Creates a wrapper around a memory mapped file and deserializes the safetensors header.
///
/// # Safety
///
/// The unsafe is inherited from [`memmap2::MmapOptions`].
pub unsafe fn new<P: AsRef<Path>>(p: P) -> Result<Self> {
let p = p.as_ref();
let file = std::fs::File::open(p).map_err(|e| Error::from(e).with_path(p))?;
let file = memmap2::MmapOptions::new()
.map(&file)
.map_err(|e| Error::from(e).with_path(p))?;
let safetensors = yoke::Yoke::<SafeTensors_<'static>, memmap2::Mmap>::try_attach_to_cart(
file,
|data: &[u8]| {
let st = safetensors::SafeTensors::deserialize(data)
.map_err(|e| Error::from(e).with_path(p))?;
Ok::<_, Error>(SafeTensors_(st))
},
)?;
Ok(Self {
safetensors: vec![safetensors],
routing: None,
})
}
/// Creates a wrapper around multiple memory mapped files and deserializes the safetensors headers.
///
/// If a tensor name appears in multiple files, the last entry is returned.
///
/// # Safety
///
/// The unsafe is inherited from [`memmap2::MmapOptions`].
pub unsafe fn multi<P: AsRef<Path>>(paths: &[P]) -> Result<Self> {
let mut routing = HashMap::new();
let mut safetensors = vec![];
for (index, p) in paths.iter().enumerate() {
let p = p.as_ref();
let file = std::fs::File::open(p).map_err(|e| Error::from(e).with_path(p))?;
let file = memmap2::MmapOptions::new()
.map(&file)
.map_err(|e| Error::from(e).with_path(p))?;
let data = yoke::Yoke::<SafeTensors_<'static>, memmap2::Mmap>::try_attach_to_cart(
file,
|data: &[u8]| {
let st = safetensors::SafeTensors::deserialize(data)
.map_err(|e| Error::from(e).with_path(p))?;
Ok::<_, Error>(SafeTensors_(st))
},
)?;
for k in data.get().0.names() {
routing.insert(k.to_string(), index);
}
safetensors.push(data)
}
Ok(Self {
safetensors,
routing: Some(routing),
})
}
pub fn load(&self, name: &str, dev: &Device) -> Result<Tensor> {
self.get(name)?.load(dev)
}
pub fn tensors(&self) -> Vec<(String, st::TensorView<'_>)> {
let mut tensors = vec![];
for safetensors in self.safetensors.iter() {
tensors.push(safetensors.get().0.tensors())
}
tensors.into_iter().flatten().collect()
}
pub fn get(&self, name: &str) -> Result<st::TensorView<'_>> {
let index = match &self.routing {
None => 0,
Some(routing) => {
let index = routing.get(name).ok_or_else(|| {
Error::CannotFindTensor {
path: name.to_string(),
}
.bt()
})?;
*index
}
};
Ok(self.safetensors[index].get().0.tensor(name)?)
}
}
pub struct BufferedSafetensors {
safetensors: yoke::Yoke<SafeTensors_<'static>, Vec<u8>>,
}
impl BufferedSafetensors {
/// Creates a wrapper around a binary buffer and deserializes the safetensors header.
pub fn new(buffer: Vec<u8>) -> Result<Self> {
let safetensors = yoke::Yoke::<SafeTensors_<'static>, Vec<u8>>::try_attach_to_cart(
buffer,
|data: &[u8]| {
let st = safetensors::SafeTensors::deserialize(data)?;
Ok::<_, Error>(SafeTensors_(st))
},
)?;
Ok(Self { safetensors })
}
pub fn load(&self, name: &str, dev: &Device) -> Result<Tensor> {
self.get(name)?.load(dev)
}
pub fn tensors(&self) -> Vec<(String, st::TensorView<'_>)> {
self.safetensors.get().0.tensors()
}
pub fn get(&self, name: &str) -> Result<st::TensorView<'_>> {
Ok(self.safetensors.get().0.tensor(name)?)
}
}
pub struct MmapedFile {
path: std::path::PathBuf,
inner: memmap2::Mmap,
}
impl MmapedFile {
/// Creates a wrapper around a memory mapped file from which you can retrieve
/// tensors using [`MmapedFile::deserialize`]
///
/// # Safety
///
/// The unsafe is inherited from [`memmap2::MmapOptions`].
pub unsafe fn new<P: AsRef<Path>>(p: P) -> Result<Self> {
let p = p.as_ref();
let file = std::fs::File::open(p).map_err(|e| Error::from(e).with_path(p))?;
let inner = memmap2::MmapOptions::new()
.map(&file)
.map_err(|e| Error::from(e).with_path(p))?;
Ok(Self {
inner,
path: p.to_path_buf(),
})
}
pub fn deserialize(&self) -> Result<SafeTensors<'_>> {
let st = safetensors::SafeTensors::deserialize(&self.inner)
.map_err(|e| Error::from(e).with_path(&self.path))?;
Ok(st)
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::collections::HashMap;
#[test]
fn save_single_tensor() {
let t = Tensor::zeros((2, 2), DType::F32, &Device::Cpu).unwrap();
t.save_safetensors("t", "t.safetensors").unwrap();
let bytes = std::fs::read("t.safetensors").unwrap();
assert_eq!(bytes, b"@\0\0\0\0\0\0\0{\"t\":{\"dtype\":\"F32\",\"shape\":[2,2],\"data_offsets\":[0,16]}} \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0");
std::fs::remove_file("t.safetensors").unwrap();
}
#[test]
fn save_load_multiple_tensors() {
let t = Tensor::zeros((2, 2), DType::F32, &Device::Cpu).unwrap();
let u = Tensor::zeros((1, 2), DType::F32, &Device::Cpu).unwrap();
let map: HashMap<_, _> = [("t", t), ("u", u)].into_iter().collect();
save(&map, "multi.safetensors").unwrap();
let weights = load("multi.safetensors", &Device::Cpu).unwrap();
assert_eq!(weights.get("t").unwrap().dims(), &[2, 2]);
assert_eq!(weights.get("u").unwrap().dims(), &[1, 2]);
let bytes = std::fs::read("multi.safetensors").unwrap();
assert_eq!(bytes, b"x\0\0\0\0\0\0\0{\"t\":{\"dtype\":\"F32\",\"shape\":[2,2],\"data_offsets\":[0,16]},\"u\":{\"dtype\":\"F32\",\"shape\":[1,2],\"data_offsets\":[16,24]}} \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0");
std::fs::remove_file("multi.safetensors").unwrap();
}
}
use crate::{Result, Tensor, WithDType};
pub enum TensorScalar {
Tensor(Tensor),
Scalar(Tensor),
}
pub trait TensorOrScalar {
fn to_tensor_scalar(self) -> Result<TensorScalar>;
}
impl TensorOrScalar for &Tensor {
fn to_tensor_scalar(self) -> Result<TensorScalar> {
Ok(TensorScalar::Tensor(self.clone()))
}
}
impl<T: WithDType> TensorOrScalar for T {
fn to_tensor_scalar(self) -> Result<TensorScalar> {
let scalar = Tensor::new(self, &crate::Device::Cpu)?;
Ok(TensorScalar::Scalar(scalar))
}
}
//! The shape of a tensor is a tuple with the size of each of its dimensions.
#![allow(clippy::redundant_closure_call)]
use crate::{Error, Result};
#[derive(Clone, PartialEq, Eq)]
pub struct Shape(Vec<usize>);
pub const SCALAR: Shape = Shape(vec![]);
impl std::fmt::Debug for Shape {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{:?}", &self.dims())
}
}
impl<const C: usize> From<&[usize; C]> for Shape {
fn from(dims: &[usize; C]) -> Self {
Self(dims.to_vec())
}
}
impl From<&[usize]> for Shape {
fn from(dims: &[usize]) -> Self {
Self(dims.to_vec())
}
}
impl From<&Shape> for Shape {
fn from(shape: &Shape) -> Self {
Self(shape.0.to_vec())
}
}
impl From<()> for Shape {
fn from(_: ()) -> Self {
Self(vec![])
}
}
impl From<usize> for Shape {
fn from(d1: usize) -> Self {
Self(vec![d1])
}
}
impl From<(usize,)> for Shape {
fn from(d1: (usize,)) -> Self {
Self(vec![d1.0])
}
}
impl From<(usize, usize)> for Shape {
fn from(d12: (usize, usize)) -> Self {
Self(vec![d12.0, d12.1])
}
}
impl From<(usize, usize, usize)> for Shape {
fn from(d123: (usize, usize, usize)) -> Self {
Self(vec![d123.0, d123.1, d123.2])
}
}
impl From<(usize, usize, usize, usize)> for Shape {
fn from(d1234: (usize, usize, usize, usize)) -> Self {
Self(vec![d1234.0, d1234.1, d1234.2, d1234.3])
}
}
impl From<(usize, usize, usize, usize, usize)> for Shape {
fn from(d12345: (usize, usize, usize, usize, usize)) -> Self {
Self(vec![d12345.0, d12345.1, d12345.2, d12345.3, d12345.4])
}
}
impl From<(usize, usize, usize, usize, usize, usize)> for Shape {
fn from(d123456: (usize, usize, usize, usize, usize, usize)) -> Self {
Self(vec![
d123456.0, d123456.1, d123456.2, d123456.3, d123456.4, d123456.5,
])
}
}
impl From<Vec<usize>> for Shape {
fn from(dims: Vec<usize>) -> Self {
Self(dims)
}
}
macro_rules! extract_dims {
($fn_name:ident, $cnt:tt, $dims:expr, $out_type:ty) => {
pub fn $fn_name(dims: &[usize]) -> Result<$out_type> {
if dims.len() != $cnt {
Err(Error::UnexpectedNumberOfDims {
expected: $cnt,
got: dims.len(),
shape: Shape::from(dims),
}
.bt())
} else {
Ok($dims(dims))
}
}
impl Shape {
pub fn $fn_name(&self) -> Result<$out_type> {
$fn_name(self.0.as_slice())
}
}
impl crate::Tensor {
pub fn $fn_name(&self) -> Result<$out_type> {
self.shape().$fn_name()
}
}
impl std::convert::TryInto<$out_type> for Shape {
type Error = crate::Error;
fn try_into(self) -> std::result::Result<$out_type, Self::Error> {
self.$fn_name()
}
}
};
}
impl Shape {
pub fn from_dims(dims: &[usize]) -> Self {
Self(dims.to_vec())
}
/// The rank is the number of dimensions, 0 for a scalar value, 1 for a vector, etc.
pub fn rank(&self) -> usize {
self.0.len()
}
pub fn into_dims(self) -> Vec<usize> {
self.0
}
/// The dimensions as a slice of `usize`.
pub fn dims(&self) -> &[usize] {
&self.0
}
/// The total number of elements, i.e. the product of all dimension sizes.
pub fn elem_count(&self) -> usize {
self.0.iter().product()
}
/// The strides, given in number of elements, for a contiguous n-dimensional
/// array using this shape.
pub(crate) fn stride_contiguous(&self) -> Vec<usize> {
let mut stride: Vec<_> = self
.0
.iter()
.rev()
.scan(1, |prod, u| {
let prod_pre_mult = *prod;
*prod *= u;
Some(prod_pre_mult)
})
.collect();
stride.reverse();
stride
}
/// Returns true if the strides are C contiguous (aka row major).
pub fn is_contiguous(&self, stride: &[usize]) -> bool {
if self.0.len() != stride.len() {
return false;
}
let mut acc = 1;
for (&stride, &dim) in stride.iter().zip(self.0.iter()).rev() {
if dim > 1 && stride != acc {
return false;
}
acc *= dim;
}
true
}
/// Returns true if the strides are Fortran contiguous (aka column major).
pub fn is_fortran_contiguous(&self, stride: &[usize]) -> bool {
if self.0.len() != stride.len() {
return false;
}
let mut acc = 1;
for (&stride, &dim) in stride.iter().zip(self.0.iter()) {
if dim > 1 && stride != acc {
return false;
}
acc *= dim;
}
true
}
/// Extends the shape by appending the additional dimensions at the end of the existing
/// ones and returns the result.
pub fn extend(mut self, additional_dims: &[usize]) -> Self {
self.0.extend(additional_dims);
self
}
/// Checks whether the two shapes are compatible for broadcasting and, if so, returns the
/// broadcast shape. This is to be used for binary pointwise ops.
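///
/// A short example of the broadcasting rule (trailing dimensions are aligned and size-1
/// dimensions are stretched):
///
/// ```rust
/// use candle_core::Shape;
/// let lhs = Shape::from((3, 1, 5));
/// let rhs = Shape::from((4, 5));
/// let out = lhs.broadcast_shape_binary_op(&rhs, "add")?;
/// assert_eq!(out.dims(), &[3, 4, 5]);
/// # Ok::<(), candle_core::Error>(())
/// ```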
pub fn broadcast_shape_binary_op(&self, rhs: &Self, op: &'static str) -> Result<Shape> {
let lhs = self;
let lhs_dims = lhs.dims();
let rhs_dims = rhs.dims();
let lhs_ndims = lhs_dims.len();
let rhs_ndims = rhs_dims.len();
let bcast_ndims = usize::max(lhs_ndims, rhs_ndims);
let mut bcast_dims = vec![0; bcast_ndims];
for (idx, bcast_value) in bcast_dims.iter_mut().enumerate() {
let rev_idx = bcast_ndims - idx;
let l_value = if lhs_ndims < rev_idx {
1
} else {
lhs_dims[lhs_ndims - rev_idx]
};
let r_value = if rhs_ndims < rev_idx {
1
} else {
rhs_dims[rhs_ndims - rev_idx]
};
*bcast_value = if l_value == r_value {
l_value
} else if l_value == 1 {
r_value
} else if r_value == 1 {
l_value
} else {
Err(Error::ShapeMismatchBinaryOp {
lhs: lhs.clone(),
rhs: rhs.clone(),
op,
}
.bt())?
}
}
Ok(Shape::from(bcast_dims))
}
pub(crate) fn broadcast_shape_matmul(&self, rhs: &Self) -> Result<(Shape, Shape)> {
let lhs = self;
let lhs_dims = lhs.dims();
let rhs_dims = rhs.dims();
if lhs_dims.len() < 2 || rhs_dims.len() < 2 {
crate::bail!("only 2d matrixes are supported {lhs:?} {rhs:?}")
}
let (m, lhs_k) = (lhs_dims[lhs_dims.len() - 2], lhs_dims[lhs_dims.len() - 1]);
let (rhs_k, n) = (rhs_dims[rhs_dims.len() - 2], rhs_dims[rhs_dims.len() - 1]);
if lhs_k != rhs_k {
crate::bail!("different inner dimensions in broadcast matmul {lhs:?} {rhs:?}")
}
let lhs_b = Self::from(&lhs_dims[..lhs_dims.len() - 2]);
let rhs_b = Self::from(&rhs_dims[..rhs_dims.len() - 2]);
let bcast = lhs_b.broadcast_shape_binary_op(&rhs_b, "broadcast_matmul")?;
let bcast_dims = bcast.dims();
let bcast_lhs = [bcast_dims, &[m, lhs_k]].concat();
let bcast_rhs = [bcast_dims, &[rhs_k, n]].concat();
Ok((Shape::from(bcast_lhs), Shape::from(bcast_rhs)))
}
}
pub trait Dim {
fn to_index(&self, shape: &Shape, op: &'static str) -> Result<usize>;
fn to_index_plus_one(&self, shape: &Shape, op: &'static str) -> Result<usize>;
}
impl Dim for usize {
fn to_index(&self, shape: &Shape, op: &'static str) -> Result<usize> {
let dim = *self;
if dim >= shape.dims().len() {
Err(Error::DimOutOfRange {
shape: shape.clone(),
dim: dim as i32,
op,
}
.bt())?
} else {
Ok(dim)
}
}
fn to_index_plus_one(&self, shape: &Shape, op: &'static str) -> Result<usize> {
let dim = *self;
if dim > shape.dims().len() {
Err(Error::DimOutOfRange {
shape: shape.clone(),
dim: dim as i32,
op,
}
.bt())?
} else {
Ok(dim)
}
}
}
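/// Dimensions specified relative to the end of the shape: `Minus1` is the last dimension and
/// `Minus2` the one before it.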
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub enum D {
Minus1,
Minus2,
}
impl D {
fn out_of_range(&self, shape: &Shape, op: &'static str) -> Error {
let dim = match self {
Self::Minus1 => -1,
Self::Minus2 => -2,
};
Error::DimOutOfRange {
shape: shape.clone(),
dim,
op,
}
.bt()
}
}
impl Dim for D {
fn to_index(&self, shape: &Shape, op: &'static str) -> Result<usize> {
let rank = shape.rank();
match self {
Self::Minus1 if rank >= 1 => Ok(rank - 1),
Self::Minus2 if rank >= 2 => Ok(rank - 2),
_ => Err(self.out_of_range(shape, op)),
}
}
fn to_index_plus_one(&self, shape: &Shape, op: &'static str) -> Result<usize> {
let rank = shape.rank();
match self {
Self::Minus1 => Ok(rank),
Self::Minus2 if rank >= 1 => Ok(rank - 1),
_ => Err(self.out_of_range(shape, op)),
}
}
}
pub trait Dims: Sized {
fn to_indexes_internal(self, shape: &Shape, op: &'static str) -> Result<Vec<usize>>;
fn to_indexes(self, shape: &Shape, op: &'static str) -> Result<Vec<usize>> {
let dims = self.to_indexes_internal(shape, op)?;
for (i, &dim) in dims.iter().enumerate() {
if dims[..i].contains(&dim) {
Err(Error::DuplicateDimIndex {
shape: shape.clone(),
dims: dims.clone(),
op,
}
.bt())?
}
if dim >= shape.rank() {
Err(Error::DimOutOfRange {
shape: shape.clone(),
dim: dim as i32,
op,
}
.bt())?
}
}
Ok(dims)
}
}
impl Dims for Vec<usize> {
fn to_indexes_internal(self, _: &Shape, _: &'static str) -> Result<Vec<usize>> {
Ok(self)
}
}
impl<const N: usize> Dims for [usize; N] {
fn to_indexes_internal(self, _: &Shape, _: &'static str) -> Result<Vec<usize>> {
Ok(self.to_vec())
}
}
impl Dims for &[usize] {
fn to_indexes_internal(self, _: &Shape, _: &'static str) -> Result<Vec<usize>> {
Ok(self.to_vec())
}
}
impl Dims for () {
fn to_indexes_internal(self, _: &Shape, _: &'static str) -> Result<Vec<usize>> {
Ok(vec![])
}
}
impl<D: Dim + Sized> Dims for D {
fn to_indexes_internal(self, shape: &Shape, op: &'static str) -> Result<Vec<usize>> {
let dim = self.to_index(shape, op)?;
Ok(vec![dim])
}
}
impl<D: Dim> Dims for (D,) {
fn to_indexes_internal(self, shape: &Shape, op: &'static str) -> Result<Vec<usize>> {
let dim = self.0.to_index(shape, op)?;
Ok(vec![dim])
}
}
impl<D1: Dim, D2: Dim> Dims for (D1, D2) {
fn to_indexes_internal(self, shape: &Shape, op: &'static str) -> Result<Vec<usize>> {
let d0 = self.0.to_index(shape, op)?;
let d1 = self.1.to_index(shape, op)?;
Ok(vec![d0, d1])
}
}
impl<D1: Dim, D2: Dim, D3: Dim> Dims for (D1, D2, D3) {
fn to_indexes_internal(self, shape: &Shape, op: &'static str) -> Result<Vec<usize>> {
let d0 = self.0.to_index(shape, op)?;
let d1 = self.1.to_index(shape, op)?;
let d2 = self.2.to_index(shape, op)?;
Ok(vec![d0, d1, d2])
}
}
impl<D1: Dim, D2: Dim, D3: Dim, D4: Dim> Dims for (D1, D2, D3, D4) {
fn to_indexes_internal(self, shape: &Shape, op: &'static str) -> Result<Vec<usize>> {
let d0 = self.0.to_index(shape, op)?;
let d1 = self.1.to_index(shape, op)?;
let d2 = self.2.to_index(shape, op)?;
let d3 = self.3.to_index(shape, op)?;
Ok(vec![d0, d1, d2, d3])
}
}
impl<D1: Dim, D2: Dim, D3: Dim, D4: Dim, D5: Dim> Dims for (D1, D2, D3, D4, D5) {
fn to_indexes_internal(self, shape: &Shape, op: &'static str) -> Result<Vec<usize>> {
let d0 = self.0.to_index(shape, op)?;
let d1 = self.1.to_index(shape, op)?;
let d2 = self.2.to_index(shape, op)?;
let d3 = self.3.to_index(shape, op)?;
let d4 = self.4.to_index(shape, op)?;
Ok(vec![d0, d1, d2, d3, d4])
}
}
impl<D1: Dim, D2: Dim, D3: Dim, D4: Dim, D5: Dim, D6: Dim> Dims for (D1, D2, D3, D4, D5, D6) {
fn to_indexes_internal(self, shape: &Shape, op: &'static str) -> Result<Vec<usize>> {
let d0 = self.0.to_index(shape, op)?;
let d1 = self.1.to_index(shape, op)?;
let d2 = self.2.to_index(shape, op)?;
let d3 = self.3.to_index(shape, op)?;
let d4 = self.4.to_index(shape, op)?;
let d5 = self.5.to_index(shape, op)?;
Ok(vec![d0, d1, d2, d3, d4, d5])
}
}
extract_dims!(dims0, 0, |_: &[usize]| (), ());
extract_dims!(dims1, 1, |d: &[usize]| d[0], usize);
extract_dims!(dims2, 2, |d: &[usize]| (d[0], d[1]), (usize, usize));
extract_dims!(
dims3,
3,
|d: &[usize]| (d[0], d[1], d[2]),
(usize, usize, usize)
);
extract_dims!(
dims4,
4,
|d: &[usize]| (d[0], d[1], d[2], d[3]),
(usize, usize, usize, usize)
);
extract_dims!(
dims5,
5,
|d: &[usize]| (d[0], d[1], d[2], d[3], d[4]),
(usize, usize, usize, usize, usize)
);
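/// A shape specification that may contain a single `()` "hole" whose size is inferred from the
/// total element count, e.g. reshaping 24 elements with `(2, (), 4)` resolves the hole to 3.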
pub trait ShapeWithOneHole {
fn into_shape(self, el_count: usize) -> Result<Shape>;
}
impl<S: Into<Shape>> ShapeWithOneHole for S {
fn into_shape(self, _el_count: usize) -> Result<Shape> {
Ok(self.into())
}
}
impl ShapeWithOneHole for ((),) {
fn into_shape(self, el_count: usize) -> Result<Shape> {
Ok(el_count.into())
}
}
fn hole_size(el_count: usize, prod_d: usize, s: &dyn std::fmt::Debug) -> Result<usize> {
if prod_d == 0 {
crate::bail!("cannot reshape tensor of {el_count} elements to {s:?}")
}
if el_count % prod_d != 0 {
crate::bail!("cannot reshape tensor with {el_count} elements to {s:?}")
}
Ok(el_count / prod_d)
}
impl ShapeWithOneHole for ((), usize) {
fn into_shape(self, el_count: usize) -> Result<Shape> {
let ((), d1) = self;
Ok((hole_size(el_count, d1, &self)?, d1).into())
}
}
impl ShapeWithOneHole for (usize, ()) {
fn into_shape(self, el_count: usize) -> Result<Shape> {
let (d1, ()) = self;
Ok((d1, hole_size(el_count, d1, &self)?).into())
}
}
impl ShapeWithOneHole for ((), usize, usize) {
fn into_shape(self, el_count: usize) -> Result<Shape> {
let ((), d1, d2) = self;
Ok((hole_size(el_count, d1 * d2, &self)?, d1, d2).into())
}
}
impl ShapeWithOneHole for (usize, (), usize) {
fn into_shape(self, el_count: usize) -> Result<Shape> {
let (d1, (), d2) = self;
Ok((d1, hole_size(el_count, d1 * d2, &self)?, d2).into())
}
}
impl ShapeWithOneHole for (usize, usize, ()) {
fn into_shape(self, el_count: usize) -> Result<Shape> {
let (d1, d2, ()) = self;
Ok((d1, d2, hole_size(el_count, d1 * d2, &self)?).into())
}
}
impl ShapeWithOneHole for ((), usize, usize, usize) {
fn into_shape(self, el_count: usize) -> Result<Shape> {
let ((), d1, d2, d3) = self;
let d = hole_size(el_count, d1 * d2 * d3, &self)?;
Ok((d, d1, d2, d3).into())
}
}
impl ShapeWithOneHole for (usize, (), usize, usize) {
fn into_shape(self, el_count: usize) -> Result<Shape> {
let (d1, (), d2, d3) = self;
let d = hole_size(el_count, d1 * d2 * d3, &self)?;
Ok((d1, d, d2, d3).into())
}
}
impl ShapeWithOneHole for (usize, usize, (), usize) {
fn into_shape(self, el_count: usize) -> Result<Shape> {
let (d1, d2, (), d3) = self;
let d = hole_size(el_count, d1 * d2 * d3, &self)?;
Ok((d1, d2, d, d3).into())
}
}
impl ShapeWithOneHole for (usize, usize, usize, ()) {
fn into_shape(self, el_count: usize) -> Result<Shape> {
let (d1, d2, d3, ()) = self;
let d = hole_size(el_count, d1 * d2 * d3, &self)?;
Ok((d1, d2, d3, d).into())
}
}
impl ShapeWithOneHole for ((), usize, usize, usize, usize) {
fn into_shape(self, el_count: usize) -> Result<Shape> {
let ((), d1, d2, d3, d4) = self;
let d = hole_size(el_count, d1 * d2 * d3 * d4, &self)?;
Ok((d, d1, d2, d3, d4).into())
}
}
impl ShapeWithOneHole for (usize, (), usize, usize, usize) {
fn into_shape(self, el_count: usize) -> Result<Shape> {
let (d1, (), d2, d3, d4) = self;
let d = hole_size(el_count, d1 * d2 * d3 * d4, &self)?;
Ok((d1, d, d2, d3, d4).into())
}
}
impl ShapeWithOneHole for (usize, usize, (), usize, usize) {
fn into_shape(self, el_count: usize) -> Result<Shape> {
let (d1, d2, (), d3, d4) = self;
let d = hole_size(el_count, d1 * d2 * d3 * d4, &self)?;
Ok((d1, d2, d, d3, d4).into())
}
}
impl ShapeWithOneHole for (usize, usize, usize, (), usize) {
fn into_shape(self, el_count: usize) -> Result<Shape> {
let (d1, d2, d3, (), d4) = self;
let d = hole_size(el_count, d1 * d2 * d3 * d4, &self)?;
Ok((d1, d2, d3, d, d4).into())
}
}
impl ShapeWithOneHole for (usize, usize, usize, usize, ()) {
fn into_shape(self, el_count: usize) -> Result<Shape> {
let (d1, d2, d3, d4, ()) = self;
let d = hole_size(el_count, d1 * d2 * d3 * d4, &self)?;
Ok((d1, d2, d3, d4, d).into())
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn stride() {
let shape = Shape::from(());
assert_eq!(shape.stride_contiguous(), Vec::<usize>::new());
let shape = Shape::from(42);
assert_eq!(shape.stride_contiguous(), [1]);
let shape = Shape::from((42, 1337));
assert_eq!(shape.stride_contiguous(), [1337, 1]);
let shape = Shape::from((299, 792, 458));
assert_eq!(shape.stride_contiguous(), [458 * 792, 458, 1]);
}
}
use crate::backend::BackendStorage;
use crate::op::{self, CmpOp, ReduceOp};
use crate::{CpuStorage, CudaStorage, DType, Device, Error, Layout, MetalStorage, Result, Shape};
use crate::{CustomOp1, CustomOp2, CustomOp3, InplaceOp1, InplaceOp2, InplaceOp3};
// We do not want to implement Clone on Storage as cloning may fail because of
// out of memory. Instead try_clone should be used.
#[derive(Debug)]
pub enum Storage {
Cpu(CpuStorage),
Cuda(CudaStorage),
Metal(MetalStorage),
}
impl Storage {
pub fn try_clone(&self, layout: &Layout) -> Result<Self> {
match self {
Self::Cpu(storage) => Ok(Self::Cpu(storage.clone())),
Self::Cuda(storage) => {
let storage = storage.try_clone(layout)?;
Ok(Self::Cuda(storage))
}
Self::Metal(storage) => {
let storage = storage.try_clone(layout)?;
Ok(Self::Metal(storage))
}
}
}
pub fn device(&self) -> Device {
match self {
Self::Cpu(_) => Device::Cpu,
Self::Cuda(storage) => Device::Cuda(storage.device().clone()),
Self::Metal(storage) => Device::Metal(storage.device().clone()),
}
}
pub fn dtype(&self) -> DType {
match self {
Self::Cpu(storage) => storage.dtype(),
Self::Cuda(storage) => storage.dtype(),
Self::Metal(storage) => storage.dtype(),
}
}
pub(crate) fn same_device(&self, rhs: &Self, op: &'static str) -> Result<()> {
let lhs_device = self.device();
let rhs_device = rhs.device();
let lhs = lhs_device.location();
let rhs = rhs_device.location();
let same_device = if self.device().is_metal() {
// On Metal, we require the device to be exactly the same rather than just having the same
// location. On CUDA this is not necessary as all CudaDevice instances on the same GPU will
// use the same cuda stream.
lhs_device.same_device(&rhs_device)
} else {
lhs == rhs
};
if !same_device {
Err(Error::DeviceMismatchBinaryOp { lhs, rhs, op }.bt())
} else {
Ok(())
}
}
pub(crate) fn same_dtype(&self, rhs: &Self, op: &'static str) -> Result<()> {
let lhs = self.dtype();
let rhs = rhs.dtype();
if lhs != rhs {
Err(Error::DTypeMismatchBinaryOp { lhs, rhs, op }.bt())
} else {
Ok(())
}
}
pub(crate) fn affine(&self, layout: &Layout, mul: f64, add: f64) -> Result<Self> {
match self {
Storage::Cpu(storage) => {
let storage = storage.affine(layout, mul, add)?;
Ok(Self::Cpu(storage))
}
Self::Cuda(storage) => {
let storage = storage.affine(layout, mul, add)?;
Ok(Self::Cuda(storage))
}
Self::Metal(storage) => {
let storage = storage.affine(layout, mul, add)?;
Ok(Self::Metal(storage))
}
}
}
pub(crate) fn powf(&self, layout: &Layout, alpha: f64) -> Result<Self> {
match self {
Storage::Cpu(storage) => {
let storage = storage.powf(layout, alpha)?;
Ok(Self::Cpu(storage))
}
Self::Cuda(storage) => {
let storage = storage.powf(layout, alpha)?;
Ok(Self::Cuda(storage))
}
Self::Metal(storage) => {
let storage = storage.powf(layout, alpha)?;
Ok(Self::Metal(storage))
}
}
}
pub(crate) fn elu(&self, layout: &Layout, alpha: f64) -> Result<Self> {
match self {
Storage::Cpu(storage) => {
let storage = storage.elu(layout, alpha)?;
Ok(Self::Cpu(storage))
}
Self::Cuda(storage) => {
let storage = storage.elu(layout, alpha)?;
Ok(Self::Cuda(storage))
}
Self::Metal(storage) => {
let storage = storage.elu(layout, alpha)?;
Ok(Self::Metal(storage))
}
}
}
pub(crate) fn cmp(
&self,
op: CmpOp,
rhs: &Self,
lhs_layout: &Layout,
rhs_layout: &Layout,
) -> Result<Self> {
self.same_device(rhs, "cmp")?;
self.same_dtype(rhs, "cmp")?;
match (self, rhs) {
(Storage::Cpu(lhs), Storage::Cpu(rhs)) => {
let storage = lhs.cmp(op, rhs, lhs_layout, rhs_layout)?;
Ok(Self::Cpu(storage))
}
(Self::Cuda(lhs), Self::Cuda(rhs)) => {
let storage = lhs.cmp(op, rhs, lhs_layout, rhs_layout)?;
Ok(Self::Cuda(storage))
}
(Self::Metal(lhs), Self::Metal(rhs)) => {
let storage = lhs.cmp(op, rhs, lhs_layout, rhs_layout)?;
Ok(Self::Metal(storage))
}
(lhs, rhs) => {
// Should not happen because of the same device check above but we're defensive
// anyway.
Err(Error::DeviceMismatchBinaryOp {
lhs: lhs.device().location(),
rhs: rhs.device().location(),
op: "cmp",
}
.bt())
}
}
}
pub(crate) fn reduce_op(&self, op: ReduceOp, layout: &Layout, s: &[usize]) -> Result<Self> {
match self {
Storage::Cpu(storage) => {
let storage = storage.reduce_op(op, layout, s)?;
Ok(Self::Cpu(storage))
}
Self::Cuda(storage) => {
let storage = storage.reduce_op(op, layout, s)?;
Ok(Self::Cuda(storage))
}
Self::Metal(storage) => {
let storage = storage.reduce_op(op, layout, s)?;
Ok(Self::Metal(storage))
}
}
}
pub(crate) fn to_dtype(&self, layout: &Layout, dtype: DType) -> Result<Self> {
match self {
Storage::Cpu(storage) => {
let storage = storage.to_dtype(layout, dtype)?;
Ok(Self::Cpu(storage))
}
Self::Cuda(storage) => {
let storage = storage.to_dtype(layout, dtype)?;
Ok(Self::Cuda(storage))
}
Self::Metal(storage) => {
let storage = storage.to_dtype(layout, dtype)?;
Ok(Self::Metal(storage))
}
}
}
pub(crate) fn apply_op1(&self, l: &Layout, c: &dyn CustomOp1) -> Result<(Self, Shape)> {
match self {
Self::Cpu(storage) => {
let (storage, shape) = c.cpu_fwd(storage, l)?;
Ok((Self::Cpu(storage), shape))
}
Self::Cuda(storage) => {
let (storage, shape) = c.cuda_fwd(storage, l)?;
Ok((Self::Cuda(storage), shape))
}
Self::Metal(storage) => {
let (storage, shape) = c.metal_fwd(storage, l)?;
Ok((Self::Metal(storage), shape))
}
}
}
pub(crate) fn apply_op2(
&self,
l1: &Layout,
t2: &Self,
l2: &Layout,
c: &dyn CustomOp2,
) -> Result<(Self, Shape)> {
self.same_device(t2, c.name())?;
match (self, t2) {
(Self::Cpu(s1), Self::Cpu(s2)) => {
let (s, shape) = c.cpu_fwd(s1, l1, s2, l2)?;
Ok((Self::Cpu(s), shape))
}
(Self::Cuda(s1), Self::Cuda(s2)) => {
let (s, shape) = c.cuda_fwd(s1, l1, s2, l2)?;
Ok((Self::Cuda(s), shape))
}
(Self::Metal(s1), Self::Metal(s2)) => {
let (s, shape) = c.metal_fwd(s1, l1, s2, l2)?;
Ok((Self::Metal(s), shape))
}
_ => unreachable!(),
}
}
pub(crate) fn apply_op3(
&self,
l1: &Layout,
t2: &Self,
l2: &Layout,
t3: &Self,
l3: &Layout,
c: &dyn CustomOp3,
) -> Result<(Self, Shape)> {
//println!("candle-core/src/storage.rs:247 apply_op3 11");
self.same_device(t2, c.name())?;
self.same_device(t3, c.name())?;
match (self, t2, t3) {
(Self::Cpu(s1), Self::Cpu(s2), Self::Cpu(s3)) => {
//println!("candle-core/src/storage.rs apply_op3 22");
let (s, shape) = c.cpu_fwd(s1, l1, s2, l2, s3, l3)?;
Ok((Self::Cpu(s), shape))
}
(Self::Cuda(s1), Self::Cuda(s2), Self::Cuda(s3)) => {
//println!("candle-core/src/storage.rs apply_op3 33");
let (s, shape) = c.cuda_fwd(s1, l1, s2, l2, s3, l3)?;
Ok((Self::Cuda(s), shape))
}
(Self::Metal(s1), Self::Metal(s2), Self::Metal(s3)) => {
//println!("candle-core/src/storage.rs apply_op3 44");
let (s, shape) = c.metal_fwd(s1, l1, s2, l2, s3, l3)?;
Ok((Self::Metal(s), shape))
}
_ => unreachable!(),
}
}
pub(crate) fn inplace_op1(&mut self, l: &Layout, c: &dyn InplaceOp1) -> Result<()> {
match self {
Self::Cpu(storage) => c.cpu_fwd(storage, l),
Self::Cuda(storage) => c.cuda_fwd(storage, l),
Self::Metal(storage) => c.metal_fwd(storage, l),
}
}
pub(crate) fn inplace_op2(
&mut self,
l1: &Layout,
t2: &Self,
l2: &Layout,
c: &dyn InplaceOp2,
) -> Result<()> {
self.same_device(t2, c.name())?;
match (self, t2) {
(Self::Cpu(s1), Self::Cpu(s2)) => c.cpu_fwd(s1, l1, s2, l2),
(Self::Cuda(s1), Self::Cuda(s2)) => c.cuda_fwd(s1, l1, s2, l2),
(Self::Metal(s1), Self::Metal(s2)) => c.metal_fwd(s1, l1, s2, l2),
_ => unreachable!(),
}
}
pub(crate) fn inplace_op3(
&mut self,
l1: &Layout,
t2: &Self,
l2: &Layout,
t3: &Self,
l3: &Layout,
c: &dyn InplaceOp3,
) -> Result<()> {
self.same_device(t2, c.name())?;
self.same_device(t3, c.name())?;
match (self, t2, t3) {
(Self::Cpu(s1), Self::Cpu(s2), Self::Cpu(s3)) => c.cpu_fwd(s1, l1, s2, l2, s3, l3),
(Self::Cuda(s1), Self::Cuda(s2), Self::Cuda(s3)) => c.cuda_fwd(s1, l1, s2, l2, s3, l3),
(Self::Metal(s1), Self::Metal(s2), Self::Metal(s3)) => {
c.metal_fwd(s1, l1, s2, l2, s3, l3)
}
_ => unreachable!(),
}
}
pub(crate) fn unary_impl<B: op::UnaryOpT>(&self, layout: &Layout) -> Result<Self> {
match self {
Storage::Cpu(storage) => {
let storage = storage.unary_impl::<B>(layout)?;
Ok(Self::Cpu(storage))
}
Self::Cuda(storage) => {
let storage = storage.unary_impl::<B>(layout)?;
Ok(Self::Cuda(storage))
}
Self::Metal(storage) => {
let storage = storage.unary_impl::<B>(layout)?;
Ok(Self::Metal(storage))
}
}
}
pub(crate) fn binary_impl<B: op::BinaryOpT>(
&self,
rhs: &Self,
lhs_layout: &Layout,
rhs_layout: &Layout,
) -> Result<Self> {
self.same_device(rhs, B::NAME)?;
self.same_dtype(rhs, B::NAME)?;
match (self, rhs) {
(Storage::Cpu(lhs), Storage::Cpu(rhs)) => {
let storage = lhs.binary_impl::<B>(rhs, lhs_layout, rhs_layout)?;
Ok(Self::Cpu(storage))
}
(Self::Cuda(lhs), Self::Cuda(rhs)) => {
let storage = lhs.binary_impl::<B>(rhs, lhs_layout, rhs_layout)?;
Ok(Self::Cuda(storage))
}
(Self::Metal(lhs), Self::Metal(rhs)) => {
let storage = lhs.binary_impl::<B>(rhs, lhs_layout, rhs_layout)?;
Ok(Self::Metal(storage))
}
(lhs, rhs) => {
// Should not happen because of the same device check above but we're defensive
// anyway.
Err(Error::DeviceMismatchBinaryOp {
lhs: lhs.device().location(),
rhs: rhs.device().location(),
op: B::NAME,
}
.bt())
}
}
}
pub(crate) fn conv1d(
&self,
l: &Layout,
kernel: &Self,
kernel_l: &Layout,
params: &crate::conv::ParamsConv1D,
) -> Result<Self> {
self.same_device(kernel, "conv1d")?;
self.same_dtype(kernel, "conv1d")?;
match (self, &kernel) {
(Storage::Cpu(inp), Storage::Cpu(kernel)) => {
let s = inp.conv1d(l, kernel, kernel_l, params)?;
Ok(Self::Cpu(s))
}
(Storage::Cuda(inp), Storage::Cuda(kernel)) => {
let s = inp.conv1d(l, kernel, kernel_l, params)?;
Ok(Self::Cuda(s))
}
(Storage::Metal(inp), Storage::Metal(kernel)) => {
let s = inp.conv1d(l, kernel, kernel_l, params)?;
Ok(Self::Metal(s))
}
(lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
lhs: lhs.device().location(),
rhs: rhs.device().location(),
op: "conv1d",
}
.bt()),
}
}
pub(crate) fn conv_transpose1d(
&self,
l: &Layout,
kernel: &Self,
kernel_l: &Layout,
params: &crate::conv::ParamsConvTranspose1D,
) -> Result<Self> {
self.same_device(kernel, "conv-transpose1d")?;
self.same_dtype(kernel, "conv-transpose1d")?;
match (self, &kernel) {
(Storage::Cpu(inp), Storage::Cpu(kernel)) => {
let s = inp.conv_transpose1d(l, kernel, kernel_l, params)?;
Ok(Self::Cpu(s))
}
(Storage::Cuda(inp), Storage::Cuda(kernel)) => {
let s = inp.conv_transpose1d(l, kernel, kernel_l, params)?;
Ok(Self::Cuda(s))
}
(Storage::Metal(inp), Storage::Metal(kernel)) => {
let s = inp.conv_transpose1d(l, kernel, kernel_l, params)?;
Ok(Self::Metal(s))
}
(lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
lhs: lhs.device().location(),
rhs: rhs.device().location(),
op: "conv-transpose1d",
}
.bt()),
}
}
pub(crate) fn conv2d(
&self,
l: &Layout,
kernel: &Self,
kernel_l: &Layout,
params: &crate::conv::ParamsConv2D,
) -> Result<Self> {
self.same_device(kernel, "conv2d")?;
self.same_dtype(kernel, "conv2d")?;
match (self, &kernel) {
(Storage::Cpu(inp), Storage::Cpu(kernel)) => {
let s = inp.conv2d(l, kernel, kernel_l, params)?;
Ok(Self::Cpu(s))
}
(Storage::Cuda(inp), Storage::Cuda(kernel)) => {
let s = inp.conv2d(l, kernel, kernel_l, params)?;
Ok(Self::Cuda(s))
}
(Storage::Metal(inp), Storage::Metal(kernel)) => {
let s = inp.conv2d(l, kernel, kernel_l, params)?;
Ok(Self::Metal(s))
}
(lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
lhs: lhs.device().location(),
rhs: rhs.device().location(),
op: "conv2d",
}
.bt()),
}
}
pub(crate) fn conv_transpose2d(
&self,
l: &Layout,
kernel: &Self,
kernel_l: &Layout,
params: &crate::conv::ParamsConvTranspose2D,
) -> Result<Self> {
self.same_device(kernel, "conv_transpose2d")?;
self.same_dtype(kernel, "conv_transpose2d")?;
match (self, &kernel) {
(Storage::Cpu(inp), Storage::Cpu(kernel)) => {
let s = inp.conv_transpose2d(l, kernel, kernel_l, params)?;
Ok(Self::Cpu(s))
}
(Storage::Cuda(inp), Storage::Cuda(kernel)) => {
let s = inp.conv_transpose2d(l, kernel, kernel_l, params)?;
Ok(Self::Cuda(s))
}
(Storage::Metal(inp), Storage::Metal(kernel)) => {
let s = inp.conv_transpose2d(l, kernel, kernel_l, params)?;
Ok(Self::Metal(s))
}
(lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
lhs: lhs.device().location(),
rhs: rhs.device().location(),
op: "conv_transpose2d",
}
.bt()),
}
}
pub(crate) fn avg_pool2d(
&self,
layout: &Layout,
kernel_size: (usize, usize),
stride: (usize, usize),
) -> Result<Self> {
match self {
Storage::Cpu(storage) => {
let storage = storage.avg_pool2d(layout, kernel_size, stride)?;
Ok(Self::Cpu(storage))
}
Self::Cuda(storage) => {
let storage = storage.avg_pool2d(layout, kernel_size, stride)?;
Ok(Self::Cuda(storage))
}
Self::Metal(storage) => {
let storage = storage.avg_pool2d(layout, kernel_size, stride)?;
Ok(Self::Metal(storage))
}
}
}
pub(crate) fn max_pool2d(
&self,
layout: &Layout,
kernel_size: (usize, usize),
stride: (usize, usize),
) -> Result<Self> {
match self {
Storage::Cpu(storage) => {
let storage = storage.max_pool2d(layout, kernel_size, stride)?;
Ok(Self::Cpu(storage))
}
Self::Cuda(storage) => {
let storage = storage.max_pool2d(layout, kernel_size, stride)?;
Ok(Self::Cuda(storage))
}
Self::Metal(storage) => {
let storage = storage.max_pool2d(layout, kernel_size, stride)?;
Ok(Self::Metal(storage))
}
}
}
pub(crate) fn upsample_nearest1d(&self, layout: &Layout, sz: usize) -> Result<Self> {
match self {
Storage::Cpu(storage) => {
let storage = storage.upsample_nearest1d(layout, sz)?;
Ok(Self::Cpu(storage))
}
Self::Cuda(storage) => {
let storage = storage.upsample_nearest1d(layout, sz)?;
Ok(Self::Cuda(storage))
}
Self::Metal(storage) => {
let storage = storage.upsample_nearest1d(layout, sz)?;
Ok(Self::Metal(storage))
}
}
}
pub(crate) fn upsample_nearest2d(&self, layout: &Layout, h: usize, w: usize) -> Result<Self> {
match self {
Storage::Cpu(storage) => {
let storage = storage.upsample_nearest2d(layout, h, w)?;
Ok(Self::Cpu(storage))
}
Self::Cuda(storage) => {
let storage = storage.upsample_nearest2d(layout, h, w)?;
Ok(Self::Cuda(storage))
}
Self::Metal(storage) => {
let storage = storage.upsample_nearest2d(layout, h, w)?;
Ok(Self::Metal(storage))
}
}
}
pub(crate) fn where_cond(
&self,
layout: &Layout,
t: &Self,
layout_t: &Layout,
f: &Self,
layout_f: &Layout,
) -> Result<Self> {
self.same_device(t, "where")?;
self.same_device(f, "where")?;
t.same_dtype(f, "where")?;
match (self, t, f) {
(Storage::Cpu(cond), Storage::Cpu(t), Storage::Cpu(f)) => {
let storage = cond.where_cond(layout, t, layout_t, f, layout_f)?;
Ok(Self::Cpu(storage))
}
(Self::Cuda(cond), Self::Cuda(t), Self::Cuda(f)) => {
let storage = cond.where_cond(layout, t, layout_t, f, layout_f)?;
Ok(Self::Cuda(storage))
}
(Self::Metal(cond), Self::Metal(t), Self::Metal(f)) => {
let storage = cond.where_cond(layout, t, layout_t, f, layout_f)?;
Ok(Self::Metal(storage))
}
(_, lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
lhs: lhs.device().location(),
rhs: rhs.device().location(),
op: "where",
}
.bt()),
}
}
pub(crate) fn gather(
&self,
l: &Layout,
indexes: &Self,
indexes_l: &Layout,
d: usize,
) -> Result<Self> {
self.same_device(indexes, "index-add")?;
match (self, indexes) {
(Self::Cpu(s), Self::Cpu(indexes)) => {
let storage = s.gather(l, indexes, indexes_l, d)?;
Ok(Self::Cpu(storage))
}
(Self::Cuda(s), Self::Cuda(indexes)) => {
let storage = s.gather(l, indexes, indexes_l, d)?;
Ok(Self::Cuda(storage))
}
(Self::Metal(s), Self::Metal(indexes)) => {
let storage = s.gather(l, indexes, indexes_l, d)?;
Ok(Self::Metal(storage))
}
_ => unreachable!(),
}
}
pub(crate) fn scatter_add(
&self,
l: &Layout,
indexes: &Self,
indexes_l: &Layout,
source: &Self,
source_l: &Layout,
d: usize,
) -> Result<Self> {
self.same_device(indexes, "scatter-add")?;
self.same_device(source, "scatter-add")?;
match (self, indexes, source) {
(Self::Cpu(s), Self::Cpu(indexes), Self::Cpu(source)) => {
let storage = s.scatter_add(l, indexes, indexes_l, source, source_l, d)?;
Ok(Self::Cpu(storage))
}
(Self::Cuda(s), Self::Cuda(indexes), Self::Cuda(source)) => {
let storage = s.scatter_add(l, indexes, indexes_l, source, source_l, d)?;
Ok(Self::Cuda(storage))
}
(Self::Metal(s), Self::Metal(indexes), Self::Metal(source)) => {
let storage = s.scatter_add(l, indexes, indexes_l, source, source_l, d)?;
Ok(Self::Metal(storage))
}
_ => unreachable!(),
}
}
pub(crate) fn index_add(
&self,
l: &Layout,
indexes: &Self,
indexes_l: &Layout,
source: &Self,
source_l: &Layout,
d: usize,
) -> Result<Self> {
self.same_device(indexes, "index-add")?;
self.same_device(source, "index-add")?;
match (self, indexes, source) {
(Self::Cpu(s), Self::Cpu(indexes), Self::Cpu(source)) => {
let storage = s.index_add(l, indexes, indexes_l, source, source_l, d)?;
Ok(Self::Cpu(storage))
}
(Self::Cuda(s), Self::Cuda(indexes), Self::Cuda(source)) => {
let storage = s.index_add(l, indexes, indexes_l, source, source_l, d)?;
Ok(Self::Cuda(storage))
}
(Self::Metal(s), Self::Metal(indexes), Self::Metal(source)) => {
let storage = s.index_add(l, indexes, indexes_l, source, source_l, d)?;
Ok(Self::Metal(storage))
}
_ => unreachable!(),
}
}
pub(crate) fn index_select(
&self,
rhs: &Self,
lhs_l: &Layout,
rhs_l: &Layout,
d: usize,
) -> Result<Self> {
self.same_device(rhs, "index-select")?;
match (self, rhs) {
(Self::Cpu(lhs), Self::Cpu(rhs)) => {
let storage = lhs.index_select(rhs, lhs_l, rhs_l, d)?;
Ok(Self::Cpu(storage))
}
(Self::Cuda(lhs), Self::Cuda(rhs)) => {
let storage = lhs.index_select(rhs, lhs_l, rhs_l, d)?;
Ok(Self::Cuda(storage))
}
(Self::Metal(lhs), Self::Metal(rhs)) => {
let storage = lhs.index_select(rhs, lhs_l, rhs_l, d)?;
Ok(Self::Metal(storage))
}
(lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
lhs: lhs.device().location(),
rhs: rhs.device().location(),
op: "index-select",
}
.bt()),
}
}
pub(crate) fn matmul(
&self,
rhs: &Self,
bmnk: (usize, usize, usize, usize),
lhs_layout: &Layout,
rhs_layout: &Layout,
) -> Result<Self> {
self.same_device(rhs, "matmul")?;
self.same_dtype(rhs, "matmul")?;
match (self, rhs) {
(Self::Cpu(lhs), Self::Cpu(rhs)) => {
let storage = lhs.matmul(rhs, bmnk, lhs_layout, rhs_layout)?;
Ok(Self::Cpu(storage))
}
(Self::Cuda(lhs), Self::Cuda(rhs)) => {
let storage = lhs.matmul(rhs, bmnk, lhs_layout, rhs_layout)?;
Ok(Self::Cuda(storage))
}
(Self::Metal(lhs), Self::Metal(rhs)) => {
let storage = lhs.matmul(rhs, bmnk, lhs_layout, rhs_layout)?;
Ok(Self::Metal(storage))
}
(lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
lhs: lhs.device().location(),
rhs: rhs.device().location(),
op: "matmul",
}
.bt()),
}
}
// `self` is the source and can be strided whereas `dst` is contiguous.
pub(crate) fn copy_strided_src(
&self,
dst: &mut Self,
dst_offset: usize,
src_l: &Layout,
) -> Result<()> {
match (self, dst) {
(Self::Cpu(src), Self::Cpu(dst)) => src.copy_strided_src(dst, dst_offset, src_l),
(Self::Cuda(src), Self::Cuda(dst)) => Ok(src.copy_strided_src(dst, dst_offset, src_l)?),
(Self::Metal(src), Self::Metal(dst)) => {
Ok(src.copy_strided_src(dst, dst_offset, src_l)?)
}
(lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
lhs: lhs.device().location(),
rhs: rhs.device().location(),
op: "copy",
}
.bt()),
}
}
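/// Copies a two-dimensional block of `d1` rows of `d2` elements each from `self` to `dst`,
/// using `src_s`/`dst_s` as the per-row strides and `src_o`/`dst_o` as the start offsets in the
/// source and destination storages.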
#[allow(clippy::too_many_arguments)]
pub(crate) fn copy2d(
&self,
dst: &mut Self,
d1: usize,
d2: usize,
src_s: usize,
dst_s: usize,
src_o: usize,
dst_o: usize,
) -> Result<()> {
match (self, dst) {
(Self::Cpu(src), Self::Cpu(dst)) => src.copy2d(dst, d1, d2, src_s, dst_s, src_o, dst_o),
(Self::Cuda(src), Self::Cuda(dst)) => {
Ok(src.copy2d(dst, d1, d2, src_s, dst_s, src_o, dst_o)?)
}
(Self::Metal(src), Self::Metal(dst)) => {
Ok(src.copy2d(dst, d1, d2, src_s, dst_s, src_o, dst_o)?)
}
(lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
lhs: lhs.device().location(),
rhs: rhs.device().location(),
op: "copy2d",
}
.bt()),
}
}
}
use crate::Layout;
/// An iterator over the storage offsets of the items of an N-dimensional array stored in a
/// flat buffer using potentially non-contiguous strides.
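///
/// For example, with `dims = [2, 3]` and `stride = [3, 1]` the iterator yields the offsets
/// `0, 1, 2, 3, 4, 5`, whereas the transposed layout `dims = [3, 2]`, `stride = [1, 3]` yields
/// `0, 3, 1, 4, 2, 5`.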
#[derive(Debug)]
pub struct StridedIndex<'a> {
next_storage_index: Option<usize>,
multi_index: Vec<usize>,
dims: &'a [usize],
stride: &'a [usize],
}
impl<'a> StridedIndex<'a> {
pub(crate) fn new(dims: &'a [usize], stride: &'a [usize], start_offset: usize) -> Self {
let elem_count: usize = dims.iter().product();
let next_storage_index = if elem_count == 0 {
None
} else {
// This also covers the scalar case: an empty `dims` has an element count of 1.
Some(start_offset)
};
StridedIndex {
next_storage_index,
multi_index: vec![0; dims.len()],
dims,
stride,
}
}
pub(crate) fn from_layout(l: &'a Layout) -> Self {
Self::new(l.dims(), l.stride(), l.start_offset())
}
}
impl<'a> Iterator for StridedIndex<'a> {
type Item = usize;
fn next(&mut self) -> Option<Self::Item> {
let storage_index = match self.next_storage_index {
None => return None,
Some(storage_index) => storage_index,
};
let mut updated = false;
let mut next_storage_index = storage_index;
for ((multi_i, max_i), stride_i) in self
.multi_index
.iter_mut()
.zip(self.dims.iter())
.zip(self.stride.iter())
.rev()
{
let next_i = *multi_i + 1;
if next_i < *max_i {
*multi_i = next_i;
updated = true;
next_storage_index += stride_i;
break;
} else {
next_storage_index -= *multi_i * stride_i;
*multi_i = 0
}
}
self.next_storage_index = if updated {
Some(next_storage_index)
} else {
None
};
Some(storage_index)
}
}
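/// How the data described by a layout can be traversed: either as a single contiguous block, or
/// as a sequence of blocks of `block_len` contiguous elements whose start offsets are produced by
/// `block_start_index`.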
#[derive(Debug)]
pub enum StridedBlocks<'a> {
SingleBlock {
start_offset: usize,
len: usize,
},
MultipleBlocks {
block_start_index: StridedIndex<'a>,
block_len: usize,
},
}
//! Tensors are N-dimensional matrices of elements using a single data type.
#![allow(clippy::redundant_closure_call)]
use crate::backend::{BackendDevice, BackendStorage};
use crate::op::{BackpropOp, BinaryOp, CmpOp, Op, ReduceOp, UnaryOp};
use crate::scalar::TensorOrScalar;
use crate::shape::{Dim, Dims};
use crate::{bail, storage::Storage, DType, Device, Error, Layout, Result, Shape};
use std::sync::{Arc, RwLock};
/// Unique identifier for tensors.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub struct TensorId(usize);
impl TensorId {
fn new() -> Self {
// https://users.rust-lang.org/t/idiomatic-rust-way-to-generate-unique-id/33805
use std::sync::atomic;
static COUNTER: atomic::AtomicUsize = atomic::AtomicUsize::new(1);
Self(COUNTER.fetch_add(1, atomic::Ordering::Relaxed))
}
}
pub struct Tensor_ {
id: TensorId,
// As we provide inner mutability on the tensor content, the alternatives are:
// - Using a mutex, this would have the highest cost when retrieving the storage but would
// prevent errors when concurrent access takes place. Mutex would also be subject to
// deadlocks, for example with the current code if the same tensor is used twice by a single
// binary op.
// - Using a RefCell would have some intermediate cost: borrow checking would be verified
// dynamically, but the resulting tensors would not be Send or Sync.
// - Using an UnsafeCell would have the lowest cost but undefined behavior on concurrent
// accesses.
// Ideally, we would use Arc<Storage> for tensors on which we don't plan on modifying the data
// and Arc<Mutex<Storage>> for tensors where the data could be modified, e.g. variables, but
// that's tricky to encode in the current setup.
storage: Arc<RwLock<Storage>>,
layout: Layout,
op: BackpropOp,
is_variable: bool,
dtype: DType,
device: Device,
}
impl AsRef<Tensor> for Tensor {
fn as_ref(&self) -> &Tensor {
self
}
}
// Tensors are refcounted so that cloning is cheap when building the op graph.
// Storages are also refcounted independently so that it's possible to avoid
// copying the storage for operations that only modify the shape or stride.
#[derive(Clone)]
/// The core struct for manipulating tensors.
///
/// ```rust
/// use candle_core::{Tensor, DType, Device};
///
/// let a = Tensor::arange(0f32, 6f32, &Device::Cpu)?.reshape((2, 3))?;
/// let b = Tensor::arange(0f32, 12f32, &Device::Cpu)?.reshape((3, 4))?;
///
/// let c = a.matmul(&b)?;
/// # Ok::<(), candle_core::Error>(())
/// ```
///
/// Tensors are reference counted with [`Arc`] so cloning them is cheap.
pub struct Tensor(Arc<Tensor_>);
impl std::ops::Deref for Tensor {
type Target = Tensor_;
fn deref(&self) -> &Self::Target {
self.0.as_ref()
}
}
macro_rules! unary_op {
($fn_name:ident, $op_name:ident) => {
pub fn $fn_name(&self) -> Result<Self> {
let shape = self.shape();
let storage = self
.storage()
.unary_impl::<crate::op::$op_name>(self.layout())?;
let op = BackpropOp::new1(self, |s| Op::Unary(s, UnaryOp::$op_name));
Ok(from_storage(storage, shape.clone(), op, false))
}
};
}
macro_rules! binary_op {
($fn_name:ident, $op_name:ident) => {
pub fn $fn_name(&self, rhs: &Self) -> Result<Self> {
let shape = self.same_shape_binary_op(rhs, stringify!($fn_name))?;
let storage = self.storage().binary_impl::<crate::op::$op_name>(
&*rhs.storage(),
self.layout(),
rhs.layout(),
)?;
let op = BackpropOp::new2(self, rhs, |t1, t2| Op::Binary(t1, t2, BinaryOp::$op_name));
Ok(from_storage(storage, shape.clone(), op, false))
}
};
}
macro_rules! binary_op_scalar {
($fn_name:ident, $op_name:ident) => {
pub fn $fn_name<T: TensorOrScalar>(&self, rhs: T) -> Result<Self> {
let rhs = match rhs.to_tensor_scalar()? {
crate::scalar::TensorScalar::Tensor(rhs) => rhs,
crate::scalar::TensorScalar::Scalar(rhs) => rhs
.to_dtype(self.dtype())?
.to_device(self.device())?
.broadcast_as(self.shape())?,
};
let shape = self.same_shape_binary_op(&rhs, stringify!($fn_name))?;
let storage = self.storage().binary_impl::<crate::op::$op_name>(
&*rhs.storage(),
self.layout(),
rhs.layout(),
)?;
let op = BackpropOp::new2(self, &rhs, |t1, t2| Op::Binary(t1, t2, BinaryOp::$op_name));
Ok(from_storage(storage, shape.clone(), op, false))
}
};
}
macro_rules! broadcast_binary_op {
($fn_name:ident, $inner_fn_name:ident) => {
pub fn $fn_name(&self, rhs: &Self) -> Result<Self> {
let lhs = self;
let shape = lhs
.shape()
.broadcast_shape_binary_op(rhs.shape(), stringify!($fn_name))?;
let l_broadcast = shape != *lhs.shape();
let r_broadcast = shape != *rhs.shape();
match (l_broadcast, r_broadcast) {
(true, true) => lhs
.broadcast_as(&shape)?
.$inner_fn_name(&rhs.broadcast_as(&shape)?),
(false, true) => lhs.$inner_fn_name(&rhs.broadcast_as(&shape)?),
(true, false) => lhs.broadcast_as(&shape)?.$inner_fn_name(rhs),
(false, false) => lhs.$inner_fn_name(rhs),
}
}
};
}
/// Creates a fresh tensor structure based on a storage and a shape, this uses contiguous strides.
pub(crate) fn from_storage<S: Into<Shape>>(
storage: Storage,
shape: S,
op: BackpropOp,
is_variable: bool,
) -> Tensor {
let dtype = storage.dtype();
let device = storage.device();
let tensor_ = Tensor_ {
id: TensorId::new(),
storage: Arc::new(RwLock::new(storage)),
layout: Layout::contiguous(shape),
op,
is_variable,
dtype,
device,
};
Tensor(Arc::new(tensor_))
}
impl Tensor {
pub(crate) fn ones_impl<S: Into<Shape>>(
shape: S,
dtype: DType,
device: &Device,
is_variable: bool,
) -> Result<Self> {
let none = BackpropOp::none();
let shape = shape.into();
let storage = device.ones(&shape, dtype)?;
Ok(from_storage(storage, shape, none, is_variable))
}
/// Creates a new tensor filled with ones.
///
/// ```rust
/// use candle_core::{Tensor, DType, Device};
/// let a = Tensor::ones((2, 3), DType::F32, &Device::Cpu)?;
/// let b = Tensor::from_slice(&[1.0f32, 1.0, 1.0, 1.0, 1.0, 1.0], (2, 3), &Device::Cpu)?;
/// // a == b
/// # Ok::<(), candle_core::Error>(())
/// ```
pub fn ones<S: Into<Shape>>(shape: S, dtype: DType, device: &Device) -> Result<Self> {
Self::ones_impl(shape, dtype, device, false)
}
    /// Creates a new tensor filled with ones with the same shape, dtype, and device as the input
    /// tensor.
/// ```rust
/// use candle_core::{Tensor, DType, Device};
/// let a = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
/// let b = a.ones_like()?;
/// // b == a + 1
/// # Ok::<(), candle_core::Error>(())
/// ```
pub fn ones_like(&self) -> Result<Self> {
Tensor::ones(self.shape(), self.dtype(), self.device())
}
// Do not expose outside of the crate, the `is_variable=true` case should only be accessed from
// the variable module.
pub(crate) fn zeros_impl<S: Into<Shape>>(
shape: S,
dtype: DType,
device: &Device,
is_variable: bool,
) -> Result<Self> {
let none = BackpropOp::none();
let shape = shape.into();
let storage = device.zeros(&shape, dtype)?;
Ok(from_storage(storage, shape, none, is_variable))
}
/// Creates a new tensor filled with zeros.
///
/// ```rust
/// use candle_core::{Tensor, DType, Device};
/// let a = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
/// let b = Tensor::from_slice(&[0.0f32, 0.0, 0.0, 0.0, 0.0, 0.0], (2, 3), &Device::Cpu)?;
/// // a == b
/// # Ok::<(), candle_core::Error>(())
/// ```
pub fn zeros<S: Into<Shape>>(shape: S, dtype: DType, device: &Device) -> Result<Self> {
Self::zeros_impl(shape, dtype, device, false)
}
    /// Creates a new tensor filled with zeros with the same shape, dtype, and device as the input
    /// tensor.
///
/// ```rust
/// use candle_core::{Tensor, DType, Device};
/// let a = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
/// let b = a.zeros_like()?;
/// // b is on CPU f32.
/// # Ok::<(), candle_core::Error>(())
/// ```
pub fn zeros_like(&self) -> Result<Self> {
Tensor::zeros(self.shape(), self.dtype(), self.device())
}
pub(crate) fn rand_impl<S: Into<Shape>, T: crate::FloatDType>(
lo: T,
up: T,
s: S,
device: &Device,
is_variable: bool,
) -> Result<Self> {
let s = s.into();
let storage = device.rand_uniform(lo, up, &s)?;
let none = BackpropOp::none();
Ok(from_storage(storage, s, none, is_variable))
}
pub(crate) fn rand_f64_impl<S: Into<Shape>>(
lo: f64,
up: f64,
s: S,
dtype: DType,
device: &Device,
is_variable: bool,
) -> Result<Self> {
let s = s.into();
let storage = device.rand_uniform_f64(lo, up, &s, dtype)?;
let none = BackpropOp::none();
Ok(from_storage(storage, s, none, is_variable))
}
/// Creates a new tensor initialized with values sampled uniformly between `lo` and `up`.
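    ///
    /// A small sketch of the expected usage (values are random, so only the shape is checked):
    ///
    /// ```rust
    /// use candle_core::{Tensor, Device};
    /// let t = Tensor::rand(0f32, 1f32, (2, 3), &Device::Cpu)?;
    /// assert_eq!(t.dims(), &[2, 3]);
    /// # Ok::<(), candle_core::Error>(())
    /// ```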
pub fn rand<S: Into<Shape>, T: crate::FloatDType>(
lo: T,
up: T,
s: S,
device: &Device,
) -> Result<Self> {
Self::rand_impl(lo, up, s, device, false)
}
pub fn rand_like(&self, lo: f64, up: f64) -> Result<Self> {
Tensor::rand_f64_impl(lo, up, self.shape(), self.dtype(), self.device(), false)
}
pub(crate) fn randn_impl<S: Into<Shape>, T: crate::FloatDType>(
mean: T,
std: T,
s: S,
device: &Device,
is_variable: bool,
) -> Result<Self> {
let s = s.into();
let storage = device.rand_normal(mean, std, &s)?;
let none = BackpropOp::none();
Ok(from_storage(storage, s, none, is_variable))
}
pub(crate) fn randn_f64_impl<S: Into<Shape>>(
mean: f64,
std: f64,
s: S,
dtype: DType,
device: &Device,
is_variable: bool,
) -> Result<Self> {
let s = s.into();
let storage = device.rand_normal_f64(mean, std, &s, dtype)?;
let none = BackpropOp::none();
Ok(from_storage(storage, s, none, is_variable))
}
pub fn randn_like(&self, mean: f64, stdev: f64) -> Result<Self> {
Tensor::randn_f64_impl(
mean,
stdev,
self.shape(),
self.dtype(),
self.device(),
false,
)
}
/// Creates a new tensor initialized with values sampled from a normal distribution with the
/// specified `mean` and standard deviation `std`.
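    ///
    /// A quick sketch (values are random, so only the shape is checked):
    ///
    /// ```rust
    /// use candle_core::{Tensor, Device};
    /// let t = Tensor::randn(0f32, 1f32, (2, 3), &Device::Cpu)?;
    /// assert_eq!(t.dims(), &[2, 3]);
    /// # Ok::<(), candle_core::Error>(())
    /// ```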
pub fn randn<S: Into<Shape>, T: crate::FloatDType>(
mean: T,
std: T,
s: S,
device: &Device,
) -> Result<Self> {
Self::randn_impl(mean, std, s, device, false)
}
pub(crate) fn new_impl<A: crate::device::NdArray>(
array: A,
shape: Shape,
device: &Device,
is_variable: bool,
) -> Result<Self> {
let n: usize = shape.elem_count();
let buffer_size: usize = array.shape()?.elem_count();
if buffer_size != n {
return Err(Error::ShapeMismatch { buffer_size, shape }.bt());
}
let storage = device.storage(array)?;
let none = BackpropOp::none();
Ok(from_storage(storage, shape, none, is_variable))
}
/// Creates a new tensor on the specified device using the content and shape of the input.
pub fn new<A: crate::device::NdArray>(array: A, device: &Device) -> Result<Self> {
let shape = array.shape()?;
Self::new_impl(array, shape, device, false)
}
/// Returns a new tensor with all the elements having the same specified value. Note that
/// the tensor is not contiguous so you would have to call `.contiguous()` on it if needed.
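    ///
    /// A short sketch of the expected result:
    ///
    /// ```rust
    /// use candle_core::{Tensor, Device};
    /// let t = Tensor::full(3.5f32, (2, 2), &Device::Cpu)?;
    /// assert_eq!(t.to_vec2::<f32>()?, &[[3.5, 3.5], [3.5, 3.5]]);
    /// # Ok::<(), candle_core::Error>(())
    /// ```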
pub fn full<D: crate::WithDType, S: Into<Shape>>(
value: D,
shape: S,
device: &Device,
) -> Result<Self> {
Self::from_vec_impl(vec![value], (), device, false)?.broadcast_as(shape)
}
/// Creates a new 1D tensor from an iterator.
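    ///
    /// For instance, a sketch of the expected result:
    ///
    /// ```rust
    /// use candle_core::{Tensor, Device};
    /// let t = Tensor::from_iter(0u32..4u32, &Device::Cpu)?;
    /// assert_eq!(t.to_vec1::<u32>()?, &[0, 1, 2, 3]);
    /// # Ok::<(), candle_core::Error>(())
    /// ```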
pub fn from_iter<D: crate::WithDType>(
iter: impl IntoIterator<Item = D>,
device: &Device,
) -> Result<Self> {
let data = iter.into_iter().collect::<Vec<_>>();
let len = data.len();
Self::from_vec_impl(data, len, device, false)
}
/// Creates a new 1D tensor with values from the interval `[start, end)` taken with a common
/// difference `1` from `start`.
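    ///
    /// A small sketch of the expected result:
    ///
    /// ```rust
    /// use candle_core::{Tensor, Device};
    /// let t = Tensor::arange(0f32, 5f32, &Device::Cpu)?;
    /// assert_eq!(t.to_vec1::<f32>()?, &[0., 1., 2., 3., 4.]);
    /// # Ok::<(), candle_core::Error>(())
    /// ```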
pub fn arange<D: crate::WithDType>(start: D, end: D, device: &Device) -> Result<Self> {
Self::arange_step(start, end, D::one(), device)
}
/// Creates a new 1D tensor with values from the interval `[start, end)` taken with a common
/// difference `step` from `start`.
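    ///
    /// For example, a sketch of the expected result:
    ///
    /// ```rust
    /// use candle_core::{Tensor, Device};
    /// let t = Tensor::arange_step(0f32, 10f32, 2.5f32, &Device::Cpu)?;
    /// assert_eq!(t.to_vec1::<f32>()?, &[0., 2.5, 5., 7.5]);
    /// # Ok::<(), candle_core::Error>(())
    /// ```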
pub fn arange_step<D: crate::WithDType>(
start: D,
end: D,
step: D,
device: &Device,
) -> Result<Self> {
if D::is_zero(&step) {
bail!("step cannot be zero")
}
let mut data = vec![];
let mut current = start;
if step >= D::zero() {
while current < end {
data.push(current);
current += step;
}
} else {
while current > end {
data.push(current);
current += step;
}
}
let len = data.len();
Self::from_vec_impl(data, len, device, false)
}
pub(crate) fn from_vec_impl<S: Into<Shape>, D: crate::WithDType>(
data: Vec<D>,
shape: S,
device: &Device,
is_variable: bool,
) -> Result<Self> {
let shape = shape.into();
let buffer_size = data.len();
if buffer_size != shape.elem_count() {
return Err(Error::ShapeMismatch { buffer_size, shape }.bt());
}
let storage = device.storage_owned(data)?;
let none = BackpropOp::none();
Ok(from_storage(storage, shape, none, is_variable))
}
/// Creates a new tensor initialized with values from the input vector. The number of elements
/// in this vector must be the same as the number of elements defined by the shape.
/// If the device is cpu, no data copy is made.
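    ///
    /// A short sketch of the expected result:
    ///
    /// ```rust
    /// use candle_core::{Tensor, Device};
    /// let t = Tensor::from_vec(vec![1f32, 2., 3., 4., 5., 6.], (2, 3), &Device::Cpu)?;
    /// assert_eq!(t.to_vec2::<f32>()?, &[[1., 2., 3.], [4., 5., 6.]]);
    /// # Ok::<(), candle_core::Error>(())
    /// ```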
pub fn from_vec<S: Into<Shape>, D: crate::WithDType>(
data: Vec<D>,
shape: S,
device: &Device,
) -> Result<Self> {
Self::from_vec_impl(data, shape, device, false)
}
    /// Creates a new tensor initialized with values from the input slice. The number of elements
    /// in this slice must be the same as the number of elements defined by the shape.
pub fn from_slice<S: Into<Shape>, D: crate::WithDType>(
array: &[D],
shape: S,
device: &Device,
) -> Result<Self> {
Self::new_impl(array, shape.into(), device, false)
}
pub(crate) fn same_shape_binary_op(&self, rhs: &Self, op: &'static str) -> Result<&Shape> {
let lhs = self.shape();
let rhs = rhs.shape();
if lhs != rhs {
Err(Error::ShapeMismatchBinaryOp {
lhs: lhs.clone(),
rhs: rhs.clone(),
op,
}
.bt())
} else {
Ok(lhs)
}
}
    /// Returns true if the computation graph should track this op, that is, if it is
    /// a variable or if it has some variable among its dependencies.
pub fn track_op(&self) -> bool {
self.is_variable || self.op.is_some()
}
// TODO: Also make an inplace version or a pre-allocated? This could be tricky
// if this can create cycles in the compute graph.
binary_op!(add, Add);
binary_op!(mul, Mul);
binary_op!(sub, Sub);
binary_op!(div, Div);
binary_op_scalar!(maximum, Maximum);
binary_op_scalar!(minimum, Minimum);
broadcast_binary_op!(broadcast_add, add);
broadcast_binary_op!(broadcast_mul, mul);
broadcast_binary_op!(broadcast_sub, sub);
broadcast_binary_op!(broadcast_div, div);
broadcast_binary_op!(broadcast_maximum, maximum);
broadcast_binary_op!(broadcast_minimum, minimum);
broadcast_binary_op!(broadcast_eq, eq);
broadcast_binary_op!(broadcast_ne, ne);
broadcast_binary_op!(broadcast_lt, lt);
broadcast_binary_op!(broadcast_le, le);
broadcast_binary_op!(broadcast_gt, gt);
broadcast_binary_op!(broadcast_ge, ge);
unary_op!(recip, Recip);
unary_op!(neg, Neg);
unary_op!(exp, Exp);
unary_op!(log, Log);
unary_op!(sin, Sin);
unary_op!(cos, Cos);
unary_op!(tanh, Tanh);
unary_op!(abs, Abs);
unary_op!(sqr, Sqr);
unary_op!(sqrt, Sqrt);
unary_op!(gelu, Gelu);
unary_op!(gelu_erf, GeluErf);
unary_op!(erf, Erf);
unary_op!(relu, Relu);
unary_op!(silu, Silu);
unary_op!(ceil, Ceil);
unary_op!(floor, Floor);
unary_op!(round, Round);
unary_op!(sign, Sign);
    /// Rounds elements of the input tensor to the nearest integer.
///
/// If the number of decimals is negative, it specifies the number of positions to the left of
/// the decimal point.
pub fn round_to(&self, decimals: i32) -> Result<Self> {
let mult = 10f64.powi(decimals);
(self * mult)?.round()? * (1f64 / mult)
}
    /// Retrieves the single scalar value held in the tensor. If the tensor has a non-zero rank, an
    /// error is returned instead.
pub fn to_scalar<S: crate::WithDType>(&self) -> Result<S> {
if self.rank() != 0 {
Err(Error::UnexpectedNumberOfDims {
expected: 0,
got: self.rank(),
shape: self.shape().clone(),
}
.bt())?
}
let from_cpu_storage = |cpu_storage: &crate::CpuStorage| {
let data = S::cpu_storage_as_slice(cpu_storage)?;
Ok::<_, Error>(data[self.layout().start_offset()])
};
match &*self.storage() {
Storage::Cpu(cpu_storage) => from_cpu_storage(cpu_storage),
Storage::Cuda(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
Storage::Metal(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
}
}
/// An alias for `to_scalar`.
pub fn to_vec0<S: crate::WithDType>(&self) -> Result<S> {
self.to_scalar::<S>()
}
/// Repeat this tensor along the specified dimensions.
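    ///
    /// A small sketch of the expected behavior (each dimension is tiled by the matching repeat count):
    ///
    /// ```rust
    /// use candle_core::{Tensor, Device};
    /// let t = Tensor::new(&[[0f32, 1.]], &Device::Cpu)?;
    /// let t = t.repeat((2, 2))?;
    /// assert_eq!(t.to_vec2::<f32>()?, &[[0., 1., 0., 1.], [0., 1., 0., 1.]]);
    /// # Ok::<(), candle_core::Error>(())
    /// ```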
pub fn repeat<S: Into<Shape>>(&self, shape: S) -> Result<Tensor> {
// Similar to PyTorch, we extend the number of dimensions of self if needed.
let repeats = shape.into();
let repeats = repeats.dims();
let mut inp = if self.rank() < repeats.len() {
let shape = [vec![1; repeats.len() - self.rank()], self.dims().to_vec()].concat();
self.reshape(shape)?
} else {
self.clone()
};
for (idx, &repeat) in repeats.iter().enumerate() {
if repeat > 1 {
inp = Tensor::cat(&vec![&inp; repeat], idx)?
}
}
Ok(inp)
}
/// Creates grids of coordinates specified by the 1D inputs.
///
/// # Arguments
///
/// * `args` - A slice of 1D tensors.
/// * `xy_indexing` - Whether to use xy indexing or ij indexing. If xy is selected, the
/// first dimension corresponds to the cardinality of the second input and the second
/// dimension corresponds to the cardinality of the first input. If ij is selected, the
/// dimensions are in the same order as the cardinality of the inputs.
///
/// # Examples
///
/// ```rust
/// use candle_core::{Tensor, Device, Shape};
/// let x = Tensor::new(&[1f32, 2., 3.], &Device::Cpu)?;
/// let y = Tensor::new(&[4f32, 5., 6.], &Device::Cpu)?;
///
/// let grids_xy = Tensor::meshgrid(&[&x, &y], true)?;
///
/// assert_eq!(grids_xy.len(), 2);
/// assert_eq!(grids_xy[0].dims(), &[3, 3]);
///
/// assert_eq!(grids_xy[0].to_vec2::<f32>()?, &[[1., 2., 3.], [1., 2., 3.], [1., 2., 3.]]);
/// assert_eq!(grids_xy[1].to_vec2::<f32>()?, &[[4., 4., 4.], [5., 5., 5.], [6., 6., 6.]]);
///
/// let grids_ij = Tensor::meshgrid(&[&x, &y], false)?;
///
/// assert_eq!(grids_ij[0].to_vec2::<f32>()?, &[[1., 1., 1.], [2., 2., 2.], [3., 3., 3.]]);
/// assert_eq!(grids_ij[1].to_vec2::<f32>()?, &[[4., 5., 6.], [4., 5., 6.], [4., 5., 6.]]);
/// # Ok::<(), candle_core::Error>(())
/// ```
///
/// # Errors
///
/// * Will return `Err` if `args` contains less than 2 tensors.
///
pub fn meshgrid<A: AsRef<Tensor>>(args: &[A], xy_indexing: bool) -> Result<Vec<Self>> {
if args.len() <= 1 {
Err(Error::OpRequiresAtLeastTwoTensors { op: "meshgrid" }.bt())?
}
let args: Vec<_> = if xy_indexing {
args.iter().rev().collect()
} else {
args.iter().collect()
};
let mut shape = Vec::with_capacity(args.len());
for arg in args.iter() {
shape.push(arg.as_ref().dims1()?)
}
let mut grids = Vec::with_capacity(args.len());
for idx in 0..args.len() {
let mut ones = vec![1usize; args.len()];
ones[idx] = shape[idx];
let arg = args[idx].as_ref().reshape(ones)?;
let mut repeats = shape.clone();
repeats[idx] = 1;
let repeated_tensor = arg.repeat(repeats)?;
grids.push(repeated_tensor);
}
if xy_indexing {
grids.reverse();
}
Ok(grids)
}
    /// This operation multiplies the input tensor by `mul` then adds `add` and returns the result.
    /// The input values `mul` and `add` are cast to the appropriate type so some rounding might
    /// be performed.
///
/// ```rust
/// use candle_core::{Tensor, Device};
/// let a = Tensor::new(&[[0f32, 1.], [2., 3.]], &Device::Cpu)?;
/// let a = a.affine(4., -2.)?;
/// assert_eq!(a.to_vec2::<f32>()?, &[[-2.0, 2.0], [6.0, 10.0]]);
/// # Ok::<(), candle_core::Error>(())
/// ```
pub fn affine(&self, mul: f64, add: f64) -> Result<Self> {
let storage = self.storage().affine(self.layout(), mul, add)?;
let op = BackpropOp::new1(self, |arg| Op::Affine { arg, mul, add });
Ok(from_storage(storage, self.shape(), op, false))
}
/// Applies the Exponential Linear Unit (ELU) function on each element of the input tensor.
pub fn elu(&self, alpha: f64) -> Result<Self> {
let storage = self.storage().elu(self.layout(), alpha)?;
let op = BackpropOp::new1(self, |t| Op::Elu(t, alpha));
Ok(from_storage(storage, self.shape(), op, false))
}
/// Raise the tensor to some float exponent `e`.
pub fn powf(&self, e: f64) -> Result<Self> {
let storage = self.storage().powf(self.layout(), e)?;
let op = BackpropOp::new1(self, |t| Op::Powf(t, e));
Ok(from_storage(storage, self.shape(), op, false))
}
pub(crate) fn check_dim(&self, dim: usize, op: &'static str) -> Result<()> {
if dim >= self.dims().len() {
Err(Error::DimOutOfRange {
shape: self.shape().clone(),
dim: dim as i32,
op,
}
.bt())?
} else {
Ok(())
}
}
    /// Splits a tensor into the specified number of chunks; this may return fewer chunks than
    /// specified.
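    ///
    /// For instance, a sketch of the expected result:
    ///
    /// ```rust
    /// use candle_core::{Tensor, Device};
    /// let t = Tensor::new(&[0f32, 1., 2., 3., 4., 5.], &Device::Cpu)?;
    /// let chunks = t.chunk(2, 0)?;
    /// assert_eq!(chunks.len(), 2);
    /// assert_eq!(chunks[0].to_vec1::<f32>()?, &[0., 1., 2.]);
    /// assert_eq!(chunks[1].to_vec1::<f32>()?, &[3., 4., 5.]);
    /// # Ok::<(), candle_core::Error>(())
    /// ```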
pub fn chunk<D: Dim>(&self, chunks: usize, dim: D) -> Result<Vec<Self>> {
let dim = dim.to_index(self.shape(), "chunk")?;
let size = self.dim(dim)?;
if size < chunks {
(0..size).map(|i| self.narrow(dim, i, 1)).collect()
} else {
let chunk_size = size / chunks;
let cnt_additional = size % chunks;
let mut tensors = vec![];
let mut sum_chunk_size = 0;
for i in 0..chunks {
let chunk_size = if i < cnt_additional {
chunk_size + 1
} else {
chunk_size
};
let tensor = self.narrow(dim, sum_chunk_size, chunk_size)?;
tensors.push(tensor);
sum_chunk_size += chunk_size
}
Ok(tensors)
}
}
/// Returns a new tensor that is a narrowed version of the input, the dimension `dim`
/// ranges from `start` to `start + len`.
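    ///
    /// A small sketch of the expected result:
    ///
    /// ```rust
    /// use candle_core::{Tensor, Device};
    /// let t = Tensor::new(&[[0f32, 1.], [2., 3.], [4., 5.]], &Device::Cpu)?;
    /// let t = t.narrow(0, 1, 2)?;
    /// assert_eq!(t.to_vec2::<f32>()?, &[[2., 3.], [4., 5.]]);
    /// # Ok::<(), candle_core::Error>(())
    /// ```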
pub fn narrow<D: Dim>(&self, dim: D, start: usize, len: usize) -> Result<Self> {
let dims = self.dims();
let dim = dim.to_index(self.shape(), "narrow")?;
let err = |msg| {
Err::<(), _>(
Error::NarrowInvalidArgs {
shape: self.shape().clone(),
dim,
start,
len,
msg,
}
.bt(),
)
};
if start > dims[dim] {
err("start > dim_len")?
}
if start.saturating_add(len) > dims[dim] {
err("start + len > dim_len")?
}
if start == 0 && dims[dim] == len {
Ok(self.clone())
} else {
let op = BackpropOp::new1(self, |t| Op::Narrow(t, dim, start, len));
let layout = self.layout().narrow(dim, start, len)?;
let tensor_ = Tensor_ {
id: TensorId::new(),
storage: self.storage.clone(),
layout,
op,
is_variable: false,
dtype: self.dtype,
device: self.device.clone(),
};
Ok(Tensor(Arc::new(tensor_)))
}
}
fn squeeze_dims(self, dims: &[usize]) -> Result<Self> {
match dims {
[] => Ok(self),
[i] => self.squeeze(*i),
dims => {
let dims = self
.dims()
.iter()
.enumerate()
.filter_map(|(dim_idx, &v)| {
if dims.contains(&dim_idx) {
None
} else {
Some(v)
}
})
.collect::<Vec<_>>();
self.reshape(dims)
}
}
}
fn reduce_impl<D: Dim>(&self, dim: D, keepdim: bool, op: ReduceOp) -> Result<Self> {
let dim = dim.to_index(self.shape(), op.name())?;
let storage = self.storage().reduce_op(op, self.layout(), &[dim])?;
let mut dims = self.dims().to_vec();
dims[dim] = 1;
let op = match op {
ReduceOp::Sum | ReduceOp::Min | ReduceOp::Max => {
BackpropOp::new1(self, |arg| Op::Reduce(arg, op, dims.to_vec()))
}
ReduceOp::ArgMin | ReduceOp::ArgMax => BackpropOp::none(),
};
let res = from_storage(storage, dims, op, false);
if keepdim {
Ok(res)
} else {
res.squeeze_dims(&[dim])
}
}
fn sum_impl<D: Dims>(&self, sum_dims: D, keepdim: bool) -> Result<Self> {
let sum_dims = sum_dims.to_indexes(self.shape(), "sum")?;
let storage = self
.storage()
.reduce_op(ReduceOp::Sum, self.layout(), &sum_dims)?;
let mut dims = self.dims().to_vec();
for &sum_dim in sum_dims.iter() {
dims[sum_dim] = 1
}
let op = BackpropOp::new1(self, |a| Op::Reduce(a, ReduceOp::Sum, dims.to_vec()));
let sum = from_storage(storage, dims, op, false);
if keepdim {
Ok(sum)
} else {
sum.squeeze_dims(&sum_dims)
}
}
    /// Rolls the input tensor along the given dimension.
/// Elements that are shifted beyond the last position are re-introduced at the first position.
///
/// ```rust
/// # use candle_core::{Tensor, Device};
/// let tensor = Tensor::new(&[[0f32, 1.], [2., 3.], [4., 5.]], &Device::Cpu)?;
/// let tensor = tensor.roll(1, 0)?;
/// assert_eq!(tensor.to_vec2::<f32>()?, &[[4., 5.], [0., 1.], [2., 3.]]);
/// let tensor = Tensor::new(&[[0f32, 1.], [2., 3.], [4., 5.]], &Device::Cpu)?;
/// let tensor = tensor.roll(-1, 0)?;
/// assert_eq!(tensor.to_vec2::<f32>()?, &[[2., 3.], [4., 5.], [0., 1.]]);
/// # Ok::<(), candle_core::Error>(())
/// ```
pub fn roll<D>(&self, shift: i32, dim: D) -> Result<Self>
where
D: Dim + Clone,
{
let dim = dim.to_index(self.shape(), "roll")?;
let dim_size = self.dim(dim)?;
let shift = shift.rem_euclid(dim_size as i32) as usize;
if shift == 0 {
Ok(self.clone())
} else {
let a = self.narrow(dim, 0, dim_size - shift)?;
let b = self.narrow(dim, dim_size - shift, shift)?;
Tensor::cat(&[&b, &a], dim)
}
}
    /// Returns the sum of elements of the input tensor, summed over the dimensions given in
    /// `sum_dims`.
///
/// The resulting tensor has a shape that is similar to the shape of the input tensor, except
/// that the number of elements for each dimension index in `sum_dims` is 1.
///
/// ```rust
/// use candle_core::{Tensor, Device};
/// let a = Tensor::new(&[[0f32, 1.], [2., 3.]], &Device::Cpu)?;
/// let s = a.sum_keepdim(0)?;
/// assert_eq!(s.to_vec2::<f32>()?, &[[2., 4.]]);
/// let s = a.sum_keepdim(1)?;
/// assert_eq!(s.to_vec2::<f32>()?, &[[1.], [5.]]);
/// let s = a.sum_keepdim((0, 1))?;
/// assert_eq!(s.to_vec2::<f32>()?, &[[6.]]);
/// # Ok::<(), candle_core::Error>(())
/// ```
pub fn sum_keepdim<D: Dims>(&self, sum_dims: D) -> Result<Self> {
self.sum_impl(sum_dims, true)
}
    /// Returns the sum of elements of the input tensor over the given dimensions. Compared to
    /// `sum_keepdim`, these dimensions are squeezed rather than kept.
pub fn sum<D: Dims>(&self, sum_dims: D) -> Result<Self> {
self.sum_impl(sum_dims, false)
}
    /// Returns the mean of elements of the input tensor, averaged over the dimensions given in
    /// `mean_dims`.
///
/// The resulting tensor has a shape that is similar to the shape of the input tensor, except
/// that the number of elements for each dimension index in `mean_dims` is 1.
///
/// ```rust
/// use candle_core::{Tensor, Device};
/// let a = Tensor::new(&[[0f32, 1.], [2., 3.]], &Device::Cpu)?;
/// let s = a.mean_keepdim(0)?;
/// assert_eq!(s.to_vec2::<f32>()?, &[[1., 2.]]);
/// let s = a.mean_keepdim(1)?;
/// assert_eq!(s.to_vec2::<f32>()?, &[[0.5], [2.5]]);
/// let s = a.mean_keepdim((0, 1))?;
/// assert_eq!(s.to_vec2::<f32>()?, &[[1.5]]);
/// # Ok::<(), candle_core::Error>(())
/// ```
pub fn mean_keepdim<D: Dims>(&self, mean_dims: D) -> Result<Self> {
let mean_dims = mean_dims.to_indexes(self.shape(), "mean-keepdim")?;
let reduced_dim: usize = mean_dims.iter().map(|i| self.dims()[*i]).product();
let scale = 1f64 / (reduced_dim as f64);
self.sum_impl(mean_dims, true)? * scale
}
    /// Returns the mean of elements of the input tensor over the given dimensions. Compared to
    /// `mean_keepdim`, these dimensions are squeezed rather than kept.
pub fn mean<D: Dims>(&self, mean_dims: D) -> Result<Self> {
let mean_dims = mean_dims.to_indexes(self.shape(), "mean")?;
let reduced_dim: usize = mean_dims.iter().map(|i| self.dims()[*i]).product();
let scale = 1f64 / (reduced_dim as f64);
self.sum_impl(mean_dims, false)? * scale
}
/// Returns the unbiased variance over the selected dimension.
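    ///
    /// A small sketch of the expected result (unbiased, i.e. normalized by `n - 1`):
    ///
    /// ```rust
    /// use candle_core::{Tensor, Device};
    /// let t = Tensor::new(&[[1f32, 3.], [2., 6.]], &Device::Cpu)?;
    /// let v = t.var_keepdim(1)?;
    /// assert_eq!(v.to_vec2::<f32>()?, &[[2.], [8.]]);
    /// # Ok::<(), candle_core::Error>(())
    /// ```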
pub fn var_keepdim<D: Dim>(&self, dim: D) -> Result<Self> {
let dim = dim.to_index(self.shape(), "var")?;
let mean = self.mean_keepdim(dim)?;
let squares = self.broadcast_sub(&mean)?.sqr()?;
squares.sum_impl(dim, true)? / (self.dim(dim)? - 1) as f64
}
/// Returns the unbiased variance over the selected dimension.
pub fn var<D: Dim>(&self, dim: D) -> Result<Self> {
let dim = dim.to_index(self.shape(), "var")?;
self.var_keepdim(dim)?.squeeze(dim)
}
/// Gathers the maximum value across the selected dimension. The resulting shape has the same
    /// number of dimensions as the original tensor and the selected dimension has a single element.
pub fn max_keepdim<D: Dim>(&self, dim: D) -> Result<Self> {
self.reduce_impl(dim, true, ReduceOp::Max)
}
/// Similar to `max_keepdim` but the target dimension is squeezed.
pub fn max<D: Dim>(&self, dim: D) -> Result<Self> {
self.reduce_impl(dim, false, ReduceOp::Max)
}
/// Gathers the minimum value across the selected dimension. The resulting shape has the same
    /// number of dimensions as the original tensor and the selected dimension has a single element.
pub fn min_keepdim<D: Dim>(&self, dim: D) -> Result<Self> {
self.reduce_impl(dim, true, ReduceOp::Min)
}
/// Similar to `min_keepdim` but the target dimension is squeezed.
pub fn min<D: Dim>(&self, dim: D) -> Result<Self> {
self.reduce_impl(dim, false, ReduceOp::Min)
}
pub fn argmax_keepdim<D: Dim>(&self, dim: D) -> Result<Self> {
self.reduce_impl(dim, true, ReduceOp::ArgMax)
}
/// Similar to `argmax_keepdim` but the target dimension is squeezed.
pub fn argmax<D: Dim>(&self, dim: D) -> Result<Self> {
self.reduce_impl(dim, false, ReduceOp::ArgMax)
}
pub fn argmin_keepdim<D: Dim>(&self, dim: D) -> Result<Self> {
self.reduce_impl(dim, true, ReduceOp::ArgMin)
}
/// Similar to `argmin_keepdim` but the target dimension is squeezed.
pub fn argmin<D: Dim>(&self, dim: D) -> Result<Self> {
self.reduce_impl(dim, false, ReduceOp::ArgMin)
}
/// Element-wise comparison between two tensors, e.g. equality, greater than, ... The actual
/// comparison operation is specified by the `op` argument.
///
/// The returned tensor has the same shape as the original tensors and uses `u8` elements.
pub fn cmp<T: TensorOrScalar>(&self, rhs: T, op: CmpOp) -> Result<Self> {
let rhs = match rhs.to_tensor_scalar()? {
crate::scalar::TensorScalar::Tensor(rhs) => rhs,
crate::scalar::TensorScalar::Scalar(rhs) => rhs
.to_dtype(self.dtype())?
.to_device(self.device())?
.broadcast_as(self.shape())?,
};
let shape = self.same_shape_binary_op(&rhs, "cmp")?;
let storage = self
.storage()
.cmp(op, &rhs.storage(), self.layout(), rhs.layout())?;
let op = BackpropOp::new1(self, |a| Op::Cmp(a, op));
Ok(from_storage(storage, shape.dims(), op, false))
}
/// Element-wise equality.
pub fn eq<T: TensorOrScalar>(&self, rhs: T) -> Result<Self> {
self.cmp(rhs, CmpOp::Eq)
}
/// Element-wise non-equality.
pub fn ne<T: TensorOrScalar>(&self, rhs: T) -> Result<Self> {
self.cmp(rhs, CmpOp::Ne)
}
/// Element-wise comparison with lower-than, the returned tensor uses value 1 where `self <
/// rhs` and 0 otherwise.
pub fn lt<T: TensorOrScalar>(&self, rhs: T) -> Result<Self> {
self.cmp(rhs, CmpOp::Lt)
}
/// Element-wise comparison with greater-than, the returned tensor uses value 1 where `self >
/// rhs` and 0 otherwise.
pub fn gt<T: TensorOrScalar>(&self, rhs: T) -> Result<Self> {
self.cmp(rhs, CmpOp::Gt)
}
/// Element-wise comparison with greater-equal, the returned tensor uses value 1 where `self >=
/// rhs` and 0 otherwise.
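    ///
    /// A small sketch of the expected result (the scalar is broadcast to the tensor shape):
    ///
    /// ```rust
    /// use candle_core::{Tensor, Device};
    /// let t = Tensor::new(&[[0f32, 1.], [2., 3.]], &Device::Cpu)?;
    /// let mask = t.ge(2f64)?;
    /// assert_eq!(mask.to_vec2::<u8>()?, &[[0, 0], [1, 1]]);
    /// # Ok::<(), candle_core::Error>(())
    /// ```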
pub fn ge<T: TensorOrScalar>(&self, rhs: T) -> Result<Self> {
self.cmp(rhs, CmpOp::Ge)
}
/// Element-wise comparison with lower-equal, the returned tensor uses value 1 where `self <=
/// rhs` and 0 otherwise.
pub fn le<T: TensorOrScalar>(&self, rhs: T) -> Result<Self> {
self.cmp(rhs, CmpOp::Le)
}
/// Clamp the tensor values to be between `min` and `max`.
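    ///
    /// For instance, a sketch of the expected result:
    ///
    /// ```rust
    /// use candle_core::{Tensor, Device};
    /// let t = Tensor::new(&[[0f32, 1.], [2., 3.]], &Device::Cpu)?;
    /// let t = t.clamp(0.5, 2.5)?;
    /// assert_eq!(t.to_vec2::<f32>()?, &[[0.5, 1.], [2., 2.5]]);
    /// # Ok::<(), candle_core::Error>(())
    /// ```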
pub fn clamp<T1: TensorOrScalar, T2: TensorOrScalar>(&self, min: T1, max: T2) -> Result<Self> {
self.maximum(min)?.minimum(max)
}
/// Interpolate the input tensor to the `target_size` size, taking the value of the nearest element.
///
/// The input tensor should have three dimensions, `(batch, channels, l)`, the returned
/// tensor also has three dimensions, `(batch, channels, target_size)`.
pub fn interpolate1d(&self, target_size: usize) -> Result<Self> {
let (n, c, _l) = self.dims3()?;
let op = BackpropOp::new1(self, |arg| Op::UpsampleNearest1D { arg, target_size });
let storage = self
.storage()
.upsample_nearest1d(self.layout(), target_size)?;
Ok(from_storage(storage, (n, c, target_size), op, false))
}
/// Alias for `interpolate1d`.
pub fn upsample_nearest1d(&self, target_size: usize) -> Result<Self> {
self.interpolate1d(target_size)
}
/// Interpolate the input tensor to the `(target_h, target_w)` size, taking the value of the
/// nearest element.
///
/// The input tensor should have four dimensions, `(batch, channels, h, w)`, the returned
/// tensor also has four dimensions, `(batch, channels, target_h, target_w)`.
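    ///
    /// A small sketch checking only the output shape:
    ///
    /// ```rust
    /// use candle_core::{Tensor, Device};
    /// let t = Tensor::arange(0f32, 4f32, &Device::Cpu)?.reshape((1, 1, 2, 2))?;
    /// let t = t.interpolate2d(4, 4)?;
    /// assert_eq!(t.dims(), &[1, 1, 4, 4]);
    /// # Ok::<(), candle_core::Error>(())
    /// ```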
pub fn interpolate2d(&self, target_h: usize, target_w: usize) -> Result<Self> {
let (n, c, _h, _w) = self.dims4()?;
let op = BackpropOp::new1(self, |arg| Op::UpsampleNearest2D {
arg,
target_h,
target_w,
});
let storage = self
.storage()
.upsample_nearest2d(self.layout(), target_h, target_w)?;
Ok(from_storage(storage, (n, c, target_h, target_w), op, false))
}
/// Alias for `interpolate2d`.
pub fn upsample_nearest2d(&self, target_h: usize, target_w: usize) -> Result<Self> {
self.interpolate2d(target_h, target_w)
}
/// 2D average pooling over an input tensor with multiple channels.
///
/// The input tensor should have four dimensions, `(batch, channels, h, w)`, the returned
/// tensor also has four dimensions, `(batch, channels, h', w')`. The pooling is performed on
/// the two last dimensions using a kernel of size `sz`. The returned element is the average
/// value over the kernel window.
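    ///
    /// A small sketch of the expected result (assuming `squeeze` is available to drop the unit
    /// batch and channel dimensions):
    ///
    /// ```rust
    /// use candle_core::{Tensor, Device};
    /// let t = Tensor::arange(0f32, 16f32, &Device::Cpu)?.reshape((1, 1, 4, 4))?;
    /// let pooled = t.avg_pool2d(2)?;
    /// assert_eq!(pooled.dims(), &[1, 1, 2, 2]);
    /// assert_eq!(pooled.squeeze(0)?.squeeze(0)?.to_vec2::<f32>()?, &[[2.5, 4.5], [10.5, 12.5]]);
    /// # Ok::<(), candle_core::Error>(())
    /// ```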
pub fn avg_pool2d<T: crate::ToUsize2>(&self, sz: T) -> Result<Self> {
let sz = sz.to_usize2();
self.avg_pool2d_with_stride(sz, sz)
}
/// Same as `avg_pool2d` but with a `stride` that can be set to a value different from the
/// kernel size.
pub fn avg_pool2d_with_stride<T: crate::ToUsize2>(
&self,
kernel_size: T,
stride: T,
) -> Result<Self> {
let kernel_size = kernel_size.to_usize2();
let stride = stride.to_usize2();
let (n, c, h, w) = self.dims4()?;
if h < kernel_size.0 || w < kernel_size.1 {
bail!("kernel-size {kernel_size:?} is larger than the input size {h},{w}")
}
// https://pytorch.org/docs/stable/generated/torch.nn.AvgPool2d.html#torch.nn.AvgPool2d
let h_out = (h - kernel_size.0) / stride.0 + 1;
let w_out = (w - kernel_size.1) / stride.1 + 1;
let op = BackpropOp::new1(self, |arg| Op::AvgPool2D {
arg,
kernel_size,
stride,
});
let storage = self
.storage()
.avg_pool2d(self.layout(), kernel_size, stride)?;
Ok(from_storage(storage, (n, c, h_out, w_out), op, false))
}
/// 2D max pooling over an input tensor with multiple channels.
///
/// The input tensor should have four dimensions, `(batch, channels, h, w)`, the returned
/// tensor also has four dimensions, `(batch, channels, h', w')`. The pooling is performed on
/// the two last dimensions using a kernel of size `sz`, the returned element is the maximum
/// value over the kernel window.
pub fn max_pool2d<T: crate::ToUsize2>(&self, sz: T) -> Result<Self> {
let sz = sz.to_usize2();
self.max_pool2d_with_stride(sz, sz)
}
/// Same as `max_pool2d` but with a `stride` that can be set to a value different from the
/// kernel size.
pub fn max_pool2d_with_stride<T: crate::ToUsize2>(
&self,
kernel_size: T,
stride: T,
) -> Result<Self> {
let kernel_size = kernel_size.to_usize2();
let stride = stride.to_usize2();
let (n, c, h, w) = self.dims4()?;
if h < kernel_size.0 || w < kernel_size.1 {
bail!("kernel-size {kernel_size:?} is larger than the input size {h},{w}")
}
// https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html#torch.nn.MaxPool2d
let h_out = (h - kernel_size.0) / stride.0 + 1;
let w_out = (w - kernel_size.1) / stride.1 + 1;
let op = BackpropOp::new1(self, |arg| Op::MaxPool2D {
arg,
kernel_size,
stride,
});
let storage = self
.storage()
.max_pool2d(self.layout(), kernel_size, stride)?;
Ok(from_storage(storage, (n, c, h_out, w_out), op, false))
}
/// Returns the matrix-multiplication of the input tensor with the other provided tensor.
///
/// # Arguments
///
/// * `self` - A tensor with dimensions `b1, b2, ..., bi, m, k`.
/// * `rhs` - A tensor with dimensions `b1, b2, ..., bi, k, n`.
///
/// The resulting tensor has dimensions `b1, b2, ..., bi, m, n`.
pub fn matmul(&self, rhs: &Self) -> Result<Self> {
let a_dims = self.shape().dims();
let b_dims = rhs.shape().dims();
let dim = a_dims.len();
if dim < 2 || b_dims.len() != dim {
Err(Error::ShapeMismatchBinaryOp {
lhs: self.shape().clone(),
rhs: rhs.shape().clone(),
op: "matmul",
}
.bt())?
}
let m = a_dims[dim - 2];
let k = a_dims[dim - 1];
let k2 = b_dims[dim - 2];
let n = b_dims[dim - 1];
let c_shape = Shape::from(&a_dims[..dim - 2]).extend(&[m, n]);
let batching: usize = a_dims[..dim - 2].iter().product();
let batching_b: usize = b_dims[..dim - 2].iter().product();
if k != k2 || batching != batching_b {
Err(Error::ShapeMismatchBinaryOp {
lhs: self.shape().clone(),
rhs: rhs.shape().clone(),
op: "matmul",
}
.bt())?
}
let storage = self.storage().matmul(
&rhs.storage(),
(batching, m, n, k),
self.layout(),
rhs.layout(),
)?;
let op = BackpropOp::new2(self, rhs, Op::Matmul);
Ok(from_storage(storage, c_shape, op, false))
}
/// Matrix-multiplication with broadcasting support.
///
    /// Compared to `matmul` the two matrices are allowed to have different dimensions as long as
/// they are compatible for broadcast. E.g. if `self` has shape `(j, 1, n, k)` and `rhs` has
/// shape `(l, k, m)`, the output will have shape `(j, l, n, m)`.
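    ///
    /// A small sketch checking only the broadcast output shape described above:
    ///
    /// ```rust
    /// use candle_core::{Tensor, DType, Device};
    /// let lhs = Tensor::zeros((2, 1, 3, 4), DType::F32, &Device::Cpu)?;
    /// let rhs = Tensor::zeros((5, 4, 2), DType::F32, &Device::Cpu)?;
    /// let out = lhs.broadcast_matmul(&rhs)?;
    /// assert_eq!(out.dims(), &[2, 5, 3, 2]);
    /// # Ok::<(), candle_core::Error>(())
    /// ```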
pub fn broadcast_matmul(&self, rhs: &Self) -> Result<Self> {
let lhs = self;
let (l_shape, r_shape) = lhs.shape().broadcast_shape_matmul(rhs.shape())?;
let l_broadcast = l_shape != *lhs.shape();
let r_broadcast = r_shape != *rhs.shape();
        // TODO: Avoid concretising the broadcasted matrices via contiguous.
match (l_broadcast, r_broadcast) {
(true, true) => lhs
.broadcast_as(&l_shape)?
.contiguous()?
.matmul(&rhs.broadcast_as(&r_shape)?.contiguous()?),
(false, true) => lhs.matmul(&rhs.broadcast_as(&r_shape)?.contiguous()?),
(true, false) => lhs.broadcast_as(&l_shape)?.contiguous()?.matmul(rhs),
(false, false) => lhs.matmul(rhs),
}
}
/// Returns a tensor with the same shape as the input tensor, the values are taken from
/// `on_true` if the input tensor value is not zero, and `on_false` at the positions where the
/// input tensor is equal to zero.
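    ///
    /// A short sketch of the expected result with a `u8` condition tensor:
    ///
    /// ```rust
    /// use candle_core::{Tensor, Device};
    /// let cond = Tensor::new(&[[1u8, 0u8], [0u8, 1u8]], &Device::Cpu)?;
    /// let on_true = Tensor::new(&[[1f32, 2.], [3., 4.]], &Device::Cpu)?;
    /// let on_false = Tensor::new(&[[9f32, 9.], [9., 9.]], &Device::Cpu)?;
    /// let t = cond.where_cond(&on_true, &on_false)?;
    /// assert_eq!(t.to_vec2::<f32>()?, &[[1., 9.], [9., 4.]]);
    /// # Ok::<(), candle_core::Error>(())
    /// ```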
pub fn where_cond(&self, on_true: &Self, on_false: &Self) -> Result<Self> {
        let _shape = self.same_shape_binary_op(on_true, "where_cond")?;
let shape = self.same_shape_binary_op(on_false, "where_cond")?;
let storage = self.storage().where_cond(
self.layout(),
&on_true.storage(),
on_true.layout(),
&on_false.storage(),
on_false.layout(),
)?;
let op = BackpropOp::new3(self, on_true, on_false, Op::WhereCond);
Ok(from_storage(storage, shape, op, false))
}
    /// Returns a tensor with the values from the `self` tensor at the index corresponding to the
    /// values held in the `ids` tensor.
///
/// # Arguments
///
/// * `self` - A tensor with dimensions `v, h`.
/// * `ids` - A tensor with dimensions `s` and with integer values between 0 and v (exclusive).
///
/// The resulting tensor has dimensions `s, h`. `s` is called the sequence length, `v` the
/// vocabulary size, and `h` the hidden size.
///
/// ```rust
/// use candle_core::{Tensor, Device};
/// let values = Tensor::new(&[[0f32, 1.], [2., 3.], [4., 5.]], &Device::Cpu)?;
/// let ids = Tensor::new(&[2u32, 1u32, 2u32], &Device::Cpu)?;
/// let emb = values.embedding(&ids)?;
/// assert_eq!(emb.to_vec2::<f32>()?, &[[4., 5.], [2., 3.], [4., 5.]]);
/// # Ok::<(), candle_core::Error>(())
/// ```
pub fn embedding(&self, ids: &Self) -> Result<Self> {
if self.rank() != 2 || ids.rank() != 1 {
Err(Error::ShapeMismatchBinaryOp {
lhs: self.shape().clone(),
rhs: ids.shape().clone(),
op: "embedding",
}
.bt())?
}
self.index_select(ids, 0)
}
pub fn scatter_add<D: Dim>(&self, indexes: &Self, source: &Self, dim: D) -> Result<Self> {
let dim = dim.to_index(self.shape(), "scatter-add")?;
let source_dims = source.dims();
let self_dims = self.dims();
let mismatch = if source_dims.len() != self_dims.len() {
true
} else {
let mut mismatch = false;
for (i, (&d1, &d2)) in self_dims.iter().zip(source_dims.iter()).enumerate() {
if i != dim && d1 != d2 {
mismatch = true;
break;
}
}
mismatch
};
if mismatch {
Err(Error::ShapeMismatchBinaryOp {
op: "scatter-add (self, src)",
lhs: self.shape().clone(),
rhs: source.shape().clone(),
}
.bt())?
}
if indexes.dims() != source.dims() {
Err(Error::ShapeMismatchBinaryOp {
op: "scatter-add (indexes, src)",
lhs: indexes.shape().clone(),
rhs: source.shape().clone(),
}
.bt())?
}
let storage = self.storage().scatter_add(
self.layout(),
&indexes.storage(),
indexes.layout(),
&source.storage(),
source.layout(),
dim,
)?;
let op = BackpropOp::new3(self, indexes, source, |t1, t2, t3| {
Op::ScatterAdd(t1, t2, t3, dim)
});
Ok(from_storage(storage, self.shape(), op, false))
}
/// Embeds the values of the `src` tensor into the `self` tensor on the specified dimension.
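    ///
    /// A small sketch of the expected result:
    ///
    /// ```rust
    /// use candle_core::{Tensor, DType, Device};
    /// let a = Tensor::zeros((3, 2), DType::F32, &Device::Cpu)?;
    /// let b = Tensor::new(&[[1f32, 2.]], &Device::Cpu)?;
    /// let c = a.slice_scatter(&b, 0, 1)?;
    /// assert_eq!(c.to_vec2::<f32>()?, &[[0., 0.], [1., 2.], [0., 0.]]);
    /// # Ok::<(), candle_core::Error>(())
    /// ```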
pub fn slice_scatter<D: Dim>(&self, src: &Self, dim: D, start: usize) -> Result<Self> {
let dim = dim.to_index(self.shape(), "slice-scatter")?;
if dim == 0 {
self.slice_scatter0(src, start)
} else {
// TODO: Maybe we want to add a more efficient implementation at some point.
self.transpose(0, dim)?
.slice_scatter0(&src.transpose(0, dim)?, start)?
.transpose(0, dim)
}
}
/// Embeds the values of the `src` tensor into the `self` tensor on the first dimension.
pub fn slice_scatter0(&self, src: &Self, start: usize) -> Result<Self> {
if self.dtype() != src.dtype() {
Err(Error::DTypeMismatchBinaryOp {
lhs: self.dtype(),
rhs: src.dtype(),
op: "slice-scatter",
}
.bt())?
}
if self.device().location() != src.device.location() {
Err(Error::DeviceMismatchBinaryOp {
lhs: self.device().location(),
rhs: src.device().location(),
op: "slice-scatter",
}
.bt())?
}
if self.rank() != src.rank() {
Err(Error::UnexpectedNumberOfDims {
expected: self.rank(),
got: src.rank(),
shape: src.shape().clone(),
}
.bt())?
}
let shape_ok =
self.dims()
.iter()
.zip(src.dims().iter())
.enumerate()
.all(|(dim_idx, (&d1, &d2))| {
if 0 == dim_idx {
d2 + start <= d1
} else {
d1 == d2
}
});
if !shape_ok {
Err(Error::ShapeMismatchBinaryOp {
op: "slice-scatter (self, src)",
lhs: self.shape().clone(),
rhs: src.shape().clone(),
}
.bt())?
}
let mut storage = unsafe { self.device().alloc_uninit(self.shape(), self.dtype())? };
self.storage()
.copy_strided_src(&mut storage, 0, self.layout())?;
let offset = start * src.dims()[1..].iter().product::<usize>();
src.storage()
.copy_strided_src(&mut storage, offset, src.layout())?;
let op = BackpropOp::new2(self, src, |t1, t2| Op::SliceScatter0(t1, t2, start));
Ok(from_storage(storage, self.shape(), op, false))
}
    /// Accumulates elements from `source` at the indexes given by `indexes` and adds them to
    /// `self`.
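    ///
    /// A small sketch of the expected result (rows of `src` are added at the rows of `t`
    /// selected by `ids`):
    ///
    /// ```rust
    /// use candle_core::{Tensor, DType, Device};
    /// let t = Tensor::zeros((3, 2), DType::F32, &Device::Cpu)?;
    /// let src = Tensor::new(&[[1f32, 2.], [3., 4.]], &Device::Cpu)?;
    /// let ids = Tensor::new(&[0u32, 2u32], &Device::Cpu)?;
    /// let t = t.index_add(&ids, &src, 0)?;
    /// assert_eq!(t.to_vec2::<f32>()?, &[[1., 2.], [0., 0.], [3., 4.]]);
    /// # Ok::<(), candle_core::Error>(())
    /// ```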
pub fn index_add<D: Dim>(&self, indexes: &Self, source: &Self, dim: D) -> Result<Self> {
let dim = dim.to_index(self.shape(), "index-add")?;
let source_dims = source.dims();
let self_dims = self.dims();
let mismatch = if source_dims.len() != self_dims.len() {
true
} else {
let mut mismatch = false;
for (i, (&d1, &d2)) in self_dims.iter().zip(source_dims.iter()).enumerate() {
if i != dim && d1 != d2 {
mismatch = true;
break;
}
}
mismatch
};
if mismatch {
Err(Error::ShapeMismatchBinaryOp {
op: "index-add (self, source)",
lhs: self.shape().clone(),
rhs: source.shape().clone(),
}
.bt())?
}
        // The number of elements in `indexes` must match the size of the dimension on which the
        // add is performed on the source tensor (the index values in `indexes` refer to positions
        // in the target tensor `self`).
let indexes_len = indexes.dims1()?;
if source_dims[dim] != indexes_len {
Err(Error::ShapeMismatchBinaryOp {
                op: "index-add (ids, source)",
lhs: indexes.shape().clone(),
rhs: source.shape().clone(),
}
.bt())?
}
let storage = self.storage().index_add(
self.layout(),
&indexes.storage(),
indexes.layout(),
&source.storage(),
source.layout(),
dim,
)?;
let op = BackpropOp::new3(self, indexes, source, |t1, t2, t3| {
Op::IndexAdd(t1, t2, t3, dim)
});
Ok(from_storage(storage, self.shape(), op, false))
}
/// Gather values across the target dimension.
///
/// # Arguments
///
/// * `self` - The input tensor.
/// * `indexes` - The indices of elements to gather, this should have the same shape as `self`
/// but can have a different number of elements on the target dimension.
/// * `dim` - the target dimension.
///
    /// The resulting tensor has the same shape as `indexes` and uses values from `self` indexed on
/// dimension `dim` by the values in `indexes`.
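    ///
    /// A small sketch of the expected result:
    ///
    /// ```rust
    /// use candle_core::{Tensor, Device};
    /// let t = Tensor::new(&[[1f32, 2.], [3., 4.]], &Device::Cpu)?;
    /// let ids = Tensor::new(&[[0u32], [1u32]], &Device::Cpu)?;
    /// let g = t.gather(&ids, 1)?;
    /// assert_eq!(g.to_vec2::<f32>()?, &[[1.], [4.]]);
    /// # Ok::<(), candle_core::Error>(())
    /// ```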
pub fn gather<D: Dim>(&self, indexes: &Self, dim: D) -> Result<Self> {
let dim = dim.to_index(self.shape(), "gather")?;
let self_dims = self.dims();
let indexes_dims = indexes.dims();
let mismatch = if indexes_dims.len() != self_dims.len() {
true
} else {
let mut mismatch = false;
for (i, (&d1, &d2)) in self_dims.iter().zip(indexes_dims.iter()).enumerate() {
if i != dim && d1 != d2 {
mismatch = true;
break;
}
}
mismatch
};
if mismatch {
Err(Error::ShapeMismatchBinaryOp {
op: "gather",
lhs: self.shape().clone(),
rhs: indexes.shape().clone(),
}
.bt())?
}
let storage =
self.storage()
.gather(self.layout(), &indexes.storage(), indexes.layout(), dim)?;
let op = BackpropOp::new2(self, indexes, |t1, t2| Op::Gather(t1, t2, dim));
Ok(from_storage(storage, indexes.shape(), op, false))
}
/// Select values for the input tensor at the target indexes across the specified dimension.
///
    /// The `indexes` argument is an integer tensor with a single dimension.
    /// The output has the same number of dimensions as the `self` input. The target dimension of
    /// the output has the same length as `indexes` and its values are taken from `self` using
    /// the indices from `indexes`. Other dimensions have the same number of elements as the input
    /// tensor.
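    ///
    /// For instance, a sketch of the expected result:
    ///
    /// ```rust
    /// use candle_core::{Tensor, Device};
    /// let t = Tensor::new(&[[0f32, 1.], [2., 3.], [4., 5.]], &Device::Cpu)?;
    /// let ids = Tensor::new(&[2u32, 0u32], &Device::Cpu)?;
    /// let s = t.index_select(&ids, 0)?;
    /// assert_eq!(s.to_vec2::<f32>()?, &[[4., 5.], [0., 1.]]);
    /// # Ok::<(), candle_core::Error>(())
    /// ```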
pub fn index_select<D: Dim>(&self, indexes: &Self, dim: D) -> Result<Self> {
let dim = dim.to_index(self.shape(), "index-select")?;
let indexes_len = match indexes.dims() {
[l] => *l,
_ => Err(Error::ShapeMismatchBinaryOp {
lhs: self.shape().clone(),
rhs: indexes.shape().clone(),
op: "index-select",
}
.bt())?,
};
let storage = self.storage().index_select(
&indexes.storage(),
self.layout(),
indexes.layout(),
dim,
)?;
let mut dims = self.dims().to_vec();
dims[dim] = indexes_len;
let op = BackpropOp::new2(self, indexes, |t1, t2| Op::IndexSelect(t1, t2, dim));
Ok(from_storage(storage, dims, op, false))
}
    /// Returns an iterator over the positions of the elements in the storage when ranging over
    /// the index tuples in lexicographic order.
pub fn strided_index(&self) -> crate::StridedIndex {
self.layout.strided_index()
}
    /// Similar to `strided_index` but returns the position of the start of each contiguous block
    /// as well as the length of the contiguous blocks. For a contiguous tensor, the iterator
    /// returns a single block whose start is the tensor's start offset and whose length is the
    /// number of elements in the tensor.
pub fn strided_blocks(&self) -> crate::StridedBlocks {
self.layout.strided_blocks()
}
/// Returns the data contained in a 1D tensor as a vector of scalar values.
pub fn to_vec1<S: crate::WithDType>(&self) -> Result<Vec<S>> {
if self.rank() != 1 {
Err(Error::UnexpectedNumberOfDims {
expected: 1,
got: self.rank(),
shape: self.shape().clone(),
}
.bt())?
}
let from_cpu_storage = |cpu_storage: &crate::CpuStorage| {
let data = S::cpu_storage_as_slice(cpu_storage)?;
let data = match self.layout.contiguous_offsets() {
Some((o1, o2)) => data[o1..o2].to_vec(),
None => self.strided_index().map(|i| data[i]).collect(),
};
Ok::<Vec<_>, Error>(data)
};
match &*self.storage() {
Storage::Cpu(storage) => from_cpu_storage(storage),
Storage::Cuda(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
Storage::Metal(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
}
}
/// Returns the data contained in a 2D tensor as a vector of vector of scalar values.
pub fn to_vec2<S: crate::WithDType>(&self) -> Result<Vec<Vec<S>>> {
let (dim1, dim2) = self.dims2()?;
let from_cpu_storage = |cpu_storage: &crate::CpuStorage| {
let data = S::cpu_storage_as_slice(cpu_storage)?;
let mut rows = vec![];
match self.layout.contiguous_offsets() {
Some((o1, o2)) => {
let data = &data[o1..o2];
for idx_row in 0..dim1 {
rows.push(data[idx_row * dim2..(idx_row + 1) * dim2].to_vec())
}
}
None => {
let mut src_index = self.strided_index();
for _idx_row in 0..dim1 {
let row = (0..dim2).map(|_| data[src_index.next().unwrap()]).collect();
rows.push(row)
}
assert!(src_index.next().is_none());
}
}
Ok(rows)
};
match &*self.storage() {
Storage::Cpu(storage) => from_cpu_storage(storage),
Storage::Cuda(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
Storage::Metal(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
}
}
/// Returns the data contained in a 3D tensor.
pub fn to_vec3<S: crate::WithDType>(&self) -> Result<Vec<Vec<Vec<S>>>> {
let (dim1, dim2, dim3) = self.dims3()?;
let from_cpu_storage = |cpu_storage: &crate::CpuStorage| {
let data = S::cpu_storage_as_slice(cpu_storage)?;
let mut top_rows = vec![];
match self.layout.contiguous_offsets() {
Some((o1, o2)) => {
let data = &data[o1..o2];
let dim23 = dim2 * dim3;
for idx1 in 0..dim1 {
let data = &data[idx1 * dim23..(idx1 + 1) * dim23];
let mut rows = vec![];
for idx2 in 0..dim2 {
rows.push(data[idx2 * dim3..(idx2 + 1) * dim3].to_vec())
}
top_rows.push(rows);
}
}
None => {
let mut src_index = self.strided_index();
for _idx in 0..dim1 {
let mut rows = vec![];
for _jdx in 0..dim2 {
let row = (0..dim3).map(|_| data[src_index.next().unwrap()]).collect();
rows.push(row)
}
top_rows.push(rows);
}
assert!(src_index.next().is_none());
}
}
Ok(top_rows)
};
match &*self.storage() {
Storage::Cpu(storage) => from_cpu_storage(storage),
Storage::Cuda(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
Storage::Metal(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
}
}
/// The dtype for the elements stored in the input tensor.
pub fn dtype(&self) -> DType {
self.dtype
}
/// The device on which the input tensor is located.
pub fn device(&self) -> &Device {
&self.device
}
/// The tensor shape, i.e. dimension sizes on each axis.
pub fn shape(&self) -> &Shape {
self.layout().shape()
}
/// The dimension size for this tensor on each axis.
pub fn dims(&self) -> &[usize] {
self.shape().dims()
}
/// The dimension size for a specified dimension index.
pub fn dim<D: Dim>(&self, dim: D) -> Result<usize> {
let dim = dim.to_index(self.shape(), "dim")?;
Ok(self.dims()[dim])
}
/// The layout of the input tensor, this stores both the shape of the tensor as well as the
/// strides and the start offset to apply to the underlying storage.
pub fn layout(&self) -> &Layout {
&self.layout
}
pub fn stride(&self) -> &[usize] {
self.layout.stride()
}
/// The number of dimensions for this tensor, 0 for a scalar tensor, 1 for a 1D tensor, etc.
pub fn rank(&self) -> usize {
self.shape().rank()
}
/// The number of elements stored in this tensor.
pub fn elem_count(&self) -> usize {
self.shape().elem_count()
}
/// The unique identifier for this tensor.
pub fn id(&self) -> TensorId {
self.id
}
    /// Whether this tensor is a variable or not. A variable is a tensor for which gradients are
    /// tracked and on which backpropagation can be performed.
pub fn is_variable(&self) -> bool {
self.is_variable
}
pub(crate) fn op(&self) -> &Option<Op> {
&self.op
}
/// Computes the sum of all the elements in this tensor and returns a tensor holding this
/// scalar with zero dimensions.
///
/// ```rust
/// use candle_core::{Tensor, Device};
/// let tensor = Tensor::new(&[[0f32, 1.], [2., 3.], [4., 5.]], &Device::Cpu)?;
/// let tensor = tensor.sum_all()?;
/// assert_eq!(tensor.to_scalar::<f32>()?, 15.);
/// # Ok::<(), candle_core::Error>(())
/// ```
pub fn sum_all(&self) -> Result<Tensor> {
let dims: Vec<_> = (0..self.rank()).collect();
self.sum(dims)
}
pub fn mean_all(&self) -> Result<Tensor> {
self.sum_all()? / self.elem_count() as f64
}
fn flatten_<D1: Dim, D2: Dim>(
&self,
start_dim: Option<D1>,
end_dim: Option<D2>,
) -> Result<Tensor> {
if self.rank() == 0 {
self.reshape(1)
} else {
let start_dim = match start_dim {
None => 0,
Some(dim) => dim.to_index(self.shape(), "flatten")?,
};
let end_dim = match end_dim {
None => self.rank() - 1,
Some(dim) => dim.to_index(self.shape(), "flatten")?,
};
if start_dim < end_dim {
let dims = self.dims();
let mut dst_dims = dims[..start_dim].to_vec();
dst_dims.push(dims[start_dim..end_dim + 1].iter().product::<usize>());
if end_dim + 1 < dims.len() {
dst_dims.extend(&dims[end_dim + 1..]);
}
self.reshape(dst_dims)
} else {
Ok(self.clone())
}
}
}
/// Flattens the input tensor on the dimension indexes from `start_dim` to `end_dim` (both
/// inclusive).
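    ///
    /// A small sketch checking only the resulting shape:
    ///
    /// ```rust
    /// use candle_core::{Tensor, DType, Device};
    /// let t = Tensor::zeros((2, 3, 4), DType::F32, &Device::Cpu)?;
    /// let t = t.flatten(1, 2)?;
    /// assert_eq!(t.dims(), &[2, 12]);
    /// # Ok::<(), candle_core::Error>(())
    /// ```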
pub fn flatten<D1: Dim, D2: Dim>(&self, start_dim: D1, end_dim: D2) -> Result<Tensor> {
self.flatten_(Some(start_dim), Some(end_dim))
}
/// Flattens the input tensor on the dimension indexes from `0` to `end_dim` (inclusive).
pub fn flatten_to<D: Dim>(&self, end_dim: D) -> Result<Tensor> {
self.flatten_(None::<usize>, Some(end_dim))
}
/// Flattens the input tensor on the dimension indexes from `start_dim` (inclusive) to the last
/// dimension.
pub fn flatten_from<D: Dim>(&self, start_dim: D) -> Result<Tensor> {
self.flatten_(Some(start_dim), None::<usize>)
}
/// Flattens the input tensor by reshaping it into a one dimension tensor.
///
/// ```rust
/// use candle_core::{Tensor, Device};
/// let tensor = Tensor::new(&[[0f32, 1.], [2., 3.], [4., 5.]], &Device::Cpu)?;
/// let tensor = tensor.flatten_all()?;
/// assert_eq!(tensor.to_vec1::<f32>()?, &[0., 1., 2., 3., 4., 5.]);
/// # Ok::<(), candle_core::Error>(())
/// ```
pub fn flatten_all(&self) -> Result<Tensor> {
self.flatten_(None::<usize>, None::<usize>)
}
/// Returns the sub-tensor fixing the index at `i` on the first dimension.
///
/// ```rust
/// use candle_core::{Tensor, Device};
/// let tensor = Tensor::new(&[[0f32, 1.], [2., 3.], [4., 5.]], &Device::Cpu)?;
/// let t = tensor.get(0)?;
/// assert_eq!(t.to_vec1::<f32>()?, &[0., 1.]);
/// let t = tensor.get(1)?;
/// assert_eq!(t.to_vec1::<f32>()?, &[2., 3.]);
/// # Ok::<(), candle_core::Error>(())
/// ```
pub fn get(&self, i: usize) -> Result<Tensor> {
let dims = self.dims();
if dims.is_empty() {
Ok(self.clone())
} else {
self.narrow(0, i, 1)?.reshape(&dims[1..])
}
}
/// Returns the sub-tensor fixing the index at `index` on the dimension `dim`.
///
/// ```rust
/// use candle_core::{Tensor, Device};
/// let tensor = Tensor::new(&[[0f32, 1.], [2., 3.], [4., 5.]], &Device::Cpu)?;
/// let t = tensor.get_on_dim(1, 0)?;
/// assert_eq!(t.to_vec1::<f32>()?, &[0., 2., 4.]);
/// let t = tensor.get_on_dim(1, 1)?;
/// assert_eq!(t.to_vec1::<f32>()?, &[1., 3., 5.]);
/// let t = tensor.get_on_dim(0, 1)?;
/// assert_eq!(t.to_vec1::<f32>()?, &[2., 3.]);
/// # Ok::<(), candle_core::Error>(())
/// ```
pub fn get_on_dim<D: Dim>(&self, dim: D, index: usize) -> Result<Tensor> {
let dim = dim.to_index(self.shape(), "get_on_dim")?;
self.narrow(dim, index, 1)?.squeeze(dim)
}
    /// Returns a tensor that is a transposed version of the input, the last two dimensions of the
    /// input are swapped.
///
/// ```rust
/// use candle_core::{Tensor, Device};
/// let tensor = Tensor::new(&[[0f32, 1.], [2., 3.], [4., 5.]], &Device::Cpu)?;
/// let tensor = tensor.t()?;
/// assert_eq!(tensor.to_vec2::<f32>()?, &[[0.0, 2.0, 4.0], [1.0, 3.0, 5.0]]);
/// # Ok::<(), candle_core::Error>(())
/// ```
pub fn t(&self) -> Result<Tensor> {
let rank = self.rank();
if rank < 2 {
Err(Error::UnexpectedNumberOfDims {
expected: 2,
got: rank,
shape: self.shape().clone(),
}
.bt())?
}
self.transpose(rank - 2, rank - 1)
}
/// Returns a tensor that is a transposed version of the input, the given dimensions are
/// swapped.
pub fn transpose<D1: Dim, D2: Dim>(&self, dim1: D1, dim2: D2) -> Result<Tensor> {
let dim1 = dim1.to_index(self.shape(), "transpose")?;
let dim2 = dim2.to_index(self.shape(), "transpose")?;
if dim1 == dim2 {
return Ok(self.clone());
}
let op = BackpropOp::new1(self, |t| Op::Transpose(t, dim1, dim2));
let tensor_ = Tensor_ {
id: TensorId::new(),
storage: self.storage.clone(),
layout: self.layout.transpose(dim1, dim2)?,
op,
is_variable: false,
dtype: self.dtype,
device: self.device.clone(),
};
Ok(Tensor(Arc::new(tensor_)))
}
/// Returns a tensor with the same data as the input where the dimensions have been permuted.
    /// `dims` must be a permutation, i.e. include each dimension index exactly once.
///
/// ```rust
/// use candle_core::{Tensor, Device};
/// let tensor = Tensor::arange(0u32, 120u32, &Device::Cpu)?.reshape((2, 3, 4, 5))?;
/// assert_eq!(tensor.dims(), &[2, 3, 4, 5]);
/// let tensor = tensor.permute((2, 3, 1, 0))?;
/// assert_eq!(tensor.dims(), &[4, 5, 3, 2]);
/// # Ok::<(), candle_core::Error>(())
/// ```
pub fn permute<D: Dims>(&self, dims: D) -> Result<Tensor> {
let dims = dims.to_indexes(self.shape(), "permute")?;
// O(n^2) permutation check but these arrays are small.
let is_permutation =
dims.len() == self.rank() && (0..dims.len()).all(|i| dims.contains(&i));
if !is_permutation {
bail!(
"dimension mismatch in permute, tensor {:?}, dims: {:?}",
self.dims(),
dims
)
}
let op = BackpropOp::new1(self, |t| Op::Permute(t, dims.clone()));
let tensor_ = Tensor_ {
id: TensorId::new(),
storage: self.storage.clone(),
layout: self.layout.permute(&dims)?,
op,
is_variable: false,
dtype: self.dtype,
device: self.device.clone(),
};
Ok(Tensor(Arc::new(tensor_)))
}
/// Returns true if the data is stored in a C contiguous (aka row major) way.
pub fn is_contiguous(&self) -> bool {
self.layout.is_contiguous()
}
/// Returns true if the data is stored in a Fortran contiguous (aka column major) way.
pub fn is_fortran_contiguous(&self) -> bool {
self.layout.is_fortran_contiguous()
}
/// Compared to clone, this copies the actual storage but may fail because of running out of
/// memory.
pub fn copy(&self) -> Result<Tensor> {
let op = BackpropOp::new1(self, Op::Copy);
let tensor_ = Tensor_ {
id: TensorId::new(),
storage: Arc::new(RwLock::new(self.storage().try_clone(self.layout())?)),
layout: self.layout.clone(),
op,
is_variable: false,
dtype: self.dtype,
device: self.device.clone(),
};
Ok(Tensor(Arc::new(tensor_)))
}
    /// Returns a new tensor detached from the current graph; gradients are not propagated through
    /// this new node. The storage of this tensor is shared with the initial tensor.
///
/// If the tensor is already detached from the computation graph, the same tensor is returned.
pub fn detach(&self) -> Tensor {
if self.op.is_none() && !self.is_variable {
self.clone()
} else {
let tensor_ = Tensor_ {
id: TensorId::new(),
storage: self.storage.clone(),
layout: self.layout.clone(),
op: BackpropOp::none(),
is_variable: false,
dtype: self.dtype,
device: self.device.clone(),
};
Tensor(Arc::new(tensor_))
}
}
/// If the target device is the same as the tensor device, only a shallow copy is performed.
pub fn to_device(&self, device: &Device) -> Result<Tensor> {
if self.device().same_device(device) {
Ok(self.clone())
} else {
let storage = match (&*self.storage(), device) {
(Storage::Cpu(storage), Device::Cuda(cuda)) => {
Storage::Cuda(cuda.storage_from_cpu_storage(storage)?)
}
(Storage::Cpu(storage), Device::Metal(metal)) => {
Storage::Metal(metal.storage_from_cpu_storage(storage)?)
}
(Storage::Cuda(storage), Device::Cpu) => Storage::Cpu(storage.to_cpu_storage()?),
(Storage::Metal(storage), Device::Cpu) => Storage::Cpu(storage.to_cpu_storage()?),
(Storage::Cuda(storage), Device::Cuda(cuda)) => {
// TODO: Avoid passing through the cpu storage here, especially if the gpu ids
// are the same.
let cpu_storage = storage.to_cpu_storage()?;
Storage::Cuda(cuda.storage_from_cpu_storage(&cpu_storage)?)
}
(Storage::Cpu(storage), Device::Cpu) => Storage::Cpu(storage.clone()),
_ => {
bail!("not implemented yet")
}
};
let op = BackpropOp::new1(self, Op::ToDevice);
let tensor_ = Tensor_ {
id: TensorId::new(),
storage: Arc::new(RwLock::new(storage)),
layout: self.layout.clone(),
op,
is_variable: false,
dtype: self.dtype,
device: device.clone(),
};
Ok(Tensor(Arc::new(tensor_)))
}
}
/// Returns a new tensor duplicating data from the original tensor. New dimensions are inserted
/// on the left.
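    ///
    /// A short sketch checking only the resulting shape:
    ///
    /// ```rust
    /// use candle_core::{Tensor, Device};
    /// let t = Tensor::new(&[1f32, 2., 3.], &Device::Cpu)?;
    /// let t = t.broadcast_left((2, 4))?;
    /// assert_eq!(t.dims(), &[2, 4, 3]);
    /// # Ok::<(), candle_core::Error>(())
    /// ```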
pub fn broadcast_left<S: Into<Shape>>(&self, left_shape: S) -> Result<Self> {
let left_shape = left_shape.into();
let mut dims = left_shape.into_dims();
dims.extend(self.dims());
self.broadcast_as(dims)
}
/// Broadcast the input tensor to the target shape. This returns an error if the input shape is
/// not compatible with the target shape.
///
/// If the input shape is `i_1, i_2, ... i_k`, the target shape has to have `k` dimensions or
/// more and shape `j_1, ..., j_l, t_1, t_2, ..., t_k`. The dimensions `j_1` to `j_l` can have
/// any value, the dimension `t_a` must be equal to `i_a` if `i_a` is different from 1. If
/// `i_a` is equal to 1, any value can be used.
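    ///
    /// For example, a sketch of the expected result:
    ///
    /// ```rust
    /// use candle_core::{Tensor, Device};
    /// let t = Tensor::new(&[[1f32], [2.]], &Device::Cpu)?;
    /// let t = t.broadcast_as((2, 3))?;
    /// assert_eq!(t.to_vec2::<f32>()?, &[[1., 1., 1.], [2., 2., 2.]]);
    /// # Ok::<(), candle_core::Error>(())
    /// ```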
pub fn broadcast_as<S: Into<Shape>>(&self, shape: S) -> Result<Self> {
let tensor_ = Tensor_ {
id: TensorId::new(),
storage: self.storage.clone(),
layout: self.layout.broadcast_as(shape)?,
op: BackpropOp::new1(self, Op::Broadcast),
is_variable: false,
dtype: self.dtype,
device: self.device.clone(),
};
Ok(Tensor(Arc::new(tensor_)))
}
/// An alias for broadcast_as.
pub fn expand<S: Into<Shape>>(&self, shape: S) -> Result<Self> {
self.broadcast_as(shape)
}
/// Casts the input tensor to the target `dtype`.
///
/// ```rust
/// use candle_core::{Tensor, Device};
/// let tensor = Tensor::new(3.14159265358979f64, &Device::Cpu)?;
/// assert_eq!(tensor.to_scalar::<f64>()?, 3.14159265358979);
/// let tensor = tensor.to_dtype(candle_core::DType::F32)?;
/// assert_eq!(tensor.to_scalar::<f32>()?, 3.1415927);
/// # Ok::<(), candle_core::Error>(())
/// ```
pub fn to_dtype(&self, dtype: DType) -> Result<Self> {
if self.dtype() == dtype {
Ok(self.clone())
} else {
let shape = self.shape();
let storage = self.storage().to_dtype(self.layout(), dtype)?;
let op = BackpropOp::new1(self, Op::ToDType);
Ok(from_storage(storage, shape.clone(), op, false))
}
}
/// Returns a tensor that is in row major order. This is the same as the original tensor if it
/// was already contiguous, otherwise a copy is triggered.
pub fn contiguous(&self) -> Result<Tensor> {
if self.is_contiguous() {
Ok(self.clone())
} else {
let shape = self.shape();
let mut storage = unsafe { self.device().alloc_uninit(shape, self.dtype())? };
self.storage()
.copy_strided_src(&mut storage, 0, self.layout())?;
let op = BackpropOp::new1(self, Op::Copy);
Ok(from_storage(storage, shape.clone(), op, false))
}
}
/// Returns a tensor that is in row major order. This always makes a copy.
pub fn force_contiguous(&self) -> Result<Tensor> {
let shape = self.shape();
let mut storage = unsafe { self.device().alloc_uninit(shape, self.dtype())? };
self.storage()
.copy_strided_src(&mut storage, 0, self.layout())?;
let op = BackpropOp::new1(self, Op::Copy);
Ok(from_storage(storage, shape.clone(), op, false))
}
/// Create a variable based on the values currently stored in a tensor. The storage is always
/// copied.
pub(crate) fn make_var(&self) -> Result<Tensor> {
let shape = self.shape().clone();
let mut storage = unsafe { self.device().alloc_uninit(&shape, self.dtype())? };
self.storage()
.copy_strided_src(&mut storage, 0, self.layout())?;
Ok(from_storage(storage, shape, BackpropOp::none(), true))
}
/// Reshape returns a tensor with the target shape provided that the number of elements of the
/// original tensor is the same.
/// If the input tensor is contiguous, this is a view on the original data. Otherwise this
/// allocates new storage and copies the data over; the returned tensor is always contiguous.
///
/// The shape can be specified using a tuple of `usize` and at most one `()` in which case
/// the behavior is the same as when using `-1` in PyTorch: this dimension size is adjusted so
/// as to match the number of elements in the tensor.
///
/// ```rust
/// # use candle_core::{Tensor, DType, Device, D};
/// let a = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
///
/// let c = a.reshape((1, 6))?;
/// assert_eq!(c.shape().dims(), &[1, 6]);
///
/// let c = a.reshape((3, 2))?;
/// assert_eq!(c.shape().dims(), &[3, 2]);
///
/// let c = a.reshape((2, (), 1))?;
/// assert_eq!(c.shape().dims(), &[2, 3, 1]);
///
/// # Ok::<(), candle_core::Error>(())
/// ```
pub fn reshape<S: crate::shape::ShapeWithOneHole>(&self, s: S) -> Result<Tensor> {
let shape = s.into_shape(self.elem_count())?;
if shape.elem_count() != self.elem_count() {
return Err(Error::ShapeMismatchBinaryOp {
lhs: self.shape().clone(),
rhs: shape,
op: "reshape",
}
.bt());
}
let op = BackpropOp::new1(self, Op::Reshape);
if self.is_contiguous() {
let tensor_ = Tensor_ {
id: TensorId::new(),
storage: self.storage.clone(),
layout: Layout::contiguous_with_offset(shape, self.layout.start_offset()),
op,
is_variable: false,
dtype: self.dtype,
device: self.device.clone(),
};
Ok(Tensor(Arc::new(tensor_)))
} else {
let mut storage = unsafe { self.device().alloc_uninit(&shape, self.dtype())? };
self.storage()
.copy_strided_src(&mut storage, 0, self.layout())?;
Ok(from_storage(storage, shape, op, false))
}
}
/// Creates a new tensor with the specified dimension removed if its size was one.
///
/// ```rust
/// # use candle_core::{Tensor, DType, Device, D};
/// let a = Tensor::zeros((2, 3, 1), DType::F32, &Device::Cpu)?;
///
/// let c = a.squeeze(2)?;
/// assert_eq!(c.shape().dims(), &[2, 3]);
///
/// let c = a.squeeze(D::Minus1)?;
/// assert_eq!(c.shape().dims(), &[2, 3]);
/// # Ok::<(), candle_core::Error>(())
/// ```
pub fn squeeze<D: Dim>(&self, dim: D) -> Result<Self> {
// The PyTorch semantics are to return the same tensor if the target dimension
// does not have a size of 1.
let dims = self.dims();
let dim = dim.to_index(self.shape(), "squeeze")?;
if dims[dim] == 1 {
let mut dims = dims.to_vec();
let mut strides = self.stride().to_vec();
dims.remove(dim);
strides.remove(dim);
let tensor_ = Tensor_ {
id: TensorId::new(),
storage: self.storage.clone(),
layout: Layout::new(dims.into(), strides, self.layout.start_offset()),
op: BackpropOp::new1(self, Op::Reshape),
is_variable: false,
dtype: self.dtype,
device: self.device.clone(),
};
Ok(Tensor(Arc::new(tensor_)))
} else {
Ok(self.clone())
}
}
/// Creates a new tensor with a dimension of size one inserted at the specified position.
///
/// ```rust
/// # use candle_core::{Tensor, DType, Device, D};
/// let a = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
///
/// let c = a.unsqueeze(0)?;
/// assert_eq!(c.shape().dims(), &[1, 2, 3]);
///
/// let c = a.unsqueeze(D::Minus1)?;
/// assert_eq!(c.shape().dims(), &[2, 3, 1]);
/// # Ok::<(), candle_core::Error>(())
/// ```
pub fn unsqueeze<D: Dim>(&self, dim: D) -> Result<Self> {
let mut dims = self.dims().to_vec();
let mut strides = self.stride().to_vec();
let dim = dim.to_index_plus_one(self.shape(), "unsqueeze")?;
// Cannot panic because to_index_plus_one already checks dimensions
dims.insert(dim, 1);
// Any stride would work here, but we pick one so as to maximize the probability that the
// result remains C contiguous.
let stride = if dim < strides.len() { strides[dim] } else { 1 };
strides.insert(dim, stride);
let tensor_ = Tensor_ {
id: TensorId::new(),
storage: self.storage.clone(),
layout: Layout::new(dims.into(), strides, self.layout.start_offset()),
op: BackpropOp::new1(self, Op::Reshape),
is_variable: false,
dtype: self.dtype,
device: self.device.clone(),
};
Ok(Tensor(Arc::new(tensor_)))
}
/// Stacks two or more tensors along a particular dimension.
///
/// All tensors must have the same rank, and the output has one additional dimension.
///
/// ```rust
/// # use candle_core::{Tensor, DType, Device};
/// let a = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
/// let b = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
///
/// let c = Tensor::stack(&[&a, &b], 0)?;
/// assert_eq!(c.shape().dims(), &[2, 2, 3]);
///
/// let c = Tensor::stack(&[&a, &b], 2)?;
/// assert_eq!(c.shape().dims(), &[2, 3, 2]);
/// # Ok::<(), candle_core::Error>(())
/// ```
pub fn stack<A: AsRef<Tensor>, D: Dim>(args: &[A], dim: D) -> Result<Self> {
if args.is_empty() {
Err(Error::OpRequiresAtLeastOneTensor { op: "stack" }.bt())?
}
let dim = dim.to_index_plus_one(args[0].as_ref().shape(), "stack")?;
let args = args
.iter()
.map(|t| t.as_ref().unsqueeze(dim))
.collect::<Result<Vec<_>>>()?;
Self::cat(&args, dim)
}
/// Pad the input tensor using 0s along dimension `dim`. This adds `left` elements before the
/// input tensor values and `right` elements after.
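///
/// A small sketch on a 1D tensor (CPU device, illustrative values):
///
/// ```rust
/// # use candle_core::{Tensor, Device};
/// let a = Tensor::new(&[1f32, 2., 3.], &Device::Cpu)?;
/// let b = a.pad_with_zeros(0, 1, 2)?;
/// assert_eq!(b.to_vec1::<f32>()?, [0.0, 1.0, 2.0, 3.0, 0.0, 0.0]);
/// # Ok::<(), candle_core::Error>(())
/// ```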
pub fn pad_with_zeros<D: Dim>(&self, dim: D, left: usize, right: usize) -> Result<Self> {
if left == 0 && right == 0 {
Ok(self.clone())
} else if left == 0 {
let dim = dim.to_index(self.shape(), "pad_with_zeros")?;
let mut dims = self.dims().to_vec();
dims[dim] = right;
let right = Tensor::zeros(dims.as_slice(), self.dtype, self.device())?;
Tensor::cat(&[self, &right], dim)
} else if right == 0 {
let dim = dim.to_index(self.shape(), "pad_with_zeros")?;
let mut dims = self.dims().to_vec();
dims[dim] = left;
let left = Tensor::zeros(dims.as_slice(), self.dtype, self.device())?;
Tensor::cat(&[&left, self], dim)
} else {
let dim = dim.to_index(self.shape(), "pad_with_zeros")?;
let mut dims = self.dims().to_vec();
dims[dim] = left;
let left = Tensor::zeros(dims.as_slice(), self.dtype, self.device())?;
dims[dim] = right;
let right = Tensor::zeros(dims.as_slice(), self.dtype, self.device())?;
Tensor::cat(&[&left, self, &right], dim)
}
}
/// Pad the input tensor by replicating its edge values along dimension `dim`. This adds `left`
/// copies of the first slice before the input tensor values and `right` copies of the last
/// slice after.
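///
/// A small sketch on a 1D tensor (CPU device, illustrative values):
///
/// ```rust
/// # use candle_core::{Tensor, Device};
/// let a = Tensor::new(&[1f32, 2., 3.], &Device::Cpu)?;
/// let b = a.pad_with_same(0, 1, 2)?;
/// assert_eq!(b.to_vec1::<f32>()?, [1.0, 1.0, 2.0, 3.0, 3.0, 3.0]);
/// # Ok::<(), candle_core::Error>(())
/// ```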
pub fn pad_with_same<D: Dim>(&self, dim: D, left: usize, right: usize) -> Result<Self> {
if left == 0 && right == 0 {
Ok(self.clone())
} else if self.elem_count() == 0 {
bail!("cannot use pad_with_same on an empty tensor")
} else if left == 0 {
let dim = dim.to_index(self.shape(), "pad_with_same")?;
let r = self.narrow(dim, self.dim(dim)? - 1, 1)?;
let mut v = vec![self];
for _ in 0..right {
v.push(&r)
}
Tensor::cat(&v, dim)
} else if right == 0 {
let dim = dim.to_index(self.shape(), "pad_with_same")?;
let l = self.narrow(dim, 0, 1)?;
let mut v = vec![];
for _ in 0..left {
v.push(&l)
}
v.push(self);
Tensor::cat(&v, dim)
} else {
let dim = dim.to_index(self.shape(), "pad_with_same")?;
let l = self.narrow(dim, 0, 1)?;
let r = self.narrow(dim, self.dim(dim)? - 1, 1)?;
let mut v = vec![];
for _ in 0..left {
v.push(&l)
}
v.push(self);
for _ in 0..right {
v.push(&r)
}
Tensor::cat(&v, dim)
}
}
/// Run the `forward` method of `m` on `self`.
pub fn apply<M: crate::Module>(&self, m: &M) -> Result<Self> {
m.forward(self)
}
/// Run the `forward_t` method of `m` on `self`, passing the `train` flag.
pub fn apply_t<M: crate::ModuleT>(&self, m: &M, train: bool) -> Result<Self> {
m.forward_t(self, train)
}
pub(crate) fn storage(&self) -> std::sync::RwLockReadGuard<'_, Storage> {
self.storage.read().unwrap()
}
pub(crate) fn storage_mut(&self) -> std::sync::RwLockWriteGuard<'_, Storage> {
self.storage.write().unwrap()
}
// If we extend the visibility of this function to be usable outside of this crate, we should
// make it unsafe.
pub(crate) fn storage_mut_and_layout(
&self,
) -> (std::sync::RwLockWriteGuard<'_, Storage>, &Layout) {
let storage = self.storage.write().unwrap();
(storage, &self.layout)
}
/// The storage used by this tensor, together with the layout to use to access it safely.
pub fn storage_and_layout(&self) -> (std::sync::RwLockReadGuard<'_, Storage>, &Layout) {
let storage = self.storage.read().unwrap();
(storage, &self.layout)
}
pub(crate) fn same_storage(&self, rhs: &Self) -> bool {
let lhs: &RwLock<Storage> = self.storage.as_ref();
let rhs: &RwLock<Storage> = rhs.storage.as_ref();
std::ptr::eq(lhs, rhs)
}
/// Normalize a 'relative' axis value: positive values are kept as-is, negative
/// values mean counting the dimensions from the back.
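///
/// For example, on a rank-2 tensor (CPU device assumed):
///
/// ```rust
/// # use candle_core::{Tensor, DType, Device};
/// let a = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
/// assert_eq!(a.normalize_axis(1)?, 1);
/// assert_eq!(a.normalize_axis(-1)?, 1);
/// assert_eq!(a.normalize_axis(-2)?, 0);
/// # Ok::<(), candle_core::Error>(())
/// ```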
pub fn normalize_axis(&self, axis: i64) -> Result<usize> {
let rank = self.rank() as i64;
if rank <= axis {
bail!("axis {axis} is too large, tensor rank {rank}")
} else if 0 <= axis {
Ok(axis as usize)
} else {
let naxis = rank + axis;
if naxis < 0 {
bail!("axis {axis} is too small, tensor rank {rank}")
}
Ok(naxis as usize)
}
}
/// Returns a lower triangular matrix of ones of size n by n.
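///
/// A small sketch for `n = 3` (CPU device assumed):
///
/// ```rust
/// # use candle_core::{Tensor, DType, Device};
/// let t = Tensor::tril2(3, DType::F32, &Device::Cpu)?;
/// assert_eq!(
///     t.to_vec2::<f32>()?,
///     [[1.0, 0.0, 0.0], [1.0, 1.0, 0.0], [1.0, 1.0, 1.0]]
/// );
/// # Ok::<(), candle_core::Error>(())
/// ```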
pub fn tril2(n: usize, dtype: DType, device: &Device) -> Result<Self> {
let t = Tensor::arange(0u32, n as u32, device)?;
let t1 = t.reshape((1, n))?.broadcast_as((n, n))?;
let t2 = t.reshape((n, 1))?.broadcast_as((n, n))?;
t1.le(&t2)?.to_dtype(dtype)
}
/// Returns an upper triangular matrix of ones of size n by n.
pub fn triu2(n: usize, dtype: DType, device: &Device) -> Result<Self> {
let t = Tensor::arange(0u32, n as u32, device)?;
let t1 = t.reshape((1, n))?.broadcast_as((n, n))?;
let t2 = t.reshape((n, 1))?.broadcast_as((n, n))?;
t1.ge(&t2)?.to_dtype(dtype)
}
/// Returns a matrix with a diagonal of ones of size n by n.
pub fn eye(n: usize, dtype: DType, device: &Device) -> Result<Self> {
let t = Tensor::arange(0u32, n as u32, device)?;
let t1 = t.reshape((1, n))?.broadcast_as((n, n))?;
let t2 = t.reshape((n, 1))?.broadcast_as((n, n))?;
t1.eq(&t2)?.to_dtype(dtype)
}
/// Returns the cumulative sum of elements of the input tensor summed over the specified
/// dimension.
///
/// This operation is most efficient when dim is the last dimension of the tensor.
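///
/// A small 1D sketch (CPU device, illustrative values):
///
/// ```rust
/// # use candle_core::{Tensor, Device};
/// let a = Tensor::new(&[1f32, 2., 3., 4.], &Device::Cpu)?;
/// let b = a.cumsum(0)?;
/// assert_eq!(b.to_vec1::<f32>()?, [1.0, 3.0, 6.0, 10.0]);
/// # Ok::<(), candle_core::Error>(())
/// ```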
pub fn cumsum<D: Dim>(&self, dim: D) -> Result<Self> {
let dim = dim.to_index(self.shape(), "cumsum")?;
let rank = self.rank();
if rank == 0 {
return Ok(self.clone());
}
let n_axis = self.dim(dim)?;
let triu = Tensor::triu2(n_axis, self.dtype(), self.device())?;
if rank == 1 {
self.unsqueeze(0)?.matmul(&triu)?.squeeze(0)
} else {
let last = rank - 1;
let t = self.transpose(dim, last)?;
let t = t.broadcast_matmul(&triu)?;
t.transpose(dim, last)
}
}
/// Returns a copy of `self` where the values within `ranges` have been replaced with the
/// content of `src`.
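///
/// A small sketch writing a `(1, 2)` block into a `(2, 3)` tensor of zeros (CPU device):
///
/// ```rust
/// # use candle_core::{Tensor, DType, Device};
/// let a = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
/// let src = Tensor::ones((1, 2), DType::F32, &Device::Cpu)?;
/// let b = a.slice_assign(&[1..2, 0..2], &src)?;
/// assert_eq!(b.to_vec2::<f32>()?, [[0.0, 0.0, 0.0], [1.0, 1.0, 0.0]]);
/// # Ok::<(), candle_core::Error>(())
/// ```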
pub fn slice_assign<D: std::ops::RangeBounds<usize>>(
&self,
ranges: &[D],
src: &Tensor,
) -> Result<Self> {
let src_dims = src.dims();
let self_dims = self.dims();
if self_dims.len() != src_dims.len() {
bail!(
"slice-assign requires input with the same rank {} <> {}",
self_dims.len(),
src_dims.len()
)
}
if self_dims.len() != ranges.len() {
bail!(
"slice-assign requires input with the same rank as there are ranges {} <> {}",
self_dims.len(),
ranges.len()
)
}
let mut src = src.clone();
let mut mask = Self::ones(src.shape(), DType::U8, src.device())?;
for (i, range) in ranges.iter().enumerate() {
let start_included = match range.start_bound() {
std::ops::Bound::Unbounded => 0,
std::ops::Bound::Included(v) => *v,
std::ops::Bound::Excluded(v) => *v + 1,
};
let end_excluded = match range.end_bound() {
std::ops::Bound::Unbounded => self_dims[i],
std::ops::Bound::Included(v) => *v + 1,
std::ops::Bound::Excluded(v) => *v,
};
if end_excluded <= start_included {
bail!("slice-assign: empty range for dim {i}, {start_included} {end_excluded}")
}
if self_dims[i] < end_excluded {
bail!(
"slice-assign: upper bound is out of range for dim {i}, {end_excluded} {}",
self_dims[i]
)
}
if end_excluded - start_included != src_dims[i] {
bail!(
"slice-assign: the range for dim {i} ({start_included}..{end_excluded}) does not match the size of src {}", src_dims[i]
)
}
src = src.pad_with_zeros(i, start_included, self_dims[i] - end_excluded)?;
mask = mask.pad_with_zeros(i, start_included, self_dims[i] - end_excluded)?
}
mask.where_cond(/* on_true= */ &src, /* on_false= */ self)
}
/// Returns log(sum(exp(tensor), dim)).
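///
/// A small sketch on a tensor of zeros, where each reduced entry is `ln(3)` (CPU device):
///
/// ```rust
/// # use candle_core::{Tensor, DType, Device, D};
/// let a = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
/// let b = a.log_sum_exp(D::Minus1)?;
/// assert_eq!(b.dims(), &[2]);
/// assert!((b.to_vec1::<f32>()?[0] - 3f32.ln()).abs() < 1e-6);
/// # Ok::<(), candle_core::Error>(())
/// ```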
pub fn log_sum_exp<D: Dims>(&self, sum_dims: D) -> Result<Self> {
let exp = self.exp()?;
let sum = exp.sum(sum_dims)?;
sum.log()
}
/// Pointwise pow operation.
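///
/// This is computed as `exp(rhs * ln(self))`, so it is only meaningful for strictly positive
/// bases. A small sketch (CPU device, illustrative values):
///
/// ```rust
/// # use candle_core::{Tensor, Device};
/// let base = Tensor::new(&[2f32, 3.], &Device::Cpu)?;
/// let exponent = Tensor::new(&[3f32, 2.], &Device::Cpu)?;
/// let res = base.pow(&exponent)?.to_vec1::<f32>()?;
/// // Results are approximate since the computation goes through log/exp.
/// assert!((res[0] - 8.0).abs() < 1e-4 && (res[1] - 9.0).abs() < 1e-4);
/// # Ok::<(), candle_core::Error>(())
/// ```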
pub fn pow(&self, rhs: &Tensor) -> Result<Self> {
rhs.mul(&self.log()?)?.exp()
}
/// Broadcasting version of `pow`.
pub fn broadcast_pow(&self, rhs: &Tensor) -> Result<Self> {
rhs.broadcast_mul(&self.log()?)?.exp()
}
}
macro_rules! bin_trait {
($trait:ident, $fn1:ident, $mul:expr, $add:expr) => {
impl<B: std::borrow::Borrow<Tensor>> std::ops::$trait<B> for Tensor {
type Output = Result<Tensor>;
fn $fn1(self, rhs: B) -> Self::Output {
Tensor::$fn1(&self, rhs.borrow())
}
}
impl<B: std::borrow::Borrow<Tensor>> std::ops::$trait<B> for &Tensor {
type Output = Result<Tensor>;
fn $fn1(self, rhs: B) -> Self::Output {
Tensor::$fn1(&self, rhs.borrow())
}
}
impl<B: std::borrow::Borrow<Tensor>> std::ops::$trait<Tensor> for Result<B> {
type Output = Result<Tensor>;
fn $fn1(self, rhs: Tensor) -> Self::Output {
Tensor::$fn1(self?.borrow(), &rhs)
}
}
impl<B: std::borrow::Borrow<Tensor>> std::ops::$trait<&Tensor> for Result<B> {
type Output = Result<Tensor>;
fn $fn1(self, rhs: &Tensor) -> Self::Output {
Tensor::$fn1(self?.borrow(), rhs)
}
}
impl<B: std::borrow::Borrow<Tensor>> std::ops::$trait<Result<B>> for Tensor {
type Output = Result<Tensor>;
fn $fn1(self, rhs: Result<B>) -> Self::Output {
Tensor::$fn1(&self, rhs?.borrow())
}
}
impl<B: std::borrow::Borrow<Tensor>> std::ops::$trait<Result<B>> for &Tensor {
type Output = Result<Tensor>;
fn $fn1(self, rhs: Result<B>) -> Self::Output {
Tensor::$fn1(&self, rhs?.borrow())
}
}
impl std::ops::$trait<f64> for Tensor {
type Output = Result<Tensor>;
fn $fn1(self, rhs: f64) -> Self::Output {
self.affine($mul(rhs), $add(rhs))
}
}
impl std::ops::$trait<f64> for &Tensor {
type Output = Result<Tensor>;
fn $fn1(self, rhs: f64) -> Self::Output {
self.affine($mul(rhs), $add(rhs))
}
}
};
}
bin_trait!(Add, add, |_| 1., |v| v);
bin_trait!(Sub, sub, |_| 1., |v: f64| -v);
bin_trait!(Mul, mul, |v| v, |_| 0.);
bin_trait!(Div, div, |v| 1. / v, |_| 0.);
impl std::ops::Add<Tensor> for f64 {
type Output = Result<Tensor>;
fn add(self, rhs: Tensor) -> Self::Output {
rhs + self
}
}
impl std::ops::Add<&Tensor> for f64 {
type Output = Result<Tensor>;
fn add(self, rhs: &Tensor) -> Self::Output {
rhs + self
}
}
impl std::ops::Mul<Tensor> for f64 {
type Output = Result<Tensor>;
fn mul(self, rhs: Tensor) -> Self::Output {
rhs * self
}
}
impl std::ops::Mul<&Tensor> for f64 {
type Output = Result<Tensor>;
fn mul(self, rhs: &Tensor) -> Self::Output {
rhs * self
}
}
impl std::ops::Sub<Tensor> for f64 {
type Output = Result<Tensor>;
fn sub(self, rhs: Tensor) -> Self::Output {
rhs.affine(-1., self)
}
}
impl std::ops::Sub<&Tensor> for f64 {
type Output = Result<Tensor>;
fn sub(self, rhs: &Tensor) -> Self::Output {
rhs.affine(-1., self)
}
}
impl std::ops::Div<Tensor> for f64 {
type Output = Result<Tensor>;
#[allow(clippy::suspicious_arithmetic_impl)]
fn div(self, rhs: Tensor) -> Self::Output {
rhs.recip()? * self
}
}
impl std::ops::Div<&Tensor> for f64 {
type Output = Result<Tensor>;
#[allow(clippy::suspicious_arithmetic_impl)]
fn div(self, rhs: &Tensor) -> Self::Output {
rhs.recip()? * self
}
}
use crate::{shape::Dim, Error, Result, Shape, Tensor};
impl Tensor {
/// Concatenates two or more tensors along a particular dimension.
///
/// All tensors must have the same rank, and the output will have
/// the same rank.
///
/// ```rust
/// # use candle_core::{Tensor, DType, Device};
/// let a = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
/// let b = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
///
/// let c = Tensor::cat(&[&a, &b], 0)?;
/// assert_eq!(c.shape().dims(), &[4, 3]);
///
/// let c = Tensor::cat(&[&a, &b], 1)?;
/// assert_eq!(c.shape().dims(), &[2, 6]);
/// # Ok::<(), candle_core::Error>(())
/// ```
pub fn cat<A: AsRef<Tensor>, D: Dim>(args: &[A], dim: D) -> Result<Self> {
if args.is_empty() {
Err(Error::OpRequiresAtLeastOneTensor { op: "cat" }.bt())?
}
let arg0 = args[0].as_ref();
if args.len() == 1 {
return Ok(arg0.clone());
}
let dim = dim.to_index(arg0.shape(), "cat")?;
for arg in args {
arg.as_ref().check_dim(dim, "cat")?;
}
for (arg_idx, arg) in args.iter().enumerate() {
let arg = arg.as_ref();
if arg0.rank() != arg.rank() {
Err(Error::UnexpectedNumberOfDims {
expected: arg0.rank(),
got: arg.rank(),
shape: arg.shape().clone(),
}
.bt())?
}
for (dim_idx, (v1, v2)) in arg0
.shape()
.dims()
.iter()
.zip(arg.shape().dims().iter())
.enumerate()
{
if dim_idx != dim && v1 != v2 {
Err(Error::ShapeMismatchCat {
dim: dim_idx,
first_shape: arg0.shape().clone(),
n: arg_idx + 1,
nth_shape: arg.shape().clone(),
}
.bt())?
}
}
}
let all_contiguous = args.iter().all(|v| v.as_ref().is_contiguous());
if all_contiguous {
Self::cat_contiguous(args, dim)
} else if dim == 0 {
Self::cat0(args)
} else {
let args: Vec<Tensor> = args
.iter()
.map(|a| a.as_ref().transpose(0, dim))
.collect::<Result<Vec<_>>>()?;
let cat = Self::cat0(&args)?;
cat.transpose(0, dim)
}
}
fn cat0<A: AsRef<Tensor>>(args: &[A]) -> Result<Self> {
if args.is_empty() {
Err(Error::OpRequiresAtLeastOneTensor { op: "cat" }.bt())?
}
let arg0 = args[0].as_ref();
if args.len() == 1 {
return Ok(arg0.clone());
}
let rank = arg0.rank();
let device = arg0.device();
let dtype = arg0.dtype();
let first_dims = arg0.shape().dims();
let mut cat_dims = first_dims.to_vec();
cat_dims[0] = 0;
let mut offsets = vec![0usize];
for (arg_idx, arg) in args.iter().enumerate() {
let arg = arg.as_ref();
if arg.dtype() != dtype {
Err(Error::DTypeMismatchBinaryOp {
lhs: dtype,
rhs: arg.dtype(),
op: "cat",
}
.bt())?
}
if arg.device().location() != device.location() {
Err(Error::DeviceMismatchBinaryOp {
lhs: device.location(),
rhs: arg.device().location(),
op: "cat",
}
.bt())?
}
if rank != arg.rank() {
Err(Error::UnexpectedNumberOfDims {
expected: rank,
got: arg.rank(),
shape: arg.shape().clone(),
}
.bt())?
}
for (dim_idx, (v1, v2)) in arg0
.shape()
.dims()
.iter()
.zip(arg.shape().dims().iter())
.enumerate()
{
if dim_idx == 0 {
cat_dims[0] += v2;
}
if dim_idx != 0 && v1 != v2 {
Err(Error::ShapeMismatchCat {
dim: dim_idx,
first_shape: arg0.shape().clone(),
n: arg_idx + 1,
nth_shape: arg.shape().clone(),
}
.bt())?
}
}
let next_offset = offsets.last().unwrap() + arg.elem_count();
offsets.push(next_offset);
}
let shape = Shape::from(cat_dims);
let op = crate::op::BackpropOp::new(args, |args| crate::op::Op::Cat(args, 0));
let mut storage = unsafe { device.alloc_uninit(&shape, dtype)? };
for (arg, &offset) in args.iter().zip(offsets.iter()) {
let arg = arg.as_ref();
arg.storage()
.copy_strided_src(&mut storage, offset, arg.layout())?;
}
Ok(crate::tensor::from_storage(storage, shape, op, false))
}
fn cat_contiguous<A: AsRef<Tensor>>(args: &[A], dim: usize) -> Result<Self> {
if args.is_empty() {
Err(Error::OpRequiresAtLeastOneTensor { op: "cat" }.bt())?
}
let arg0 = args[0].as_ref();
if args.len() == 1 {
return Ok(arg0.clone());
}
let rank = arg0.rank();
let device = arg0.device();
let dtype = arg0.dtype();
let first_dims = arg0.shape().dims();
let mut cat_dims = first_dims.to_vec();
cat_dims[dim] = 0;
for (arg_idx, arg) in args.iter().enumerate() {
let arg = arg.as_ref();
if arg.dtype() != dtype {
Err(Error::DTypeMismatchBinaryOp {
lhs: dtype,
rhs: arg.dtype(),
op: "cat",
}
.bt())?
}
if arg.device().location() != device.location() {
Err(Error::DeviceMismatchBinaryOp {
lhs: device.location(),
rhs: arg.device().location(),
op: "cat",
}
.bt())?
}
if rank != arg.rank() {
Err(Error::UnexpectedNumberOfDims {
expected: rank,
got: arg.rank(),
shape: arg.shape().clone(),
}
.bt())?
}
for (dim_idx, (v1, v2)) in arg0
.shape()
.dims()
.iter()
.zip(arg.shape().dims().iter())
.enumerate()
{
if dim_idx == dim {
cat_dims[dim] += v2;
}
if dim_idx != dim && v1 != v2 {
Err(Error::ShapeMismatchCat {
dim: dim_idx,
first_shape: arg0.shape().clone(),
n: arg_idx + 1,
nth_shape: arg.shape().clone(),
}
.bt())?
}
}
}
let cat_target_dim_len = cat_dims[dim];
let block_size: usize = cat_dims.iter().skip(1 + dim).product();
let shape = Shape::from(cat_dims);
let op = crate::op::BackpropOp::new(args, |args| crate::op::Op::Cat(args, dim));
let mut storage = unsafe { device.alloc_uninit(&shape, dtype)? };
let mut dst_o = 0;
for arg in args.iter() {
let arg = arg.as_ref();
let arg_dims = arg.shape().dims();
let d1: usize = arg_dims.iter().take(dim).product();
let d2 = block_size * arg_dims[dim];
let dst_s = block_size * cat_target_dim_len;
let src_o = arg.layout().start_offset();
arg.storage().copy2d(
&mut storage,
d1,
d2,
/* src_s */ d2,
dst_s,
src_o,
dst_o,
)?;
dst_o += d2;
}
Ok(crate::tensor::from_storage(storage, shape, op, false))
}
}
use crate::{Result, Tensor};
#[macro_export]
macro_rules! test_device {
// TODO: Switch to generating the two last arguments automatically once concat_idents is
// stable. https://github.com/rust-lang/rust/issues/29599
($fn_name: ident, $test_cpu: ident, $test_cuda: ident, $test_metal: ident) => {
#[test]
fn $test_cpu() -> Result<()> {
$fn_name(&Device::Cpu)
}
#[cfg(feature = "cuda")]
#[test]
fn $test_cuda() -> Result<()> {
$fn_name(&Device::new_cuda(0)?)
}
#[cfg(feature = "metal")]
#[test]
fn $test_metal() -> Result<()> {
$fn_name(&Device::new_metal(0)?)
}
};
}
pub fn to_vec0_round(t: &Tensor, digits: i32) -> Result<f32> {
let b = 10f32.powi(digits);
let t = t.to_vec0::<f32>()?;
Ok(f32::round(t * b) / b)
}
pub fn to_vec1_round(t: &Tensor, digits: i32) -> Result<Vec<f32>> {
let b = 10f32.powi(digits);
let t = t.to_vec1::<f32>()?;
let t = t.iter().map(|t| f32::round(t * b) / b).collect();
Ok(t)
}
pub fn to_vec2_round(t: &Tensor, digits: i32) -> Result<Vec<Vec<f32>>> {
let b = 10f32.powi(digits);
let t = t.to_vec2::<f32>()?;
let t = t
.iter()
.map(|t| t.iter().map(|t| f32::round(t * b) / b).collect())
.collect();
Ok(t)
}
pub fn to_vec3_round(t: &Tensor, digits: i32) -> Result<Vec<Vec<Vec<f32>>>> {
let b = 10f32.powi(digits);
let t = t.to_vec3::<f32>()?;
let t = t
.iter()
.map(|t| {
t.iter()
.map(|t| t.iter().map(|t| f32::round(t * b) / b).collect())
.collect()
})
.collect();
Ok(t)
}
use std::str::FromStr;
pub fn get_num_threads() -> usize {
// Respond to the same environment variable as rayon.
match std::env::var("RAYON_NUM_THREADS")
.ok()
.and_then(|s| usize::from_str(&s).ok())
{
Some(x) if x > 0 => x,
Some(_) | None => num_cpus::get(),
}
}
pub fn has_accelerate() -> bool {
cfg!(feature = "accelerate")
}
pub fn has_mkl() -> bool {
cfg!(any(feature = "mkl", feature = "mkl-dynamic"))
}
pub fn cuda_is_available() -> bool {
cfg!(feature = "cuda")
}
pub fn metal_is_available() -> bool {
cfg!(feature = "metal")
}
pub fn with_avx() -> bool {
cfg!(target_feature = "avx")
}
pub fn with_neon() -> bool {
cfg!(target_feature = "neon")
}
pub fn with_simd128() -> bool {
cfg!(target_feature = "simd128")
}
pub fn with_f16c() -> bool {
cfg!(target_feature = "f16c")
}
// Variables are wrappers around tensors that can be modified; they are typically used to hold
// weights that get updated by gradient descent.
// We do not expose a public way to create variables as this would break the invariant that the
// tensor within a variable always has `is_variable` set to `true`.
use crate::{DType, Device, Error, Result, Shape, Tensor};
/// A variable is a wrapper around a tensor; unlike plain tensors, which are immutable, the
/// content of a variable can be modified.
#[derive(Clone, Debug)]
pub struct Var(Tensor);
impl std::fmt::Display for Var {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
std::fmt::Display::fmt(&self.0, f)
}
}
impl std::ops::Deref for Var {
type Target = Tensor;
fn deref(&self) -> &Self::Target {
self.0.as_ref()
}
}
impl Var {
pub fn zeros<S: Into<Shape>>(shape: S, dtype: DType, device: &Device) -> Result<Self> {
let inner = Tensor::zeros_impl(shape, dtype, device, true)?;
Ok(Self(inner))
}
pub fn ones<S: Into<Shape>>(shape: S, dtype: DType, device: &Device) -> Result<Self> {
let inner = Tensor::ones_impl(shape, dtype, device, true)?;
Ok(Self(inner))
}
pub fn from_tensor(t: &Tensor) -> Result<Self> {
let inner = t.make_var()?;
Ok(Self(inner))
}
pub fn rand_f64<S: Into<Shape>>(
lo: f64,
up: f64,
s: S,
dtype: DType,
device: &Device,
) -> Result<Self> {
let inner = Tensor::rand_f64_impl(lo, up, s, dtype, device, true)?;
Ok(Self(inner))
}
pub fn randn_f64<S: Into<Shape>>(
mean: f64,
std: f64,
s: S,
dtype: DType,
device: &Device,
) -> Result<Self> {
let inner = Tensor::randn_f64_impl(mean, std, s, dtype, device, true)?;
Ok(Self(inner))
}
pub fn rand<S: Into<Shape>, T: crate::FloatDType>(
lo: T,
up: T,
s: S,
device: &Device,
) -> Result<Self> {
let inner = Tensor::rand_impl(lo, up, s, device, true)?;
Ok(Self(inner))
}
pub fn randn<S: Into<Shape>, T: crate::FloatDType>(
mean: T,
std: T,
s: S,
device: &Device,
) -> Result<Self> {
let inner = Tensor::randn_impl(mean, std, s, device, true)?;
Ok(Self(inner))
}
/// Creates a new tensor on the specified device using the content and shape of the input.
/// This is similar to `new` but the resulting tensor is a variable.
pub fn new<A: crate::device::NdArray>(array: A, device: &Device) -> Result<Self> {
let shape = array.shape()?;
let inner = Tensor::new_impl(array, shape, device, true)?;
Ok(Self(inner))
}
pub fn from_vec<S: Into<Shape>, D: crate::WithDType>(
data: Vec<D>,
shape: S,
device: &Device,
) -> Result<Self> {
let inner = Tensor::from_vec_impl(data, shape, device, true)?;
Ok(Self(inner))
}
pub fn from_slice<S: Into<Shape>, D: crate::WithDType>(
array: &[D],
shape: S,
device: &Device,
) -> Result<Self> {
let inner = Tensor::new_impl(array, shape.into(), device, true)?;
Ok(Self(inner))
}
pub fn as_detached_tensor(&self) -> Tensor {
self.0.detach()
}
pub fn as_tensor(&self) -> &Tensor {
&self.0
}
/// Consumes this `Var` and returns the underlying tensor.
pub fn into_inner(self) -> Tensor {
self.0
}
/// Sets the content of the inner tensor; this does not require a mutable reference as interior
/// mutability is used.
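///
/// A minimal sketch, assuming a CPU device:
///
/// ```rust
/// # use candle_core::{DType, Device, Tensor, Var};
/// let v = Var::zeros((2, 2), DType::F32, &Device::Cpu)?;
/// let ones = Tensor::ones((2, 2), DType::F32, &Device::Cpu)?;
/// v.set(&ones)?;
/// assert_eq!(v.to_vec2::<f32>()?, [[1.0, 1.0], [1.0, 1.0]]);
/// # Ok::<(), candle_core::Error>(())
/// ```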
pub fn set(&self, src: &Tensor) -> Result<()> {
if self.same_storage(src) {
let msg = "cannot set a variable to a tensor that is derived from its value";
Err(Error::CannotSetVar { msg }.bt())?
}
let (mut dst, layout) = self.storage_mut_and_layout();
if !layout.is_contiguous() {
let msg = "cannot set a non-contiguous variable";
Err(Error::CannotSetVar { msg }.bt())?
}
let (src, src_l) = src.storage_and_layout();
if layout.shape() != src_l.shape() {
Err(Error::ShapeMismatchBinaryOp {
lhs: layout.shape().clone(),
rhs: src_l.shape().clone(),
op: "set",
}
.bt())?
}
src.copy_strided_src(&mut dst, layout.start_offset(), src_l)?;
Ok(())
}
}
use anyhow::Result;
use candle_core::{test_device, test_utils, Device, IndexOp, Tensor};
/* This test is based on the following script.
import torch
torch.manual_seed(4242)
t = torch.randn((1, 4, 5))
w = torch.randn((2, 4, 3))
print(t.flatten())
print(w.flatten())
res = torch.nn.functional.conv1d(t, w)
print(res.flatten())
res = torch.nn.functional.conv1d(t, w, padding=1)
print(res.flatten())
w_t = w.transpose(0, 1)
res = torch.nn.functional.conv_transpose1d(t, w_t)
print(res.shape)
print(res)
res = torch.nn.functional.conv_transpose1d(t, w_t, groups=2)
print(res.shape)
print(res)
*/
fn conv1d(dev: &Device) -> Result<()> {
let t = Tensor::new(
&[
0.4056f32, -0.8689, -0.0773, -1.5630, 1.2279, -0.9287, -1.7030, 0.1370, 0.1866, 0.4145,
1.8025, -0.1536, 2.2013, -0.6836, 0.2477, 1.3127, -0.6957, 0.3278, -1.0124, 0.5599,
],
dev,
)?
.reshape((1, 4, 5))?;
let w = Tensor::new(
&[
-0.8404f32, -0.3490, 0.0130, 1.3123, 0.1763, -1.9249, 1.4270, 0.9421, 0.8670, -0.7181,
-1.1111, 0.8869, -1.2429, 1.8357, 1.6052, -1.3844, 0.3951, -1.2036, 0.6686, 1.6261,
-0.6451, -0.0840, -1.4247, 0.5512,
],
dev,
)?
.reshape((2, 4, 3))?;
let res = t.conv1d(&w, 0, 1, 1, 1)?;
assert_eq!(res.dims(), [1, 2, 3]);
assert_eq!(
test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
[2.6357, -1.3336, 4.1393, -1.1784, 3.5675, 0.5069]
);
let res = t.conv1d(&w, /*padding*/ 1, 1, 1, 1)?;
assert_eq!(res.dims(), [1, 2, 5]);
// Same as pytorch default padding: use zeros.
assert_eq!(
test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
[2.4509, 2.6357, -1.3336, 4.1393, 0.5657, 1.8091, -1.1784, 3.5675, 0.5069, 3.3352]
);
let w = w.transpose(0, 1)?;
// The CPU kernels applied in the contiguous and non-contiguous cases are different.
for w in [w.clone(), w.contiguous()?] {
let res = t.conv_transpose1d(&w, 0, 0, 1, 1, 1)?;
assert_eq!(res.dims(), [1, 2, 7]);
assert_eq!(
test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
[
0.0699, -1.2899, 8.3018, 5.5873, 2.4572, -2.6143, -0.0706, 1.8765, 4.8318, 1.1538,
4.7076, -5.9745, -0.8276, 1.621
],
);
let res = t.conv_transpose1d(&w, 0, 0, 1, 1, 2)?;
assert_eq!(res.dims(), [1, 4, 7]);
assert_eq!(
test_utils::to_vec2_round(&res.squeeze(0)?, 4)?,
[
[-1.5596, -1.8099, 2.0407, 4.8764, -0.1743, -0.735, -0.7819],
[0.7816, 3.8152, -0.5926, 2.2515, -5.1844, -0.3157, 1.4721],
[1.6295, 0.52, 6.2611, 0.7109, 2.6315, -1.8793, 0.7113],
[1.0949, 1.0166, 1.7464, 2.4561, -0.79, -0.5119, 0.1488]
]
);
}
Ok(())
}
fn conv1d_small(dev: &Device) -> Result<()> {
let t = Tensor::new(&[0.4056f32, -0.8689, -0.0773, -1.5630], dev)?.reshape((1, 1, 4))?;
let w = Tensor::new(&[1f32, 0., 0.], dev)?.reshape((1, 1, 3))?;
let res = t.conv1d(&w, 0, 1, 1, 1)?;
assert_eq!(res.dims(), [1, 1, 2]);
assert_eq!(
test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
[0.4056, -0.8689]
);
let res = t.conv1d(&w, /*padding*/ 1, 1, 1, 1)?;
assert_eq!(res.dims(), [1, 1, 4]);
assert_eq!(
test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
[0.0, 0.4056, -0.8689, -0.0773],
);
Ok(())
}
/* This test is based on the following script.
import torch
torch.manual_seed(4242)
t = torch.randn((1, 4, 5, 5))
w = torch.randn((2, 4, 3, 3))
print(t.flatten())
print(w.flatten())
res = torch.nn.functional.conv2d(t, w)
print(res.flatten())
w_t = w.transpose(0, 1)
res = torch.nn.functional.conv_transpose2d(t, w_t)
print(res.shape)
print(res)
res = torch.nn.functional.conv2d(t, w, dilation=2)
print(res.shape)
print(res[0])
res = torch.nn.functional.conv_transpose2d(t, w_t, dilation=2)
print(res.shape)
print(res)
*/
fn conv2d(dev: &Device) -> Result<()> {
let t = Tensor::new(
&[
0.4056f32, -0.8689, -0.0773, -1.5630, -2.8012, -1.5059, 0.3972, 1.0852, 0.4997, 3.0616,
1.6541, 0.0964, -0.8338, -1.6523, -0.8323, -0.1699, 0.0823, 0.3526, 0.6843, 0.2395,
1.2279, -0.9287, -1.7030, 0.1370, 0.6047, 0.3770, -0.6266, 0.3529, 2.2013, -0.6836,
0.2477, 1.3127, -0.2260, 0.2622, -1.2974, -0.8140, -0.8404, -0.3490, 0.0130, 1.3123,
1.7569, -0.3956, -1.8255, 0.1727, -0.3538, 2.6941, 1.0529, 0.4219, -0.2071, 1.1586,
0.4717, 0.3865, -0.5690, -0.5010, -0.1310, 0.7796, 0.6630, -0.2021, 2.6090, 0.2049,
0.6466, -0.5042, -0.0603, -1.6538, -1.2429, 1.8357, 1.6052, -1.3844, 0.3323, -1.3712,
0.9634, -0.4799, -0.6451, -0.0840, -1.4247, 0.5512, -0.1747, -0.5509, -0.3742, 0.3790,
-0.4431, -0.4720, -0.7890, 0.2620, 0.7875, 0.5377, -0.6779, -0.8088, 1.9098, 1.2006,
-0.8, -0.4983, 1.5480, 0.8265, -0.1025, 0.5138, 0.5748, 0.3821, -0.4607, 0.0085,
],
dev,
)?;
let w = Tensor::new(
&[
-0.9325f32, 0.6451, -0.8537, 0.2378, 0.8764, -0.1832, 0.2987, -0.6488, -0.2273,
-2.4184, -0.1192, -0.4821, -0.5079, -0.5766, -2.4729, 1.6734, 0.4558, 0.2851, 1.1514,
-0.9013, 1.0662, -0.1817, -0.0259, 0.1709, 0.5367, 0.7513, 0.8086, -2.2586, -0.5027,
0.9141, -1.3086, -1.3343, -1.5669, -0.1657, 0.7958, 0.1432, 0.3896, -0.4501, 0.1667,
0.0714, -0.0952, 1.2970, -0.1674, -0.3178, 1.0677, 0.3060, 0.7080, 0.1914, 1.1679,
-0.3602, 1.9265, -1.8626, -0.5112, -0.0982, 0.2621, 0.6565, 0.5908, 1.0089, -0.1646,
1.8032, -0.6286, 0.2016, -0.3370, 1.2555, 0.8009, -0.6488, -0.4652, -1.5685, 1.5860,
0.5583, 0.4623, 0.6026,
],
dev,
)?;
let t = t.reshape((1, 4, 5, 5))?;
let w = w.reshape((2, 4, 3, 3))?;
let res = t.conv2d(&w, 0, 1, 1, 1)?;
assert_eq!(res.dims(), [1, 2, 3, 3]);
assert_eq!(
test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
[
-4.2812, 2.0923, 5.2187, 7.5184, 0.752, -14.9426, 10.0087, 4.391, 0.2918, 1.6715,
10.389, 3.6023, -4.2808, 0.2672, 5.3646, -5.2023, -2.1955, -9.4075
]
);
let res = t.conv_transpose2d(&w.transpose(0, 1)?, 0, 0, 1, 1)?;
assert_eq!(res.dims(), [1, 2, 7, 7]);
assert_eq!(
test_utils::to_vec3_round(&res.i(0)?, 4)?,
[
[
[-1.9918, 2.6797, -0.4599, -1.6037, 1.4131, -2.4012, 2.9277],
[1.8016, -3.5361, 1.0757, 3.5395, -8.2168, -3.2023, 0.5375],
[0.8243, 1.8675, 7.8929, -4.0746, -6.4415, 5.1139, 1.6889],
[0.2722, 8.9679, 3.3477, 1.8514, -4.2896, -3.8228, -7.5632],
[-8.5412, -5.8142, -7.1587, -1.6095, 0.4651, 0.2748, -2.0985],
[2.0833, -0.6482, -12.1692, -4.1284, -2.9765, -0.0656, -4.5114],
[5.307, 2.6957, 2.3087, 1.0478, 0.7808, -1.1519, -0.9579]
],
[
[1.089, 0.1872, -0.6408, -0.9897, 0.8503, 1.1019, -0.9211],
[-0.1741, -0.2915, 4.2472, 1.9417, 1.65, 0.6303, -4.7131],
[1.6555, 2.4026, -2.9293, 2.9953, 0.5328, 3.5873, -0.9621],
[-1.4289, -3.2787, 4.1747, -6.0341, -4.6341, -5.7945, 4.142],
[7.5973, 6.4431, 5.9872, 2.1639, -8.6566, 3.3143, -3.4059],
[-0.8775, -3.048, 11.6543, 0.6442, 2.3218, -0.4765, 1.1516],
[-5.5423, -2.5188, 1.0754, -0.0563, -2.9386, -1.1504, 1.0171]
]
]
);
// Dilations.
let res = t.conv2d(&w, 0, 1, 2, 1)?;
assert_eq!(res.dims(), [1, 2, 1, 1]);
assert_eq!(
test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
[2.45, -2.3504],
);
// Transpose and dilations.
let res = t.conv_transpose2d(&w.transpose(0, 1)?, 0, 0, 1, 2)?;
assert_eq!(res.dims(), [1, 2, 9, 9]);
assert_eq!(
test_utils::to_vec3_round(&res.i(0)?, 4)?,
[
[
[-1.9918, 3.1652, -0.6778, -4.3442, 4.4351, 0.6652, -3.0124, -0.6031, 2.9277],
[2.7036, -1.7156, -0.3969, 1.0516, 1.6381, -2.8886, -0.205, 2.4682, -1.0499],
[-0.9459, 3.1631, 3.707, -4.8369, -8.5166, -1.4496, -2.7559, -3.2698, 1.4376],
[-0.2157, 3.7786, -2.0252, -4.2633, 3.6731, -1.5142, 5.9391, -0.2622, -0.141],
[-6.8121, -3.1744, 1.5945, 3.0637, -9.6088, 1.4446, 2.9489, -3.0082, -7.3822],
[0.2371, 3.3303, 0.3861, 2.2646, -4.6784, 4.1235, -0.0109, 0.3176, -0.03],
[-2.5339, -2.9564, -3.4518, -4.4594, -9.1873, -1.9709, -0.4676, 0.51, -3.5024],
[4.007, 0.3067, -2.2954, 1.1105, -0.1992, 1.6372, -2.9268, 0.2807, -1.2787],
[5.307, 1.1317, 1.3518, 0.9049, 3.8116, -0.4075, -0.8874, -0.2241, -0.9579]
],
[
[1.089, -0.6483, 0.0726, -0.4752, -1.3283, 1.7103, 1.0703, 0.1076, -0.9211],
[-0.8629, 0.1376, 0.3202, 2.0955, 0.9696, 2.8988, -1.0012, 1.5049, -0.1278],
[1.9286, -1.5255, -2.9563, 2.4589, 3.3611, -0.6951, 0.3525, -1.7724, -5.9861],
[1.1226, 2.1561, 3.6417, 4.7546, -0.692, 4.4126, -5.1902, 6.0805, 2.3185],
[1.0111, 0.3604, 0.6432, -3.6605, 7.9517, -9.2955, -5.2988, -3.7803, -2.0642],
[3.3172, -1.7967, -3.6576, -2.0942, 1.3158, 0.112, -1.7405, 2.9167, 0.7957],
[5.1001, 1.8995, -1.8639, 1.1262, 9.9629, 2.683, -3.6319, -1.1607, 0.5856],
[-4.8445, -0.5642, 4.2317, 0.0856, 1.2267, -0.5712, 1.736, 1.0997, 0.6908],
[-5.5423, -1.1831, -1.2176, 0.0843, 0.0446, -0.7545, -2.4798, -0.0827, 1.0171]
]
]
);
Ok(())
}
/* This test is based on the following script.
import torch
torch.manual_seed(4242)
t = torch.randn((1, 2, 3, 3))
w = torch.randn((1, 2, 1, 1))
print(t.flatten())
print(w.flatten())
res = torch.nn.functional.conv2d(t, w)
print(res.flatten())
w_t = w.transpose(0, 1)
res = torch.nn.functional.conv_transpose2d(t, w_t)
print(res.shape)
print(res.flatten())
t_t = w.transpose(0, 1)
res = torch.nn.functional.conv_transpose2d(t_t, w)
print(res.shape)
print(res.flatten())
*/
fn conv2d_small(dev: &Device) -> Result<()> {
let t = Tensor::new(
&[
0.4056f32, -0.8689, 0.6843, 0.2395, 1.2279, -0.9287, -1.7030, 0.1370, 0.1866, 0.4145,
-0.6266, 0.3529, 2.2013, -0.6836, 0.2477, 1.3127, -0.6957, 0.3278,
],
dev,
)?;
let w = Tensor::new(&[-0.9259f32, 1.3017], dev)?;
let t = t.reshape((1, 2, 3, 3))?;
let w = w.reshape((1, 2, 1, 1))?;
let res = t.conv2d(&w, 0, 1, 1, 1)?;
assert_eq!(res.dims(), [1, 1, 3, 3]);
assert_eq!(
test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
[0.164, -0.0111, -0.1742, 2.6437, -2.0268, 1.1823, 3.2855, -1.0324, 0.2539]
);
let res = t.conv2d(&w, 2, 1, 1, 1)?;
assert_eq!(res.dims(), [1, 1, 7, 7]);
assert_eq!(
test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
[
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1640,
-0.0111, -0.1742, 0.0, 0.0, 0.0, 0.0, 2.6437, -2.0268, 1.1823, 0.0, 0.0, 0.0, 0.0,
3.2855, -1.0324, 0.2539, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0
]
);
let res = t.conv_transpose2d(&w.transpose(0, 1)?, 0, 0, 1, 1)?;
assert_eq!(res.dims(), [1, 1, 3, 3]);
assert_eq!(
test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
[0.164, -0.0111, -0.1742, 2.6437, -2.0268, 1.1823, 3.2855, -1.0324, 0.2539],
);
let res = t.transpose(0, 1)?.conv_transpose2d(&w, 0, 0, 1, 1)?;
assert_eq!(res.dims(), [2, 2, 3, 3]);
assert_eq!(
test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
[
-0.3755, 0.8045, -0.6336, -0.2218, -1.1369, 0.8599, 1.5768, -0.1268, -0.1728, 0.528,
-1.131, 0.8908, 0.3118, 1.5984, -1.2089, -2.2168, 0.1783, 0.2429, -0.3838, 0.5802,
-0.3268, -2.0382, 0.6329, -0.2293, -1.2154, 0.6441, -0.3035, 0.5396, -0.8156, 0.4594,
2.8654, -0.8898, 0.3224, 1.7087, -0.9056, 0.4267
]
);
Ok(())
}
fn conv2d_smaller(dev: &Device) -> Result<()> {
let t = Tensor::new(
&[
0.4056f32, -0.8689, 0.6843, 0.2395, 1.2279, -0.9287, -1.7030, 0.1370, 0.1866,
],
dev,
)?;
let w = Tensor::new(&[1f32, 1., 1., 1., 1., 1., 1., 1., 1.], dev)?;
let t = t.reshape((1, 1, 3, 3))?;
let w = w.reshape((1, 1, 3, 3))?;
let res = t.conv2d(&w, 0, 1, 1, 1)?;
assert_eq!(res.dims(), [1, 1, 1, 1]);
assert_eq!(
test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
[-0.6197]
);
Ok(())
}
/* This test is based on the following script.
import torch
torch.manual_seed(4242)
t = torch.randn((1, 2, 4, 2))
w = torch.randn((1, 2, 1, 1))
print(t.flatten())
print(w.flatten())
res = torch.nn.functional.conv2d(t, w)
print(res.flatten())
*/
fn conv2d_non_square(dev: &Device) -> Result<()> {
let t = Tensor::new(
&[
0.4056f32, -0.8689, -0.0773, -1.5630, -2.8012, -1.5059, 0.3972, 1.0852, 0.4997, 3.0616,
1.6541, 0.0964, -0.8338, -1.6523, -0.8323, -0.1699,
],
dev,
)?;
let w = Tensor::new(&[-1.1351f32, 1.3841], dev)?;
let t = t.reshape((1, 2, 4, 2))?;
let w = w.reshape((1, 2, 1, 1))?;
let res = t.conv2d(&w, 0, 1, 1, 1)?;
assert_eq!(res.dims(), [1, 1, 4, 2]);
assert_eq!(
test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
[0.2312, 5.2238, 2.3772, 1.9076, 2.0256, -0.5776, -1.6028, -1.467]
);
Ok(())
}
/*
import torch
torch.manual_seed(4242)
t = torch.randn((1, 4, 5, 5), requires_grad=True)
w = torch.randn((2, 4, 3, 3), requires_grad=True)
print(t.flatten())
print(w.flatten())
res = torch.nn.functional.conv2d(t, w)
print(res.flatten())
loss = (res ** 2).sum()
print(loss)
loss.backward()
print(t.grad.shape)
print(t.grad.flatten())
print(w.grad.shape)
print(w.grad.flatten())
t.grad.zero_()
w.grad.zero_()
res = torch.nn.functional.conv2d(t, w, stride=2)
print(res.flatten())
loss = (res ** 2).sum()
print(loss)
loss.backward()
print(t.grad.shape)
print(t.grad[0])
print(w.grad.shape)
print(w.grad[0])
*/
fn conv2d_grad(dev: &Device) -> Result<()> {
// conv-transposes are not implemented for metal
use candle_core::Var;
let t = Var::from_slice(
&[
0.4056f32, -0.8689, -0.0773, -1.5630, -2.8012, -1.5059, 0.3972, 1.0852, 0.4997, 3.0616,
1.6541, 0.0964, -0.8338, -1.6523, -0.8323, -0.1699, 0.0823, 0.3526, 0.6843, 0.2395,
1.2279, -0.9287, -1.7030, 0.1370, 0.6047, 0.3770, -0.6266, 0.3529, 2.2013, -0.6836,
0.2477, 1.3127, -0.2260, 0.2622, -1.2974, -0.8140, -0.8404, -0.3490, 0.0130, 1.3123,
1.7569, -0.3956, -1.8255, 0.1727, -0.3538, 2.6941, 1.0529, 0.4219, -0.2071, 1.1586,
0.4717, 0.3865, -0.5690, -0.5010, -0.1310, 0.7796, 0.6630, -0.2021, 2.6090, 0.2049,
0.6466, -0.5042, -0.0603, -1.6538, -1.2429, 1.8357, 1.6052, -1.3844, 0.3323, -1.3712,
0.9634, -0.4799, -0.6451, -0.0840, -1.4247, 0.5512, -0.1747, -0.5509, -0.3742, 0.3790,
-0.4431, -0.4720, -0.7890, 0.2620, 0.7875, 0.5377, -0.6779, -0.8088, 1.9098, 1.2006,
-0.8, -0.4983, 1.5480, 0.8265, -0.1025, 0.5138, 0.5748, 0.3821, -0.4607, 0.0085,
],
(1, 4, 5, 5),
dev,
)?;
let w = Var::from_slice(
&[
-0.9325f32, 0.6451, -0.8537, 0.2378, 0.8764, -0.1832, 0.2987, -0.6488, -0.2273,
-2.4184, -0.1192, -0.4821, -0.5079, -0.5766, -2.4729, 1.6734, 0.4558, 0.2851, 1.1514,
-0.9013, 1.0662, -0.1817, -0.0259, 0.1709, 0.5367, 0.7513, 0.8086, -2.2586, -0.5027,
0.9141, -1.3086, -1.3343, -1.5669, -0.1657, 0.7958, 0.1432, 0.3896, -0.4501, 0.1667,
0.0714, -0.0952, 1.2970, -0.1674, -0.3178, 1.0677, 0.3060, 0.7080, 0.1914, 1.1679,
-0.3602, 1.9265, -1.8626, -0.5112, -0.0982, 0.2621, 0.6565, 0.5908, 1.0089, -0.1646,
1.8032, -0.6286, 0.2016, -0.3370, 1.2555, 0.8009, -0.6488, -0.4652, -1.5685, 1.5860,
0.5583, 0.4623, 0.6026,
],
(2, 4, 3, 3),
dev,
)?;
let res = t.conv2d(&w, 0, 1, 1, 1)?;
let loss = res.sqr()?.sum_all()?;
assert_eq!(test_utils::to_vec0_round(&loss, 2)?, 741.12f32);
let grads = loss.backward()?;
let grad_t = grads.get(&t).unwrap();
let grad_w = grads.get(&w).unwrap();
assert_eq!(grad_t.dims(), [1, 4, 5, 5]);
assert_eq!(grad_w.dims(), [2, 4, 3, 3]);
assert_eq!(
test_utils::to_vec1_round(&grad_t.flatten_all()?, 2)?,
[
9.29, -2.84, -5.71, 3.38, -7.71, -19.15, 7.02, 29.1, 9.34, 34.73, -22.87, 24.35,
-39.88, -14.01, 21.08, 9.94, 13.63, -34.68, 11.21, -6.26, 7.72, -6.32, -16.64, -1.08,
-20.22, 21.73, -0.37, -4.06, 5.82, -3.65, -30.73, 14.55, 87.7, 31.6, 4.53, -89.78,
-75.37, -57.43, -7.56, 92.96, 18.79, -4.63, -159.75, -42.47, -47.26, 52.88, 37.32,
49.0, 12.82, 2.01, -8.98, 20.18, 16.62, 12.06, 15.38, 20.0, 2.57, -15.22, 72.62,
-10.75, 2.25, -31.2, 3.75, -0.2, 9.76, -0.68, 5.21, -40.44, -22.59, -61.61, 17.28,
20.41, 37.55, 5.23, 6.81, 23.54, 23.62, -9.99, -9.13, 4.87, -35.06, -26.1, 63.48,
25.81, -39.21, -70.68, -46.96, 2.33, 41.81, 82.42, -28.63, -11.78, -35.33, -10.28,
-28.57, -9.13, 7.21, -9.05, -9.62, -11.25
]
);
assert_eq!(
test_utils::to_vec1_round(&grad_w.flatten_all()?, 2)?,
[
-28.92, -22.88, -141.23, 73.35, 61.07, 47.81, -20.0, -73.71, -41.82, -13.59, 21.5,
28.72, 28.57, -46.85, -90.19, 143.61, 16.68, 7.43, 18.88, -90.81, -20.29, 54.79, 82.63,
22.94, 77.81, -16.39, -13.2, 9.34, -40.39, -26.62, 5.33, -60.91, 9.09, -59.37, 7.08,
58.64, 5.55, 20.52, 2.5, -17.25, -6.8, 22.21, 30.15, -7.52, -37.46, 5.67, 22.58, 9.03,
47.05, 17.61, 37.31, -98.13, -14.61, -4.8, -6.36, 44.69, 23.34, 8.37, -13.52, 80.05,
-34.24, -16.36, -12.31, 1.92, -33.62, -14.1, -49.23, -7.39, 11.5, -9.98, 9.66, 29.6
]
);
// Same as before but with stride.
let res = t.conv2d(&w, 0, 2, 1, 1)?;
let loss = res.sqr()?.sum_all()?;
assert_eq!(test_utils::to_vec0_round(&loss, 2)?, 277.16f32);
let grads = loss.backward()?;
let grad_t = grads.get(&t).unwrap();
let grad_w = grads.get(&w).unwrap();
assert_eq!(grad_t.dims(), [1, 4, 5, 5]);
assert_eq!(grad_w.dims(), [2, 4, 3, 3]);
assert_eq!(
test_utils::to_vec3_round(&grad_t.i(0)?, 2)?,
[
[
[9.29, -7.03, 0.94, 3.49, -7.71],
[-1.8, -7.82, 8.9, 8.46, 7.43],
[-25.84, 22.09, -19.27, -0.22, 1.69],
[4.02, 18.53, -18.37, 2.3, -24.51],
[7.72, -9.68, -12.34, 5.6, -20.22]
],
[
[21.73, 3.39, -18.27, 3.86, -3.65],
[8.25, 3.73, 30.73, -8.61, -11.93],
[-72.15, -15.36, -17.53, -12.32, -1.61],
[-22.32, -7.79, -91.82, 6.44, -37.69],
[52.88, 14.44, 42.75, 9.88, 2.01]
],
[
[-8.98, 9.91, 6.75, -4.68, 15.38],
[4.93, -0.33, 9.94, -1.46, 14.78],
[13.62, -30.63, 3.96, -3.58, -4.48],
[-14.13, 1.19, -34.43, 3.08, -33.83],
[17.28, 12.94, 31.83, -3.35, 6.81]
],
[
[23.54, 6.98, -24.52, 0.52, 4.87],
[9.65, 6.18, 1.71, -25.23, -4.93],
[-54.99, -23.66, 3.19, -3.73, 18.58],
[-21.35, -10.39, -39.88, 28.73, -30.76],
[-9.13, 11.12, -14.0, -8.23, -11.25]
]
]
);
assert_eq!(
test_utils::to_vec3_round(&grad_w.i(0)?, 2)?,
[
[
[28.34, -7.91, -45.75],
[21.03, 3.86, 29.86],
[0.72, -36.58, -35.28]
],
[
[-16.04, 11.53, -16.38],
[29.62, -16.32, -48.35],
[57.5, 28.29, 25.81]
],
[
[2.93, -19.6, 1.57],
[27.15, 53.88, -24.64],
[12.74, -22.6, -26.2]
],
[
[-0.18, -14.86, -6.82],
[-19.55, -2.72, 45.9],
[-2.54, 36.97, 27.11]
]
]
);
// Replicate the issue from https://github.com/huggingface/candle/issues/1212
let res = t.i((.., .., 0..4, 0..4))?.conv2d(&w, 0, 2, 1, 1)?;
let loss = res.sqr()?.sum_all()?;
assert_eq!(test_utils::to_vec0_round(&loss, 2)?, 21.12f32);
let grads = loss.backward()?;
let grad_t = grads.get(&t).unwrap();
let grad_w = grads.get(&w).unwrap();
assert_eq!(grad_t.dims(), [1, 4, 5, 5]);
assert_eq!(grad_w.dims(), [2, 4, 3, 3]);
assert_eq!(
test_utils::to_vec3_round(&grad_t.i(0)?, 2)?,
[
[
[9.29, -7.03, 7.87, 0.0, 0.0],
[-1.8, -7.82, 5.9, 0.0, 0.0],
[-3.12, 4.49, 5.52, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0, 0.0]
],
[
[21.73, 3.39, 4.77, 0.0, 0.0],
[8.25, 3.73, 27.61, 0.0, 0.0],
[-20.55, -5.61, -2.77, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0, 0.0]
],
[
[-8.98, 9.91, -7.15, 0.0, 0.0],
[4.93, -0.33, 4.56, 0.0, 0.0],
[-6.7, -5.76, -8.05, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0, 0.0]
],
[
[23.54, 6.98, -10.0, 0.0, 0.0],
[9.65, 6.18, 18.72, 0.0, 0.0],
[3.29, -5.27, 0.79, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0, 0.0]
]
]
);
assert_eq!(
test_utils::to_vec3_round(&grad_w.i(0)?, 2)?,
[
[
[-3.47, 7.44, 0.66],
[12.89, -3.4, -9.29],
[-14.16, -0.83, 7.14]
],
[
[-3.23, 5.37, -3.02],
[-2.12, -11.24, 1.94],
[6.97, 7.2, 2.99]
],
[
[-4.04, -3.31, 4.87],
[-6.68, -5.68, 1.73],
[-5.54, 4.32, 0.52]
],
[[-4.72, 1.5, 4.72], [3.79, 4.04, 6.76], [-4.6, 5.8, 6.93]]
]
);
// Conv Transpose 2d Test
// Tested against the following Python script:
// import torch
// torch.manual_seed(4242)
// padding = 4
// outpadding = 2
// dilation = 3
// stride = 3
// input = torch.randn((1, 4, 7, 5), requires_grad=True)
// kernel = torch.randn((4, 2, 3, 5), requires_grad=True)
// print("input", input.flatten())
// print("kernel", kernel.flatten())
// res = torch.nn.functional.conv_transpose2d(
// input,
// kernel,
// stride=stride,
// padding=padding,
// dilation=dilation,
// output_padding=outpadding,
// )
// res.retain_grad()
// print(res.shape)
// loss = (res**2).sum()
// print(loss)
// loss.backward()
// print(input.grad.shape)
// print("input grad", torch.round(input.grad, decimals=1))
// print(kernel.grad.shape)
// print("kernel grad", torch.round(kernel.grad.flatten(), decimals=1))
let padding = 4;
let outpadding = 2;
let dilation = 3;
let stride = 3;
let t = Var::from_slice(
&[
0.4056_f32, -0.8689, -0.0773, -1.5630, -2.8012, -1.5059, 0.3972, 1.0852, 0.4997,
3.0616, 1.6541, 0.0964, -0.8338, -1.6523, -0.8323, -0.1699, 0.0823, 0.3526, 0.6843,
0.2395, 1.2279, -0.9287, -1.7030, 0.1370, 0.6047, 0.3770, -0.6266, 0.3529, 2.2013,
-0.6836, 0.2477, 1.3127, -0.2260, 0.2622, -1.2974, -0.8140, -0.8404, -0.3490, 0.0130,
1.3123, 1.7569, -0.3956, -1.8255, 0.1727, -0.3538, 2.6941, 1.0529, 0.4219, -0.2071,
1.1586, 0.4717, 0.3865, -0.5690, -0.5010, -0.1310, 0.7796, 0.6630, -0.2021, 2.6090,
0.2049, 0.6466, -0.5042, -0.0603, -1.6538, -1.2429, 1.8357, 1.6052, -1.3844, 0.3323,
-1.3712, 0.9634, -0.4799, -0.6451, -0.0840, -1.4247, 0.5512, -0.1747, -0.5509, -0.3742,
0.3790, -0.4431, -0.4720, -0.7890, 0.2620, 0.5411, -1.1715, -2.4997, 2.3249, -0.8912,
-0.4733, -0.5701, -2.8888, -1.4112, -0.5471, -0.9234, -1.1660, 0.4189, -0.7465,
-0.6473, 0.1402, 0.7875, 0.5377, -0.6779, -0.8088, -0.4864, -0.2312, 0.9279, 0.1264,
1.5480, 0.8265, -0.1025, 0.5138, -0.2512, 0.1576, 1.2705, 0.3641, -0.9325, 0.6451,
-0.8537, 0.2378, 0.1794, 0.2752, -0.3687, -1.1149, -0.1410, -0.5829, -0.0892, 1.4258,
-2.2789, 0.5270, 0.1825, 1.7007, -0.5263, -0.2954, 0.4440, 0.5537, 0.3492, 0.6186,
1.6475, 0.2219,
],
(1, 4, 7, 5),
dev,
)?;
#[rustfmt::skip]
let w = Var::from_slice(
&[
-1.1744_f32, 0.3266, 2.5893, 1.0142, 0.1763, 0.7752, 0.6604, 0.2029, -0.2145, 0.7234,
-0.3441, -1.5400, -0.6333, 0.6613, 0.2083, 0.6230, -1.7002, 0.3393, 0.4049, 1.0762,
0.2723, 1.4181, 0.0029, -0.2122, 1.7668, 1.4168, 0.3320, -0.2719, 0.7932, -0.7204,
0.4447, 0.1211, 0.5908, 1.0089, -0.1646, 1.8033, -0.6286, 0.2016, -0.3370, 1.2555,
0.8009, -0.6488, -0.4652, -1.5685, 1.5860, 0.5583, 0.4623, 0.6026, 0.8828, 2.4990,
0.6811, -0.3369, 1.3320, 1.7669, -1.1067, 1.2958, -0.9415, -0.9655, -0.4462, 0.7181,
0.5181, -1.1658, -1.8467, -0.7763, 1.2769, 0.8651, 0.9890, 1.5092, 0.7207, -0.8481,
0.7417, 0.3375, -1.2685, 1.4572, 1.0915, 0.1093, -0.8550, -0.5831, -0.6309, -0.2509,
0.5220, -0.0914, 0.7900, 0.1096, 0.3258, 0.2723, -1.0942, -0.3393, -0.1653, 0.5732,
-0.8014, 1.8194, -1.9023, 0.2127, 1.8636, -0.8979, 0.1927, -0.2778, 0.3105, 0.0071,
-1.1823, 0.2476, -0.7178, -1.3821, 1.0769, -0.4376, -0.9967, -0.1227, 1.6197, -1.0604,
0.1372, 0.8141, -0.6163, 0.7304, -0.8285, 2.0636, -0.7176, 0.2495, -0.2581, -0.4478,
],
(4, 2, 3, 5),
dev,
)?;
let res = t.conv_transpose2d(&w, padding, outpadding, stride, dilation)?;
let loss = res.sqr()?.sum_all()?;
assert_eq!(test_utils::to_vec0_round(&loss, 0)?, 2904.0);
let grads = loss.backward()?;
let grad_t = grads.get(&t).unwrap();
let grad_w = grads.get(&w).unwrap();
assert_eq!(grad_t.dims(), [1, 4, 7, 5]);
assert_eq!(grad_w.dims(), [4, 2, 3, 5]);
assert_eq!(
test_utils::to_vec1_round(&grad_w.flatten_all()?, 1)?,
[
// torch gets 89.1
-89.0, -135.3, 136.7, 102.0, -53.4, 117.9, 118.6, -43.9, -218.0, -58.5, -114.3, -150.0,
-15.6, 172.1, 66.3, -64.3, -27.9, -19.8, 31.7, 62.1, 5.5, 92.6, 28.2, -29.6, 55.9,
52.7, -72.7, -119.8, 53.8, -25.5, 128.8, 19.3, 68.0, 190.9, -64.1, -86.2, -111.2,
106.6, -67.7, 37.8, 115.9, 50.4, -77.7, -54.9, 22.3, -4.6, 89.8, 61.7, 122.4, 192.6,
-27.8, -104.6, 57.0, 166.4, 27.1, 6.1, 18.7, -93.2, 31.5, 168.2, -3.7, -99.5, -55.5,
-10.8, 17.5, 20.8, 16.9, 43.8, 42.0, -89.2, 18.8, -9.6, -84.1, 212.6, 19.7, -50.0,
-52.0, -40.0, -166.6, -73.2, -10.8, -73.3, 31.5, -23.4, -79.3, -27.0, -84.4, -42.9,
-20.3, 51.8, -16.7, 76.3, -120.5, -65.8, 96.5, -10.7, -45.9, -88.1, 65.4, -7.0, -1.5,
92.8, -25.1, -114.2, -5.8, -14.8, -51.2, -20.7, 54.2, -79.8, 47.7, -29.2, -8.8, 53.5,
-28.4, 85.0, -18.3, 107.0, 28.3, -71.8
]
);
assert_eq!(
test_utils::to_vec3_round(&grad_t.i(0)?, 1)?,
[
[
[32.3, -41.6, -24.0, 14.1, 17.6],
[-11.8, 72.5, 87.6, 46.4, 61.5],
[115.0, 108.5, -48.6, -63.4, -50.0],
[51.3, 5.4, 31.3, 91.1, -30.9],
[52.7, 92.8, -68.0, -47.0, 83.0],
// pytorch gets -107.1
[-10.2, -107.0, -5.4, 213.1, -31.4],
[-2.4, 65.1, 9.2, -146.2, -24.2]
],
[
[-72.6, -63.9, -61.9, 45.3, 33.0],
[79.3, -0.5, -26.2, 78.2, 42.7],
[90.9, 141.6, 40.1, -62.7, 37.0],
[32.8, 198.2, -0.8, -31.1, 27.3],
// torch gets 48.0
[34.5, 34.9, -47.9, 127.6, -12.3],
[-61.4, -3.2, -2.9, -10.9, -16.6],
[74.6, 60.1, -68.9, 34.5, -50.4]
],
[
[37.5, -56.9, -43.6, -13.5, -9.9],
[40.0, 97.3, 28.6, 14.2, -30.1],
[-22.3, -126.3, -68.8, -8.2, 26.1],
[-32.9, 37.3, 108.5, -54.8, 29.6],
[34.9, -176.9, -125.0, -28.3, -13.9],
[-54.9, 142.6, 62.1, -80.4, -65.6],
[7.4, -91.1, -67.6, 35.0, 39.7]
],
[
[-57.2, -40.9, -10.1, 32.6, 29.4],
[18.7, -18.0, 29.5, -1.2, 59.2],
[-14.0, -74.4, 19.8, -117.0, 58.2],
[-21.8, 163.5, -71.1, -99.0, 80.9],
[-58.9, -10.9, 93.8, -139.6, 98.0],
// torch gets 54.5
[-54.4, 135.3, 6.0, -79.1, 134.6],
[27.5, -76.0, 43.4, -2.8, -7.8]
]
]
);
Ok(())
}
test_device!(conv1d, conv1d_cpu, conv1d_gpu, conv1d_metal);
test_device!(
conv1d_small,
conv1d_small_cpu,
conv1d_small_gpu,
conv1d_small_metal
);
test_device!(conv2d, conv2d_cpu, conv2d_gpu, conv2d_metal);
test_device!(
conv2d_non_square,
conv2d_non_square_cpu,
conv2d_non_square_gpu,
conv2d_non_square_metal
);
test_device!(
conv2d_small,
conv2d_small_cpu,
conv2d_small_gpu,
conv2d_small_metal
);
test_device!(
conv2d_smaller,
conv2d_smaller_cpu,
conv2d_smaller_gpu,
conv2d_smaller_metal
);
test_device!(
conv2d_grad,
conv2d_grad_cpu,
conv2d_grad_gpu,
conv2_grad_metal
);
use candle_core::backend::BackendStorage;
use candle_core::cpu_backend;
use candle_core::test_utils::to_vec1_round;
use candle_core::{CpuStorage, CustomOp1, DType, Device, Error, Layout, Result, Shape, Tensor};
fn fwd<T: num_traits::Float>(v: T, alpha: f64) -> T {
if v.is_sign_positive() {
v
} else {
let alpha = T::from(alpha).unwrap_or(T::nan());
(v.exp() - T::one()) * alpha
}
}
struct Elu {
alpha: f64,
}
impl CustomOp1 for Elu {
fn name(&self) -> &'static str {
"elu"
}
fn cpu_fwd(&self, s: &CpuStorage, l: &Layout) -> Result<(CpuStorage, Shape)> {
let storage = candle_core::map_dtype!(
"elu",
s,
|s| cpu_backend::unary_map(s, l, |v| fwd(v, self.alpha)),
(BF16, F16, F32, F64)
);
Ok((storage, l.shape().clone()))
}
}
#[test]
fn custom_op1_no_backward() -> Result<()> {
let cpu = &Device::Cpu;
let t = Tensor::arange(0u32, 12u32, cpu)?.to_dtype(DType::F32)?;
let t = (t - 5.)?;
let elu_t = t.apply_op1_no_bwd(&Elu { alpha: 1. })?;
assert_eq!(
to_vec1_round(&elu_t, 4)?,
&[-0.9933, -0.9817, -0.9502, -0.8647, -0.6321, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
);
Ok(())
}
// Define a similar struct as Elu but with backward support.
fn bwd<T: num_traits::Float>(v: T, alpha: f64) -> T {
if v.is_sign_positive() {
T::one()
} else {
let alpha = T::from(alpha).unwrap_or(T::nan());
v.exp() * alpha
}
}
struct EluBackward {
alpha: f64,
}
impl CustomOp1 for EluBackward {
fn name(&self) -> &'static str {
"elu-bwd"
}
fn cpu_fwd(&self, s: &CpuStorage, l: &Layout) -> Result<(CpuStorage, Shape)> {
let storage = candle_core::map_dtype!(
"elu-bwd",
s,
|s| cpu_backend::unary_map(s, l, |v| bwd(v, self.alpha)),
(BF16, F16, F32, F64)
);
Ok((storage, l.shape().clone()))
}
}
struct EluWithBackward(Elu);
impl EluWithBackward {
fn new(alpha: f64) -> Self {
Self(Elu { alpha })
}
}
impl CustomOp1 for EluWithBackward {
fn name(&self) -> &'static str {
"elu"
}
fn cpu_fwd(&self, s: &CpuStorage, l: &Layout) -> Result<(CpuStorage, Shape)> {
self.0.cpu_fwd(s, l)
}
fn bwd(&self, arg: &Tensor, _res: &Tensor, grad_res: &Tensor) -> Result<Option<Tensor>> {
let alpha = self.0.alpha;
let bwd = arg.apply_op1(EluBackward { alpha })?;
Ok(Some(grad_res.mul(&bwd)?))
}
}
#[test]
fn custom_op1_with_backward() -> Result<()> {
let cpu = &Device::Cpu;
let t = candle_core::Var::new(&[-2f32, 0f32, 2f32], cpu)?;
let elu_t = t.apply_op1(EluWithBackward::new(2.))?;
assert_eq!(to_vec1_round(&elu_t, 4)?, &[-1.7293, 0.0, 2.0]);
let grads = elu_t.backward()?;
let grad_x = grads.get(&t).unwrap();
assert_eq!(to_vec1_round(grad_x, 4)?, [0.2707, 1.0, 1.0]);
Ok(())
}
impl candle_core::InplaceOp1 for Elu {
fn name(&self) -> &'static str {
"elu"
}
fn cpu_fwd(&self, s: &mut CpuStorage, _l: &Layout) -> Result<()> {
let alpha = self.alpha;
match s {
CpuStorage::BF16(s) => s.iter_mut().for_each(|v| *v = fwd(*v, alpha)),
CpuStorage::F16(s) => s.iter_mut().for_each(|v| *v = fwd(*v, alpha)),
CpuStorage::F32(s) => s.iter_mut().for_each(|v| *v = fwd(*v, alpha)),
CpuStorage::F64(s) => s.iter_mut().for_each(|v| *v = fwd(*v, alpha)),
_ => candle_core::bail!("unsupported dtype for inplace elu"),
}
Ok(())
}
}
#[test]
fn inplace_op1() -> Result<()> {
let cpu = &Device::Cpu;
let t = Tensor::arange(0u32, 12u32, cpu)?.to_dtype(DType::F32)?;
let t = (t - 5.)?;
t.inplace_op1(&Elu { alpha: 1. })?;
assert_eq!(
to_vec1_round(&t, 4)?,
&[-0.9933, -0.9817, -0.9502, -0.8647, -0.6321, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
);
Ok(())
}
use anyhow::Result;
use candle_core::{DType, Device::Cpu, Tensor};
#[test]
fn display_scalar() -> Result<()> {
let t = Tensor::new(1234u32, &Cpu)?;
let s = format!("{t}");
assert_eq!(&s, "[1234]\nTensor[[], u32]");
let t = t.to_dtype(DType::F32)?.neg()?;
let s = format!("{}", (&t / 10.0)?);
assert_eq!(&s, "[-123.4000]\nTensor[[], f32]");
let s = format!("{}", (&t / 1e8)?);
assert_eq!(&s, "[-1.2340e-5]\nTensor[[], f32]");
let s = format!("{}", (&t * 1e8)?);
assert_eq!(&s, "[-1.2340e11]\nTensor[[], f32]");
let s = format!("{}", (&t * 0.)?);
assert_eq!(&s, "[0.]\nTensor[[], f32]");
Ok(())
}
#[test]
fn display_vector() -> Result<()> {
let t = Tensor::new::<&[u32; 0]>(&[], &Cpu)?;
let s = format!("{t}");
assert_eq!(&s, "[]\nTensor[[0], u32]");
let t = Tensor::new(&[0.1234567, 1.0, -1.2, 4.1, f64::NAN], &Cpu)?;
let s = format!("{t}");
assert_eq!(
&s,
"[ 0.1235, 1.0000, -1.2000, 4.1000, NaN]\nTensor[[5], f64]"
);
let t = (Tensor::ones(50, DType::F32, &Cpu)? * 42.)?;
let s = format!("\n{t}");
let expected = r#"
[42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42.,
42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42.,
42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42.,
42., 42.]
Tensor[[50], f32]"#;
assert_eq!(&s, expected);
let t = (Tensor::ones(11000, DType::F32, &Cpu)? * 42.)?;
let s = format!("{t}");
assert_eq!(
&s,
"[42., 42., 42., ..., 42., 42., 42.]\nTensor[[11000], f32]"
);
Ok(())
}
#[test]
fn display_multi_dim() -> Result<()> {
let t = (Tensor::ones((200, 100), DType::F32, &Cpu)? * 42.)?;
let s = format!("\n{t}");
let expected = r#"
[[42., 42., 42., ..., 42., 42., 42.],
[42., 42., 42., ..., 42., 42., 42.],
[42., 42., 42., ..., 42., 42., 42.],
...
[42., 42., 42., ..., 42., 42., 42.],
[42., 42., 42., ..., 42., 42., 42.],
[42., 42., 42., ..., 42., 42., 42.]]
Tensor[[200, 100], f32]"#;
assert_eq!(&s, expected);
let t = t.reshape(&[2, 1, 1, 100, 100])?;
let t = format!("\n{t}");
let expected = r#"
[[[[[42., 42., 42., ..., 42., 42., 42.],
[42., 42., 42., ..., 42., 42., 42.],
[42., 42., 42., ..., 42., 42., 42.],
...
[42., 42., 42., ..., 42., 42., 42.],
[42., 42., 42., ..., 42., 42., 42.],
[42., 42., 42., ..., 42., 42., 42.]]]],
[[[[42., 42., 42., ..., 42., 42., 42.],
[42., 42., 42., ..., 42., 42., 42.],
[42., 42., 42., ..., 42., 42., 42.],
...
[42., 42., 42., ..., 42., 42., 42.],
[42., 42., 42., ..., 42., 42., 42.],
[42., 42., 42., ..., 42., 42., 42.]]]]]
Tensor[[2, 1, 1, 100, 100], f32]"#;
assert_eq!(&t, expected);
Ok(())
}