Initial commit

25d2752f · yongshk · 25d2752f · 25d2752f · 25d2752f · 25d2752f
Commit 25d2752f authored May 29, 2025 by yongshk
20 changed files
--- a/candle-core/tests/fortran_tensor_3d.pth
+++ b/candle-core/tests/fortran_tensor_3d.pth
--- a/candle-core/tests/grad_tests.rs
+++ b/candle-core/tests/grad_tests.rs
+#![allow(clippy::approx_constant)]
+use anyhow::{Context, Result};
+use candle_core::{test_device, test_utils, Device, Shape, Tensor, Var};
+/// Checks values and gradients of the polynomial y = x^2 + 5.x + 4 on `device`.
+fn simple_grad(device: &Device) -> Result<()> {
+    let var = Var::new(&[3f32, 1., 4.], device)?;
+    let input = var.as_tensor();
+    let squared = (input * input)?;
+    let scaled = input * 5f64;
+    let output = ((squared + scaled)? + 4f64)?;
+    let grad_store = output.backward()?;
+    let d_input = grad_store.get(input).context("no grad for x")?;
+    // The input reads back with its initial values.
+    let expected_input: [f32; 3] = [3., 1., 4.];
+    assert_eq!(input.to_vec1::<f32>()?, expected_input);
+    // y = x^2 + 5.x + 4
+    let expected_output: [f32; 3] = [28., 10., 40.];
+    assert_eq!(output.to_vec1::<f32>()?, expected_output);
+    // dy/dx = 2.x + 5
+    let expected_grad: [f32; 3] = [11., 7., 13.];
+    assert_eq!(d_input.to_vec1::<f32>()?, expected_grad);
+    Ok(())
+}
+/// Checks gradients through `sum_keepdim`, with and without a trailing `squeeze`.
+fn sum_grad(device: &Device) -> Result<()> {
+    let var = Var::new(&[3f32, 1., 4.], device)?;
+    let input = var.as_tensor();
+    // y = 2.sum(x^2), kept as a one element vector.
+    let total = input.sqr()?.sum_keepdim(0)?;
+    let doubled = (total * 2.)?;
+    let grad_store = doubled.backward()?;
+    let d_input = grad_store.get(input).context("no grad for x")?;
+    assert_eq!(doubled.to_vec1::<f32>()?, [52.]);
+    // dy/dx = 4.x
+    assert_eq!(d_input.to_vec1::<f32>()?, &[12., 4., 16.]);
+    // Same computation, additionally squeezed so that the result is read back as a scalar.
+    let total = input.sqr()?.sum_keepdim(0)?;
+    let scalar = (total * 2.)?.squeeze(0)?;
+    let grad_store = scalar.backward()?;
+    let d_input = grad_store.get(input).context("no grad for x")?;
+    assert_eq!(scalar.to_scalar::<f32>()?, 52.);
+    // dy/dx = 4.x, unchanged by the squeeze.
+    assert_eq!(d_input.to_vec1::<f32>()?, &[12., 4., 16.]);
+    Ok(())
+}
+/// Checks the shape and values of the gradients of a batched matmul for both operands.
+fn matmul_grad(device: &Device) -> Result<()> {
+    // Both operands hold the values 0..12, only their shapes differ.
+    let values: Vec<_> = (0..12).map(|i| i as f32).collect();
+    let lhs = Var::from_slice(&values, (2, 2, 3), device)?;
+    let rhs = Var::from_slice(&values, (2, 3, 2), device)?;
+    let product = lhs.matmul(&rhs)?;
+    let grad_store = product.backward()?;
+    let d_lhs = grad_store.get(&lhs).context("no grad for x")?;
+    let d_rhs = grad_store.get(&rhs).context("no grad for y")?;
+    // Each gradient has the shape of the operand it belongs to.
+    assert_eq!(d_lhs.shape(), &Shape::from((2, 2, 3)));
+    assert_eq!(d_rhs.shape(), &Shape::from((2, 3, 2)));
+    let expected_d_lhs: [[[f32; 3]; 2]; 2] = [
+        [[1., 5., 9.], [1., 5., 9.]],
+        [[13., 17., 21.], [13., 17., 21.]],
+    ];
+    assert_eq!(&*d_lhs.to_vec3::<f32>()?, &expected_d_lhs);
+    let expected_d_rhs: [[[f32; 2]; 3]; 2] = [
+        [[3., 3.], [5., 5.], [7., 7.]],
+        [[15., 15.], [17., 17.], [19., 19.]],
+    ];
+    assert_eq!(&*d_rhs.to_vec3::<f32>()?, &expected_d_rhs);
+    Ok(())
+}
+// Minimal gradient descent on a single scalar variable, minimizing (x - 4.2)^2.
+fn grad_descent(device: &Device) -> Result<()> {
+    let param = Var::new(0f32, device)?;
+    let step_size = 0.1;
+    for _ in 0..100 {
+        let current = param.as_tensor();
+        // The two factors are built separately, so each is its own node in the graph.
+        let lhs = (current - 4.2)?;
+        let rhs = (current - 4.2)?;
+        let loss = (lhs * rhs)?;
+        let grad_store = loss.backward()?;
+        let d_param = grad_store.get(&param).context("no grad for x")?;
+        let updated = (current - d_param * step_size)?;
+        param.set(&updated)?;
+    }
+    // After 100 steps the variable is compared exactly against this f32 value.
+    assert_eq!(param.to_scalar::<f32>()?, 4.199999);
+    Ok(())
+}
+fn unary_grad(device: &Device) -> Result<()> { // forward values and gradients for a series of unary ops
+    let x = Var::new(&[3f32, 1., 4., 0.15], device)?;
+    let x = x.as_tensor();
+    let y = (x.log()? + 1.)?;
+    let grads = y.backward()?;
+    let grad_x = grads.get(x).context("no grad for x")?;
+    assert_eq!(
+        test_utils::to_vec1_round(&y, 4)?,
+        [2.0986, 1.0, 2.3863, -0.8971]
+    );
+    assert_eq!(
+        test_utils::to_vec1_round(grad_x, 4)?,
+        [0.3333, 1.0, 0.25, 6.6667]
+    );
+    let y = x.exp()?;
+    let grads = y.backward()?;
+    let grad_x = grads.get(x).context("no grad for x")?;
+    assert_eq!(
+        test_utils::to_vec1_round(&y, 4)?,
+        [20.0855, 2.7183, 54.5982, 1.1618]
+    );
+    assert_eq!(
+        test_utils::to_vec1_round(grad_x, 4)?,
+        [20.0855, 2.7183, 54.5982, 1.1618]
+    );
+    let y = x.exp()?.sqr()?;
+    let grads = y.backward()?;
+    let grad_x = grads.get(x).context("no grad for x")?;
+    assert_eq!(
+        test_utils::to_vec1_round(&y, 3)?,
+        [403.429, 7.389, 2980.958, 1.35]
+    );
+    // exp(x)^2 = exp(2*x), so the gradient is 2*exp(2*x), i.e. twice the forward value
+    assert_eq!(
+        test_utils::to_vec1_round(grad_x, 2)?,
+        [806.86, 14.78, 5961.92, 2.7]
+    );
+    let y = x.sin()?;
+    let grads = y.backward()?;
+    let grad_x = grads.get(x).context("no grad for x")?;
+    assert_eq!(
+        test_utils::to_vec1_round(&y, 4)?,
+        [0.1411, 0.8415, -0.7568, 0.1494],
+    );
+    assert_eq!(
+        test_utils::to_vec1_round(grad_x, 4)?,
+        [-0.99, 0.5403, -0.6536, 0.9888],
+    );
+    let y = x.cos()?;
+    let grads = y.backward()?;
+    let grad_x = grads.get(x).context("no grad for x")?;
+    assert_eq!(
+        test_utils::to_vec1_round(&y, 4)?,
+        [-0.99, 0.5403, -0.6536, 0.9888],
+    );
+    assert_eq!(
+        test_utils::to_vec1_round(grad_x, 4)?,
+        [-0.1411, -0.8415, 0.7568, -0.1494],
+    );
+    let y = x.sqr()?;
+    let grads = y.backward()?;
+    let grad_x = grads.get(x).context("no grad for x")?;
+    assert_eq!(y.to_vec1::<f32>()?, [9.0, 1.0, 16.0, 0.0225]);
+    assert_eq!(grad_x.to_vec1::<f32>()?, [6.0, 2.0, 8.0, 0.3]);
+    let y = x.sqr()?.sqrt()?;
+    let grads = y.backward()?;
+    let grad_x = grads.get(x).context("no grad for x")?;
+    assert_eq!(y.to_vec1::<f32>()?, [3.0, 1.0, 4.0, 0.15]);
+    assert_eq!(test_utils::to_vec1_round(grad_x, 4)?, [1.0, 1.0, 1.0, 1.0]);
+    let y = x.neg()?;
+    let grads = y.backward()?;
+    let grad_x = grads.get(x).context("no grad for x")?;
+    assert_eq!(y.to_vec1::<f32>()?, [-3.0, -1.0, -4.0, -0.15]);
+    assert_eq!(grad_x.to_vec1::<f32>()?, [-1.0, -1.0, -1.0, -1.0]);
+    let y = x.affine(0.2, 1.)?;
+    let grads = y.backward()?;
+    let grad_x = grads.get(x).context("no grad for x")?;
+    assert_eq!(y.to_vec1::<f32>()?, [1.6, 1.2, 1.8, 1.03]);
+    assert_eq!(grad_x.to_vec1::<f32>()?, [0.2, 0.2, 0.2, 0.2]);
+    let y = Tensor::new(1f32, device)?.broadcast_div(x)?;
+    let grads = y.backward()?;
+    let grad_x = grads.get(x).context("no grad for x")?;
+    assert_eq!(
+        test_utils::to_vec1_round(&y, 4)?,
+        [0.3333, 1.0, 0.25, 6.6667]
+    );
+    assert_eq!(
+        grad_x.to_vec1::<f32>()?,
+        [-0.11111111, -1.0, -0.0625, -44.444443],
+    );
+    let y = x.broadcast_div(&Tensor::new(0.5f32, device)?)?;
+    let grads = y.backward()?;
+    let grad_x = grads.get(x).context("no grad for x")?;
+    assert_eq!(y.to_vec1::<f32>()?, [6., 2., 8., 0.3]);
+    assert_eq!(grad_x.to_vec1::<f32>()?, [2., 2., 2., 2.]);
+    let x = Var::new(&[3f32, 1., 4., 0.15], device)?;
+    let y = x.powf(2.5)?;
+    let grads = y.backward()?;
+    let grad_x = grads.get(&x).context("no grad for x")?;
+    assert_eq!(test_utils::to_vec1_round(&y, 2)?, [15.59, 1.0, 32.0, 0.01]);
+    assert_eq!(
+        test_utils::to_vec1_round(grad_x, 2)?,
+        [12.99, 2.5, 20.0, 0.15]
+    );
+    let y = x.tanh()?;
+    let grads = y.backward()?;
+    let grad_x = grads.get(&x).context("no grad for x")?;
+    assert_eq!(test_utils::to_vec1_round(&y, 2)?, [1.0, 0.76, 1.0, 0.15]);
+    assert_eq!(
+        test_utils::to_vec1_round(grad_x, 2)?,
+        [0.01, 0.42, 0.0, 0.98],
+    );
+    // testing compared to pytorch nn.GELU(approximate = 'tanh')
+    let y = x.gelu()?;
+    let grads = y.backward()?;
+    let grad_x = grads.get(&x).context("no grad for x")?;
+    assert_eq!(
+        test_utils::to_vec1_round(&y, 4)?,
+        [2.9964, 0.8412, 3.9999, 0.0839]
+    );
+    assert_eq!(
+        test_utils::to_vec1_round(grad_x, 4)?,
+        [1.0116, 1.0830, 1.0003, 0.6188],
+    );
+    // Testing compared to pytorch torch.erf
+    //
+    // import torch
+    // x = torch.tensor([3.0, 1.0, 4.0, 0.15], requires_grad=True)
+    // y = x.erf()
+    // print(y)
+    // loss = y.sum()
+    // loss.backward()
+    // print(x.grad)
+    let y = x.erf()?;
+    let grads = y.backward()?;
+    let grad_x = grads.get(&x).context("no grad for x")?;
+    assert_eq!(test_utils::to_vec1_round(&y, 4)?, [1.0, 0.8427, 1.0, 0.168]);
+    assert_eq!(
+        test_utils::to_vec1_round(grad_x, 4)?,
+        [0.0001, 0.4151, 0.0, 1.1033],
+    );
+    // Testing compared to pytorch nn.GELU(approximate = 'none')
+    //
+    // import torch
+    // import torch.nn.functional as F
+    // x = torch.tensor([3.0, 1.0, 4.0, 0.15], requires_grad=True)
+    // y = F.gelu(x, approximate='none')
+    // print(y)
+    // loss = y.sum()
+    // loss.backward()
+    // print(x.grad)
+    let y = x.gelu_erf()?;
+    let grads = y.backward()?;
+    let grad_x = grads.get(&x).context("no grad for x")?;
+    assert_eq!(
+        test_utils::to_vec1_round(&y, 4)?,
+        [2.9960, 0.8413, 3.9999, 0.0839]
+    );
+    assert_eq!(
+        test_utils::to_vec1_round(grad_x, 4)?,
+        [1.0119, 1.0833, 1.0005, 0.6188],
+    );
+    // Testing compared to pytorch elu
+    //
+    // import torch
+    // import torch.nn.functional as F
+    // x = torch.tensor([-1.0, 0.0, -2.0, 3.0], requires_grad=True)
+    // y = F.elu(x, alpha=2.0)
+    // print(y)
+    // loss = y.min
+    // loss = y.sum()
+    // loss.backward()
+    // print(x.grad)
+    let elu_x = Var::new(&[-1.0f32, 0., -2., 3.], device)?;
+    let y = elu_x.elu(2.)?;
+    let grads = y.backward()?;
+    let grad_x = grads.get(&elu_x).context("no grad for x")?;
+    assert_eq!(
+        test_utils::to_vec1_round(&y, 4)?,
+        [-1.2642, 0.0000, -1.7293, 3.0000]
+    );
+    assert_eq!(
+        test_utils::to_vec1_round(grad_x, 4)?,
+        [0.7358, 2.0000, 0.2707, 1.0000]
+    );
+    // testing compared to pytorch nn.Silu()
+    let y = x.silu()?;
+    let grads = y.backward()?;
+    let grad_x = grads.get(&x).context("no grad for x")?;
+    assert_eq!(
+        test_utils::to_vec1_round(&y, 4)?,
+        [2.8577, 0.7311, 3.9281, 0.0806]
+    );
+    assert_eq!(
+        test_utils::to_vec1_round(grad_x, 4)?,
+        [1.0881, 0.9277, 1.0527, 0.5747],
+    );
+    if device.is_cpu() { // the interpolate1d case is only exercised on the cpu device
+        let x = Var::new(&[[[1f32, 2., 3.], [4., 5., 6.], [7., 8., 9.]]], device)?;
+        let y = x.interpolate1d(12)?.reshape(36)?;
+        let z = Tensor::new(
+            &[
+                1_f32, 02., 03., 04., 05., 06., 07., 08., 09., 10., 11., 12., 13., 14., 15., 16.,
+                17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
+                33., 34., 35., 36.,
+            ],
+            device,
+        )?;
+        let loss = y.unsqueeze(1)?.transpose(0, 1)?.matmul(&z.unsqueeze(1)?)?;
+        let grads = loss.backward()?;
+        let grad_x = grads.get(&x).context("no grad for x")?;
+        assert_eq!(
+            test_utils::to_vec3_round(grad_x, 4)?,
+            [[[10_f32, 26., 42.], [58., 74., 90.], [106., 122., 138.]]]
+        );
+    }
+    // manually checked: see the per-row sums in the comments below
+    let x = Var::new(&[[[[1f32, 2., 3.], [4., 5., 6.], [7., 8., 9.]]]], device)?;
+    let y = x.interpolate2d(6, 6)?.reshape(36)?;
+    let z = Tensor::new(
+        &[
+            1_f32, 02., 03., 04., 05., 06., 07., 08., 09., 10., 11., 12., 13., 14., 15., 16., 17.,
+            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33., 34.,
+            35., 36.,
+        ],
+        device,
+    )?;
+    // gradient should be
+    // row 1
+    // 1+2+7+8 = 18
+    // 3+4+9+10 = 26
+    // 5+6+11+12 = 34
+    // row 2
+    // 13+14+19+20 = 66
+    // 15+16+21+22 = 74
+    // 17+18+23+24 = 82
+    // row 3
+    // 25+26+31+32 = 114
+    // 27+28+33+34 = 122
+    // 29+30+35+36 = 130
+    let loss = y.unsqueeze(1)?.transpose(0, 1)?.matmul(&z.unsqueeze(1)?)?;
+    let grads = loss.backward()?;
+    let grad_x = grads.get(&x).context("no grad for x")?;
+    assert_eq!(
+        test_utils::to_vec2_round(&grad_x.flatten(0, 2)?, 4)?,
+        [[18_f32, 26., 34.], [66., 74., 82.], [114., 122., 130.]]
+    );
+    // manually checked: see the per-row sums in the comments below
+    let x = Var::new(&[[[[1f32, 2.], [4., 5.]]]], device)?;
+    let y = x.interpolate2d(6, 6)?.reshape(36)?;
+    let z = Tensor::new(
+        &[
+            1_f32, 02., 03., 04., 05., 06., 07., 08., 09., 10., 11., 12., 13., 14., 15., 16., 17.,
+            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33., 34.,
+            35., 36.,
+        ],
+        device,
+    )?;
+    // gradient should be
+    // row 1
+    // 1+2+3+7+8+9+13+14+15 = 72
+    // 4+5+6+10+11+12+16+17+18 = 99
+    // row 2
+    // 19+20+21+25+26+27+31+32+33 = 234
+    // 22+23+24+28+29+30+34+35+36 = 261
+    let loss = y.unsqueeze(1)?.transpose(0, 1)?.matmul(&z.unsqueeze(1)?)?;
+    let grads = loss.backward()?;
+    let grad_x = grads.get(&x).context("no grad for x")?;
+    assert_eq!(
+        test_utils::to_vec2_round(&grad_x.flatten(0, 2)?, 4)?,
+        [[72_f32, 99.], [234., 261.]]
+    );
+    // manually checked: see the per-row sums in the comments below
+    let x = Var::new(&[[[[1f32, 2.], [4., 5.]], [[6f32, 7.], [8., 9.]]]], device)?; // literal nesting gives shape (1, 2, 2, 2)
+    let y = x.interpolate2d(4, 4)?.reshape(32)?;
+    #[rustfmt::skip]
+    let z = Tensor::new(
+        &[
+            1_f32, 02., 03., 04.,
+            05.,   06., 07., 08.,
+            09.,   10., 11., 12.,
+            13.,   14., 15., 16.,
+            17.,   18., 19., 20.,
+            21.,   22., 23., 24.,
+            25.,   26., 27., 28.,
+            29.,   30., 31., 32.
+        ],
+        device,
+    )?;
+    // gradient should be
+    // m1r1
+    // 1+2+5+6=14
+    // 3+4+7+8=22
+    // m1r2
+    // 9+10+13+14=46
+    // 11+12+15+16=54
+    // m2r1
+    // 17+18+21+22=78
+    // 19+20+23+24=86
+    // m2r2
+    // 25+26+29+30=110
+    // 27+28+31+32=118
+    let loss = y.unsqueeze(1)?.transpose(0, 1)?.matmul(&z.unsqueeze(1)?)?;
+    let grads = loss.backward()?;
+    let grad_x = grads.get(&x).context("no grad for x")?;
+    assert_eq!(
+        test_utils::to_vec3_round(&grad_x.flatten(0, 1)?, 4)?,
+        [[[14_f32, 22.], [46., 54.]], [[78., 86.], [110., 118.]]]
+    );
+    // manually checked: see the per-row sums in the comments below
+    let x = Var::new( // same values as the previous case, but nested as shape (2, 1, 2, 2)
+        &[[[[1f32, 2.], [4., 5.]]], [[[6f32, 7.], [8., 9.]]]],
+        device,
+    )?;
+    let y = x.interpolate2d(4, 4)?.reshape(32)?;
+    #[rustfmt::skip]
+       let z = Tensor::new(
+           &[
+               1_f32, 02., 03., 04.,
+               05.,   06., 07., 08.,
+               09.,   10., 11., 12.,
+               13.,   14., 15., 16.,
+               17.,   18., 19., 20.,
+               21.,   22., 23., 24.,
+               25.,   26., 27., 28.,
+               29.,   30., 31., 32.
+           ],
+           device,
+       )?;
+    // gradient should be
+    // m1r1
+    // 1+2+5+6=14
+    // 3+4+7+8=22
+    // m1r2
+    // 9+10+13+14=46
+    // 11+12+15+16=54
+    // m2r1
+    // 17+18+21+22=78
+    // 19+20+23+24=86
+    // m2r2
+    // 25+26+29+30=110
+    // 27+28+31+32=118
+    let loss = y.unsqueeze(1)?.transpose(0, 1)?.matmul(&z.unsqueeze(1)?)?;
+    let grads = loss.backward()?;
+    let grad_x = grads.get(&x).context("no grad for x")?;
+    assert_eq!(
+        test_utils::to_vec3_round(&grad_x.flatten(0, 1)?, 4)?,
+        [[[14_f32, 22.], [46., 54.]], [[78., 86.], [110., 118.]]]
+    );
+    Ok(())
+}
+/// Checks gradients through `maximum`, `minimum` and `slice_scatter0` on `device`.
+fn binary_grad(device: &Device) -> Result<()> {
+    let var = Var::new(&[3f32, 1., -4., -1.], device)?;
+    let input = var.as_tensor();
+    // Leaky relu, written as max(x, 0.1.x).
+    let leaky = input.maximum(&(input * 0.1)?)?;
+    let grad_store = leaky.backward()?;
+    let d_input = grad_store.get(input).context("no grad for x")?;
+    assert_eq!(input.to_vec1::<f32>()?, [3., 1., -4., -1.]);
+    assert_eq!(leaky.to_vec1::<f32>()?, [3., 1., -0.4, -0.1]);
+    assert_eq!(d_input.to_vec1::<f32>()?, [1., 1., 0.1, 0.1]);
+    // Mirror case, min(x, 0.1.x).
+    let smallest = input.minimum(&(input * 0.1)?)?;
+    let grad_store = smallest.backward()?;
+    let d_input = grad_store.get(input).context("no grad for x")?;
+    assert_eq!(smallest.to_vec1::<f32>()?, [0.3, 0.1, -4., -1.]);
+    assert_eq!(d_input.to_vec1::<f32>()?, [0.1, 0.1, 1., 1.]);
+    // Easy to get wrong: min(x, x) is the identity function, so the gradient has to be one.
+    let identity = input.minimum(input)?;
+    let grad_store = identity.backward()?;
+    let d_input = grad_store.get(input).context("no grad for x")?;
+    assert_eq!(identity.to_vec1::<f32>()?, [3., 1., -4., -1.]);
+    assert_eq!(d_input.to_vec1::<f32>()?, [1., 1., 1., 1.]);
+    // Scatter a one row patch over the second row of a 2x3 view, then square everything.
+    let base_var = Var::new(&[3f32, 1., -4., -1., 5., 9.], device)?;
+    let base = base_var.as_tensor();
+    let patch_var = Var::new(&[2f32, 7., 1.], device)?;
+    let patch = patch_var.as_tensor();
+    let base_2d = base.reshape((2, 3))?;
+    let patch_row = patch.reshape((1, 3))?;
+    let scattered = base_2d.slice_scatter0(&patch_row, 1)?.sqr()?;
+    let grad_store = scattered.backward()?;
+    let d_base = grad_store.get(base).context("no grad for x")?;
+    let d_patch = grad_store.get(patch).context("no grad for y")?;
+    assert_eq!(scattered.to_vec2::<f32>()?, [[9., 1., 16.], [4., 49., 1.]]);
+    // The overwritten half of the base gets a zero gradient, the patch gets 2.y.
+    assert_eq!(d_base.to_vec1::<f32>()?, [6.0, 2.0, -8.0, 0.0, 0.0, 0.0]);
+    assert_eq!(d_patch.to_vec1::<f32>()?, [4.0, 14.0, 2.0]);
+    Ok(())
+}
+test_device!( // NOTE(review): presumably expands to one #[test] per listed name, each running the first-named function on one device - confirm against candle_core::test_device
+    simple_grad,
+    simple_grad_cpu,
+    simple_grad_gpu,
+    simple_grad_metal
+);
+test_device!(sum_grad, sum_grad_cpu, sum_grad_gpu, sum_grad_metal);
+test_device!(
+    matmul_grad,
+    matmul_grad_cpu,
+    matmul_grad_gpu,
+    matmul_grad_metal
+);
+test_device!(
+    grad_descent,
+    grad_descent_cpu,
+    grad_descent_gpu,
+    grad_descent_metal
+);
+test_device!(unary_grad, unary_grad_cpu, unary_grad_gpu, unary_grad_metal);
+test_device!(
+    binary_grad,
+    binary_grad_cpu,
+    binary_grad_gpu,
+    binary_grad_metal
+);
--- a/candle-core/tests/indexing_tests.rs
+++ b/candle-core/tests/indexing_tests.rs
+use anyhow::Result;
+use candle_core::{Device, IndexOp, Tensor};
+/// Indexing a 2x3 tensor with a single integer, on the first and on the second dimension.
+#[test]
+fn integer_index() -> Result<()> {
+    let cpu = Device::Cpu;
+    let grid = Tensor::arange(0u32, 2 * 3, &cpu)?.reshape((2, 3))?;
+    // An integer on the first dimension yields the matching row as a 1d tensor.
+    let row = grid.i(1)?;
+    assert_eq!(row.dims(), &[3]);
+    assert_eq!(row.to_vec1::<u32>()?, &[3, 4, 5]);
+    // An integer on the second dimension yields the matching column as a 1d tensor.
+    let column = grid.i((.., 2))?;
+    assert_eq!(column.dims(), &[2]);
+    assert_eq!(column.to_vec1::<u32>()?, &[2, 5]);
+    Ok(())
+}
+/// Indexing the first dimension with every kind of range, including empty ones.
+#[test]
+fn range_index() -> Result<()> {
+    let cpu = Device::Cpu;
+    // RangeFull, on a 2x3 tensor.
+    let small = Tensor::arange(0u32, 2 * 3, &cpu)?.reshape((2, 3))?;
+    let all_rows = small.i(..)?;
+    assert_eq!(all_rows.dims(), &[2, 3]);
+    assert_eq!(all_rows.to_vec2::<u32>()?, &[[0, 1, 2], [3, 4, 5]]);
+    // All the remaining cases slice the rows of a 4x3 tensor.
+    let tall = Tensor::arange(0u32, 4 * 3, &cpu)?.reshape((4, 3))?;
+    // Range
+    let middle = tall.i(1..3)?;
+    assert_eq!(middle.dims(), &[2, 3]);
+    assert_eq!(middle.to_vec2::<u32>()?, &[[3, 4, 5], [6, 7, 8]]);
+    // RangeFrom
+    let tail = tall.i(2..)?;
+    assert_eq!(tail.dims(), &[2, 3]);
+    assert_eq!(tail.to_vec2::<u32>()?, &[[6, 7, 8], [9, 10, 11]]);
+    // RangeTo
+    let head = tall.i(..2)?;
+    assert_eq!(head.dims(), &[2, 3]);
+    assert_eq!(head.to_vec2::<u32>()?, &[[0, 1, 2], [3, 4, 5]]);
+    // RangeInclusive
+    let middle_inclusive = tall.i(1..=2)?;
+    assert_eq!(middle_inclusive.dims(), &[2, 3]);
+    assert_eq!(middle_inclusive.to_vec2::<u32>()?, &[[3, 4, 5], [6, 7, 8]]);
+    // RangeTo, keeping a single row
+    let first_row = tall.i(..1)?;
+    assert_eq!(first_row.dims(), &[1, 3]);
+    assert_eq!(first_row.to_vec2::<u32>()?, &[[0, 1, 2]]);
+    // RangeToInclusive
+    let head_inclusive = tall.i(..=1)?;
+    assert_eq!(head_inclusive.dims(), &[2, 3]);
+    assert_eq!(head_inclusive.to_vec2::<u32>()?, &[[0, 1, 2], [3, 4, 5]]);
+    // Both empty cases below are compared against a zero row, three column array.
+    let no_rows: [[u32; 3]; 0] = [];
+    // Empty range
+    let empty_slice = tall.i(1..1)?;
+    assert_eq!(empty_slice.dims(), &[0, 3]);
+    assert_eq!(empty_slice.to_vec2::<u32>()?, &no_rows);
+    // Similar to PyTorch, allow empty ranges when the computed length is negative.
+    #[allow(clippy::reversed_empty_ranges)]
+    let reversed_slice = tall.i(1..0)?;
+    assert_eq!(reversed_slice.dims(), &[0, 3]);
+    assert_eq!(reversed_slice.to_vec2::<u32>()?, &no_rows);
+    Ok(())
+}
+/// Mixed integer / range indexing on a 2x3x4 tensor holding the values 0..24.
+#[test]
+fn index_3d() -> Result<()> {
+    let cube = Tensor::from_iter(0..24u32, &Device::Cpu)?.reshape((2, 3, 4))?;
+    // Three integers select a single element.
+    let origin = cube.i((0, 0, 0))?.to_scalar::<u32>()?;
+    assert_eq!(origin, 0);
+    let second_block = cube.i((1, 0, 0))?.to_scalar::<u32>()?;
+    assert_eq!(second_block, 12);
+    let second_row = cube.i((0, 1, 0))?.to_scalar::<u32>()?;
+    assert_eq!(second_row, 4);
+    let row_end = cube.i((0, 1, 3))?.to_scalar::<u32>()?;
+    assert_eq!(row_end, 7);
+    // A range on the first dimension combined with integers on the other two.
+    let firsts = cube.i((0..2, 0, 0))?.to_vec1::<u32>()?;
+    assert_eq!(firsts, &[0, 12]);
+    // Two ranges and one integer leave a 2d result.
+    let first_columns = cube.i((0..2, .., 0))?.to_vec2::<u32>()?;
+    assert_eq!(first_columns, &[[0, 4, 8], [12, 16, 20]]);
+    let last_columns = cube.i((..2, .., 3))?.to_vec2::<u32>()?;
+    assert_eq!(last_columns, &[[3, 7, 11], [15, 19, 23]]);
+    // Integer, full range, integer.
+    let last_column = cube.i((1, .., 3))?.to_vec1::<u32>()?;
+    assert_eq!(last_column, &[15, 19, 23]);
+    Ok(())
+}
+/// Writes a 3x2 source into two different windows of a 4x5 tensor via `slice_assign`.
+#[test]
+fn slice_assign() -> Result<()> {
+    let cpu = Device::Cpu;
+    let target = Tensor::arange(0u32, 4 * 5, &cpu)?.reshape((4, 5))?;
+    let patch = Tensor::arange(0u32, 2 * 3, &cpu)?.reshape((3, 2))?;
+    // Window in the bottom right corner: rows 1..4, columns 3..5.
+    let bottom_right = target.slice_assign(&[1..4, 3..5], &patch)?;
+    let expected_bottom_right: [[u32; 5]; 4] = [
+        [0, 1, 2, 3, 4],
+        [5, 6, 7, 0, 1],
+        [10, 11, 12, 2, 3],
+        [15, 16, 17, 4, 5],
+    ];
+    assert_eq!(bottom_right.to_vec2::<u32>()?, &expected_bottom_right);
+    // Window in the top left corner: rows 0..3, columns 0..2. The first window does not show
+    // up in this result.
+    let top_left = target.slice_assign(&[0..3, 0..2], &patch)?;
+    let expected_top_left: [[u32; 5]; 4] = [
+        [0, 1, 2, 3, 4],
+        [2, 3, 7, 8, 9],
+        [4, 5, 12, 13, 14],
+        [15, 16, 17, 18, 19],
+    ];
+    assert_eq!(top_left.to_vec2::<u32>()?, &expected_top_left);
+    Ok(())
+}
--- a/candle-core/tests/layout_tests.rs
+++ b/candle-core/tests/layout_tests.rs
+use candle::{test_device, Device, IndexOp, Result, Tensor};
+use candle_core as candle;
+/// Checks the element order read back from transposed and sliced views of a 2x3x4 tensor.
+fn contiguous(device: &Device) -> Result<()> {
+    let base = Tensor::arange(0u32, 24u32, device)?.reshape((2, 3, 4))?;
+    // Untouched tensor.
+    let as_created: [[[u32; 4]; 3]; 2] = [
+        [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]],
+        [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]],
+    ];
+    assert_eq!(base.to_vec3::<u32>()?, &as_created);
+    // `t()` followed by `contiguous()`.
+    let after_t: [[[u32; 3]; 4]; 2] = [
+        [[0, 4, 8], [1, 5, 9], [2, 6, 10], [3, 7, 11]],
+        [[12, 16, 20], [13, 17, 21], [14, 18, 22], [15, 19, 23]],
+    ];
+    let t_copy = base.t()?.contiguous()?;
+    assert_eq!(t_copy.to_vec3::<u32>()?, &after_t);
+    // Swapping the first two dimensions, read back through `contiguous()`.
+    let after_swap_01: [[[u32; 4]; 2]; 3] = [
+        [[0, 1, 2, 3], [12, 13, 14, 15]],
+        [[4, 5, 6, 7], [16, 17, 18, 19]],
+        [[8, 9, 10, 11], [20, 21, 22, 23]],
+    ];
+    let swap_01_copy = base.transpose(0, 1)?.contiguous()?;
+    assert_eq!(swap_01_copy.to_vec3::<u32>()?, &after_swap_01);
+    // Same swap, read back through `flatten_all()` instead.
+    let swap_01_flat: [u32; 24] = [
+        0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 16, 17, 18, 19, 8, 9, 10, 11, 20, 21, 22, 23,
+    ];
+    let flattened = base.transpose(0, 1)?.flatten_all()?;
+    assert_eq!(flattened.to_vec1::<u32>()?, &swap_01_flat);
+    // Slice off the first block before swapping the first two dimensions.
+    let sliced_then_swapped: [[[u32; 4]; 1]; 3] =
+        [[[12, 13, 14, 15]], [[16, 17, 18, 19]], [[20, 21, 22, 23]]];
+    let sliced_copy = base.i(1..)?.transpose(0, 1)?.contiguous()?;
+    assert_eq!(sliced_copy.to_vec3::<u32>()?, &sliced_then_swapped);
+    // Swapping the first and the last dimensions.
+    let after_swap_02: [[[u32; 2]; 3]; 4] = [
+        [[0, 12], [4, 16], [8, 20]],
+        [[1, 13], [5, 17], [9, 21]],
+        [[2, 14], [6, 18], [10, 22]],
+        [[3, 15], [7, 19], [11, 23]],
+    ];
+    let swap_02_copy = base.transpose(0, 2)?.contiguous()?;
+    assert_eq!(swap_02_copy.to_vec3::<u32>()?, &after_swap_02);
+    Ok(())
+}
+test_device!(contiguous, contiguous_cpu, contiguous_gpu, contiguous_metal);
+#[test]
+fn strided_blocks() -> Result<()> { // checks the block structure reported by `strided_blocks` for several layouts
+    use candle::Device::Cpu;
+    let tensor = Tensor::arange(0u32, 24u32, &Cpu)?.reshape((2, 3, 4))?; // freshly created: one block covering all 24 elements
+    match tensor.strided_blocks() {
+        candle::StridedBlocks::SingleBlock { start_offset, len } => {
+            assert_eq!(start_offset, 0);
+            assert_eq!(len, 24);
+        }
+        candle::StridedBlocks::MultipleBlocks { .. } => {
+            panic!("unexpected block structure")
+        }
+    };
+    let tensor = Tensor::arange(0u32, 26u32, &Cpu)? // 26 values with the first two sliced off: still one block, expected at offset 2
+        .i(2..)?
+        .reshape((2, 3, 4))?;
+    match tensor.strided_blocks() {
+        candle::StridedBlocks::SingleBlock { start_offset, len } => {
+            assert_eq!(start_offset, 2);
+            assert_eq!(len, 24);
+        }
+        candle::StridedBlocks::MultipleBlocks { .. } => {
+            panic!("unexpected block structure")
+        }
+    };
+    let tensor = Tensor::arange(0u32, 24u32, &Cpu)?.reshape((2, 3, 4))?;
+    let tensor = tensor.i(1)?; // second half of the data: one block of 12 expected at offset 12
+    match tensor.strided_blocks() {
+        candle::StridedBlocks::SingleBlock { start_offset, len } => {
+            assert_eq!(start_offset, 12);
+            assert_eq!(len, 12);
+        }
+        candle::StridedBlocks::MultipleBlocks { .. } => {
+            panic!("unexpected block structure")
+        }
+    };
+    let tensor = Tensor::arange(0u32, 24u32, &Cpu)?.reshape((2, 3, 4))?;
+    let tensor = tensor.i((.., 1))?.contiguous()?; // after `contiguous()` the slice is expected as a single block at offset 0
+    match tensor.strided_blocks() {
+        candle::StridedBlocks::SingleBlock { start_offset, len } => {
+            assert_eq!(start_offset, 0);
+            assert_eq!(len, 8);
+            assert_eq!(tensor.to_vec2::<u32>()?, &[[4, 5, 6, 7], [16, 17, 18, 19]]);
+        }
+        candle::StridedBlocks::MultipleBlocks { .. } => {
+            panic!("unexpected block structure")
+        }
+    };
+    let tensor = Tensor::arange(0u32, 24u32, &Cpu)?.reshape((2, 3, 4))?;
+    let tensor = tensor.i((.., 1))?; // same slice without `contiguous()`: two blocks of 4 expected, starting at 4 and 16
+    match tensor.strided_blocks() {
+        candle::StridedBlocks::SingleBlock { .. } => {
+            panic!("unexpected block structure")
+        }
+        candle::StridedBlocks::MultipleBlocks {
+            block_len,
+            block_start_index,
+        } => {
+            assert_eq!(block_len, 4);
+            assert_eq!(block_start_index.collect::<Vec<_>>(), &[4, 16])
+        }
+    };
+    let tensor = Tensor::arange(0u32, 24u32, &Cpu)?.reshape((2, 3, 4))?;
+    match tensor.t()?.strided_blocks() { // after `t()`: 24 blocks of length 1 are expected
+        candle::StridedBlocks::SingleBlock { .. } => {
+            panic!("unexpected block structure")
+        }
+        candle::StridedBlocks::MultipleBlocks {
+            block_start_index,
+            block_len,
+        } => {
+            assert_eq!(block_len, 1);
+            assert_eq!(
+                block_start_index.collect::<Vec<_>>(),
+                &[
+                    0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11, 12, 16, 20, 13, 17, 21, 14, 18, 22, 15,
+                    19, 23
+                ]
+            )
+        }
+    };
+    let tensor = Tensor::arange(0u32, 24u32, &Cpu)?.reshape((2, 3, 4))?;
+    match tensor.transpose(0, 1)?.strided_blocks() { // after `transpose(0, 1)`: six blocks of length 4 are expected
+        candle::StridedBlocks::SingleBlock { .. } => {
+            panic!("unexpected block structure")
+        }
+        candle::StridedBlocks::MultipleBlocks {
+            block_start_index,
+            block_len,
+        } => {
+            assert_eq!(block_len, 4);
+            assert_eq!(
+                block_start_index.collect::<Vec<_>>(),
+                &[0, 12, 4, 16, 8, 20]
+            )
+        }
+    };
+    Ok(())
+}
--- a/candle-core/tests/matmul_tests.rs
+++ b/candle-core/tests/matmul_tests.rs
+use candle_core::{test_device, DType, Device, IndexOp, Result, Tensor};
+/// Checks `matmul` on square, rectangular and batched inputs, then again with operands whose
+/// last two dimensions are stored transposed.
+fn matmul(device: &Device) -> Result<()> {
+    // 2x2 times 2x2; both operands hold the same values.
+    let square = vec![1.0f32, 2.0, 3.0, 4.0];
+    let lhs = Tensor::from_slice(&square, (2, 2), device)?;
+    let rhs = Tensor::from_slice(&square, (2, 2), device)?;
+    let product = lhs.matmul(&rhs)?;
+    assert_eq!(product.to_vec2::<f32>()?, &[[7.0f32, 10.0], [15.0, 22.0]]);
+    // 2x1 times 1x2.
+    let column = vec![1.0f32, 2.0];
+    let lhs = Tensor::from_slice(&column, (2, 1), device)?;
+    let row = vec![3.0f32, 4.0];
+    let rhs = Tensor::from_slice(&row, (1, 2), device)?;
+    let product = lhs.matmul(&rhs)?;
+    assert_eq!(product.to_vec2::<f32>()?, &[&[3.0, 4.0], &[6.0, 8.0]]);
+    // 2x3 times 3x2.
+    let lhs_values: Vec<_> = (0..6).map(|i| i as f32).collect();
+    let lhs = Tensor::from_slice(&lhs_values, (2, 3), device)?;
+    let rhs_values: Vec<_> = (0..6).map(|i| (i + 2) as f32).collect();
+    let rhs = Tensor::from_slice(&rhs_values, (3, 2), device)?;
+    let product = lhs.matmul(&rhs)?;
+    assert_eq!(product.to_vec2::<f32>()?, &[&[16., 19.], &[52., 64.]]);
+    // Batched: 2x2x3 times 2x3x2.
+    let lhs_values: Vec<_> = (0..12).map(|i| i as f32).collect();
+    let a = Tensor::from_slice(&lhs_values, (2, 2, 3), device)?;
+    let rhs_values: Vec<_> = (0..12).map(|i| (i + 2) as f32).collect();
+    let b = Tensor::from_slice(&rhs_values, (2, 3, 2), device)?;
+    let expected = [[[16., 19.], [52., 64.]], [[214., 235.], [304., 334.]]];
+    let product = a.matmul(&b)?;
+    assert_eq!(product.to_vec3::<f32>()?, &expected);
+    // Rebuild both operands as transposed views of contiguous transposed copies: same dims,
+    // different strides, and no longer contiguous.
+    let a_tt = a.t()?.contiguous()?.t()?;
+    assert!(!a_tt.is_contiguous());
+    assert_eq!(a.dims(), a_tt.dims());
+    assert_eq!(a_tt.stride(), &[6, 1, 2]);
+    let b_tt = b.t()?.contiguous()?.t()?;
+    assert!(!b_tt.is_contiguous());
+    assert_eq!(b.dims(), b_tt.dims());
+    assert_eq!(b_tt.stride(), &[6, 1, 3]);
+    // Every combination involving a rebuilt operand has to give the same product.
+    for (l, r) in [(&a_tt, &b), (&a, &b_tt), (&a_tt, &b_tt)] {
+        assert_eq!(l.matmul(r)?.to_vec3::<f32>()?, &expected);
+    }
+    Ok(())
+}
+/// Compares `broadcast_matmul` on random inputs against a plain `matmul` for every pair of
+/// leading indices.
+fn broadcast_matmul(device: &Device) -> Result<()> {
+    let lhs = Tensor::randn(0f32, 1f32, (3, 1, 4, 5), device)?;
+    let rhs = Tensor::randn(0f32, 1f32, (6, 5, 2), device)?;
+    let broadcasted = lhs.broadcast_matmul(&rhs)?;
+    assert_eq!(broadcasted.dims(), &[3, 6, 4, 2]);
+    for i in 0..3 {
+        for j in 0..6 {
+            let from_broadcast = broadcasted.i((i, j))?;
+            let lhs_2d = lhs.i((i, 0))?;
+            let rhs_2d = rhs.i(j)?;
+            // Left as a `Result` on purpose, exactly as passed to the subtraction below.
+            let direct = lhs_2d.matmul(&rhs_2d);
+            let squared_error = (from_broadcast - direct)?.sqr()?.sum_all()?;
+            // With cuda, we see errors of up to ~1e-12.
+            assert!(squared_error.to_vec0::<f32>()? < 1e-6)
+        }
+    }
+    Ok(())
+}
+// https://github.com/huggingface/candle/issues/1948
+/// Multiplies the last sequence position of a (1, seq_len, 16) tensor by a transposed (32, 16)
+/// weight and checks the output dims.
+fn squeeze_mm(device: &Device) -> Result<()> {
+    let seq_len = 8_usize;
+    let activations = Tensor::zeros((1, seq_len, 16), DType::F32, device)?;
+    let last_position = activations.i((.., seq_len - 1, ..))?;
+    let weight_t = Tensor::zeros((32, 16), DType::F32, device)?.t()?;
+    let projected = last_position.matmul(&weight_t)?;
+    assert_eq!(projected.dims(), &[1, 32]);
+    Ok(())
+}
+// https://github.com/huggingface/candle/issues/1992
+/// Checks that `matmul` gives the same result when the right operand has an unusual stride on
+/// a size-1 dimension.
+fn mm_layout(device: &Device) -> Result<()> {
+    let lhs = Tensor::arange(0f32, 16f32, device)?.reshape((1, 1, 4, 4))?;
+    let rhs = Tensor::arange(0f32, 8f32, device)?.reshape((1, 1, 4, 2))?;
+    let reference = lhs.matmul(&rhs)?;
+    // Forces the layout to be:
+    // shape: [1, 1, 4, 2], stride: [8, 2, 2, 1], start_offset: 0
+    // The data is still laid out as a contiguous matrix because only the last two dimensions
+    // have a size other than 1, but the layout check in matmul may be reluctant to handle
+    // such strides (NOTE(review): inferred from the linked issue - confirm).
+    let rhs_relayout = rhs.transpose(1, 2)?.force_contiguous()?.transpose(1, 2)?;
+    let relayout_product = lhs.matmul(&rhs_relayout)?;
+    let total_abs_diff = (reference - relayout_product)?
+        .abs()?
+        .sum_all()?
+        .to_vec0::<f32>()?;
+    assert_eq!(total_abs_diff, 0.);
+    Ok(())
+}
+test_device!(matmul, matmul_cpu, matmul_gpu, matmul_metal); // NOTE(review): presumably expands to one #[test] per listed name, each running the first-named function on one device - confirm against candle_core::test_device
+test_device!(
+    broadcast_matmul,
+    broadcast_matmul_cpu,
+    broadcast_matmul_gpu,
+    broadcast_matmul_metal
+);
+test_device!(squeeze_mm, squeeze_mm_cpu, squeeze_mm_gpu, squeeze_mm_metal);
+test_device!(mm_layout, mm_layout_cpu, mm_layout_gpu, mm_layout_metal);
--- a/candle-core/tests/npy.py
+++ b/candle-core/tests/npy.py
+import numpy as np
+x = np.arange(10)
+# Write a npy file.
+np.save("test.npy", x)
+# Write multiple values to a npz file.
+values = { "x": x, "x_plus_one": x + 1 }
+np.savez("test.npz", **values)
--- a/candle-core/tests/pool_tests.rs
+++ b/candle-core/tests/pool_tests.rs
+use candle_core::{test_device, test_utils, Device, IndexOp, Result, Tensor};
+// https://github.com/huggingface/candle/issues/364
+/// Checks `avg_pool2d(2)` on a 4x4 input and on a 2x8 input.
+fn avg_pool2d(dev: &Device) -> Result<()> {
+    // 4x4 input where only the top left window contains zeros.
+    let square_values: Vec<f32> = vec![
+        1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
+    ];
+    let square = Tensor::from_vec(square_values, (1, 1, 4, 4), dev)?;
+    let pooled = square.avg_pool2d(2)?.squeeze(0)?.squeeze(0)?;
+    assert_eq!(pooled.to_vec2::<f32>()?, [[0.5f32, 1.], [1., 1.]]);
+    // 2x8 input: a single output row, each value averaging four inputs.
+    let wide_values: Vec<f32> = vec![
+        1., 2., 1., 3., 0., 0., 1., 1., 1., 1., 1., 1., 5., 1., 1., 1.,
+    ];
+    let wide = Tensor::from_vec(wide_values, (1, 1, 2, 8), dev)?;
+    let pooled = wide.avg_pool2d(2)?.squeeze(0)?.squeeze(0)?;
+    assert_eq!(pooled.to_vec2::<f32>()?, [[5. / 4., 6. / 4., 6. / 4., 1.]]);
+    Ok(())
+}
+/// Checks 2x2 max pooling on the same sixteen values viewed as 4x4 and then as 2x8.
+fn max_pool2d(dev: &Device) -> Result<()> {
+    let values: Vec<f32> = vec![
+        1., 2., 1., 3., 0., 0., 1., 1., 1., 1., 1., 1., 5., 1., 1., 1.,
+    ];
+    let square = Tensor::from_vec(values, (1, 1, 4, 4), dev)?;
+    let pooled_square = square.max_pool2d(2)?.squeeze(0)?.squeeze(0)?;
+    assert_eq!(pooled_square.to_vec2::<f32>()?, [[2f32, 3.], [5., 1.]]);
+    // Reinterpret the same buffer as two rows of eight before pooling again.
+    let wide = square.reshape((1, 1, 2, 8))?;
+    let pooled_wide = wide.max_pool2d(2)?.squeeze(0)?.squeeze(0)?;
+    assert_eq!(pooled_wide.to_vec2::<f32>()?, [[2.0, 3.0, 5.0, 1.0]]);
+    Ok(())
+}
+/* This test corresponds to the following PyTorch script.
+import torch
+torch.manual_seed(4242)
+t = torch.randn((1, 2, 4, 4))
+print(t.flatten())
+res = torch.nn.functional.avg_pool2d(t, 2)
+print(res)
+*/
+fn avg_pool2d_pytorch(dev: &Device) -> Result<()> {
+    // NOTE(review): the test is a no-op on Metal devices; the reason is not stated here.
+    if dev.is_metal() {
+        return Ok(());
+    }
+    let values = [
+        0.4056f32, -0.8689, -0.0773, -1.5630, -2.8012, -1.5059, 0.3972, 1.0852, 0.4997, 3.0616,
+        1.6541, 0.0964, -0.8338, -1.6523, -0.8323, -0.1699, 0.0823, 0.3526, 0.6843, 0.2395,
+        1.2279, -0.9287, -1.7030, 0.1370, 0.6047, 0.3770, -0.6266, 0.3529, 2.2013, -0.6836,
+        0.2477, 1.3127,
+    ];
+    let input = Tensor::new(&values, dev)?.reshape((1, 2, 4, 4))?;
+    // 2x2 windows over each of the two 4x4 channels.
+    let pooled_2 = input.avg_pool2d(2)?.squeeze(0)?;
+    assert_eq!(
+        test_utils::to_vec3_round(&pooled_2, 4)?,
+        [
+            [[-1.1926, -0.0395], [0.2688, 0.1871]],
+            [[0.1835, -0.1606], [0.6249, 0.3217]]
+        ]
+    );
+    // 3x3 windows: a single output value per channel.
+    let pooled_3 = input.avg_pool2d(3)?.squeeze(0)?;
+    assert_eq!(
+        test_utils::to_vec3_round(&pooled_3, 4)?,
+        [[[0.085]], [[0.0078]]]
+    );
+    // Same values viewed as one 4x8 channel.
+    let flat = input.reshape((1, 1, 4, 8))?;
+    let pooled_flat = flat.avg_pool2d(2)?.squeeze(0)?.squeeze(0)?;
+    assert_eq!(
+        test_utils::to_vec2_round(&pooled_flat, 4)?,
+        [
+            [0.7745, 0.0276, -1.6983, 0.12],
+            [0.3542, 0.1625, 0.4542, -0.0014]
+        ]
+    );
+    Ok(())
+}
+/// Upsamples a 2x3 grid to 4x6 with nearest-neighbour interpolation and checks that every source
+/// value is repeated in a 2x2 patch.
+fn upsample_nearest2d(dev: &Device) -> Result<()> {
+    let grid = Tensor::arange(0f32, 6f32, dev)?.reshape((1, 1, 2, 3))?;
+    let enlarged = grid.upsample_nearest2d(4, 6)?.i(0)?.i(0)?;
+    // Sanity check on the source values before looking at the result.
+    let source_rows = grid.i(0)?.i(0)?.to_vec2::<f32>()?;
+    assert_eq!(source_rows, [[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]);
+    let enlarged_rows = enlarged.to_vec2::<f32>()?;
+    assert_eq!(
+        enlarged_rows,
+        [
+            [0.0, 0.0, 1.0, 1.0, 2.0, 2.0],
+            [0.0, 0.0, 1.0, 1.0, 2.0, 2.0],
+            [3.0, 3.0, 4.0, 4.0, 5.0, 5.0],
+            [3.0, 3.0, 4.0, 4.0, 5.0, 5.0]
+        ]
+    );
+    Ok(())
+}
+// Register the pooling / upsampling tests through `test_device!`: each invocation passes the
+// shared test function followed by the names to use for its cpu, gpu and metal variants.
+// NOTE(review): the macro is imported from candle_core and its expansion is not visible here.
+test_device!(avg_pool2d, avg_pool2d_cpu, avg_pool2d_gpu, avg_pool2d_metal);
+test_device!(
+    avg_pool2d_pytorch,
+    avg_pool2d_pytorch_cpu,
+    avg_pool2d_pytorch_gpu,
+    avg_pool2d_pytorch_metal
+);
+test_device!(max_pool2d, max_pool2d_cpu, max_pool2d_gpu, max_pool2d_metal);
+test_device!(
+    upsample_nearest2d,
+    upsample_nearest2d_cpu,
+    upsample_nearest2d_gpu,
+    upsample_nearest2d_metal
+);
--- a/candle-core/tests/pth.py
+++ b/candle-core/tests/pth.py
+import torch
+from collections import OrderedDict
+# Write a trivial tensor to a pt file
+a= torch.tensor([[1,2,3,4], [5,6,7,8]])
+o = OrderedDict()
+o["test"] = a
+# Write a trivial tensor to a pt file
+torch.save(o, "test.pt")
+############################################################################################################
+# Write a trivial tensor to a pt file with a key
+torch.save({"model_state_dict": o}, "test_with_key.pt")
+############################################################################################################
+# Create a tensor with fortran contiguous memory layout
+import numpy as np
+# Step 1: Create a 3D NumPy array with Fortran order using a range of numbers
+# For example, creating a 2x3x4 array
+array_fortran = np.asfortranarray(np.arange(1, 2*3*4 + 1).reshape(2, 3, 4))
+# Verify the memory order
+print("Is Fortran contiguous (F order):", array_fortran.flags['F_CONTIGUOUS'])  # Should be True
+print("Is C contiguous (C order):", array_fortran.flags['C_CONTIGUOUS'])  # Should be False
+# Step 2: Convert the NumPy array to a PyTorch tensor
+tensor_fortran = torch.from_numpy(array_fortran)
+# Verify the tensor layout
+print("Tensor stride:", tensor_fortran.stride())  # Stride will reflect the Fortran memory layout
+# Step 3: Save the PyTorch tensor to a .pth file
+torch.save({"tensor_fortran": tensor_fortran}, 'fortran_tensor_3d.pth')
+print("3D Tensor saved with Fortran layout.")
--- a/candle-core/tests/pth_tests.rs
+++ b/candle-core/tests/pth_tests.rs
+/// Regression test for pth files not loading on Windows.
+///
+/// Opens `tests/test.pt` without a key and checks that the tensor named `test` can be read.
+#[test]
+fn test_pth() {
+    let archive = candle_core::pickle::PthTensors::new("tests/test.pt", None).unwrap();
+    let entry = archive.get("test").unwrap();
+    entry.unwrap();
+}
+/// Opens a checkpoint whose tensors are nested under the `model_state_dict` key and checks that
+/// the tensor named `test` can be read.
+#[test]
+fn test_pth_with_key() {
+    let path = "tests/test_with_key.pt";
+    let archive = candle_core::pickle::PthTensors::new(path, Some("model_state_dict")).unwrap();
+    let entry = archive.get("test").unwrap();
+    entry.unwrap();
+}
+/// Loads the tensor saved with a Fortran-contiguous layout and checks that its logical contents
+/// are the values 1 through 24 in a (2, 3, 4) shape.
+#[test]
+fn test_pth_fortran_congiguous() {
+    let archive =
+        candle_core::pickle::PthTensors::new("tests/fortran_tensor_3d.pth", None).unwrap();
+    let loaded = archive.get("tensor_fortran").unwrap().unwrap();
+    let expected: [[[i64; 4]; 3]; 2] = [
+        [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]],
+        [[13, 14, 15, 16], [17, 18, 19, 20], [21, 22, 23, 24]],
+    ];
+    assert_eq!(loaded.dims3().unwrap(), (2, 3, 4));
+    assert_eq!(loaded.to_vec3::<i64>().unwrap(), expected);
+}
--- a/candle-core/tests/quantized_tests.rs
+++ b/candle-core/tests/quantized_tests.rs
+use candle_core::{
+    bail,
+    quantized::{self, GgmlDType},
+    test_device,
+    test_utils::to_vec2_round,
+    Device, Module, Result, Tensor,
+};
+use quantized::{k_quants, GgmlType};
+use rand::prelude::*;
+/// Number of elements in the vectors used by the GGML-style quantization and dot product tests.
+const GGML_TEST_SIZE: usize = 32 * 128;
+/// Default `max_error` passed to `ggml_quantization_error_test`.
+const GGML_MAX_QUANTIZATION_TOTAL_ERROR: f32 = 0.002;
+/// Looser `max_error` used by the Q2K quantization test.
+const GGML_MAX_QUANTIZATION_TOTAL_ERROR_2BITS: f32 = 0.0075;
+/// Looser `max_error` used by the Q3K quantization test.
+const GGML_MAX_QUANTIZATION_TOTAL_ERROR_3BITS: f32 = 0.0040;
+/// Hard cap on the per-element dot product error accepted by `ggml_matmul_error_test_`.
+const GGML_MAX_DOT_PRODUCT_ERROR: f32 = 0.02;
+/// Compares a quantized matmul against the unquantized one for a single `dtype`.
+///
+/// The `(m, k)` left-hand side and `(k, n)` right-hand side are filled with evenly spaced values
+/// starting at 0. The summed relative error between both results, divided by `b * m * n`, must
+/// not exceed 0.02. Note that `b` only enters this normalisation; the tensors carry no batch
+/// dimension.
+fn test_matmul(
+    device: &Device,
+    (b, m, n, k): (usize, usize, usize, usize),
+    dtype: GgmlDType,
+) -> Result<()> {
+    let lhs_len = m * k;
+    let rhs_len = k * n;
+    let lhs: Vec<f32> = (0..lhs_len).map(|v| v as f32 / lhs_len as f32).collect();
+    let rhs: Vec<f32> = (0..rhs_len).map(|v| v as f32 / rhs_len as f32).collect();
+    let lhs = Tensor::from_slice(&lhs, (m, k), device)?;
+    let rhs = Tensor::from_slice(&rhs, (k, n), device)?;
+    // Reference result from the regular matmul.
+    let mm = lhs.matmul(&rhs)?;
+    // The quantized matmul is built from the transposed right-hand side.
+    let quantized_rhs = quantized::QTensor::quantize(&rhs.t()?, dtype)?;
+    let res = quantized::QMatMul::from_qtensor(quantized_rhs)?.forward(&lhs)?;
+    let abs_diff = (&mm - &res)?.abs()?;
+    let summed_rel_error: f32 = (abs_diff / &mm.abs()?)?.sum_all()?.to_scalar()?;
+    let error = summed_rel_error / (b * m * n) as f32;
+    assert!(
+        error <= 0.02,
+        "Error {error} is too big. \nExpected:\n {mm} \nFound:\n {res}\n for {dtype:?}"
+    );
+    Ok(())
+}
+/// Checks Q4_0 quantized matmul on non-negative inputs against hard-coded reference values.
+///
+/// Three paths are exercised: the raw `k_quants::matmul` kernel, the regular tensor matmul and
+/// `QMatMul::forward` (which has its own expected values on Metal). Skipped on CUDA devices.
+fn quantized_matmul(device: &Device) -> Result<()> {
+    // TODO Enable this later when we enable cuda.
+    if device.is_cuda() {
+        return Ok(());
+    }
+    let (m, k, n) = (3, 64, 4);
+    let lhs: Vec<f32> = (0..(m * k)).map(|v| v as f32).collect();
+    let rhs: Vec<f32> = (0..(k * n)).map(|v| v as f32).collect();
+    let tensor_lhs = Tensor::from_slice(&lhs, (m, k), device)?;
+    // Raw kernel path: quantize `rhs` into Q4_0 blocks and multiply into a pre-filled buffer.
+    let mut rhs_blocks = vec![k_quants::BlockQ4_0::zeros(); 8];
+    k_quants::BlockQ4_0::from_float(&rhs, &mut rhs_blocks)?;
+    let mut out = vec![42.; 3 * 4];
+    k_quants::matmul((m, k, n), &lhs, &rhs_blocks, &mut out)?;
+    let rounded_out: Vec<_> = out.iter().map(|x| x.round()).collect();
+    assert_eq!(
+        rounded_out,
+        &[
+            85120.0, 214562.0, 345455.0, 474748.0, 213475.0, 604465.0, 1000686.0, 1388317.0,
+            341876.0, 994283.0, 1655709.0, 2301518.0
+        ]
+    );
+    // Unquantized reference using the same data.
+    let tensor_rhs = Tensor::from_slice(&rhs, (n, k), device)?.t()?;
+    let mm = tensor_lhs.matmul(&tensor_rhs)?;
+    assert_eq!(
+        mm.to_vec2::<f32>()?,
+        &[
+            [85344.0, 214368.0, 343392.0, 472416.0],
+            [214368.0, 605536.0, 996704.0, 1387872.0],
+            [343392.0, 996704.0, 1650016.0, 2303328.0]
+        ]
+    );
+    // Tensor-level quantized path.
+    let qtensor = quantized::QTensor::quantize(&tensor_rhs.t()?, GgmlDType::Q4_0)?;
+    let res = quantized::QMatMul::from_qtensor(qtensor)?.forward(&tensor_lhs)?;
+    let expected: [[f32; 4]; 3] = if matches!(device, Device::Metal(_)) {
+        [
+            [84946.0, 214126.0, 344757.0, 473798.0],
+            [213458.0, 604350.0, 1000469.0, 1387990.0],
+            [341970.0, 994574.0, 1656181.0, 2302182.0],
+        ]
+    } else {
+        [
+            [85120.0, 214562.0, 345455.0, 474748.0],
+            [213475.0, 604465.0, 1000686.0, 1388317.0],
+            [341876.0, 994283.0, 1655709.0, 2301518.0],
+        ]
+    };
+    assert_eq!(to_vec2_round(&res, 0)?, &expected);
+    test_matmul(device, (1, 3, 4, 256), GgmlDType::Q4_0)?;
+    Ok(())
+}
+/// Same checks as `quantized_matmul`, but with inputs shifted so that they include negative
+/// values. Skipped on CUDA devices.
+fn quantized_matmul_neg(device: &Device) -> Result<()> {
+    // TODO Enable this later when we enable cuda.
+    if device.is_cuda() {
+        return Ok(());
+    }
+    let (m, k, n) = (3, 64, 4);
+    // Shift the ramps down by half (lhs) and a third (rhs) of their length.
+    let lhs_shift = (m * k) as f32 / 2.0;
+    let rhs_shift = (k * n) as f32 / 3.0;
+    let lhs: Vec<f32> = (0..(m * k)).map(|v| v as f32 - lhs_shift).collect();
+    let rhs: Vec<f32> = (0..k * n).map(|v| v as f32 - rhs_shift).collect();
+    let tensor_lhs = Tensor::from_slice(&lhs, (m, k), device)?;
+    let tensor_rhs = Tensor::from_slice(&rhs, (n, k), device)?.t()?;
+    // Raw kernel path: quantize `rhs` into Q4_0 blocks and multiply into a pre-filled buffer.
+    let mut rhs_blocks = vec![k_quants::BlockQ4_0::zeros(); 8];
+    k_quants::BlockQ4_0::from_float(&rhs, &mut rhs_blocks)?;
+    let mut out = vec![42.; 3 * 4];
+    k_quants::matmul((m, k, n), &lhs, &rhs_blocks, &mut out)?;
+    let rounded_out: Vec<_> = out.iter().map(|x| x.round()).collect();
+    assert_eq!(
+        rounded_out,
+        &[
+            243524.0, -19596.0, -285051.0, -549815.0, 23777.0, 21651.0, 19398.0, 18367.0,
+            -196472.0, 63012.0, 324585.0, 587902.0
+        ]
+    );
+    // Unquantized reference using the same data.
+    let mm = tensor_lhs.matmul(&tensor_rhs)?;
+    assert_eq!(
+        to_vec2_round(&mm, 0)?,
+        &[
+            [244064.0, -20128.0, -284320.0, -548512.0],
+            [23563.0, 21515.0, 19467.0, 17419.0],
+            [-196939.0, 63157.0, 323253.0, 583349.0]
+        ]
+    );
+    // Tensor-level quantized path.
+    let qtensor = quantized::QTensor::quantize(&tensor_rhs.t()?, GgmlDType::Q4_0)?;
+    let res = quantized::QMatMul::from_qtensor(qtensor)?.forward(&tensor_lhs)?;
+    let expected: [[f32; 4]; 3] = if matches!(device, Device::Metal(_)) {
+        [
+            [243666.0, -19714.0, -285433.0, -550453.0],
+            [23782.0, 21654.0, 19400.0, 18369.0],
+            [-196102.0, 63022.0, 324233.0, 587191.0],
+        ]
+    } else {
+        [
+            [243524.0, -19596.0, -285051.0, -549815.0],
+            [23777.0, 21651.0, 19398.0, 18367.0],
+            [-196472.0, 63012.0, 324585.0, 587902.0],
+        ]
+    };
+    assert_eq!(to_vec2_round(&res, 0)?, &expected);
+    Ok(())
+}
+// Register the two Q4_0 matmul tests through `test_device!`: each invocation passes the shared
+// test function followed by the names to use for its cpu, cuda and metal variants.
+// NOTE(review): the macro is imported from candle_core and its expansion is not visible here.
+test_device!(
+    quantized_matmul,
+    quantized_matmul_cpu,
+    quantized_matmul_cuda,
+    quantized_matmul_metal
+);
+test_device!(
+    quantized_matmul_neg,
+    quantized_matmul_neg_cpu,
+    quantized_matmul_neg_cuda,
+    quantized_matmul_neg_metal
+);
+/// Quantizes the ramp 0..128 to Q4_0, dequantizes it and compares against the exact expected
+/// values, then runs the GGML-style error check for this dtype.
+fn quantize_q4_0(device: &Device) -> Result<()> {
+    let ramp: Vec<f32> = (0..32 * 4).map(|v| v as f32).collect();
+    let input = Tensor::from_slice(&ramp, (32 * 4,), device)?;
+    let restored = quantized::QTensor::quantize(&input, GgmlDType::Q4_0)?.dequantize(device)?;
+    assert_eq!(
+        restored.to_vec1::<f32>()?,
+        &[
+            -0.0, -0.0, 3.875, 3.875, 3.875, 3.875, 7.75, 7.75, 7.75, 7.75, 11.625, 11.625, 11.625,
+            11.625, 15.5, 15.5, 15.5, 15.5, 19.375, 19.375, 19.375, 19.375, 23.25, 23.25, 23.25,
+            23.25, 27.125, 27.125, 27.125, 27.125, 31.0, 31.0, 31.5, 31.5, 31.5, 31.5, 39.375,
+            39.375, 39.375, 39.375, 39.375, 39.375, 39.375, 39.375, 47.25, 47.25, 47.25, 47.25,
+            47.25, 47.25, 47.25, 47.25, 55.125, 55.125, 55.125, 55.125, 55.125, 55.125, 55.125,
+            55.125, 63.0, 63.0, 63.0, 63.0, 59.375, 59.375, 71.25, 71.25, 71.25, 71.25, 71.25,
+            71.25, 71.25, 71.25, 71.25, 71.25, 71.25, 71.25, 83.125, 83.125, 83.125, 83.125,
+            83.125, 83.125, 83.125, 83.125, 83.125, 83.125, 83.125, 83.125, 95.0, 95.0, 95.0, 95.0,
+            95.0, 95.0, 95.25, 95.25, 95.25, 95.25, 95.25, 95.25, 95.25, 95.25, 111.125, 111.125,
+            111.125, 111.125, 111.125, 111.125, 111.125, 111.125, 111.125, 111.125, 111.125,
+            111.125, 111.125, 111.125, 111.125, 111.125, 127.0, 127.0, 127.0, 127.0, 127.0, 127.0,
+            127.0, 127.0
+        ]
+    );
+    ggml_quantization_error_test(GgmlDType::Q4_0, device, GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
+    Ok(())
+}
+/// Quantizes the ramp 0..128 to Q4_1, dequantizes it and compares the values rounded to three
+/// decimals against the expected ones, then runs the GGML-style error check for this dtype.
+fn quantize_q4_1(device: &Device) -> Result<()> {
+    let ramp: Vec<f32> = (0..32 * 4).map(|v| v as f32).collect();
+    let input = Tensor::from_slice(&ramp, (32 * 4,), device)?;
+    let restored = quantized::QTensor::quantize(&input, GgmlDType::Q4_1)?.dequantize(device)?;
+    assert_eq!(
+        round_vector(&restored.to_vec1::<f32>()?),
+        &[
+            0.0, 0.0, 2.066, 2.066, 4.133, 4.133, 6.199, 6.199, 8.266, 8.266, 10.332, 10.332,
+            12.398, 12.398, 14.465, 14.465, 16.531, 16.531, 18.598, 18.598, 20.664, 20.664, 22.73,
+            22.73, 24.797, 24.797, 26.863, 26.863, 28.93, 28.93, 30.996, 30.996, 32.0, 32.0,
+            34.066, 34.066, 36.133, 36.133, 38.199, 38.199, 40.266, 40.266, 42.332, 42.332, 44.398,
+            44.398, 46.465, 46.465, 48.531, 48.531, 50.598, 50.598, 52.664, 52.664, 54.73, 54.73,
+            56.797, 56.797, 58.863, 58.863, 60.93, 60.93, 62.996, 62.996, 64.0, 64.0, 66.066,
+            66.066, 68.133, 68.133, 70.199, 70.199, 72.266, 72.266, 74.332, 74.332, 76.398, 76.398,
+            78.465, 78.465, 80.531, 80.531, 82.598, 82.598, 84.664, 84.664, 86.73, 86.73, 88.797,
+            88.797, 90.863, 90.863, 92.93, 92.93, 94.996, 94.996, 96.0, 96.0, 98.066, 98.066,
+            100.133, 100.133, 102.199, 102.199, 104.266, 104.266, 106.332, 106.332, 108.398,
+            108.398, 110.465, 110.465, 112.531, 112.531, 114.598, 114.598, 116.664, 116.664,
+            118.73, 118.73, 120.797, 120.797, 122.863, 122.863, 124.93, 124.93, 126.996, 126.996
+        ]
+    );
+    ggml_quantization_error_test(GgmlDType::Q4_1, device, GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
+    Ok(())
+}
+/// Quantizes the ramp 0..128 to Q5_0, dequantizes it and compares the values rounded to three
+/// decimals against the expected ones, then runs the GGML-style error check for this dtype.
+fn quantize_q5_0(device: &Device) -> Result<()> {
+    let ramp: Vec<f32> = (0..32 * 4).map(|v| v as f32).collect();
+    let input = Tensor::from_slice(&ramp, (32 * 4,), device)?;
+    let restored = quantized::QTensor::quantize(&input, GgmlDType::Q5_0)?.dequantize(device)?;
+    assert_eq!(
+        round_vector(&restored.to_vec1::<f32>()?),
+        &[
+            -0.0, 1.938, 1.938, 3.875, 3.875, 5.813, 5.813, 7.75, 7.75, 9.688, 9.688, 11.625,
+            11.625, 13.563, 13.563, 15.5, 15.5, 17.438, 17.438, 19.375, 19.375, 21.313, 21.313,
+            23.25, 23.25, 25.188, 25.188, 27.125, 27.125, 29.063, 29.063, 31.0, 31.5, 31.5, 35.438,
+            35.438, 35.438, 35.438, 39.375, 39.375, 39.375, 39.375, 43.313, 43.313, 43.313, 43.313,
+            47.25, 47.25, 47.25, 47.25, 51.188, 51.188, 51.188, 51.188, 55.125, 55.125, 55.125,
+            55.125, 59.063, 59.063, 59.063, 59.063, 63.0, 63.0, 65.313, 65.313, 65.313, 65.313,
+            65.313, 71.25, 71.25, 71.25, 71.25, 71.25, 71.25, 77.188, 77.188, 77.188, 77.188,
+            77.188, 77.188, 83.125, 83.125, 83.125, 83.125, 83.125, 83.125, 89.063, 89.063, 89.063,
+            89.063, 89.063, 89.063, 95.0, 95.0, 95.0, 95.25, 95.25, 95.25, 95.25, 103.188, 103.188,
+            103.188, 103.188, 103.188, 103.188, 103.188, 103.188, 111.125, 111.125, 111.125,
+            111.125, 111.125, 111.125, 111.125, 111.125, 119.063, 119.063, 119.063, 119.063,
+            119.063, 119.063, 119.063, 119.063, 127.0, 127.0, 127.0, 127.0
+        ]
+    );
+    ggml_quantization_error_test(GgmlDType::Q5_0, device, GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
+    Ok(())
+}
+/// Quantizes the ramp 0..128 to Q5_1 and dequantizes it; after rounding to three decimals every
+/// integer of the ramp must be recovered. Then runs the GGML-style error check for this dtype.
+fn quantize_q5_1(device: &Device) -> Result<()> {
+    let ramp: Vec<f32> = (0..32 * 4).map(|v| v as f32).collect();
+    let input = Tensor::from_slice(&ramp, (32 * 4,), device)?;
+    let restored = quantized::QTensor::quantize(&input, GgmlDType::Q5_1)?.dequantize(device)?;
+    assert_eq!(
+        round_vector(&restored.to_vec1::<f32>()?),
+        &[
+            0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0,
+            16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0,
+            30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0,
+            44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0,
+            58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0,
+            72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, 81.0, 82.0, 83.0, 84.0, 85.0,
+            86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0,
+            100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, 110.0, 111.0,
+            112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, 120.0, 121.0, 122.0, 123.0,
+            124.0, 125.0, 126.0, 127.0
+        ]
+    );
+    ggml_quantization_error_test(GgmlDType::Q5_1, device, GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
+    Ok(())
+}
+/// Builds a 1-D tensor of `size` evenly spaced values ramping up from `-bound`.
+///
+/// Element `v` is `(v - size / 2) * bound / (size / 2)`, so index 0 holds `-bound` and index
+/// `size / 2` holds 0. Panics when `size` is not a multiple of `QK_K`.
+fn get_test_vector2(bound: f32, size: usize, device: &Device) -> Result<Tensor> {
+    let block_len = crate::quantized::k_quants::QK_K;
+    assert!(
+        size % block_len == 0,
+        "size must be a multiple of {}",
+        block_len
+    );
+    let half = size as f32 / 2.;
+    let ramp: Vec<f32> = (0..size)
+        .map(|v| (v as f32 - half) * bound / half)
+        .collect();
+    // Sanity check on both anchor points of the ramp.
+    assert_eq!([ramp[0], ramp[size / 2]], [-bound, 0.0]);
+    Tensor::from_vec(ramp, (size,), device)
+}
+/// Rounds every element of `values` to three decimal places.
+fn round_vector(values: &[f32]) -> Vec<f32> {
+    let mut rounded = Vec::with_capacity(values.len());
+    for value in values {
+        // Scale up, round to the nearest integer, scale back down.
+        rounded.push((1000. * value).round() / 1000.);
+    }
+    rounded
+}
+/// Asserts that every element of `values` lies strictly within `tolerance` of the element of
+/// `expected` at the same index.
+///
+/// Panics on the first offending index, or when the two slices differ in length. A NaN
+/// difference also fails, since `NaN < tolerance` is false.
+fn compare_with_error(values: &[f32], expected: &[f32], tolerance: f32) {
+    // `zip` stops at the shorter slice, so without this check a truncated output would pass.
+    assert_eq!(
+        values.len(),
+        expected.len(),
+        "Length mismatch: got {} values but expected {}.",
+        values.len(),
+        expected.len()
+    );
+    for (i, (value, expected_value)) in values.iter().zip(expected).enumerate() {
+        let difference = (value - expected_value).abs();
+        assert!(
+            difference < tolerance,
+            "Error at index {}: value = {}, expected = {}. Difference = {} exceeds tolerance = {}.",
+            i,
+            value,
+            expected_value,
+            difference,
+            tolerance
+        );
+    }
+}
+/// Creates a vector similar to the ones used in GGML unit tests:
+/// https://github.com/ggerganov/llama.cpp/blob/master/tests/test-quantize-fns.cpp#L26-L30
+///
+/// Element `i` of the `GGML_TEST_SIZE` values is `0.1 + 2.0 * cos(i + offset)`.
+fn create_ggml_like_vector(offset: f32) -> Vec<f32> {
+    let mut values = Vec::with_capacity(GGML_TEST_SIZE);
+    for i in 0..GGML_TEST_SIZE {
+        let angle = i as f32 + offset;
+        values.push(0.1 + 2.0 * angle.cos());
+    }
+    values
+}
+/// Calculates the root mean square error between two vectors
+fn calculate_rmse(a: &[f32], b: &[f32]) -> f32 {
+    assert_eq!(a.len(), b.len());
+    let sum = a
+        .iter()
+        .zip(b)
+        .map(|(a, b)| (a - b).powi(2))
+        .sum::<f32>()
+        .sqrt();
+    sum / a.len() as f32
+}
+/// Similar to the GGML quantization unit test:
+/// https://github.com/ggerganov/llama.cpp/blob/master/tests/test-quantize-fns.cpp#L43-L50
+///
+/// Round-trips the GGML-like test vector through `dtype` and returns an error when the value
+/// computed by `calculate_rmse` is greater than `max_error`.
+fn ggml_quantization_error_test(dtype: GgmlDType, device: &Device, max_error: f32) -> Result<()> {
+    let reference = create_ggml_like_vector(0.0);
+    let src = Tensor::from_slice(&reference, (GGML_TEST_SIZE,), device)?;
+    let round_tripped = quantized::QTensor::quantize(&src, dtype)?.dequantize(device)?;
+    let error = calculate_rmse(&src.to_vec1::<f32>()?, &round_tripped.to_vec1::<f32>()?);
+    if error > max_error {
+        bail!(
+            "Quantization error {} exceeds max error {}",
+            error,
+            max_error
+        );
+    }
+    Ok(())
+}
+fn quantize_q2k(device: &Device) -> Result<()> {
+    let dtype = GgmlDType::Q2K;
+    let src = get_test_vector2(0.5, 1024, device)?;
+    let quant = quantized::QTensor::quantize(&src, dtype)?;
+    let dst = quant.dequantize(device)?;
+    let src = src.to_vec1::<f32>()?;
+    let dst = dst.to_vec1::<f32>()?;
+    compare_with_error(dst.as_slice(), src.as_slice(), 0.1);
+    // Test some specific values
+    assert_eq!(
+        [src[0], src[128], src[256], src[512], src[800], src[1023]],
+        [-0.5, -0.375, -0.25, 0.0, 0.28125, 0.49902344]
+    );
+    let dst = round_vector(&dst);
+    assert_eq!(
+        [dst[0], dst[128], dst[256], dst[512], dst[800], dst[1023]],
+        [-0.499, -0.366, -0.249, 0.0, 0.295, 0.492]
+    );
+    let src_big = get_test_vector2(128.0, 1024, device)?;
+    let quant_big = quantized::QTensor::quantize(&src_big, dtype)?;
+    let dst_big = quant_big.dequantize(device)?;
+    let src_big = src_big.to_vec1::<f32>()?;
+    let dst_big = dst_big.to_vec1::<f32>()?;
+    compare_with_error(dst_big.as_slice(), src_big.as_slice(), 6.0);
+    ggml_quantization_error_test(dtype, device, GGML_MAX_QUANTIZATION_TOTAL_ERROR_2BITS)?;
+    Ok(())
+}
+fn quantize_q3k(device: &Device) -> Result<()> {
+    let dtype = GgmlDType::Q3K;
+    let src = get_test_vector2(0.5, 1024, device)?;
+    let quant = quantized::QTensor::quantize(&src, dtype)?;
+    let dst = quant.dequantize(device)?;
+    let src = src.to_vec1::<f32>()?;
+    let dst = dst.to_vec1::<f32>()?;
+    compare_with_error(dst.as_slice(), src.as_slice(), 0.03);
+    // Test some specific values
+    assert_eq!(
+        [src[0], src[128], src[256], src[512], src[800], src[1023]],
+        [-0.5, -0.375, -0.25, 0.0, 0.28125, 0.49902344]
+    );
+    let dst = round_vector(&dst);
+    assert_eq!(
+        [dst[0], dst[128], dst[256], dst[512], dst[800], dst[1023]],
+        [-0.493, -0.37, -0.243, -0.0, 0.292, 0.492]
+    );
+    let src_big = get_test_vector2(128.0, 1024, device)?;
+    let quant_big = quantized::QTensor::quantize(&src_big, dtype)?;
+    let dst_big = quant_big.dequantize(device)?;
+    let src_big = src_big.to_vec1::<f32>()?;
+    let dst_big = dst_big.to_vec1::<f32>()?;
+    compare_with_error(dst_big.as_slice(), src_big.as_slice(), 3.5);
+    ggml_quantization_error_test(dtype, device, GGML_MAX_QUANTIZATION_TOTAL_ERROR_3BITS)?;
+    Ok(())
+}
+fn quantize_q4k(device: &Device) -> Result<()> {
+    let dtype = GgmlDType::Q4K;
+    let src = get_test_vector2(0.5, 1024, device)?;
+    let quant = quantized::QTensor::quantize(&src, dtype)?;
+    let dst = quant.dequantize(device)?;
+    let src = src.to_vec1::<f32>()?;
+    let dst = dst.to_vec1::<f32>()?;
+    compare_with_error(dst.as_slice(), src.as_slice(), 0.017);
+    // Test some specific values
+    assert_eq!(
+        [src[0], src[128], src[256], src[512], src[800], src[1023]],
+        [-0.5, -0.375, -0.25, 0.0, 0.28125, 0.49902344]
+    );
+    let dst = round_vector(&dst);
+    assert_eq!(
+        [dst[0], dst[128], dst[256], dst[512], dst[800], dst[1023]],
+        [-0.5, -0.373, -0.25, 0.0, 0.288, 0.498]
+    );
+    let src_big = get_test_vector2(128.0, 1024, device)?;
+    let quant_big = quantized::QTensor::quantize(&src_big, dtype)?;
+    let dst_big = quant_big.dequantize(device)?;
+    let src_big = src_big.to_vec1::<f32>()?;
+    let dst_big = dst_big.to_vec1::<f32>()?;
+    compare_with_error(dst_big.as_slice(), src_big.as_slice(), 4.5);
+    ggml_quantization_error_test(dtype, device, GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
+    Ok(())
+}
+fn quantize_q5k(device: &Device) -> Result<()> {
+    let dtype = GgmlDType::Q5K;
+    let src = get_test_vector2(0.5, 1024, device)?;
+    let quant = quantized::QTensor::quantize(&src, dtype)?;
+    let dst = quant.dequantize(device)?;
+    let src = src.to_vec1::<f32>()?;
+    let dst = dst.to_vec1::<f32>()?;
+    compare_with_error(dst.as_slice(), src.as_slice(), 0.009);
+    // Test some specific values
+    assert_eq!(
+        [src[0], src[128], src[256], src[512], src[800], src[1023]],
+        [-0.5, -0.375, -0.25, 0.0, 0.28125, 0.49902344]
+    );
+    let dst = round_vector(&dst);
+    assert_eq!(
+        [dst[0], dst[128], dst[256], dst[512], dst[800], dst[1023]],
+        [-0.5, -0.373, -0.25, 0.0, 0.279, 0.499]
+    );
+    let src_big = get_test_vector2(128.0, 1024, device)?;
+    let quant_big = quantized::QTensor::quantize(&src_big, dtype)?;
+    let dst_big = quant_big.dequantize(device)?;
+    let src_big = src_big.to_vec1::<f32>()?;
+    let dst_big = dst_big.to_vec1::<f32>()?;
+    compare_with_error(dst_big.as_slice(), src_big.as_slice(), 2.5);
+    ggml_quantization_error_test(dtype, device, GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
+    Ok(())
+}
+fn quantize_q6k(device: &Device) -> Result<()> {
+    let dtype = GgmlDType::Q6K;
+    let src = get_test_vector2(0.5, 1024, device)?;
+    let quant = quantized::QTensor::quantize(&src, dtype)?;
+    let dst = quant.dequantize(device)?;
+    let src = src.to_vec1::<f32>()?;
+    let dst = dst.to_vec1::<f32>()?;
+    compare_with_error(dst.as_slice(), src.as_slice(), 0.008);
+    // Test some specific values
+    assert_eq!(
+        [src[0], src[128], src[256], src[512], src[800], src[1023]],
+        [-0.5, -0.375, -0.25, 0.0, 0.28125, 0.49902344]
+    );
+    let dst = round_vector(&dst);
+    assert_eq!(
+        [dst[0], dst[128], dst[256], dst[512], dst[800], dst[1023]],
+        [-0.497, -0.372, -0.25, -0.0, 0.284, 0.5]
+    );
+    let src_big = get_test_vector2(128.0, 1024, device)?;
+    let quant_big = quantized::QTensor::quantize(&src_big, dtype)?;
+    let dst_big = quant_big.dequantize(device)?;
+    let src_big = src_big.to_vec1::<f32>()?;
+    let dst_big = dst_big.to_vec1::<f32>()?;
+    compare_with_error(dst_big.as_slice(), src_big.as_slice(), 2.0);
+    ggml_quantization_error_test(dtype, device, GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
+    Ok(())
+}
+fn quantize_q8k(device: &Device) -> Result<()> {
+    let dtype = GgmlDType::Q8K;
+    let src = get_test_vector2(0.5, 1024, device)?;
+    let quant = quantized::QTensor::quantize(&src, dtype)?;
+    let dst = quant.dequantize(device)?;
+    let src = src.to_vec1::<f32>()?;
+    let dst = dst.to_vec1::<f32>()?;
+    compare_with_error(dst.as_slice(), src.as_slice(), 0.008);
+    // Test some specific values
+    assert_eq!(
+        [src[0], src[128], src[256], src[512], src[800], src[1023]],
+        [-0.5, -0.375, -0.25, 0.0, 0.28125, 0.49902344]
+    );
+    let dst = round_vector(&dst);
+    assert_eq!(
+        [dst[0], dst[128], dst[256], dst[512], dst[800], dst[1023]],
+        [-0.5, -0.375, -0.25, -0.0, 0.281, 0.499]
+    );
+    let src_big = get_test_vector2(128.0, 1024, device)?;
+    let quant_big = quantized::QTensor::quantize(&src_big, dtype)?;
+    let dst_big = quant_big.dequantize(device)?;
+    let src_big = src_big.to_vec1::<f32>()?;
+    let dst_big = dst_big.to_vec1::<f32>()?;
+    compare_with_error(dst_big.as_slice(), src_big.as_slice(), 0.6);
+    ggml_quantization_error_test(dtype, device, GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
+    Ok(())
+}
+// Register every quantize_* round-trip test through `test_device!`: each invocation passes the
+// shared test function followed by the names to use for its cpu, cuda and metal variants.
+// NOTE(review): the macro is imported from candle_core and its expansion is not visible here.
+test_device!(
+    quantize_q4_0,
+    quantize_q4_0_cpu,
+    quantize_q4_0_cuda,
+    quantize_q4_0_metal
+);
+test_device!(
+    quantize_q4_1,
+    quantize_q4_1_cpu,
+    quantize_q4_1_cuda,
+    quantize_q4_1_metal
+);
+test_device!(
+    quantize_q5_0,
+    quantize_q5_0_cpu,
+    quantize_q5_0_cuda,
+    quantize_q5_0_metal
+);
+test_device!(
+    quantize_q5_1,
+    quantize_q5_1_cpu,
+    quantize_q5_1_cuda,
+    quantize_q5_1_metal
+);
+test_device!(
+    quantize_q2k,
+    quantize_q2k_cpu,
+    quantize_q2k_cuda,
+    quantize_q2k_metal
+);
+test_device!(
+    quantize_q3k,
+    quantize_q3k_cpu,
+    quantize_q3k_cuda,
+    quantize_q3k_metal
+);
+test_device!(
+    quantize_q4k,
+    quantize_q4k_cpu,
+    quantize_q4k_cuda,
+    quantize_q4k_metal
+);
+test_device!(
+    quantize_q5k,
+    quantize_q5k_cpu,
+    quantize_q5k_cuda,
+    quantize_q5k_metal
+);
+test_device!(
+    quantize_q6k,
+    quantize_q6k_cpu,
+    quantize_q6k_cuda,
+    quantize_q6k_metal
+);
+test_device!(
+    quantize_q8k,
+    quantize_q8k_cpu,
+    quantize_q8k_cuda,
+    quantize_q8k_metal
+);
+/// Plain f32 dot product used as the unquantized reference.
+///
+/// Pairs are formed with `zip`, so any extra elements of the longer slice are ignored.
+fn vec_dot_reference(a: &[f32], b: &[f32]) -> f32 {
+    let products = a.iter().zip(b.iter()).map(|(&lhs, &rhs)| lhs * rhs);
+    products.sum::<f32>()
+}
+/// Returns the error achieved by the GGML matmul unit test.
+fn ggml_reference_matmul_error(dtype: GgmlDType) -> Result<f32> {
+    let err = match dtype {
+        GgmlDType::F16 => 0.000010,
+        GgmlDType::Q2K => 0.004086,
+        GgmlDType::Q3K => 0.016148,
+        GgmlDType::Q4K => 0.002425,
+        GgmlDType::Q5K => 0.000740,
+        GgmlDType::Q6K => 0.000952,
+        GgmlDType::Q4_0 => 0.001143,
+        GgmlDType::Q4_1 => 0.008,
+        GgmlDType::Q5_0 => 0.001353,
+        GgmlDType::Q5_1 => 0.00149,
+        GgmlDType::Q8_0 => 0.000092,
+        // Not from the ggml repo.
+        GgmlDType::Q8K => 0.00065,
+        _ => bail!("No GGML results for quantization type {dtype:?}",),
+    };
+    Ok(err)
+}
+/// Similar to the GGML matmul unit test:
+/// https://github.com/ggerganov/llama.cpp/blob/master/tests/test-quantize-fns.cpp#L76-L91
+///
+/// Runs the dot product error check for block type `T` on two input pairs.
+fn ggml_matmul_error_test<T: GgmlType>() -> Result<()> {
+    // First pair: the GGML-like cosine vectors with offsets 0 and 1.
+    let cos_a = create_ggml_like_vector(0.0);
+    let cos_b = create_ggml_like_vector(1.0);
+    ggml_matmul_error_test_::<T>(&cos_a, &cos_b, 1.0)?;
+    // Another example that is more likely to trigger the overflow reported in #1526: the same
+    // ramp over [0, 1) is used for both operands, with twice the error allowance.
+    let ramp: Vec<f32> = (0..GGML_TEST_SIZE)
+        .map(|i| i as f32 / GGML_TEST_SIZE as f32)
+        .collect();
+    ggml_matmul_error_test_::<T>(&ramp, &ramp, 2.0)?;
+    Ok(())
+}
+/// Quantizes `a` as `T` and `b` as `T::VecDotType`, then checks the quantized dot product.
+///
+/// Three conditions are verified in order: the optimised and unoptimised kernels agree, the
+/// per-element error against the f32 reference is finite and at most
+/// `GGML_MAX_DOT_PRODUCT_ERROR`, and that error does not exceed the GGML reference error scaled
+/// by `err_m` (plus a small leniency).
+fn ggml_matmul_error_test_<T: GgmlType>(a: &[f32], b: &[f32], err_m: f32) -> Result<()> {
+    let length = a.len();
+    let mut a_quant = vec![T::zeros(); length / T::BLCK_SIZE];
+    T::from_float(a, &mut a_quant)?;
+    let mut b_quant = vec![T::VecDotType::zeros(); length / T::VecDotType::BLCK_SIZE];
+    T::VecDotType::from_float(b, &mut b_quant)?;
+    let result = T::vec_dot(length, &a_quant, &b_quant)?;
+    let result_unopt = T::vec_dot_unopt(length, &a_quant, &b_quant)?;
+    let reference_result = vec_dot_reference(a, b);
+    let element_count = length as f32;
+    let kernel_divergence = (result - result_unopt).abs() / element_count;
+    if kernel_divergence > 1e-6 {
+        bail!(
+            "the opt and unopt vec-dot returned different values, opt {result}, unopt {result_unopt}"
+        )
+    }
+    let error = (result - reference_result).abs() / element_count;
+    let ggml_error = ggml_reference_matmul_error(T::DTYPE)? * err_m;
+    if !error.is_finite() || error > GGML_MAX_DOT_PRODUCT_ERROR {
+        bail!("Dot product error {error} exceeds max error {GGML_MAX_DOT_PRODUCT_ERROR}",);
+    }
+    // We diverge slightly due to different rounding behavior / f16 to f32 conversions in GGML
+    // => we use a slightly higher error threshold
+    const ERROR_LENIENCY: f32 = 0.00001;
+    let exceeds_reference = error - ERROR_LENIENCY > ggml_error;
+    if exceeds_reference {
+        bail!(
+            "Dot product error {} exceeds ggml reference error {}",
+            error,
+            ggml_error
+        );
+    }
+    Ok(())
+}
+/// Runs the vec-dot error check for each of the listed block types, returning the first
+/// failure.
+#[test]
+fn quantized_mm() -> Result<()> {
+    ggml_matmul_error_test::<k_quants::BlockQ4_0>()?;
+    ggml_matmul_error_test::<k_quants::BlockQ4_1>()?;
+    ggml_matmul_error_test::<k_quants::BlockQ5_0>()?;
+    ggml_matmul_error_test::<k_quants::BlockQ5_1>()?;
+    ggml_matmul_error_test::<k_quants::BlockQ8_0>()?;
+    Ok(())
+}
+/// Builds seeded pseudo-random tensors of size `m x k` and `n x k` and returns them together
+/// with their reference product `lhs.matmul(rhs.t())`, as `(lhs, rhs, mm)`.
+fn get_random_tensors(
+    m: usize,
+    k: usize,
+    n: usize,
+    device: &Device,
+) -> Result<(Tensor, Tensor, Tensor)> {
+    let mut rng = StdRng::seed_from_u64(314159265358979);
+    // Draws `count` samples, each shifted by -0.5. `lhs` is drawn before `rhs` so the seeded
+    // sequence is consumed in the same order on every run.
+    let mut sample = |count: usize| -> Vec<f32> {
+        (0..count).map(|_| rng.gen::<f32>() - 0.5).collect()
+    };
+    let lhs = Tensor::from_vec(sample(m * k), (m, k), device)?;
+    let rhs = Tensor::from_vec(sample(n * k), (n, k), device)?;
+    let mm = lhs.matmul(&rhs.t()?)?;
+    Ok((lhs, rhs, mm))
+}
+#[macro_export]
+macro_rules! quantized_matmul {
+    // Defines `$fn_name(device)`, which runs `test_matmul` for `$dtype`, and hands it plus the three per-device names to `test_device!`.
+    // TODO: Generate the `_cpu`/`_cuda`/`_metal` names automatically once concat_idents is stable. https://github.com/rust-lang/rust/issues/29599
+    ($fn_name: ident, $fn_name_cpu: ident, $fn_name_cuda: ident, $fn_name_metal: ident, $dtype: expr) => {
+        fn $fn_name(device: &Device) -> Result<()> {
+            test_matmul(device, (1, 3, 4, 256), $dtype)?; // same fixed 4-tuple argument for every dtype
+            Ok(())
+        }
+        test_device!($fn_name, $fn_name_cpu, $fn_name_cuda, $fn_name_metal);
+    };
+}
+quantized_matmul!( // Q4_0: defines `quantized_matmul_q4_0_bis` and forwards the cpu/cuda/metal names to `test_device!`
+    quantized_matmul_q4_0_bis,
+    quantized_matmul_q4_0_cpu,
+    quantized_matmul_q4_0_cuda,
+    quantized_matmul_q4_0_metal,
+    GgmlDType::Q4_0
+);
+quantized_matmul!( // Q4_1
+    quantized_matmul_q4_1_bis,
+    quantized_matmul_q4_1_cpu,
+    quantized_matmul_q4_1_cuda,
+    quantized_matmul_q4_1_metal,
+    GgmlDType::Q4_1
+);
+quantized_matmul!( // Q5_0
+    quantized_matmul_q5_0_bis,
+    quantized_matmul_q5_0_cpu,
+    quantized_matmul_q5_0_cuda,
+    quantized_matmul_q5_0_metal,
+    GgmlDType::Q5_0
+);
+quantized_matmul!( // Q5_1
+    quantized_matmul_q5_1_bis,
+    quantized_matmul_q5_1_cpu,
+    quantized_matmul_q5_1_cuda,
+    quantized_matmul_q5_1_metal,
+    GgmlDType::Q5_1
+);
+quantized_matmul!( // Q8_0
+    quantized_matmul_q8_0_bis,
+    quantized_matmul_q8_0_cpu,
+    quantized_matmul_q8_0_cuda,
+    quantized_matmul_q8_0_metal,
+    GgmlDType::Q8_0
+);
+// Not implemented in Ggml
+// quantized_matmul!(
+//     quantized_matmul_q8_1_bis,
+//     quantized_matmul_q8_1_cpu,
+//     quantized_matmul_q8_1_cuda,
+//     quantized_matmul_q8_1_metal,
+//     GgmlDType::Q8_1
+// );
+// TODO: This is bugged (also bugged in GGML).
+quantized_matmul!( // Q2K: defines `quantized_matmul_q2k_bis` and forwards the cpu/cuda/metal names to `test_device!`
+    quantized_matmul_q2k_bis,
+    quantized_matmul_q2k_cpu,
+    quantized_matmul_q2k_cuda,
+    quantized_matmul_q2k_metal,
+    GgmlDType::Q2K
+);
+quantized_matmul!( // Q3K
+    quantized_matmul_q3k_bis,
+    quantized_matmul_q3k_cpu,
+    quantized_matmul_q3k_cuda,
+    quantized_matmul_q3k_metal,
+    GgmlDType::Q3K
+);
+quantized_matmul!( // Q4K
+    quantized_matmul_q4k_bis,
+    quantized_matmul_q4k_cpu,
+    quantized_matmul_q4k_cuda,
+    quantized_matmul_q4k_metal,
+    GgmlDType::Q4K
+);
+quantized_matmul!( // Q5K
+    quantized_matmul_q5k_bis,
+    quantized_matmul_q5k_cpu,
+    quantized_matmul_q5k_cuda,
+    quantized_matmul_q5k_metal,
+    GgmlDType::Q5K
+);
+quantized_matmul!( // Q6K
+    quantized_matmul_q6k_bis,
+    quantized_matmul_q6k_cpu,
+    quantized_matmul_q6k_cuda,
+    quantized_matmul_q6k_metal,
+    GgmlDType::Q6K
+);
+// Not implemented on metal
+// quantized_matmul!(
+//     quantized_matmul_q8k_bis,
+//     quantized_matmul_q8k_cpu,
+//     quantized_matmul_q8k_cuda,
+//     quantized_matmul_q8k_metal,
+//     GgmlDType::Q8K
+// );
+/// Compares the f32 reference product and the Q2K-quantized product at four sampled positions
+/// against hard-coded rounded values, then runs the `BlockQ2K` dot-product error check.
+#[test]
+fn quantized_matmul_q2k() -> Result<()> {
+    use k_quants::BlockQ2K;
+    let (m, k, n) = (11, 512, 21);
+    let (lhs, rhs, reference) = get_random_tensors(m, k, n, &Device::Cpu)?;
+    // Flattened offsets that get compared: first, one third, two thirds and last element.
+    let probes = [0, m * n / 3, m * n * 2 / 3, m * n - 1];
+    assert_eq!(reference.dims(), [m, n]);
+    let flat = reference.flatten_all()?.to_vec1::<f32>()?;
+    assert_eq!(
+        round_vector(&probes.map(|i| flat[i])),
+        [1.262, 1.513, -0.208, 1.702]
+    );
+    let quantized_rhs = quantized::QTensor::quantize(&rhs, GgmlDType::Q2K)?;
+    let matmul = quantized::QMatMul::from_qtensor(quantized_rhs)?;
+    let product = matmul.forward(&lhs)?;
+    assert_eq!(product.dims(), [m, n]);
+    let flat = product.flatten_all()?.to_vec1::<f32>()?;
+    assert_eq!(
+        round_vector(&probes.map(|i| flat[i])),
+        [0.916, 0.422, 0.215, 1.668]
+    );
+    ggml_matmul_error_test::<BlockQ2K>()?;
+    Ok(())
+}
+/// Compares the f32 reference product and the Q3K-quantized product at four sampled positions
+/// against hard-coded rounded values, then runs the `BlockQ3K` dot-product error check.
+#[test]
+fn quantized_matmul_q3k() -> Result<()> {
+    use k_quants::BlockQ3K;
+    let (m, k, n) = (11, 512, 21);
+    let (lhs, rhs, reference) = get_random_tensors(m, k, n, &Device::Cpu)?;
+    // Flattened offsets that get compared: first, one third, two thirds and last element.
+    let probes = [0, m * n / 3, m * n * 2 / 3, m * n - 1];
+    assert_eq!(reference.dims(), [m, n]);
+    let flat = reference.flatten_all()?.to_vec1::<f32>()?;
+    assert_eq!(
+        round_vector(&probes.map(|i| flat[i])),
+        [1.262, 1.513, -0.208, 1.702]
+    );
+    let quantized_rhs = quantized::QTensor::quantize(&rhs, GgmlDType::Q3K)?;
+    let matmul = quantized::QMatMul::from_qtensor(quantized_rhs)?;
+    let product = matmul.forward(&lhs)?;
+    assert_eq!(product.dims(), [m, n]);
+    let flat = product.flatten_all()?.to_vec1::<f32>()?;
+    assert_eq!(
+        round_vector(&probes.map(|i| flat[i])),
+        [1.029, 1.418, -0.314, 1.495]
+    );
+    ggml_matmul_error_test::<BlockQ3K>()?;
+    Ok(())
+}
+/// Compares the f32 reference product and the Q4K-quantized product at four sampled positions
+/// against hard-coded rounded values, then runs the `BlockQ4K` dot-product error check.
+#[test]
+fn quantized_matmul_q4k() -> Result<()> {
+    use k_quants::BlockQ4K;
+    let (m, k, n) = (11, 512, 21);
+    let (lhs, rhs, reference) = get_random_tensors(m, k, n, &Device::Cpu)?;
+    // Flattened offsets that get compared: first, one third, two thirds and last element.
+    let probes = [0, m * n / 3, m * n * 2 / 3, m * n - 1];
+    assert_eq!(reference.dims(), [m, n]);
+    let flat = reference.flatten_all()?.to_vec1::<f32>()?;
+    assert_eq!(
+        round_vector(&probes.map(|i| flat[i])),
+        [1.262, 1.513, -0.208, 1.702]
+    );
+    let quantized_rhs = quantized::QTensor::quantize(&rhs, GgmlDType::Q4K)?;
+    let matmul = quantized::QMatMul::from_qtensor(quantized_rhs)?;
+    let product = matmul.forward(&lhs)?;
+    assert_eq!(product.dims(), [m, n]);
+    let flat = product.flatten_all()?.to_vec1::<f32>()?;
+    assert_eq!(
+        round_vector(&probes.map(|i| flat[i])),
+        [1.125, 1.435, -0.201, 1.589]
+    );
+    ggml_matmul_error_test::<BlockQ4K>()?;
+    Ok(())
+}
+/// Compares the f32 reference product and the Q5K-quantized product at four sampled positions
+/// against hard-coded rounded values, then runs the `BlockQ5K` dot-product error check.
+#[test]
+fn quantized_matmul_q5k() -> Result<()> {
+    use k_quants::BlockQ5K;
+    let (m, k, n) = (11, 512, 21);
+    let (lhs, rhs, reference) = get_random_tensors(m, k, n, &Device::Cpu)?;
+    // Flattened offsets that get compared: first, one third, two thirds and last element.
+    let probes = [0, m * n / 3, m * n * 2 / 3, m * n - 1];
+    assert_eq!(reference.dims(), [m, n]);
+    let flat = reference.flatten_all()?.to_vec1::<f32>()?;
+    assert_eq!(
+        round_vector(&probes.map(|i| flat[i])),
+        [1.262, 1.513, -0.208, 1.702]
+    );
+    let quantized_rhs = quantized::QTensor::quantize(&rhs, GgmlDType::Q5K)?;
+    let matmul = quantized::QMatMul::from_qtensor(quantized_rhs)?;
+    let product = matmul.forward(&lhs)?;
+    assert_eq!(product.dims(), [m, n]);
+    let flat = product.flatten_all()?.to_vec1::<f32>()?;
+    assert_eq!(
+        round_vector(&probes.map(|i| flat[i])),
+        [1.192, 1.491, -0.18, 1.743]
+    );
+    //Expected: 0.000740408897
+    ggml_matmul_error_test::<BlockQ5K>()?;
+    Ok(())
+}
+/// Compares the f32 reference product and the Q6K-quantized product at four sampled positions
+/// against hard-coded rounded values, then runs the `BlockQ6K` dot-product error check.
+#[test]
+fn quantized_matmul_q6k() -> Result<()> {
+    use k_quants::BlockQ6K;
+    let (m, k, n) = (11, 512, 21);
+    let (lhs, rhs, reference) = get_random_tensors(m, k, n, &Device::Cpu)?;
+    // Flattened offsets that get compared: first, one third, two thirds and last element.
+    let probes = [0, m * n / 3, m * n * 2 / 3, m * n - 1];
+    assert_eq!(reference.dims(), [m, n]);
+    let flat = reference.flatten_all()?.to_vec1::<f32>()?;
+    assert_eq!(
+        round_vector(&probes.map(|i| flat[i])),
+        [1.262, 1.513, -0.208, 1.702]
+    );
+    let quantized_rhs = quantized::QTensor::quantize(&rhs, GgmlDType::Q6K)?;
+    let matmul = quantized::QMatMul::from_qtensor(quantized_rhs)?;
+    let product = matmul.forward(&lhs)?;
+    assert_eq!(product.dims(), [m, n]);
+    let flat = product.flatten_all()?.to_vec1::<f32>()?;
+    assert_eq!(
+        round_vector(&probes.map(|i| flat[i])),
+        [1.324, 1.49, -0.164, 1.741]
+    );
+    ggml_matmul_error_test::<BlockQ6K>()?;
+    Ok(())
+}
+/// Compares the f32 reference product and the Q8K-quantized product at four sampled positions
+/// against hard-coded rounded values, then runs the `BlockQ8K` dot-product error check.
+#[test]
+fn quantized_matmul_q8k() -> Result<()> {
+    use k_quants::BlockQ8K;
+    let (m, k, n) = (11, 512, 21);
+    let (lhs, rhs, reference) = get_random_tensors(m, k, n, &Device::Cpu)?;
+    // Flattened offsets that get compared: first, one third, two thirds and last element.
+    let probes = [0, m * n / 3, m * n * 2 / 3, m * n - 1];
+    assert_eq!(reference.dims(), [m, n]);
+    let flat = reference.flatten_all()?.to_vec1::<f32>()?;
+    assert_eq!(
+        round_vector(&probes.map(|i| flat[i])),
+        [1.262, 1.513, -0.208, 1.702]
+    );
+    let quantized_rhs = quantized::QTensor::quantize(&rhs, GgmlDType::Q8K)?;
+    let matmul = quantized::QMatMul::from_qtensor(quantized_rhs)?;
+    let product = matmul.forward(&lhs)?;
+    assert_eq!(product.dims(), [m, n]);
+    let flat = product.flatten_all()?.to_vec1::<f32>()?;
+    assert_eq!(
+        round_vector(&probes.map(|i| flat[i])),
+        [1.266, 1.504, -0.204, 1.7]
+    );
+    ggml_matmul_error_test::<BlockQ8K>()?;
+    Ok(())
+}
--- a/candle-core/tests/serialization_tests.rs
+++ b/candle-core/tests/serialization_tests.rs
+use candle_core::{DType, Result, Tensor};
+/// Reads `tests/test.npy` and expects, after conversion to `U8`, the values 0 through 9.
+#[test]
+fn npy() -> Result<()> {
+    let loaded = Tensor::read_npy("tests/test.npy")?;
+    let values = loaded.to_dtype(DType::U8)?.to_vec1::<u8>()?;
+    let expected: Vec<u8> = (0..10).collect();
+    assert_eq!(values, expected);
+    Ok(())
+}
+/// Reads `tests/test.npz` and expects two entries named `x` and `x_plus_one`, the second
+/// holding, after conversion to `U8`, the values 1 through 10.
+#[test]
+fn npz() -> Result<()> {
+    let entries = Tensor::read_npz("tests/test.npz")?;
+    assert_eq!(entries.len(), 2);
+    assert_eq!(entries[0].0, "x");
+    assert_eq!(entries[1].0, "x_plus_one");
+    let values = entries[1].1.to_dtype(DType::U8)?.to_vec1::<u8>()?;
+    let expected: Vec<u8> = (1..=10).collect();
+    assert_eq!(values, expected);
+    Ok(())
+}
--- a/candle-core/tests/tensor_tests.rs
+++ b/candle-core/tests/tensor_tests.rs
+use candle_core::{test_device, test_utils, DType, Device, IndexOp, Result, Tensor, D};
+/// A `(5, 2)` F32 tensor built with `Tensor::zeros` must report exactly those two dimensions.
+fn zeros(device: &Device) -> Result<()> {
+    let shape = Tensor::zeros((5, 2), DType::F32, device)?.dims2()?;
+    assert_eq!(shape, (5, 2));
+    Ok(())
+}
+/// `Tensor::ones` must yield a 2x3 grid of ones for the U8, U32, I64, F32 and F64 dtypes.
+fn ones(device: &Device) -> Result<()> {
+    let shape: (usize, usize) = (2, 3);
+    let as_u8 = Tensor::ones(shape, DType::U8, device)?.to_vec2::<u8>()?;
+    assert_eq!(as_u8, [[1, 1, 1], [1, 1, 1]]);
+    let as_u32 = Tensor::ones(shape, DType::U32, device)?.to_vec2::<u32>()?;
+    assert_eq!(as_u32, [[1, 1, 1], [1, 1, 1]]);
+    let as_i64 = Tensor::ones(shape, DType::I64, device)?.to_vec2::<i64>()?;
+    assert_eq!(as_i64, [[1, 1, 1], [1, 1, 1]]);
+    let as_f32 = Tensor::ones(shape, DType::F32, device)?.to_vec2::<f32>()?;
+    assert_eq!(as_f32, [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]);
+    let as_f64 = Tensor::ones(shape, DType::F64, device)?.to_vec2::<f64>()?;
+    assert_eq!(as_f64, [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]);
+    Ok(())
+}
+/// `Tensor::full` with the value `42u32` and shape `(2, 3)` must produce six copies of 42.
+fn full(device: &Device) -> Result<()> {
+    let filled = Tensor::full(42u32, (2, 3), device)?;
+    assert_eq!(filled.to_vec2::<u32>()?, [[42u32; 3]; 2]);
+    Ok(())
+}
+/// Checks `arange` and `arange_step` with steps 1, 2, 3 and a negative step over `i64`.
+fn arange(device: &Device) -> Result<()> {
+    let unit_step = Tensor::arange(0u8, 5u8, device)?;
+    assert_eq!(unit_step.to_vec1::<u8>()?, [0, 1, 2, 3, 4]);
+    let step_two = Tensor::arange_step(0u8, 5u8, 2, device)?;
+    assert_eq!(step_two.to_vec1::<u8>()?, [0, 2, 4]);
+    let step_three = Tensor::arange_step(0u8, 5u8, 3, device)?;
+    assert_eq!(step_three.to_vec1::<u8>()?, [0, 3]);
+    // Counting down: the end bound 0 is not part of the expected output.
+    let descending = Tensor::arange_step(5i64, 0i64, -1, device)?;
+    assert_eq!(descending.to_vec1::<i64>()?, [5, 4, 3, 2, 1]);
+    Ok(())
+}
+/// Starts from `[3, 1, 4]`, adds the tensor to itself, then multiplies that result by itself,
+/// checking the contents after each step.
+fn add_mul(device: &Device) -> Result<()> {
+    let base = Tensor::new(&[3f32, 1., 4.], device)?;
+    assert_eq!(base.dims1()?, 3);
+    assert_eq!(base.to_vec1::<f32>()?, [3., 1., 4.]);
+    let doubled = Tensor::add(&base, &base)?;
+    assert_eq!(doubled.to_vec1::<f32>()?, [6., 2., 8.]);
+    let squared = Tensor::mul(&doubled, &doubled)?;
+    assert_eq!(squared.to_vec1::<f32>()?, [36., 4., 64.]);
+    Ok(())
+}
+/// A tensor built from a 2x5 nested array must report shape `(2, 5)` and return the same data.
+fn tensor_2d(device: &Device) -> Result<()> {
+    let data = &[[3f32, 1., 4., 1., 5.], [2., 1., 7., 8., 2.]];
+    let tensor = Tensor::new(data, device)?;
+    assert_eq!(tensor.dims2()?, (2, 5));
+    assert_eq!(tensor.to_vec2::<f32>()?, data);
+    Ok(())
+}
+/// `clamp(1.5, 6.2)` must raise smaller entries to 1.5, lower larger entries to 6.2 and leave
+/// the rest untouched.
+fn clamp(device: &Device) -> Result<()> {
+    let data = &[[3f32, 1., 4., 1., 5.], [2., 1., 7., 8., 2.]];
+    let clamped = Tensor::new(data, device)?.clamp(1.5, 6.2)?;
+    let expected = [[3.0, 1.5, 4.0, 1.5, 5.0], [2.0, 1.5, 6.2, 6.2, 2.0]];
+    assert_eq!(clamped.to_vec2::<f32>()?, expected);
+    Ok(())
+}
+fn unary_op(device: &Device) -> Result<()> { // Checks gelu, gelu_erf, erf, silu, ceil, floor, round, round_to and sign against expected values.
+    let data = &[[-3f32, 1., 4., -0.1, 0.5], [2.7, -1.8, -0.28, 1.8, 2.8]];
+    let tensor = Tensor::new(data, device)?;
+    assert_eq!(
+        test_utils::to_vec2_round(&tensor.gelu()?, 4)?,
+        [
+            [-0.0036, 0.8412, 3.9999, -0.046, 0.3457],
+            [2.6911, -0.0647, -0.1091, 1.7353, 2.7933]
+        ]
+    );
+    let t_f16 = tensor.to_dtype(DType::F16)?.gelu()?.to_dtype(DType::F32)?; // gelu evaluated in F16, converted back to F32
+    let max_diff = (tensor.gelu()? - t_f16)?.flatten_all()?.max(0)?; // largest (F32 result - F16 result) entry
+    assert!(max_diff.to_vec0::<f32>()? < 5e-3);
+    assert_eq!(
+        test_utils::to_vec2_round(&tensor.gelu_erf()?, 4)?,
+        [
+            [-0.004, 0.8413, 3.9999, -0.046, 0.3457],
+            [2.6906, -0.0647, -0.1091, 1.7353, 2.7928]
+        ]
+    );
+    assert_eq!(
+        test_utils::to_vec2_round(&tensor.erf()?, 4)?,
+        [
+            [-1.0, 0.8427, 1.0, -0.1125, 0.5205],
+            [0.9999, -0.9891, -0.3079, 0.9891, 0.9999]
+        ]
+    );
+    assert_eq!(
+        test_utils::to_vec2_round(&tensor.silu()?, 4)?,
+        [
+            [-0.1423, 0.7311, 3.9281, -0.0475, 0.3112],
+            [2.53, -0.2553, -0.1205, 1.5447, 2.6395]
+        ]
+    );
+    assert_eq!(
+        test_utils::to_vec2_round(&tensor.ceil()?, 4)?,
+        [[-3.0, 1.0, 4.0, -0.0, 1.0], [3.0, -1.0, -0.0, 2.0, 3.0]]
+    );
+    assert_eq!(
+        test_utils::to_vec2_round(&tensor.floor()?, 4)?,
+        [[-3.0, 1.0, 4.0, -1.0, 0.0], [2.0, -2.0, -1.0, 1.0, 2.0]]
+    );
+    assert_eq!(
+        test_utils::to_vec2_round(&tensor.round()?, 4)?,
+        [[-3.0, 1.0, 4.0, -0.0, 1.0], [3.0, -2.0, -0.0, 2.0, 3.0]]
+    );
+    let tensor = Tensor::new(&[2997.9246, 314.15926f32], device)?; // `tensor` is rebound for the round_to checks
+    assert_eq!(
+        test_utils::to_vec1_round(&tensor.round_to(2)?, 4)?,
+        [2997.92, 314.16]
+    );
+    assert_eq!(
+        test_utils::to_vec1_round(&tensor.round_to(-2)?, 4)?, // negative argument: expected values are multiples of 100
+        [3000.0, 300.]
+    );
+    let tensor = Tensor::new(
+        &[-1.01f32, -0.9, -0.1, 0.0, -0.0, 0.1, 0.9, 1.0, 1.1],
+        device,
+    )?;
+    assert_eq!(
+        tensor.sign()?.to_vec1::<f32>()?, // both 0.0 and -0.0 are expected to map to 0
+        [-1., -1., -1., 0., 0., 1., 1., 1., 1.]
+    );
+    Ok(())
+}
+fn binary_op(device: &Device) -> Result<()> { // Checks +, *, /, -, minimum and maximum on two 2x5 tensors.
+    let data = &[[3f32, 1., 4., 1., 5.], [2., 1., 7., 8., 2.]];
+    let tensor1 = Tensor::new(data, device)?;
+    let data2 = &[[5f32, 5., 5., 5., 5.], [2., 1., 7., 8., 2.]];
+    let tensor2 = Tensor::new(data2, device)?;
+    let tensor = (&tensor1 + (&tensor1 * &tensor1)? / (&tensor1 + &tensor2))?; // `/` binds tighter than `+`: t1 + (t1 * t1) / (t1 + t2)
+    let dims = tensor.dims2()?;
+    assert_eq!(dims, (2, 5));
+    let content: Vec<Vec<f32>> = tensor.to_vec2()?;
+    assert_eq!(content[0], [4.125, 1.1666666, 5.7777777, 1.1666666, 7.5]);
+    assert_eq!(content[1], [3.0, 1.5, 10.5, 12.0, 3.0]);
+    #[allow(clippy::eq_op)] // both operands are the same tensor on purpose
+    let tensor = (&tensor - &tensor)?;
+    let content: Vec<Vec<f32>> = tensor.to_vec2()?;
+    assert_eq!(content[0], [0., 0., 0., 0., 0.]); // only the first row is checked
+    let min = tensor1.minimum(&(&tensor2 * 0.5)?)?; // compared against tensor2 scaled by 0.5
+    let max = tensor1.maximum(&(&tensor2 * 0.5)?)?;
+    assert_eq!(
+        min.to_vec2::<f32>()?,
+        [[2.5, 1.0, 2.5, 1.0, 2.5], [1.0, 0.5, 3.5, 4.0, 1.0]],
+    );
+    assert_eq!(
+        max.to_vec2::<f32>()?,
+        [[3.0, 2.5, 4.0, 2.5, 5.0], [2.0, 1.0, 7.0, 8.0, 2.0]]
+    );
+    Ok(())
+}
+/// Transposes a 2x5 tensor and checks the shape, the contents, and that transposing back
+/// (directly, after `contiguous`, and around an add/subtract of 1) restores the input.
+fn transpose(device: &Device) -> Result<()> {
+    let data = &[[3f32, 1., 4., 1., 5.], [2., 1., 7., 8., 2.]];
+    let transposed = Tensor::new(data, device)?.t()?;
+    assert_eq!(transposed.dims2()?, (5, 2));
+    let columns = transposed.to_vec2::<f32>()?;
+    assert_eq!(columns, &[[3f32, 2.], [1., 1.], [4., 7.], [1., 8.], [5., 2.]]);
+    let restored = transposed.t()?;
+    assert_eq!(restored.to_vec2::<f32>()?, data);
+    let restored_contiguous = transposed.contiguous()?.t()?;
+    assert_eq!(restored_contiguous.to_vec2::<f32>()?, data);
+    // Consumes `transposed`: shift by one, transpose back, then undo the shift.
+    let shifted = (transposed + 1.)?;
+    assert_eq!((shifted.t()? - 1.)?.to_vec2::<f32>()?, data);
+    Ok(())
+}
+/// Checks `var_keepdim(1)` on a 4x4 tensor against values rounded to four decimals.
+fn var(device: &Device) -> Result<()> {
+    // Values taken from https://pytorch.org/docs/stable/generated/torch.var.html
+    let samples = &[
+        [0.2035f32, 1.2959, 1.8101, -0.4644],
+        [1.5027, -0.3270, 0.5905, 0.6538],
+        [-1.5745, 1.3330, -0.5596, -0.6548],
+        [0.1264, -0.5080, 1.6420, 0.1992],
+    ];
+    let variance = Tensor::new(samples, device)?.var_keepdim(1)?;
+    let rounded = test_utils::to_vec2_round(&variance, 4)?;
+    assert_eq!(rounded, &[[1.0631], [0.559], [1.4893], [0.8258]]);
+    Ok(())
+}
+fn sum(device: &Device) -> Result<()> { // Checks sum_keepdim over single and multiple dimensions, on tensors before and after transpose/contiguous round trips.
+    let data = &[[[3u32, 1, 4], [1, 5, 9]], [[2, 1, 7], [8, 2, 8]]];
+    let tensor = Tensor::new(data, device)?;
+    assert_eq!(
+        tensor.sum_keepdim(2)?.to_vec3::<u32>()?,
+        &[[[8], [15]], [[10], [18]]]
+    );
+    assert_eq!(
+        tensor.sum_keepdim(0)?.to_vec3::<u32>()?,
+        &[[[5, 2, 11], [9, 7, 17]]],
+    );
+    assert_eq!(tensor.sum_keepdim((0, 2, 1))?.to_vec3::<u32>()?, &[[[51]]],); // dimensions passed out of order
+    assert_eq!(
+        tensor.t()?.sum_keepdim(1)?.t()?.to_vec3::<u32>()?,
+        &[[[8], [15]], [[10], [18]]]
+    );
+    assert_eq!(
+        tensor.sum_keepdim((2, 1))?.to_vec3::<u32>()?,
+        &[[[8 + 15]], [[10 + 18]]]
+    );
+    let data: Vec<u32> = (0..4000u32).collect(); // 0 + 1 + ... + 3999 = 7998000
+    let tensor = Tensor::new(data.as_slice(), device)?;
+    assert_eq!(tensor.sum_keepdim(0)?.to_vec1::<u32>()?, &[7998000]);
+    let tensor = tensor.reshape((2000, 2))?;
+    assert_eq!(tensor.sum_keepdim((0, 1))?.to_vec2::<u32>()?, &[[7998000]]);
+    assert_eq!(
+        tensor.sum_keepdim(0)?.sum_keepdim(1)?.to_vec2::<u32>()?,
+        &[[7998000]]
+    );
+    assert_eq!(
+        tensor.sum_keepdim(1)?.sum_keepdim(0)?.to_vec2::<u32>()?,
+        &[[7998000]]
+    );
+    assert_eq!(
+        tensor.sum_keepdim(0)?.to_vec2::<u32>()?,
+        &[[3998000, 4000000]]
+    );
+    // Make the tensor non contiguous (t -> contiguous -> t) and repeat the same checks.
+    let tensor = tensor.t()?.contiguous()?.t()?;
+    assert_eq!(tensor.sum_keepdim((0, 1))?.to_vec2::<u32>()?, &[[7998000]]);
+    assert_eq!(
+        tensor.sum_keepdim(0)?.sum_keepdim(1)?.to_vec2::<u32>()?,
+        &[[7998000]]
+    );
+    assert_eq!(
+        tensor.sum_keepdim(1)?.sum_keepdim(0)?.to_vec2::<u32>()?,
+        &[[7998000]]
+    );
+    assert_eq!(
+        tensor.sum_keepdim(0)?.to_vec2::<u32>()?,
+        &[[3998000, 4000000]]
+    );
+    let t1 = tensor.reshape((200, 5, 4))?;
+    let t2 = t1.transpose(0, 2)?.contiguous()?.transpose(0, 2)?; // same shape as t1, obtained through a transpose round trip
+    for tensor in [t1, t2] {
+        assert_eq!(
+            tensor.sum_keepdim((0, 1, 2))?.to_vec3::<u32>()?,
+            &[[[7998000]]]
+        );
+        assert_eq!(
+            tensor
+                .sum_keepdim(0)?
+                .sum_keepdim(2)?
+                .sum_keepdim(1)?
+                .to_vec3::<u32>()?,
+            &[[[7998000]]]
+        );
+        assert_eq!(
+            tensor
+                .sum_keepdim(0)?
+                .sum_keepdim((1, 2))?
+                .to_vec3::<u32>()?,
+            &[[[7998000]]]
+        );
+        assert_eq!(
+            tensor
+                .sum_keepdim(1)?
+                .sum_keepdim((0, 2))?
+                .to_vec3::<u32>()?,
+            &[[[7998000]]]
+        );
+        assert_eq!(
+            tensor.sum_keepdim(0)?.to_vec3::<u32>()?,
+            &[[
+                [398000, 398200, 398400, 398600],
+                [398800, 399000, 399200, 399400],
+                [399600, 399800, 400000, 400200],
+                [400400, 400600, 400800, 401000],
+                [401200, 401400, 401600, 401800]
+            ]]
+        );
+    }
+    Ok(())
+}
+fn min(device: &Device) -> Result<()> { // Checks min_keepdim on a small 3D tensor and on the range 200..4000 in several layouts.
+    let data = &[[[3u32, 1, 4], [1, 5, 9]], [[2, 1, 7], [8, 2, 8]]];
+    let tensor = Tensor::new(data, device)?;
+    assert_eq!(
+        tensor.min_keepdim(2)?.to_vec3::<u32>()?,
+        &[[[1], [1]], [[1], [2]]]
+    );
+    assert_eq!(
+        tensor.min_keepdim(0)?.to_vec3::<u32>()?,
+        &[[[2, 1, 4], [1, 2, 8]]],
+    );
+    let data: Vec<u32> = (200..4000u32).collect(); // 3800 increasing values, smallest is 200
+    let tensor = Tensor::new(data.as_slice(), device)?;
+    assert_eq!(tensor.min_keepdim(0)?.to_vec1::<u32>()?, &[200]);
+    let tensor = tensor.reshape((1900, 2))?;
+    assert_eq!(
+        tensor.min_keepdim(0)?.min_keepdim(1)?.to_vec2::<u32>()?,
+        &[[200]]
+    );
+    assert_eq!(
+        tensor.min_keepdim(1)?.min_keepdim(0)?.to_vec2::<u32>()?,
+        &[[200]]
+    );
+    assert_eq!(tensor.min_keepdim(0)?.to_vec2::<u32>()?, &[[200, 201]]);
+    // Make the tensor non contiguous (t -> contiguous -> t) and repeat the same checks.
+    let tensor = tensor.t()?.contiguous()?.t()?;
+    assert_eq!(
+        tensor.min_keepdim(0)?.min_keepdim(1)?.to_vec2::<u32>()?,
+        &[[200]]
+    );
+    assert_eq!(
+        tensor.min_keepdim(1)?.min_keepdim(0)?.to_vec2::<u32>()?,
+        &[[200]]
+    );
+    assert_eq!(tensor.min_keepdim(0)?.to_vec2::<u32>()?, &[[200, 201]]);
+    let t1 = tensor.reshape((190, 5, 4))?;
+    let t2 = t1.transpose(0, 2)?.contiguous()?.transpose(0, 2)?; // same shape as t1, obtained through a transpose round trip
+    for tensor in [t1, t2] {
+        assert_eq!(
+            tensor
+                .min_keepdim(0)?
+                .min_keepdim(2)?
+                .min_keepdim(1)?
+                .to_vec3::<u32>()?,
+            &[[[200]]]
+        );
+        assert_eq!(
+            tensor.min_keepdim(0)?.to_vec3::<u32>()?, // expected values are the first 20 entries of the range
+            &[[
+                [200, 201, 202, 203],
+                [204, 205, 206, 207],
+                [208, 209, 210, 211],
+                [212, 213, 214, 215],
+                [216, 217, 218, 219]
+            ]]
+        );
+    }
+    Ok(())
+}
+fn max(device: &Device) -> Result<()> { // Checks max_keepdim on a small 3D tensor and on the range 200..4000 in several layouts.
+    let data = &[[[3u32, 1, 4], [1, 5, 9]], [[2, 1, 7], [8, 2, 8]]];
+    let tensor = Tensor::new(data, device)?;
+    assert_eq!(
+        tensor.max_keepdim(2)?.to_vec3::<u32>()?,
+        &[[[4], [9]], [[7], [8]]]
+    );
+    assert_eq!(
+        tensor.max_keepdim(0)?.to_vec3::<u32>()?,
+        &[[[3, 1, 7], [8, 5, 9]]],
+    );
+    let data: Vec<u32> = (200..4000u32).collect(); // 3800 increasing values, largest is 3999
+    let tensor = Tensor::new(data.as_slice(), device)?;
+    assert_eq!(tensor.max_keepdim(0)?.to_vec1::<u32>()?, &[3999]);
+    let tensor = tensor.reshape((1900, 2))?;
+    assert_eq!(
+        tensor.max_keepdim(0)?.max_keepdim(1)?.to_vec2::<u32>()?,
+        &[[3999]]
+    );
+    assert_eq!(
+        tensor.max_keepdim(1)?.max_keepdim(0)?.to_vec2::<u32>()?,
+        &[[3999]]
+    );
+    assert_eq!(tensor.max_keepdim(0)?.to_vec2::<u32>()?, &[[3998, 3999]]);
+    // Make the tensor non contiguous (t -> contiguous -> t) and repeat the same checks.
+    let tensor = tensor.t()?.contiguous()?.t()?;
+    assert_eq!(
+        tensor.max_keepdim(0)?.max_keepdim(1)?.to_vec2::<u32>()?,
+        &[[3999]]
+    );
+    assert_eq!(
+        tensor.max_keepdim(1)?.max_keepdim(0)?.to_vec2::<u32>()?,
+        &[[3999]]
+    );
+    assert_eq!(tensor.max_keepdim(0)?.to_vec2::<u32>()?, &[[3998, 3999]]);
+    let t1 = tensor.reshape((190, 5, 4))?;
+    let t2 = t1.transpose(0, 2)?.contiguous()?.transpose(0, 2)?; // same shape as t1, obtained through a transpose round trip
+    for tensor in [t1, t2] {
+        assert_eq!(
+            tensor
+                .max_keepdim(0)?
+                .max_keepdim(2)?
+                .max_keepdim(1)?
+                .to_vec3::<u32>()?,
+            &[[[3999]]]
+        );
+        assert_eq!(
+            tensor.max_keepdim(0)?.to_vec3::<u32>()?, // expected values are the last 20 entries of the range
+            &[[
+                [3980, 3981, 3982, 3983],
+                [3984, 3985, 3986, 3987],
+                [3988, 3989, 3990, 3991],
+                [3992, 3993, 3994, 3995],
+                [3996, 3997, 3998, 3999]
+            ]]
+        );
+    }
+    Ok(())
+}
+fn argmin(device: &Device) -> Result<()> { // Checks argmin_keepdim (results read back as u32 indices) on a small 3D tensor and on the range 200..4000.
+    let data = &[[[3u32, 1, 4], [1, 5, 9]], [[2, 1, 7], [8, 2, 8]]];
+    let tensor = Tensor::new(data, device)?;
+    assert_eq!(
+        tensor.argmin_keepdim(2)?.to_vec3::<u32>()?,
+        &[[[1], [0]], [[1], [1]]]
+    );
+    assert_eq!(
+        tensor.argmin_keepdim(0)?.to_vec3::<u32>()?,
+        &[[[1, 0, 0], [0, 1, 1]]],
+    );
+    let data: Vec<u32> = (200..4000u32).collect(); // increasing values, so the smallest sits at index 0
+    let tensor = Tensor::new(data.as_slice(), device)?;
+    assert_eq!(tensor.argmin_keepdim(0)?.to_vec1::<u32>()?, &[0]);
+    let tensor = tensor.reshape((1900, 2))?;
+    assert_eq!(
+        tensor
+            .argmin_keepdim(0)? // the second argmin runs on the indices produced by the first
+            .argmin_keepdim(1)?
+            .to_vec2::<u32>()?,
+        &[[0]]
+    );
+    assert_eq!(
+        tensor
+            .argmin_keepdim(1)?
+            .argmin_keepdim(0)?
+            .to_vec2::<u32>()?,
+        &[[0]]
+    );
+    assert_eq!(tensor.argmin_keepdim(0)?.to_vec2::<u32>()?, &[[0, 0]]);
+    // Make the tensor non contiguous (t -> contiguous -> t) and repeat the same checks.
+    let tensor = tensor.t()?.contiguous()?.t()?;
+    assert_eq!(
+        tensor
+            .argmin_keepdim(0)?
+            .argmin_keepdim(1)?
+            .to_vec2::<u32>()?,
+        &[[0]]
+    );
+    assert_eq!(
+        tensor
+            .argmin_keepdim(1)?
+            .argmin_keepdim(0)?
+            .to_vec2::<u32>()?,
+        &[[0]]
+    );
+    assert_eq!(tensor.argmin_keepdim(0)?.to_vec2::<u32>()?, &[[0, 0]]);
+    let t1 = tensor.reshape((190, 5, 4))?;
+    let t2 = t1.transpose(0, 2)?.contiguous()?.transpose(0, 2)?; // same shape as t1, obtained through a transpose round trip
+    for tensor in [t1, t2] {
+        assert_eq!(
+            tensor
+                .argmin_keepdim(0)?
+                .argmin_keepdim(2)?
+                .argmin_keepdim(1)?
+                .to_vec3::<u32>()?,
+            &[[[0]]]
+        );
+        assert_eq!(
+            tensor.argmin_keepdim(0)?.to_vec3::<u32>()?,
+            &[[
+                [0, 0, 0, 0],
+                [0, 0, 0, 0],
+                [0, 0, 0, 0],
+                [0, 0, 0, 0],
+                [0, 0, 0, 0],
+            ]]
+        );
+    }
+    Ok(())
+}
+fn argmax(device: &Device) -> Result<()> { // Checks argmax_keepdim (results read back as u32 indices) on a small 3D tensor and on the range 200..4000.
+    let data = &[[[3u32, 1, 4], [1, 5, 9]], [[2, 1, 7], [8, 2, 8]]];
+    let tensor = Tensor::new(data, device)?;
+    assert_eq!(
+        tensor.argmax_keepdim(2)?.to_vec3::<u32>()?,
+        &[[[2], [2]], [[2], [0]]]
+    );
+    assert_eq!(
+        tensor.argmax_keepdim(0)?.to_vec3::<u32>()?,
+        &[[[0, 0, 1], [1, 0, 0]]],
+    );
+    let data: Vec<u32> = (200..4000u32).collect(); // 3800 increasing values, so the largest sits at index 3799
+    let tensor = Tensor::new(data.as_slice(), device)?;
+    assert_eq!(tensor.argmax_keepdim(0)?.to_vec1::<u32>()?, &[3799]);
+    let tensor = tensor.reshape((1900, 2))?;
+    assert_eq!(
+        tensor
+            .argmax_keepdim(0)? // yields equal indices (see the [[1899, 1899]] check below); the second argmax is expected to return 0
+            .argmax_keepdim(1)?
+            .to_vec2::<u32>()?,
+        &[[0]]
+    );
+    assert_eq!(
+        tensor
+            .argmax_keepdim(1)?
+            .argmax_keepdim(0)?
+            .to_vec2::<u32>()?,
+        &[[0]]
+    );
+    assert_eq!(tensor.argmax_keepdim(0)?.to_vec2::<u32>()?, &[[1899, 1899]]);
+    // Make the tensor non contiguous (t -> contiguous -> t) and repeat the same checks.
+    let tensor = tensor.t()?.contiguous()?.t()?;
+    assert_eq!(
+        tensor
+            .argmax_keepdim(0)?
+            .argmax_keepdim(1)?
+            .to_vec2::<u32>()?,
+        &[[0]]
+    );
+    assert_eq!(
+        tensor
+            .argmax_keepdim(1)?
+            .argmax_keepdim(0)?
+            .to_vec2::<u32>()?,
+        &[[0]]
+    );
+    assert_eq!(tensor.argmax_keepdim(0)?.to_vec2::<u32>()?, &[[1899, 1899]]);
+    let t1 = tensor.reshape((190, 5, 4))?;
+    let t2 = t1.transpose(0, 2)?.contiguous()?.transpose(0, 2)?; // same shape as t1, obtained through a transpose round trip
+    for tensor in [t1, t2] {
+        assert_eq!(
+            tensor
+                .argmax_keepdim(0)?
+                .argmax_keepdim(2)?
+                .argmax_keepdim(1)?
+                .to_vec3::<u32>()?,
+            &[[[0]]]
+        );
+        assert_eq!(
+            tensor.argmax_keepdim(0)?.to_vec3::<u32>()?, // the first dimension has 190 entries, so the last index is 189
+            &[[
+                [189, 189, 189, 189],
+                [189, 189, 189, 189],
+                [189, 189, 189, 189],
+                [189, 189, 189, 189],
+                [189, 189, 189, 189],
+            ]]
+        );
+    }
+    Ok(())
+}
+/// Checks `narrow(dim, start, len)` along each dimension of a 2x2x3 tensor, plus one case on a
+/// transposed view.
+fn narrow(device: &Device) -> Result<()> {
+    let data = &[[[3f32, 1., 4.], [1., 5., 9.]], [[2., 1., 7.], [8., 2., 8.]]];
+    let tensor = Tensor::new(data, device)?;
+    // Two entries starting at offset 1 along the last dimension.
+    let inner = tensor.narrow(2, 1, 2)?.to_vec3::<f32>()?;
+    assert_eq!(inner, &[[[1.0, 4.0], [5.0, 9.0]], [[1.0, 7.0], [2.0, 8.0]]]);
+    // One entry at offset 1 along the middle dimension.
+    let middle = tensor.narrow(1, 1, 1)?.to_vec3::<f32>()?;
+    assert_eq!(middle, &[[[1.0, 5.0, 9.0]], [[8.0, 2.0, 8.0]]]);
+    // Each half of the outermost dimension.
+    let first = tensor.narrow(0, 0, 1)?.to_vec3::<f32>()?;
+    assert_eq!(first, &[[[3.0, 1.0, 4.0], [1.0, 5.0, 9.0]]]);
+    let second = tensor.narrow(0, 1, 1)?.to_vec3::<f32>()?;
+    assert_eq!(second, &[[[2.0, 1.0, 7.0], [8.0, 2.0, 8.0]]]);
+    // The following has been checked against PyTorch via:
+    //   import torch
+    //   t = torch.tensor([[[3., 1., 4.], [1., 5., 9.]], [[2., 1., 7.], [8., 2., 8.]]])
+    //   t.transpose(-1, -2).narrow(1, 1, 2)
+    let transposed = tensor.t()?.narrow(1, 1, 2)?.to_vec3::<f32>()?;
+    assert_eq!(
+        transposed,
+        &[[[1.0, 5.0], [4.0, 9.0]], [[1.0, 2.0], [7.0, 8.0]]]
+    );
+    Ok(())
+}
+/// `broadcast_left((3, 1))` on a 3-element vector must yield three copies of it, each wrapped
+/// in a single-row matrix.
+fn broadcast(device: &Device) -> Result<()> {
+    let row = [3.0f32, 1.0, 4.0];
+    let expanded = Tensor::new(&row, device)?.broadcast_left((3, 1))?;
+    assert_eq!(expanded.to_vec3::<f32>()?, &[[row], [row], [row]]);
+    Ok(())
+}
+fn cat(device: &Device) -> Result<()> { // Checks Tensor::cat on 1D, 2D and 3D inputs, including transposed inputs.
+    // 1D
+    let t1 = Tensor::new(&[3f32, 1., 4.], device)?;
+    let t2 = Tensor::new(&[1f32, 5., 9., 2.], device)?;
+    let t3 = Tensor::new(&[6f32, 5., 3., 5., 8., 9.], device)?;
+    assert_eq!(Tensor::cat(&[&t1], 0)?.to_vec1::<f32>()?, [3f32, 1., 4.],); // a single input comes back unchanged
+    assert_eq!(
+        Tensor::cat(&[&t1, &t2], 0)?.to_vec1::<f32>()?,
+        [3f32, 1., 4., 1., 5., 9., 2.],
+    );
+    assert_eq!(
+        Tensor::cat(&[&t1, &t2, &t3], 0)?.to_vec1::<f32>()?,
+        [3f32, 1., 4., 1., 5., 9., 2., 6., 5., 3., 5., 8., 9.],
+    );
+    // 2D
+    let data = &[[3f32, 1., 4., 1., 5.], [2., 7., 1., 8., 2.]];
+    let t1 = Tensor::new(data, device)?;
+    let data2 = &[[5f32, 5., 5., 5., 5.], [2., 7., 1., 8., 2.]];
+    let t2 = Tensor::new(data2, device)?;
+    assert_eq!(
+        Tensor::cat(&[&t1, &t2], 0)?.to_vec2::<f32>()?,
+        [
+            [3.0, 1.0, 4.0, 1.0, 5.0],
+            [2.0, 7.0, 1.0, 8.0, 2.0],
+            [5.0, 5.0, 5.0, 5.0, 5.0],
+            [2.0, 7.0, 1.0, 8.0, 2.0]
+        ]
+    );
+    // PyTorch equivalent:
+    //     import torch
+    //     t1 = torch.tensor([[3, 1, 4, 1, 5], [2, 7, 1, 8, 2]])
+    //     t2 = torch.tensor([[5]*5, [2, 7, 1, 8, 2]])
+    //     torch.cat([t1.t(), t2.t()], dim=1).t()
+    assert_eq!(
+        Tensor::cat(&[&t1.t()?, &t2.t()?], 1)?
+            .t()?
+            .to_vec2::<f32>()?,
+        [
+            [3.0, 1.0, 4.0, 1.0, 5.0],
+            [2.0, 7.0, 1.0, 8.0, 2.0],
+            [5.0, 5.0, 5.0, 5.0, 5.0],
+            [2.0, 7.0, 1.0, 8.0, 2.0]
+        ]
+    );
+    assert_eq!(
+        Tensor::cat(&[&t1, &t2], 1)?.to_vec2::<f32>()?,
+        [
+            [3.0, 1.0, 4.0, 1.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0],
+            [2.0, 7.0, 1.0, 8.0, 2.0, 2.0, 7.0, 1.0, 8.0, 2.0]
+        ]
+    );
+    // 3D
+    let t1 = Tensor::arange(0, 48i64, device)?.reshape((2, 6, 4))?;
+    let t2 = Tensor::arange(100, 124i64, device)?.reshape((2, 3, 4))?;
+    let t3 = Tensor::arange(10000, 10032i64, device)?.reshape((2, 4, 4))?;
+    let t_cat = Tensor::cat(&[&t1, &t2, &t3], 1)?;
+    let t1 = t1.t()?.contiguous()?.t()?; // rebuild each input through a t -> contiguous -> t round trip
+    let t2 = t2.t()?.contiguous()?.t()?;
+    let t3 = t3.t()?.contiguous()?.t()?;
+    let t_cat2 = Tensor::cat(&[&t1, &t2, &t3], 1)?;
+    let diff = t_cat.eq(&t_cat2)?.to_dtype(DType::F32)?.sum_all()?; // sum of the `eq` result: the number of matching positions
+    assert_eq!(diff.to_vec0::<f32>()?, 104.0); // 2 * (6 + 3 + 4) * 4 = 104, i.e. every position matches
+    assert_eq!(t_cat.i((0, 0, 0))?.to_vec0::<i64>()?, 0);
+    assert_eq!(t_cat.i((0, 4, 0))?.to_vec0::<i64>()?, 16);
+    assert_eq!(t_cat.i((0, 5, 0))?.to_vec0::<i64>()?, 20);
+    assert_eq!(t_cat.i((1, 5, 0))?.to_vec0::<i64>()?, 44);
+    assert_eq!(t_cat.i((0, 6, 0))?.to_vec0::<i64>()?, 100); // index 6 along dim 1 is the first entry taken from t2
+    assert_eq!(t_cat.i((1, 6, 0))?.to_vec0::<i64>()?, 112);
+    assert_eq!(t_cat.i((0, 6, 1))?.to_vec0::<i64>()?, 101);
+    assert_eq!(t_cat.i((0, 7, 1))?.to_vec0::<i64>()?, 105);
+    assert_eq!(t_cat.i((0, 12, 1))?.to_vec0::<i64>()?, 10013); // index 12 along dim 1 is the last entry taken from t3
+    assert_eq!(t_cat.i((1, 12, 3))?.to_vec0::<i64>()?, 10031);
+    Ok(())
+}
+/// Checks that `embedding` and `index_select` along dim 0 return the same rows of a
+/// small table, for both `u32` and `i64` index tensors.
+fn embeddings(device: &Device) -> Result<()> {
+    let table = Tensor::new(&[[0f32, 1f32], [2f32, 3f32], [4f32, 5f32]], device)?;
+    let indices = Tensor::new(&[0u32, 2u32, 1u32], device)?;
+    // Rows 0, 2 and 1 of the table, in that order.
+    let expected = [[0.0f32, 1.0], [4.0, 5.0], [2.0, 3.0]];
+    let via_embedding = table.embedding(&indices)?;
+    assert_eq!(via_embedding.to_vec2::<f32>()?, &expected);
+    let via_select = table.index_select(&indices, 0)?;
+    assert_eq!(via_select.to_vec2::<f32>()?, &expected);
+    let indices_i64 = indices.to_dtype(DType::I64)?;
+    let via_select_i64 = table.index_select(&indices_i64, 0)?;
+    assert_eq!(via_select_i64.to_vec2::<f32>()?, &expected);
+    Ok(())
+}
+/// Exercises the element-wise comparison ops; each result is read back as a `u8` mask
+/// holding 1 where the predicate is true and 0 elsewhere.
+fn cmp(device: &Device) -> Result<()> {
+    let lhs = Tensor::new(&[[0f32, 1f32], [2f32, 3f32], [4f32, 5f32]], device)?;
+    let rhs = Tensor::new(&[[1f32, 0f32], [3f32, 3f32], [4f32, 7f32]], device)?;
+    let equal = lhs.eq(&rhs)?.to_vec2::<u8>()?;
+    assert_eq!(equal, &[[0, 0], [0, 1], [1, 0]]);
+    let not_equal = lhs.ne(&rhs)?.to_vec2::<u8>()?;
+    assert_eq!(not_equal, &[[1, 1], [1, 0], [0, 1]]);
+    let less_or_equal = lhs.le(&rhs)?.to_vec2::<u8>()?;
+    assert_eq!(less_or_equal, &[[1, 0], [1, 1], [1, 1]]);
+    let less = lhs.lt(&rhs)?.to_vec2::<u8>()?;
+    assert_eq!(less, &[[1, 0], [1, 0], [0, 1]]);
+    let greater = lhs.gt(&rhs)?.to_vec2::<u8>()?;
+    assert_eq!(greater, &[[0, 1], [0, 0], [0, 0]]);
+    let greater_or_equal = lhs.ge(&rhs)?.to_vec2::<u8>()?;
+    assert_eq!(greater_or_equal, &[[0, 1], [0, 1], [1, 0]]);
+    Ok(())
+}
+/// Checks `index_select` along both dimensions with `u8`, `u32` and `i64` index tensors.
+///
+/// Every index tensor built inside the loop is converted to the dtype under test, so
+/// the regression checks below run once per index dtype rather than always with `u32`.
+fn index_select(device: &Device) -> Result<()> {
+    let ids = Tensor::new(&[0u32, 2u32, 1u32], device)?;
+    let t = Tensor::arange(0f32, 12f32, device)?.reshape((4, 3))?;
+    assert_eq!(
+        t.to_vec2::<f32>()?,
+        &[
+            [0.0, 1.0, 2.0],
+            [3.0, 4.0, 5.0],
+            [6.0, 7.0, 8.0],
+            [9.0, 10.0, 11.0]
+        ]
+    );
+    for dtype in [DType::U8, DType::U32, DType::I64] {
+        let ids = ids.to_dtype(dtype)?;
+        // Permute the columns.
+        let hs = t.index_select(&ids, 1)?;
+        assert_eq!(
+            hs.to_vec2::<f32>()?,
+            &[
+                [0.0, 2.0, 1.0],
+                [3.0, 5.0, 4.0],
+                [6.0, 8.0, 7.0],
+                [9.0, 11.0, 10.0]
+            ]
+        );
+        // Pick rows 0, 2 and 1.
+        let hs = t.index_select(&ids, 0)?;
+        assert_eq!(
+            hs.to_vec2::<f32>()?,
+            &[[0.0, 1.0, 2.0], [6.0, 7.0, 8.0], [3.0, 4.0, 5.0]]
+        );
+        // Prior to https://github.com/huggingface/candle/pull/1022
+        // There would be a bug where the last values in the result tensor would be set to 0.
+        let ids =
+            Tensor::new(&[0u32, 2u32, 1u32, 0u32, 2u32, 1u32], device)?.to_dtype(dtype)?;
+        let hs = t.index_select(&ids, 0)?;
+        assert_eq!(
+            hs.to_vec2::<f32>()?,
+            &[
+                [0.0, 1.0, 2.0],
+                [6.0, 7.0, 8.0],
+                [3.0, 4.0, 5.0],
+                [0.0, 1.0, 2.0],
+                [6.0, 7.0, 8.0],
+                [3.0, 4.0, 5.0],
+            ]
+        );
+        // Test when selecting dim > 0 with ids size different from elem count of
+        // target dim in source/input.
+        let ids = Tensor::new(&[1u32, 0u32, 1u32], device)?.to_dtype(dtype)?;
+        // Shadows the outer `t` for the rest of this iteration only.
+        let t = Tensor::arange(1f32, 5f32, device)?.reshape((2, 2))?;
+        assert_eq!(t.to_vec2::<f32>()?, &[[1.0, 2.0], [3.0, 4.0]]);
+        let hs = t.index_select(&ids, 1)?;
+        assert_eq!(hs.to_vec2::<f32>()?, &[[2.0, 1.0, 2.0], [4.0, 3.0, 4.0]]);
+    }
+    Ok(())
+}
+/// Checks `index_add` along dim 1 (onto ones, then onto zeros) and along dim 0,
+/// including repeated indices that accumulate into the same output slot.
+fn index_add(device: &Device) -> Result<()> {
+    let src = Tensor::arange(0f32, 12f32, device)?.reshape((4, 3))?;
+    assert_eq!(
+        src.to_vec2::<f32>()?,
+        &[
+            [0.0, 1.0, 2.0],
+            [3.0, 4.0, 5.0],
+            [6.0, 7.0, 8.0],
+            [9.0, 10.0, 11.0]
+        ]
+    );
+    // Source columns 1 and 2 both land in output column 1, on top of the initial ones.
+    let cols = Tensor::new(&[0u32, 1u32, 1u32], device)?;
+    let onto_ones = Tensor::ones((4, 2), DType::F32, device)?.index_add(&cols, &src, 1)?;
+    assert_eq!(
+        onto_ones.to_vec2::<f32>()?,
+        &[[1.0, 4.0], [4.0, 10.0], [7.0, 16.0], [10.0, 22.0]],
+    );
+    // Source columns 1 and 2 now both land in output column 0.
+    let cols = Tensor::new(&[1u32, 0u32, 0u32], device)?;
+    let onto_zeros = Tensor::zeros((4, 2), DType::F32, device)?.index_add(&cols, &src, 1)?;
+    assert_eq!(
+        onto_zeros.to_vec2::<f32>()?,
+        &[[3.0, 0.0], [9.0, 3.0], [15.0, 6.0], [21.0, 9.0]],
+    );
+    // Along dim 0: source rows 1 and 3 are summed into output row 0.
+    let rows = Tensor::new(&[5u32, 0u32, 1u32, 0u32], device)?;
+    let by_rows = Tensor::zeros((6, 3), DType::F32, device)?.index_add(&rows, &src, 0)?;
+    assert_eq!(
+        by_rows.to_vec2::<f32>()?,
+        &[
+            [12.0, 14.0, 16.0],
+            [6.0, 7.0, 8.0],
+            [0.0, 0.0, 0.0],
+            [0.0, 0.0, 0.0],
+            [0.0, 0.0, 0.0],
+            [0.0, 1.0, 2.0]
+        ]
+    );
+    Ok(())
+}
+/// Checks that `slice_scatter0` overwrites a two-row window of a 4x3 tensor, for each
+/// of the three starting rows where the window fits.
+fn slice_scatter(device: &Device) -> Result<()> {
+    let dst = Tensor::arange(0f32, 12f32, device)?.reshape((4, 3))?;
+    assert_eq!(
+        dst.to_vec2::<f32>()?,
+        &[
+            [0.0, 1.0, 2.0],
+            [3.0, 4.0, 5.0],
+            [6.0, 7.0, 8.0],
+            [9.0, 10.0, 11.0]
+        ]
+    );
+    let patch = Tensor::arange(100f32, 106f32, device)?.reshape((2, 3))?;
+    // (starting row, expected result) pairs, checked in order.
+    let cases = [
+        (
+            0,
+            [
+                [100f32, 101.0, 102.0],
+                [103.0, 104.0, 105.0],
+                [6.0, 7.0, 8.0],
+                [9.0, 10.0, 11.0],
+            ],
+        ),
+        (
+            1,
+            [
+                [0.0, 1.0, 2.0],
+                [100.0, 101.0, 102.0],
+                [103.0, 104.0, 105.0],
+                [9.0, 10.0, 11.0],
+            ],
+        ),
+        (
+            2,
+            [
+                [0.0, 1.0, 2.0],
+                [3.0, 4.0, 5.0],
+                [100.0, 101.0, 102.0],
+                [103.0, 104.0, 105.0],
+            ],
+        ),
+    ];
+    for (start, expected) in cases {
+        let scattered = dst.slice_scatter0(&patch, start)?;
+        assert_eq!(scattered.to_vec2::<f32>()?, &expected);
+    }
+    Ok(())
+}
+/// Checks `scatter_add` with a per-element index tensor, first along dim 1 and then
+/// along dim 0, accumulating onto tensors of ones.
+fn scatter_add(device: &Device) -> Result<()> {
+    let src = Tensor::arange(0f32, 12f32, device)?.reshape((4, 3))?;
+    assert_eq!(
+        src.to_vec2::<f32>()?,
+        &[
+            [0.0, 1.0, 2.0],
+            [3.0, 4.0, 5.0],
+            [6.0, 7.0, 8.0],
+            [9.0, 10.0, 11.0]
+        ]
+    );
+    let targets = Tensor::new(&[[0u32, 1, 2], [3, 4, 0], [3, 3, 1], [2, 0, 4]], device)?;
+    // Along dim 1 each index names the destination column within the same row.
+    let wide = Tensor::ones((4, 5), DType::F32, device)?;
+    let along_cols = wide.scatter_add(&targets, &src, 1)?;
+    assert_eq!(
+        along_cols.to_vec2::<f32>()?,
+        &[
+            [1.0, 2.0, 3.0, 1.0, 1.0],
+            [6.0, 1.0, 1.0, 4.0, 5.0],
+            [1.0, 9.0, 1.0, 14.0, 1.0],
+            [11.0, 1.0, 10.0, 1.0, 12.0]
+        ]
+    );
+    // Along dim 0 each index names the destination row within the same column.
+    let tall = Tensor::ones((6, 3), DType::F32, device)?;
+    let along_rows = tall.scatter_add(&targets, &src, 0)?;
+    assert_eq!(
+        along_rows.to_vec2::<f32>()?,
+        &[
+            [1.0, 11.0, 6.0],
+            [1.0, 2.0, 9.0],
+            [10.0, 1.0, 3.0],
+            [10.0, 8.0, 1.0],
+            [1.0, 5.0, 12.0],
+            [1.0, 1.0, 1.0]
+        ]
+    );
+    Ok(())
+}
+/// Checks `gather` along dim 1 (one and two picks per row) and along dim 0 (one and
+/// two picks per column) on a 4x3 tensor.
+fn gather(device: &Device) -> Result<()> {
+    let src = Tensor::arange(0f32, 12f32, device)?.reshape((4, 3))?;
+    assert_eq!(
+        src.to_vec2::<f32>()?,
+        &[
+            [0.0, 1.0, 2.0],
+            [3.0, 4.0, 5.0],
+            [6.0, 7.0, 8.0],
+            [9.0, 10.0, 11.0]
+        ]
+    );
+    // One column index per row.
+    let one_per_row = Tensor::new(&[[0u32], [2u32], [1u32], [0u32]], device)?;
+    let picked = src.gather(&one_per_row, 1)?;
+    assert_eq!(picked.to_vec2::<f32>()?, &[[0.0], [5.0], [7.0], [9.0]]);
+    // Two column indices per row.
+    let two_per_row = Tensor::new(
+        &[[0u32, 0u32], [2u32, 0u32], [1u32, 1u32], [0u32, 2u32]],
+        device,
+    )?;
+    let picked = src.gather(&two_per_row, 1)?;
+    assert_eq!(
+        picked.to_vec2::<f32>()?,
+        &[[0.0, 0.0], [5.0, 3.0], [7.0, 7.0], [9.0, 11.0]]
+    );
+    // One row index per column.
+    let one_per_col = Tensor::new(&[[0u32, 2u32, 0u32]], device)?;
+    let picked = src.gather(&one_per_col, 0)?;
+    assert_eq!(picked.to_vec2::<f32>()?, &[[0.0, 7.0, 2.0]]);
+    // Two row indices per column.
+    let two_per_col = Tensor::new(&[[0u32, 2u32, 0u32], [0u32, 1u32, 1u32]], device)?;
+    let picked = src.gather(&two_per_col, 0)?;
+    assert_eq!(picked.to_vec2::<f32>()?, &[[0.0, 7.0, 2.0], [0.0, 4.0, 5.0]]);
+    Ok(())
+}
+/// Checks the `broadcast_{add,sub,mul,div}` ops on a (4, 2, 3) tensor and on a narrowed
+/// view of it, with the two-element operand applied both as a column against the
+/// original layout and as a row against the transposed layout.
+fn broadcasting(device: &Device) -> Result<()> {
+    let full = Tensor::arange(0f32, 24f32, device)?.reshape((4, 2, 3))?;
+    let bias = Tensor::new(&[100f32, 200f32], device)?;
+    let sum = full.broadcast_add(&bias.reshape((2, 1))?)?;
+    assert_eq!(
+        sum.to_vec3::<f32>()?,
+        &[
+            [[100.0, 101.0, 102.0], [203.0, 204.0, 205.0]],
+            [[106.0, 107.0, 108.0], [209.0, 210.0, 211.0]],
+            [[112.0, 113.0, 114.0], [215.0, 216.0, 217.0]],
+            [[118.0, 119.0, 120.0], [221.0, 222.0, 223.0]]
+        ]
+    );
+    let sum_t = full.t()?.broadcast_add(&bias)?;
+    assert_eq!(
+        sum_t.to_vec3::<f32>()?,
+        &[
+            [[100.0, 203.0], [101.0, 204.0], [102.0, 205.0]],
+            [[106.0, 209.0], [107.0, 210.0], [108.0, 211.0]],
+            [[112.0, 215.0], [113.0, 216.0], [114.0, 217.0]],
+            [[118.0, 221.0], [119.0, 222.0], [120.0, 223.0]]
+        ]
+    );
+    let diff = full.broadcast_sub(&bias.reshape((2, 1))?)?;
+    assert_eq!(
+        diff.to_vec3::<f32>()?,
+        &[
+            [[-100.0, -99.0, -98.0], [-197.0, -196.0, -195.0]],
+            [[-94.0, -93.0, -92.0], [-191.0, -190.0, -189.0]],
+            [[-88.0, -87.0, -86.0], [-185.0, -184.0, -183.0]],
+            [[-82.0, -81.0, -80.0], [-179.0, -178.0, -177.0]]
+        ]
+    );
+    let diff_t = full.t()?.broadcast_sub(&bias)?;
+    assert_eq!(
+        diff_t.to_vec3::<f32>()?,
+        &[
+            [[-100.0, -197.0], [-99.0, -196.0], [-98.0, -195.0]],
+            [[-94.0, -191.0], [-93.0, -190.0], [-92.0, -189.0]],
+            [[-88.0, -185.0], [-87.0, -184.0], [-86.0, -183.0]],
+            [[-82.0, -179.0], [-81.0, -178.0], [-80.0, -177.0]]
+        ]
+    );
+    // Test a narrowed version as this uses a layout start_offset.
+    let tail = full.i(2..)?;
+    let sum = tail.broadcast_add(&bias.reshape((2, 1))?)?;
+    assert_eq!(
+        sum.to_vec3::<f32>()?,
+        &[
+            [[112.0, 113.0, 114.0], [215.0, 216.0, 217.0]],
+            [[118.0, 119.0, 120.0], [221.0, 222.0, 223.0]]
+        ]
+    );
+    let sum_t = tail.t()?.broadcast_add(&bias)?;
+    assert_eq!(
+        sum_t.to_vec3::<f32>()?,
+        &[
+            [[112.0, 215.0], [113.0, 216.0], [114.0, 217.0]],
+            [[118.0, 221.0], [119.0, 222.0], [120.0, 223.0]]
+        ]
+    );
+    let diff = tail.broadcast_sub(&bias.reshape((2, 1))?)?;
+    assert_eq!(
+        diff.to_vec3::<f32>()?,
+        &[
+            [[-88.0, -87.0, -86.0], [-185.0, -184.0, -183.0]],
+            [[-82.0, -81.0, -80.0], [-179.0, -178.0, -177.0]]
+        ]
+    );
+    let diff_t = tail.t()?.broadcast_sub(&bias)?;
+    assert_eq!(
+        diff_t.to_vec3::<f32>()?,
+        &[
+            [[-88.0, -185.0], [-87.0, -184.0], [-86.0, -183.0]],
+            [[-82.0, -179.0], [-81.0, -178.0], [-80.0, -177.0]]
+        ]
+    );
+    // Dividing by the reciprocal of `bias` must give the same values as multiplying by it.
+    let inv_bias = Tensor::new(1f32, device)?.broadcast_div(&bias)?;
+    let product = tail.broadcast_mul(&bias.reshape((2, 1))?)?;
+    let quotient = tail.broadcast_div(&inv_bias.reshape((2, 1))?)?;
+    assert_eq!(
+        product.to_vec3::<f32>()?,
+        &[
+            [[1200.0, 1300.0, 1400.0], [3000.0, 3200.0, 3400.0]],
+            [[1800.0, 1900.0, 2000.0], [4200.0, 4400.0, 4600.0]]
+        ]
+    );
+    assert_eq!(product.to_vec3::<f32>()?, quotient.to_vec3::<f32>()?,);
+    let product_t = tail.t()?.broadcast_mul(&bias)?;
+    let quotient_t = tail.t()?.broadcast_div(&inv_bias)?;
+    assert_eq!(
+        product_t.to_vec3::<f32>()?,
+        &[
+            [[1200.0, 3000.0], [1300.0, 3200.0], [1400.0, 3400.0]],
+            [[1800.0, 4200.0], [1900.0, 4400.0], [2000.0, 4600.0]]
+        ]
+    );
+    assert_eq!(product_t.to_vec3::<f32>()?, quotient_t.to_vec3::<f32>()?,);
+    Ok(())
+}
+/// Sanity checks for `Tensor::randn` and `Tensor::rand`: the requested shape is
+/// honoured, two successive draws differ, and no position is stuck on a fixed value.
+fn randn(device: &Device) -> Result<()> {
+    let first = Tensor::randn(0f32, 1f32, (5, 3), device)?;
+    assert_eq!(first.dims(), [5, 3]);
+    // Check that the seed gets updated by checking that
+    // a new series of numbers is generated each time
+    let second = Tensor::randn(0f32, 1f32, (5, 3), device)?;
+    assert_ne!(first.to_vec2::<f32>()?, second.to_vec2::<f32>()?);
+    let first = Tensor::rand(0f32, 1f32, (5, 3), device)?;
+    assert_eq!(first.dims(), [5, 3]);
+    // Check that the seed gets updated by checking that
+    // a new series of numbers is generated each time
+    let second = Tensor::rand(0f32, 1f32, (5, 3), device)?;
+    assert_ne!(first.to_vec2::<f32>()?, second.to_vec2::<f32>()?);
+    // We do not expect deterministic elements at any index.
+    // There once was a bug that had a deterministic zero element in evenly sized tensors.
+    const N: usize = 2;
+    // True when, at every index, at least one pair of consecutive samples differs.
+    let varies_everywhere = |samples: &[Vec<f32>]| {
+        (0..N).all(|i| samples.windows(2).any(|pair| pair[0][i] != pair[1][i]))
+    };
+    let normal_draws = (0..100)
+        .map(|_| Tensor::randn(0f32, 1f32, N, device).and_then(|t| t.to_vec1::<f32>()))
+        .collect::<Result<Vec<_>>>()?;
+    assert!(
+        varies_everywhere(normal_draws.as_slice()),
+        "There are deterministic values in the randn tensors"
+    );
+    let uniform_draws = (0..100)
+        .map(|_| Tensor::rand(0f32, 1f32, N, device).and_then(|t| t.to_vec1::<f32>()))
+        .collect::<Result<Vec<_>>>()?;
+    assert!(
+        varies_everywhere(uniform_draws.as_slice()),
+        "There are deterministic values in the rand tensors"
+    );
+    Ok(())
+}
+// Register the device-parameterised test bodies through `test_device!`. Each
+// invocation passes the shared test function followed by three names ending in
+// `_cpu`, `_gpu` and `_metal`.
+// NOTE(review): the macro definition is not visible here - presumably it emits one
+// `#[test]` per backend under those names; confirm against its definition.
+test_device!(zeros, zeros_cpu, zeros_gpu, zeros_metal);
+test_device!(ones, ones_cpu, ones_gpu, ones_metal);
+test_device!(full, full_cpu, full_gpu, full_metal);
+test_device!(arange, arange_cpu, arange_gpu, arange_metal);
+test_device!(add_mul, add_mul_cpu, add_mul_gpu, add_mul_metal);
+test_device!(tensor_2d, tensor_2d_cpu, tensor_2d_gpu, tensor_2d_metal);
+test_device!(narrow, narrow_cpu, narrow_gpu, narrow_metal);
+test_device!(broadcast, broadcast_cpu, broadcast_gpu, broadcast_metal);
+test_device!(cat, cat_cpu, cat_gpu, cat_metal);
+test_device!(sum, sum_cpu, sum_gpu, sum_metal);
+test_device!(min, min_cpu, min_gpu, min_metal);
+test_device!(max, max_cpu, max_gpu, max_metal);
+test_device!(argmax, argmax_cpu, argmax_gpu, argmax_metal);
+test_device!(argmin, argmin_cpu, argmin_gpu, argmin_metal);
+test_device!(transpose, transpose_cpu, transpose_gpu, transpose_metal);
+test_device!(unary_op, unary_op_cpu, unary_op_gpu, unary_op_metal);
+test_device!(binary_op, binary_op_cpu, binary_op_gpu, binary_op_metal);
+test_device!(embeddings, embeddings_cpu, embeddings_gpu, embeddings_metal);
+test_device!(cmp, cmp_cpu, cmp_gpu, cmp_metal);
+test_device!(
+    broadcasting,
+    broadcasting_cpu,
+    broadcasting_gpu,
+    broadcasting_metal
+);
+test_device!(
+    index_select,
+    index_select_cpu,
+    index_select_gpu,
+    index_select_metal
+);
+test_device!(index_add, index_add_cpu, index_add_gpu, index_add_metal);
+test_device!(gather, gather_cpu, gather_gpu, gather_metal);
+test_device!(
+    scatter_add,
+    scatter_add_cpu,
+    scatter_add_gpu,
+    scatter_add_metal
+);
+test_device!(
+    slice_scatter,
+    slice_scatter_cpu,
+    slice_scatter_gpu,
+    slice_scatter_metal
+);
+test_device!(randn, randn_cpu, randn_gpu, randn_metal);
+test_device!(clamp, clamp_cpu, clamp_gpu, clamp_metal);
+test_device!(var, var_cpu, var_gpu, var_metal);
+// Regression test for a bug in the original CPU implementation of randn, see
+// https://github.com/huggingface/candle/issues/381
+#[test]
+fn randn_hasneg() -> Result<()> {
+    let samples = Tensor::randn(0f32, 1f32, 200, &Device::Cpu)?;
+    let samples = samples.to_vec1::<f32>()?;
+    // 200 draws centred on zero should contain at least one negative value.
+    let all_non_negative = samples.iter().all(|v| *v >= 0.);
+    if all_non_negative {
+        candle_core::bail!("all values in tensors are non-negative")
+    }
+    Ok(())
+}
+/// Checks that `pad_with_same` repeats the edge slice: one copy before and two copies
+/// after, along dim 0 and then along dim 1.
+#[test]
+fn pad_with_same() -> Result<()> {
+    let base = Tensor::arange(1f32, 5f32, &Device::Cpu)?.reshape((2, 2))?;
+    let padded_rows = base.pad_with_same(0, 1, 2)?.to_vec2::<f32>()?;
+    assert_eq!(
+        padded_rows,
+        [[1.0, 2.0], [1.0, 2.0], [3.0, 4.0], [3.0, 4.0], [3.0, 4.0]]
+    );
+    let padded_cols = base.pad_with_same(1, 1, 2)?.to_vec2::<f32>()?;
+    assert_eq!(
+        padded_cols,
+        [[1.0, 1.0, 2.0, 2.0, 2.0], [3.0, 3.0, 4.0, 4.0, 4.0]]
+    );
+    Ok(())
+}
+/// Checks `abs` on an `i64` tensor holding one negative and one positive value.
+#[test]
+fn i64_abs() -> Result<()> {
+    let signed = Tensor::new(&[-42i64, 1337], &Device::Cpu)?;
+    let magnitudes = signed.abs()?.to_vec1::<i64>()?;
+    assert_eq!(magnitudes, [42, 1337]);
+    Ok(())
+}
+/// Checks the 4x4 lower-triangular, upper-triangular and identity constructors.
+#[test]
+fn tril_triu_eye() -> Result<()> {
+    let cpu = Device::Cpu;
+    let lower = Tensor::tril2(4, DType::F32, &cpu)?.to_vec2::<f32>()?;
+    assert_eq!(
+        lower,
+        [
+            [1.0, 0.0, 0.0, 0.0],
+            [1.0, 1.0, 0.0, 0.0],
+            [1.0, 1.0, 1.0, 0.0],
+            [1.0, 1.0, 1.0, 1.0]
+        ],
+    );
+    let upper = Tensor::triu2(4, DType::F32, &cpu)?.to_vec2::<f32>()?;
+    assert_eq!(
+        upper,
+        [
+            [1.0, 1.0, 1.0, 1.0],
+            [0.0, 1.0, 1.0, 1.0],
+            [0.0, 0.0, 1.0, 1.0],
+            [0.0, 0.0, 0.0, 1.0]
+        ]
+    );
+    let identity = Tensor::eye(4, DType::F32, &cpu)?.to_vec2::<f32>()?;
+    assert_eq!(
+        identity,
+        [
+            [1.0, 0.0, 0.0, 0.0],
+            [0.0, 1.0, 0.0, 0.0],
+            [0.0, 0.0, 1.0, 0.0],
+            [0.0, 0.0, 0.0, 1.0]
+        ]
+    );
+    Ok(())
+}
+/// Checks `cumsum` on a 1D tensor, on the same data as a single column, and along both
+/// dimensions of a 2x5 matrix.
+#[test]
+fn cumsum() -> Result<()> {
+    let cpu = Device::Cpu;
+    let row = Tensor::new(&[3f32, 1., 4., 1., 5.], &cpu)?;
+    assert_eq!(row.cumsum(0)?.to_vec1::<f32>()?, [3., 4., 8., 9., 14.]);
+    let column = row.unsqueeze(1)?;
+    let running = column.cumsum(0)?.to_vec2::<f32>()?;
+    assert_eq!(running, [[3.0], [4.0], [8.0], [9.0], [14.0]]);
+    // Dim 1 has a single element per row, so the values come back unchanged.
+    let unchanged = column.cumsum(1)?.to_vec2::<f32>()?;
+    assert_eq!(unchanged, [[3.0], [1.0], [4.0], [1.0], [5.0]]);
+    let matrix = Tensor::new(&[[3f32, 1., 4., 1., 5.], [2., 1., 7., 8., 2.]], &cpu)?;
+    let along_rows = matrix.cumsum(1)?.to_vec2::<f32>()?;
+    assert_eq!(
+        along_rows,
+        [[3.0, 4.0, 8.0, 9.0, 14.0], [2.0, 3.0, 10.0, 18.0, 20.0]],
+    );
+    let along_cols = matrix.cumsum(0)?.to_vec2::<f32>()?;
+    assert_eq!(
+        along_cols,
+        [[3.0, 1.0, 4.0, 1.0, 5.0], [5.0, 2.0, 11.0, 9.0, 7.0]]
+    );
+    Ok(())
+}
+/// A helper for approximate floating point comparison of two 1D tensors.
+///
+/// Both `a` and `b` are read back through `to_vec1` as `f64` values and must contain
+/// the same number of elements. The assertion passes when every pair of corresponding
+/// elements differs by strictly less than `epsilon`; on failure the panic message
+/// reports the offending index and both values.
+fn assert_close(a: &Tensor, b: &Tensor, epsilon: f64) -> Result<()> {
+    let a_vec: Vec<f64> = a.to_vec1()?;
+    let b_vec: Vec<f64> = b.to_vec1()?;
+    assert_eq!(
+        a_vec.len(),
+        b_vec.len(),
+        "tensors hold a different number of elements"
+    );
+    for (idx, (a, b)) in a_vec.iter().zip(b_vec.iter()).enumerate() {
+        assert!(
+            (a - b).abs() < epsilon,
+            "element {} differs: {} vs {} (epsilon {})",
+            idx,
+            a,
+            b,
+            epsilon
+        );
+    }
+    Ok(())
+}
+/// Checks `log_sum_exp` over the last dimension of a 2x3 `f64` tensor.
+#[test]
+fn log_sum_exp() -> Result<()> {
+    let cpu = Device::Cpu;
+    let logits = Tensor::new(&[[1f64, 2., 3.], [4., 5., 6.]], &cpu)?;
+    let reduced = logits.log_sum_exp(D::Minus1)?;
+    // The expectations obtained from pytorch.
+    let reference = Tensor::new(&[3.4076, 6.4076], &cpu)?;
+    assert_close(&reduced, &reference, 0.00001)?;
+    Ok(())
+}
+/// Checks element-wise `pow` with a tensor exponent, comparing values rounded to three
+/// decimals.
+#[test]
+fn pow() -> Result<()> {
+    let base = Tensor::new(&[[1f32, 2., 3.], [4., 5., 6.]], &Device::Cpu)?;
+    // Exponents are `base - 2`, i.e. [[-1, 0, 1], [2, 3, 4]].
+    let exponent = (&base - 2.)?;
+    let powered = base.pow(&exponent)?;
+    let rounded = test_utils::to_vec2_round(&powered, 3)?;
+    assert_eq!(rounded, [[1.0, 1.0, 3.0], [16.0, 125.0, 1296.0]]);
+    Ok(())
+}
--- a/candle-core/tests/test.npy
+++ b/candle-core/tests/test.npy
--- a/candle-core/tests/test.pt
+++ b/candle-core/tests/test.pt
--- a/candle-core/tests/test_with_key.pt
+++ b/candle-core/tests/test_with_key.pt
--- a/candle-datasets/Cargo.toml
+++ b/candle-datasets/Cargo.toml
+[package]
+name = "candle-datasets"
+version.workspace = true
+edition.workspace = true
+description.workspace = true
+repository.workspace = true
+keywords.workspace = true
+categories.workspace = true
+license.workspace = true
+readme = "README.md"
+[dependencies]
+byteorder = { workspace = true }
+candle = { workspace = true }
+candle-nn = { workspace = true }
+hf-hub = { workspace = true}
+intel-mkl-src = { workspace = true, optional = true }
+memmap2 = { workspace = true }
+tokenizers = { workspace = true, features = ["onig"] }
+rand = { workspace = true }
+thiserror = { workspace = true }
+parquet = { workspace = true}
+image = { workspace = true }
--- a/candle-datasets/README.md
+++ b/candle-datasets/README.md
+# candle-datasets
--- a/candle-datasets/src/batcher.rs
+++ b/candle-datasets/src/batcher.rs
+use candle::{Result, Tensor};
+/// Wraps an inner item source and groups its items into batches of `batch_size`.
+///
+/// Instances are created through the typed constructors (`new1`, `new2`, `new_r1`,
+/// `new_r2`) and tuned with the builder-style setters below.
+pub struct Batcher<I> {
+    inner: I,
+    batch_size: usize,
+    return_last_incomplete_batch: bool,
+}
+impl<I> Batcher<I> {
+    /// Wraps `inner` with the defaults: a batch size of 16 and
+    /// `return_last_incomplete_batch` disabled.
+    fn new(inner: I) -> Self {
+        let batch_size = 16;
+        let return_last_incomplete_batch = false;
+        Self {
+            inner,
+            batch_size,
+            return_last_incomplete_batch,
+        }
+    }
+    /// Returns the batcher with its batch size replaced by `batch_size`.
+    pub fn batch_size(self, batch_size: usize) -> Self {
+        Self {
+            batch_size,
+            ..self
+        }
+    }
+    /// Returns the batcher with the `return_last_incomplete_batch` flag set to `r`.
+    pub fn return_last_incomplete_batch(self, r: bool) -> Self {
+        Self {
+            return_last_incomplete_batch: r,
+            ..self
+        }
+    }
+}
+/// Marker wrapper around an iterator of single tensors, used as a `Batcher` source.
+pub struct Iter1<I>
+where
+    I: Iterator<Item = Tensor>,
+{
+    inner: I,
+}
+/// Marker wrapper around an iterator of tensor pairs, used as a `Batcher` source.
+pub struct Iter2<I>
+where
+    I: Iterator<Item = (Tensor, Tensor)>,
+{
+    inner: I,
+}
+impl<I> Batcher<Iter1<I>>
+where
+    I: Iterator<Item = Tensor>,
+{
+    /// Builds a batcher over an iterator of single tensors.
+    pub fn new1(inner: I) -> Self {
+        let source = Iter1 { inner };
+        Self::new(source)
+    }
+}
+impl<I> Batcher<Iter2<I>>
+where
+    I: Iterator<Item = (Tensor, Tensor)>,
+{
+    /// Builds a batcher over an iterator of tensor pairs.
+    pub fn new2(inner: I) -> Self {
+        let source = Iter2 { inner };
+        Self::new(source)
+    }
+}
+/// Marker wrapper around an iterator of fallible single tensors, used as a `Batcher`
+/// source.
+pub struct IterResult1<I>
+where
+    I: Iterator<Item = Result<Tensor>>,
+{
+    inner: I,
+}
+/// Marker wrapper around an iterator of fallible tensor pairs, used as a `Batcher`
+/// source.
+pub struct IterResult2<I>
+where
+    I: Iterator<Item = Result<(Tensor, Tensor)>>,
+{
+    inner: I,
+}
+impl<I> Batcher<IterResult1<I>>
+where
+    I: Iterator<Item = Result<Tensor>>,
+{
+    /// Builds a batcher over an iterator of fallible single tensors.
+    pub fn new_r1(inner: I) -> Self {
+        let source = IterResult1 { inner };
+        Self::new(source)
+    }
+}
+impl<I> Batcher<IterResult2<I>>
+where
+    I: Iterator<Item = Result<(Tensor, Tensor)>>,
+{
+    /// Builds a batcher over an iterator of fallible tensor pairs.
+    pub fn new_r2(inner: I) -> Self {
+        let source = IterResult2 { inner };
+        Self::new(source)
+    }
+}
+impl<I: Iterator<Item = Tensor>> Iterator for Batcher<Iter1<I>> {
+    type Item = Result<Tensor>;
+    /// Pulls up to `batch_size` tensors from the source and stacks them along a new
+    /// leading dimension.
+    ///
+    /// Returns `None` once the source is exhausted. A trailing batch with fewer than
+    /// `batch_size` items is only returned when `return_last_incomplete_batch` is set
+    /// and at least one item was read; otherwise it is dropped.
+    fn next(&mut self) -> Option<Self::Item> {
+        let mut items = Vec::with_capacity(self.batch_size);
+        for _i in 0..self.batch_size {
+            // We have two levels of inner here so that we can have two implementations of the
+            // Iterator trait that are different for Iter1 and Iter2. If rust gets better
+            // specialization at some point we can get rid of this.
+            match self.inner.inner.next() {
+                Some(item) => items.push(item),
+                None => {
+                    // Only emit a partial batch that actually holds items: stacking an
+                    // empty list is an error, so without this check an exhausted source
+                    // would yield `Some(Err(_))` on every call instead of ending.
+                    if self.return_last_incomplete_batch && !items.is_empty() {
+                        break;
+                    }
+                    return None;
+                }
+            }
+        }
+        Some(Tensor::stack(&items, 0))
+    }
+}
+impl<I: Iterator<Item = (Tensor, Tensor)>> Iterator for Batcher<Iter2<I>> {
+    type Item = Result<(Tensor, Tensor)>;
+    /// Pulls up to `batch_size` pairs from the source and stacks the first and second
+    /// components separately along a new leading dimension.
+    ///
+    /// Returns `None` once the source is exhausted. A trailing batch with fewer than
+    /// `batch_size` pairs is only returned when `return_last_incomplete_batch` is set
+    /// and at least one pair was read; otherwise it is dropped.
+    fn next(&mut self) -> Option<Self::Item> {
+        let mut xs = Vec::with_capacity(self.batch_size);
+        let mut ys = Vec::with_capacity(self.batch_size);
+        for _i in 0..self.batch_size {
+            match self.inner.inner.next() {
+                Some((x, y)) => {
+                    xs.push(x);
+                    ys.push(y)
+                }
+                None => {
+                    // `xs` and `ys` always have the same length, so checking one is
+                    // enough. Stacking an empty list is an error, so an empty partial
+                    // batch must end the iteration rather than be emitted.
+                    if self.return_last_incomplete_batch && !xs.is_empty() {
+                        break;
+                    }
+                    return None;
+                }
+            }
+        }
+        let xs = Tensor::stack(&xs, 0);
+        let ys = Tensor::stack(&ys, 0);
+        Some(xs.and_then(|xs| ys.map(|ys| (xs, ys))))
+    }
+}
+impl<I: Iterator<Item = Result<Tensor>>> Iterator for Batcher<IterResult1<I>> {
+    type Item = Result<Tensor>;
+    /// Pulls up to `batch_size` fallible tensors from the source and stacks them along
+    /// a new leading dimension; the first error among the pulled items, if any, is
+    /// returned instead of the batch.
+    ///
+    /// Returns `None` once the source is exhausted. A trailing batch with fewer than
+    /// `batch_size` items is only returned when `return_last_incomplete_batch` is set
+    /// and at least one item was read; otherwise it is dropped, together with any
+    /// errors it contained.
+    fn next(&mut self) -> Option<Self::Item> {
+        let mut items = Vec::with_capacity(self.batch_size);
+        for _i in 0..self.batch_size {
+            // We have two levels of inner here so that we can have separate implementations
+            // of the Iterator trait for IterResult1 and IterResult2. If rust gets better
+            // specialization at some point we can get rid of this.
+            match self.inner.inner.next() {
+                Some(item) => items.push(item),
+                None => {
+                    // Only emit a partial batch that actually holds items: stacking an
+                    // empty list is an error, so without this check an exhausted source
+                    // would yield `Some(Err(_))` on every call instead of ending.
+                    if self.return_last_incomplete_batch && !items.is_empty() {
+                        break;
+                    }
+                    return None;
+                }
+            }
+        }
+        let items = items.into_iter().collect::<Result<Vec<Tensor>>>();
+        Some(items.and_then(|items| Tensor::stack(&items, 0)))
+    }
+}
+impl<I: Iterator<Item = Result<(Tensor, Tensor)>>> Iterator for Batcher<IterResult2<I>> {
+    type Item = Result<(Tensor, Tensor)>;
+    /// Pulls up to `batch_size` fallible pairs from the source and stacks the first and
+    /// second components separately along a new leading dimension; the first error
+    /// among the pulled items, if any, is returned instead of the batch.
+    ///
+    /// Returns `None` once the source is exhausted. A trailing batch with fewer than
+    /// `batch_size` items is only returned when `return_last_incomplete_batch` is set
+    /// and at least one item (pair or error) was read; otherwise it is dropped,
+    /// together with any errors it contained.
+    fn next(&mut self) -> Option<Self::Item> {
+        let mut xs = Vec::with_capacity(self.batch_size);
+        let mut ys = Vec::with_capacity(self.batch_size);
+        let mut errs = vec![];
+        for _i in 0..self.batch_size {
+            match self.inner.inner.next() {
+                Some(Ok((x, y))) => {
+                    xs.push(x);
+                    ys.push(y)
+                }
+                Some(Err(err)) => errs.push(err),
+                None => {
+                    // Break only when something was read: `xs`/`ys` grow together, and
+                    // pending errors must still be surfaced below. With nothing read,
+                    // stacking would fail on the empty list and the iterator would
+                    // yield `Some(Err(_))` on every call instead of ending.
+                    let read_something = !xs.is_empty() || !errs.is_empty();
+                    if self.return_last_incomplete_batch && read_something {
+                        break;
+                    }
+                    return None;
+                }
+            }
+        }
+        if !errs.is_empty() {
+            // Report the first error that was encountered.
+            return Some(Err(errs.swap_remove(0)));
+        }
+        let xs = Tensor::stack(&xs, 0);
+        let ys = Tensor::stack(&ys, 0);
+        Some(xs.and_then(|xs| ys.map(|ys| (xs, ys))))
+    }
+}
--- a/candle-datasets/src/hub.rs
+++ b/candle-datasets/src/hub.rs
+use hf_hub::{
+    api::sync::{Api, ApiRepo},
+    Repo, RepoType,
+};
+use parquet::file::reader::SerializedFileReader;
+use std::fs::File;
+/// Errors that can occur while fetching dataset files from the hub and opening them
+/// as parquet.
+#[derive(Debug, thiserror::Error)]
+pub enum Error {
+    /// Failure reported by the `hf_hub` sync API.
+    #[error("ApiError : {0}")]
+    ApiError(#[from] hf_hub::api::sync::ApiError),
+    /// Failure while accessing a local file.
+    #[error("IoError : {0}")]
+    IoError(#[from] std::io::Error),
+    /// Failure reported by the parquet reader.
+    #[error("ParquetError : {0}")]
+    ParquetError(#[from] parquet::errors::ParquetError),
+}
+/// Fetches `rfilename` from `repo`, opens the resulting local file and wraps it in a
+/// parquet reader. Any step that fails is converted into [`Error`].
+fn sibling_to_parquet(
+    rfilename: &str,
+    repo: &ApiRepo,
+) -> Result<SerializedFileReader<File>, Error> {
+    let local_path = repo.get(rfilename)?;
+    let handle = File::open(local_path)?;
+    Ok(SerializedFileReader::new(handle)?)
+}
+/// Opens every `.parquet` file listed in the `refs/convert/parquet` revision of the
+/// dataset `dataset_id`.
+///
+/// Files are opened in listing order; the first failure aborts the collection and is
+/// returned as the error.
+pub fn from_hub(api: &Api, dataset_id: String) -> Result<Vec<SerializedFileReader<File>>, Error> {
+    let revision = "refs/convert/parquet".to_string();
+    let repo = api.repo(Repo::with_revision(dataset_id, RepoType::Dataset, revision));
+    let info = repo.info()?;
+    info.siblings
+        .into_iter()
+        .filter(|sibling| sibling.rfilename.ends_with(".parquet"))
+        .map(|sibling| sibling_to_parquet(&sibling.rfilename, &repo))
+        .collect()
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use parquet::file::reader::FileReader;
+    /// Fetches a small test dataset through `from_hub` and checks the number of files
+    /// and the row count of the first one.
+    #[test]
+    fn test_dataset() {
+        let dataset_id = "hf-internal-testing/dummy_image_text_data".to_string();
+        let api = Api::new().unwrap();
+        let readers = from_hub(&api, dataset_id).unwrap();
+        assert_eq!(readers.len(), 1);
+        let row_count = readers[0].metadata().file_metadata().num_rows();
+        assert_eq!(row_count, 20);
+    }
+}
--- a/candle-datasets/src/lib.rs
+++ b/candle-datasets/src/lib.rs
+//! Datasets & Dataloaders for Candle
+pub mod batcher;
+pub mod hub;
+pub mod nlp;
+pub mod vision;
+pub use batcher::Batcher;