Initial commit

25d2752f · yongshk · 25d2752f · 25d2752f · 25d2752f · 25d2752f
Commit 25d2752f authored May 29, 2025 by yongshk
20 changed files
--- a/candle-core/examples/cuda_sum_benchmark.rs
+++ b/candle-core/examples/cuda_sum_benchmark.rs
+#[cfg(any(feature = "mkl", feature = "mkl-dynamic"))]
+extern crate intel_mkl_src;
+#[cfg(feature = "accelerate")]
+extern crate accelerate_src;
+use std::str::FromStr;
+use anyhow::Result;
+use candle_core::{Device, Tensor};
+/// Builds the benchmark input on `device`: an `(n, 1)` column of `|cos(theta)|` multiplied by a
+/// `(1, 6 * n)` row made of six concatenated copies of `|sin(theta)|`, where `theta = i / n`
+/// for `i` in `0..n`.
+fn cos_sin(n: usize, device: &Device) -> Result<Tensor> {
+    let denom = n as f32;
+    let mut abs_cos = Vec::with_capacity(n);
+    let mut abs_sin = Vec::with_capacity(n);
+    for i in 0..n {
+        let theta = i as f32 / denom;
+        abs_cos.push(theta.cos().abs());
+        abs_sin.push(theta.sin().abs());
+    }
+    let column = Tensor::from_vec(abs_cos, (n, 1), device)?;
+    let row = Tensor::from_vec(abs_sin, (1, n), device)?;
+    // Six copies of the row, joined along dim 1.
+    let wide_row = Tensor::cat(&[&row, &row, &row, &row, &row, &row], 1)?;
+    let product = column.matmul(&wide_row)?;
+    Ok(product)
+}
+/// Benchmark driver: prints the CPU and CUDA inputs and their `sum_keepdim(1)` results, then
+/// times 100 full reductions of the CUDA tensor. The first command-line argument, when given,
+/// replaces the default problem size of 2000.
+fn main() -> Result<()> {
+    let device = Device::new_cuda(0)?;
+    let args = std::env::args().collect::<Vec<String>>();
+    // First positional argument, when present, overrides the default size.
+    let n = match args.get(1) {
+        Some(raw) => usize::from_str(raw)?,
+        None => 2000usize,
+    };
+    let xys_cpu = cos_sin(n, &Device::Cpu)?;
+    let xys = cos_sin(n, &device)?;
+    println!("{xys_cpu:?} {xys:?}");
+    let sum_keepdim_cpu = xys_cpu.sum_keepdim(1)?;
+    println!("{sum_keepdim_cpu}");
+    let sum_keepdim = xys.sum_keepdim(1)?;
+    println!("{sum_keepdim}");
+    let start = std::time::Instant::now();
+    let n_iters = 100;
+    let mut v = 0f32;
+    for _ in 0..n_iters {
+        // Reduce dim 1, then dim 0, and read the single remaining value back as an `f32`.
+        let total: f32 = xys
+            .sum_keepdim(1)?
+            .sum_keepdim(0)?
+            .reshape(&[])?
+            .to_scalar()?;
+        v += total;
+    }
+    let elapsed = start.elapsed();
+    // Nothing is reported when the accumulated sum is not strictly positive.
+    if v > 0. {
+        println!(
+            "ran {n_iters} iterations, time per iter: {:?} ({v})",
+            elapsed.div_f64(n_iters as f64)
+        );
+    }
+    Ok(())
+}
--- a/candle-core/src/accelerate.rs
+++ b/candle-core/src/accelerate.rs
+#![allow(dead_code)]
+use libc::{c_char, c_double, c_float, c_int, c_long, c_ulong};
+// Raw `extern "C"` declarations for the Accelerate routines wrapped by the functions further
+// down in this file. Nothing here validates pointers or lengths; that is left to the callers.
+mod ffi {
+    use super::*;
+    extern "C" {
+        // It would be nice to be able to switch to the NEWLAPACK version of the function but this
+        // seems to trigger some link error. Available function names can be seen here:
+        // /Library/Developer/CommandLineTools/SDKs/MacOSX13.3.sdk/System/Library/Frameworks/Accelerate.framework/Versions/A/Accelerate.tbd
+        // Single-precision matrix multiply; every argument, scalars included, is passed by pointer.
+        #[link_name = "sgemm_"]
+        pub fn sgemm_ffi(
+            transa: *const c_char,
+            transb: *const c_char,
+            m: *const c_int,
+            n: *const c_int,
+            k: *const c_int,
+            alpha: *const c_float,
+            a: *const c_float,
+            lda: *const c_int,
+            b: *const c_float,
+            ldb: *const c_int,
+            beta: *const c_float,
+            c: *mut c_float,
+            ldc: *const c_int,
+        );
+        // Double-precision counterpart of `sgemm_ffi`, with the same argument layout.
+        #[link_name = "dgemm_"]
+        pub fn dgemm_ffi(
+            transa: *const c_char,
+            transb: *const c_char,
+            m: *const c_int,
+            n: *const c_int,
+            k: *const c_int,
+            alpha: *const c_double,
+            a: *const c_double,
+            lda: *const c_int,
+            b: *const c_double,
+            ldb: *const c_int,
+            beta: *const c_double,
+            c: *mut c_double,
+            ldc: *const c_int,
+        );
+        // Elementwise unary routines: `dst` and `src` are element buffers and `len` points to the
+        // element count. The `f`-suffixed names take `c_float`, the others `c_double`.
+        pub fn vvexpf(dst: *mut c_float, src: *const c_float, len: *const c_int);
+        pub fn vvexp(dst: *mut c_double, src: *const c_double, len: *const c_int);
+        pub fn vvsqrtf(dst: *mut c_float, src: *const c_float, len: *const c_int);
+        pub fn vvsqrt(dst: *mut c_double, src: *const c_double, len: *const c_int);
+        pub fn vvsinf(dst: *mut c_float, src: *const c_float, len: *const c_int);
+        pub fn vvsin(dst: *mut c_double, src: *const c_double, len: *const c_int);
+        pub fn vvcosf(dst: *mut c_float, src: *const c_float, len: *const c_int);
+        pub fn vvcos(dst: *mut c_double, src: *const c_double, len: *const c_int);
+        pub fn vvlogf(dst: *mut c_float, src: *const c_float, len: *const c_int);
+        pub fn vvlog(dst: *mut c_double, src: *const c_double, len: *const c_int);
+        pub fn vvtanhf(dst: *mut c_float, src: *const c_float, len: *const c_int);
+        pub fn vvtanh(dst: *mut c_double, src: *const c_double, len: *const c_int);
+        // Elementwise binary routines. The parameters are unnamed here; `binary_op!` in this file
+        // passes them as (input, 1, input, 1, output, 1, element count). The `c_long` values are
+        // presumably strides -- confirm against the vDSP reference. The `D`-suffixed names take
+        // `c_double`, the others `c_float`.
+        pub fn vDSP_vaddD(
+            _: *const c_double,
+            _: c_long,
+            _: *const c_double,
+            _: c_long,
+            _: *mut c_double,
+            _: c_long,
+            _: c_ulong,
+        );
+        pub fn vDSP_vadd(
+            _: *const c_float,
+            _: c_long,
+            _: *const c_float,
+            _: c_long,
+            _: *mut c_float,
+            _: c_long,
+            _: c_ulong,
+        );
+        pub fn vDSP_vsubD(
+            _: *const c_double,
+            _: c_long,
+            _: *const c_double,
+            _: c_long,
+            _: *mut c_double,
+            _: c_long,
+            _: c_ulong,
+        );
+        pub fn vDSP_vsub(
+            _: *const c_float,
+            _: c_long,
+            _: *const c_float,
+            _: c_long,
+            _: *mut c_float,
+            _: c_long,
+            _: c_ulong,
+        );
+        pub fn vDSP_vmulD(
+            _: *const c_double,
+            _: c_long,
+            _: *const c_double,
+            _: c_long,
+            _: *mut c_double,
+            _: c_long,
+            _: c_ulong,
+        );
+        pub fn vDSP_vmul(
+            _: *const c_float,
+            _: c_long,
+            _: *const c_float,
+            _: c_long,
+            _: *mut c_float,
+            _: c_long,
+            _: c_ulong,
+        );
+        pub fn vDSP_vdivD(
+            _: *const c_double,
+            _: c_long,
+            _: *const c_double,
+            _: c_long,
+            _: *mut c_double,
+            _: c_long,
+            _: c_ulong,
+        );
+        pub fn vDSP_vdiv(
+            _: *const c_float,
+            _: c_long,
+            _: *const c_float,
+            _: c_long,
+            _: *mut c_float,
+            _: c_long,
+            _: c_ulong,
+        );
+        pub fn vDSP_vminD(
+            _: *const c_double,
+            _: c_long,
+            _: *const c_double,
+            _: c_long,
+            _: *mut c_double,
+            _: c_long,
+            _: c_ulong,
+        );
+        pub fn vDSP_vmin(
+            _: *const c_float,
+            _: c_long,
+            _: *const c_float,
+            _: c_long,
+            _: *mut c_float,
+            _: c_long,
+            _: c_ulong,
+        );
+        pub fn vDSP_vmaxD(
+            _: *const c_double,
+            _: c_long,
+            _: *const c_double,
+            _: c_long,
+            _: *mut c_double,
+            _: c_long,
+            _: c_ulong,
+        );
+        pub fn vDSP_vmax(
+            _: *const c_float,
+            _: c_long,
+            _: *const c_float,
+            _: c_long,
+            _: *mut c_float,
+            _: c_long,
+            _: c_ulong,
+        );
+    }
+}
+/// Single-precision matrix multiply, forwarded unchanged to the `sgemm_` symbol.
+///
+/// Each scalar argument is passed to the foreign routine by pointer and the slices are passed
+/// as raw data pointers. The argument names follow the usual BLAS `gemm` convention
+/// (presumably column-major, `c = alpha * op(a) * op(b) + beta * c` -- confirm against the
+/// BLAS reference).
+///
+/// # Safety
+/// No validation is performed here: the lengths of `a`, `b` and `c` are never compared with
+/// `m`, `n`, `k`, `lda`, `ldb` or `ldc`. The caller must guarantee that the slices are large
+/// enough for the requested operation, otherwise the foreign routine reads or writes out of
+/// bounds.
+#[allow(clippy::too_many_arguments)]
+#[inline]
+pub unsafe fn sgemm(
+    transa: u8,
+    transb: u8,
+    m: i32,
+    n: i32,
+    k: i32,
+    alpha: f32,
+    a: &[f32],
+    lda: i32,
+    b: &[f32],
+    ldb: i32,
+    beta: f32,
+    c: &mut [f32],
+    ldc: i32,
+) {
+    ffi::sgemm_ffi(
+        &(transa as c_char),
+        &(transb as c_char),
+        &m,
+        &n,
+        &k,
+        &alpha,
+        a.as_ptr(),
+        &lda,
+        b.as_ptr(),
+        &ldb,
+        &beta,
+        c.as_mut_ptr(),
+        &ldc,
+    )
+}
+/// Double-precision matrix multiply, forwarded unchanged to the `dgemm_` symbol.
+///
+/// Each scalar argument is passed to the foreign routine by pointer and the slices are passed
+/// as raw data pointers. The argument names follow the usual BLAS `gemm` convention
+/// (presumably column-major, `c = alpha * op(a) * op(b) + beta * c` -- confirm against the
+/// BLAS reference).
+///
+/// # Safety
+/// No validation is performed here: the lengths of `a`, `b` and `c` are never compared with
+/// `m`, `n`, `k`, `lda`, `ldb` or `ldc`. The caller must guarantee that the slices are large
+/// enough for the requested operation, otherwise the foreign routine reads or writes out of
+/// bounds.
+#[allow(clippy::too_many_arguments)]
+#[inline]
+pub unsafe fn dgemm(
+    transa: u8,
+    transb: u8,
+    m: i32,
+    n: i32,
+    k: i32,
+    alpha: f64,
+    a: &[f64],
+    lda: i32,
+    b: &[f64],
+    ldb: i32,
+    beta: f64,
+    c: &mut [f64],
+    ldc: i32,
+) {
+    ffi::dgemm_ffi(
+        &(transa as c_char),
+        &(transb as c_char),
+        &m,
+        &n,
+        &k,
+        &alpha,
+        a.as_ptr(),
+        &lda,
+        b.as_ptr(),
+        &ldb,
+        &beta,
+        c.as_mut_ptr(),
+        &ldc,
+    )
+}
+// Generates the out-of-place elementwise unary wrappers. Every generated function has the
+// signature `fn(a: &[T], y: &mut [T])` and hands both buffers to the named routine from `ffi`.
+macro_rules! unary_op {
+    ($fn_name:ident, $ty:ty, $accelerate_name:ident) => {
+        /// Applies the routine selected in the `unary_op!` invocation to every element of `a`
+        /// and stores the results in `y`.
+        ///
+        /// # Panics
+        /// Panics if `a` and `y` have different lengths, or if that length does not fit in
+        /// the `c_int` element count taken by the foreign routine.
+        #[inline]
+        pub fn $fn_name(a: &[$ty], y: &mut [$ty]) {
+            let a_len = a.len();
+            let y_len = y.len();
+            if a_len != y_len {
+                panic!("a and y have different lengths {a_len} <> {y_len}")
+            }
+            // A plain `as` cast would silently wrap for very long slices and hand the foreign
+            // routine a bogus (possibly negative) element count.
+            let len = c_int::try_from(a_len).unwrap_or_else(|_| {
+                panic!(
+                    "{}: length {a_len} does not fit in a 32-bit element count",
+                    stringify!($fn_name)
+                )
+            });
+            // SAFETY: `a` and `y` are valid, non-overlapping slices of exactly `len` elements
+            // (lengths checked above), and `len` outlives the call.
+            unsafe { ffi::$accelerate_name(y.as_mut_ptr(), a.as_ptr(), &len) }
+        }
+    };
+}
+unary_op!(vs_exp, f32, vvexpf);
+unary_op!(vd_exp, f64, vvexp);
+unary_op!(vs_sqrt, f32, vvsqrtf);
+unary_op!(vd_sqrt, f64, vvsqrt);
+unary_op!(vs_sin, f32, vvsinf);
+unary_op!(vd_sin, f64, vvsin);
+unary_op!(vs_cos, f32, vvcosf);
+unary_op!(vd_cos, f64, vvcos);
+unary_op!(vs_tanh, f32, vvtanhf);
+unary_op!(vd_tanh, f64, vvtanh);
+unary_op!(vs_ln, f32, vvlogf);
+unary_op!(vd_ln, f64, vvlog);
+/// Writes the square of each element of `a` into the matching slot of `y`.
+///
+/// # Panics
+/// Panics if the two slices have different lengths.
+#[inline]
+pub fn vs_sqr(a: &[f32], y: &mut [f32]) {
+    let (a_len, y_len) = (a.len(), y.len());
+    if a_len != y_len {
+        panic!("a and y have different lengths {a_len} <> {y_len}")
+    }
+    for (dst, &src) in y.iter_mut().zip(a) {
+        *dst = src * src;
+    }
+}
+/// Writes the square of each element of `a` into the matching slot of `y`.
+///
+/// # Panics
+/// Panics if the two slices have different lengths.
+#[inline]
+pub fn vd_sqr(a: &[f64], y: &mut [f64]) {
+    let (a_len, y_len) = (a.len(), y.len());
+    if a_len != y_len {
+        panic!("a and y have different lengths {a_len} <> {y_len}")
+    }
+    for (dst, &src) in y.iter_mut().zip(a) {
+        *dst = src * src;
+    }
+}
+/// Replaces every element of `y` with the result of `vvtanhf`, using `y` as both the source
+/// and the destination buffer.
+///
+/// # Panics
+/// Panics if `y.len()` does not fit in the `c_int` element count taken by the foreign routine.
+#[inline]
+pub fn vs_tanh_inplace(y: &mut [f32]) {
+    let len = c_int::try_from(y.len()).expect("slice too long for a 32-bit element count");
+    // Derive both pointers from a single mutable borrow. Calling `y.as_ptr()` after
+    // `y.as_mut_ptr()` re-borrows the slice as shared, which is not a sound basis for the
+    // write performed through the destination pointer.
+    let data = y.as_mut_ptr();
+    // SAFETY: `data` is valid for reads and writes of `len` elements for the whole call.
+    unsafe { ffi::vvtanhf(data, data, &len) }
+}
+/// Replaces every element of `y` with the result of `vvtanh`, using `y` as both the source
+/// and the destination buffer.
+///
+/// # Panics
+/// Panics if `y.len()` does not fit in the `c_int` element count taken by the foreign routine.
+#[inline]
+pub fn vd_tanh_inplace(y: &mut [f64]) {
+    let len = c_int::try_from(y.len()).expect("slice too long for a 32-bit element count");
+    // Derive both pointers from a single mutable borrow. Calling `y.as_ptr()` after
+    // `y.as_mut_ptr()` re-borrows the slice as shared, which is not a sound basis for the
+    // write performed through the destination pointer.
+    let data = y.as_mut_ptr();
+    // SAFETY: `data` is valid for reads and writes of `len` elements for the whole call.
+    unsafe { ffi::vvtanh(data, data, &len) }
+}
+/// Replaces every element of `y` with the result of `vvexpf`, using `y` as both the source
+/// and the destination buffer.
+///
+/// # Panics
+/// Panics if `y.len()` does not fit in the `c_int` element count taken by the foreign routine.
+#[inline]
+pub fn vs_exp_inplace(y: &mut [f32]) {
+    let len = c_int::try_from(y.len()).expect("slice too long for a 32-bit element count");
+    // Derive both pointers from a single mutable borrow. Calling `y.as_ptr()` after
+    // `y.as_mut_ptr()` re-borrows the slice as shared, which is not a sound basis for the
+    // write performed through the destination pointer.
+    let data = y.as_mut_ptr();
+    // SAFETY: `data` is valid for reads and writes of `len` elements for the whole call.
+    unsafe { ffi::vvexpf(data, data, &len) }
+}
+/// Replaces every element of `y` with the result of `vvexp`, using `y` as both the source
+/// and the destination buffer.
+///
+/// # Panics
+/// Panics if `y.len()` does not fit in the `c_int` element count taken by the foreign routine.
+#[inline]
+pub fn vd_exp_inplace(y: &mut [f64]) {
+    let len = c_int::try_from(y.len()).expect("slice too long for a 32-bit element count");
+    // Derive both pointers from a single mutable borrow. Calling `y.as_ptr()` after
+    // `y.as_mut_ptr()` re-borrows the slice as shared, which is not a sound basis for the
+    // write performed through the destination pointer.
+    let data = y.as_mut_ptr();
+    // SAFETY: `data` is valid for reads and writes of `len` elements for the whole call.
+    unsafe { ffi::vvexp(data, data, &len) }
+}
+/// Tanh-approximated GELU:
+/// `ys[i] = 0.5 * v * (1 + tanh(sqrt(2 / pi) * v * (1 + 0.044715 * v * v)))` with `v = vs[i]`.
+///
+/// # Panics
+/// Panics if `vs` and `ys` have different lengths.
+#[inline]
+pub fn vs_gelu(vs: &[f32], ys: &mut [f32]) {
+    let vs_len = vs.len();
+    let ys_len = ys.len();
+    // Without this check `zip` stops at the shorter slice while the in-place tanh below still
+    // runs over all of `ys`, leaving unrelated values in the tail.
+    if vs_len != ys_len {
+        panic!("vs and ys have different lengths {vs_len} <> {ys_len}")
+    }
+    let sqrt_2_over_pi = (2.0f32 / std::f32::consts::PI).sqrt();
+    // First pass: the argument of tanh.
+    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
+        *y = sqrt_2_over_pi * v * (1.0 + 0.044715 * v * v)
+    }
+    vs_tanh_inplace(ys);
+    // Second pass: combine the tanh result with the original input.
+    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
+        *y = 0.5 * v * (1.0 + *y)
+    }
+}
+/// Tanh-approximated GELU:
+/// `ys[i] = 0.5 * v * (1 + tanh(sqrt(2 / pi) * v * (1 + 0.044715 * v * v)))` with `v = vs[i]`.
+///
+/// # Panics
+/// Panics if `vs` and `ys` have different lengths.
+#[inline]
+pub fn vd_gelu(vs: &[f64], ys: &mut [f64]) {
+    let vs_len = vs.len();
+    let ys_len = ys.len();
+    // Without this check `zip` stops at the shorter slice while the in-place tanh below still
+    // runs over all of `ys`, leaving unrelated values in the tail.
+    if vs_len != ys_len {
+        panic!("vs and ys have different lengths {vs_len} <> {ys_len}")
+    }
+    let sqrt_2_over_pi = (2.0f64 / std::f64::consts::PI).sqrt();
+    // First pass: the argument of tanh.
+    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
+        *y = sqrt_2_over_pi * v * (1.0 + 0.044715 * v * v)
+    }
+    vd_tanh_inplace(ys);
+    // Second pass: combine the tanh result with the original input.
+    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
+        *y = 0.5 * v * (1.0 + *y)
+    }
+}
+/// SiLU: `ys[i] = v / (1 + exp(-v))` with `v = vs[i]`.
+///
+/// # Panics
+/// Panics if `vs` and `ys` have different lengths.
+#[inline]
+pub fn vs_silu(vs: &[f32], ys: &mut [f32]) {
+    let vs_len = vs.len();
+    let ys_len = ys.len();
+    // Without this check `zip` stops at the shorter slice while the in-place exp below still
+    // runs over all of `ys`, leaving unrelated values in the tail.
+    if vs_len != ys_len {
+        panic!("vs and ys have different lengths {vs_len} <> {ys_len}")
+    }
+    // First pass: the argument of exp.
+    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
+        *y = -v
+    }
+    vs_exp_inplace(ys);
+    // Second pass: divide the input by `1 + exp(-v)`.
+    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
+        *y = v / (1.0 + *y)
+    }
+}
+/// SiLU: `ys[i] = v / (1 + exp(-v))` with `v = vs[i]`.
+///
+/// # Panics
+/// Panics if `vs` and `ys` have different lengths.
+#[inline]
+pub fn vd_silu(vs: &[f64], ys: &mut [f64]) {
+    let vs_len = vs.len();
+    let ys_len = ys.len();
+    // Without this check `zip` stops at the shorter slice while the in-place exp below still
+    // runs over all of `ys`, leaving unrelated values in the tail.
+    if vs_len != ys_len {
+        panic!("vs and ys have different lengths {vs_len} <> {ys_len}")
+    }
+    // First pass: the argument of exp.
+    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
+        *y = -v
+    }
+    vd_exp_inplace(ys);
+    // Second pass: divide the input by `1 + exp(-v)`.
+    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
+        *y = v / (1.0 + *y)
+    }
+}
+// Generates a safe elementwise binary wrapper `fn $fn_name(a, b, y)` around the routine
+// `ffi::$accelerate_name`. The generated function panics unless `a`, `b` and `y` all have the
+// same length, then passes the three buffers with every `c_long` argument set to 1 and the
+// common length as the element count.
+macro_rules! binary_op {
+    ($fn_name:ident, $ty:ty, $accelerate_name:ident) => {
+        #[inline]
+        pub fn $fn_name(a: &[$ty], b: &[$ty], y: &mut [$ty]) {
+            let a_len = a.len();
+            let b_len = b.len();
+            let y_len = y.len();
+            if a_len != y_len || b_len != y_len {
+                panic!(
+                    "{} a,b,y len mismatch {a_len} {b_len} {y_len}",
+                    stringify!($fn_name)
+                );
+            }
+            unsafe {
+                // Weird quirk of accelerate, the rhs comes before the lhs.
+                ffi::$accelerate_name(
+                    b.as_ptr(),
+                    1,
+                    a.as_ptr(),
+                    1,
+                    y.as_mut_ptr(),
+                    1,
+                    a_len as u64,
+                )
+            }
+        }
+    };
+}
+// One wrapper per operation and precision: `vs_*` work on `f32`, `vd_*` on `f64`.
+binary_op!(vs_add, f32, vDSP_vadd);
+binary_op!(vd_add, f64, vDSP_vaddD);
+binary_op!(vs_sub, f32, vDSP_vsub);
+binary_op!(vd_sub, f64, vDSP_vsubD);
+binary_op!(vs_mul, f32, vDSP_vmul);
+binary_op!(vd_mul, f64, vDSP_vmulD);
+binary_op!(vs_div, f32, vDSP_vdiv);
+binary_op!(vd_div, f64, vDSP_vdivD);
+binary_op!(vs_max, f32, vDSP_vmax);
+binary_op!(vd_max, f64, vDSP_vmaxD);
+binary_op!(vs_min, f32, vDSP_vmin);
+binary_op!(vd_min, f64, vDSP_vminD);
--- a/candle-core/src/backend.rs
+++ b/candle-core/src/backend.rs
+use crate::op::{BinaryOpT, CmpOp, ReduceOp, UnaryOpT};
+use crate::{CpuStorage, DType, Layout, Result, Shape};
+/// Storage-level operations that a backend has to implement.
+///
+/// Operand layouts are passed explicitly as `&Layout` arguments next to the storage they
+/// describe. Apart from the two copy methods at the end, which write into a `&mut Self`
+/// argument, the operations return their result as a new `Result<Self>`.
+pub trait BackendStorage: Sized {
+    /// Device type paired with this storage type.
+    type Device: BackendDevice;
+    fn try_clone(&self, _: &Layout) -> Result<Self>;
+    fn dtype(&self) -> DType;
+    fn device(&self) -> &Self::Device;
+    // Maybe this should return a Cow instead so that no copy is done on the cpu case.
+    fn to_cpu_storage(&self) -> Result<CpuStorage>;
+    // Scalar-parameterised elementwise operations.
+    // NOTE(review): the two `f64` arguments of `affine` are presumably (mul, add) -- confirm.
+    fn affine(&self, _: &Layout, _: f64, _: f64) -> Result<Self>;
+    fn powf(&self, _: &Layout, _: f64) -> Result<Self>;
+    fn elu(&self, _: &Layout, _: f64) -> Result<Self>;
+    // NOTE(review): the `&[usize]` is presumably the list of dimensions to reduce -- confirm.
+    fn reduce_op(&self, _: ReduceOp, _: &Layout, _: &[usize]) -> Result<Self>;
+    fn cmp(&self, _: CmpOp, _: &Self, _: &Layout, _: &Layout) -> Result<Self>;
+    fn to_dtype(&self, _: &Layout, _: DType) -> Result<Self>;
+    // Generic elementwise entry points; the operation is selected through the type parameter.
+    fn unary_impl<B: UnaryOpT>(&self, _: &Layout) -> Result<Self>;
+    fn binary_impl<B: BinaryOpT>(&self, _: &Self, _: &Layout, _: &Layout) -> Result<Self>;
+    fn where_cond(&self, _: &Layout, _: &Self, _: &Layout, _: &Self, _: &Layout) -> Result<Self>;
+    // Convolutions: `self` with its layout, the kernel with its layout, and the parameter
+    // struct from `crate::conv` for that convolution flavour.
+    fn conv1d(
+        &self,
+        _l: &Layout,
+        _kernel: &Self,
+        _kernel_l: &Layout,
+        _params: &crate::conv::ParamsConv1D,
+    ) -> Result<Self>;
+    fn conv_transpose1d(
+        &self,
+        _l: &Layout,
+        _kernel: &Self,
+        _kernel_l: &Layout,
+        _params: &crate::conv::ParamsConvTranspose1D,
+    ) -> Result<Self>;
+    fn conv2d(
+        &self,
+        _l: &Layout,
+        _kernel: &Self,
+        _kernel_l: &Layout,
+        _params: &crate::conv::ParamsConv2D,
+    ) -> Result<Self>;
+    fn conv_transpose2d(
+        &self,
+        _l: &Layout,
+        _kernel: &Self,
+        _kernel_l: &Layout,
+        _params: &crate::conv::ParamsConvTranspose2D,
+    ) -> Result<Self>;
+    // Pooling and upsampling.
+    // NOTE(review): the tuple arguments are presumably (kernel size, stride) -- confirm.
+    fn avg_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self>;
+    fn max_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self>;
+    fn upsample_nearest1d(&self, _: &Layout, _: usize) -> Result<Self>;
+    fn upsample_nearest2d(&self, _: &Layout, _: usize, _: usize) -> Result<Self>;
+    // Indexing operations.
+    // NOTE(review): the trailing `usize` is presumably the dimension operated on -- confirm.
+    fn gather(&self, _: &Layout, _: &Self, _: &Layout, _: usize) -> Result<Self>;
+    fn scatter_add(
+        &self,
+        _: &Layout,
+        _: &Self,
+        _: &Layout,
+        _: &Self,
+        _: &Layout,
+        _: usize,
+    ) -> Result<Self>;
+    fn index_select(&self, _: &Self, _: &Layout, _: &Layout, _: usize) -> Result<Self>;
+    fn index_add(
+        &self,
+        _: &Layout,
+        _: &Self,
+        _: &Layout,
+        _: &Self,
+        _: &Layout,
+        _: usize,
+    ) -> Result<Self>;
+    // NOTE(review): the 4-tuple is presumably (batch, m, n, k) -- confirm against the callers.
+    fn matmul(
+        &self,
+        _: &Self,
+        _: (usize, usize, usize, usize),
+        _: &Layout,
+        _: &Layout,
+    ) -> Result<Self>;
+    // Copies from `self` into the `&mut Self` argument.
+    // NOTE(review): the `usize` is presumably the destination offset -- confirm.
+    fn copy_strided_src(&self, _: &mut Self, _: usize, _: &Layout) -> Result<()>;
+    #[allow(clippy::too_many_arguments)]
+    // Similar to cudaMemcpy2D, though values are in elements and not in bytes.
+    fn copy2d(
+        &self,
+        _: &mut Self,
+        _d1: usize,
+        _d2: usize,
+        _src_stride1: usize,
+        _dst_stride1: usize,
+        _src_offset: usize,
+        _dst_offset: usize,
+    ) -> Result<()>;
+}
+/// Device-level operations that a backend has to implement: construction, identity checks,
+/// storage allocation and initialisation, transfer from CPU storage, and random generation.
+pub trait BackendDevice: Sized + std::fmt::Debug + Clone {
+    /// Storage type produced by this device.
+    type Storage: BackendStorage;
+    // TODO: Make the usize generic and part of a generic DeviceLocation.
+    fn new(_: usize) -> Result<Self>;
+    fn location(&self) -> crate::DeviceLocation;
+    fn same_device(&self, _: &Self) -> bool;
+    // Allocation helpers for a given shape and dtype.
+    fn zeros_impl(&self, _shape: &Shape, _dtype: DType) -> Result<Self::Storage>;
+    fn ones_impl(&self, _shape: &Shape, _dtype: DType) -> Result<Self::Storage>;
+    /// # Safety
+    /// This function is unsafe as it doesn't initialize the underlying data store.
+    /// The caller should ensure that the data is properly initialized as early as possible
+    /// after this call.
+    unsafe fn alloc_uninit(&self, _shape: &Shape, _dtype: DType) -> Result<Self::Storage>;
+    // Conversions from CPU storage, by reference and by value.
+    fn storage_from_cpu_storage(&self, _: &CpuStorage) -> Result<Self::Storage>;
+    fn storage_from_cpu_storage_owned(&self, _: CpuStorage) -> Result<Self::Storage>;
+    // NOTE(review): the two `f64` arguments are presumably (lo, up) for the uniform case and
+    // (mean, std) for the normal case -- confirm against the implementations.
+    fn rand_uniform(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<Self::Storage>;
+    fn rand_normal(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<Self::Storage>;
+    fn set_seed(&self, _: u64) -> Result<()>;
+}
--- a/candle-core/src/backprop.rs
+++ b/candle-core/src/backprop.rs
--- a/candle-core/src/conv.rs
+++ b/candle-core/src/conv.rs
+use crate::{op::BackpropOp, op::Op, Error, Result, Tensor};
+/// Sizes and hyper-parameters describing a single 1D convolution call.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct ParamsConv1D {
+    pub(crate) b_size: usize,
+    // Maybe we should have a version without l_in as this bit depends on the input and not only on
+    // the weights.
+    pub(crate) l_in: usize,
+    pub(crate) c_out: usize,
+    pub(crate) c_in: usize,
+    pub(crate) k_size: usize,
+    pub(crate) padding: usize,
+    pub(crate) stride: usize,
+    pub(crate) dilation: usize,
+}
+impl ParamsConv1D {
+    /// Output length: `(l_in + 2 * padding - dilation * (k_size - 1) - 1) / stride + 1`.
+    pub(crate) fn l_out(&self) -> usize {
+        let padded_len = self.l_in + 2 * self.padding;
+        let dilated_extent = self.dilation * (self.k_size - 1);
+        (padded_len - dilated_extent - 1) / self.stride + 1
+    }
+    /// Output dimensions as `[b_size, c_out, l_out]`.
+    pub(crate) fn out_dims(&self) -> Vec<usize> {
+        vec![self.b_size, self.c_out, self.l_out()]
+    }
+}
+/// Sizes and hyper-parameters describing a single 1D transposed convolution call.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct ParamsConvTranspose1D {
+    pub(crate) b_size: usize,
+    pub(crate) l_in: usize,
+    pub(crate) c_out: usize,
+    pub(crate) c_in: usize,
+    pub(crate) k_size: usize,
+    pub(crate) padding: usize,
+    pub(crate) output_padding: usize,
+    pub(crate) stride: usize,
+    pub(crate) dilation: usize,
+}
+impl ParamsConvTranspose1D {
+    /// Output length:
+    /// `(l_in - 1) * stride - 2 * padding + dilation * (k_size - 1) + output_padding + 1`.
+    pub(crate) fn l_out(&self) -> usize {
+        // All additions happen before the padding is subtracted: in `usize` arithmetic,
+        // subtracting `2 * padding` first underflows whenever it exceeds
+        // `(l_in - 1) * stride`, even though the complete expression is non-negative.
+        (self.l_in - 1) * self.stride
+            + self.dilation * (self.k_size - 1)
+            + self.output_padding
+            + 1
+            - 2 * self.padding
+    }
+    /// Output dimensions as `[b_size, c_out, l_out]`.
+    pub(crate) fn out_dims(&self) -> Vec<usize> {
+        let l_out = self.l_out();
+        vec![self.b_size, self.c_out, l_out]
+    }
+}
+/// Forward-convolution algorithm choice, carried by `ParamsConv2D::cudnn_fwd_algo`.
+// NOTE(review): the variant names look like cuDNN's forward-algorithm enum and `Count`
+// looks like its trailing "number of algorithms" entry -- confirm before relying on either.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub enum CudnnFwdAlgo {
+    ImplicitGemm,
+    ImplicitPrecompGemm,
+    Gemm,
+    Direct,
+    Fft,
+    FftTiling,
+    Winograd,
+    WinogradNonFused,
+    Count,
+}
+/// Sizes and hyper-parameters describing a single 2D convolution call.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct ParamsConv2D {
+    pub(crate) b_size: usize,
+    pub(crate) i_h: usize,
+    pub(crate) i_w: usize,
+    pub(crate) k_h: usize,
+    pub(crate) k_w: usize,
+    pub(crate) c_out: usize,
+    pub(crate) c_in: usize,
+    pub(crate) padding: usize,
+    pub(crate) stride: usize,
+    pub(crate) dilation: usize,
+    pub cudnn_fwd_algo: Option<CudnnFwdAlgo>,
+}
+impl ParamsConv2D {
+    /// Output extent along one axis:
+    /// `(input + 2 * padding - dilation * (kernel - 1) - 1) / stride + 1`.
+    fn out_extent(&self, input: usize, kernel: usize) -> usize {
+        (input + 2 * self.padding - self.dilation * (kernel - 1) - 1) / self.stride + 1
+    }
+    /// Output height, derived from `i_h` and `k_h`.
+    pub(crate) fn out_h(&self) -> usize {
+        self.out_extent(self.i_h, self.k_h)
+    }
+    /// Output width, derived from `i_w` and `k_w`.
+    pub(crate) fn out_w(&self) -> usize {
+        self.out_extent(self.i_w, self.k_w)
+    }
+    /// Output dimensions as `[b_size, c_out, out_h, out_w]`.
+    pub(crate) fn out_dims(&self) -> Vec<usize> {
+        let (height, width) = (self.out_h(), self.out_w());
+        vec![self.b_size, self.c_out, height, width]
+    }
+}
+/// Sizes and hyper-parameters describing a single 2D transposed convolution call.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct ParamsConvTranspose2D {
+    pub(crate) b_size: usize,
+    pub(crate) i_h: usize,
+    pub(crate) i_w: usize,
+    pub(crate) k_h: usize,
+    pub(crate) k_w: usize,
+    pub(crate) c_out: usize,
+    pub(crate) c_in: usize,
+    pub(crate) padding: usize,
+    pub(crate) output_padding: usize,
+    pub(crate) stride: usize,
+    pub(crate) dilation: usize,
+}
+impl ParamsConvTranspose2D {
+    /// Output extent along one axis:
+    /// `(input - 1) * stride + dilation * (kernel - 1) + output_padding + 1 - 2 * padding`.
+    fn out_extent(&self, input: usize, kernel: usize) -> usize {
+        (input - 1) * self.stride + self.dilation * (kernel - 1) + self.output_padding + 1
+            - 2 * self.padding
+    }
+    /// Output height, derived from `i_h` and `k_h`.
+    pub(crate) fn out_h(&self) -> usize {
+        self.out_extent(self.i_h, self.k_h)
+    }
+    /// Output width, derived from `i_w` and `k_w`.
+    pub(crate) fn out_w(&self) -> usize {
+        self.out_extent(self.i_w, self.k_w)
+    }
+    /// Output dimensions as `[b_size, c_out, out_h, out_w]`.
+    pub(crate) fn out_dims(&self) -> Vec<usize> {
+        let (height, width) = (self.out_h(), self.out_w());
+        vec![self.b_size, self.c_out, height, width]
+    }
+}
+impl Tensor {
+    /// Runs one (ungrouped) 1D convolution through the storage backend and wraps the result in
+    /// a tensor whose dimensions come from `params.out_dims()`.
+    ///
+    /// The backprop op records the argument, the kernel and the padding, stride and dilation
+    /// taken from `params`.
+    fn conv1d_single_group(&self, kernel: &Self, params: &ParamsConv1D) -> Result<Self> {
+        let storage =
+            self.storage()
+                .conv1d(self.layout(), &kernel.storage(), kernel.layout(), params)?;
+        let op = BackpropOp::new2(self, kernel, |arg, kernel| Op::Conv1D {
+            arg,
+            kernel,
+            padding: params.padding,
+            stride: params.stride,
+            dilation: params.dilation,
+        });
+        let out_dims = params.out_dims();
+        Ok(crate::tensor::from_storage(storage, out_dims, op, false))
+    }
+    /// Applies a 1D convolution over the input tensor.
+    ///
+    /// `self` is read as `(b_size, c_in, l_in)` and `kernel` as `(c_out, c_in / groups, k_size)`.
+    /// With `groups == 1` a single convolution is run. Otherwise the input is chunked along
+    /// dim 1 and the kernel along dim 0, each pair is convolved separately, and the results
+    /// are concatenated along dim 1.
+    ///
+    /// # Errors
+    /// Returns `Error::Conv1dInvalidArgs` when `groups` is zero, when the input channel count
+    /// is not `groups` times the kernel's in-channel count, or when the kernel's out-channel
+    /// count is not divisible by `groups`. Rank errors and backend errors are propagated.
+    pub fn conv1d(
+        &self,
+        kernel: &Self,
+        padding: usize,
+        stride: usize,
+        dilation: usize,
+        groups: usize,
+    ) -> Result<Self> {
+        let (c_out, c_in_k, k_size) = kernel.dims3()?;
+        let (b_size, c_in, l_in) = self.dims3()?;
+        let invalid_args = |msg: &'static str| {
+            Error::Conv1dInvalidArgs {
+                inp_shape: self.shape().clone(),
+                k_shape: kernel.shape().clone(),
+                padding,
+                stride,
+                msg,
+            }
+            .bt()
+        };
+        // `groups` is used as a divisor below, so zero has to be rejected explicitly.
+        if groups == 0 {
+            return Err(invalid_args("the number of groups must be strictly positive"));
+        }
+        if c_in != c_in_k * groups {
+            return Err(invalid_args(
+                "the number of in-channels on the input doesn't match the kernel size",
+            ));
+        }
+        // Without this check `c_out / groups` silently truncates and the per-group parameters
+        // no longer describe the kernel chunks.
+        if c_out % groups != 0 {
+            return Err(invalid_args(
+                "the number of out-channels on the kernel is not divisible by the number of groups",
+            ));
+        }
+        let params = ParamsConv1D {
+            b_size,
+            l_in,
+            c_out: c_out / groups,
+            c_in: c_in / groups,
+            k_size,
+            padding,
+            stride,
+            dilation,
+        };
+        if groups == 1 {
+            self.conv1d_single_group(kernel, &params)
+        } else {
+            let blocks = self.chunk(groups, 1)?;
+            let kernel = kernel.chunk(groups, 0)?;
+            let blocks = blocks
+                .iter()
+                .zip(&kernel)
+                .map(|(block, kernel)| block.conv1d_single_group(kernel, &params))
+                .collect::<Result<Vec<_>>>()?;
+            Tensor::cat(&blocks, 1)
+        }
+    }
+    /// Runs one (ungrouped) 1D transposed convolution through the storage backend and wraps the
+    /// result in a tensor whose dimensions come from `params.out_dims()`.
+    ///
+    /// The backprop op records the argument, the kernel and the padding, output padding,
+    /// stride and dilation taken from `params`.
+    fn conv_transpose1d_single_group(
+        &self,
+        kernel: &Self,
+        params: &ParamsConvTranspose1D,
+    ) -> Result<Self> {
+        let storage = self.storage().conv_transpose1d(
+            self.layout(),
+            &kernel.storage(),
+            kernel.layout(),
+            params,
+        )?;
+        let op = BackpropOp::new2(self, kernel, |arg, kernel| Op::ConvTranspose1D {
+            arg,
+            kernel,
+            padding: params.padding,
+            output_padding: params.output_padding,
+            stride: params.stride,
+            dilation: params.dilation,
+        });
+        let out_dims = params.out_dims();
+        Ok(crate::tensor::from_storage(storage, out_dims, op, false))
+    }
+    /// Applies a 1D transposed convolution over the input tensor.
+    ///
+    /// `self` is read as `(b_size, c_in, l_in)` and `kernel` as `(c_in, c_out, k_size)`. With
+    /// `groups == 1` a single transposed convolution is run. Otherwise the input is chunked
+    /// along dim 1 and the kernel along dim 0, each pair is processed separately, and the
+    /// results are concatenated along dim 1.
+    ///
+    /// # Errors
+    /// Fails when `groups` is zero, when the input and kernel in-channel counts differ, or
+    /// when the in-channel count is not divisible by `groups`. Rank errors and backend errors
+    /// are propagated.
+    pub fn conv_transpose1d(
+        &self,
+        kernel: &Self,
+        padding: usize,
+        output_padding: usize,
+        stride: usize,
+        dilation: usize,
+        groups: usize,
+    ) -> Result<Self> {
+        let (c_in_k, c_out, k_size) = kernel.dims3()?;
+        let (b_size, c_in, l_in) = self.dims3()?;
+        // `groups` is used as a divisor below; `c_in % 0` would panic instead of reporting an
+        // error.
+        if groups == 0 {
+            crate::bail!("the number of groups must be strictly positive")
+        }
+        if c_in != c_in_k {
+            crate::bail!("in_channel mismatch between input ({c_in}) and kernel ({c_in_k})")
+        }
+        if c_in % groups != 0 {
+            crate::bail!("in_channel {c_in} is not divisible by the number of groups")
+        }
+        let params = ParamsConvTranspose1D {
+            b_size,
+            l_in,
+            k_size,
+            c_out,
+            c_in: c_in / groups,
+            padding,
+            output_padding,
+            stride,
+            dilation,
+        };
+        if groups == 1 {
+            self.conv_transpose1d_single_group(kernel, &params)
+        } else {
+            let blocks = self.chunk(groups, 1)?;
+            let kernel = kernel.chunk(groups, 0)?;
+            let blocks = blocks
+                .iter()
+                .zip(&kernel)
+                .map(|(block, kernel)| block.conv_transpose1d_single_group(kernel, &params))
+                .collect::<Result<Vec<_>>>()?;
+            Tensor::cat(&blocks, 1)
+        }
+    }
+    /// Runs one (ungrouped) 2D convolution through the storage backend and wraps the result in
+    /// a tensor whose dimensions come from `params.out_dims()`.
+    ///
+    /// The backprop op records the argument, the kernel and the padding, stride and dilation
+    /// taken from `params`.
+    fn conv2d_single_group(&self, kernel: &Self, params: &ParamsConv2D) -> Result<Self> {
+        let storage =
+            self.storage()
+                .conv2d(self.layout(), &kernel.storage(), kernel.layout(), params)?;
+        let op = BackpropOp::new2(self, kernel, |arg, kernel| Op::Conv2D {
+            arg,
+            kernel,
+            padding: params.padding,
+            stride: params.stride,
+            dilation: params.dilation,
+        });
+        let out_dims = params.out_dims();
+        Ok(crate::tensor::from_storage(storage, out_dims, op, false))
+    }
+    /// Applies a 2D convolution over the input tensor.
+    ///
+    /// `self` is read as `(b_size, c_in, i_h, i_w)` and `kernel` as
+    /// `(c_out, c_in / groups, k_h, k_w)`. With `groups == 1` a single convolution is run.
+    /// Otherwise the input is chunked along dim 1 and the kernel along dim 0, each pair is
+    /// convolved separately, and the results are concatenated along dim 1.
+    ///
+    /// # Errors
+    /// Fails when `groups` is zero, when the input channel count is not `groups` times the
+    /// kernel's in-channel count, or when the kernel's out-channel count is not divisible by
+    /// `groups`. Rank errors and backend errors are propagated.
+    pub fn conv2d(
+        &self,
+        kernel: &Self,
+        padding: usize,
+        stride: usize,
+        dilation: usize,
+        groups: usize,
+    ) -> Result<Self> {
+        let (b_size, c_in, i_h, i_w) = self.dims4()?;
+        let (c_out, c_in_k, k_h, k_w) = kernel.dims4()?;
+        // `groups` is used as a divisor below, so zero has to be rejected explicitly.
+        if groups == 0 {
+            crate::bail!("the number of groups must be strictly positive")
+        }
+        if c_in != c_in_k * groups {
+            crate::bail!(
+                "in_channel mismatch between input ({c_in}, groups {groups}) and kernel ({c_in_k})"
+            )
+        }
+        // Without this check `c_out / groups` silently truncates and the per-group parameters
+        // no longer describe the kernel chunks.
+        if c_out % groups != 0 {
+            crate::bail!("out_channel {c_out} is not divisible by the number of groups ({groups})")
+        }
+        let params = ParamsConv2D {
+            b_size,
+            i_h,
+            i_w,
+            k_h,
+            k_w,
+            c_out: c_out / groups,
+            c_in: c_in / groups,
+            padding,
+            stride,
+            dilation,
+            cudnn_fwd_algo: None,
+        };
+        if groups == 1 {
+            self.conv2d_single_group(kernel, &params)
+        } else {
+            let blocks = self.chunk(groups, 1)?;
+            let kernel = kernel.chunk(groups, 0)?;
+            let blocks = blocks
+                .iter()
+                .zip(&kernel)
+                .map(|(block, kernel)| block.conv2d_single_group(kernel, &params))
+                .collect::<Result<Vec<_>>>()?;
+            Tensor::cat(&blocks, 1)
+        }
+    }
+    /// Applies a 2D transposed convolution over the input tensor.
+    ///
+    /// `self` is read as `(b_size, c_in, i_h, i_w)` and `kernel` as `(c_in, c_out, k_h, k_w)`.
+    /// This method takes no `groups` argument.
+    ///
+    /// # Errors
+    /// Fails when either tensor is not rank 4, when the input and kernel in-channel counts
+    /// differ, or when the storage backend reports an error.
+    pub fn conv_transpose2d(
+        &self,
+        kernel: &Self,
+        padding: usize,
+        output_padding: usize,
+        stride: usize,
+        dilation: usize,
+    ) -> Result<Self> {
+        let (b_size, c_in, i_h, i_w) = self.dims4()?;
+        let (c_in_k, c_out, k_h, k_w) = kernel.dims4()?;
+        if c_in != c_in_k {
+            crate::bail!("in_channel mismatch between input ({c_in}) and kernel ({c_in_k})")
+        }
+        let params = ParamsConvTranspose2D {
+            b_size,
+            i_h,
+            i_w,
+            k_h,
+            k_w,
+            c_out,
+            c_in,
+            padding,
+            output_padding,
+            stride,
+            dilation,
+        };
+        let storage = self.storage().conv_transpose2d(
+            self.layout(),
+            &kernel.storage(),
+            kernel.layout(),
+            &params,
+        )?;
+        // Record the inputs and hyper-parameters for the backward pass.
+        let op = BackpropOp::new2(self, kernel, |arg, kernel| Op::ConvTranspose2D {
+            arg,
+            kernel,
+            padding: params.padding,
+            output_padding: params.output_padding,
+            stride: params.stride,
+            dilation: params.dilation,
+        });
+        let out_dims = params.out_dims();
+        Ok(crate::tensor::from_storage(storage, out_dims, op, false))
+    }
+}
--- a/candle-core/src/convert.rs
+++ b/candle-core/src/convert.rs
+//! Implement conversion traits for tensors
+use crate::{DType, Device, Error, Tensor, WithDType};
+use half::{bf16, f16, slice::HalfFloatSliceExt};
+use std::convert::TryFrom;
+/// Converts a borrowed tensor into a flat `Vec<T>` by delegating to `Tensor::to_vec1`.
+impl<T: WithDType> TryFrom<&Tensor> for Vec<T> {
+    type Error = Error;
+    fn try_from(tensor: &Tensor) -> Result<Self, Self::Error> {
+        tensor.to_vec1::<T>()
+    }
+}
+/// Converts a borrowed tensor into a `Vec<Vec<T>>` by delegating to `Tensor::to_vec2`.
+impl<T: WithDType> TryFrom<&Tensor> for Vec<Vec<T>> {
+    type Error = Error;
+    fn try_from(tensor: &Tensor) -> Result<Self, Self::Error> {
+        tensor.to_vec2::<T>()
+    }
+}
+/// Converts a borrowed tensor into a `Vec<Vec<Vec<T>>>` by delegating to `Tensor::to_vec3`.
+impl<T: WithDType> TryFrom<&Tensor> for Vec<Vec<Vec<T>>> {
+    type Error = Error;
+    fn try_from(tensor: &Tensor) -> Result<Self, Self::Error> {
+        tensor.to_vec3::<T>()
+    }
+}
+/// Owned-tensor variant of the `&Tensor` to `Vec<T>` conversion.
+///
+/// The tensor is taken by value, borrowed for the duration of the conversion
+/// and dropped when this function returns.
+impl<T> TryFrom<Tensor> for Vec<T>
+where
+    T: WithDType,
+{
+    type Error = Error;
+    fn try_from(tensor: Tensor) -> Result<Vec<T>, Error> {
+        <Vec<T> as TryFrom<&Tensor>>::try_from(&tensor)
+    }
+}
+/// Owned-tensor variant of the `&Tensor` to `Vec<Vec<T>>` conversion.
+///
+/// The tensor is taken by value, borrowed for the duration of the conversion
+/// and dropped when this function returns.
+impl<T> TryFrom<Tensor> for Vec<Vec<T>>
+where
+    T: WithDType,
+{
+    type Error = Error;
+    fn try_from(tensor: Tensor) -> Result<Vec<Vec<T>>, Error> {
+        <Vec<Vec<T>> as TryFrom<&Tensor>>::try_from(&tensor)
+    }
+}
+/// Owned-tensor variant of the `&Tensor` to `Vec<Vec<Vec<T>>>` conversion.
+///
+/// The tensor is taken by value, borrowed for the duration of the conversion
+/// and dropped when this function returns.
+impl<T> TryFrom<Tensor> for Vec<Vec<Vec<T>>>
+where
+    T: WithDType,
+{
+    type Error = Error;
+    fn try_from(tensor: Tensor) -> Result<Vec<Vec<Vec<T>>>, Error> {
+        <Vec<Vec<Vec<T>>> as TryFrom<&Tensor>>::try_from(&tensor)
+    }
+}
+/// Builds a CPU tensor from a slice.
+///
+/// The data is handed to [`Tensor::from_slice`] with the slice length as the
+/// shape argument and [`Device::Cpu`] as the target device.
+impl<T> TryFrom<&[T]> for Tensor
+where
+    T: WithDType,
+{
+    type Error = Error;
+    fn try_from(v: &[T]) -> Result<Tensor, Error> {
+        let n_elems = v.len();
+        Self::from_slice(v, n_elems, &Device::Cpu)
+    }
+}
+impl<T: WithDType> TryFrom<Vec<T>> for Tensor {
+    type Error = Error;
+    fn try_from(v: Vec<T>) -> Result<Self, Self::Error> {
+        let len = v.len();
+        Tensor::from_vec(v, len, &Device::Cpu)
+    }
+}
+/// Generates the scalar conversions for every listed primitive type:
+///
+/// * `TryFrom<&Tensor>` and `TryFrom<Tensor>` for the scalar, both going
+///   through `Tensor::to_scalar`;
+/// * `TryFrom<scalar> for Tensor`, which calls `Tensor::new` on `Device::Cpu`.
+///
+/// Accepts one or more comma-separated type identifiers.
+macro_rules! from_tensor {
+    ($($typ:ident),+ $(,)?) => {
+        $(
+            impl TryFrom<&Tensor> for $typ {
+                type Error = Error;
+                fn try_from(tensor: &Tensor) -> Result<$typ, Error> {
+                    Tensor::to_scalar::<$typ>(tensor)
+                }
+            }
+            impl TryFrom<Tensor> for $typ {
+                type Error = Error;
+                fn try_from(tensor: Tensor) -> Result<$typ, Error> {
+                    <$typ as TryFrom<&Tensor>>::try_from(&tensor)
+                }
+            }
+            impl TryFrom<$typ> for Tensor {
+                type Error = Error;
+                fn try_from(v: $typ) -> Result<Tensor, Error> {
+                    Self::new(v, &Device::Cpu)
+                }
+            }
+        )+
+    };
+}
+from_tensor!(f64, f32, f16, bf16, i64, u32, u8);
+impl Tensor {
+    /// Writes every element of the tensor to `f` as raw bytes.
+    ///
+    /// The tensor is flattened with `flatten_all` and copied into a `Vec`
+    /// through `to_vec1`; the elements are then emitted one after another in
+    /// little-endian byte order. `bf16`/`f16` values are written as their
+    /// `u16` reinterpretation, and `u8` data goes out in one `write_all` call.
+    ///
+    /// # Errors
+    ///
+    /// Returns any error produced while flattening, while copying the data
+    /// into a `Vec`, or by the writer `f`.
+    pub fn write_bytes<W: std::io::Write>(&self, f: &mut W) -> crate::Result<()> {
+        use byteorder::{LittleEndian, WriteBytesExt};
+        let flat = self.flatten_all()?;
+        match self.dtype() {
+            DType::U8 => {
+                let bytes = flat.to_vec1::<u8>()?;
+                f.write_all(&bytes)?;
+            }
+            DType::U32 => {
+                let values = flat.to_vec1::<u32>()?;
+                values
+                    .iter()
+                    .try_for_each(|&v| f.write_u32::<LittleEndian>(v))?;
+            }
+            DType::I64 => {
+                let values = flat.to_vec1::<i64>()?;
+                values
+                    .iter()
+                    .try_for_each(|&v| f.write_i64::<LittleEndian>(v))?;
+            }
+            DType::F16 => {
+                let values = flat.to_vec1::<f16>()?;
+                // Half floats are serialized through their u16 view.
+                let bits: &[u16] = values.reinterpret_cast();
+                bits.iter()
+                    .try_for_each(|&b| f.write_u16::<LittleEndian>(b))?;
+            }
+            DType::BF16 => {
+                let values = flat.to_vec1::<bf16>()?;
+                // Half floats are serialized through their u16 view.
+                let bits: &[u16] = values.reinterpret_cast();
+                bits.iter()
+                    .try_for_each(|&b| f.write_u16::<LittleEndian>(b))?;
+            }
+            DType::F32 => {
+                // TODO: Avoid using a buffer when data is already on the CPU.
+                let values = flat.to_vec1::<f32>()?;
+                values
+                    .iter()
+                    .try_for_each(|&v| f.write_f32::<LittleEndian>(v))?;
+            }
+            DType::F64 => {
+                let values = flat.to_vec1::<f64>()?;
+                values
+                    .iter()
+                    .try_for_each(|&v| f.write_f64::<LittleEndian>(v))?;
+            }
+        }
+        Ok(())
+    }
+}
--- a/candle-core/src/cpu/avx.rs
+++ b/candle-core/src/cpu/avx.rs
+use super::{Cpu, CpuF16};
+#[cfg(target_arch = "x86")]
+use core::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::*;
+use half::f16;
+/// Marker type carrying the AVX implementation of the `Cpu` trait for `f32`.
+pub struct CurrentCpu {}
+/// Exposed as `Cpu::STEP`; presumably the number of `f32` values handled per
+/// unrolled iteration (confirm against the trait definition).
+const STEP: usize = 32;
+/// Exposed as `Cpu::EPR`; presumably "elements per register" (a `__m256`
+/// holds eight `f32` lanes).
+const EPR: usize = 8;
+/// Length of the register array `[__m256; ARR]` used as `Cpu::Array`.
+const ARR: usize = STEP / EPR;
+impl Cpu<ARR> for CurrentCpu {
+    type Unit = __m256;
+    type Array = [__m256; ARR];
+    const STEP: usize = STEP;
+    const EPR: usize = EPR;
+    /// Number of registers in `Self::Array`.
+    fn n() -> usize {
+        ARR
+    }
+    /// Returns a register with every lane set to `0.0`.
+    unsafe fn zero() -> __m256 {
+        _mm256_setzero_ps()
+    }
+    /// Returns an array of `ARR` zeroed registers.
+    unsafe fn zero_array() -> [__m256; ARR] {
+        let zeroed = Self::zero();
+        [zeroed; ARR]
+    }
+    /// Broadcasts `v` into every lane of a register.
+    unsafe fn from_f32(v: f32) -> __m256 {
+        _mm256_set1_ps(v)
+    }
+    /// Unaligned load of eight `f32` values starting at `mem_addr`.
+    unsafe fn load(mem_addr: *const f32) -> __m256 {
+        _mm256_loadu_ps(mem_addr)
+    }
+    /// Lane-wise `a + b`.
+    unsafe fn vec_add(a: __m256, b: __m256) -> __m256 {
+        _mm256_add_ps(a, b)
+    }
+    /// Lane-wise `b * c + a`, computed as a separate multiply and add.
+    unsafe fn vec_fma(a: __m256, b: __m256, c: __m256) -> __m256 {
+        let product = _mm256_mul_ps(b, c);
+        _mm256_add_ps(product, a)
+    }
+    /// Unaligned store of the eight lanes of `a` to `mem_addr`.
+    unsafe fn vec_store(mem_addr: *mut f32, a: __m256) {
+        _mm256_storeu_ps(mem_addr, a);
+    }
+    /// Sums every lane of every register in `x` and writes the total to `*y`.
+    unsafe fn vec_reduce(mut x: Self::Array, y: *mut f32) {
+        // Tree reduction over the register array: neighbours at distance 1,
+        // then 2, then 4 are folded into the lower index, leaving the sum of
+        // all registers in `x[0]`. Strides larger than `ARR` run zero times.
+        for stride in [2usize, 4, 8] {
+            let half = stride / 2;
+            for block in 0..ARR / stride {
+                let base = stride * block;
+                x[base] = _mm256_add_ps(x[base], x[base + half]);
+            }
+        }
+        // Horizontal sum of the eight lanes of `x[0]`: add the upper 128 bits
+        // onto the lower 128 bits, then two horizontal adds.
+        let halves = _mm_add_ps(_mm256_castps256_ps128(x[0]), _mm256_extractf128_ps(x[0], 1));
+        let pairs = _mm_hadd_ps(halves, halves);
+        *y = _mm_cvtss_f32(_mm_hadd_ps(pairs, pairs));
+    }
+}
+/// Marker type carrying the AVX implementation of the `CpuF16` trait.
+///
+/// Memory holds `f16` values while the arithmetic is carried out on `__m256`
+/// registers of eight `f32` lanes.
+pub struct CurrentCpuF16 {}
+impl CpuF16<ARR> for CurrentCpuF16 {
+    type Unit = __m256;
+    type Array = [__m256; ARR];
+    const STEP: usize = STEP;
+    const EPR: usize = EPR;
+    /// Number of registers in `Self::Array`.
+    fn n() -> usize {
+        ARR
+    }
+    /// Returns a register with every lane set to `0.0`.
+    unsafe fn zero() -> __m256 {
+        _mm256_setzero_ps()
+    }
+    /// Returns an array of `ARR` zeroed registers.
+    unsafe fn zero_array() -> [__m256; ARR] {
+        let zeroed = Self::zero();
+        [zeroed; ARR]
+    }
+    /// Broadcasts `v` into every lane of a register.
+    unsafe fn from_f32(v: f32) -> __m256 {
+        _mm256_set1_ps(v)
+    }
+    /// Loads eight `f16` values (128 bits, unaligned) and widens them to `f32`
+    /// with the F16C conversion instruction.
+    #[cfg(target_feature = "f16c")]
+    unsafe fn load(mem_addr: *const f16) -> __m256 {
+        let packed = _mm_loadu_si128(mem_addr as *const __m128i);
+        _mm256_cvtph_ps(packed)
+    }
+    /// Fallback without F16C: converts eight `f16` values one at a time via
+    /// `f16::to_f32`, then loads the temporary buffer.
+    #[cfg(not(target_feature = "f16c"))]
+    unsafe fn load(mem_addr: *const f16) -> __m256 {
+        let mut lanes = [0.0f32; 8];
+        for (i, lane) in lanes.iter_mut().enumerate() {
+            *lane = (*mem_addr.add(i)).to_f32();
+        }
+        _mm256_loadu_ps(lanes.as_ptr())
+    }
+    /// Lane-wise `a + b`.
+    unsafe fn vec_add(a: __m256, b: __m256) -> __m256 {
+        _mm256_add_ps(a, b)
+    }
+    /// Lane-wise `b * c + a`, computed as a separate multiply and add.
+    unsafe fn vec_fma(a: __m256, b: __m256, c: __m256) -> __m256 {
+        let product = _mm256_mul_ps(b, c);
+        _mm256_add_ps(product, a)
+    }
+    /// Narrows the eight `f32` lanes of `a` to `f16` (rounding-control
+    /// immediate `0`) and stores the 128-bit result unaligned at `mem_addr`.
+    #[cfg(target_feature = "f16c")]
+    unsafe fn vec_store(mem_addr: *mut f16, a: __m256) {
+        let packed = _mm256_cvtps_ph(a, 0);
+        _mm_storeu_si128(mem_addr as *mut __m128i, packed)
+    }
+    /// Fallback without F16C: spills `a` to a temporary buffer and converts
+    /// the eight lanes one at a time via `f16::from_f32`.
+    #[cfg(not(target_feature = "f16c"))]
+    unsafe fn vec_store(mem_addr: *mut f16, a: __m256) {
+        let mut lanes = [0.0f32; 8];
+        _mm256_storeu_ps(lanes.as_mut_ptr(), a);
+        for (i, lane) in lanes.iter().enumerate() {
+            *mem_addr.add(i) = f16::from_f32(*lane);
+        }
+    }
+    /// Sums every lane of every register in `x` and writes the total to `*y`.
+    unsafe fn vec_reduce(mut x: Self::Array, y: *mut f32) {
+        // Three halving passes: the upper half of the active prefix is folded
+        // onto the lower half, so `x[0]` ends up holding the sum of all
+        // registers. Once `offset` reaches zero a pass does nothing.
+        let mut offset = ARR >> 1;
+        for _pass in 0..3 {
+            for i in 0..offset {
+                x[i] = _mm256_add_ps(x[i], x[offset + i]);
+            }
+            offset >>= 1;
+        }
+        // Horizontal sum of the eight lanes of `x[0]`: add the upper 128 bits
+        // onto the lower 128 bits, then two horizontal adds.
+        let halves = _mm_add_ps(_mm256_castps256_ps128(x[0]), _mm256_extractf128_ps(x[0], 1));
+        let pairs = _mm_hadd_ps(halves, halves);
+        *y = _mm_cvtss_f32(_mm_hadd_ps(pairs, pairs));
+    }
+}
--- a/candle-core/src/cpu/erf.rs
+++ b/candle-core/src/cpu/erf.rs
--- a/candle-core/src/cpu/kernels.rs
+++ b/candle-core/src/cpu/kernels.rs
--- a/candle-core/src/cpu/mod.rs
+++ b/candle-core/src/cpu/mod.rs
--- a/candle-core/src/cpu/neon.rs
+++ b/candle-core/src/cpu/neon.rs
--- a/candle-core/src/cpu/simd128.rs
+++ b/candle-core/src/cpu/simd128.rs
+use super::Cpu;
+use core::arch::wasm32::*;
+/// Marker type carrying the WebAssembly SIMD128 implementation of the `Cpu`
+/// trait for `f32`.
+pub struct CurrentCpu {}
+/// Exposed as `Cpu::STEP`; presumably the number of `f32` values handled per
+/// unrolled iteration (confirm against the trait definition).
+const STEP: usize = 16;
+/// Exposed as `Cpu::EPR`; presumably "elements per register" (a `v128` is
+/// used here as four `f32` lanes).
+const EPR: usize = 4;
+/// Length of the register array `[v128; ARR]` used as `Cpu::Array`.
+const ARR: usize = STEP / EPR;
+impl Cpu<ARR> for CurrentCpu {
+    type Unit = v128;
+    type Array = [v128; ARR];
+    const STEP: usize = STEP;
+    const EPR: usize = EPR;
+    /// Number of registers in `Self::Array`.
+    fn n() -> usize {
+        ARR
+    }
+    /// Returns a register with every lane set to `0.0`.
+    unsafe fn zero() -> Self::Unit {
+        f32x4_splat(0.0)
+    }
+    /// Returns an array of `ARR` zeroed registers.
+    unsafe fn zero_array() -> Self::Array {
+        [Self::zero(); ARR]
+    }
+    /// Broadcasts `v` into every lane of a register.
+    unsafe fn from_f32(v: f32) -> Self::Unit {
+        f32x4_splat(v)
+    }
+    /// Loads four `f32` values starting at `mem_addr`.
+    unsafe fn load(mem_addr: *const f32) -> Self::Unit {
+        // `v128_load` only reads, so keep the pointer const instead of
+        // casting away constness.
+        v128_load(mem_addr as *const v128)
+    }
+    /// Lane-wise `a + b`.
+    unsafe fn vec_add(a: Self::Unit, b: Self::Unit) -> Self::Unit {
+        f32x4_add(a, b)
+    }
+    /// Lane-wise `b * c + a`, computed as a separate multiply and add.
+    unsafe fn vec_fma(a: Self::Unit, b: Self::Unit, c: Self::Unit) -> Self::Unit {
+        f32x4_add(f32x4_mul(b, c), a)
+    }
+    /// Stores the four lanes of `a` to `mem_addr`.
+    unsafe fn vec_store(mem_addr: *mut f32, a: Self::Unit) {
+        v128_store(mem_addr as *mut v128, a);
+    }
+    /// Sums every lane of every register in `x` and writes the total to `*y`.
+    unsafe fn vec_reduce(mut x: Self::Array, y: *mut f32) {
+        // Tree reduction over the register array: neighbours at distance 1,
+        // then 2, then 4 are folded into the lower index, leaving the sum of
+        // all registers in `x[0]`.
+        for i in 0..ARR / 2 {
+            x[2 * i] = f32x4_add(x[2 * i], x[2 * i + 1]);
+        }
+        for i in 0..ARR / 4 {
+            x[4 * i] = f32x4_add(x[4 * i], x[4 * i + 2]);
+        }
+        // With the current `ARR` this range is empty; the loop is kept so the
+        // reduction stays correct if `STEP` is increased, matching the AVX
+        // implementation.
+        #[allow(clippy::reversed_empty_ranges)]
+        for i in 0..ARR / 8 {
+            x[8 * i] = f32x4_add(x[8 * i], x[8 * i + 4]);
+        }
+        // Horizontal sum of the four lanes of `x[0]`, in lane order.
+        *y = f32x4_extract_lane::<0>(x[0])
+            + f32x4_extract_lane::<1>(x[0])
+            + f32x4_extract_lane::<2>(x[0])
+            + f32x4_extract_lane::<3>(x[0]);
+    }
+}
--- a/candle-core/src/cpu_backend/mod.rs
+++ b/candle-core/src/cpu_backend/mod.rs
--- a/candle-core/src/cpu_backend/utils.rs
+++ b/candle-core/src/cpu_backend/utils.rs
--- a/candle-core/src/cuda_backend/cudnn.rs
+++ b/candle-core/src/cuda_backend/cudnn.rs
--- a/candle-core/src/cuda_backend/device.rs
+++ b/candle-core/src/cuda_backend/device.rs
--- a/candle-core/src/cuda_backend/error.rs
+++ b/candle-core/src/cuda_backend/error.rs
--- a/candle-core/src/cuda_backend/mod.rs
+++ b/candle-core/src/cuda_backend/mod.rs
--- a/candle-core/src/cuda_backend/utils.rs
+++ b/candle-core/src/cuda_backend/utils.rs
--- a/candle-core/src/custom_op.rs
+++ b/candle-core/src/custom_op.rs