Commit 3cbd6a82 authored by zhanggzh's avatar zhanggzh
Browse files

update readme

parent 4a9906cb
# <div align="center"><strong>SparseConvNet</strong></div>
## 简介
SparseConvNet 是用于训练 Submanifold(子流形)稀疏卷积网络的 PyTorch 库。
## 安装
源码编译安装,该方式需要安装torch及fastpt工具包;注意使用fastpt包进行源码编译安装时,要匹配fastpt、torch、dtk之间的版本号,例如基于dtk2504编译,则fastpt、torch都必须是dtk2504的包,其中fastpt与torch对应的版本号关系为
| | fastpt版本 | torch版本 | DTK版本 |
| - | -------- | ------- | ------------ |
| 1 | 2.0.1+das.dtk2504 | v2.4.1 | dtk2504|
| 2 | 2.1.0+das.dtk2504 | v2.5.1 | dtk2504|
| 3 | 2.0.1+das.dtk25041 | v2.4.1 | dtk25041|
| 4 | 2.1.0+das.dtk25041 | v2.5.1 | dtk25041|
## 编译流程
```
pip3 install wheel
pip3 install fastpt-2.0.1+das.dtk2504-py3-none-any.whl # 以torch2.4.1,dtk2504为例
git clone https://developer.sourcefind.cn/codes/OpenDAS/sparseconvnet.git
git checkout v0.2-fastpt #切换到相应分支
cd sparseconvnet
source /usr/local/bin/fastpt -c
python3 setup.py bdist_wheel
```
## 验证安装
组件支持组合
| PyTorch版本 | fastpt版本 |sparseconvnet版本 | DTK版本 | Python版本 | 推荐编译方式 |
| ----------- | ----------- | ----------- | ------------------------ | -----------------| ------------ |
| 2.5.1 | 2.1.0 |0.2 | >= 25.04 | 3.8、3.10、3.11 | fastpt不转码 |
| 2.4.1 | 2.0.1 |0.2 | >= 25.04 | 3.8、3.10、3.11 | fastpt不转码 |
+ pytorch版本大于2.4.1 && dtk版本大于25.04 推荐使用fastpt不转码编译。
### 1、使用pip方式安装
sparseconvnet whl包下载目录:[光合开发者社区](https://download.sourcefind.cn:65024/4/main),选择对应的pytorch版本和python版本下载对应sparseconvnet的whl包
```shell
pip install torch* (下载torch的whl包)
pip install fastpt* --no-deps (下载fastpt的whl包)
source /usr/local/bin/fastpt -E
pip install sparseconvnet* (下载的sparseconvnet的whl包)
```
source /usr/local/bin/fastpt -e
pip3 list | grep sparseconvnet
### 2、使用源码编译方式安装
#### 编译环境准备
提供基于fastpt不转码编译:
1. 基于光源pytorch基础镜像环境:镜像下载地址:[光合开发者社区](https://sourcefind.cn/#/image/dcu/pytorch),根据pytorch、python、dtk及系统下载对应的镜像版本。
2. 基于现有python环境:安装pytorch,fastpt whl包下载目录:[光合开发者社区](https://sourcefind.cn/#/image/dcu/pytorch),根据python、dtk版本,下载对应pytorch的whl包。安装命令如下:
```shell
pip install torch* (下载torch的whl包)
pip install fastpt* --no-deps (下载fastpt的whl包, 安装顺序,先安装torch,后安装fastpt)
pip install pytest
pip install wheel
```
## 测试
#### 源码编译安装
- 代码下载
```shell
git clone http://developer.sourcefind.cn/codes/OpenDAS/sparseconvnet.git # 根据编译需要切换分支
```
source /usr/local/bin/fastpt -e
cd examples
python3 hello-world.py
- 提供2种源码编译方式(进入sparseconvnet目录):
```
1. 设置不转码编译环境变量
source /usr/local/bin/fastpt -C
2. 编译whl包并安装
python3 setup.py -v bdist_wheel
pip install dist/sparseconvnet*
3. 源码编译安装
sh develop.sh
```
#### 注意事项
+ 若使用pip install下载安装过慢,可添加pypi清华源:-i https://pypi.tuna.tsinghua.edu.cn/simple/
+ ROCM_PATH为dtk的路径,默认为/opt/dtk
+ 在pytorch2.5.1环境下编译需要支持c++17语法,打开setup.py文件,把文件中的 -std=c++14 修改为 -std=c++17
## 验证
```shell
python3
Python 3.10.12 (main, Feb 4 2025, 14:57:36) [GCC 11.4.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import sparseconvnet
>>> sparseconvnet.__version__
'0.2'
>>>
```
版本号与官方版本同步,查询该软件的版本号,例如0.2.
## Known Issue
-
## 参考资料
- [README_ORIGIN](README_ORIGIN.md)
- [README_zh-CN](README_zh-CN.md)
- [https://github.com/facebookresearch/SparseConvNet](https://github.com/facebookresearch/SparseConvNet)
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
#define ENABLE_OPENMP YES
#if defined(ENABLE_OPENMP)
#include <omp.h>
#endif
#include <hip/hip_runtime.h>
#include <hip/hip_runtime.h>
#include <torch/extension.h>
#include "Metadata/Metadata.cpp"
// Explicitly instantiate the spatial-metadata bookkeeping class for every
// dimensionality this extension exposes (1D through 6D), so the definitions
// pulled in from Metadata/Metadata.cpp are emitted in this translation unit.
template class Metadata<1>;
template class Metadata<2>;
template class Metadata<3>;
template class Metadata<4>;
template class Metadata<5>;
template class Metadata<6>;
#include "CPU/ActivePooling.cpp"
#include "CPU/AffineReluTrivialConvolution.cpp"
#include "CPU/AveragePooling.cpp"
#include "CPU/BatchNormalization.cpp"
#include "CPU/BatchwiseMultiplicativeDropout.cpp"
#include "CPU/Convolution.cpp"
#include "CPU/Deconvolution.cpp"
#include "CPU/IOLayers.cpp"
#include "CPU/LeakyReLU.cpp"
#include "CPU/MaxPooling.cpp"
#include "CPU/NetworkInNetwork.cpp"
#include "CPU/SparseToDense.cpp"
#include "CPU/UnPooling.cpp"
#include "HIP/ActivePooling.cpp"
#include "HIP/AffineReluTrivialConvolution.cpp"
#include "HIP/AveragePooling.cpp"
#include "HIP/BatchNormalization.cpp"
#include "HIP/BatchwiseMultiplicativeDropout.cpp"
#include "HIP/Convolution.cpp"
#include "HIP/Deconvolution.cpp"
#include "HIP/IOLayers.cpp"
#include "HIP/LeakyReLU.cpp"
#include "HIP/MaxPooling.cpp"
#include "HIP/NetworkInNetwork.cpp"
#include "HIP/SparseToDense.cpp"
#include "HIP/UnPooling.cpp"
// Forward pass of AffineReluTrivialConvolution. Routes to the HIP backend
// when the input features live on a CUDA/HIP device, otherwise to the CPU
// backend; returns the backend's double result unchanged.
double AffineReluTrivialConvolution_updateOutput(at::Tensor &input_features,
                                                 at::Tensor &output_features,
                                                 at::Tensor &affineWeight,
                                                 at::Tensor &affineBias,
                                                 at::Tensor &convWeight) {
  if (input_features.device().type() != torch::kCUDA)
    return cpu_AffineReluTrivialConvolution_updateOutput<float>(
        input_features, output_features, affineWeight, affineBias, convWeight);
  return cuda_AffineReluTrivialConvolution_updateOutput<float>(
      input_features, output_features, affineWeight, affineBias, convWeight);
}
// Backward pass of AffineReluTrivialConvolution; the gradient tensor's
// device selects the CPU or HIP backend.
void AffineReluTrivialConvolution_backward(
    at::Tensor &input_features, at::Tensor &d_input_features,
    at::Tensor &d_output_features, at::Tensor &affineWeight,
    at::Tensor &d_affineWeight, at::Tensor &affineBias,
    at::Tensor &d_affineBias, at::Tensor &convWeight, at::Tensor &d_convWeight,
    bool additiveGrad) {
  if (d_output_features.device().type() != torch::kCUDA) {
    cpu_AffineReluTrivialConvolution_backward<float>(
        input_features, d_input_features, d_output_features, affineWeight,
        d_affineWeight, affineBias, d_affineBias, convWeight, d_convWeight,
        additiveGrad);
  } else {
    cuda_AffineReluTrivialConvolution_backward<float>(
        input_features, d_input_features, d_output_features, affineWeight,
        d_affineWeight, affineBias, d_affineBias, convWeight, d_convWeight,
        additiveGrad);
  }
}
// Batch-norm forward: dispatches on the input tensor's device to the CPU
// or HIP backend.
void BatchNormalization_updateOutput(
    at::Tensor &input_features, at::Tensor &output_features,
    at::Tensor &saveMean, at::Tensor &saveInvStd, at::Tensor &runningMean,
    at::Tensor &runningVar, at::Tensor &weight, at::Tensor &bias, double eps,
    double momentum, bool train, double leakiness) {
  if (input_features.device().type() != torch::kCUDA) {
    cpu_BatchNormalization_updateOutput<float>(
        input_features, output_features, saveMean, saveInvStd, runningMean,
        runningVar, weight, bias, eps, momentum, train, leakiness);
  } else {
    cuda_BatchNormalization_updateOutput<float>(
        input_features, output_features, saveMean, saveInvStd, runningMean,
        runningVar, weight, bias, eps, momentum, train, leakiness);
  }
}
// Batch-norm backward: the gradient tensor's device selects the backend.
void BatchNormalization_backward(
    at::Tensor &input_features, at::Tensor &d_input_features,
    at::Tensor &output_features, at::Tensor &d_output_features,
    at::Tensor &saveMean, at::Tensor &saveInvStd, at::Tensor &runningMean,
    at::Tensor &runningVar, at::Tensor &weight, at::Tensor &bias,
    at::Tensor &d_weight, at::Tensor &d_bias, double leakiness) {
  if (d_output_features.device().type() != torch::kCUDA) {
    cpu_BatchNormalization_backward<float>(
        input_features, d_input_features, output_features, d_output_features,
        saveMean, saveInvStd, runningMean, runningVar, weight, bias, d_weight,
        d_bias, leakiness);
  } else {
    cuda_BatchNormalization_backward<float>(
        input_features, d_input_features, output_features, d_output_features,
        saveMean, saveInvStd, runningMean, runningVar, weight, bias, d_weight,
        d_bias, leakiness);
  }
}
// Batchwise multiplicative dropout forward, dispatched by input device.
void BatchwiseMultiplicativeDropout_updateOutput(at::Tensor &input_features,
                                                 at::Tensor &output_features,
                                                 at::Tensor &noise,
                                                 double alpha) {
  if (input_features.device().type() != torch::kCUDA) {
    cpu_BatchwiseMultiplicativeDropout_updateOutput<float>(
        input_features, output_features, noise, alpha);
  } else {
    cuda_BatchwiseMultiplicativeDropout_updateOutput<float>(
        input_features, output_features, noise, alpha);
  }
}
// Batchwise multiplicative dropout backward, dispatched by gradient device.
void BatchwiseMultiplicativeDropout_updateGradInput(
    at::Tensor &input_features, at::Tensor &d_input_features,
    at::Tensor &d_output_features, at::Tensor &noise, double alpha) {
  if (d_output_features.device().type() != torch::kCUDA) {
    cpu_BatchwiseMultiplicativeDropout_updateGradInput<float>(
        input_features, d_input_features, d_output_features, noise, alpha);
  } else {
    cuda_BatchwiseMultiplicativeDropout_updateGradInput<float>(
        input_features, d_input_features, d_output_features, noise, alpha);
  }
}
// LeakyReLU forward, dispatched by input device.
void LeakyReLU_updateOutput(at::Tensor &input_features,
                            at::Tensor &output_features, double alpha) {
  if (input_features.device().type() != torch::kCUDA) {
    cpu_LeakyReLU_updateOutput<float>(input_features, output_features, alpha);
  } else {
    cuda_LeakyReLU_updateOutput<float>(input_features, output_features, alpha);
  }
}
// LeakyReLU backward, dispatched by gradient device.
void LeakyReLU_updateGradInput(at::Tensor &input_features,
                               at::Tensor &d_input_features,
                               at::Tensor &d_output_features, double alpha) {
  if (d_output_features.device().type() != torch::kCUDA) {
    cpu_LeakyReLU_updateGradInput<float>(input_features, d_input_features,
                                         d_output_features, alpha);
  } else {
    cuda_LeakyReLU_updateGradInput<float>(input_features, d_input_features,
                                          d_output_features, alpha);
  }
}
// NetworkInNetwork (1x1-style linear layer) forward; returns the backend's
// double result. Dispatched by input device.
double NetworkInNetwork_updateOutput(at::Tensor &input_features,
                                     at::Tensor &output_features,
                                     at::Tensor &weight, at::Tensor &bias) {
  if (input_features.device().type() != torch::kCUDA)
    return cpu_NetworkInNetwork_updateOutput<float>(
        input_features, output_features, weight, bias);
  return cuda_NetworkInNetwork_updateOutput<float>(
      input_features, output_features, weight, bias);
}
// NetworkInNetwork input-gradient pass, dispatched by gradient device.
void NetworkInNetwork_updateGradInput(at::Tensor &d_input_features,
                                      at::Tensor &d_output_features,
                                      at::Tensor &weight) {
  if (d_output_features.device().type() != torch::kCUDA) {
    cpu_NetworkInNetwork_updateGradInput<float>(d_input_features,
                                                d_output_features, weight);
  } else {
    cuda_NetworkInNetwork_updateGradInput<float>(d_input_features,
                                                 d_output_features, weight);
  }
}
// NetworkInNetwork parameter-gradient accumulation, dispatched by gradient
// device.
void NetworkInNetwork_accGradParameters(at::Tensor &input_features,
                                        at::Tensor &d_output_features,
                                        at::Tensor &d_weight,
                                        at::Tensor &d_bias) {
  if (d_output_features.device().type() != torch::kCUDA) {
    cpu_NetworkInNetwork_accGradParameters<float>(
        input_features, d_output_features, d_weight, d_bias);
  } else {
    cuda_NetworkInNetwork_accGradParameters<float>(
        input_features, d_output_features, d_weight, d_bias);
  }
}
// ActivePooling forward for a given spatial Dimension, dispatched by input
// device.
template <Int Dimension>
void ActivePooling_updateOutput(at::Tensor &inputSize, Metadata<Dimension> &m,
                                at::Tensor &input_features,
                                at::Tensor &output_features, bool average) {
  if (input_features.device().type() != torch::kCUDA) {
    cpu_ActivePooling_updateOutput<float, Dimension>(
        inputSize, m, input_features, output_features, average);
  } else {
    cuda_ActivePooling_updateOutput<float, Dimension>(
        inputSize, m, input_features, output_features, average);
  }
}
// ActivePooling backward, dispatched by gradient device. (The original
// forwarded the backends' void result with `return`; plain calls are
// equivalent.)
template <Int Dimension>
void ActivePooling_updateGradInput(
    at::Tensor &inputSize, Metadata<Dimension> &m, at::Tensor &input_features,
    at::Tensor &d_input_features, at::Tensor &d_output_features, bool average) {
  if (d_output_features.device().type() != torch::kCUDA) {
    cpu_ActivePooling_updateGradInput<float, Dimension>(
        inputSize, m, input_features, d_input_features, d_output_features,
        average);
  } else {
    cuda_ActivePooling_updateGradInput<float, Dimension>(
        inputSize, m, input_features, d_input_features, d_output_features,
        average);
  }
}
// AveragePooling forward, dispatched by input device.
template <Int Dimension>
void AveragePooling_updateOutput(at::Tensor &inputSize, at::Tensor &outputSize,
                                 at::Tensor &poolSize, at::Tensor &poolStride,
                                 Metadata<Dimension> &m,
                                 at::Tensor &input_features,
                                 at::Tensor &output_features,
                                 long nFeaturesToDrop) {
  if (input_features.device().type() != torch::kCUDA) {
    cpu_AveragePooling_updateOutput<float, Dimension>(
        inputSize, outputSize, poolSize, poolStride, m, input_features,
        output_features, nFeaturesToDrop);
  } else {
    cuda_AveragePooling_updateOutput<float, Dimension>(
        inputSize, outputSize, poolSize, poolStride, m, input_features,
        output_features, nFeaturesToDrop);
  }
}
// AveragePooling backward, dispatched by gradient device.
template <Int Dimension>
void AveragePooling_updateGradInput(
    at::Tensor &inputSize, at::Tensor &outputSize, at::Tensor &poolSize,
    at::Tensor &poolStride, Metadata<Dimension> &m, at::Tensor &input_features,
    at::Tensor &d_input_features, at::Tensor &d_output_features,
    long nFeaturesToDrop) {
  if (d_output_features.device().type() != torch::kCUDA) {
    cpu_AveragePooling_updateGradInput<float, Dimension>(
        inputSize, outputSize, poolSize, poolStride, m, input_features,
        d_input_features, d_output_features, nFeaturesToDrop);
  } else {
    cuda_AveragePooling_updateGradInput<float, Dimension>(
        inputSize, outputSize, poolSize, poolStride, m, input_features,
        d_input_features, d_output_features, nFeaturesToDrop);
  }
}
// Sparse convolution forward; returns the backend's double result.
// Dispatched by input device.
template <Int Dimension>
double
Convolution_updateOutput(at::Tensor &inputSize, at::Tensor &outputSize,
                         at::Tensor &filterSize, at::Tensor &filterStride,
                         Metadata<Dimension> &m, at::Tensor &input_features,
                         at::Tensor &output_features, at::Tensor &weight,
                         at::Tensor &bias) {
  if (input_features.device().type() != torch::kCUDA)
    return cpu_Convolution_updateOutput<float, Dimension>(
        inputSize, outputSize, filterSize, filterStride, m, input_features,
        output_features, weight, bias);
  return cuda_Convolution_updateOutput<float, Dimension>(
      inputSize, outputSize, filterSize, filterStride, m, input_features,
      output_features, weight, bias);
}
// Sparse convolution backward, dispatched by gradient device.
template <Int Dimension>
void Convolution_backward(at::Tensor &inputSize, at::Tensor &outputSize,
                          at::Tensor &filterSize, at::Tensor &filterStride,
                          Metadata<Dimension> &m, at::Tensor &input_features,
                          at::Tensor &d_input_features,
                          at::Tensor &d_output_features, at::Tensor &weight,
                          at::Tensor &d_weight, at::Tensor &d_bias) {
  if (d_output_features.device().type() != torch::kCUDA) {
    cpu_Convolution_backward<float, Dimension>(
        inputSize, outputSize, filterSize, filterStride, m, input_features,
        d_input_features, d_output_features, weight, d_weight, d_bias);
  } else {
    cuda_Convolution_backward<float, Dimension>(
        inputSize, outputSize, filterSize, filterStride, m, input_features,
        d_input_features, d_output_features, weight, d_weight, d_bias);
  }
}
// Submanifold convolution forward; returns the backend's double result.
// Dispatched by input device.
template <Int Dimension>
double SubmanifoldConvolution_updateOutput(
    at::Tensor &inputSize, at::Tensor &filterSize, Metadata<Dimension> &m,
    at::Tensor &input_features, at::Tensor &output_features, at::Tensor &weight,
    at::Tensor &bias) {
  if (input_features.device().type() != torch::kCUDA)
    return cpu_SubmanifoldConvolution_updateOutput<float, Dimension>(
        inputSize, filterSize, m, input_features, output_features, weight,
        bias);
  return cuda_SubmanifoldConvolution_updateOutput<float, Dimension>(
      inputSize, filterSize, m, input_features, output_features, weight, bias);
}
// Submanifold convolution backward, dispatched by gradient device.
template <Int Dimension>
void SubmanifoldConvolution_backward(
    at::Tensor &inputSize, at::Tensor &filterSize, Metadata<Dimension> &m,
    at::Tensor &input_features, at::Tensor &d_input_features,
    at::Tensor &d_output_features, at::Tensor &weight, at::Tensor &d_weight,
    at::Tensor &d_bias) {
  if (d_output_features.device().type() != torch::kCUDA) {
    cpu_SubmanifoldConvolution_backward<float, Dimension>(
        inputSize, filterSize, m, input_features, d_input_features,
        d_output_features, weight, d_weight, d_bias);
  } else {
    cuda_SubmanifoldConvolution_backward<float, Dimension>(
        inputSize, filterSize, m, input_features, d_input_features,
        d_output_features, weight, d_weight, d_bias);
  }
}
// Permutohedral-lattice submanifold convolution forward; returns the
// backend's double result. Dispatched by input device.
template <Int Dimension>
double PermutohedralSubmanifoldConvolution_updateOutput(
    at::Tensor &inputSize, Metadata<Dimension> &m, at::Tensor &input_features,
    at::Tensor &output_features, at::Tensor &weight, at::Tensor &bias) {
  if (input_features.device().type() != torch::kCUDA)
    return cpu_PermutohedralSubmanifoldConvolution_updateOutput<float,
                                                               Dimension>(
        inputSize, m, input_features, output_features, weight, bias);
  return cuda_PermutohedralSubmanifoldConvolution_updateOutput<float,
                                                               Dimension>(
      inputSize, m, input_features, output_features, weight, bias);
}
// Permutohedral-lattice submanifold convolution backward, dispatched by
// gradient device.
template <Int Dimension>
void PermutohedralSubmanifoldConvolution_backward(
    at::Tensor &inputSize, Metadata<Dimension> &m, at::Tensor &input_features,
    at::Tensor &d_input_features, at::Tensor &d_output_features,
    at::Tensor &weight, at::Tensor &d_weight, at::Tensor &d_bias) {
  if (d_output_features.device().type() != torch::kCUDA) {
    cpu_PermutohedralSubmanifoldConvolution_backward<float, Dimension>(
        inputSize, m, input_features, d_input_features, d_output_features,
        weight, d_weight, d_bias);
  } else {
    cuda_PermutohedralSubmanifoldConvolution_backward<float, Dimension>(
        inputSize, m, input_features, d_input_features, d_output_features,
        weight, d_weight, d_bias);
  }
}
// Full convolution forward (separate input/output metadata); returns the
// backend's double result. Dispatched by input device.
template <Int Dimension>
double FullConvolution_updateOutput(
    at::Tensor &inputSize, at::Tensor &outputSize, at::Tensor &filterSize,
    at::Tensor &filterStride, Metadata<Dimension> &mIn,
    Metadata<Dimension> &mOut, at::Tensor &input_features,
    at::Tensor &output_features, at::Tensor &weight, at::Tensor &bias) {
  if (input_features.device().type() != torch::kCUDA)
    return cpu_FullConvolution_updateOutput<float, Dimension>(
        inputSize, outputSize, filterSize, filterStride, mIn, mOut,
        input_features, output_features, weight, bias);
  return cuda_FullConvolution_updateOutput<float, Dimension>(
      inputSize, outputSize, filterSize, filterStride, mIn, mOut,
      input_features, output_features, weight, bias);
}
// Full convolution backward, dispatched by gradient device.
template <Int Dimension>
void FullConvolution_backward(at::Tensor &inputSize, at::Tensor &outputSize,
                              at::Tensor &filterSize, at::Tensor &filterStride,
                              Metadata<Dimension> &mIn,
                              Metadata<Dimension> &mOut,
                              at::Tensor &input_features,
                              at::Tensor &d_input_features,
                              at::Tensor &d_output_features, at::Tensor &weight,
                              at::Tensor &d_weight, at::Tensor &d_bias) {
  if (d_output_features.device().type() != torch::kCUDA) {
    cpu_FullConvolution_backward<float, Dimension>(
        inputSize, outputSize, filterSize, filterStride, mIn, mOut,
        input_features, d_input_features, d_output_features, weight, d_weight,
        d_bias);
  } else {
    cuda_FullConvolution_backward<float, Dimension>(
        inputSize, outputSize, filterSize, filterStride, mIn, mOut,
        input_features, d_input_features, d_output_features, weight, d_weight,
        d_bias);
  }
}
// Randomized-stride convolution forward; returns the backend's double
// result. Dispatched by input device.
template <Int Dimension>
double RandomizedStrideConvolution_updateOutput(
    at::Tensor &inputSize, at::Tensor &outputSize, at::Tensor &filterSize,
    at::Tensor &filterStride, Metadata<Dimension> &m,
    at::Tensor &input_features, at::Tensor &output_features, at::Tensor &weight,
    at::Tensor &bias) {
  if (input_features.device().type() != torch::kCUDA)
    return cpu_RandomizedStrideConvolution_updateOutput<float, Dimension>(
        inputSize, outputSize, filterSize, filterStride, m, input_features,
        output_features, weight, bias);
  return cuda_RandomizedStrideConvolution_updateOutput<float, Dimension>(
      inputSize, outputSize, filterSize, filterStride, m, input_features,
      output_features, weight, bias);
}
// Randomized-stride convolution backward, dispatched by gradient device.
template <Int Dimension>
void RandomizedStrideConvolution_backward(
    at::Tensor &inputSize, at::Tensor &outputSize, at::Tensor &filterSize,
    at::Tensor &filterStride, Metadata<Dimension> &m,
    at::Tensor &input_features, at::Tensor &d_input_features,
    at::Tensor &d_output_features, at::Tensor &weight, at::Tensor &d_weight,
    at::Tensor &d_bias) {
  if (d_output_features.device().type() != torch::kCUDA) {
    cpu_RandomizedStrideConvolution_backward<float, Dimension>(
        inputSize, outputSize, filterSize, filterStride, m, input_features,
        d_input_features, d_output_features, weight, d_weight, d_bias);
  } else {
    cuda_RandomizedStrideConvolution_backward<float, Dimension>(
        inputSize, outputSize, filterSize, filterStride, m, input_features,
        d_input_features, d_output_features, weight, d_weight, d_bias);
  }
}
// Deconvolution forward; returns the backend's double result. Dispatched by
// input device.
template <Int Dimension>
double
Deconvolution_updateOutput(at::Tensor &inputSize, at::Tensor &outputSize,
                           at::Tensor &filterSize, at::Tensor &filterStride,
                           Metadata<Dimension> &m, at::Tensor &input_features,
                           at::Tensor &output_features, at::Tensor &weight,
                           at::Tensor &bias) {
  if (input_features.device().type() != torch::kCUDA)
    return cpu_Deconvolution_updateOutput<float, Dimension>(
        inputSize, outputSize, filterSize, filterStride, m, input_features,
        output_features, weight, bias);
  return cuda_Deconvolution_updateOutput<float, Dimension>(
      inputSize, outputSize, filterSize, filterStride, m, input_features,
      output_features, weight, bias);
}
// Deconvolution backward, dispatched by gradient device.
template <Int Dimension>
void Deconvolution_backward(at::Tensor &inputSize, at::Tensor &outputSize,
                            at::Tensor &filterSize, at::Tensor &filterStride,
                            Metadata<Dimension> &m, at::Tensor &input_features,
                            at::Tensor &d_input_features,
                            at::Tensor &d_output_features, at::Tensor &weight,
                            at::Tensor &d_weight, at::Tensor &d_bias) {
  if (d_output_features.device().type() != torch::kCUDA) {
    cpu_Deconvolution_backward<float, Dimension>(
        inputSize, outputSize, filterSize, filterStride, m, input_features,
        d_input_features, d_output_features, weight, d_weight, d_bias);
  } else {
    cuda_Deconvolution_backward<float, Dimension>(
        inputSize, outputSize, filterSize, filterStride, m, input_features,
        d_input_features, d_output_features, weight, d_weight, d_bias);
  }
}
// InputLayer forward (builds sparse output from coordinates + features),
// dispatched by input device.
template <Int Dimension>
void InputLayer_updateOutput(Metadata<Dimension> &m, at::Tensor &spatialSize,
                             at::Tensor &input_coords,
                             at::Tensor &input_features,
                             at::Tensor &output_features, long batchSize,
                             long mode) {
  if (input_features.device().type() != torch::kCUDA) {
    cpu_InputLayer_updateOutput<float, Dimension>(
        m, spatialSize, input_coords, input_features, output_features,
        batchSize, mode);
  } else {
    cuda_InputLayer_updateOutput<float, Dimension>(
        m, spatialSize, input_coords, input_features, output_features,
        batchSize, mode);
  }
}
// InputLayer backward, dispatched by gradient device.
template <Int Dimension>
void InputLayer_updateGradInput(Metadata<Dimension> &m,
                                at::Tensor &d_input_features,
                                at::Tensor &d_output_features) {
  if (d_output_features.device().type() != torch::kCUDA) {
    cpu_InputLayer_updateGradInput<float, Dimension>(m, d_input_features,
                                                     d_output_features);
  } else {
    cuda_InputLayer_updateGradInput<float, Dimension>(m, d_input_features,
                                                      d_output_features);
  }
}
// OutputLayer forward, dispatched by input device.
template <Int Dimension>
void OutputLayer_updateOutput(Metadata<Dimension> &m,
                              at::Tensor &input_features,
                              at::Tensor &output_features) {
  if (input_features.device().type() != torch::kCUDA) {
    cpu_OutputLayer_updateOutput<float, Dimension>(m, input_features,
                                                   output_features);
  } else {
    cuda_OutputLayer_updateOutput<float, Dimension>(m, input_features,
                                                    output_features);
  }
}
// OutputLayer backward, dispatched by gradient device.
template <Int Dimension>
void OutputLayer_updateGradInput(Metadata<Dimension> &m,
                                 at::Tensor &d_input_features,
                                 at::Tensor &d_output_features) {
  if (d_output_features.device().type() != torch::kCUDA) {
    cpu_OutputLayer_updateGradInput<float, Dimension>(m, d_input_features,
                                                      d_output_features);
  } else {
    cuda_OutputLayer_updateGradInput<float, Dimension>(m, d_input_features,
                                                       d_output_features);
  }
}
// Batch/length ("BL") InputLayer forward, dispatched by input device.
template <Int Dimension>
void BLInputLayer_updateOutput(Metadata<Dimension> &m, at::Tensor &spatialSize,
                               at::Tensor &input_coords,
                               at::Tensor &input_features,
                               at::Tensor &output_features, long mode) {
  if (input_features.device().type() != torch::kCUDA) {
    cpu_BLInputLayer_updateOutput<float, Dimension>(
        m, spatialSize, input_coords, input_features, output_features, mode);
  } else {
    cuda_BLInputLayer_updateOutput<float, Dimension>(
        m, spatialSize, input_coords, input_features, output_features, mode);
  }
}
// BL InputLayer backward, dispatched by gradient device.
template <Int Dimension>
void BLInputLayer_updateGradInput(Metadata<Dimension> &m,
                                  at::Tensor &d_input_features,
                                  at::Tensor &d_output_features) {
  if (d_output_features.device().type() != torch::kCUDA) {
    cpu_BLInputLayer_updateGradInput<float, Dimension>(m, d_input_features,
                                                       d_output_features);
  } else {
    cuda_BLInputLayer_updateGradInput<float, Dimension>(m, d_input_features,
                                                        d_output_features);
  }
}
// BL OutputLayer forward, dispatched by input device.
template <Int Dimension>
void BLOutputLayer_updateOutput(Metadata<Dimension> &m,
                                at::Tensor &input_features,
                                at::Tensor &output_features) {
  if (input_features.device().type() != torch::kCUDA) {
    cpu_BLOutputLayer_updateOutput<float, Dimension>(m, input_features,
                                                     output_features);
  } else {
    cuda_BLOutputLayer_updateOutput<float, Dimension>(m, input_features,
                                                      output_features);
  }
}
// BL OutputLayer backward, dispatched by gradient device.
template <Int Dimension>
void BLOutputLayer_updateGradInput(Metadata<Dimension> &m,
                                   at::Tensor &d_input_features,
                                   at::Tensor &d_output_features) {
  if (d_output_features.device().type() != torch::kCUDA) {
    cpu_BLOutputLayer_updateGradInput<float, Dimension>(m, d_input_features,
                                                        d_output_features);
  } else {
    cuda_BLOutputLayer_updateGradInput<float, Dimension>(m, d_input_features,
                                                         d_output_features);
  }
}
// MaxPooling forward, dispatched by input device.
template <Int Dimension>
void MaxPooling_updateOutput(at::Tensor &inputSize, at::Tensor &outputSize,
                             at::Tensor &poolSize, at::Tensor &poolStride,
                             Metadata<Dimension> &m, at::Tensor &input_features,
                             at::Tensor &output_features,
                             long nFeaturesToDrop) {
  if (input_features.device().type() != torch::kCUDA) {
    cpu_MaxPooling_updateOutput<float, Dimension>(
        inputSize, outputSize, poolSize, poolStride, m, input_features,
        output_features, nFeaturesToDrop);
  } else {
    cuda_MaxPooling_updateOutput<float, Dimension>(
        inputSize, outputSize, poolSize, poolStride, m, input_features,
        output_features, nFeaturesToDrop);
  }
}
// MaxPooling backward, dispatched by gradient device.
template <Int Dimension>
void MaxPooling_updateGradInput(
    at::Tensor &inputSize, at::Tensor &outputSize, at::Tensor &poolSize,
    at::Tensor &poolStride, Metadata<Dimension> &m, at::Tensor &input_features,
    at::Tensor &d_input_features, at::Tensor &output_features,
    at::Tensor &d_output_features, long nFeaturesToDrop) {
  if (d_output_features.device().type() != torch::kCUDA) {
    cpu_MaxPooling_updateGradInput<float, Dimension>(
        inputSize, outputSize, poolSize, poolStride, m, input_features,
        d_input_features, output_features, d_output_features, nFeaturesToDrop);
  } else {
    cuda_MaxPooling_updateGradInput<float, Dimension>(
        inputSize, outputSize, poolSize, poolStride, m, input_features,
        d_input_features, output_features, d_output_features, nFeaturesToDrop);
  }
}
// Randomized-stride MaxPooling forward, dispatched by input device.
template <Int Dimension>
void RandomizedStrideMaxPooling_updateOutput(
    at::Tensor &inputSize, at::Tensor &outputSize, at::Tensor &poolSize,
    at::Tensor &poolStride, Metadata<Dimension> &m, at::Tensor &input_features,
    at::Tensor &output_features, long nFeaturesToDrop) {
  if (input_features.device().type() != torch::kCUDA) {
    cpu_RandomizedStrideMaxPooling_updateOutput<float, Dimension>(
        inputSize, outputSize, poolSize, poolStride, m, input_features,
        output_features, nFeaturesToDrop);
  } else {
    cuda_RandomizedStrideMaxPooling_updateOutput<float, Dimension>(
        inputSize, outputSize, poolSize, poolStride, m, input_features,
        output_features, nFeaturesToDrop);
  }
}
// Randomized-stride MaxPooling backward, dispatched by gradient device.
template <Int Dimension>
void RandomizedStrideMaxPooling_updateGradInput(
    at::Tensor &inputSize, at::Tensor &outputSize, at::Tensor &poolSize,
    at::Tensor &poolStride, Metadata<Dimension> &m, at::Tensor &input_features,
    at::Tensor &d_input_features, at::Tensor &output_features,
    at::Tensor &d_output_features, long nFeaturesToDrop) {
  if (d_output_features.device().type() != torch::kCUDA) {
    cpu_RandomizedStrideMaxPooling_updateGradInput<float, Dimension>(
        inputSize, outputSize, poolSize, poolStride, m, input_features,
        d_input_features, output_features, d_output_features, nFeaturesToDrop);
  } else {
    cuda_RandomizedStrideMaxPooling_updateGradInput<float, Dimension>(
        inputSize, outputSize, poolSize, poolStride, m, input_features,
        d_input_features, output_features, d_output_features, nFeaturesToDrop);
  }
}
// SparseToDense forward, dispatched by input device.
template <Int Dimension>
void SparseToDense_updateOutput(at::Tensor &inputSize, Metadata<Dimension> &m,
                                at::Tensor &input_features,
                                at::Tensor &output_features, long nPlanes) {
  if (input_features.device().type() != torch::kCUDA) {
    cpu_SparseToDense_updateOutput<float, Dimension>(
        inputSize, m, input_features, output_features, nPlanes);
  } else {
    cuda_SparseToDense_updateOutput<float, Dimension>(
        inputSize, m, input_features, output_features, nPlanes);
  }
}
// SparseToDense backward, dispatched by gradient device.
template <Int Dimension>
void SparseToDense_updateGradInput(at::Tensor &inputSize,
                                   Metadata<Dimension> &m,
                                   at::Tensor &input_features,
                                   at::Tensor &d_input_features,
                                   at::Tensor &d_output_features) {
  if (d_output_features.device().type() != torch::kCUDA) {
    cpu_SparseToDense_updateGradInput<float, Dimension>(
        inputSize, m, input_features, d_input_features, d_output_features);
  } else {
    cuda_SparseToDense_updateGradInput<float, Dimension>(
        inputSize, m, input_features, d_input_features, d_output_features);
  }
}
// UnPooling forward, dispatched by input device.
template <Int Dimension>
void UnPooling_updateOutput(at::Tensor &inputSize, at::Tensor &outputSize,
                            at::Tensor &poolSize, at::Tensor &poolStride,
                            Metadata<Dimension> &m, at::Tensor &input_features,
                            at::Tensor &output_features, long nFeaturesToDrop) {
  if (input_features.device().type() != torch::kCUDA) {
    cpu_UnPooling_updateOutput<float, Dimension>(
        inputSize, outputSize, poolSize, poolStride, m, input_features,
        output_features, nFeaturesToDrop);
  } else {
    cuda_UnPooling_updateOutput<float, Dimension>(
        inputSize, outputSize, poolSize, poolStride, m, input_features,
        output_features, nFeaturesToDrop);
  }
}
// UnPooling backward, dispatched by gradient device.
template <Int Dimension>
void UnPooling_updateGradInput(at::Tensor &inputSize, at::Tensor &outputSize,
                               at::Tensor &poolSize, at::Tensor &poolStride,
                               Metadata<Dimension> &m,
                               at::Tensor &d_input_features,
                               at::Tensor &d_output_features,
                               long nFeaturesToDrop) {
  if (d_output_features.device().type() != torch::kCUDA) {
    cpu_UnPooling_updateGradInput<float, Dimension>(
        inputSize, outputSize, poolSize, poolStride, m, d_input_features,
        d_output_features, nFeaturesToDrop);
  } else {
    cuda_UnPooling_updateGradInput<float, Dimension>(
        inputSize, outputSize, poolSize, poolStride, m, d_input_features,
        d_output_features, nFeaturesToDrop);
  }
}
#define FOO \
template void ActivePooling_updateOutput<DIMENSION>( \
at::Tensor & inputSize, Metadata<DIMENSION> & m, \
at::Tensor & input_features, at::Tensor & output_features, \
bool average); \
template void ActivePooling_updateGradInput<DIMENSION>( \
at::Tensor & inputSize, Metadata<DIMENSION> & m, \
at::Tensor & input_features, at::Tensor & d_input_features, \
at::Tensor & d_output_features, bool average); \
template void AveragePooling_updateOutput<DIMENSION>( \
at::Tensor & inputSize, at::Tensor & outputSize, at::Tensor & poolSize, \
at::Tensor & poolStride, Metadata<DIMENSION> & m, \
at::Tensor & input_features, at::Tensor & output_features, \
long nFeaturesToDrop); \
template void AveragePooling_updateGradInput<DIMENSION>( \
at::Tensor & inputSize, at::Tensor & outputSize, at::Tensor & poolSize, \
at::Tensor & poolStride, Metadata<DIMENSION> & m, \
at::Tensor & input_features, at::Tensor & d_input_features, \
at::Tensor & d_output_features, long nFeaturesToDrop); \
template double Convolution_updateOutput<DIMENSION>( \
at::Tensor & inputSize, at::Tensor & outputSize, \
at::Tensor & filterSize, at::Tensor & filterStride, \
Metadata<DIMENSION> & m, at::Tensor & input_features, \
at::Tensor & output_features, at::Tensor & weight, at::Tensor & bias); \
template void Convolution_backward<DIMENSION>( \
at::Tensor & inputSize, at::Tensor & outputSize, \
at::Tensor & filterSize, at::Tensor & filterStride, \
Metadata<DIMENSION> & m, at::Tensor & input_features, \
at::Tensor & d_input_features, at::Tensor & d_output_features, \
at::Tensor & weight, at::Tensor & d_weight, at::Tensor & d_bias); \
template double SubmanifoldConvolution_updateOutput<DIMENSION>( \
at::Tensor & inputSize, at::Tensor & filterSize, \
Metadata<DIMENSION> & m, at::Tensor & input_features, \
at::Tensor & output_features, at::Tensor & weight, at::Tensor & bias); \
template void SubmanifoldConvolution_backward<DIMENSION>( \
at::Tensor & inputSize, at::Tensor & filterSize, \
Metadata<DIMENSION> & m, at::Tensor & input_features, \
at::Tensor & d_input_features, at::Tensor & d_output_features, \
at::Tensor & weight, at::Tensor & d_weight, at::Tensor & d_bias); \
template double PermutohedralSubmanifoldConvolution_updateOutput<DIMENSION>( \
at::Tensor & inputSize, Metadata<DIMENSION> & m, \
at::Tensor & input_features, at::Tensor & output_features, \
at::Tensor & weight, at::Tensor & bias); \
template void PermutohedralSubmanifoldConvolution_backward<DIMENSION>( \
at::Tensor & inputSize, Metadata<DIMENSION> & m, \
at::Tensor & input_features, at::Tensor & d_input_features, \
at::Tensor & d_output_features, at::Tensor & weight, \
at::Tensor & d_weight, at::Tensor & d_bias); \
template double FullConvolution_updateOutput<DIMENSION>( \
at::Tensor & inputSize, at::Tensor & outputSize, \
at::Tensor & filterSize, at::Tensor & filterStride, \
Metadata<DIMENSION> & mIn, Metadata<DIMENSION> & mOut, \
at::Tensor & input_features, at::Tensor & output_features, \
at::Tensor & weight, at::Tensor & bias); \
template void FullConvolution_backward<DIMENSION>( \
at::Tensor & inputSize, at::Tensor & outputSize, \
at::Tensor & filterSize, at::Tensor & filterStride, \
Metadata<DIMENSION> & mIn, Metadata<DIMENSION> & mOut, \
at::Tensor & input_features, at::Tensor & d_input_features, \
at::Tensor & d_output_features, at::Tensor & weight, \
at::Tensor & d_weight, at::Tensor & d_bias); \
template double RandomizedStrideConvolution_updateOutput<DIMENSION>( \
at::Tensor & inputSize, at::Tensor & outputSize, \
at::Tensor & filterSize, at::Tensor & filterStride, \
Metadata<DIMENSION> & m, at::Tensor & input_features, \
at::Tensor & output_features, at::Tensor & weight, at::Tensor & bias); \
template void RandomizedStrideConvolution_backward<DIMENSION>( \
at::Tensor & inputSize, at::Tensor & outputSize, \
at::Tensor & filterSize, at::Tensor & filterStride, \
Metadata<DIMENSION> & m, at::Tensor & input_features, \
at::Tensor & d_input_features, at::Tensor & d_output_features, \
at::Tensor & weight, at::Tensor & d_weight, at::Tensor & d_bias); \
template double Deconvolution_updateOutput<DIMENSION>( \
at::Tensor & inputSize, at::Tensor & outputSize, \
at::Tensor & filterSize, at::Tensor & filterStride, \
Metadata<DIMENSION> & m, at::Tensor & input_features, \
at::Tensor & output_features, at::Tensor & weight, at::Tensor & bias); \
template void Deconvolution_backward<DIMENSION>( \
at::Tensor & inputSize, at::Tensor & outputSize, \
at::Tensor & filterSize, at::Tensor & filterStride, \
Metadata<DIMENSION> & m, at::Tensor & input_features, \
at::Tensor & d_input_features, at::Tensor & d_output_features, \
at::Tensor & weight, at::Tensor & d_weight, at::Tensor & d_bias); \
template void InputLayer_updateOutput<DIMENSION>( \
Metadata<DIMENSION> & m, at::Tensor & spatialSize, \
at::Tensor & input_coords, at::Tensor & input_features, \
at::Tensor & output_features, long batchSize, long mode); \
template void InputLayer_updateGradInput<DIMENSION>( \
Metadata<DIMENSION> & m, at::Tensor & d_input_features, \
at::Tensor & d_output_features); \
template void OutputLayer_updateOutput<DIMENSION>( \
Metadata<DIMENSION> & m, at::Tensor & input_features, \
at::Tensor & output_features); \
template void OutputLayer_updateGradInput<DIMENSION>( \
Metadata<DIMENSION> & m, at::Tensor & d_input_features, \
at::Tensor & d_output_features); \
template void BLInputLayer_updateOutput<DIMENSION>( \
Metadata<DIMENSION> & m, at::Tensor & spatialSize, \
at::Tensor & input_coords, at::Tensor & input_features, \
at::Tensor & output_features, long mode); \
template void BLInputLayer_updateGradInput<DIMENSION>( \
Metadata<DIMENSION> & m, at::Tensor & d_input_features, \
at::Tensor & d_output_features); \
template void BLOutputLayer_updateOutput<DIMENSION>( \
Metadata<DIMENSION> & m, at::Tensor & input_features, \
at::Tensor & output_features); \
template void BLOutputLayer_updateGradInput<DIMENSION>( \
Metadata<DIMENSION> & m, at::Tensor & d_input_features, \
at::Tensor & d_output_features); \
template void MaxPooling_updateOutput<DIMENSION>( \
at::Tensor & inputSize, at::Tensor & outputSize, at::Tensor & poolSize, \
at::Tensor & poolStride, Metadata<DIMENSION> & m, \
at::Tensor & input_features, at::Tensor & output_features, \
long nFeaturesToDrop); \
template void MaxPooling_updateGradInput<DIMENSION>( \
at::Tensor & inputSize, at::Tensor & outputSize, at::Tensor & poolSize, \
at::Tensor & poolStride, Metadata<DIMENSION> & m, \
at::Tensor & input_features, at::Tensor & d_input_features, \
at::Tensor & output_features, at::Tensor & d_output_features, \
long nFeaturesToDrop); \
template void RandomizedStrideMaxPooling_updateOutput<DIMENSION>( \
at::Tensor & inputSize, at::Tensor & outputSize, at::Tensor & poolSize, \
at::Tensor & poolStride, Metadata<DIMENSION> & m, \
at::Tensor & input_features, at::Tensor & output_features, \
long nFeaturesToDrop); \
template void RandomizedStrideMaxPooling_updateGradInput<DIMENSION>( \
at::Tensor & inputSize, at::Tensor & outputSize, at::Tensor & poolSize, \
at::Tensor & poolStride, Metadata<DIMENSION> & m, \
at::Tensor & input_features, at::Tensor & d_input_features, \
at::Tensor & output_features, at::Tensor & d_output_features, \
long nFeaturesToDrop); \
template void SparseToDense_updateOutput<DIMENSION>( \
at::Tensor & inputSize, Metadata<DIMENSION> & m, \
at::Tensor & input_features, at::Tensor & output_features, \
long nPlanes); \
template void SparseToDense_updateGradInput<DIMENSION>( \
at::Tensor & inputSize, Metadata<DIMENSION> & m, \
at::Tensor & input_features, at::Tensor & d_input_features, \
at::Tensor & d_output_features); \
template void UnPooling_updateOutput<DIMENSION>( \
at::Tensor & inputSize, at::Tensor & outputSize, at::Tensor & poolSize, \
at::Tensor & poolStride, Metadata<DIMENSION> & m, \
at::Tensor & input_features, at::Tensor & output_features, \
long nFeaturesToDrop); \
template void UnPooling_updateGradInput<DIMENSION>( \
at::Tensor & inputSize, at::Tensor & outputSize, at::Tensor & poolSize, \
at::Tensor & poolStride, Metadata<DIMENSION> & m, \
at::Tensor & d_input_features, at::Tensor & d_output_features, \
long nFeaturesToDrop);
// Explicitly instantiate every templated sparse-convolution layer for the
// supported spatial dimensions 1 through 6.  The backslash-continued block
// above is (presumably) the tail of the FOO macro body — TODO confirm the
// macro name at its #define site.  DIMENSION is (re)defined before each
// expansion and #undef'd afterwards so each expansion instantiates a
// distinct Metadata<DIMENSION> family without redefinition warnings.
#define DIMENSION 1
FOO;
#undef DIMENSION
#define DIMENSION 2
FOO;
#undef DIMENSION
#define DIMENSION 3
FOO;
#undef DIMENSION
#define DIMENSION 4
FOO;
#undef DIMENSION
#define DIMENSION 5
FOO;
#undef DIMENSION
#define DIMENSION 6
FOO;
#undef DIMENSION
// Forward pass of the copy-features helper: routes to the GPU or CPU
// implementation depending on where the `context` tensor resides.
// All tensors are assumed to live on the same device — TODO confirm.
void CopyFeaturesHelper_updateOutput(at::Tensor &rules, at::Tensor &context,
                                     at::Tensor &Context) {
  const bool on_gpu = context.is_cuda();
  if (on_gpu) {
    cuda_CopyFeaturesHelper_updateOutput<float>(rules, context, Context);
  } else {
    cpu_CopyFeaturesHelper_updateOutput<float>(rules, context, Context);
  }
}
// Backward pass of the copy-features helper: routes to the GPU or CPU
// implementation depending on where the `dContext` gradient tensor resides.
// NOTE(review): the forward pass dispatches on its second argument
// (`context`) while this dispatches on its third (`dContext`) — verify the
// asymmetry is intentional; it is harmless if all tensors share a device.
void CopyFeaturesHelper_updateGradInput(at::Tensor &rules, at::Tensor &dcontext,
                                        at::Tensor &dContext) {
  const bool on_gpu = dContext.is_cuda();
  if (on_gpu) {
    cuda_CopyFeaturesHelper_updateGradInput<float>(rules, dcontext, dContext);
  } else {
    cpu_CopyFeaturesHelper_updateGradInput<float>(rules, dcontext, dContext);
  }
}
bool is_cuda_build() {return true;}
......@@ -35,3 +35,4 @@ from .tables import *
from .unPooling import UnPooling
from .utils import append_tensors, AddCoords, add_feature_planes, concatenate_feature_planes, compare_sparse
from .shapeContext import ShapeContext, MultiscaleShapeContext
__version__ = '0.2'
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment