Commit 76eeba5d authored by Hang Zhang

init
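# .editorconfig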
root = true
[*]
indent_style = tab
indent_size = 2
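# .gitignore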
*.DS_Store
*.swp
*.pyc
build/
encoding/build/
MIT License
Copyright (c) 2017 Hang Zhang
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
1. The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
2. Original authors' names are not deleted.
3. The authors' names are not used to endorse or promote products derived
from this software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# PyTorch-Encoding-Layer
:x: **In progress** This repo is a PyTorch implementation of the Encoding Layer described in the paper:
**Deep TEN: Texture Encoding Network** [[arXiv]](https://arxiv.org/pdf/1612.02844.pdf)
[Hang Zhang](http://hangzh.com/), [Jia Xue](http://jiaxueweb.com/), [Kristin Dana](http://eceweb1.rutgers.edu/vision/dana.html)
```
@article{zhang2016deep,
title={Deep TEN: Texture Encoding Network},
author={Zhang, Hang and Xue, Jia and Dana, Kristin},
journal={arXiv preprint arXiv:1612.02844},
year={2016}
}
```
## Installation
- Dependencies
  * Install PyTorch from source
- Installing the package (a usage sketch follows below)
```bash
bash make.sh
```
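
A minimal usage sketch of the layer this package provides, assuming the build above succeeded, a CUDA device is available, and `Aggregate` is exported at the package top level (the repo is in progress, so the exact import path may differ):

```python
import torch
from torch.autograd import Variable
from encoding import Aggregate  # hypothetical top-level export

B, N, K, D = 2, 8, 4, 16                       # batch, descriptors, codewords, channels
A = Variable(torch.randn(B, N, K).cuda())      # assignment weights
R = Variable(torch.randn(B, N, K, D).cuda())   # residuals
E = Aggregate()(A, R)                          # aggregated encodings, B x K x D
print(E.size())
```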
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import os
import sys
import torch
from torch.utils.ffi import create_extension

package_base = os.path.dirname(torch.__file__)
this_file = os.path.dirname(os.path.realpath(__file__))
include_path = [os.path.join(os.environ['HOME'], 'pytorch/torch/lib/THC'),
                os.path.join(this_file, 'encoding/src/'),
                os.path.join(this_file, 'encoding/kernel/')]
sources = ['encoding/src/encoding_lib.cpp']
headers = ['encoding/src/encoding_lib.h']
defines = [('WITH_CUDA', None)]
with_cuda = True

# The CMake build installs libENCODING into the torch lib directory; the
# shared-library suffix differs between macOS (.dylib) and Linux (.so).
lib_suffix = 'dylib' if sys.platform == 'darwin' else 'so'
extra_objects = ['lib/libENCODING.' + lib_suffix]
extra_objects = [os.path.join(package_base, fname) for fname in extra_objects]
print(extra_objects)

ffi = create_extension(
    'encoding._ext.encoding_lib',
    package=True,
    headers=headers,
    sources=sources,
    define_macros=defines,
    relative_to=__file__,
    with_cuda=with_cuda,
    include_dirs=include_path,
    extra_objects=extra_objects,
)

if __name__ == '__main__':
    ffi.build()
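
Once `build.py` has run (directly, or via the `cffi_modules` hook in `setup.py`), the compiled extension is importable as a package submodule; a quick smoke test, assuming a successful build:

```python
# The generated cffi extension exposes the C entry points declared in
# encoding/src/encoding_lib.h.
from encoding._ext import encoding_lib
print(dir(encoding_lib))  # should include Encoding_Float_aggregate_forward
```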
#!/usr/bin/env bash
rm -rf build/ dist/ encoding.egg-info/ encoding/build/ encoding/_ext/
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
CMAKE_MINIMUM_REQUIRED(VERSION 2.8 FATAL_ERROR)
CMAKE_POLICY(VERSION 2.8)
OPTION(NDEBUG "disable asserts (WARNING: this may result in silent UB e.g. with out-of-bound indices)")
IF(NOT NDEBUG)
MESSAGE(STATUS "Removing -DNDEBUG from compile flags")
STRING(REPLACE "-DNDEBUG" "" CMAKE_C_FLAGS "" ${CMAKE_C_FLAGS})
STRING(REPLACE "-DNDEBUG" "" CMAKE_C_FLAGS_DEBUG "" ${CMAKE_C_FLAGS_DEBUG})
STRING(REPLACE "-DNDEBUG" "" CMAKE_C_FLAGS_RELEASE "" ${CMAKE_C_FLAGS_RELEASE})
STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS "" ${CMAKE_CXX_FLAGS})
STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_DEBUG "" ${CMAKE_CXX_FLAGS_DEBUG})
STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_RELEASE "" ${CMAKE_CXX_FLAGS_RELEASE})
ENDIF()
INCLUDE(${CMAKE_CURRENT_SOURCE_DIR}/cmake/FindTorch.cmake)
#IF(NOT Torch_FOUND)
# FIND_PACKAGE(Torch REQUIRED)
#ENDIF()
IF(NOT CUDA_FOUND)
FIND_PACKAGE(CUDA 6.5 REQUIRED)
ENDIF()
# Detect CUDA architecture and get best NVCC flags
IF(NOT COMMAND CUDA_SELECT_NVCC_ARCH_FLAGS OR MSVC)
INCLUDE(${CMAKE_CURRENT_SOURCE_DIR}/cmake/select_compute_arch.cmake)
ENDIF()
LIST(APPEND CUDA_NVCC_FLAGS $ENV{TORCH_NVCC_FLAGS})
CUDA_SELECT_NVCC_ARCH_FLAGS(NVCC_FLAGS_EXTRA $ENV{TORCH_CUDA_ARCH_LIST})
LIST(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.9.3")
if(CUDA_VERSION VERSION_LESS "8.0")
MESSAGE(STATUS "Found gcc >=5 and CUDA <= 7.5, adding workaround C++ flags")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_FORCE_INLINES -D_MWAITXINTRIN_H_INCLUDED -D__STRICT_ANSI__")
endif(CUDA_VERSION VERSION_LESS "8.0")
endif(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.9.3")
endif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
IF(MSVC)
LIST(APPEND CUDA_NVCC_FLAGS "-Xcompiler /wd4819")
ADD_DEFINITIONS(-DTH_EXPORTS)
ENDIF()
IF(NOT ENCODING_INSTALL_LIB_SUBDIR)
SET(ENCODING_INSTALL_LIB_SUBDIR "${TORCH_BUILD_DIR}" CACHE PATH "ENCODING install library directory")
SET(ENCODING_INSTALL_INCLUDE_SUBDIR "${TORCH_BUILD_DIR}/include" CACHE PATH "ENCODING install include subdirectory")
ENDIF()
SET(CMAKE_MACOSX_RPATH 1)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
FILE(GLOB src-cuda kernel/*.cu)
CUDA_INCLUDE_DIRECTORIES(
${CMAKE_CURRENT_SOURCE_DIR}
${Torch_INSTALL_INCLUDE}
)
CUDA_ADD_LIBRARY(ENCODING SHARED ${src-cuda})
IF(MSVC)
SET_TARGET_PROPERTIES(ENCODING PROPERTIES PREFIX "lib" IMPORT_PREFIX "lib")
ENDIF()
INCLUDE_DIRECTORIES(
./include
${CMAKE_CURRENT_SOURCE_DIR}
${Torch_INSTALL_INCLUDE}
)
TARGET_LINK_LIBRARIES(ENCODING
${THC_LIBRARIES}
${TH_LIBRARIES}
${CUDA_cusparse_LIBRARY}
)
# Luarocks bug pre-14.04 prevents us from setting it for Lua-Torch
IF(ENCODING_SO_VERSION)
MESSAGE(STATUS "ENCODING_SO_VERSION: ${ENCODING_SO_VERSION}")
SET_TARGET_PROPERTIES(ENCODING PROPERTIES
VERSION ${ENCODING_SO_VERSION}
SOVERSION ${ENCODING_SO_VERSION})
ENDIF(ENCODING_SO_VERSION)
INSTALL(TARGETS ENCODING LIBRARY DESTINATION ${ENCODING_INSTALL_LIB_SUBDIR})
INSTALL(FILES kernel/thc_encoding.h DESTINATION "${ENCODING_INSTALL_INCLUDE_SUBDIR}/ENCODING")
INSTALL(FILES kernel/generic/encoding_kernel.h DESTINATION "${ENCODING_INSTALL_INCLUDE_SUBDIR}/ENCODING/generic")
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import torch
from torch.autograd import Function
from torch.nn.modules.module import Module
from ._ext import encoding_lib


class aggregate(Function):
    def forward(self, A, R):
        # A \in (BxNxK), R \in (BxNxKxD) => E \in (BxKxD)
        B, N, K, D = R.size()
        E = A.new(B, K, D)
        # TODO support cpu backend
        encoding_lib.Encoding_Float_aggregate_forward(E, A, R)
        return E

    def backward(self, E):
        # TODO FIXME this is test only
        return E


class Aggregate(Module):
    def forward(self, A, R):
        return aggregate()(A, R)
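
`backward` above is a placeholder (the TODO marks it test-only). For reference, since E[b][k][d] = Σ_i A[b][i][k]·R[b][i][k][d], the true gradients are gradA[b][i][k] = Σ_d gradE[b][k][d]·R[b][i][k][d] and gradR[b][i][k][d] = A[b][i][k]·gradE[b][k][d]; a sketch using `einsum`, not the repo's implementation:

```python
import torch

def aggregate_backward_reference(gradE, A, R):
    """Gradients of E[b,k,d] = sum_i A[b,i,k] * R[b,i,k,d]."""
    gradA = torch.einsum('bkd,bikd->bik', gradE, R)  # B x N x K
    gradR = torch.einsum('bik,bkd->bikd', A, gradE)  # B x N x K x D
    return gradA, gradR
```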
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Custom CMake rules for PyTorch (a hacky way)
FILE(GLOB TORCH_LIB_HINTS
  "/anaconda/lib/python3.6/site-packages/torch/lib"
  "/anaconda2/lib/python3.6/site-packages/torch/lib"
  "$ENV{HOME}/anaconda/lib/python2.7/site-packages/torch/lib"
  "$ENV{HOME}/anaconda2/lib/python2.7/site-packages/torch/lib"
)
FIND_PATH(TORCH_BUILD_DIR
NAMES "THNN.h"
PATHS "${TORCH_LIB_HINTS}"
)
MESSAGE(STATUS "TORCH_BUILD_DIR: " ${TORCH_BUILD_DIR})
# Find the include files
SET(TORCH_TH_INCLUDE_DIR "${TORCH_BUILD_DIR}/include/TH")
SET(TORCH_THC_INCLUDE_DIR "${TORCH_BUILD_DIR}/include/THC")
SET(TORCH_THC_UTILS_INCLUDE_DIR "$ENV{HOME}/pytorch/torch/lib/THC")
SET(Torch_INSTALL_INCLUDE "${TORCH_BUILD_DIR}/include" ${TORCH_TH_INCLUDE_DIR} ${TORCH_THC_INCLUDE_DIR} ${TORCH_THC_UTILS_INCLUDE_DIR})
# Find the libs. We need to find libraries one by one.
FIND_LIBRARY(THC_LIBRARIES NAMES THC THC.1 PATHS ${TORCH_BUILD_DIR} PATH_SUFFIXES lib)
FIND_LIBRARY(TH_LIBRARIES NAMES TH TH.1 PATHS ${TORCH_BUILD_DIR} PATH_SUFFIXES lib)
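
The glob hints above only cover a few Anaconda layouts. From Python, the same directory can be found directly from the installed package, which is what `build.py` does with `os.path.dirname(torch.__file__)`; a sketch:

```python
import os
import torch

# The directory FindTorch.cmake is searching for (libTH/libTHC plus the
# include/ tree) lives inside the installed torch package itself.
torch_build_dir = os.path.join(os.path.dirname(torch.__file__), 'lib')
print(torch_build_dir)
```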
# Synopsis:
# CUDA_SELECT_NVCC_ARCH_FLAGS(out_variable [target_CUDA_architectures])
# -- Selects GPU arch flags for nvcc based on target_CUDA_architectures
# target_CUDA_architectures : Auto | Common | All | LIST(ARCH_AND_PTX ...)
# - "Auto" detects local machine GPU compute arch at runtime.
# - "Common" and "All" cover common and entire subsets of architectures
# ARCH_AND_PTX : NAME | NUM.NUM | NUM.NUM(NUM.NUM) | NUM.NUM+PTX
# NAME: Fermi Kepler Maxwell Kepler+Tegra Kepler+Tesla Maxwell+Tegra Pascal
# NUM: Any number. Only those pairs are currently accepted by NVCC though:
# 2.0 2.1 3.0 3.2 3.5 3.7 5.0 5.2 5.3 6.0 6.2
# Returns LIST of flags to be added to CUDA_NVCC_FLAGS in ${out_variable}
# Additionally, sets ${out_variable}_readable to the resulting numeric list
# Example:
# CUDA_SELECT_NVCC_ARCH_FLAGS(ARCH_FLAGS 3.0 3.5+PTX 5.2(5.0) Maxwell)
# LIST(APPEND CUDA_NVCC_FLAGS ${ARCH_FLAGS})
#
# More info on CUDA architectures: https://en.wikipedia.org/wiki/CUDA
#
# This list will be used for CUDA_ARCH_NAME = All option
set(CUDA_KNOWN_GPU_ARCHITECTURES "Fermi" "Kepler" "Maxwell")
# This list will be used for CUDA_ARCH_NAME = Common option (enabled by default)
set(CUDA_COMMON_GPU_ARCHITECTURES "3.0" "3.5" "5.0")
if (CUDA_VERSION VERSION_GREATER "6.5")
list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Kepler+Tegra" "Kepler+Tesla" "Maxwell+Tegra")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "5.2")
endif ()
if (CUDA_VERSION VERSION_GREATER "7.5")
list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Pascal")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "6.0" "6.1" "6.1+PTX")
else()
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "5.2+PTX")
endif ()
################################################################################################
# A function for automatic detection of GPUs installed (if autodetection is enabled)
# Usage:
# CUDA_DETECT_INSTALLED_GPUS(OUT_VARIABLE)
#
function(CUDA_DETECT_INSTALLED_GPUS OUT_VARIABLE)
if(NOT CUDA_GPU_DETECT_OUTPUT)
set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)
file(WRITE ${cufile} ""
"#include <cstdio>\n"
"int main()\n"
"{\n"
" int count = 0;\n"
" if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
" if (count == 0) return -1;\n"
" for (int device = 0; device < count; ++device)\n"
" {\n"
" cudaDeviceProp prop;\n"
" if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
" std::printf(\"%d.%d \", prop.major, prop.minor);\n"
" }\n"
" return 0;\n"
"}\n")
execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "--run" "${cufile}"
"-ccbin" ${CMAKE_CXX_COMPILER}
WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(nvcc_res EQUAL 0)
# only keep the last line of nvcc_out
string(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}")
string(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}")
list(GET nvcc_out -1 nvcc_out)
string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}")
set(CUDA_GPU_DETECT_OUTPUT ${nvcc_out} CACHE INTERNAL "Returned GPU architectures from detect_gpus tool" FORCE)
endif()
endif()
if(NOT CUDA_GPU_DETECT_OUTPUT)
message(STATUS "Automatic GPU detection failed. Building for common architectures.")
set(${OUT_VARIABLE} ${CUDA_COMMON_GPU_ARCHITECTURES} PARENT_SCOPE)
else()
set(${OUT_VARIABLE} ${CUDA_GPU_DETECT_OUTPUT} PARENT_SCOPE)
endif()
endfunction()
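
When PyTorch itself is available, the same probe can be done without compiling `detect_cuda_archs.cu`; a sketch using `torch.cuda.get_device_capability` (current PyTorch API, not used by this build):

```python
import torch

# Print "major.minor" per visible GPU, matching the nvcc probe's output.
if torch.cuda.is_available():
    caps = ['%d.%d' % torch.cuda.get_device_capability(d)
            for d in range(torch.cuda.device_count())]
    print(' '.join(caps))
else:
    print('no GPU detected; build for the common architectures instead')
```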
################################################################################################
# Function for selecting GPU arch flags for nvcc based on CUDA architectures from parameter list
# Usage:
# CUDA_SELECT_NVCC_ARCH_FLAGS(out_variable [list of CUDA compute archs])
function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable)
set(CUDA_ARCH_LIST "${ARGN}")
if("X${CUDA_ARCH_LIST}" STREQUAL "X" )
set(CUDA_ARCH_LIST "Auto")
endif()
set(cuda_arch_bin)
set(cuda_arch_ptx)
if("${CUDA_ARCH_LIST}" STREQUAL "All")
set(CUDA_ARCH_LIST ${CUDA_KNOWN_GPU_ARCHITECTURES})
elseif("${CUDA_ARCH_LIST}" STREQUAL "Common")
set(CUDA_ARCH_LIST ${CUDA_COMMON_GPU_ARCHITECTURES})
elseif("${CUDA_ARCH_LIST}" STREQUAL "Auto")
CUDA_DETECT_INSTALLED_GPUS(CUDA_ARCH_LIST)
message(STATUS "Autodetected CUDA architecture(s): ${CUDA_ARCH_LIST}")
endif()
# Now process the list and look for names
string(REGEX REPLACE "[ \t]+" ";" CUDA_ARCH_LIST "${CUDA_ARCH_LIST}")
list(REMOVE_DUPLICATES CUDA_ARCH_LIST)
foreach(arch_name ${CUDA_ARCH_LIST})
set(arch_bin)
set(add_ptx FALSE)
# Check to see if we are compiling PTX
if(arch_name MATCHES "(.*)\\+PTX$")
set(add_ptx TRUE)
set(arch_name ${CMAKE_MATCH_1})
endif()
if(arch_name MATCHES "(^[0-9]\\.[0-9](\\([0-9]\\.[0-9]\\))?)$")
set(arch_bin ${CMAKE_MATCH_1})
set(arch_ptx ${arch_bin})
else()
# Look for it in our list of known architectures
if(${arch_name} STREQUAL "Fermi")
set(arch_bin "2.0 2.1(2.0)")
elseif(${arch_name} STREQUAL "Kepler+Tegra")
set(arch_bin 3.2)
elseif(${arch_name} STREQUAL "Kepler+Tesla")
set(arch_bin 3.7)
elseif(${arch_name} STREQUAL "Kepler")
set(arch_bin 3.0 3.5)
set(arch_ptx 3.5)
elseif(${arch_name} STREQUAL "Maxwell+Tegra")
set(arch_bin 5.3)
elseif(${arch_name} STREQUAL "Maxwell")
set(arch_bin 5.0 5.2)
set(arch_ptx 5.2)
elseif(${arch_name} STREQUAL "Pascal")
set(arch_bin 6.0 6.1)
set(arch_ptx 6.1)
else()
message(SEND_ERROR "Unknown CUDA Architecture Name ${arch_name} in CUDA_SELECT_NVCC_ARCH_FLAGS")
endif()
endif()
if(NOT arch_bin)
message(SEND_ERROR "arch_bin wasn't set for some reason")
endif()
list(APPEND cuda_arch_bin ${arch_bin})
if(add_ptx)
if (NOT arch_ptx)
set(arch_ptx ${arch_bin})
endif()
list(APPEND cuda_arch_ptx ${arch_ptx})
endif()
endforeach()
# remove dots and convert to lists
string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}")
string(REGEX REPLACE "\\." "" cuda_arch_ptx "${cuda_arch_ptx}")
string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}")
string(REGEX MATCHALL "[0-9]+" cuda_arch_ptx "${cuda_arch_ptx}")
if(cuda_arch_bin)
list(REMOVE_DUPLICATES cuda_arch_bin)
endif()
if(cuda_arch_ptx)
list(REMOVE_DUPLICATES cuda_arch_ptx)
endif()
set(nvcc_flags "")
set(nvcc_archs_readable "")
# Tell NVCC to add binaries for the specified GPUs
foreach(arch ${cuda_arch_bin})
if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
# User explicitly specified ARCH for the concrete CODE
list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1})
else()
# User didn't explicitly specify ARCH for the concrete CODE, we assume ARCH=CODE
list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch})
list(APPEND nvcc_archs_readable sm_${arch})
endif()
endforeach()
# Tell NVCC to add PTX intermediate code for the specified architectures
foreach(arch ${cuda_arch_ptx})
list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch})
list(APPEND nvcc_archs_readable compute_${arch})
endforeach()
string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}")
set(${out_variable} ${nvcc_flags} PARENT_SCOPE)
set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE)
endfunction()
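
To make the expansion above concrete, here is a small Python sketch (illustration only, not part of the build) of how entries like `3.5+PTX` or `5.2(5.0)` become `-gencode` flags:

```python
import re

def nvcc_arch_flags(arch_list):
    """Sketch of CUDA_SELECT_NVCC_ARCH_FLAGS' expansion of NUM.NUM,
    NUM.NUM(NUM.NUM) and NUM.NUM+PTX entries into -gencode flags."""
    flags = []
    for spec in arch_list:
        add_ptx = spec.endswith('+PTX')
        if add_ptx:
            spec = spec[:-len('+PTX')]
        m = re.match(r'^([0-9])\.([0-9])(?:\(([0-9])\.([0-9])\))?$', spec)
        if m is None:
            raise ValueError('unrecognized arch spec: ' + spec)
        code = m.group(1) + m.group(2)                           # SASS target
        arch = (m.group(3) + m.group(4)) if m.group(3) else code  # virtual arch
        flags.append('-gencode arch=compute_%s,code=sm_%s' % (arch, code))
        if add_ptx:
            flags.append('-gencode arch=compute_%s,code=compute_%s' % (code, code))
    return flags

print(nvcc_arch_flags(['3.5', '5.2(5.0)', '6.1+PTX']))
```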
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "kernel/generic/encoding_kernel.c"
#else
template <int Dim>
THCDeviceTensor<float, Dim> devicetensor(THCState *state, THCTensor *t) {
if (!t) {
return THCDeviceTensor<float, Dim>();
}
int inDim = THCTensor_(nDimension)(state, t);
if (inDim == Dim) {
return toDeviceTensor<float, Dim>(state, t);
}
// View in which the last dimensions are collapsed or expanded as needed
THAssert(THCTensor_(isContiguous)(state, t));
int size[Dim];
for (int i = 0; i < Dim || i < inDim; ++i) {
if (i < Dim && i < inDim) {
size[i] = t->size[i];
} else if (i < Dim) {
size[i] = 1;
} else {
size[Dim - 1] *= t->size[i];
}
}
return THCDeviceTensor<float, Dim>(THCTensor_(data)(state, t), size);
}
__global__ void Encoding_(Aggregate_Forward_kernel) (
THCDeviceTensor<real, 3> E,
THCDeviceTensor<real, 3> A,
THCDeviceTensor<real, 4> R)
{
/* declarations of the variables */
int b, k, d, i, N;
real sum;
/* Get the index and channels */
b = blockIdx.z;
d = blockIdx.x * blockDim.x + threadIdx.x;
k = blockIdx.y * blockDim.y + threadIdx.y;
N = A.getSize(1);
/* boundary check for output */
if (d >= E.getSize(2) || k >= E.getSize(1)) return;
sum = 0;
/* main operation */
for(i=0; i<N; i++) {
sum += A[b][i][k].ldg() * R[b][i][k][d].ldg();
}
E[b][k][d] = sum;
}
void Encoding_(Aggregate_Forward)(THCState *state, THCTensor *E_, THCTensor *A_,
THCTensor *R_)
/*
* aggregating the residuals with assignment weights
*/
{
/* Check the GPU index */
THCTensor_(checkGPU)(state, 3, E_, A_, R_);
if (THCTensor_(nDimension)(state, E_) != 3 ||
    THCTensor_(nDimension)(state, A_) != 3 ||
    THCTensor_(nDimension)(state, R_) != 4)
    THError("Encoding: incorrect input dims.");
/* Device tensors */
THCDeviceTensor<real, 3> E = devicetensor<3>(state, E_);
THCDeviceTensor<real, 3> A = devicetensor<3>(state, A_);
THCDeviceTensor<real, 4> R = devicetensor<4>(state, R_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(E.getSize(2)/16+1, E.getSize(1)/16+1,
E.getSize(0));
Encoding_(Aggregate_Forward_kernel)<<<blocks, threads, 0, stream>>>(E, A, R);
THCudaCheck(cudaGetLastError());
}
#endif
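
In tensor terms, the kernel above computes E[b][k][d] = Σ_i A[b][i][k]·R[b][i][k][d], with one thread per (k, d) output element and the grid's z dimension covering the batch. An equivalent one-line reference in plain PyTorch (for checking, not used by the build):

```python
import torch

def aggregate_reference(A, R):
    """E[b,k,d] = sum_i A[b,i,k] * R[b,i,k,d] for A: BxNxK, R: BxNxKxD."""
    return torch.einsum('bik,bikd->bkd', A, R)

B, N, K, D = 2, 8, 4, 16
E = aggregate_reference(torch.randn(B, N, K), torch.randn(B, N, K, D))
print(E.size())  # torch.Size([2, 4, 16])
```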
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/encoding_kernel.h"
#else
void Encoding_(Aggregate_Forward)(THCState *state, THCTensor *E_, THCTensor *A_,
THCTensor *R_);
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#include <THC/THC.h>
#include "THCDeviceTensor.cuh"
#include "THCDeviceTensorUtils.cuh"
// this symbol will be resolved automatically from PyTorch libs
extern THCState *state;
//#define torch_(NAME) TH_CONCAT_3(torch_, Real, NAME)
//#define torch_Tensor TH_CONCAT_STRING_3(torch., Real, Tensor)
#define Encoding_(NAME) TH_CONCAT_4(Encoding_, Real, _, NAME)
#define THCTensor TH_CONCAT_3(TH,CReal,Tensor)
#define THCTensor_(NAME) TH_CONCAT_4(TH,CReal,Tensor_,NAME)
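/* With the Float instantiation below (Real=Float, CReal=Cuda from
 * THCGenerateFloatType.h), these expand to e.g.
 * Encoding_(Aggregate_Forward) -> Encoding_Float_Aggregate_Forward and
 * THCTensor_(nDimension)       -> THCudaTensor_nDimension. */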
#include "generic/encoding_kernel.c"
#include "THC/THCGenerateFloatType.h"
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#include <THC/THC.h>
#include "THCDeviceTensor.cuh"
#include "THCDeviceTensorUtils.cuh"
// this symbol will be resolved automatically from PyTorch libs
extern THCState *state;
//#define torch_(NAME) TH_CONCAT_3(torch_, Real, NAME)
//#define torch_Tensor TH_CONCAT_STRING_3(torch., Real, Tensor)
#define Encoding_(NAME) TH_CONCAT_4(Encoding_, Real, _, NAME)
#define THCTensor TH_CONCAT_3(TH,CReal,Tensor)
#define THCTensor_(NAME) TH_CONCAT_4(TH,CReal,Tensor_,NAME)
#include "generic/encoding_kernel.h"
#include "THC/THCGenerateFloatType.h"
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#include <THC/THC.h>
#include "thc_encoding.h"
extern THCState *state;
#include "generic/encoding_generic.c"
#include "THC/THCGenerateFloatType.h"
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
//#include <THC/THC.h>
/*
#define Encoding_(NAME) TH_CONCAT_4(Encoding_, Real, _, NAME)
#define THCTensor TH_CONCAT_3(TH,CReal,Tensor)
#define THCTensor_(NAME) TH_CONCAT_4(TH,CReal,Tensor_,NAME)
#include "generic/encoding_generic.h"
#include "THC/THCGenerateFloatType.h"
*/
int Encoding_Float_aggregate_forward(THCudaTensor *E, THCudaTensor *A,
THCudaTensor *R);
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/encoding_generic.c"
#else
int Encoding_Float_aggregate_forward(THCudaTensor *E, THCudaTensor *A,
THCudaTensor *R)
/*
* Aggregate operation
*/
{
if (THCTensor_(nDimension)(state, E) != 3 ||
    THCTensor_(nDimension)(state, A) != 3 ||
    THCTensor_(nDimension)(state, R) != 4)
    THError("Encoding: incorrect input dims.");
Encoding_(Aggregate_Forward)(state, E, A, R);
/* C function return number of the outputs */
return 0;
}
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/encoding_generic.h"
#else
int Encoding_(aggregate_forward)(THCudaTensor *E, THCudaTensor *A,
THCudaTensor *R);
#endif
#!/usr/bin/env bash
# Build the CUDA library with CMake, then install the Python package
# from the repository root (setup.py lives there, not in encoding/).
cd encoding/
mkdir -p build && cd build
cmake ..
make install
cd ../..
python setup.py install
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import os
from setuptools import setup, find_packages

this_file = os.path.dirname(__file__)

setup(
    name="encoding",
    version="0.0.1",
    description="PyTorch Encoding Layer",
    url="https://github.com/zhanghang1989/PyTorch-Encoding-Layer",
    author="Hang Zhang",
    author_email="zhang.hang@rutgers.edu",
    # Require cffi.
    install_requires=["cffi>=1.0.0"],
    setup_requires=["cffi>=1.0.0"],
    # Exclude the build files.
    packages=find_packages(exclude=["build"]),
    # Package where to put the extensions. Has to be a prefix of build.py.
    ext_package="",
    # Extensions to compile.
    cffi_modules=[
        os.path.join(this_file, "build.py:ffi")
    ],
)