init

55ada7af · dugupeiwen · 55ada7af · 55ada7af · 55ada7af · 55ada7af
Commit 55ada7af authored Mar 17, 2024 by dugupeiwen
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
+#
+# Copyright (c) 2016 , Continuum Analytics, Inc.
+# All rights reserved.
+#
+
+# Require cmake 2.8.12+
+cmake_minimum_required (VERSION 2.8.12)
+
+# rocmlite targets Linux only: abort configuration on macOS and Windows.
+if(WIN32 OR APPLE)
+    message(FATAL_ERROR "rocmlite can only be built on linux.")
+endif()
+
+# Root of the conda environment; its lib/ directory is added to the linker
+# search path further down. option() only models booleans (an unset option
+# evaluates to OFF, which would turn into the path "OFF/lib"), so a PATH
+# cache entry is used instead. A value given on the command line with
+# -DCMAKE_CONDA_ROOT:PATH=... is already in the cache and is left untouched.
+set(CMAKE_CONDA_ROOT "" CACHE PATH "Root of the conda environment to link against")
+set(CONDA_ROOT "${CMAKE_CONDA_ROOT}")
+message(STATUS "CONDA_ROOT = ${CONDA_ROOT}")
+
+# Location of the bitcode resources, supplied by the caller with
+# -DCMAKE_BITCODE_ROOT:PATH=...; empty when not given.
+set(CMAKE_BITCODE_ROOT "" CACHE PATH "Directory containing the bitcode files")
+set(BITCODE_ROOT "${CMAKE_BITCODE_ROOT}")
+message(STATUS "BITCODE_ROOT = ${BITCODE_ROOT}")
+
+
+# project name
+project (rocmlite)
+
+
+# The version number
+set (librocmlite_VERSION_MAJOR 0)
+set (librocmlite_VERSION_MINOR 1)
+set (librocmlite_VERSION ${librocmlite_VERSION_MAJOR}.${librocmlite_VERSION_MINOR})
+set (librocmlite_SOVERSION 0.1.0) # the .soversion of the shared lib.
+
+enable_language(CXX)
+
+
+# The CXX_STANDARD family of properties only exists from CMake 3.1.0 on and
+# is not available by default on older linux, so probe the compiler flag
+# directly (most likely gcc) and bail out early when it is rejected.
+include(CheckCXXCompilerFlag)
+check_cxx_compiler_flag(-std=c++11 CXX_HAS_CXX11)
+if(NOT CXX_HAS_CXX11)
+    message(FATAL_ERROR "Compiler must support C++11")
+endif()
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -fno-rtti -Wall -std=c++11")
+
+# Locate LLVM through its LLVMConfig.cmake package file (CONFIG mode);
+# configuration stops here if it cannot be found.
+find_package(LLVM REQUIRED CONFIG)
+
+# Reject LLVM releases older than 6.0.
+if(LLVM_PACKAGE_VERSION VERSION_LESS 6.0)
+    message(FATAL_ERROR "llvm version must be 6.0+")
+endif()
+
+message(STATUS "Found LLVM version: ${LLVM_PACKAGE_VERSION}")
+message(STATUS "Using LLVMConfig.cmake found in: ${LLVM_DIR}")
+
+# Directory-scoped: LLVM's include paths and compile definitions apply to
+# every target declared from here on, including those in subdirectories.
+include_directories(${LLVM_INCLUDE_DIRS})
+add_definitions(${LLVM_DEFINITIONS})
+
+# conda root: only extend the linker search path when a root was actually
+# supplied; an empty or false value would otherwise produce a bogus
+# "<value>/lib" search directory.
+if(CONDA_ROOT)
+    link_directories("${CONDA_ROOT}/lib")
+endif()
+
+# turn on testing
+enable_testing()
+
+# rocmlite code
+add_subdirectory(rocmlite)
+
+message(
+"
+------------------------------------
+|           Build Summary          |
+------------------------------------
+Building...........: ${CMAKE_PROJECT_NAME} version ${librocmlite_VERSION}
+LLVM version.......: ${LLVM_PACKAGE_VERSION}
+LLVM location......: ${LLVM_DIR}
+C++ Compiler.......: ${CMAKE_CXX_COMPILER}
+C++ Flags..........: ${CMAKE_CXX_FLAGS}
+")
--- a/LICENSE
+++ b/LICENSE
+BSD 2-Clause License
+
+Copyright (c) 2018, Numba
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/README.md
+++ b/README.md
+# roctools
+
+This repository acts as a collection point for the resources needed to produce 
+a conda package containing all the components necessary to use Numba with AMD 
+GCN discrete GPUs. Included are:
+
+ * Source code and build tooling for a library, `librocmlite`, which performs 
+the same function for AMD GPUs as 
+[`llvmlite`](https://github.com/numba/llvmlite) does for CPUs. It essentially 
+acts as a shim between Python and LLVM. For convenience, `librocmlite` is 
+statically linked against releases from AMD's [LLVM 
+fork](https://github.com/RadeonOpenCompute/llvm), therefore there is no LLVM 
+dependency.
+
+ * A conda recipe (`llvmdev_amdgcn`) for building the aforementioned fork of 
+LLVM to bootstrap the `roctools` package.
+ * A conda recipe (`roctools`) that:
+
+    * Builds and tests `librocmlite`
+    * Extracts necessary math (and other) library bitcodes from AMD's `rpm` 
+based releases.
+    * Extracts necessary binaries from a build of AMD's LLVM fork (as a conda 
+package).
+
+    It is this package upon which Numba depends.
+
+------------------------
+
+## Conda build instructions
+
+1. Build the AMD LLVM fork package (this will take a while):
+
+    ```
+    $ conda build conda-recipes/llvmdev_amdgcn
+    ```
+
+    Upon successful completion a package called `llvmdev_amdgcn-{version}` will 
+be produced. This package is needed to bootstrap the build of `librocmlite` and 
+also to provide some binary tools used in the AMD GCN tool chain.
+
+2. Build the roctools package:
+
+    ```
+    $ conda build conda-recipes/roctools
+    ```
+
+    Upon successful completion a package called `roctools-{version}` will 
+be produced. This package is self-contained and holds all the necessary 
+components for using AMD GCN GPUs.
+
+------------------------
+## License
+See [LICENSE](https://github.com/numba/roctools/blob/master/LICENSE).
--- a/conda-recipes/llvmdev_amdgcn/0001-Transforms-Add-missing-header-for-InstructionCombini.patch
+++ b/conda-recipes/llvmdev_amdgcn/0001-Transforms-Add-missing-header-for-InstructionCombini.patch
+From 7c9054610e354340f9474dcd13a927f929912d1d Mon Sep 17 00:00:00 2001
+From: Eugene Zelenko <eugene.zelenko@gmail.com>
+Date: Tue, 6 Mar 2018 23:06:13 +0000
+Subject: [PATCH] [Transforms] Add missing header for InstructionCombining.cpp,
+ in order to export LLVMInitializeInstCombine as extern "C". Fixes PR35947.
+
+Patch by Brenton Bostick.
+
+Differential revision: https://reviews.llvm.org/D44140
+
+
+git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@326843 91177308-0d34-0410-b5e6-96231b3b80d8
+---
+ lib/Transforms/InstCombine/InstructionCombining.cpp | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
+index a3b2fe9..7ec7343 100644
+--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
+@@ -34,6 +34,7 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "InstCombineInternal.h"
+#include "llvm-c/Initialization.h"
+ #include "llvm/ADT/APInt.h"
+ #include "llvm/ADT/ArrayRef.h"
+ #include "llvm/ADT/DenseMap.h"
+-- 
+1.8.3.1
+
--- a/conda-recipes/llvmdev_amdgcn/bld.bat
+++ b/conda-recipes/llvmdev_amdgcn/bld.bat
+mkdir build
+cd build
+
+set BUILD_CONFIG=Release
+
+REM Configure step
+if "%ARCH%"=="32" (
+    set CMAKE_GENERATOR=Visual Studio 14 2015
+) else (
+    set CMAKE_GENERATOR=Visual Studio 14 2015 Win64
+)
+set CMAKE_GENERATOR_TOOLSET=v140_xp
+
+REM Reduce build times and package size by removing unused stuff
+set CMAKE_CUSTOM=-DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_INCLUDE_TESTS=OFF ^
+    -DLLVM_INCLUDE_UTILS=OFF -DLLVM_INCLUDE_DOCS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF ^
+    -DLLVM_ENABLE_ASSERTIONS=ON
+
+cmake -G "%CMAKE_GENERATOR%" -T "%CMAKE_GENERATOR_TOOLSET%" ^
+    -DCMAKE_BUILD_TYPE="%BUILD_CONFIG%" -DCMAKE_PREFIX_PATH=%LIBRARY_PREFIX% ^
+    -DCMAKE_INSTALL_PREFIX:PATH=%LIBRARY_PREFIX% %CMAKE_CUSTOM% %SRC_DIR%
+if errorlevel 1 exit 1
+
+REM Build step
+cmake --build . --config "%BUILD_CONFIG%"
+if errorlevel 1 exit 1
+
+REM Install step
+cmake --build . --config "%BUILD_CONFIG%" --target install
+if errorlevel 1 exit 1
--- a/conda-recipes/llvmdev_amdgcn/build.sh
+++ b/conda-recipes/llvmdev_amdgcn/build.sh
+#!/bin/bash
+
+# based on https://github.com/AnacondaRecipes/llvmdev-feedstock/blob/master/recipe/build.sh
+
+set -x
+
+# This is the clang compiler prefix
+DARWIN_TARGET=x86_64-apple-darwin13.4.0
+
+
+declare -a _cmake_config
+_cmake_config+=(-DCMAKE_INSTALL_PREFIX:PATH=${PREFIX})
+_cmake_config+=(-DCMAKE_BUILD_TYPE:STRING=Release)
+# The bootstrap clang I use was built with a static libLLVMObject.a and I am trying to get the same here
+# _cmake_config+=(-DBUILD_SHARED_LIBS:BOOL=ON)
+_cmake_config+=(-DLLVM_ENABLE_ASSERTIONS:BOOL=ON)
+_cmake_config+=(-DLINK_POLLY_INTO_TOOLS:BOOL=ON)
+# Don't really require libxml2. Turn it off explicitly to avoid accidentally linking to system libs
+_cmake_config+=(-DLLVM_ENABLE_LIBXML2:BOOL=OFF)
+# Urgh, llvm *really* wants to link to ncurses / terminfo and we *really* do not want it to.
+_cmake_config+=(-DHAVE_TERMINFO_CURSES=OFF)
+# Sometimes these are reported as unused. Whatever.
+_cmake_config+=(-DHAVE_TERMINFO_NCURSES=OFF)
+_cmake_config+=(-DHAVE_TERMINFO_NCURSESW=OFF)
+_cmake_config+=(-DHAVE_TERMINFO_TERMINFO=OFF)
+_cmake_config+=(-DHAVE_TERMINFO_TINFO=OFF)
+_cmake_config+=(-DHAVE_TERMIOS_H=OFF)
+_cmake_config+=(-DCLANG_ENABLE_LIBXML=OFF)
+_cmake_config+=(-DLIBOMP_INSTALL_ALIASES=OFF)
+_cmake_config+=(-DLLVM_ENABLE_RTTI=OFF)
+_cmake_config+=(-DLLVM_TARGETS_TO_BUILD="AMDGPU;X86")
+# TODO :: It would be nice if we had a cross-ecosystem 'BUILD_TIME_LIMITED' env var we could use to
+#         disable these unnecessary but useful things.
+if [[ ${CONDA_FORGE} == yes ]]; then
+  _cmake_config+=(-DLLVM_INCLUDE_TESTS=OFF)
+  _cmake_config+=(-DLLVM_INCLUDE_UTILS=OFF)
+  _cmake_config+=(-DLLVM_INCLUDE_DOCS=OFF)
+  _cmake_config+=(-DLLVM_INCLUDE_EXAMPLES=OFF)
+fi
+# Only valid when using the Ninja Generator AFAICT
+# _cmake_config+=(-DLLVM_PARALLEL_LINK_JOBS:STRING=1)
+# What about cross-compiling targeting Darwin here? Are any of these needed?
+if [[ $(uname) == Darwin ]]; then
+  _cmake_config+=(-DCMAKE_OSX_SYSROOT=${SYSROOT_DIR})
+  _cmake_config+=(-DDARWIN_macosx_CACHED_SYSROOT=${SYSROOT_DIR})
+  _cmake_config+=(-DCMAKE_OSX_DEPLOYMENT_TARGET=${MACOSX_DEPLOYMENT_TARGET})
+  _cmake_config+=(-DCMAKE_LIBTOOL=$(which ${DARWIN_TARGET}-libtool))
+  _cmake_config+=(-DLD64_EXECUTABLE=$(which ${DARWIN_TARGET}-ld))
+  _cmake_config+=(-DCMAKE_INSTALL_NAME_TOOL=$(which ${DARWIN_TARGET}-install_name_tool))
+  # Once we are using our libc++ (not until llvm_build_final), it will be single-arch only and not setting
+  # this causes link failures building the sanitizers since they respect DARWIN_osx_ARCHS. We may as well
+  # save some compilation time by setting this for all of our llvm builds.
+  _cmake_config+=(-DDARWIN_osx_ARCHS=x86_64)
+#elif [[ $(uname) == Linux ]]; then
+#  _cmake_config+=(-DLLVM_BINUTILS_INCDIR=${PREFIX}/lib/gcc/${cpu_arch}-${vendor}-linux-gnu/${compiler_ver}/plugin/include)
+fi
+
+# For when the going gets tough:
+# _cmake_config+=(-Wdev)
+# _cmake_config+=(--debug-output)
+# _cmake_config+=(--trace-expand)
+# CPU_COUNT=1
+
+mkdir build
+cd build
+
+cmake -G'Unix Makefiles'     \
+      "${_cmake_config[@]}"  \
+      ..
+
+make -j${CPU_COUNT} VERBOSE=1
+make install
--- a/conda-recipes/llvmdev_amdgcn/cfg_test.ll
+++ b/conda-recipes/llvmdev_amdgcn/cfg_test.ll
+; ModuleID = 'foo'
+source_filename = "<string>"
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-darwin17.4.0"
+
+@.const.foo = internal constant [4 x i8] c"foo\00"
+@".const.Fatal error: missing _dynfunc.Closure" = internal constant [38 x i8] c"Fatal error: missing _dynfunc.Closure\00"
+@PyExc_RuntimeError = external global i8
+@".const.missing Environment" = internal constant [20 x i8] c"missing Environment\00"
+
+; Function Attrs: norecurse nounwind
+declare i32 @"_ZN8__main__7foo$241Ex"(i64* noalias nocapture %retptr, { i8*, i32 }** noalias nocapture readnone %excinfo, i8* noalias nocapture readnone %env, i64 %arg.x) local_unnamed_addr #0
+
+
+define i8* @"testme"(i8* %py_closure, i8* %py_args, i8* nocapture readnone %py_kws) local_unnamed_addr {
+entry:
+  %.5 = alloca i8*, align 8
+  %.6 = call i32 (i8*, i8*, i64, i64, ...) @PyArg_UnpackTuple(i8* %py_args, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.const.foo, i64 0, i64 0), i64 1, i64 1, i8** nonnull %.5)
+  %.7 = icmp eq i32 %.6, 0
+  br i1 %.7, label %entry_if, label %entry_endif, !prof !0
+
+entry_if:                                         ; preds = %entry.endif.1.1.endif, %entry
+  ret i8* null
+
+entry_endif:                                      ; preds = %entry
+  %.11 = icmp eq i8* %py_closure, null
+  ret i8* null
+
+}
+
+declare i32 @PyArg_UnpackTuple(i8*, i8*, i64, i64, ...) local_unnamed_addr
+
+; Function Attrs: nounwind
+declare i32 @puts(i8* nocapture readonly) local_unnamed_addr #1
+
+declare void @PyErr_SetString(i8*, i8*) local_unnamed_addr
+
+declare i8* @PyNumber_Long(i8*) local_unnamed_addr
+
+declare i64 @PyLong_AsLongLong(i8*) local_unnamed_addr
+
+declare void @Py_DecRef(i8*) local_unnamed_addr
+
+declare i8* @PyErr_Occurred() local_unnamed_addr
+
+declare i8* @PyLong_FromLongLong(i64) local_unnamed_addr
+
+; Function Attrs: nounwind
+declare void @llvm.stackprotector(i8*, i8**) #1
+
+attributes #0 = { norecurse nounwind }
+attributes #1 = { nounwind }
+
+!0 = !{!"branch_weights", i32 1, i32 9}
+!1 = !{!"branch_weights", i32 9, i32 1}
+
--- a/conda-recipes/llvmdev_amdgcn/meta.yaml
+++ b/conda-recipes/llvmdev_amdgcn/meta.yaml
+{% set shortversion = "roc-1.8" %}
+{% set version = "roc-1.8.x" %}
+{% set conda_version = "roc_1.8.x" %}
+{% set build_number = "0" %}
+
+package:
+  name: llvmdev_amdgcn
+  version: {{ conda_version }}
+
+source:
+  - git_url: https://github.com/RadeonOpenCompute/llvm.git
+    git_tag: {{ version }}
+    patches:
+        # undefined behavior bug due to Twine usage
+        - twine_cfg_undefined_behavior.patch
+  - git_url: https://github.com/RadeonOpenCompute/lld.git
+    git_tag: {{ version }}
+    folder: tools/lld
+
+build:
+  number: {{ build_number }}
+  script_env:
+    - PY_VCRUNTIME_REDIST
+  ignore_run_exports:
+    # Is static-linked
+    - xar
+
+requirements:
+  build:
+    # We cannot do this on macOS or windows
+    # OSX already has llvm so has to be handled
+    # at build.sh time
+    # Windows needs to build using vs2015_runtime
+    # irrespective of python version
+    - {{ compiler('c') }} # [unix]
+    - {{ compiler('cxx') }} # [unix]
+    - cmake
+    # Needed to unpack the source tarball
+    - m2w64-xz  # [py27 and win]
+    # ninja not currently used, bld.bat needs an update
+    - ninja  # [win]
+    # Needed to build LLVM
+    - python
+    # need vs2015_runtime to build, do not want it at run time
+    # as extensions for py27 need vs2008
+    - vs2015_runtime # [win]
+    - make # [unix]
+  host:
+    # needed for llc at runtime
+    - zlib # [not win]
+    - xar # [osx]
+
+test:
+  requires:
+    - python
+  files:
+    - cfg_test.ll
+    - test_cfg_dot.py
+  commands:
+    - $PREFIX/bin/llvm-config --libs                         # [not win]
+    - $PREFIX/bin/llc -version                               # [not win]
+
+    - if not exist %LIBRARY_INC%\\llvm\\Pass.h exit 1        # [win]
+    - if not exist %LIBRARY_LIB%\\LLVMSupport.lib exit 1     # [win]
+
+    - test -f $PREFIX/include/llvm/Pass.h                    # [unix]
+    - test -f $PREFIX/lib/libLLVMSupport.a                   # [unix]
+
+    - test -f $PREFIX/lib/libLLVMCore.a                      # [not win]
+    # Test for ../twine_cfg_undefined_behavior.patch
+    - $PREFIX/bin/opt -dot-cfg cfg_test.ll                   # [not win]
+    - python test_cfg_dot.py                                 # [not win]
+
+about:
+  home: http://llvm.org/
+  dev_url: https://github.com/llvm-mirror/llvm
+  license: NCSA
+  license_file: LICENSE.TXT
+  summary: Development headers and libraries for LLVM
--- a/conda-recipes/llvmdev_amdgcn/test_cfg_dot.py
+++ b/conda-recipes/llvmdev_amdgcn/test_cfg_dot.py
+with open("cfg.testme.dot") as fin:
+    got = fin.read()
+
+assert '[label="W:1"]' in got
+assert '[label="W:9"]' in got
--- a/conda-recipes/llvmdev_amdgcn/twine_cfg_undefined_behavior.patch
+++ b/conda-recipes/llvmdev_amdgcn/twine_cfg_undefined_behavior.patch
+From b42222e01abc1a799c4e421fa26d72d49afe4b99 Mon Sep 17 00:00:00 2001
+From: Siu Kwan Lam <michael.lam.sk@gmail.com>
+Date: Fri, 23 Mar 2018 11:46:45 -0500
+Subject: [PATCH] Patch to fix undefined behavior in cfgprinter
+
+---
+ include/llvm/Analysis/CFGPrinter.h | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+diff --git a/include/llvm/Analysis/CFGPrinter.h b/include/llvm/Analysis/CFGPrinter.h
+index 5786769..a4b642b 100644
+--- a/include/llvm/Analysis/CFGPrinter.h
+++ b/include/llvm/Analysis/CFGPrinter.h
+@@ -172,8 +172,7 @@ struct DOTGraphTraits<const Function*> : public DefaultDOTGraphTraits {
+ 
+     // Prepend a 'W' to indicate that this is a weight rather than the actual
+     // profile count (due to scaling).
+-    Twine Attrs = "label=\"W:" + Twine(Weight->getZExtValue()) + "\"";
+-    return Attrs.str();
+    return ("label=\"W:" + Twine(Weight->getZExtValue()) + "\"").str();
+   }
+ };
+ } // End llvm namespace
+-- 
+2.10.1
+
--- a/conda-recipes/roctools/build.sh
+++ b/conda-recipes/roctools/build.sh
+#!/bin/bash
+
+set -x
+
+###############################################################################
+# Extract bitcodes from ROCM rpm source
+###############################################################################
+
+RPM_PATH=`readlink -f opencl_tmp/*.rpm`
+ROCM_PATH="opt/rocm/opencl/lib/x86_64/bitcode"
+
+declare -a bitcodes=(                       \
+"opencl.amdgcn.bc"                          \
+"ocml.amdgcn.bc"                            \
+"ockl.amdgcn.bc"                            \
+"oclc_correctly_rounded_sqrt_off.amdgcn.bc" \
+"oclc_daz_opt_off.amdgcn.bc"                \
+"oclc_finite_only_off.amdgcn.bc"            \
+"oclc_isa_version_803.amdgcn.bc"            \
+"oclc_unsafe_math_off.amdgcn.bc"            \
+"irif.amdgcn.bc"                            \
+)
+
+for bitcode in "${bitcodes[@]}"; do
+    bsdtar -x -f "$RPM_PATH" --strip-components 6 "$ROCM_PATH/$bitcode"
+done
+
+# move the bitcode to the pkg dir; create the parent directory first, since
+# `mv` fails when share/ does not exist yet in the prefix, and quote the
+# destination so a prefix containing spaces is not word-split.
+RESOURCE_PATH="$PREFIX/share/rocmtools"
+mkdir -p "$PREFIX/share"
+mv bitcode "$RESOURCE_PATH"
+
+###############################################################################
+# Now do C++ library build
+###############################################################################
+CMAKE_BUILD_DIR="cmake_build" # this needs to match meta.yaml test::source_files
+mkdir ${CMAKE_BUILD_DIR}
+pushd ${CMAKE_BUILD_DIR}
+
+printenv
+
+# Force CMake to look in the conda env "CMAKE_CONDA_ROOT" `/lib` etc 
+# for libraries via `-L`
+cmake .. -DCMAKE_BUILD_TYPE=RELEASE \
+         -DCMAKE_CONDA_ROOT:PATH="$BUILD_PREFIX" \
+         -DCMAKE_BITCODE_ROOT:PATH="$RESOURCE_PATH"
+
+# build
+make VERBOSE=1
+
+# move DSO to lib
+cp "rocmlite/librocmlite.so" "$PREFIX/lib"
+
+# test now, splitting this out to work at test time is hard to do
+# the test_XXX binaries are dynamically linked to librocmlite but no rpath
+# fix is made unless the binaries are also shipped (undesirable).
+ctest -V
+
+popd
+
+###############################################################################
+# Copy llvmdev binary tools to /bin
+# NOTE: should these names start to cause collision issues with llvm installs
+# they can be prefixed e.g. amd_opt. However `ld.lld` will need to have a 
+# `-flavour gnu` permanently supplied so it knows that it is emulating the GNU
+# linker variant.
+###############################################################################
+# Binary tools taken from the build prefix and shipped in the package.
+declare -a tools=( \
+"opt"              \
+"llc"              \
+"llvm-link"        \
+"ld.lld"           \
+)
+
+# The destination directory may not exist yet in the prefix; `cp` would
+# fail for every tool in that case.
+mkdir -p "$PREFIX/bin"
+
+for tool in "${tools[@]}"; do
+    cp "$BUILD_PREFIX/bin/$tool" "$PREFIX/bin/$tool"
+done
--- a/conda-recipes/roctools/meta.yaml
+++ b/conda-recipes/roctools/meta.yaml
+{% set opencl_devel_ver="1.2.0-2018053132" %}
+{% set opencl_devel_sha256="95f429a25d7e6081fe1c75bd05feb5e515408b82a1631f09d96abb4232e1af68" %}
+
+package:
+    name: roctools
+    version:  {{ environ.get('GIT_DESCRIBE_TAG', '') }}
+
+source:
+  - path: ../..
+  - fn: rocm-opencl-devel-{{ opencl_devel_ver }}.x86_64.rpm
+    url: http://repo.radeon.com/rocm/yum/rpm/rocm-opencl-devel-{{ opencl_devel_ver }}.x86_64.rpm
+    folder: opencl_tmp
+    sha256: {{ opencl_devel_sha256 }}
+
+build:
+    number: {{ environ.get('GIT_DESCRIBE_NUMBER', 0) }}
+
+requirements:
+  build:
+    - {{ compiler('c') }} # [unix]
+    - {{ compiler('cxx') }} # [unix]
+    - cmake>=2.8
+    - xz
+    - zlib
+    - bzip2
+    - libarchive
+    - llvmdev_amdgcn
+    # gtest is used in the test binaries
+    - gtest
+
+  host:
+    - zlib # for llvm binary tooling that is copied in
+
+test:
+  commands:
+    # The librocmlite.so DSO is tested at compile time
+
+    # Check llvm binaries actually run
+    - opt --help | grep amdgpu
+    - llc --help | grep amdgpu
+    - llvm-link --help
+    - ld.lld --help
+
+about:
+    home: https://github.com/numba/roctools
+    license: BSD
+    summary: A shared library that wraps LLVM for code
+             generation for devices with Radeon Open Compute support.
+    license_file: LICENSE
--- a/include/rocmlite.hh
+++ b/include/rocmlite.hh
+/**
+ * Copyright (c) 2016 , Continuum Analytics, Inc.
+ * All rights reserved.
+ */
+
+#ifndef _ROC_HH
+#define _ROC_HH
+
+#include "llvm/IR/Module.h"
+#include "llvm/Support/FormattedStream.h"
+
+using namespace std;
+
+namespace librocmlite
+{
+
+    // Thin wrapper around a raw llvm::Module pointer. The C entry points
+    // declared in this header pass it around as a ModuleRef*.
+    class ModuleRef
+    {
+        public:
+            // Wraps the given module pointer (stored as-is, not copied).
+            ModuleRef(llvm::Module * module);
+            // True when the wrapped pointer is non-null.
+            operator bool () const;
+            // Returns the wrapped module pointer.
+            llvm::Module * getModule();
+            // Deletes the wrapped module and resets the pointer to null.
+            void destroy();
+            // Returns the module printed into a string.
+            std::string to_string();
+            // Parses LLVM assembly text into a new ModuleRef; returns
+            // nullptr when parsing fails.
+            static ModuleRef* parseAssembly(const char* Asm);
+            // Parses a bitcode buffer of Len bytes into a new ModuleRef;
+            // returns nullptr when parsing fails.
+            static ModuleRef* parseBitcode(const char *Bitcode, size_t Len);
+        private:
+            // The wrapped module; null after destroy().
+            llvm::Module * M;
+    };
+
+    // Initializes the llvm library tooling.
+    void Initialize();
+
+    // Finalizes the llvm library tooling.
+    void Finalize();
+
+    // Optimize a module in place
+    void Optimize(llvm::Module * M, int OptLevel, int SizeLevel, int Verify, const char * Cpu);
+
+    // Compile a module
+    int CompileModule(std::unique_ptr<llvm::Module> mod, llvm::raw_string_ostream &os, bool emitBRIG,
+                      int OptLevel);
+
+}
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif // __cplusplus
+
+using namespace librocmlite;
+
+// ROC_ C/CFFI entry points
+
+void ROC_Initialize();
+
+void ROC_Finalize();
+
+char* ROC_CreateString(const char *str);
+
+void ROC_DisposeString(char *str);
+
+// rename this to ParseIR2Module ?
+ModuleRef* ROC_ParseModule(const char *Asm);
+
+ModuleRef* ROC_ParseBitcode(const char *Asm, size_t Len);
+
+void ROC_ModulePrint(ModuleRef *M, char **output);
+
+void ROC_ModuleDestroy(ModuleRef *M);
+
+int ROC_ModuleOptimize(ModuleRef *M, int OptLevel, int SizeLevel, int Verify, const char * Cpu);
+
+int ROC_ModuleLinkIn(ModuleRef * Dst, ModuleRef * Src);
+
+int ROC_ModuleEmitHSAIL(ModuleRef *M, int OptLevel, const char * Cpu, char **output);
+
+size_t ROC_ModuleEmitBRIG(ModuleRef *M, int OptLevel, const char * Cpu, char **output);
+
+void ROC_SetCommandLineOption(int argc, const char * const * argv);
+
+
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif //ifdef _ROC_HH
--- a/rocmlite/CMakeLists.txt
+++ b/rocmlite/CMakeLists.txt
+#
+# Copyright (c) 2016 , Continuum Analytics, Inc.
+# All rights reserved.
+#
+
+set(LIBROCMLITE_SOURCES rocmlite.cpp)
+
+add_library(
+            rocmlite
+            SHARED
+            ${LIBROCMLITE_SOURCES}
+            )
+
+# llvm components needed (will get mapped to libs for linking)
+# See `llvm-config --components` for a list of available components.
+llvm_map_components_to_libnames(
+llvm_libs # this name is magic, it is the variable in which the
+          # component linker info is stored.
+#all
+amdgpuasmparser
+amdgpuasmprinter
+amdgpucodegen
+amdgpudesc
+amdgpudisassembler
+amdgpuinfo
+amdgpuutils
+coroutines
+objcarcopts
+native
+core
+)
+
+
+#Link against LLVM libraries
+target_link_libraries(rocmlite ${llvm_libs})
+
+# include include/
+# PROJECT_SOURCE_DIR keeps this pointing at this project's own root even
+# when the tree is pulled into a larger build via add_subdirectory(), where
+# CMAKE_SOURCE_DIR would refer to the outermost project instead.
+target_include_directories(rocmlite PUBLIC ${PROJECT_SOURCE_DIR}/include)
+
+
+# set library properties
+set_target_properties(rocmlite PROPERTIES
+                     VERSION   ${librocmlite_VERSION}
+                     SOVERSION ${librocmlite_SOVERSION})
+
+
+# Add in test dir
+add_subdirectory(test)
+
--- a/rocmlite/rocmlite.cpp
+++ b/rocmlite/rocmlite.cpp
+#include "rocmlite.hh"
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/RegionPass.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/AsmParser/Parser.h"
+#include "llvm/Bitcode/BitcodeWriterPass.h"
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/CodeGen/CommandFlags.def"
+#include "llvm/CodeGen/LinkAllAsmWriterComponents.h"
+#include "llvm/CodeGen/LinkAllCodegenComponents.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MIRParser/MIRParser.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/IR/AutoUpgrade.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DiagnosticPrinter.h"
+#include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/LegacyPassNameParser.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/LinkAllIR.h"
+#include "llvm/LinkAllPasses.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/PluginLoader.h"
+#include "llvm/Support/PrettyStackTrace.h"
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/SystemUtils.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/YAMLTraits.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Coroutines.h"
+#include "llvm/Transforms/IPO/AlwaysInliner.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+#include <iostream>
+
+namespace librocmlite
+{
+
+
+    static llvm::LLVMContext *TheContext = nullptr;
+
+    bool DisableInline = false;
+    bool UnitAtATime = true;
+    bool DisableLoopVectorization = false;
+    bool DisableSLPVectorization = false;
+    bool StripDebug = false;
+    bool DisableOptimizations = false;
+    bool DisableSimplifyLibCalls = false;
+
+    static const std::string MArch = "amdgcn"; // AMD Graphics Core Next
+
+    // ModuleRef impl
+    // Stores the given module pointer as-is; nothing is copied.
+    ModuleRef::ModuleRef(Module * module)
+        : M(module)
+    {
+    }
+
+    // True while a module is held, i.e. the wrapped pointer is non-null.
+    ModuleRef::operator bool () const
+    {
+        const bool holdsModule = (nullptr != M);
+        return holdsModule;
+    }
+
+    // Returns the wrapped module pointer; this is null once destroy() has
+    // been called.
+    Module * ModuleRef::getModule()
+    {
+        return M;
+    }
+
+    // Deletes the wrapped module and nulls the pointer, so a second call is
+    // a harmless `delete nullptr`. The ModuleRef object itself is not freed.
+    void ModuleRef::destroy()
+    {
+        delete M;
+        M = nullptr;
+    }
+
+    // Prints the wrapped module into a string and returns it. The module
+    // pointer is dereferenced without a null check.
+    std::string ModuleRef::to_string()
+    {
+        std::string text;
+        raw_string_ostream stream(text);
+        M->print(stream, nullptr);
+        // str() flushes pending output into `text` before handing it back.
+        return stream.str();
+    }
+
+    // Parses LLVM assembly text in the library-wide context. Returns a new
+    // ModuleRef on success and nullptr when parsing fails; the diagnostic
+    // produced by the parser is discarded.
+    ModuleRef * ModuleRef::parseAssembly(const char* Asm)
+    {
+        SMDiagnostic diag;
+        std::unique_ptr<Module> parsed = parseAssemblyString(Asm, diag, *TheContext);
+        return parsed ? new ModuleRef(parsed.release()) : nullptr;
+    }
+
+    // Parses an in-memory bitcode image in the library-wide context.
+    //
+    //   Bitcode  pointer to the first byte of the bitcode image
+    //   Len      size of the image in bytes
+    //
+    // Returns a new, heap-allocated ModuleRef on success. On failure the
+    // error message is written to stdout and nullptr is returned.
+    ModuleRef * ModuleRef::parseBitcode(const char *Bitcode, size_t Len)
+    {
+        // Consumes an llvm::Error and prints its message. The message is
+        // printed straight from the std::string; duplicating it with
+        // strdup() first would leak the copy.
+        auto reportError = [](Error err)
+        {
+            std::string msg;
+            handleAllErrors(std::move(err), [&](const ErrorInfoBase &eib)
+            {
+                msg = eib.message();
+            });
+            puts(msg.c_str());
+        };
+
+        // The buffer only references the caller's memory; `false` means the
+        // data is not required to be null terminated.
+        auto buf = MemoryBuffer::getMemBuffer(StringRef(Bitcode, Len),
+                                              "", false);
+
+        MemoryBufferRef mbref = buf->getMemBufferRef();
+        auto ModuleOr = parseBitcodeFile(mbref, *TheContext);
+
+        // Error handling inspired by
+        // https://github.com/llvm-mirror/llvm/blob/release_60/lib/Bitcode/Reader/BitReader.cpp#L79
+        if (Error err = ModuleOr.takeError())
+        {
+            reportError(std::move(err));
+            return nullptr;
+        }
+
+        std::unique_ptr<Module> mod (std::move(ModuleOr.get()));
+
+        if(!mod->isMaterialized())
+        {
+            // materializeAll() returns an llvm::Error that must be checked;
+            // dropping it unchecked aborts in LLVM builds with assertions
+            // enabled.
+            if (Error err = mod->materializeAll())
+            {
+                reportError(std::move(err));
+                return nullptr;
+            }
+        }
+
+        return new ModuleRef(mod.release());
+    }
+
+    // Maps an integer optimisation level onto the matching
+    // CodeGenOpt::Level: 1 -> Less, 2 -> Default, 3 -> Aggressive, and any
+    // other value -> None.
+    CodeGenOpt::Level GetCodeGenOptLevel(int OptLevel)
+    {
+        if (OptLevel == 1)
+        {
+            return CodeGenOpt::Less;
+        }
+        if (OptLevel == 2)
+        {
+            return CodeGenOpt::Default;
+        }
+        if (OptLevel == 3)
+        {
+            return CodeGenOpt::Aggressive;
+        }
+        return CodeGenOpt::None;
+    }
+
+    // The following function combines initialisation code from opt and llc
+    // tools as found in the llvm source tree, here:
+    // https://github.com/llvm-mirror/llvm/blob/master/tools/opt/opt.cpp
+    // and here:
+    // https://github.com/llvm-mirror/llvm/blob/master/tools/llc/llc.cpp
+    // One-time library set-up: installs the stack-trace handlers, creates
+    // the library-wide LLVMContext and registers the targets and passes
+    // listed below. Calls made while a context already exists return
+    // immediately.
+    void Initialize()
+    {
+        using namespace llvm;
+
+        if ( TheContext != nullptr )
+        {
+            // Already initialized
+            return;
+        }
+
+        sys::PrintStackTraceOnErrorSignal("librocmlite");
+        EnablePrettyStackTrace();
+
+        // Enable debug stream buffering.
+        EnableDebugBuffering = true;
+
+        // this has thread safety issues, there's no global context anymore
+        // each thread really ought to have its own.
+        LLVMContext * Context = new LLVMContext();
+        TheContext = Context;
+
+        // Initialize targets
+
+        // FROM OPT and LLC
+        InitializeAllTargets();
+        InitializeAllTargetMCs();
+        InitializeAllAsmPrinters();
+        InitializeAllAsmParsers();
+
+        // Initialize passes
+        // FROM OPT
+        PassRegistry &Registry = *PassRegistry::getPassRegistry();
+        initializeCore(Registry);
+        initializeCoroutines(Registry);
+        initializeScalarOpts(Registry);
+        initializeObjCARCOpts(Registry);
+        initializeVectorization(Registry);
+        initializeIPO(Registry);
+        initializeAnalysis(Registry);
+        initializeTransformUtils(Registry);
+        initializeInstCombine(Registry);
+        initializeInstrumentation(Registry);
+        initializeTarget(Registry);
+        // For codegen passes, only passes that do IR to IR transformation are
+        // supported.
+        initializeExpandMemCmpPassPass(Registry);
+        initializeScalarizeMaskedMemIntrinPass(Registry);
+        initializeCodeGenPreparePass(Registry);
+        initializeAtomicExpandPass(Registry);
+        initializeRewriteSymbolsLegacyPassPass(Registry);
+        initializeWinEHPreparePass(Registry);
+        initializeDwarfEHPreparePass(Registry);
+        initializeSafeStackLegacyPassPass(Registry);
+        initializeSjLjEHPreparePass(Registry);
+        initializePreISelIntrinsicLoweringLegacyPassPass(Registry);
+        initializeGlobalMergePass(Registry);
+        initializeIndirectBrExpandPassPass(Registry);
+        initializeInterleavedAccessPass(Registry);
+        initializeEntryExitInstrumenterPass(Registry);
+        initializePostInlineEntryExitInstrumenterPass(Registry);
+        initializeUnreachableBlockElimLegacyPassPass(Registry);
+        initializeExpandReductionsPass(Registry);
+        initializeWriteBitcodePassPass(Registry);
+#ifdef LINK_POLLY_INTO_TOOLS
+        polly::initializePollyPasses(Registry);
+#endif
+
+        // FROM LLC
+        initializeCodeGen(Registry);
+        initializeLoopStrengthReducePass(Registry);
+        initializeLowerIntrinsicsPass(Registry);
+        initializeConstantHoistingLegacyPassPass(Registry);
+        // Initialize debugging passes.
+        initializeScavengerTestPass(Registry);
+
+    }
+
+    /// Release the library-global LLVM state.
+    ///
+    /// Does nothing when no context is currently held. Otherwise the context
+    /// is deleted, the global pointer is cleared and llvm_shutdown() is called.
+    void Finalize()
+    {
+        using namespace llvm;
+
+        // Nothing to tear down (never initialized, or already finalized).
+        if (TheContext == nullptr)
+        {
+            return;
+        }
+
+        // The finalizer runs when the library is potentially being unloaded,
+        // so the context can be destroyed at this point.
+        delete TheContext;
+        TheContext = nullptr;
+        llvm_shutdown();
+    }
+
+
+    // The following section is adapted from opt.cpp from the LLVM source tree.
+    // Original code is here:
+    // https://github.com/llvm-mirror/llvm/blob/master/tools/opt/opt.cpp
+
+    // --- Start OPT section ---
+
+    /// Append pass P to PM, immediately followed by a verifier pass.
+    ///
+    /// The verifier is added unconditionally; no flag guards it here.
+    static inline void addPass(legacy::PassManagerBase &PM, Pass *P)
+    {
+        // Add the pass to the pass manager...
+        PM.add(P);
+
+        // ...and always verify the IR right after it.
+        PM.add(createVerifierPass());
+    }
+
+    /// Fill the function-level (FPM) and module-level (MPM) pass managers with
+    /// the pipeline PassManagerBuilder produces for OptLevel / SizeLevel.
+    ///
+    /// Flags declared outside this function (DisableInline, UnitAtATime,
+    /// DisableLoopVectorization, DisableSLPVectorization) tune the builder, and
+    /// TM, when non-null, is given the chance to adjust it as well.
+    static void AddOptimizationPasses(legacy::PassManagerBase &MPM,
+                                      legacy::FunctionPassManager &FPM,
+                                      TargetMachine *TM,
+                                      unsigned OptLevel, unsigned SizeLevel)
+    {
+        // Verify that the input is correct before any function pass runs.
+        FPM.add(createVerifierPass());
+
+        // Both vectorizers share the same "on by default" condition.
+        const bool VectorizeByDefault = OptLevel > 1 && SizeLevel < 2;
+
+        PassManagerBuilder Builder;
+        Builder.OptLevel = OptLevel;
+        Builder.SizeLevel = SizeLevel;
+        Builder.DisableUnitAtATime = !UnitAtATime;
+        Builder.DisableUnrollLoops = (OptLevel == 0);
+
+        // Inliner: none when disabled, the full inliner above -O1, otherwise
+        // only the always-inline pass.
+        if (!DisableInline)
+        {
+            Builder.Inliner = (OptLevel > 1)
+                ? createFunctionInliningPass(OptLevel, SizeLevel, false)
+                : createAlwaysInlinerLegacyPass();
+        }
+
+        // Loop vectorization: the disable flag is final; otherwise a value
+        // already forced to true is kept and the default applies when it was
+        // not set.
+        if (DisableLoopVectorization)
+        {
+            Builder.LoopVectorize = false;
+        }
+        else
+        {
+            Builder.LoopVectorize = Builder.LoopVectorize || VectorizeByDefault;
+        }
+
+        // SLP vectorization follows the same default unless disabled.
+        Builder.SLPVectorize = !DisableSLPVectorization && VectorizeByDefault;
+
+        if (TM != nullptr)
+        {
+            TM->adjustPassManager(Builder);
+        }
+
+        Builder.populateFunctionPassManager(FPM);
+        Builder.populateModulePassManager(MPM);
+    }
+
+    /// Add the LTO pass pipeline to PM with input verification enabled.
+    ///
+    /// The function inliner is included unless DisableInline is set, and the
+    /// builder's optimisation level is forced to 0 when DisableOptimizations
+    /// is set.
+    static void AddStandardLinkPasses(legacy::PassManagerBase &PM)
+    {
+        PassManagerBuilder LinkBuilder;
+        LinkBuilder.VerifyInput = true;
+
+        if (!DisableInline)
+        {
+            LinkBuilder.Inliner = createFunctionInliningPass();
+        }
+
+        if (DisableOptimizations)
+        {
+            LinkBuilder.OptLevel = 0;
+        }
+
+        LinkBuilder.populateLTOPassManager(PM);
+    }
+
+    /// Look up the target matching MArch / TheTriple and create a
+    /// TargetMachine for it.
+    ///
+    /// Returns nullptr when the registry has no matching target (some modules
+    /// do not specify a triple, and that is acceptable); the lookup error text
+    /// is discarded in that case. The result is a raw pointer that the caller
+    /// is expected to take ownership of.
+    static TargetMachine* GetTargetMachine(Triple TheTriple, StringRef CPUStr,
+                                           StringRef FeaturesStr,
+                                           const TargetOptions &Options,
+                                           int OptLevel
+                                          )
+    {
+        std::string LookupError;
+        const Target *Found =
+            TargetRegistry::lookupTarget(MArch, TheTriple, LookupError);
+
+        if (Found == nullptr)
+        {
+            return nullptr;
+        }
+
+        return Found->createTargetMachine(TheTriple.getTriple(),
+                                          CPUStr, FeaturesStr, Options,
+                                          getRelocModel(), getCodeModel(),
+                                          GetCodeGenOptLevel(OptLevel));
+    }
+
+    /// Run the opt-style optimisation pipeline over M, in place.
+    ///
+    /// \param M          Module to optimise. Its target triple is overwritten
+    ///                   with the normalized form of "amdgcn--amdhsa".
+    /// \param OptLevel   Levels 1-3 run the PassManagerBuilder pipeline; any
+    ///                   positive level also adds the standard link passes.
+    /// \param SizeLevel  Size level forwarded to the pipeline builder
+    ///                   (negative values are treated as 0).
+    /// \param Verify     Not consulted: the module is always verified on
+    ///                   entry and again after the module passes.
+    /// \param Cpu        Target CPU name; a null pointer is treated as "".
+    ///
+    /// If the input module fails verification, an error is written to errs()
+    /// and the process exits with status 1.
+    void Optimize(llvm::Module * M, int OptLevel, int SizeLevel, int Verify,
+            const char * Cpu)
+    {
+        // Accepted for interface compatibility only.
+        (void)Verify;
+
+        // Levels 1-3 select the builder pipeline; every positive level also
+        // enables the standard link passes.
+        const bool RunBuilderPipeline = OptLevel >= 1 && OptLevel <= 3;
+        const bool StandardLinkOpts = OptLevel > 0;
+
+        // Clamp before converting so a negative value cannot wrap around.
+        const unsigned SizeLvl =
+            SizeLevel > 0 ? static_cast<unsigned>(SizeLevel) : 0u;
+
+        // Strip debug info before running the verifier.
+        if (StripDebug)
+            StripDebugInfo(*M);
+
+        // Immediately run the verifier to catch any problems before starting up the
+        // pass pipelines.  Otherwise we can crash on broken code during
+        // doInitialization().
+        if(verifyModule(*M, &errs()))
+        {
+            errs() << "error: input module is broken!\n";
+            exit(1);
+        }
+
+        M->setTargetTriple(Triple::normalize("amdgcn--amdhsa"));
+
+        Triple ModuleTriple(M->getTargetTriple());
+        // Constructing std::string from a null pointer is undefined behaviour.
+        std::string CPUStr(Cpu ? Cpu : "");
+        std::string FeaturesStr;
+        TargetOptions Options;
+
+        // The machine stays null when the triple has no known architecture or
+        // no registered target matches it.
+        std::unique_ptr<TargetMachine> TM;
+        if (ModuleTriple.getArch())
+        {
+            TM.reset(GetTargetMachine(ModuleTriple, CPUStr, FeaturesStr,
+                                      Options, OptLevel));
+        }
+
+        // Override function attributes based on CPUStr, FeaturesStr, and command line
+        // flags.
+        setFunctionAttributes(CPUStr, FeaturesStr, *M);
+
+        // Create a PassManager to hold and optimize the collection of passes we are
+        // about to build.
+        legacy::PassManager Passes;
+
+        // Add an appropriate TargetLibraryInfo pass for the module's triple.
+        TargetLibraryInfoImpl TLII(ModuleTriple);
+
+        // Switch off libcall simplification: transforming loops into
+        // system calls is not supported.
+        TLII.disableAllFunctions();
+        Passes.add(new TargetLibraryInfoWrapperPass(TLII));
+
+        // No explicit layout is supplied here: a module that still has the
+        // default layout is given the empty layout string.
+        const DataLayout &DL = M->getDataLayout();
+        if (DL.isDefault())
+        {
+            M->setDataLayout("");
+        }
+
+        // Add internal analysis passes from the target machine.
+        Passes.add(createTargetTransformInfoWrapperPass(TM ? TM->getTargetIRAnalysis()
+                   : TargetIRAnalysis()));
+
+        std::unique_ptr<legacy::FunctionPassManager> FPasses;
+        if (RunBuilderPipeline)
+        {
+            FPasses.reset(new legacy::FunctionPassManager(M));
+            FPasses->add(createTargetTransformInfoWrapperPass(
+                             TM ? TM->getTargetIRAnalysis() : TargetIRAnalysis()));
+        }
+
+        if (StandardLinkOpts)
+            AddStandardLinkPasses(Passes);
+
+        // Apply optimisation passes for the requested speed and size levels.
+        if (RunBuilderPipeline)
+        {
+            AddOptimizationPasses(Passes, *FPasses, TM.get(),
+                                  static_cast<unsigned>(OptLevel), SizeLvl);
+        }
+
+        // Function passes run first, over every function in the module.
+        if (FPasses)
+        {
+            FPasses->doInitialization();
+            for (Function &F : *M)
+                FPasses->run(F);
+            FPasses->doFinalization();
+        }
+
+        // Check that the module is well formed on completion of optimization
+        Passes.add(createVerifierPass());
+
+        // Now that we have all of the passes ready, run them.
+        Passes.run(*M);
+    }
+
+
+    // --- END OPT section ---
+
+
+
+    // The following section is adapted from llc.cpp from the LLVM source tree.
+    // Original code is here:
+    // https://github.com/llvm-mirror/llvm/blob/master/tools/llc/llc.cpp
+
+    // --- START LLC section ---
+
+    /// Compile mod for the "amdgcn--amdhsa" triple and write the result to os.
+    ///
+    /// \param mod       Module to compile (ownership is taken). Its data
+    ///                  layout is replaced by the target machine's layout.
+    /// \param os        Stream that receives the emitted output.
+    /// \param emitBRIG  true selects object-file output, false assembly text.
+    /// \param OptLevel  0/1/2/3 map to None/Less/Default/Aggressive code
+    ///                  generation; any other value uses Default.
+    /// \param Cpu       Target CPU name; a null pointer is treated as "".
+    /// \return 0 on success, 1 on any failure (callers treat non-zero as an
+    ///         error).
+    int CompileModule(std::unique_ptr<Module> mod, raw_string_ostream &os, bool emitBRIG,
+                      int OptLevel, const char * Cpu)
+    {
+        if (!mod)
+        {
+            errs() << "no module to compile!\n";
+            return 1;
+        }
+
+        // The module's own triple is ignored; the AMDGCN HSA triple is forced.
+        Triple TheTriple = Triple(Triple::normalize("amdgcn--amdhsa"));
+
+        // Get the target specific parser.
+        std::string Error;
+        const Target *TheTarget = TargetRegistry::lookupTarget(MArch, TheTriple,
+                                  Error);
+
+        if (!TheTarget)
+        {
+            // Must be non-zero: callers interpret 0 as success.
+            errs() << Error;
+            return 1;
+        }
+
+        // Package up features to be passed to target/subtarget
+        // (constructing std::string from a null pointer is undefined).
+        std::string CPUStr(Cpu ? Cpu : "");
+        std::string FeaturesStr = "+promote-alloca,+fp64-denormals,+flat-for-global,";
+
+        CodeGenOpt::Level OLvl = CodeGenOpt::Default;
+        switch (OptLevel)
+        {
+            case 0:
+                OLvl = CodeGenOpt::None;
+                break;
+            case 1:
+                OLvl = CodeGenOpt::Less;
+                break;
+            case 2:
+                OLvl = CodeGenOpt::Default;
+                break;
+            case 3:
+                OLvl = CodeGenOpt::Aggressive;
+                break;
+        }
+
+        TargetOptions Options = InitTargetOptionsFromCodeGenFlags();
+        Options.MCOptions.AsmVerbose = true;
+
+        std::unique_ptr<TargetMachine> Target(
+            TheTarget->createTargetMachine(TheTriple.getTriple(), CPUStr, FeaturesStr,
+                                           Options, getRelocModel(), getCodeModel(),
+                                           OLvl));
+
+        // Checked at run time: an assert would vanish in release builds and
+        // the null machine would be dereferenced below.
+        if (!Target)
+        {
+            errs() << "Could not allocate target machine!\n";
+            return 1;
+        }
+
+        // NOTE(review): Options was already handed to createTargetMachine
+        // above, so this assignment may not influence code generation - confirm.
+        if (FloatABIForCalls != FloatABI::Default)
+            Options.FloatABIType = FloatABIForCalls;
+
+        // Build up all of the passes that we want to do to the module.
+        legacy::PassManager PM;
+
+        // Add an appropriate TargetLibraryInfo pass for the module's triple.
+        TargetLibraryInfoImpl TLII(TheTriple);
+        PM.add(new TargetLibraryInfoWrapperPass(TLII));
+
+        // Use the data layout of the target machine for the module.
+        mod->setDataLayout(Target->createDataLayout());
+
+        setFunctionAttributes(CPUStr, FeaturesStr, *mod);
+
+        auto FileType = (emitBRIG
+                         ? TargetMachine::CGFT_ObjectFile
+                         : TargetMachine::CGFT_AssemblyFile);
+
+        {
+            // Scoped so the buffered stream is destroyed, and with it flushed
+            // into os, before returning.
+            buffer_ostream BOS(os);
+
+            // Ask the target to add backend passes as necessary.
+            // NOTE(review): confirm the meaning of this positional flag against
+            // the TargetMachine::addPassesToEmitFile signature of the LLVM
+            // version in use (it may be a "disable verify" switch).
+            bool Verify = true;
+            if (Target->addPassesToEmitFile(PM, BOS, FileType, Verify))
+            {
+                errs() << "target does not support generation of this"
+                       << " file type!\n";
+                return 1;
+            }
+            PM.run(*mod);
+        }
+        return 0;
+    }
+
+    // --- END LLC section ---
+
+} // end librocmlite namespace
+
+extern "C" {
+
+    using namespace librocmlite;
+
+    typedef struct OpaqueModule* llvm_module_ptr;
+
+    /// C entry point that forwards to Initialize().
+    void ROC_Initialize()
+    {
+        Initialize();
+    }
+
+    /// C entry point that forwards to Finalize().
+    void ROC_Finalize()
+    {
+        Finalize();
+    }
+
+
+    /// Return a heap-allocated copy of str.
+    ///
+    /// Returns NULL when str is NULL or when strdup() fails. The copy is
+    /// allocated with strdup(), so release it with ROC_DisposeString().
+    char* ROC_CreateString(const char *str)
+    {
+        // strdup() requires a valid string; reject NULL explicitly.
+        if (str == nullptr)
+        {
+            return nullptr;
+        }
+        return strdup(str);
+    }
+
+    /// Release a string with free(); intended for buffers handed out by this
+    /// library (ROC_CreateString uses strdup, ROC_ModuleEmitBRIG uses malloc).
+    void ROC_DisposeString(char *str)
+    {
+        free(str);
+    }
+
+    /// C entry point that forwards Asm to ModuleRef::parseAssembly() and
+    /// returns its result unchanged.
+    ModuleRef* ROC_ParseModule(const char *Asm)
+    {
+        return ModuleRef::parseAssembly(Asm);
+    }
+
+    /// C entry point that forwards the Len-byte buffer at Asm to
+    /// ModuleRef::parseBitcode() and returns its result unchanged.
+    ModuleRef* ROC_ParseBitcode(const char *Asm, size_t Len)
+    {
+        return ModuleRef::parseBitcode(Asm, Len);
+    }
+
+    /// Store a newly allocated copy of M's string form in *output.
+    ///
+    /// The copy comes from ROC_CreateString().
+    void ROC_ModulePrint(ModuleRef *M, char **output)
+    {
+        const auto rendered = M->to_string();
+        *output = ROC_CreateString(rendered.c_str());
+    }
+
+    /// Call M->destroy() and then delete the ModuleRef object itself.
+    ///
+    /// M is dereferenced unconditionally, so it must not be NULL, and it must
+    /// not be used after this call.
+    void ROC_ModuleDestroy(ModuleRef *M)
+    {
+        M->destroy();
+        delete M;
+    }
+
+    /// Optimise the module held by M in place.
+    ///
+    /// Returns 0 without touching the module when OptLevel is outside 0-3 or
+    /// SizeLevel is outside 0-2; otherwise calls Optimize() and returns 1.
+    int ROC_ModuleOptimize(ModuleRef *M, int OptLevel, int SizeLevel,
+            int Verify, const char * Cpu)
+    {
+        const bool optLevelOk = (OptLevel >= 0 && OptLevel <= 3);
+        const bool sizeLevelOk = (SizeLevel >= 0 && SizeLevel <= 2);
+        if (!optLevelOk || !sizeLevelOk)
+        {
+            return 0;
+        }
+
+        Optimize(M->getModule(), OptLevel, SizeLevel, Verify, Cpu);
+        return 1;
+    }
+
+
+    /// Link a copy of Src into Dst; Src itself is left untouched.
+    ///
+    /// Both modules are verified first, and only then is Src cloned, so no
+    /// clone is made of a module that is going to be rejected anyway.
+    ///
+    /// \return 1 on success; 0 when either module fails verification or the
+    ///         linker reports an error.
+    int ROC_ModuleLinkIn(ModuleRef * Dst, ModuleRef * Src)
+    {
+        Module * dstModule = Dst->getModule();
+        const Module * srcModule = Src->getModule();
+
+        // verifyModule() returns true when the module is broken.
+        if(llvm::verifyModule(*dstModule, nullptr))
+        {
+            return 0;
+        }
+        if(llvm::verifyModule(*srcModule, nullptr))
+        {
+            return 0;
+        }
+
+        // The linker consumes its source module, so hand it a clone.
+        std::unique_ptr<Module> srcCopy = llvm::CloneModule (*srcModule);
+
+        // linkModules() returns true on error.
+        const bool failed =
+            llvm::Linker::linkModules(*dstModule, std::move(srcCopy), 0);
+        return failed ? 0 : 1;
+    }
+
+
+    /// Compile a clone of M to assembly text and store a newly allocated,
+    /// NUL-terminated copy of it in *output.
+    ///
+    /// \return 1 on success; 0 when OptLevel is outside 0-3, compilation
+    ///         fails, or the output string could not be allocated. *output is
+    ///         left untouched on the first two failures.
+    int ROC_ModuleEmitHSAIL(ModuleRef *M, int OptLevel, const char * Cpu,
+            char **output)
+    {
+        // Validate before cloning so a bad level costs nothing.
+        if (OptLevel < 0 || OptLevel > 3) return 0;
+
+        const Module * ref = M->getModule();
+        std::unique_ptr<Module> sM = llvm::CloneModule (*ref);
+
+        // Compile
+        std::string buf;
+        raw_string_ostream os(buf);
+        int status = CompileModule(std::move(sM), os, false, OptLevel, Cpu);
+        if(status) return 0;
+
+        // Write output
+        os.flush();
+        *output = ROC_CreateString(buf.c_str());
+        return (*output != nullptr) ? 1 : 0;
+    }
+
+    /// Compile a clone of M to an object image and store a malloc()ed copy of
+    /// the bytes in *output (not NUL-terminated).
+    ///
+    /// \return the number of bytes stored, or 0 when OptLevel is outside 0-3,
+    ///         compilation fails, nothing was emitted, or the buffer could not
+    ///         be allocated. *output is not written when 0 is returned.
+    size_t ROC_ModuleEmitBRIG(ModuleRef *M, int OptLevel, const char * Cpu,
+            char **output)
+    {
+        // Validate before cloning so a bad level costs nothing.
+        if (OptLevel < 0 || OptLevel > 3) return 0;
+
+        const Module * ref = M->getModule();
+        std::unique_ptr<Module> sM = llvm::CloneModule (*ref);
+
+        // Compile
+        std::string buf;
+        raw_string_ostream os(buf);
+        int status  = CompileModule(std::move(sM), os, true, OptLevel, Cpu);
+        if(status) return 0;
+
+        // Write output
+        os.flush();
+        const size_t size = buf.size();
+        if (size == 0) return 0;
+
+        char * image = (char*)malloc(size);
+        if (image == nullptr)
+        {
+            // Do not memcpy into a failed allocation.
+            return 0;
+        }
+        memcpy(image, buf.data(), size);
+        *output = image;
+        return size;
+    }
+
+    /// Forward argc/argv to llvm::cl::ParseCommandLineOptions(), using
+    /// "Does things" as the overview text.
+    void ROC_SetCommandLineOption(int argc, const char * const * argv)
+    {
+        llvm::cl::ParseCommandLineOptions(argc, argv, "Does things");
+    }
+
+} // end extern "C"
--- a/rocmlite/test/CMakeLists.txt
+++ b/rocmlite/test/CMakeLists.txt
+#
+# Copyright (c) 2016 , Continuum Analytics, Inc.
+# All rights reserved.
+#
+
+
+# Look for valgrind; it is optional and only enables the extra
+# "<test>_with_valgrind" test runs registered by add_gtest.
+find_program(VALGRIND valgrind)
+if(VALGRIND)
+    message("Found valgrind. ${VALGRIND}")
+else()
+    message("Did not find valgrind.")
+endif()
+
+# add_gtest(<name>)
+#
+# Build <name>.cpp into a gtest executable called <name> and register it as a
+# CTest test of the same name. When valgrind was found, a second test named
+# <name>_with_valgrind runs the same executable under valgrind.
+macro(add_gtest TESTNAME)
+    # link_directories() only affects targets created after it is called, so
+    # it has to come before add_executable().
+    link_directories(${librocmlite_BINARY_DIR})
+
+    add_executable(${TESTNAME} ${TESTNAME}.cpp)
+
+    # include include/
+    target_include_directories(${TESTNAME} PUBLIC ${CMAKE_SOURCE_DIR}/include ${BIN_INCLUDE_DIR})
+
+    # set linkage
+    target_link_libraries(${TESTNAME}
+                         rocmlite
+                         gtest
+                         gtest_main
+                         pthread
+                         )
+
+    # With the NAME/COMMAND signature CMake replaces an executable target name
+    # with the path of the built binary, so the deprecated LOCATION target
+    # property is not needed.
+    add_test(NAME ${TESTNAME} COMMAND ${TESTNAME})
+    if(VALGRIND)
+        # Use the valgrind that find_program located, not whatever happens to
+        # be first on PATH when the tests run.
+        add_test(NAME ${TESTNAME}_with_valgrind
+            COMMAND ${VALGRIND} --error-exitcode=1 --leak-check=full --show-leak-kinds=all
+            --suppressions=test_suppressions.supp
+            $<TARGET_FILE:${TESTNAME}>)
+    endif()
+endmacro()
+
+# Test programs to build; each name must have a matching <name>.cpp file.
+set(TESTS
+    test_rocmlite
+    test_rocmlite_functions
+    )
+
+# Register every listed test (plus its valgrind variant when available).
+foreach(TEST IN LISTS TESTS)
+    add_gtest(${TEST})
+endforeach()
+
+# Copy the contents of ${BITCODE_ROOT} next to the test binaries on every
+# build. ${CMAKE_COMMAND} is the cmake that configured this tree, so the copy
+# does not depend on a "cmake" being on PATH at build time.
+add_custom_target(COPY_IN_COMPILATION_RESOURCES ALL
+                 COMMAND ${CMAKE_COMMAND} -E copy_directory
+                 ${BITCODE_ROOT}
+                 ${CMAKE_BINARY_DIR}/rocmlite/test/
+                 DEPENDS ${gtest}
+                 VERBATIM)
+
+# Copy the test resource files next to the test binaries on every build.
+# ${CMAKE_COMMAND} is the cmake that configured this tree, so the copy does
+# not depend on a "cmake" being on PATH at build time.
+add_custom_target(COPY_IN_TEST_RESOURCES ALL
+                 COMMAND ${CMAKE_COMMAND} -E copy_directory
+                 ${CMAKE_SOURCE_DIR}/rocmlite/test/resources
+                 ${CMAKE_BINARY_DIR}/rocmlite/test/
+                 DEPENDS ${gtest}
+                 VERBATIM)
+
--- a/rocmlite/test/resources/demo_ir.ll
+++ b/rocmlite/test/resources/demo_ir.ll
+; ModuleID = 'copy_kernel_1d'
+; Test fixture module. The target triple below is the same string that the
+; library's Optimize() and CompileModule() normalize and apply to the modules
+; they process.
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n32"
+target triple = "amdgcn--amdhsa"
+
+define internal spir_func i32 @hsapy_devfn__5F__5F_main_5F__5F__2E_copy_5F_kernel_5F_1d_24_1_2E_array_28_float32_2C__20_1d_2C__20_C_29__2E_array_28_float32_2C__20_1d_2C__20_C_29_(i8** %.ret, i8* %arg.out.0, i8* %arg.out.1, i64 %arg.out.2, i64 %arg.out.3, float addrspace(4)* %arg.out.4, i64 %arg.out.5.0, i64 %arg.out.6.0, i8* %arg.inp.0, i8* %arg.inp.1, i64 %arg.inp.2, i64 %arg.inp.3, float addrspace(4)* %arg.inp.4, i64 %arg.inp.5.0, i64 %arg.inp.6.0) {
+entry:
+  %inserted.meminfo = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } undef, i8* %arg.out.0, 0
+  %inserted.parent = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.meminfo, i8* %arg.out.1, 1
+  %inserted.nitems = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.parent, i64 %arg.out.2, 2
+  %inserted.itemsize = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.nitems, i64 %arg.out.3, 3
+  %inserted.data = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.itemsize, float addrspace(4)* %arg.out.4, 4
+  %.17 = insertvalue [1 x i64] undef, i64 %arg.out.5.0, 0
+  %inserted.shape = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.data, [1 x i64] %.17, 5
+  %.18 = insertvalue [1 x i64] undef, i64 %arg.out.6.0, 0
+  %inserted.strides = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.shape, [1 x i64] %.18, 6
+  %inserted.meminfo.1 = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } undef, i8* %arg.inp.0, 0
+  %inserted.parent.1 = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.meminfo.1, i8* %arg.inp.1, 1
+  %inserted.nitems.1 = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.parent.1, i64 %arg.inp.2, 2
+  %inserted.itemsize.1 = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.nitems.1, i64 %arg.inp.3, 3
+  %inserted.data.1 = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.itemsize.1, float addrspace(4)* %arg.inp.4, 4
+  %.19 = insertvalue [1 x i64] undef, i64 %arg.inp.5.0, 0
+  %inserted.shape.1 = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.data.1, [1 x i64] %.19, 5
+  %.20 = insertvalue [1 x i64] undef, i64 %arg.inp.6.0, 0
+  %inserted.strides.1 = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.shape.1, [1 x i64] %.20, 6
+  %out = alloca { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }
+  store { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } zeroinitializer, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %out
+  %inp = alloca { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }
+  store { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } zeroinitializer, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %inp
+  %"$0.1" = alloca i8*
+  store i8* null, i8** %"$0.1"
+  %"$0.2" = alloca i8*
+  store i8* null, i8** %"$0.2"
+  %"$const0.3" = alloca i64
+  store i64 0, i64* %"$const0.3"
+  %"$0.4" = alloca i64
+  store i64 0, i64* %"$0.4"
+  %i = alloca i64
+  store i64 0, i64* %i
+  %.56 = alloca { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }
+  store { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } zeroinitializer, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.56
+  %"$0.7" = alloca i64
+  store i64 0, i64* %"$0.7"
+  %"$0.8" = alloca i1
+  store i1 false, i1* %"$0.8"
+  %.78 = alloca { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }
+  store { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } zeroinitializer, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.78
+  %"$30.3" = alloca float
+  store float 0.000000e+00, float* %"$30.3"
+  %.114 = alloca { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }
+  store { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } zeroinitializer, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.114
+  %"$const47.1" = alloca i8*
+  store i8* null, i8** %"$const47.1"
+  %"$47.2" = alloca i8*
+  store i8* null, i8** %"$47.2"
+  br label %B0
+
+B0:                                               ; preds = %entry
+  %.22 = load { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %out
+  store { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %out
+  %.25 = load { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %inp
+  store { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides.1, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %inp
+  %.28 = load i8*, i8** %"$0.1"
+  store i8* null, i8** %"$0.1"
+  %.30 = load i8*, i8** %"$0.1"
+  %.32 = load i8*, i8** %"$0.2"
+  store i8* null, i8** %"$0.2"
+  %.34 = load i8*, i8** %"$0.1"
+  store i8* null, i8** %"$0.1"
+  %.37 = load i64, i64* %"$const0.3"
+  store i64 0, i64* %"$const0.3"
+  %.39 = load i64, i64* %"$const0.3"
+  %.40 = trunc i64 %.39 to i32
+  %.41 = call spir_func i64 @_Z13get_global_idj(i32 %.40)
+  %.43 = load i64, i64* %"$0.4"
+  store i64 %.41, i64* %"$0.4"
+  %.45 = load i64, i64* %"$const0.3"
+  store i64 0, i64* %"$const0.3"
+  %.47 = load i8*, i8** %"$0.2"
+  store i8* null, i8** %"$0.2"
+  %.49 = load i64, i64* %"$0.4"
+  %.51 = load i64, i64* %i
+  store i64 %.49, i64* %i
+  %.53 = load i64, i64* %"$0.4"
+  store i64 0, i64* %"$0.4"
+  %.55 = load { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %out
+  store { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %.55, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.56
+  %.59 = getelementptr inbounds { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.56, i32 0, i32 2
+  %.60 = load i64, i64* %.59
+  %.62 = load i64, i64* %"$0.7"
+  store i64 %.60, i64* %"$0.7"
+  %.64 = load i64, i64* %i
+  %.65 = load i64, i64* %"$0.7"
+  %.66 = icmp slt i64 %.64, %.65
+  %.68 = load i1, i1* %"$0.8"
+  store i1 %.66, i1* %"$0.8"
+  %.70 = load i64, i64* %"$0.7"
+  store i64 0, i64* %"$0.7"
+  %.72 = load i1, i1* %"$0.8"
+  br i1 %.72, label %B30, label %B47
+
+B30:                                              ; preds = %B0
+  %.74 = load i1, i1* %"$0.8"
+  store i1 false, i1* %"$0.8"
+  %.76 = load { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %inp
+  %.77 = load i64, i64* %i
+  store { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %.76, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.78
+  %.81 = getelementptr inbounds { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.78, i32 0, i32 5
+  %.82 = getelementptr inbounds [1 x i64], [1 x i64]* %.81, i32 0, i32 0
+  %.83 = load i64, i64* %.82, !range !7
+  %.84 = insertvalue [1 x i64] undef, i64 %.83, 0
+  %.85 = extractvalue [1 x i64] %.84, 0
+  %.86 = getelementptr inbounds { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.78, i32 0, i32 6
+  %.87 = load [1 x i64], [1 x i64]* %.86
+  %.88 = extractvalue [1 x i64] %.87, 0
+  %.89 = icmp slt i64 %.77, 0
+  %.90 = add i64 %.77, %.85
+  %.91 = select i1 %.89, i64 %.90, i64 %.77
+  %.92 = getelementptr inbounds { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.78, i32 0, i32 5
+  %.93 = getelementptr inbounds [1 x i64], [1 x i64]* %.92, i32 0, i32 0
+  %.94 = load i64, i64* %.93, !range !7
+  %.95 = insertvalue [1 x i64] undef, i64 %.94, 0
+  %.96 = extractvalue [1 x i64] %.95, 0
+  %.97 = getelementptr inbounds { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.78, i32 0, i32 6
+  %.98 = load [1 x i64], [1 x i64]* %.97
+  %.99 = extractvalue [1 x i64] %.98, 0
+  %.100 = getelementptr inbounds { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.78, i32 0, i32 4
+  %.101 = load float addrspace(4)*, float addrspace(4)** %.100
+  %.102 = mul i64 %.91, 1
+  %.103 = add i64 0, %.102
+  %.104 = getelementptr float, float addrspace(4)* %.101, i64 %.103
+  %.105 = load float, float addrspace(4)* %.104
+  %.107 = load float, float* %"$30.3"
+  store float %.105, float* %"$30.3"
+  %.109 = load { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %inp
+  store { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } zeroinitializer, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %inp
+  %.111 = load { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %out
+  %.112 = load float, float* %"$30.3"
+  %.113 = load i64, i64* %i
+  store { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %.111, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.114
+  %.117 = getelementptr inbounds { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.114, i32 0, i32 5
+  %.118 = getelementptr inbounds [1 x i64], [1 x i64]* %.117, i32 0, i32 0
+  %.119 = load i64, i64* %.118, !range !7
+  %.120 = insertvalue [1 x i64] undef, i64 %.119, 0
+  %.121 = extractvalue [1 x i64] %.120, 0
+  %.122 = getelementptr inbounds { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.114, i32 0, i32 6
+  %.123 = load [1 x i64], [1 x i64]* %.122
+  %.124 = extractvalue [1 x i64] %.123, 0
+  %.125 = icmp slt i64 %.113, 0
+  %.126 = add i64 %.113, %.121
+  %.127 = select i1 %.125, i64 %.126, i64 %.113
+  %.128 = getelementptr inbounds { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.114, i32 0, i32 5
+  %.129 = getelementptr inbounds [1 x i64], [1 x i64]* %.128, i32 0, i32 0
+  %.130 = load i64, i64* %.129, !range !7
+  %.131 = insertvalue [1 x i64] undef, i64 %.130, 0
+  %.132 = extractvalue [1 x i64] %.131, 0
+  %.133 = getelementptr inbounds { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.114, i32 0, i32 6
+  %.134 = load [1 x i64], [1 x i64]* %.133
+  %.135 = extractvalue [1 x i64] %.134, 0
+  %.136 = getelementptr inbounds { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.114, i32 0, i32 4
+  %.137 = load float addrspace(4)*, float addrspace(4)** %.136
+  %.138 = mul i64 %.127, 1
+  %.139 = add i64 0, %.138
+  %.140 = getelementptr float, float addrspace(4)* %.137, i64 %.139
+  store float %.112, float addrspace(4)* %.140
+  %.142 = load { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %out
+  store { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } zeroinitializer, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %out
+  %.144 = load i64, i64* %i
+  store i64 0, i64* %i
+  %.146 = load float, float* %"$30.3"
+  store float 0.000000e+00, float* %"$30.3"
+  br label %B47
+
+B47:                                              ; preds = %B30, %B0
+  %.149 = load { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %out
+  store { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } zeroinitializer, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %out
+  %.151 = load { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %inp
+  store { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } zeroinitializer, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %inp
+  %.153 = load i64, i64* %i
+  store i64 0, i64* %i
+  %.155 = load i1, i1* %"$0.8"
+  store i1 false, i1* %"$0.8"
+  %.158 = load i8*, i8** %"$const47.1"
+  store i8* null, i8** %"$const47.1"
+  %.160 = load i8*, i8** %"$const47.1"
+  %.162 = load i8*, i8** %"$47.2"
+  store i8* %.160, i8** %"$47.2"
+  %.164 = load i8*, i8** %"$const47.1"
+  store i8* null, i8** %"$const47.1"
+  %.166 = load i8*, i8** %"$47.2"
+  store i8* %.166, i8** %.ret
+  ret i32 0
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; Kernel entry point that forwards to the hsapy_devfn_..._copy_kernel_1d device
+; function. Arguments %.1-%.7 and %.8-%.14 are two flattened array records, each
+; made of (meminfo, parent, nitems, itemsize, data, shape[0], strides[0]); all
+; pointer arguments arrive in addrspace(1).
+define spir_kernel void @hsaPy_hsapy_devfn__5F__5F_main_5F__5F__2E_copy_5F_kernel_5F_1d_24_1_2E_array_28_float32_2C__20_1d_2C__20_C_29__2E_array_28_float32_2C__20_1d_2C__20_C_29_(i8 addrspace(1)* %.1, i8 addrspace(1)* %.2, i64 %.3, i64 %.4, float addrspace(1)* %.5, i64 %.6, i64 %.7, i8 addrspace(1)* %.8, i8 addrspace(1)* %.9, i64 %.10, i64 %.11, float addrspace(1)* %.12, i64 %.13, i64 %.14) {
+.16:
+  ; Cast the addrspace(1) pointer arguments: i8 pointers to the default address
+  ; space, float data pointers to addrspace(4).
+  %.17 = addrspacecast i8 addrspace(1)* %.1 to i8*
+  %.18 = addrspacecast i8 addrspace(1)* %.2 to i8*
+  %.19 = addrspacecast float addrspace(1)* %.5 to float addrspace(4)*
+  %.20 = addrspacecast i8 addrspace(1)* %.8 to i8*
+  %.21 = addrspacecast i8 addrspace(1)* %.9 to i8*
+  %.22 = addrspacecast float addrspace(1)* %.12 to float addrspace(4)*
+  ; Pack arguments %.1-%.7 into the first array struct, field by field.
+  %inserted.meminfo = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } undef, i8* %.17, 0
+  %inserted.parent = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.meminfo, i8* %.18, 1
+  %inserted.nitems = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.parent, i64 %.3, 2
+  %inserted.itemsize = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.nitems, i64 %.4, 3
+  %inserted.data = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.itemsize, float addrspace(4)* %.19, 4
+  %.23 = insertvalue [1 x i64] undef, i64 %.6, 0
+  %inserted.shape = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.data, [1 x i64] %.23, 5
+  %.24 = insertvalue [1 x i64] undef, i64 %.7, 0
+  %inserted.strides = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.shape, [1 x i64] %.24, 6
+  ; Pack arguments %.8-%.14 into the second array struct the same way.
+  %inserted.meminfo.1 = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } undef, i8* %.20, 0
+  %inserted.parent.1 = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.meminfo.1, i8* %.21, 1
+  %inserted.nitems.1 = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.parent.1, i64 %.10, 2
+  %inserted.itemsize.1 = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.nitems.1, i64 %.11, 3
+  %inserted.data.1 = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.itemsize.1, float addrspace(4)* %.22, 4
+  %.25 = insertvalue [1 x i64] undef, i64 %.13, 0
+  %inserted.shape.1 = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.data.1, [1 x i64] %.25, 5
+  %.26 = insertvalue [1 x i64] undef, i64 %.14, 0
+  %inserted.strides.1 = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.shape.1, [1 x i64] %.26, 6
+  ; Stack slot, initialised to null, that is handed to the callee as its first
+  ; (i8**) argument.
+  %.27 = alloca i8*
+  store i8* null, i8** %.27
+  ; Unpack the first struct back into scalars for the call.
+  %extracted.meminfo = extractvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides, 0
+  %extracted.parent = extractvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides, 1
+  %extracted.nitems = extractvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides, 2
+  %extracted.itemsize = extractvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides, 3
+  %extracted.data = extractvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides, 4
+  %extracted.shape = extractvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides, 5
+  %.29 = extractvalue [1 x i64] %extracted.shape, 0
+  %extracted.strides = extractvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides, 6
+  %.30 = extractvalue [1 x i64] %extracted.strides, 0
+  ; Unpack the second struct back into scalars for the call.
+  %extracted.meminfo.1 = extractvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides.1, 0
+  %extracted.parent.1 = extractvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides.1, 1
+  %extracted.nitems.1 = extractvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides.1, 2
+  %extracted.itemsize.1 = extractvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides.1, 3
+  %extracted.data.1 = extractvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides.1, 4
+  %extracted.shape.1 = extractvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides.1, 5
+  %.31 = extractvalue [1 x i64] %extracted.shape.1, 0
+  %extracted.strides.1 = extractvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides.1, 6
+  %.32 = extractvalue [1 x i64] %extracted.strides.1, 0
+  ; Call the device function with the null-initialised slot plus both unpacked
+  ; array records; it yields an i32 result.
+  %.33 = call spir_func i32 @hsapy_devfn__5F__5F_main_5F__5F__2E_copy_5F_kernel_5F_1d_24_1_2E_array_28_float32_2C__20_1d_2C__20_C_29__2E_array_28_float32_2C__20_1d_2C__20_C_29_(i8** %.27, i8* %extracted.meminfo, i8* %extracted.parent, i64 %extracted.nitems, i64 %extracted.itemsize, float addrspace(4)* %extracted.data, i64 %.29, i64 %.30, i8* %extracted.meminfo.1, i8* %extracted.parent.1, i64 %extracted.nitems.1, i64 %extracted.itemsize.1, float addrspace(4)* %extracted.data.1, i64 %.31, i64 %.32)
+  ; The comparisons of the result against 0, -2, -1, -3 and >= 1, and the final
+  ; load of the slot, are computed but never used: the kernel returns void
+  ; regardless of the callee's result.
+  %.34 = icmp eq i32 %.33, 0
+  %.35 = icmp eq i32 %.33, -2
+  %.36 = or i1 %.34, %.35
+  %.37 = xor i1 %.36, true
+  %.38 = icmp eq i32 %.33, -1
+  %.39 = icmp eq i32 %.33, -3
+  %.40 = icmp sge i32 %.33, 1
+  %.41 = load i8*, i8** %.27
+  ret void
+}
+
+!opencl.kernels = !{!0}
+!opencl.ocl.version = !{!6}
+!opencl.spir.version = !{!6}
+
+!0 = !{void (i8 addrspace(1)*, i8 addrspace(1)*, i64, i64, float addrspace(1)*, i64, i64, i8 addrspace(1)*, i8 addrspace(1)*, i64, i64, float addrspace(1)*, i64, i64)* @hsaPy_hsapy_devfn__5F__5F_main_5F__5F__2E_copy_5F_kernel_5F_1d_24_1_2E_array_28_float32_2C__20_1d_2C__20_C_29__2E_array_28_float32_2C__20_1d_2C__20_C_29_, !1, !2, !3, !4, !5}
+!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 1, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0}
+!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none"}
+!3 = !{!"kernel_arg_type", !"i8 addrspace(1)* ", !"i8 addrspace(1)* ", !"i64", !"i64", !"float addrspace(1)* ", !"i64", !"i64", !"i8 addrspace(1)* ", !"i8 addrspace(1)* ", !"i64", !"i64", !"float addrspace(1)* ", !"i64", !"i64"}
+!4 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !"", !"", !"", !"", !"", !"", !"", !"", !"", !""}
+!5 = !{!"kernel_arg_base_type", !"i8 addrspace(1)* ", !"i8 addrspace(1)* ", !"i64", !"i64", !"float addrspace(1)* ", !"i64", !"i64", !"i8 addrspace(1)* ", !"i8 addrspace(1)* ", !"i64", !"i64", !"float addrspace(1)* ", !"i64", !"i64"}
+!6 = !{i32 2, i32 0}
+!7 = !{i64 0, i64 9223372036854775807}
--- a/rocmlite/test/resources/test_suppressions.supp
+++ b/rocmlite/test/resources/test_suppressions.supp
+# Suppress a definite Memcheck leak whose allocation stack is
+#   malloc <- CreateSigAltStack() (file-static) <- RegisterHandlers() (file-static)
+#   <- llvm::sys::AddSignalHandler(void (*)(void*), void*)
+#   <- llvm::sys::PrintStackTraceOnErrorSignal(bool)
+{
+   <Suppress_leak_in_stack_trace_handler>
+   Memcheck:Leak
+   match-leak-kinds: definite
+   fun:malloc
+   fun:_ZL17CreateSigAltStackv
+   fun:_ZL16RegisterHandlersv
+   fun:_ZN4llvm3sys16AddSignalHandlerEPFvPvES1_
+   fun:_ZN4llvm3sys28PrintStackTraceOnErrorSignalEb
+}
+
+# Variant of the stack-trace-handler suppression in which the frame above malloc
+# is reported by its plain name (CreateSigAltStack) rather than its mangled
+# name; the match stops at llvm::sys::AddSignalHandler.
+{
+   <Suppress_leak_in_stack_trace_handler_with_debug_info>
+   Memcheck:Leak
+   match-leak-kinds: definite
+   fun:malloc
+   fun:CreateSigAltStack
+   fun:_ZL16RegisterHandlersv
+   fun:_ZN4llvm3sys16AddSignalHandlerEPFvPvES1_
+}
+
+# Broadest variant: any definite leak where malloc is called directly from the
+# file-static RegisterHandlers(), with no CreateSigAltStack frame in between.
+{
+   <Suppress_leak_in_stack_trace_in_docker>
+   Memcheck:Leak
+   match-leak-kinds: definite
+   fun:malloc
+   fun:_ZL16RegisterHandlersv
+}
+
--- a/rocmlite/test/test_rocmlite.cpp
+++ b/rocmlite/test/test_rocmlite.cpp
+#include "rocmlite.hh"
+#include "gtest/gtest.h"
+
+#define TEST_BASE C_Linkage
+
+// Run ROC_Initialize()/ROC_Finalize() as a matched pair several times in a
+// row (essentially a leak check).
+TEST(TEST_BASE, Initialization)
+{
+    const int rounds = 4;
+    int completed = 0;
+    while (completed < rounds)
+    {
+        ROC_Initialize();
+        ROC_Finalize();
+        ++completed;
+    }
+}
+
--- a/rocmlite/test/test_rocmlite_functions.cpp
+++ b/rocmlite/test/test_rocmlite_functions.cpp
+#include "rocmlite.hh"
+#include "gtest/gtest.h"
+
+#include <cstdlib>
+#include <exception>
+#include <fstream>
+#include <regex>
+#include <sstream>
+#include <stdexcept>
+#include <streambuf>
+#include <string>
+#include <vector>
+
+#define TEST_BASE C_Linkage_Functions
+
+using namespace std;
+
+// Helper functions
+// Read a whole text file (the tests use it for textual LLVM IR) into a string.
+//
+// filename: path of the file to read.
+// Returns the complete contents of the file.
+// Throws std::runtime_error, naming the path, if the file cannot be opened.
+std::string read_ir_from_file(const char * filename)
+{
+    // Open read-only. std::ifstream ORs std::ios::in into whatever mode it is
+    // given, so also passing std::ios::out would request read+write access
+    // and make the open fail on read-only files.
+    std::ifstream f(filename, std::ios::in);
+    if(!f.is_open())
+    {
+        throw std::runtime_error(std::string("Could not open file: ") + filename);
+    }
+
+    // Copy the whole stream buffer in one go; the ifstream closes itself when
+    // it goes out of scope.
+    std::stringstream buf;
+    buf << f.rdbuf();
+    return buf.str();
+}
+
+// Read a whole binary file (the tests use it for LLVM bitcode) into a string.
+// The returned string may contain embedded NUL bytes, so callers should use
+// size() rather than rely on NUL termination for its length.
+//
+// filename: path of the file to read.
+// Returns the complete contents of the file, byte for byte.
+// Throws std::runtime_error, naming the path, if the file cannot be opened.
+std::string read_bc_from_file(const char * filename)
+{
+    // Open read-only in binary mode. std::ifstream ORs std::ios::in into
+    // whatever mode it is given, so also passing std::ios::out would request
+    // read+write access and make the open fail on read-only files.
+    std::ifstream f(filename, std::ios::in | std::ios::binary);
+    if(!f.is_open())
+    {
+        throw std::runtime_error(std::string("Could not open file: ") + filename);
+    }
+
+    // Copy the whole stream buffer in one go; the ifstream closes itself when
+    // it goes out of scope.
+    std::stringstream buf;
+    buf << f.rdbuf();
+    return buf.str();
+}
+
+// File names of the bitcode libraries that the tests link into the demo
+// module, listed in the order in which the tests iterate over them.
+std::vector<std::string> _bitcodes =
+{
+    "opencl.amdgcn.bc",
+    "ocml.amdgcn.bc",
+    "ockl.amdgcn.bc",
+    "oclc_correctly_rounded_sqrt_off.amdgcn.bc",
+    "oclc_daz_opt_off.amdgcn.bc",
+    "oclc_finite_only_off.amdgcn.bc",
+    "oclc_isa_version_803.amdgcn.bc",
+    "oclc_unsafe_math_off.amdgcn.bc",
+    "irif.amdgcn.bc"
+};
+
+
+// GoogleTest environment that brackets the test run with the library's
+// initialise/finalise calls. This mirrors the intended use from python:
+// init is called to make sure the library is initialised, and finalize is
+// called once when gc takes place.
+class globalDSOLoadEnv: public ::testing::Environment
+{
+    public:
+        // Initialise the library.
+        void SetUp() override
+        {
+            ROC_Initialize();
+        }
+
+        // Finalise the library.
+        void TearDown() override
+        {
+            ROC_Finalize();
+        }
+};
+
+// Register a globalDSOLoadEnv instance with GoogleTest during static
+// initialisation. Per the GoogleTest documentation the framework takes
+// ownership of the object and calls its SetUp()/TearDown() around the run.
+::testing::Environment* const global_env =
+    ::testing::AddGlobalTestEnvironment(new globalDSOLoadEnv);
+
+// Check string copy/destroy works
+TEST(TEST_BASE, String_Manipulation)
+{
+    const char string_orig[] = "Use numba for AMD GPUs!";
+    char * string_copy =  ROC_CreateString(string_orig);
+
+    // Fatal: without a copy there is nothing to compare or dispose of.
+    ASSERT_TRUE(string_copy != nullptr);
+
+    // Non-fatal on purpose: a fatal ASSERT_STREQ would return before the
+    // dispose call below and leak the copy when the contents differ.
+    EXPECT_STREQ(string_orig, string_copy);
+
+    ROC_DisposeString(string_copy);
+}
+
+// Parse the demo IR text into a module, then destroy the module again.
+TEST(TEST_BASE, test_parse_ir_module)
+{
+    const std::string ir_text = read_ir_from_file("demo_ir.ll");
+
+    ModuleRef* parsed = ROC_ParseModule(ir_text.c_str());
+
+    ROC_ModuleDestroy(parsed);
+}
+
+// Parse a bitcode file into a module, then destroy the module again.
+TEST(TEST_BASE, test_parse_bc_module)
+{
+    const std::string bc_bytes = read_bc_from_file("opencl.amdgcn.bc");
+
+    // The buffer is passed together with its explicit size.
+    ModuleRef* parsed = ROC_ParseBitcode(bc_bytes.c_str(), bc_bytes.size());
+
+    ROC_ModuleDestroy(parsed);
+}
+
+// Check link-in works: every bitcode library is linked into the demo module,
+// and then linked in again several times.
+TEST(TEST_BASE, test_linkin_modules)
+{
+    std::string ir = read_ir_from_file("demo_ir.ll");
+    ModuleRef* dst = ROC_ParseModule(ir.c_str());
+
+    int ret;
+
+    for (auto& bitcode : _bitcodes)
+    {
+
+        std::string builtins_bc = read_bc_from_file(bitcode.c_str());
+        ModuleRef* bc_src = ROC_ParseBitcode(builtins_bc.c_str(),
+                                             builtins_bc.size());
+
+        // link the builtins into the module
+        ret = ROC_ModuleLinkIn(dst, bc_src);
+        EXPECT_TRUE(ret != 0);
+
+        // pointlessly link in the same module a few times, there was a subtle
+        // corruption present in previous versions of the linkin function.
+        for(int i = 0; i < 3; i++)
+        {
+            // reuse the outer result variable instead of shadowing it
+            ret = ROC_ModuleLinkIn(dst, bc_src);
+            EXPECT_TRUE(ret!=0);
+        }
+        ROC_ModuleDestroy(bc_src);
+    }
+
+    ROC_ModuleDestroy(dst);
+}
+
+// Link every bitcode library into the demo module and check that the
+// optimisation call reports success on the result.
+TEST(TEST_BASE, test_optimize_module)
+{
+    const std::string demo_ir = read_ir_from_file("demo_ir.ll");
+    ModuleRef* target = ROC_ParseModule(demo_ir.c_str());
+
+    for (const std::string& bc_name : _bitcodes)
+    {
+        const std::string bc_bytes = read_bc_from_file(bc_name.c_str());
+        ModuleRef* builtin = ROC_ParseBitcode(bc_bytes.c_str(), bc_bytes.size());
+
+        // link the builtins into the module
+        const int ret = ROC_ModuleLinkIn(target, builtin);
+        EXPECT_TRUE(ret != 0);
+
+        ROC_ModuleDestroy(builtin);
+    }
+
+    // run an optimisation pass over the module
+    const int ret = ROC_ModuleOptimize(target, 3, 0, 1, "fiji");
+    EXPECT_TRUE(ret == 1);
+
+    ROC_ModuleDestroy(target);
+}
+
+// Test compilation call to HSAIL works: link the bitcode libraries into the
+// demo module, optimise it, emit HSAIL and look for the HSA ISA marker in
+// the emitted text.
+TEST(TEST_BASE, test_compile_module_to_HSAIL)
+{
+    // NOTE(review): there is no matching ROC_Finalize() in this test;
+    // presumably the global test environment's TearDown covers it - confirm.
+    ROC_Initialize();
+    std::string ir = read_ir_from_file("demo_ir.ll");
+    ModuleRef* dst = ROC_ParseModule(ir.c_str());
+
+    int ret;
+
+    for (auto& bitcode : _bitcodes)
+    {
+
+        std::string builtins_bc = read_bc_from_file(bitcode.c_str());
+        ModuleRef* bc_src = ROC_ParseBitcode(builtins_bc.c_str(),
+                                             builtins_bc.size());
+
+        // link the builtins into the module
+        ret = ROC_ModuleLinkIn(dst, bc_src);
+        EXPECT_TRUE(ret != 0);
+
+        ROC_ModuleDestroy(bc_src);
+    }
+
+    const char * cpu = "fiji";
+
+    // run an optimisation pass over the module
+    ret = ROC_ModuleOptimize(dst, 3, 0, 1, cpu);
+    ASSERT_TRUE(ret == 1);
+
+    // Start from nullptr so that a failed emit cannot leave an indeterminate
+    // pointer behind.
+    char * output = nullptr;
+    ret = ROC_ModuleEmitHSAIL(dst, 2, cpu, &output);
+    EXPECT_TRUE(ret > 0);
+    EXPECT_TRUE(output != nullptr);
+
+    // EXPECT_* failures do not stop the test, so only inspect the buffer
+    // when the emit actually produced one.
+    if (ret > 0 && output != nullptr)
+    {
+        std::string hsail(output);
+
+        // check this is an HSA code object, search the dump for an HSA ISA string
+        // like '.hsa_code_object_isa 8,0,3,"AMD","AMDGPU"'
+        std::regex regex(".*\\.hsa_code_object_isa.*\"AMD\",\"AMDGPU\".*");
+        EXPECT_TRUE(std::regex_search(hsail, regex));
+    }
+
+    // free(nullptr) is a no-op, so this is also safe on the failure path.
+    free(output);
+
+    ROC_ModuleDestroy(dst);
+}
+
+// Test compilation call to BRIG works: link the bitcode libraries into the
+// demo module, optimise it, emit BRIG and check the output starts with the
+// ELF magic bytes.
+TEST(TEST_BASE, test_compile_module_to_BRIG)
+{
+    // NOTE(review): there is no matching ROC_Finalize() in this test;
+    // presumably the global test environment's TearDown covers it - confirm.
+    ROC_Initialize();
+    std::string ir = read_ir_from_file("demo_ir.ll");
+    ModuleRef* dst = ROC_ParseModule(ir.c_str());
+
+    int ret;
+
+    for (auto& bitcode : _bitcodes)
+    {
+
+        std::string builtins_bc = read_bc_from_file(bitcode.c_str());
+        ModuleRef* bc_src = ROC_ParseBitcode(builtins_bc.c_str(),
+                                             builtins_bc.size());
+
+        // link the builtins into the module
+        ret = ROC_ModuleLinkIn(dst, bc_src);
+        EXPECT_TRUE(ret != 0);
+
+        ROC_ModuleDestroy(bc_src);
+    }
+
+    const char * cpu = "fiji";
+
+    // run an optimisation pass over the module
+    ret = ROC_ModuleOptimize(dst, 3, 0, 1, cpu);
+    ASSERT_TRUE(ret == 1);
+
+    // Start from nullptr so that a failed emit cannot leave an indeterminate
+    // pointer behind.
+    char * output = nullptr;
+    ret = ROC_ModuleEmitBRIG(dst, 2, cpu, &output);
+    EXPECT_TRUE(ret > 0);
+    EXPECT_TRUE(output != nullptr);
+
+    char elf_string[] = "\x7f\x45\x4c\x46";
+
+    // check this is an ELF object. EXPECT_* failures do not stop the test,
+    // so only read the buffer when the emit actually produced one.
+    // NOTE(review): assumes a successful emit yields at least 4 bytes - confirm.
+    if (ret > 0 && output != nullptr)
+    {
+        for (size_t i = 0; i < 4; i++)
+        {
+            EXPECT_TRUE(output[i]==elf_string[i]);
+        }
+    }
+
+    // free(nullptr) is a no-op, so this is also safe on the failure path.
+    free(output);
+
+    ROC_ModuleDestroy(dst);
+}
+
+// Test many compilation calls to BRIG works: repeat the full
+// parse / link / optimise / emit / destroy sequence several times.
+TEST(TEST_BASE, test_many_compile_module_to_BRIG)
+{
+    const char * cpu = "fiji";
+    int trials = 5;
+
+    for(int k = 0; k < trials; k++)
+    {
+        // NOTE(review): initialised once per trial with no matching
+        // ROC_Finalize() here; presumably the global test environment's
+        // TearDown covers it - confirm.
+        ROC_Initialize();
+        std::string ir = read_ir_from_file("demo_ir.ll");
+        ModuleRef* dst = ROC_ParseModule(ir.c_str());
+
+        int ret;
+
+        for (auto& bitcode : _bitcodes)
+        {
+
+            std::string builtins_bc = read_bc_from_file(bitcode.c_str());
+            ModuleRef* bc_src = ROC_ParseBitcode(builtins_bc.c_str(),
+                                                 builtins_bc.size());
+
+            // link the builtins into the module
+            ret = ROC_ModuleLinkIn(dst, bc_src);
+            EXPECT_TRUE(ret != 0);
+
+            ROC_ModuleDestroy(bc_src);
+        }
+
+        // run an optimisation pass over the module
+        ret = ROC_ModuleOptimize(dst, 3, 0, 1, cpu);
+        ASSERT_TRUE(ret == 1);
+
+        // Start from nullptr so that a failed emit cannot leave an
+        // indeterminate pointer behind.
+        char * output = nullptr;
+        ret = ROC_ModuleEmitBRIG(dst, 2, cpu, &output);
+        EXPECT_TRUE(ret > 0);
+        EXPECT_TRUE(output != nullptr);
+
+        char elf_string[] = "\x7f\x45\x4c\x46";
+
+        // check this is an ELF object. EXPECT_* failures do not stop the
+        // test, so only read the buffer when the emit actually produced one.
+        // NOTE(review): assumes a successful emit yields at least 4 bytes -
+        // confirm.
+        if (ret > 0 && output != nullptr)
+        {
+            for (size_t i = 0; i < 4; i++)
+            {
+                EXPECT_TRUE(output[i]==elf_string[i]);
+            }
+        }
+
+        // free(nullptr) is a no-op, so this is also safe on the failure path.
+        free(output);
+
+        ROC_ModuleDestroy(dst);
+    }
+}