init

55ada7af · dugupeiwen · 55ada7af · 55ada7af · 55ada7af · 55ada7af
Commit 55ada7af authored Mar 17, 2024 by dugupeiwen
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
+#
+# Copyright (c) 2016 , Continuum Analytics, Inc.
+# All rights reserved.
+#
+# Top-level build description for librocmlite: validates the platform and the
+# LLVM installation, sets global compiler flags, then descends into rocmlite/.
+# Require cmake 2.8.12+
+cmake_minimum_required (VERSION 2.8.12)
+# Refuse to build on anything but linux
+if(APPLE OR WIN32)
+    message(FATAL_ERROR "rocmlite can only be built on linux.")
+endif()
+# CMAKE_CONDA_ROOT and CMAKE_BITCODE_ROOT carry filesystem paths, so they are
+# declared as PATH cache entries. option() only models booleans: when the
+# caller supplies no value it would leave these variables set to "OFF"
+# instead of empty. A value given with -D<name>[:PATH]=... is kept as is.
+set(CMAKE_CONDA_ROOT "" CACHE PATH "CMAKE_CONDA_ROOT")
+set(CONDA_ROOT "${CMAKE_CONDA_ROOT}")
+message(STATUS "CONDA_ROOT = ${CONDA_ROOT}")
+set(CMAKE_BITCODE_ROOT "" CACHE PATH "CMAKE_BITCODE_ROOT")
+set(BITCODE_ROOT "${CMAKE_BITCODE_ROOT}")
+message(STATUS "BITCODE_ROOT = ${BITCODE_ROOT}")
+# project name
+project (rocmlite)
+# The version number
+set (librocmlite_VERSION_MAJOR 0)
+set (librocmlite_VERSION_MINOR 1)
+set (librocmlite_VERSION ${librocmlite_VERSION_MAJOR}.${librocmlite_VERSION_MINOR})
+set (librocmlite_SOVERSION 0.1.0) # the .soversion of the shared lib.
+enable_language(CXX)
+# CMAKE 3.1.0+ has CXX_STANDARD etc, not present by default on older linux
+# just check the flag is supported (most likely gcc).
+include(CheckCXXCompilerFlag)
+CHECK_CXX_COMPILER_FLAG(-std=c++11 CXX_HAS_CXX11)
+if(CXX_HAS_CXX11)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -fno-rtti -Wall -std=c++11")
+else()
+    message(FATAL_ERROR "Compiler must support C++11")
+endif()
+find_package(LLVM REQUIRED CONFIG)
+if(LLVM_PACKAGE_VERSION VERSION_LESS 6.0)
+    message(FATAL_ERROR "llvm version must be 6.0+")
+endif()
+message(STATUS "Found LLVM version: ${LLVM_PACKAGE_VERSION}")
+message(STATUS "Using LLVMConfig.cmake found in: ${LLVM_DIR}")
+include_directories(${LLVM_INCLUDE_DIRS})
+add_definitions(${LLVM_DEFINITIONS})
+# conda root: only add its lib/ directory to the link search path when a root
+# was actually supplied, so an unset value never turns into a bogus "/lib".
+if(NOT "${CONDA_ROOT}" STREQUAL "")
+    link_directories("${CONDA_ROOT}/lib")
+endif()
+# turn on testing
+enable_testing()
+# rocmlite code
+add_subdirectory(rocmlite)
+message(
+"
+------------------------------------
+|           Build Summary          |
+------------------------------------
+Building...........: ${CMAKE_PROJECT_NAME} version ${librocmlite_VERSION}
+LLVM version.......: ${LLVM_PACKAGE_VERSION}
+LLVM location......: ${LLVM_DIR}
+C++ Compiler.......: ${CMAKE_CXX_COMPILER}
+C++ Flags..........: ${CMAKE_CXX_FLAGS}
+")
--- a/LICENSE
+++ b/LICENSE
+BSD 2-Clause License
+Copyright (c) 2018, Numba
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/README.md
+++ b/README.md
+# roctools
+This repository acts as a collection point for the resources needed to produce 
+a conda package containing all the components necessary to use Numba with AMD 
+GCN discrete GPUs. Included are:
+ * Source code and build tooling for a library, `librocmlite`, which performs 
+the same function for AMD GPUs as 
+[`llvmlite`](https://github.com/numba/llvmlite) does for CPUs. It essentially 
+acts as shim between Python and LLVM. For convenience, `librocmlite` is 
+statically linked against releases from AMDs [LLVM 
+fork](https://github.com/RadeonOpenCompute/llvm), therefore there is no LLVM 
+dependency.
+ * A conda recipe (`llvmdev_amdgcn`) for building the aforementioned fork of 
+LLVM to bootstrap the `roctools` package.
+ * A conda recipe (`roctools`) that:
+    * Builds and tests `librocmlite`
+    * Extracts necessary math (and other) library bitcodes from AMDs `rpm` 
+based releases.
+    * Extracts necessary binaries from a build of AMDs LLVM fork (as a conda 
+package).
+    It is this package upon which Numba depends.
+------------------------
+## Conda build instructions
+1. Build the AMD LLVM fork package (this will take a while):
+    ```
+    $ conda build conda-recipes/llvmdev_amdgcn
+    ```
+    Upon successful completion a package called `llvmdev_amdgcn-{version}` will 
+be produced. This package is needed to bootstrap the build of `librocmlite` and 
+also to provide some binary tools used in the AMD GCN tool chain.
+2. Build the roctools package:
+    ```
+    $ conda build conda-recipes/roctools
+    ```
+    Upon successful completion a package called `roctools-{version}` will 
+be produced. This package is self contained and holds all the necessary 
+components for using AMD GCN GPUs.
+------------------------
+## License
+See [LICENSE](https://github.com/numba/roctools/blob/master/LICENSE).
--- a/conda-recipes/llvmdev_amdgcn/0001-Transforms-Add-missing-header-for-InstructionCombini.patch
+++ b/conda-recipes/llvmdev_amdgcn/0001-Transforms-Add-missing-header-for-InstructionCombini.patch
+From 7c9054610e354340f9474dcd13a927f929912d1d Mon Sep 17 00:00:00 2001
+From: Eugene Zelenko <eugene.zelenko@gmail.com>
+Date: Tue, 6 Mar 2018 23:06:13 +0000
+Subject: [PATCH] [Transforms] Add missing header for InstructionCombining.cpp,
+ in order to export LLVMInitializeInstCombine as extern "C". Fixes PR35947.
+Patch by Brenton Bostick.
+Differential revision: https://reviews.llvm.org/D44140
+git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@326843 91177308-0d34-0410-b5e6-96231b3b80d8
+---
+ lib/Transforms/InstCombine/InstructionCombining.cpp | 1 +
+ 1 file changed, 1 insertion(+)
+diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
+index a3b2fe9..7ec7343 100644
+--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
+@@ -34,6 +34,7 @@
+ //===----------------------------------------------------------------------===//
+ #include "InstCombineInternal.h"
+#include "llvm-c/Initialization.h"
+ #include "llvm/ADT/APInt.h"
+ #include "llvm/ADT/ArrayRef.h"
+ #include "llvm/ADT/DenseMap.h"
+-- 
+1.8.3.1
--- a/conda-recipes/llvmdev_amdgcn/bld.bat
+++ b/conda-recipes/llvmdev_amdgcn/bld.bat
+REM Windows build script for the llvmdev_amdgcn recipe: configures, builds and
+REM installs LLVM with CMake using the Visual Studio 2015 generator.
+REM NOTE(review): only the X86 target is enabled below, whereas the recipe's
+REM build.sh enables "AMDGPU;X86" - confirm whether this script is still used.
+mkdir build
+cd build
+set BUILD_CONFIG=Release
+REM Configure step
+REM Pick the 32-bit or 64-bit Visual Studio generator based on %ARCH%.
+if "%ARCH%"=="32" (
+    set CMAKE_GENERATOR=Visual Studio 14 2015
+) else (
+    set CMAKE_GENERATOR=Visual Studio 14 2015 Win64
+)
+set CMAKE_GENERATOR_TOOLSET=v140_xp
+REM Reduce build times and package size by removing unused stuff
+set CMAKE_CUSTOM=-DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_INCLUDE_TESTS=OFF ^
+    -DLLVM_INCLUDE_UTILS=OFF -DLLVM_INCLUDE_DOCS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF ^
+    -DLLVM_ENABLE_ASSERTIONS=ON
+cmake -G "%CMAKE_GENERATOR%" -T "%CMAKE_GENERATOR_TOOLSET%" ^
+    -DCMAKE_BUILD_TYPE="%BUILD_CONFIG%" -DCMAKE_PREFIX_PATH=%LIBRARY_PREFIX% ^
+    -DCMAKE_INSTALL_PREFIX:PATH=%LIBRARY_PREFIX% %CMAKE_CUSTOM% %SRC_DIR%
+REM Abort the script as soon as any cmake invocation reports failure.
+if errorlevel 1 exit 1
+REM Build step
+cmake --build . --config "%BUILD_CONFIG%"
+if errorlevel 1 exit 1
+REM Install step
+cmake --build . --config "%BUILD_CONFIG%" --target install
+if errorlevel 1 exit 1
--- a/conda-recipes/llvmdev_amdgcn/build.sh
+++ b/conda-recipes/llvmdev_amdgcn/build.sh
+#!/bin/bash
+# Build script for the llvmdev_amdgcn recipe: collects CMake options in the
+# _cmake_config array, configures an out-of-tree build in ./build with the
+# "Unix Makefiles" generator, then builds and installs into ${PREFIX}.
+# based on https://github.com/AnacondaRecipes/llvmdev-feedstock/blob/master/recipe/build.sh
+set -x
+# This is the clang compiler prefix
+DARWIN_TARGET=x86_64-apple-darwin13.4.0
+declare -a _cmake_config
+_cmake_config+=(-DCMAKE_INSTALL_PREFIX:PATH=${PREFIX})
+_cmake_config+=(-DCMAKE_BUILD_TYPE:STRING=Release)
+# The bootstrap clang I use was built with a static libLLVMObject.a and I am trying to get the same here
+# _cmake_config+=(-DBUILD_SHARED_LIBS:BOOL=ON)
+_cmake_config+=(-DLLVM_ENABLE_ASSERTIONS:BOOL=ON)
+_cmake_config+=(-DLINK_POLLY_INTO_TOOLS:BOOL=ON)
+# Don't really require libxml2. Turn it off explicitly to avoid accidentally linking to system libs
+_cmake_config+=(-DLLVM_ENABLE_LIBXML2:BOOL=OFF)
+# Urgh, llvm *really* wants to link to ncurses / terminfo and we *really* do not want it to.
+_cmake_config+=(-DHAVE_TERMINFO_CURSES=OFF)
+# Sometimes these are reported as unused. Whatever.
+_cmake_config+=(-DHAVE_TERMINFO_NCURSES=OFF)
+_cmake_config+=(-DHAVE_TERMINFO_NCURSESW=OFF)
+_cmake_config+=(-DHAVE_TERMINFO_TERMINFO=OFF)
+_cmake_config+=(-DHAVE_TERMINFO_TINFO=OFF)
+_cmake_config+=(-DHAVE_TERMIOS_H=OFF)
+_cmake_config+=(-DCLANG_ENABLE_LIBXML=OFF)
+_cmake_config+=(-DLIBOMP_INSTALL_ALIASES=OFF)
+_cmake_config+=(-DLLVM_ENABLE_RTTI=OFF)
+# Build the AMDGPU backend alongside the X86 one.
+_cmake_config+=(-DLLVM_TARGETS_TO_BUILD="AMDGPU;X86")
+# TODO :: It would be nice if we had a cross-ecosystem 'BUILD_TIME_LIMITED' env var we could use to
+#         disable these unnecessary but useful things.
+if [[ ${CONDA_FORGE} == yes ]]; then
+  _cmake_config+=(-DLLVM_INCLUDE_TESTS=OFF)
+  _cmake_config+=(-DLLVM_INCLUDE_UTILS=OFF)
+  _cmake_config+=(-DLLVM_INCLUDE_DOCS=OFF)
+  _cmake_config+=(-DLLVM_INCLUDE_EXAMPLES=OFF)
+fi
+# Only valid when using the Ninja Generator AFAICT
+# _cmake_config+=(-DLLVM_PARALLEL_LINK_JOBS:STRING=1)
+# What about cross-compiling targeting Darwin here? Are any of these needed?
+if [[ $(uname) == Darwin ]]; then
+  _cmake_config+=(-DCMAKE_OSX_SYSROOT=${SYSROOT_DIR})
+  _cmake_config+=(-DDARWIN_macosx_CACHED_SYSROOT=${SYSROOT_DIR})
+  _cmake_config+=(-DCMAKE_OSX_DEPLOYMENT_TARGET=${MACOSX_DEPLOYMENT_TARGET})
+  _cmake_config+=(-DCMAKE_LIBTOOL=$(which ${DARWIN_TARGET}-libtool))
+  _cmake_config+=(-DLD64_EXECUTABLE=$(which ${DARWIN_TARGET}-ld))
+  _cmake_config+=(-DCMAKE_INSTALL_NAME_TOOL=$(which ${DARWIN_TARGET}-install_name_tool))
+  # Once we are using our libc++ (not until llvm_build_final), it will be single-arch only and not setting
+  # this causes link failures building the sanitizers since they respect DARWIN_osx_ARCHS. We may as well
+  # save some compilation time by setting this for all of our llvm builds.
+  _cmake_config+=(-DDARWIN_osx_ARCHS=x86_64)
+#elif [[ $(uname) == Linux ]]; then
+#  _cmake_config+=(-DLLVM_BINUTILS_INCDIR=${PREFIX}/lib/gcc/${cpu_arch}-${vendor}-linux-gnu/${compiler_ver}/plugin/include)
+fi
+# For when the going gets tough:
+# _cmake_config+=(-Wdev)
+# _cmake_config+=(--debug-output)
+# _cmake_config+=(--trace-expand)
+# CPU_COUNT=1
+# Configure in a separate build directory; ".." is the source checkout.
+mkdir build
+cd build
+cmake -G'Unix Makefiles'     \
+      "${_cmake_config[@]}"  \
+      ..
+make -j${CPU_COUNT} VERBOSE=1
+make install
--- a/conda-recipes/llvmdev_amdgcn/cfg_test.ll
+++ b/conda-recipes/llvmdev_amdgcn/cfg_test.ll
+; ModuleID = 'foo'
+source_filename = "<string>"
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-darwin17.4.0"
+@.const.foo = internal constant [4 x i8] c"foo\00"
+@".const.Fatal error: missing _dynfunc.Closure" = internal constant [38 x i8] c"Fatal error: missing _dynfunc.Closure\00"
+@PyExc_RuntimeError = external global i8
+@".const.missing Environment" = internal constant [20 x i8] c"missing Environment\00"
+; Function Attrs: norecurse nounwind
+declare i32 @"_ZN8__main__7foo$241Ex"(i64* noalias nocapture %retptr, { i8*, i32 }** noalias nocapture readnone %excinfo, i8* noalias nocapture readnone %env, i64 %arg.x) local_unnamed_addr #0
+; Function exercised by the recipe's `opt -dot-cfg` test. The conditional
+; branch in %entry carries !prof !0, whose branch_weights are 1 and 9;
+; test_cfg_dot.py expects those to show up as "W:1" and "W:9" labels in
+; cfg.testme.dot.
+define i8* @"testme"(i8* %py_closure, i8* %py_args, i8* nocapture readnone %py_kws) local_unnamed_addr {
+entry:
+  %.5 = alloca i8*, align 8
+  %.6 = call i32 (i8*, i8*, i64, i64, ...) @PyArg_UnpackTuple(i8* %py_args, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.const.foo, i64 0, i64 0), i64 1, i64 1, i8** nonnull %.5)
+  %.7 = icmp eq i32 %.6, 0
+  br i1 %.7, label %entry_if, label %entry_endif, !prof !0
+entry_if:                                         ; preds = %entry.endif.1.1.endif, %entry
+  ret i8* null
+entry_endif:                                      ; preds = %entry
+  ; NOTE: %.11 is computed but never used; both exits return null.
+  %.11 = icmp eq i8* %py_closure, null
+  ret i8* null
+}
+declare i32 @PyArg_UnpackTuple(i8*, i8*, i64, i64, ...) local_unnamed_addr
+; Function Attrs: nounwind
+declare i32 @puts(i8* nocapture readonly) local_unnamed_addr #1
+declare void @PyErr_SetString(i8*, i8*) local_unnamed_addr
+declare i8* @PyNumber_Long(i8*) local_unnamed_addr
+declare i64 @PyLong_AsLongLong(i8*) local_unnamed_addr
+declare void @Py_DecRef(i8*) local_unnamed_addr
+declare i8* @PyErr_Occurred() local_unnamed_addr
+declare i8* @PyLong_FromLongLong(i64) local_unnamed_addr
+; Function Attrs: nounwind
+declare void @llvm.stackprotector(i8*, i8**) #1
+attributes #0 = { norecurse nounwind }
+attributes #1 = { nounwind }
+!0 = !{!"branch_weights", i32 1, i32 9}
+!1 = !{!"branch_weights", i32 9, i32 1}
--- a/conda-recipes/llvmdev_amdgcn/meta.yaml
+++ b/conda-recipes/llvmdev_amdgcn/meta.yaml
+# conda recipe for llvmdev_amdgcn: builds the RadeonOpenCompute LLVM fork
+# (with lld checked out under tools/lld) at the git tag given by `version`.
+# NOTE(review): `shortversion` is not referenced anywhere in this file.
+{% set shortversion = "roc-1.8" %}
+{% set version = "roc-1.8.x" %}
+{% set conda_version = "roc_1.8.x" %}
+{% set build_number = "0" %}
+package:
+  name: llvmdev_amdgcn
+  version: {{ conda_version }}
+source:
+  - git_url: https://github.com/RadeonOpenCompute/llvm.git
+    git_tag: {{ version }}
+    # NOTE(review): the recipe directory also ships
+    # 0001-Transforms-Add-missing-header-for-InstructionCombini.patch, which
+    # is not listed here - confirm that omitting it is intentional.
+    patches:
+        # undefined behavior bug due to Twine usage
+        - twine_cfg_undefined_behavior.patch
+  - git_url: https://github.com/RadeonOpenCompute/lld.git
+    git_tag: {{ version }}
+    folder: tools/lld
+build:
+  number: {{ build_number }}
+  script_env:
+    - PY_VCRUNTIME_REDIST
+  ignore_run_exports:
+    # Is static-linked
+    - xar
+requirements:
+  build:
+    # We cannot do this on macOS or windows
+    # OSX already has llvm so has to be handled
+    # at build.sh time
+    # Windows needs to build using vs2015_runtime
+    # irrespective of python version
+    - {{ compiler('c') }} # [unix]
+    - {{ compiler('cxx') }} # [unix]
+    - cmake
+    # Needed to unpack the source tarball
+    # NOTE(review): both sources above are fetched with git_url, so this
+    # tarball remark is presumably stale - confirm.
+    - m2w64-xz  # [py27 and win]
+    # ninja not currently used, bld.bat needs an update
+    - ninja  # [win]
+    # Needed to build LLVM
+    - python
+    # need vs2015_runtime to build, do not want it at run time
+    # as extensions for py27 need vs2008
+    - vs2015_runtime # [win]
+    - make # [unix]
+  host:
+    # needed for llc at runtime
+    - zlib # [not win]
+    - xar # [osx]
+test:
+  requires:
+    - python
+  files:
+    - cfg_test.ll
+    - test_cfg_dot.py
+  commands:
+    - $PREFIX/bin/llvm-config --libs                         # [not win]
+    - $PREFIX/bin/llc -version                               # [not win]
+    - if not exist %LIBRARY_INC%\\llvm\\Pass.h exit 1        # [win]
+    - if not exist %LIBRARY_LIB%\\LLVMSupport.lib exit 1     # [win]
+    - test -f $PREFIX/include/llvm/Pass.h                    # [unix]
+    - test -f $PREFIX/lib/libLLVMSupport.a                   # [unix]
+    - test -f $PREFIX/lib/libLLVMCore.a                      # [not win]
+    # Test for ../twine_cfg_undefined_behavior.patch
+    # opt writes the CFG dot file that test_cfg_dot.py then inspects.
+    - $PREFIX/bin/opt -dot-cfg cfg_test.ll                   # [not win]
+    - python test_cfg_dot.py                                 # [not win]
+about:
+  home: http://llvm.org/
+  # NOTE(review): dev_url points at llvm-mirror, while the source above is
+  # the RadeonOpenCompute fork - confirm which one is intended.
+  dev_url: https://github.com/llvm-mirror/llvm
+  license: NCSA
+  license_file: LICENSE.TXT
+  summary: Development headers and libraries for LLVM
--- a/conda-recipes/llvmdev_amdgcn/test_cfg_dot.py
+++ b/conda-recipes/llvmdev_amdgcn/test_cfg_dot.py
+with open("cfg.testme.dot") as fin:
+    got = fin.read()
+assert '[label="W:1"]' in got
+assert '[label="W:9"]' in got
--- a/conda-recipes/llvmdev_amdgcn/twine_cfg_undefined_behavior.patch
+++ b/conda-recipes/llvmdev_amdgcn/twine_cfg_undefined_behavior.patch
+From b42222e01abc1a799c4e421fa26d72d49afe4b99 Mon Sep 17 00:00:00 2001
+From: Siu Kwan Lam <michael.lam.sk@gmail.com>
+Date: Fri, 23 Mar 2018 11:46:45 -0500
+Subject: [PATCH] Patch to fix undefined behavior in cfgprinter
+---
+ include/llvm/Analysis/CFGPrinter.h | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+diff --git a/include/llvm/Analysis/CFGPrinter.h b/include/llvm/Analysis/CFGPrinter.h
+index 5786769..a4b642b 100644
+--- a/include/llvm/Analysis/CFGPrinter.h
+++ b/include/llvm/Analysis/CFGPrinter.h
+@@ -172,8 +172,7 @@ struct DOTGraphTraits<const Function*> : public DefaultDOTGraphTraits {
+     // Prepend a 'W' to indicate that this is a weight rather than the actual
+     // profile count (due to scaling).
+-    Twine Attrs = "label=\"W:" + Twine(Weight->getZExtValue()) + "\"";
+-    return Attrs.str();
+    return ("label=\"W:" + Twine(Weight->getZExtValue()) + "\"").str();
+   }
+ };
+ } // End llvm namespace
+-- 
+2.10.1
--- a/conda-recipes/roctools/build.sh
+++ b/conda-recipes/roctools/build.sh
+#!/bin/bash
+# Build script for the roctools recipe. It (1) extracts the AMD bitcode
+# libraries from the ROCm OpenCL rpm into $PREFIX/share/rocmtools, (2) builds
+# and tests librocmlite with CMake, and (3) copies the LLVM command line tools
+# from the build environment into $PREFIX/bin.
+set -x
+###############################################################################
+# Extract bitcodes from ROCM rpm source
+###############################################################################
+RPM_PATH="$(readlink -f opencl_tmp/*.rpm)"
+ROCM_PATH="opt/rocm/opencl/lib/x86_64/bitcode"
+declare -a bitcodes=(                       \
+"opencl.amdgcn.bc"                          \
+"ocml.amdgcn.bc"                            \
+"ockl.amdgcn.bc"                            \
+"oclc_correctly_rounded_sqrt_off.amdgcn.bc" \
+"oclc_daz_opt_off.amdgcn.bc"                \
+"oclc_finite_only_off.amdgcn.bc"            \
+"oclc_isa_version_803.amdgcn.bc"            \
+"oclc_unsafe_math_off.amdgcn.bc"            \
+"irif.amdgcn.bc"                            \
+)
+for bitcode in "${bitcodes[@]}"; do
+    bsdtar -x -f "$RPM_PATH" --strip-components 6 "$ROCM_PATH/$bitcode"
+done
+# move the bitcode to the pkg dir
+RESOURCE_PATH="$PREFIX/share/rocmtools"
+# Make sure the parent directory exists so the move cannot fail on a fresh
+# prefix; quote the paths so they survive word splitting.
+mkdir -p "$PREFIX/share"
+mv bitcode "$RESOURCE_PATH"
+###############################################################################
+# Now do C++ library build
+###############################################################################
+CMAKE_BUILD_DIR="cmake_build" # this needs to match meta.yaml test::source_files
+mkdir "${CMAKE_BUILD_DIR}"
+pushd "${CMAKE_BUILD_DIR}"
+printenv
+# Force CMake to look in the conda env "CMAKE_CONDA_ROOT" `/lib` etc 
+# for libraries via `-L`
+cmake .. -DCMAKE_BUILD_TYPE=RELEASE \
+         -DCMAKE_CONDA_ROOT:PATH="$BUILD_PREFIX" \
+         -DCMAKE_BITCODE_ROOT:PATH="$RESOURCE_PATH"
+# build
+make VERBOSE=1
+# move DSO to lib
+mkdir -p "$PREFIX/lib"
+cp "rocmlite/librocmlite.so" "$PREFIX/lib"
+# test now, splitting this out to work at test time is hard to do
+# the test_XXX binaries are dynamically linked to librocmlite but no rpath
+# fix is made unless the binaries are also shipped (undesirable).
+ctest -V
+popd
+###############################################################################
+# Copy llvmdev binary tools to /bin
+# NOTE: should these names start to cause collision issues with llvm installs
+# they can be prefixed e.g. amd_opt. However `ld.lld` will need to have a 
+# `-flavour gnu` permanently supplied so it knows that it is emulating the GNU
+# linker variant.
+###############################################################################
+declare -a tools=( \
+"opt"              \
+"llc"              \
+"llvm-link"        \
+"ld.lld"           \
+)
+mkdir -p "$PREFIX/bin"
+for tool in "${tools[@]}"; do
+    cp "$BUILD_PREFIX/bin/$tool" "$PREFIX/bin/$tool"
+done
--- a/conda-recipes/roctools/meta.yaml
+++ b/conda-recipes/roctools/meta.yaml
+# conda recipe for roctools: combines this repository's sources with the
+# rocm-opencl-devel rpm (downloaded into opencl_tmp/ and checked by sha256).
+{% set opencl_devel_ver="1.2.0-2018053132" %}
+{% set opencl_devel_sha256="95f429a25d7e6081fe1c75bd05feb5e515408b82a1631f09d96abb4232e1af68" %}
+package:
+    name: roctools
+    # Taken from the GIT_DESCRIBE_TAG environment variable; falls back to an
+    # empty string when that variable is not set.
+    version:  {{ environ.get('GIT_DESCRIBE_TAG', '') }}
+source:
+  - path: ../..
+  - fn: rocm-opencl-devel-{{ opencl_devel_ver }}.x86_64.rpm
+    url: http://repo.radeon.com/rocm/yum/rpm/rocm-opencl-devel-{{ opencl_devel_ver }}.x86_64.rpm
+    folder: opencl_tmp
+    sha256: {{ opencl_devel_sha256 }}
+build:
+    number: {{ environ.get('GIT_DESCRIBE_NUMBER', 0) }}
+requirements:
+  build:
+    - {{ compiler('c') }} # [unix]
+    - {{ compiler('cxx') }} # [unix]
+    - cmake>=2.8
+    - xz
+    - zlib
+    - bzip2
+    # presumably supplies the bsdtar that build.sh uses to unpack the rpm -
+    # TODO confirm
+    - libarchive
+    - llvmdev_amdgcn
+    # gtest is used in the test binaries
+    - gtest
+  host:
+    - zlib # for llvm binary tooling that is copied in
+# NOTE(review): build.sh says its cmake_build directory must match
+# test::source_files, but no source_files entry exists below - confirm.
+test:
+  commands:
+    # The librocmlite.so DSO is tested at compile time
+    # Check llvm binaries actually run
+    - opt --help | grep amdgpu
+    - llc --help | grep amdgpu
+    - llvm-link --help
+    - ld.lld --help
+about:
+    home: https://github.com/numba/roctools
+    license: BSD
+    summary: A shared library that wraps LLVM for code
+             generation for devices with Radeon Open Compute support.
+    license_file: LICENSE
--- a/include/rocmlite.hh
+++ b/include/rocmlite.hh
+/**
+ * Copyright (c) 2016 , Continuum Analytics, Inc.
+ * All rights reserved.
+ */
+// Public interface of librocmlite: C++ helpers in namespace librocmlite and
+// the ROC_* entry points declared with C linkage.
+#ifndef _ROC_HH
+#define _ROC_HH
+// Standard headers for the names this file uses directly (size_t,
+// std::unique_ptr, std::string), so it does not rely on the LLVM headers
+// pulling them in.
+#include <cstddef>
+#include <memory>
+#include <string>
+#include "llvm/IR/Module.h"
+#include "llvm/Support/FormattedStream.h"
+// NOTE(review): a using-directive in a header leaks into every file that
+// includes it; kept because existing sources may rely on it.
+using namespace std;
+namespace librocmlite
+{
+    // Wrapper around a raw llvm::Module pointer; instances are what the
+    // ROC_* functions below exchange with callers.
+    class ModuleRef
+    {
+        public:
+            ModuleRef(llvm::Module * module);
+            operator bool () const;
+            llvm::Module * getModule();
+            void destroy();
+            std::string to_string();
+            // Factory functions: build a ModuleRef from textual assembly
+            // (Asm) or from a bitcode buffer of Len bytes.
+            static ModuleRef* parseAssembly(const char* Asm);
+            static ModuleRef* parseBitcode(const char *Bitcode, size_t Len);
+        private:
+            llvm::Module * M;
+    };
+    // Initializes the llvm library tooling.
+    void Initialize();
+    // Finalizes the llvm library tooling.
+    void Finalize();
+    // Optimize a module in place
+    void Optimize(llvm::Module * M, int OptLevel, int SizeLevel, int Verify, const char * Cpu);
+    // Compile a module
+    int CompileModule(std::unique_ptr<llvm::Module> mod, llvm::raw_string_ostream &os, bool emitBRIG,
+                      int OptLevel);
+}
+#ifdef __cplusplus
+extern "C"
+{
+#endif // __cplusplus
+// Lets the declarations below name ModuleRef without qualification.
+using namespace librocmlite;
+// ROC_ C/CFFI entry points
+void ROC_Initialize();
+void ROC_Finalize();
+char* ROC_CreateString(const char *str);
+void ROC_DisposeString(char *str);
+// rename this to ParseIR2Module ?
+ModuleRef* ROC_ParseModule(const char *Asm);
+ModuleRef* ROC_ParseBitcode(const char *Asm, size_t Len);
+void ROC_ModulePrint(ModuleRef *M, char **output);
+void ROC_ModuleDestroy(ModuleRef *M);
+int ROC_ModuleOptimize(ModuleRef *M, int OptLevel, int SizeLevel, int Verify, const char * Cpu);
+int ROC_ModuleLinkIn(ModuleRef * Dst, ModuleRef * Src);
+int ROC_ModuleEmitHSAIL(ModuleRef *M, int OptLevel, const char * Cpu, char **output);
+size_t ROC_ModuleEmitBRIG(ModuleRef *M, int OptLevel, const char * Cpu, char **output);
+void ROC_SetCommandLineOption(int argc, const char * const * argv);
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+#endif // ifndef _ROC_HH
--- a/rocmlite/CMakeLists.txt
+++ b/rocmlite/CMakeLists.txt
+#
+# Copyright (c) 2016 , Continuum Analytics, Inc.
+# All rights reserved.
+#
+# Builds the rocmlite shared library from rocmlite.cpp and links it against
+# the LLVM component libraries listed below.
+set(LIBROCMLITE_SOURCES rocmlite.cpp)
+add_library(
+            rocmlite
+            SHARED
+            ${LIBROCMLITE_SOURCES}
+            )
+# llvm components needed (will get mapped to libs for linking)
+# See `llvm-config --components` for a list of available components.
+llvm_map_components_to_libnames(
+llvm_libs # output variable: receives the library names that the
+          # components below map to.
+#all
+amdgpuasmparser
+amdgpuasmprinter
+amdgpucodegen
+amdgpudesc
+amdgpudisassembler
+amdgpuinfo
+amdgpuutils
+coroutines
+objcarcopts
+native
+core
+)
+# Link against LLVM libraries. PRIVATE keeps the LLVM libraries out of the
+# link line of targets that link against rocmlite (e.g. the test binaries),
+# which the keyword-less signature would otherwise propagate.
+target_link_libraries(rocmlite PRIVATE ${llvm_libs})
+# include include/
+target_include_directories(rocmlite PUBLIC ${CMAKE_SOURCE_DIR}/include)
+# set library properties
+set_target_properties(rocmlite PROPERTIES
+                     VERSION   ${librocmlite_VERSION}
+                     SOVERSION ${librocmlite_SOVERSION})
+# Add in test dir
+add_subdirectory(test)
--- a/rocmlite/rocmlite.cpp
+++ b/rocmlite/rocmlite.cpp
--- a/rocmlite/test/CMakeLists.txt
+++ b/rocmlite/test/CMakeLists.txt
+#
+# Copyright (c) 2016 , Continuum Analytics, Inc.
+# All rights reserved.
+#
+# Registers the gtest binaries with CTest (plus a valgrind variant of each
+# when valgrind is available) and copies the resources they read at run time
+# next to them.
+find_program(VALGRIND valgrind)
+if(NOT VALGRIND)
+    message("Did not find valgrind.")
+else()
+    message("Found valgrind. ${VALGRIND}")
+endif()
+# add_gtest(<name>)
+#   Builds <name>.cpp into an executable linked against rocmlite and gtest
+#   and registers it as CTest test <name>. If valgrind was found, the same
+#   binary is also registered as <name>_with_valgrind.
+function(add_gtest TESTNAME)
+    add_executable(${TESTNAME} ${TESTNAME}.cpp)
+    # include include/
+    target_include_directories(${TESTNAME} PUBLIC ${CMAKE_SOURCE_DIR}/include ${BIN_INCLUDE_DIR})
+    # set linkage; rocmlite is a target of this build, so no extra link
+    # directory is needed for it.
+    target_link_libraries(${TESTNAME}
+                         rocmlite
+                         gtest
+                         gtest_main
+                         pthread
+                         )
+    # Naming the target in COMMAND lets CMake resolve the executable path,
+    # instead of reading the LOCATION target property.
+    add_test(NAME ${TESTNAME} COMMAND ${TESTNAME})
+    if(VALGRIND)
+        # Run the valgrind that find_program located rather than whatever
+        # "valgrind" happens to be on PATH at test time.
+        add_test(NAME ${TESTNAME}_with_valgrind
+            COMMAND ${VALGRIND} --error-exitcode=1 --leak-check=full --show-leak-kinds=all
+            --suppressions=test_suppressions.supp
+            $<TARGET_FILE:${TESTNAME}>)
+    endif()
+endfunction()
+set(TESTS
+    test_rocmlite
+    test_rocmlite_functions
+    )
+foreach(TEST ${TESTS})
+    add_gtest(${TEST})
+endforeach()
+# Copy the bitcode libraries into the test directory. Skipped with a warning
+# when no bitcode root was configured, instead of failing at build time.
+if(BITCODE_ROOT)
+    add_custom_target(COPY_IN_COMPILATION_RESOURCES ALL
+                     COMMAND ${CMAKE_COMMAND} -E copy_directory
+                     ${BITCODE_ROOT}
+                     ${CMAKE_BINARY_DIR}/rocmlite/test/
+                     VERBATIM)
+else()
+    message(WARNING "BITCODE_ROOT is not set; bitcode files will not be copied for the tests.")
+endif()
+# Copy the test resources from the source tree into the test directory.
+add_custom_target(COPY_IN_TEST_RESOURCES ALL
+                 COMMAND ${CMAKE_COMMAND} -E copy_directory
+                 ${CMAKE_SOURCE_DIR}/rocmlite/test/resources
+                 ${CMAKE_BINARY_DIR}/rocmlite/test/
+                 VERBATIM)
--- a/rocmlite/test/resources/demo_ir.ll
+++ b/rocmlite/test/resources/demo_ir.ll
--- a/rocmlite/test/resources/test_suppressions.supp
+++ b/rocmlite/test/resources/test_suppressions.supp
+{
+   <Suppress_leak_in_stack_trace_handler>
+   Memcheck:Leak
+   match-leak-kinds: definite
+   fun:malloc
+   fun:_ZL17CreateSigAltStackv
+   fun:_ZL16RegisterHandlersv
+   fun:_ZN4llvm3sys16AddSignalHandlerEPFvPvES1_
+   fun:_ZN4llvm3sys28PrintStackTraceOnErrorSignalEb
+}
+{
+   <Suppress_leak_in_stack_trace_handler_with_debug_info>
+   Memcheck:Leak
+   match-leak-kinds: definite
+   fun:malloc
+   fun:CreateSigAltStack
+   fun:_ZL16RegisterHandlersv
+   fun:_ZN4llvm3sys16AddSignalHandlerEPFvPvES1_
+}
+{
+   <Suppress_leak_in_stack_trace_in_docker>
+   Memcheck:Leak
+   match-leak-kinds: definite
+   fun:malloc
+   fun:_ZL16RegisterHandlersv
+}
--- a/rocmlite/test/test_rocmlite.cpp
+++ b/rocmlite/test/test_rocmlite.cpp
+#include "rocmlite.hh"
+#include "gtest/gtest.h"
+#define TEST_BASE C_Linkage
+// Repeatedly bring the library up and tear it down again; intended as a
+// leak check on the initialize/finalize pair.
+TEST(TEST_BASE, Initialization)
+{
+    int round = 0;
+    while (round < 4)
+    {
+        ROC_Initialize();
+        ROC_Finalize();
+        ++round;
+    }
+}
--- a/rocmlite/test/test_rocmlite_functions.cpp
+++ b/rocmlite/test/test_rocmlite_functions.cpp
+#include "rocmlite.hh"
+#include "gtest/gtest.h"
+#include <string>
+#include <fstream>
+#include <streambuf>
+#include <exception>
+#include <regex>
+#define TEST_BASE C_Linkage_Functions
+using namespace std;
+// Helper functions
+// Returns the complete contents of the text file `filename`.
+// Throws std::runtime_error if the file cannot be opened.
+std::string read_ir_from_file(const char * filename)
+{
+    // Open read-only. std::ifstream always adds ios::in to the requested
+    // mode, so asking for ios::out would become a read/write open, which
+    // fails for files that are not writable.
+    std::ifstream f(filename, std::ios::in);
+    if(!f.is_open())
+    {
+        throw std::runtime_error("Could not open file.");
+    }
+    std::stringstream buf;
+    buf << f.rdbuf();
+    return buf.str();
+}
+// Returns the complete contents of the file `filename`, read in binary mode.
+// Throws std::runtime_error if the file cannot be opened.
+std::string read_bc_from_file(const char * filename)
+{
+    // Open read-only in binary mode. std::ifstream always adds ios::in to
+    // the requested mode, so asking for ios::out would become a read/write
+    // open, which fails for files that are not writable.
+    std::ifstream f(filename, std::ios::in | std::ios::binary);
+    if(!f.is_open())
+    {
+        throw std::runtime_error("Could not open file.");
+    }
+    std::stringstream buf;
+    buf << f.rdbuf();
+    return buf.str();
+}
+// File names of the bitcode libraries that the tests link into a module,
+// in the order in which they are linked.
+std::vector<std::string> _bitcodes = {
+    "opencl.amdgcn.bc",
+    "ocml.amdgcn.bc",
+    "ockl.amdgcn.bc",
+    "oclc_correctly_rounded_sqrt_off.amdgcn.bc",
+    "oclc_daz_opt_off.amdgcn.bc",
+    "oclc_finite_only_off.amdgcn.bc",
+    "oclc_isa_version_803.amdgcn.bc",
+    "oclc_unsafe_math_off.amdgcn.bc",
+    "irif.amdgcn.bc"
+};
+// gtest global environment: initializes the library once before any test
+// runs and finalizes it once after the last test. This mirrors the use from
+// python, where init is called to make sure the library is ready and a
+// single finalize happens when gc takes place.
+class globalDSOLoadEnv: public ::testing::Environment
+{
+    public:
+        void SetUp() override { ROC_Initialize(); }
+        void TearDown() override { ROC_Finalize(); }
+};
+// Registering the environment hands ownership of the object to gtest.
+::testing::Environment* const global_env =
+    ::testing::AddGlobalTestEnvironment(new globalDSOLoadEnv);
+// A string created through the library must compare equal to its source and
+// must be disposable afterwards.
+TEST(TEST_BASE, String_Manipulation)
+{
+    const char source_text[] = "Use numba for AMD GPUs!";
+    char * duplicate = ROC_CreateString(source_text);
+    ASSERT_TRUE(duplicate != nullptr);
+    ASSERT_STREQ(source_text, duplicate);
+    ROC_DisposeString(duplicate);
+}
+// Parse the demo IR text into a module, then destroy the module again.
+TEST(TEST_BASE, test_parse_ir_module)
+{
+    const std::string ir_text = read_ir_from_file("demo_ir.ll");
+    ModuleRef* parsed = ROC_ParseModule(ir_text.c_str());
+    ROC_ModuleDestroy(parsed);
+}
+// Parse a bitcode file into a module, then destroy the module again.
+TEST(TEST_BASE, test_parse_bc_module)
+{
+    const std::string bitcode_bytes = read_bc_from_file("opencl.amdgcn.bc");
+    ModuleRef* parsed = ROC_ParseBitcode(bitcode_bytes.c_str(),
+                                         bitcode_bytes.size());
+    ROC_ModuleDestroy(parsed);
+}
+// Check link-in works: every bitcode library is linked into the demo module,
+// and each link must report a non-zero result.
+TEST(TEST_BASE, test_linkin_modules)
+{
+    std::string ir = read_ir_from_file("demo_ir.ll");
+    ModuleRef* dst = ROC_ParseModule(ir.c_str());
+    for (auto& bitcode : _bitcodes)
+    {
+        std::string builtins_bc = read_bc_from_file(bitcode.c_str());
+        ModuleRef* bc_src = ROC_ParseBitcode(builtins_bc.c_str(),
+                                             builtins_bc.size());
+        // link the builtins into the module
+        int ret = ROC_ModuleLinkIn(dst, bc_src);
+        EXPECT_TRUE(ret != 0);
+        // pointlessly link in the same a few times, there was a subtle corruption
+        // present in previous versions of the linkin function.
+        for(int i = 0; i < 3; i++)
+        {
+            // Separate name so this result does not shadow the one above.
+            int relink_ret = ROC_ModuleLinkIn(dst, bc_src);
+            EXPECT_TRUE(relink_ret != 0);
+        }
+        ROC_ModuleDestroy(bc_src);
+    }
+    ROC_ModuleDestroy(dst);
+}
+// Link all bitcode libraries into the demo module and run the optimizer on
+// the result; the optimize call is expected to return 1.
+TEST(TEST_BASE, test_optimize_module)
+{
+    const std::string demo_ir = read_ir_from_file("demo_ir.ll");
+    ModuleRef* module = ROC_ParseModule(demo_ir.c_str());
+    int status;
+    for (const auto& bitcode_name : _bitcodes)
+    {
+        const std::string library_bytes = read_bc_from_file(bitcode_name.c_str());
+        ModuleRef* library = ROC_ParseBitcode(library_bytes.c_str(),
+                                              library_bytes.size());
+        // each library must link into the module successfully
+        status = ROC_ModuleLinkIn(module, library);
+        EXPECT_TRUE(status != 0);
+        ROC_ModuleDestroy(library);
+    }
+    // optimisation pass over the fully linked module
+    status = ROC_ModuleOptimize(module, 3, 0, 1, "fiji");
+    EXPECT_TRUE(status == 1);
+    ROC_ModuleDestroy(module);
+}
+// Test that a linked and optimised module can be emitted via
+// ROC_ModuleEmitHSAIL and that the emitted text carries an HSA code object
+// ISA directive.
+TEST(TEST_BASE, test_compile_module_to_HSAIL)
+{
+    ROC_Initialize();
+    std::string ir = read_ir_from_file("demo_ir.ll");
+    ModuleRef* dst = ROC_ParseModule(ir.c_str());
+    int ret;
+    for (auto& bitcode : _bitcodes)
+    {
+        std::string builtins_bc = read_bc_from_file(bitcode.c_str());
+        ModuleRef* bc_src = ROC_ParseBitcode(builtins_bc.c_str(),
+                                             builtins_bc.size());
+        // link the builtins into the module
+        ret = ROC_ModuleLinkIn(dst, bc_src);
+        EXPECT_TRUE(ret != 0);
+        ROC_ModuleDestroy(bc_src);
+    }
+    const char * cpu = "fiji";
+    // run an optimisation pass over the module
+    ret = ROC_ModuleOptimize(dst, 3, 0, 1, cpu);
+    ASSERT_TRUE(ret == 1);
+    // Start from a null pointer: the checks below are non-fatal, so if the
+    // emit call fails without writing to `output` the code must not read or
+    // free an indeterminate pointer.
+    char * output = nullptr;
+    ret = ROC_ModuleEmitHSAIL(dst, 2, cpu, &output);
+    EXPECT_TRUE(ret > 0);
+    EXPECT_TRUE(output != nullptr);
+    if (ret > 0 && output != nullptr)
+    {
+        std::string hsail(output);
+        // check this is an HSA code object, search the dump for an HSA ISA
+        // string like '.hsa_code_object_isa 8,0,3,"AMD","AMDGPU"'
+        std::regex regex(".*\\.hsa_code_object_isa.*\"AMD\",\"AMDGPU\".*");
+        EXPECT_TRUE(std::regex_search(hsail, regex));
+    }
+    // free(nullptr) is a no-op, so this is safe on the failure path too
+    free(output);
+    ROC_ModuleDestroy(dst);
+}
+// Test that a linked and optimised module can be emitted via
+// ROC_ModuleEmitBRIG and that the emitted buffer starts with the ELF magic
+// bytes.
+TEST(TEST_BASE, test_compile_module_to_BRIG)
+{
+    ROC_Initialize();
+    std::string ir = read_ir_from_file("demo_ir.ll");
+    ModuleRef* dst = ROC_ParseModule(ir.c_str());
+    int ret;
+    for (auto& bitcode : _bitcodes)
+    {
+        std::string builtins_bc = read_bc_from_file(bitcode.c_str());
+        ModuleRef* bc_src = ROC_ParseBitcode(builtins_bc.c_str(),
+                                             builtins_bc.size());
+        // link the builtins into the module
+        ret = ROC_ModuleLinkIn(dst, bc_src);
+        EXPECT_TRUE(ret != 0);
+        ROC_ModuleDestroy(bc_src);
+    }
+    const char * cpu = "fiji";
+    // run an optimisation pass over the module
+    ret = ROC_ModuleOptimize(dst, 3, 0, 1, cpu);
+    ASSERT_TRUE(ret == 1);
+    // Start from a null pointer: the checks below are non-fatal, so if the
+    // emit call fails without writing to `output` the code must not index or
+    // free an indeterminate pointer.
+    char * output = nullptr;
+    ret = ROC_ModuleEmitBRIG(dst, 2, cpu, &output);
+    EXPECT_TRUE(ret > 0);
+    EXPECT_TRUE(output != nullptr);
+    if (ret > 0 && output != nullptr)
+    {
+        // "\x7f" followed by 'E', 'L', 'F'
+        char elf_string[] = "\x7f\x45\x4c\x46";
+        // check this is an ELF object
+        // NOTE(review): reads 4 bytes; presumably `ret` is the buffer size
+        // and is at least 4 on success - confirm.
+        for (size_t i = 0; i < 4; i++)
+        {
+            EXPECT_TRUE(output[i]==elf_string[i]);
+        }
+    }
+    // free(nullptr) is a no-op, so this is safe on the failure path too
+    free(output);
+    ROC_ModuleDestroy(dst);
+}
+// Test that the full parse / link / optimise / emit-BRIG sequence can be
+// repeated several times in one process, checking the ELF magic bytes of the
+// output on every trial.
+TEST(TEST_BASE, test_many_compile_module_to_BRIG)
+{
+    const char * cpu = "fiji";
+    int trials = 5;
+    for(int k = 0; k < trials; k++)
+    {
+        // initialisation is deliberately repeated on every trial
+        ROC_Initialize();
+        std::string ir = read_ir_from_file("demo_ir.ll");
+        ModuleRef* dst = ROC_ParseModule(ir.c_str());
+        int ret;
+        for (auto& bitcode : _bitcodes)
+        {
+            std::string builtins_bc = read_bc_from_file(bitcode.c_str());
+            ModuleRef* bc_src = ROC_ParseBitcode(builtins_bc.c_str(),
+                                                 builtins_bc.size());
+            // link the builtins into the module
+            ret = ROC_ModuleLinkIn(dst, bc_src);
+            EXPECT_TRUE(ret != 0);
+            ROC_ModuleDestroy(bc_src);
+        }
+        // run an optimisation pass over the module
+        ret = ROC_ModuleOptimize(dst, 3, 0, 1, cpu);
+        ASSERT_TRUE(ret == 1);
+        // Start from a null pointer: the checks below are non-fatal, so if
+        // the emit call fails without writing to `output` the code must not
+        // index or free an indeterminate pointer.
+        char * output = nullptr;
+        ret = ROC_ModuleEmitBRIG(dst, 2, cpu, &output);
+        EXPECT_TRUE(ret > 0);
+        EXPECT_TRUE(output != nullptr);
+        if (ret > 0 && output != nullptr)
+        {
+            // "\x7f" followed by 'E', 'L', 'F'
+            char elf_string[] = "\x7f\x45\x4c\x46";
+            // check this is an ELF object
+            // NOTE(review): reads 4 bytes; presumably `ret` is the buffer
+            // size and is at least 4 on success - confirm.
+            for (size_t i = 0; i < 4; i++)
+            {
+                EXPECT_TRUE(output[i]==elf_string[i]);
+            }
+        }
+        // free(nullptr) is a no-op, so this is safe on the failure path too
+        free(output);
+        ROC_ModuleDestroy(dst);
+    }
+}