Commit 55ada7af authored by dugupeiwen's avatar dugupeiwen
Browse files

init

parents
#
# Copyright (c) 2016 , Continuum Analytics, Inc.
# All rights reserved.
#
# CMake 2.8.12 is the oldest release this build description supports.
cmake_minimum_required(VERSION 2.8.12)

# rocmlite is Linux-only: abort configuration on macOS and Windows.
if(WIN32 OR APPLE)
  message(FATAL_ERROR "rocmlite can only be built on linux.")
endif()
# Root of the conda environment whose lib/ directory is added to the linker
# search path further down. option() only models ON/OFF switches and would
# leave this set to "OFF" when nothing is supplied, so the value is declared
# as a PATH cache entry instead. A value given on the command line with
# -DCMAKE_CONDA_ROOT[:PATH]=<dir> is honoured exactly as before.
set(CMAKE_CONDA_ROOT "" CACHE PATH "Root of the conda environment to link against")
set(CONDA_ROOT "${CMAKE_CONDA_ROOT}")
message(STATUS "CONDA_ROOT = ${CONDA_ROOT}")

# Location of the bitcode resources, supplied with
# -DCMAKE_BITCODE_ROOT[:PATH]=<dir>. Declared as a PATH for the same reason.
set(CMAKE_BITCODE_ROOT "" CACHE PATH "Directory containing the bitcode files")
set(BITCODE_ROOT "${CMAKE_BITCODE_ROOT}")
message(STATUS "BITCODE_ROOT = ${BITCODE_ROOT}")
# Declare the project.
project(rocmlite)

# Library version, assembled as <major>.<minor>.
set(librocmlite_VERSION_MAJOR 0)
set(librocmlite_VERSION_MINOR 1)
set(librocmlite_VERSION
    "${librocmlite_VERSION_MAJOR}.${librocmlite_VERSION_MINOR}")

# SOVERSION property applied to the shared library in rocmlite/CMakeLists.txt.
set(librocmlite_SOVERSION 0.1.0)
enable_language(CXX)

# CXX_STANDARD only exists from CMake 3.1 onwards, which older Linux
# distributions do not ship, so probe the compiler for -std=c++11 directly.
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag(-std=c++11 CXX_HAS_CXX11)
if(NOT CXX_HAS_CXX11)
  message(FATAL_ERROR "Compiler must support C++11")
endif()

# Debug info, no RTTI, common warnings and C++11 for every C++ source.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -fno-rtti -Wall -std=c++11")
# Locate LLVM through its exported LLVMConfig.cmake; 6.0 is the minimum.
find_package(LLVM REQUIRED CONFIG)
if(NOT LLVM_PACKAGE_VERSION VERSION_LESS 6.0)
  message(STATUS "Found LLVM version: ${LLVM_PACKAGE_VERSION}")
  message(STATUS "Using LLVMConfig.cmake found in: ${LLVM_DIR}")
else()
  message(FATAL_ERROR "llvm version must be 6.0+")
endif()

# Make LLVM's headers and preprocessor definitions visible to all targets.
include_directories(${LLVM_INCLUDE_DIRS})
add_definitions(${LLVM_DEFINITIONS})
# Add the conda environment's lib/ directory to the linker search path, but
# only when a root was actually configured. An empty or OFF value would
# otherwise inject "/lib" or a relative "OFF/lib" into every link line.
if(CONDA_ROOT)
  link_directories("${CONDA_ROOT}/lib")
endif()

# turn on testing
enable_testing()

# rocmlite code
add_subdirectory(rocmlite)
# Print a human-readable summary of the configuration: project name and
# library version, the LLVM version and config location that were found, and
# the C++ compiler and flags in effect.
message(
"
------------------------------------
| Build Summary |
------------------------------------
Building...........: ${CMAKE_PROJECT_NAME} version ${librocmlite_VERSION}
LLVM version.......: ${LLVM_PACKAGE_VERSION}
LLVM location......: ${LLVM_DIR}
C++ Compiler.......: ${CMAKE_CXX_COMPILER}
C++ Flags..........: ${CMAKE_CXX_FLAGS}
")
BSD 2-Clause License
Copyright (c) 2018, Numba
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# roctools
This repository acts as a collection point for the resources needed to produce
a conda package containing all the components necessary to use Numba with AMD
GCN discrete GPUs. Included are:
* Source code and build tooling for a library, `librocmlite`, which performs
the same function for AMD GPUs as
[`llvmlite`](https://github.com/numba/llvmlite) does for CPUs. It essentially
acts as a shim between Python and LLVM. For convenience, `librocmlite` is
statically linked against releases from AMD's [LLVM
fork](https://github.com/RadeonOpenCompute/llvm), therefore there is no LLVM
dependency.
* A conda recipe (`llvmdev_amdgcn`) for building the aforementioned fork of
LLVM to bootstrap the `roctools` package.
* A conda recipe (`roctools`) that:
* Builds and tests `librocmlite`
* Extracts necessary math (and other) library bitcodes from AMD's `rpm`
based releases.
* Extracts necessary binaries from a build of AMD's LLVM fork (as a conda
package).
It is this package upon which Numba depends.
------------------------
## Conda build instructions
1. Build the AMD LLVM fork package (this will take a while):
```
$ conda build conda-recipes/llvmdev_amdgcn
```
Upon successful completion a package called `llvmdev_amdgcn-{version}` will
be produced. This package is needed to bootstrap the build of `librocmlite` and
also to provide some binary tools used in the AMD GCN tool chain.
2. Build the roctools package:
```
$ conda build conda-recipes/roctools
```
Upon successful completion a package called `roctools-{version}` will
be produced. This package is self contained and holds all the necessary
components for using AMD GCN GPUs.
------------------------
## License
See [LICENSE](https://github.com/numba/roctools/blob/master/LICENSE).
From 7c9054610e354340f9474dcd13a927f929912d1d Mon Sep 17 00:00:00 2001
From: Eugene Zelenko <eugene.zelenko@gmail.com>
Date: Tue, 6 Mar 2018 23:06:13 +0000
Subject: [PATCH] [Transforms] Add missing header for InstructionCombining.cpp,
in order to export LLVMInitializeInstCombine as extern "C". Fixes PR35947.
Patch by Brenton Bostick.
Differential revision: https://reviews.llvm.org/D44140
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@326843 91177308-0d34-0410-b5e6-96231b3b80d8
---
lib/Transforms/InstCombine/InstructionCombining.cpp | 1 +
1 file changed, 1 insertion(+)
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index a3b2fe9..7ec7343 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -34,6 +34,7 @@
//===----------------------------------------------------------------------===//
#include "InstCombineInternal.h"
+#include "llvm-c/Initialization.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
--
1.8.3.1
REM Windows build script: configures, builds and installs LLVM with CMake
REM from a separate "build" directory.
mkdir build
cd build
REM Configuration name passed to CMake at configure, build and install time.
set BUILD_CONFIG=Release
REM Configure step
REM Pick the Visual Studio 2015 generator; the Win64 variant is used unless
REM %ARCH% is "32".
if "%ARCH%"=="32" (
set CMAKE_GENERATOR=Visual Studio 14 2015
) else (
set CMAKE_GENERATOR=Visual Studio 14 2015 Win64
)
REM Toolset handed to CMake through -T below.
set CMAKE_GENERATOR_TOOLSET=v140_xp
REM Reduce build times and package size by removing unused stuff
set CMAKE_CUSTOM=-DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_INCLUDE_TESTS=OFF ^
-DLLVM_INCLUDE_UTILS=OFF -DLLVM_INCLUDE_DOCS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF ^
-DLLVM_ENABLE_ASSERTIONS=ON
REM Install into, and resolve dependencies from, %LIBRARY_PREFIX%; the sources
REM are read from %SRC_DIR%.
cmake -G "%CMAKE_GENERATOR%" -T "%CMAKE_GENERATOR_TOOLSET%" ^
-DCMAKE_BUILD_TYPE="%BUILD_CONFIG%" -DCMAKE_PREFIX_PATH=%LIBRARY_PREFIX% ^
-DCMAKE_INSTALL_PREFIX:PATH=%LIBRARY_PREFIX% %CMAKE_CUSTOM% %SRC_DIR%
REM Abort the script as soon as a step fails.
if errorlevel 1 exit 1
REM Build step
cmake --build . --config "%BUILD_CONFIG%"
if errorlevel 1 exit 1
REM Install step
cmake --build . --config "%BUILD_CONFIG%" --target install
if errorlevel 1 exit 1
#!/bin/bash
# Configures, builds and installs LLVM (AMDGPU and X86 targets) into ${PREFIX}
# using CMake with the "Unix Makefiles" generator.
# based on https://github.com/AnacondaRecipes/llvmdev-feedstock/blob/master/recipe/build.sh
set -x

# This is the clang compiler prefix
DARWIN_TARGET=x86_64-apple-darwin13.4.0

# CMake arguments are accumulated in an array so each one stays a single word.
declare -a _cmake_config
_cmake_config+=(-DCMAKE_INSTALL_PREFIX:PATH="${PREFIX}")
_cmake_config+=(-DCMAKE_BUILD_TYPE:STRING=Release)
# The bootstrap clang I use was built with a static libLLVMObject.a and I am trying to get the same here
# _cmake_config+=(-DBUILD_SHARED_LIBS:BOOL=ON)
_cmake_config+=(-DLLVM_ENABLE_ASSERTIONS:BOOL=ON)
_cmake_config+=(-DLINK_POLLY_INTO_TOOLS:BOOL=ON)
# Don't really require libxml2. Turn it off explicitly to avoid accidentally linking to system libs
_cmake_config+=(-DLLVM_ENABLE_LIBXML2:BOOL=OFF)
# Urgh, llvm *really* wants to link to ncurses / terminfo and we *really* do not want it to.
_cmake_config+=(-DHAVE_TERMINFO_CURSES=OFF)
# Sometimes these are reported as unused. Whatever.
_cmake_config+=(-DHAVE_TERMINFO_NCURSES=OFF)
_cmake_config+=(-DHAVE_TERMINFO_NCURSESW=OFF)
_cmake_config+=(-DHAVE_TERMINFO_TERMINFO=OFF)
_cmake_config+=(-DHAVE_TERMINFO_TINFO=OFF)
_cmake_config+=(-DHAVE_TERMIOS_H=OFF)
_cmake_config+=(-DCLANG_ENABLE_LIBXML=OFF)
_cmake_config+=(-DLIBOMP_INSTALL_ALIASES=OFF)
_cmake_config+=(-DLLVM_ENABLE_RTTI=OFF)
_cmake_config+=(-DLLVM_TARGETS_TO_BUILD="AMDGPU;X86")

# TODO :: It would be nice if we had a cross-ecosystem 'BUILD_TIME_LIMITED' env var we could use to
#         disable these unnecessary but useful things.
if [[ "${CONDA_FORGE}" == yes ]]; then
    _cmake_config+=(-DLLVM_INCLUDE_TESTS=OFF)
    _cmake_config+=(-DLLVM_INCLUDE_UTILS=OFF)
    _cmake_config+=(-DLLVM_INCLUDE_DOCS=OFF)
    _cmake_config+=(-DLLVM_INCLUDE_EXAMPLES=OFF)
fi

# Only valid when using the Ninja Generator AFAICT
# _cmake_config+=(-DLLVM_PARALLEL_LINK_JOBS:STRING=1)

# What about cross-compiling targeting Darwin here? Are any of these needed?
if [[ "$(uname)" == Darwin ]]; then
    _cmake_config+=(-DCMAKE_OSX_SYSROOT="${SYSROOT_DIR}")
    _cmake_config+=(-DDARWIN_macosx_CACHED_SYSROOT="${SYSROOT_DIR}")
    _cmake_config+=(-DCMAKE_OSX_DEPLOYMENT_TARGET="${MACOSX_DEPLOYMENT_TARGET}")
    # Quote the command substitutions so a tool path containing whitespace
    # remains one CMake argument.
    _cmake_config+=(-DCMAKE_LIBTOOL="$(which "${DARWIN_TARGET}-libtool")")
    _cmake_config+=(-DLD64_EXECUTABLE="$(which "${DARWIN_TARGET}-ld")")
    _cmake_config+=(-DCMAKE_INSTALL_NAME_TOOL="$(which "${DARWIN_TARGET}-install_name_tool")")
    # Once we are using our libc++ (not until llvm_build_final), it will be single-arch only and not setting
    # this causes link failures building the sanitizers since they respect DARWIN_osx_ARCHS. We may as well
    # save some compilation time by setting this for all of our llvm builds.
    _cmake_config+=(-DDARWIN_osx_ARCHS=x86_64)
#elif [[ $(uname) == Linux ]]; then
#    _cmake_config+=(-DLLVM_BINUTILS_INCDIR=${PREFIX}/lib/gcc/${cpu_arch}-${vendor}-linux-gnu/${compiler_ver}/plugin/include)
fi

# For when the going gets tough:
# _cmake_config+=(-Wdev)
# _cmake_config+=(--debug-output)
# _cmake_config+=(--trace-expand)
# CPU_COUNT=1

# -p: do not fail when a previous attempt already created the directory.
mkdir -p build
cd build

cmake -G'Unix Makefiles' \
    "${_cmake_config[@]}" \
    ..

# A bare "-j" (CPU_COUNT unset or empty) lets make start an unlimited number
# of jobs, so fall back to a single job in that case.
make -j"${CPU_COUNT:-1}" VERBOSE=1
make install
; ModuleID = 'foo'
source_filename = "<string>"
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-darwin17.4.0"
@.const.foo = internal constant [4 x i8] c"foo\00"
@".const.Fatal error: missing _dynfunc.Closure" = internal constant [38 x i8] c"Fatal error: missing _dynfunc.Closure\00"
@PyExc_RuntimeError = external global i8
@".const.missing Environment" = internal constant [20 x i8] c"missing Environment\00"
; Function Attrs: norecurse nounwind
declare i32 @"_ZN8__main__7foo$241Ex"(i64* noalias nocapture %retptr, { i8*, i32 }** noalias nocapture readnone %excinfo, i8* noalias nocapture readnone %env, i64 %arg.x) local_unnamed_addr #0
; Minimal function with a single conditional branch. The branch carries
; !prof !0 (branch_weights 1 and 9), which test_cfg_dot.py expects to see as
; "W:1" and "W:9" edge labels in the CFG written by `opt -dot-cfg`.
define i8* @"testme"(i8* %py_closure, i8* %py_args, i8* nocapture readnone %py_kws) local_unnamed_addr {
entry:
%.5 = alloca i8*, align 8
%.6 = call i32 (i8*, i8*, i64, i64, ...) @PyArg_UnpackTuple(i8* %py_args, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.const.foo, i64 0, i64 0), i64 1, i64 1, i8** nonnull %.5)
%.7 = icmp eq i32 %.6, 0
; Weighted two-way branch: this is the edge pair the test inspects.
br i1 %.7, label %entry_if, label %entry_endif, !prof !0
entry_if: ; preds = %entry
ret i8* null
entry_endif: ; preds = %entry
; The comparison result is unused; both successors simply return null.
%.11 = icmp eq i8* %py_closure, null
ret i8* null
}
declare i32 @PyArg_UnpackTuple(i8*, i8*, i64, i64, ...) local_unnamed_addr
; Function Attrs: nounwind
declare i32 @puts(i8* nocapture readonly) local_unnamed_addr #1
declare void @PyErr_SetString(i8*, i8*) local_unnamed_addr
declare i8* @PyNumber_Long(i8*) local_unnamed_addr
declare i64 @PyLong_AsLongLong(i8*) local_unnamed_addr
declare void @Py_DecRef(i8*) local_unnamed_addr
declare i8* @PyErr_Occurred() local_unnamed_addr
declare i8* @PyLong_FromLongLong(i64) local_unnamed_addr
; Function Attrs: nounwind
declare void @llvm.stackprotector(i8*, i8**) #1
attributes #0 = { norecurse nounwind }
attributes #1 = { nounwind }
!0 = !{!"branch_weights", i32 1, i32 9}
!1 = !{!"branch_weights", i32 9, i32 1}
# conda recipe for llvmdev_amdgcn: LLVM plus lld built from the
# RadeonOpenCompute sources at the tag given by `version`.
{% set shortversion = "roc-1.8" %}
{% set version = "roc-1.8.x" %}
{% set conda_version = "roc_1.8.x" %}
{% set build_number = "0" %}

package:
  name: llvmdev_amdgcn
  version: {{ conda_version }}

source:
  - git_url: https://github.com/RadeonOpenCompute/llvm.git
    git_tag: {{ version }}
    patches:
      # undefined behavior bug due to Twine usage
      - twine_cfg_undefined_behavior.patch
  # lld is checked out inside the LLVM tree so it is built along with it
  - git_url: https://github.com/RadeonOpenCompute/lld.git
    git_tag: {{ version }}
    folder: tools/lld

build:
  number: {{ build_number }}
  script_env:
    - PY_VCRUNTIME_REDIST
  ignore_run_exports:
    # Is static-linked
    - xar

requirements:
  build:
    # We cannot do this on macOS or windows
    # OSX already has llvm so has to be handled
    # at build.sh time
    # Windows needs to build using vs2015_runtime
    # irrespective of python version
    - {{ compiler('c') }}  # [unix]
    - {{ compiler('cxx') }}  # [unix]
    - cmake
    # Needed to unpack the source tarball
    - m2w64-xz  # [py27 and win]
    # ninja not currently used, bld.bat needs an update
    - ninja  # [win]
    # Needed to build LLVM
    - python
    # need vs2015_runtime to build, do not want it at run time
    # as extensions for py27 need vs2008
    - vs2015_runtime  # [win]
    - make  # [unix]
  host:
    # needed for llc at runtime
    - zlib  # [not win]
    - xar  # [osx]

test:
  requires:
    - python
  files:
    - cfg_test.ll
    - test_cfg_dot.py
  commands:
    - $PREFIX/bin/llvm-config --libs  # [not win]
    - $PREFIX/bin/llc -version  # [not win]
    - if not exist %LIBRARY_INC%\\llvm\\Pass.h exit 1  # [win]
    - if not exist %LIBRARY_LIB%\\LLVMSupport.lib exit 1  # [win]
    - test -f $PREFIX/include/llvm/Pass.h  # [unix]
    - test -f $PREFIX/lib/libLLVMSupport.a  # [unix]
    - test -f $PREFIX/lib/libLLVMCore.a  # [not win]
    # Test for ../twine_cfg_undefined_behavior.patch
    - $PREFIX/bin/opt -dot-cfg cfg_test.ll  # [not win]
    - python test_cfg_dot.py  # [not win]

about:
  home: http://llvm.org/
  # Point at the repository this recipe actually builds from.
  dev_url: https://github.com/RadeonOpenCompute/llvm
  license: NCSA
  license_file: LICENSE.TXT
  summary: Development headers and libraries for LLVM
# Check the CFG dot file written for function `testme`: both branch-weight
# labels must be present in it.
with open("cfg.testme.dot") as dot_file:
    dot_text = dot_file.read()

for expected_label in ('[label="W:1"]', '[label="W:9"]'):
    assert expected_label in dot_text
From b42222e01abc1a799c4e421fa26d72d49afe4b99 Mon Sep 17 00:00:00 2001
From: Siu Kwan Lam <michael.lam.sk@gmail.com>
Date: Fri, 23 Mar 2018 11:46:45 -0500
Subject: [PATCH] Patch to fix undefined behavior in cfgprinter
---
include/llvm/Analysis/CFGPrinter.h | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/include/llvm/Analysis/CFGPrinter.h b/include/llvm/Analysis/CFGPrinter.h
index 5786769..a4b642b 100644
--- a/include/llvm/Analysis/CFGPrinter.h
+++ b/include/llvm/Analysis/CFGPrinter.h
@@ -172,8 +172,7 @@ struct DOTGraphTraits<const Function*> : public DefaultDOTGraphTraits {
// Prepend a 'W' to indicate that this is a weight rather than the actual
// profile count (due to scaling).
- Twine Attrs = "label=\"W:" + Twine(Weight->getZExtValue()) + "\"";
- return Attrs.str();
+ return ("label=\"W:" + Twine(Weight->getZExtValue()) + "\"").str();
}
};
} // End llvm namespace
--
2.10.1
#!/bin/bash
# Build script for the roctools conda package. It runs three stages:
#   1. extract the ROCm bitcode libraries from the downloaded rpm,
#   2. build and test librocmlite with CMake,
#   3. copy the LLVM command line tools from the build environment.
set -x

###############################################################################
# Extract bitcodes from ROCM rpm source
###############################################################################

RPM_PATH=$(readlink -f opencl_tmp/*.rpm)
ROCM_PATH="opt/rocm/opencl/lib/x86_64/bitcode"

declare -a bitcodes=(
    "opencl.amdgcn.bc"
    "ocml.amdgcn.bc"
    "ockl.amdgcn.bc"
    "oclc_correctly_rounded_sqrt_off.amdgcn.bc"
    "oclc_daz_opt_off.amdgcn.bc"
    "oclc_finite_only_off.amdgcn.bc"
    "oclc_isa_version_803.amdgcn.bc"
    "oclc_unsafe_math_off.amdgcn.bc"
    "irif.amdgcn.bc"
)

for bitcode in "${bitcodes[@]}"; do
    bsdtar -x -f "$RPM_PATH" --strip-components 6 "$ROCM_PATH/$bitcode"
done

# move the bitcode to the pkg dir
RESOURCE_PATH="$PREFIX/share/rocmtools"
# mv cannot create missing parent directories, so make sure share/ exists.
mkdir -p "$(dirname "$RESOURCE_PATH")"
mv bitcode "$RESOURCE_PATH"

###############################################################################
# Now do C++ library build
###############################################################################

CMAKE_BUILD_DIR="cmake_build" # this needs to match meta.yaml test::source_files
mkdir -p "${CMAKE_BUILD_DIR}"
pushd "${CMAKE_BUILD_DIR}"

printenv

# Force CMake to look in the conda env "CMAKE_CONDA_ROOT" `/lib` etc
# for libraries via `-L`
cmake .. -DCMAKE_BUILD_TYPE=RELEASE \
    -DCMAKE_CONDA_ROOT:PATH="$BUILD_PREFIX" \
    -DCMAKE_BITCODE_ROOT:PATH="$RESOURCE_PATH"

# build
make VERBOSE=1

# move DSO to lib
# Create lib/ first: copying to a non-existent "$PREFIX/lib" would produce a
# regular file called "lib" instead of placing the library in a directory.
mkdir -p "$PREFIX/lib"
cp "rocmlite/librocmlite.so" "$PREFIX/lib/"

# test now, splitting this out to work at test time is hard to do
# the test_XXX binaries are dynamically linked to librocmlite but no rpath
# fix is made unless the binaries are also shipped (undesirable).
ctest -V

popd

###############################################################################
# Copy llvmdev binary tools to /bin
# NOTE: should these names start to cause collision issues with llvm installs
# they can be prefixed e.g. amd_opt. However `ld.lld` will need to have a
# `-flavour gnu` permanently supplied so it knows that it is emulating the GNU
# linker variant.
###############################################################################

declare -a tools=(
    "opt"
    "llc"
    "llvm-link"
    "ld.lld"
)

# The destination directory is not guaranteed to exist in a fresh prefix.
mkdir -p "$PREFIX/bin"
for tool in "${tools[@]}"; do
    cp "$BUILD_PREFIX/bin/$tool" "$PREFIX/bin/$tool"
done
# conda recipe for roctools: builds librocmlite and bundles the bitcode files
# and LLVM tools it needs.
{% set opencl_devel_ver="1.2.0-2018053132" %}
{% set opencl_devel_sha256="95f429a25d7e6081fe1c75bd05feb5e515408b82a1631f09d96abb4232e1af68" %}

package:
  name: roctools
  version: {{ environ.get('GIT_DESCRIBE_TAG', '') }}

source:
  # The repository this recipe lives in
  - path: ../..
  # rpm the bitcode files are extracted from (see build.sh)
  - fn: rocm-opencl-devel-{{ opencl_devel_ver }}.x86_64.rpm
    url: http://repo.radeon.com/rocm/yum/rpm/rocm-opencl-devel-{{ opencl_devel_ver }}.x86_64.rpm
    folder: opencl_tmp
    sha256: {{ opencl_devel_sha256 }}

build:
  number: {{ environ.get('GIT_DESCRIBE_NUMBER', 0) }}

requirements:
  build:
    - {{ compiler('c') }}  # [unix]
    - {{ compiler('cxx') }}  # [unix]
    # CMakeLists.txt declares cmake_minimum_required(VERSION 2.8.12)
    - cmake >=2.8.12
    - xz
    - zlib
    - bzip2
    - libarchive
    - llvmdev_amdgcn
    # gtest is used in the test binaries
    - gtest
  host:
    - zlib  # for llvm binary tooling that is copied in

test:
  commands:
    # The librocmlite.so DSO is tested at compile time
    # Check llvm binaries actually run
    - opt --help | grep amdgpu
    - llc --help | grep amdgpu
    - llvm-link --help
    - ld.lld --help

about:
  home: https://github.com/numba/roctools
  license: BSD
  summary: A shared library that wraps LLVM for code
    generation for devices with Radeon Open Compute support.
  license_file: LICENSE
/**
* Copyright (c) 2016 , Continuum Analytics, Inc.
* All rights reserved.
*/
#ifndef _ROC_HH
#define _ROC_HH
#include "llvm/IR/Module.h"
#include "llvm/Support/FormattedStream.h"
using namespace std;
namespace librocmlite
{
class ModuleRef
{
public:
ModuleRef(llvm::Module * module);
operator bool () const;
llvm::Module * getModule();
void destroy();
std::string to_string();
static ModuleRef* parseAssembly(const char* Asm);
static ModuleRef* parseBitcode(const char *Bitcode, size_t Len);
private:
llvm::Module * M;
};
// Initializes the llvm libary tooling.
void Initialize();
// Finalizes the llvm library tooling.
void Finalize();
// Optimize a module in place
void Optimize(llvm::Module * M, int OptLevel, int SizeLevel, int Verify, const char * Cpu);
// Compile a module
int CompileModule(std::unique_ptr<llvm::Module> mod, llvm::raw_string_ostream &os, bool emitBRIG,
int OptLevel);
}
// C-linkage entry points. The declarations are wrapped in extern "C" when
// compiled as C++ so their symbol names are not mangled.
#ifdef __cplusplus
extern "C"
{
#endif // __cplusplus
using namespace librocmlite;
// ROC_ C/CFFI entry points
// Library set-up / tear-down.
void ROC_Initialize();
void ROC_Finalize();
// String helpers. NOTE(review): presumably strings returned through the
// `char **output` parameters below are released with ROC_DisposeString;
// confirm against the implementation.
char* ROC_CreateString(const char *str);
void ROC_DisposeString(char *str);
// rename this to ParseIR2Module ?
ModuleRef* ROC_ParseModule(const char *Asm);
// Despite its name, `Asm` here is paired with a byte length `Len`.
ModuleRef* ROC_ParseBitcode(const char *Asm, size_t Len);
void ROC_ModulePrint(ModuleRef *M, char **output);
void ROC_ModuleDestroy(ModuleRef *M);
// Parameters mirror librocmlite::Optimize.
int ROC_ModuleOptimize(ModuleRef *M, int OptLevel, int SizeLevel, int Verify, const char * Cpu);
int ROC_ModuleLinkIn(ModuleRef * Dst, ModuleRef * Src);
// Code emission: the result is handed back through `output`. The BRIG
// variant returns a size_t, the HSAIL variant an int.
int ROC_ModuleEmitHSAIL(ModuleRef *M, int OptLevel, const char * Cpu, char **output);
size_t ROC_ModuleEmitBRIG(ModuleRef *M, int OptLevel, const char * Cpu, char **output);
void ROC_SetCommandLineOption(int argc, const char * const * argv);
#ifdef __cplusplus
}
#endif // __cplusplus
#endif //ifdef _ROC_HH
#
# Copyright (c) 2016 , Continuum Analytics, Inc.
# All rights reserved.
#
# Sources that make up the librocmlite shared library.
set(LIBROCMLITE_SOURCES rocmlite.cpp)

add_library(rocmlite SHARED ${LIBROCMLITE_SOURCES})

# llvm components needed (will get mapped to libs for linking)
# See `llvm-config --components` for a list of available components.
# The first argument is the output variable that receives the library names.
llvm_map_components_to_libnames(
  llvm_libs
  amdgpuasmparser
  amdgpuasmprinter
  amdgpucodegen
  amdgpudesc
  amdgpudisassembler
  amdgpuinfo
  amdgpuutils
  coroutines
  objcarcopts
  native
  core
)

# Link against LLVM libraries. PUBLIC keeps the libraries visible to targets
# that link rocmlite, as the keyword-less signature did, while making the
# visibility explicit.
target_link_libraries(rocmlite PUBLIC ${llvm_libs})

# Public headers live in <project root>/include. PROJECT_SOURCE_DIR is used
# instead of CMAKE_SOURCE_DIR so the path stays correct when this project is
# pulled into a larger build with add_subdirectory().
target_include_directories(rocmlite PUBLIC "${PROJECT_SOURCE_DIR}/include")

# set library properties
set_target_properties(rocmlite PROPERTIES
  VERSION ${librocmlite_VERSION}
  SOVERSION ${librocmlite_SOVERSION})

# Add in test dir
add_subdirectory(test)
#include "rocmlite.hh"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/RegionPass.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/AsmParser/Parser.h"
#include "llvm/Bitcode/BitcodeWriterPass.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/CodeGen/CommandFlags.def"
#include "llvm/CodeGen/LinkAllAsmWriterComponents.h"
#include "llvm/CodeGen/LinkAllCodegenComponents.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MIRParser/MIRParser.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/IR/AutoUpgrade.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/LegacyPassNameParser.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/IR/Verifier.h"
#include "llvm/LinkAllIR.h"
#include "llvm/LinkAllPasses.h"
#include "llvm/Linker/Linker.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/Host.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/PluginLoader.h"
#include "llvm/Support/PrettyStackTrace.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/SystemUtils.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/ToolOutputFile.h"
#include "llvm/Support/YAMLTraits.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Coroutines.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <iostream>
namespace librocmlite
{
static llvm::LLVMContext *TheContext = nullptr;
bool DisableInline = false;
bool UnitAtATime = true;
bool DisableLoopVectorization = false;
bool DisableSLPVectorization = false;
bool StripDebug = false;
bool DisableOptimizations = false;
bool DisableSimplifyLibCalls = false;
static const std::string MArch = "amdgcn"; // AMD Graphics Core Next
// ModuleRef impl
ModuleRef::ModuleRef(Module * module) : M(module) { };
ModuleRef::operator bool () const
{
return M != nullptr;
}
Module * ModuleRef::getModule()
{
return M;
}
void ModuleRef::destroy()
{
delete M;
M = nullptr;
}
std::string ModuleRef::to_string()
{
std::string buf;
raw_string_ostream os(buf);
M->print(os, nullptr);
os.flush();
return buf;
}
// Parse textual IR in the library-wide context. Returns a heap-allocated
// ModuleRef owning the parsed module, or nullptr when parsing fails.
ModuleRef * ModuleRef::parseAssembly(const char* Asm)
{
    SMDiagnostic diag;
    std::unique_ptr<Module> parsed = parseAssemblyString(Asm, diag, *TheContext);
    return parsed ? new ModuleRef(parsed.release()) : nullptr;
}
// Parse `Len` bytes of LLVM bitcode starting at `Bitcode` in the library-wide
// context. Returns a heap-allocated ModuleRef owning the module, or nullptr
// after printing the error message on stdout when parsing or materialization
// fails.
ModuleRef * ModuleRef::parseBitcode(const char *Bitcode, size_t Len)
{
    // Consumes `err` and prints its message on stdout. The message is passed
    // to puts() directly: the previous strdup() copy was never freed.
    auto report = [](Error err)
    {
        std::string msg;
        handleAllErrors(std::move(err), [&](const ErrorInfoBase &eib)
        {
            msg = eib.message();
        });
        puts(msg.c_str());
    };

    // Non-owning view of the caller's bytes; no null terminator is required.
    auto buf = MemoryBuffer::getMemBuffer(StringRef(Bitcode, Len),
                                          "", false);
    MemoryBufferRef mbref = buf->getMemBufferRef();
    auto ModuleOr = parseBitcodeFile(mbref, *TheContext);
    // Error handling inspired by
    // https://github.com/llvm-mirror/llvm/blob/release_60/lib/Bitcode/Reader/BitReader.cpp#L79
    if (Error err = ModuleOr.takeError())
    {
        report(std::move(err));
        return nullptr;
    }
    std::unique_ptr<Module> mod(std::move(ModuleOr.get()));
    if (!mod->isMaterialized())
    {
        // materializeAll() returns an llvm::Error, which must be inspected
        // before it is destroyed (builds with assertions abort otherwise).
        if (Error err = mod->materializeAll())
        {
            report(std::move(err));
            return nullptr;
        }
    }
    return new ModuleRef(mod.release());
}
// Translate a numeric optimisation level into LLVM's code generation level:
// 1 -> Less, 2 -> Default, 3 -> Aggressive, anything else -> None.
CodeGenOpt::Level GetCodeGenOptLevel(int OptLevel)
{
    if (OptLevel == 1)
    {
        return CodeGenOpt::Less;
    }
    if (OptLevel == 2)
    {
        return CodeGenOpt::Default;
    }
    if (OptLevel == 3)
    {
        return CodeGenOpt::Aggressive;
    }
    return CodeGenOpt::None;
}
// The following function combines initialisation code from opt and llc
// tools as found in the llvm source tree, here:
// https://github.com/llvm-mirror/llvm/blob/master/tools/opt/opt.cpp
// and here:
// https://github.com/llvm-mirror/llvm/blob/master/tools/llc/llc.cpp
//
// Creates the library-wide LLVMContext, initializes all targets and registers
// the passes listed below with the global PassRegistry. Calling it again
// while the context exists is a no-op.
void Initialize()
{
    using namespace llvm;
    if ( TheContext != nullptr )
    {
        // Already initialized
        return;
    }
    // Install LLVM's crash reporting (stack trace on fatal signals).
    sys::PrintStackTraceOnErrorSignal("librocmlite");
    EnablePrettyStackTrace();
    // Enable debug stream buffering.
    EnableDebugBuffering = true;
    // this has thread safety issues, there's no global context anymore
    // each thread really ought to have its own.
    LLVMContext * Context = new LLVMContext();
    TheContext = Context;
    // Initialize targets
    // FROM OPT and LLC
    InitializeAllTargets();
    InitializeAllTargetMCs();
    InitializeAllAsmPrinters();
    InitializeAllAsmParsers();
    // Initialize passes
    // FROM OPT
    PassRegistry &Registry = *PassRegistry::getPassRegistry();
    initializeCore(Registry);
    initializeCoroutines(Registry);
    initializeScalarOpts(Registry);
    initializeObjCARCOpts(Registry);
    initializeVectorization(Registry);
    initializeIPO(Registry);
    initializeAnalysis(Registry);
    initializeTransformUtils(Registry);
    initializeInstCombine(Registry);
    initializeInstrumentation(Registry);
    initializeTarget(Registry);
    // For codegen passes, only passes that do IR to IR transformation are
    // supported.
    initializeExpandMemCmpPassPass(Registry);
    initializeScalarizeMaskedMemIntrinPass(Registry);
    initializeCodeGenPreparePass(Registry);
    initializeAtomicExpandPass(Registry);
    initializeRewriteSymbolsLegacyPassPass(Registry);
    initializeWinEHPreparePass(Registry);
    initializeDwarfEHPreparePass(Registry);
    initializeSafeStackLegacyPassPass(Registry);
    initializeSjLjEHPreparePass(Registry);
    initializePreISelIntrinsicLoweringLegacyPassPass(Registry);
    initializeGlobalMergePass(Registry);
    initializeIndirectBrExpandPassPass(Registry);
    initializeInterleavedAccessPass(Registry);
    initializeEntryExitInstrumenterPass(Registry);
    initializePostInlineEntryExitInstrumenterPass(Registry);
    initializeUnreachableBlockElimLegacyPassPass(Registry);
    initializeExpandReductionsPass(Registry);
    initializeWriteBitcodePassPass(Registry);
    // Polly passes are only registered when the macro is defined for this
    // translation unit.
#ifdef LINK_POLLY_INTO_TOOLS
    polly::initializePollyPasses(Registry);
#endif
    // FROM LLC
    initializeCodeGen(Registry);
    initializeLoopStrengthReducePass(Registry);
    initializeLowerIntrinsicsPass(Registry);
    initializeConstantHoistingLegacyPassPass(Registry);
    // Initialize debugging passes.
    initializeScavengerTestPass(Registry);
}
// Tear down what Initialize() created: delete the library-wide context and
// shut LLVM down. Does nothing when no context exists.
void Finalize()
{
    using namespace llvm;
    // finalizer is called when the library is potentially unloaded
    // the context can be deleted.
    if (!TheContext)
    {
        return;
    }
    delete TheContext;
    TheContext = nullptr;
    llvm_shutdown();
}
// The following section is adapted from opt.cpp from the LLVM source tree.
// Original code is here:
// https://github.com/llvm-mirror/llvm/blob/master/tools/opt/opt.cpp
// --- Start OPT section ---

// Adds pass P to PM, immediately followed by a verifier pass.
static inline void addPass(legacy::PassManagerBase &PM, Pass *P)
{
    // Add the pass to the pass manager...
    PM.add(P);
    // ...and always verify its output. Unlike the opt.cpp original, the
    // verifier is added unconditionally here.
    PM.add(createVerifierPass());
}
/// Populate the function pass manager FPM and the module pass manager MPM
/// with the standard pipeline for the given OptLevel / SizeLevel. The
/// file-level Disable* / UnitAtATime switches are honoured, and TM (when
/// non-null) may adjust the builder before the managers are populated.
static void AddOptimizationPasses(legacy::PassManagerBase &MPM,
                                  legacy::FunctionPassManager &FPM,
                                  TargetMachine *TM,
                                  unsigned OptLevel, unsigned SizeLevel)
{
    // Verify that input is correct
    FPM.add(createVerifierPass());

    PassManagerBuilder Builder;
    Builder.OptLevel = OptLevel;
    Builder.SizeLevel = SizeLevel;

    // Inliner: none when disabled, the full inliner above -O1, otherwise the
    // always-inliner.
    if (!DisableInline)
    {
        if (OptLevel > 1)
        {
            Builder.Inliner = createFunctionInliningPass(OptLevel, SizeLevel, false);
        }
        else
        {
            Builder.Inliner = createAlwaysInlinerLegacyPass();
        }
    }

    Builder.DisableUnitAtATime = !UnitAtATime;
    Builder.DisableUnrollLoops = (OptLevel == 0);

    // Default vectorisation policy shared by the loop and SLP vectorisers.
    const bool vectorizeByDefault = OptLevel > 1 && SizeLevel < 2;

    // This is final, unless there is a #pragma vectorize enable
    if (DisableLoopVectorization)
    {
        Builder.LoopVectorize = false;
    }
    // If option wasn't forced via cmd line (-vectorize-loops, -loop-vectorize)
    else if (!Builder.LoopVectorize)
    {
        Builder.LoopVectorize = vectorizeByDefault;
    }

    // When #pragma vectorize is on for SLP, do the same as above
    Builder.SLPVectorize = !DisableSLPVectorization && vectorizeByDefault;

    if (TM)
    {
        TM->adjustPassManager(Builder);
    }

    Builder.populateFunctionPassManager(FPM);
    Builder.populateModulePassManager(MPM);
}
// Populate PM with the LTO pipeline, with input verification enabled. The
// file-level DisableOptimizations / DisableInline switches are honoured.
static void AddStandardLinkPasses(legacy::PassManagerBase &PM)
{
    PassManagerBuilder LTOBuilder;
    LTOBuilder.VerifyInput = true;

    if (DisableOptimizations)
    {
        LTOBuilder.OptLevel = 0;
    }
    if (!DisableInline)
    {
        LTOBuilder.Inliner = createFunctionInliningPass();
    }

    LTOBuilder.populateLTOPassManager(PM);
}
// Look up the target for MArch / TheTriple and create a TargetMachine for it.
// Returns nullptr when the target lookup fails; otherwise the caller owns the
// returned machine.
static TargetMachine* GetTargetMachine(Triple TheTriple, StringRef CPUStr,
                                       StringRef FeaturesStr,
                                       const TargetOptions &Options,
                                       int OptLevel
                                      )
{
    std::string lookupError;
    const Target *target = TargetRegistry::lookupTarget(MArch, TheTriple,
                                                        lookupError);
    // Some modules don't specify a triple, and this is okay.
    if (target == nullptr)
    {
        return nullptr;
    }

    const CodeGenOpt::Level codegenLevel = GetCodeGenOptLevel(OptLevel);
    return target->createTargetMachine(TheTriple.getTriple(),
                                       CPUStr, FeaturesStr, Options,
                                       getRelocModel(), getCodeModel(),
                                       codegenLevel);
}
// Optimize module M in place for the amdgcn--amdhsa target.
//
//   M         module to optimize; its target triple is overwritten.
//   OptLevel  0 runs only the verifier and analysis passes; 1-3 additionally
//             run the LTO pipeline and the standard -O<n> pipeline; any other
//             positive value runs the LTO pipeline only.
//   SizeLevel accepted for interface compatibility; the pipeline is always
//             built with size level 0.
//   Verify    accepted for interface compatibility; the module is always
//             verified before and after optimization.
//   Cpu       target CPU name; a null pointer is treated as "".
//
// The process exits with status 1 when the input module fails verification.
void Optimize(llvm::Module * M, int OptLevel, int SizeLevel, int Verify,
              const char * Cpu)
{
    (void) SizeLevel;
    (void) Verify;

    // Levels 1..3 select the standard pipeline; every positive level enables
    // the standard link-time passes.
    const bool RunOptPipeline = OptLevel >= 1 && OptLevel <= 3;
    const bool StandardLinkOpts = OptLevel > 0;

    // Strip debug info before running the verifier.
    if (StripDebug)
        StripDebugInfo(*M);

    // Immediately run the verifier to catch any problems before starting up the
    // pass pipelines. Otherwise we can crash on broken code during
    // doInitialization().
    if (verifyModule(*M, &errs()))
    {
        errs() << "error: input module is broken!\n";
        exit(1);
    }

    M->setTargetTriple(Triple::normalize("amdgcn--amdhsa"));
    Triple ModuleTriple(M->getTargetTriple());

    // Constructing a std::string from a null pointer is undefined behaviour,
    // so map a missing CPU name to the empty string.
    std::string CPUStr(Cpu ? Cpu : "");
    std::string FeaturesStr = "";
    TargetOptions Options;

    std::unique_ptr<TargetMachine> TM;
    if (ModuleTriple.getArch())
    {
        TM.reset(GetTargetMachine(ModuleTriple, CPUStr, FeaturesStr, Options,
                                  OptLevel));
    }

    // Override function attributes based on CPUStr, FeaturesStr, and command line
    // flags.
    setFunctionAttributes(CPUStr, FeaturesStr, *M);

    // Create a PassManager to hold and optimize the collection of passes we are
    // about to build.
    legacy::PassManager Passes;

    // Add an appropriate TargetLibraryInfo pass for the module's triple.
    TargetLibraryInfoImpl TLII(ModuleTriple);
    // switch off libcall simplification, transforming loops to
    // system calls is not supported
    TLII.disableAllFunctions();
    Passes.add(new TargetLibraryInfoWrapperPass(TLII));

    // Add an appropriate DataLayout instance for this module.
    const DataLayout &DL = M->getDataLayout();
    if (DL.isDefault())
    {
        M->setDataLayout("");
    }

    // Add internal analysis passes from the target machine.
    Passes.add(createTargetTransformInfoWrapperPass(
        TM ? TM->getTargetIRAnalysis() : TargetIRAnalysis()));

    std::unique_ptr<legacy::FunctionPassManager> FPasses;
    if (RunOptPipeline)
    {
        FPasses.reset(new legacy::FunctionPassManager(M));
        FPasses->add(createTargetTransformInfoWrapperPass(
            TM ? TM->getTargetIRAnalysis() : TargetIRAnalysis()));
    }

    if (StandardLinkOpts)
        AddStandardLinkPasses(Passes);

    // Apply optimisation passes for the requested level (size level 0).
    if (RunOptPipeline)
        AddOptimizationPasses(Passes, *FPasses, TM.get(), OptLevel, 0);

    // Function passes run first, over every function in the module.
    if (FPasses)
    {
        FPasses->doInitialization();
        for (Function &F : *M)
            FPasses->run(F);
        FPasses->doFinalization();
    }

    // Check that the module is well formed on completion of optimization
    Passes.add(createVerifierPass());

    // Now that we have all of the passes ready, run them.
    Passes.run(*M);
}
// --- END OPT section ---
// The following section is adapted from llc.cpp from the LLVM source tree.
// Original code is here:
// https://github.com/llvm-mirror/llvm/blob/master/tools/llc/llc.cpp
// --- START LLC section ---
int CompileModule(std::unique_ptr<Module> mod, raw_string_ostream &os, bool emitBRIG,
int OptLevel, const char * Cpu)
{
// Load the module to be compiled...
SMDiagnostic Err;
Triple TheTriple;// = Triple(mod->getTargetTriple());
TheTriple = Triple(Triple::normalize("amdgcn--amdhsa"));
// Get the target specific parser.
std::string Error;
const Target *TheTarget = TargetRegistry::lookupTarget(MArch, TheTriple,
Error);
if (!TheTarget)
{
errs() << Error;
return 0;
}
// Package up features to be passed to target/subtarget
std::string CPUStr(Cpu);
std::string FeaturesStr = "+promote-alloca,+fp64-denormals,+flat-for-global,";
CodeGenOpt::Level OLvl = CodeGenOpt::Default;
switch (OptLevel)
{
case 0:
OLvl = CodeGenOpt::None;
break;
case 1:
OLvl = CodeGenOpt::Less;
break;
case 2:
OLvl = CodeGenOpt::Default;
break;
case 3:
OLvl = CodeGenOpt::Aggressive;
break;
}
TargetOptions Options = InitTargetOptionsFromCodeGenFlags();
Options.MCOptions.AsmVerbose = true;
std::unique_ptr<TargetMachine> Target(
TheTarget->createTargetMachine(TheTriple.getTriple(), CPUStr, FeaturesStr,
Options, getRelocModel(), getCodeModel(),
OLvl));
assert(Target && "Could not allocate target machine!");
assert(mod && "Should have exited if we didn't have a module!");
if (FloatABIForCalls != FloatABI::Default)
Options.FloatABIType = FloatABIForCalls;
// Build up all of the passes that we want to do to the module.
legacy::PassManager PM;
// Add an appropriate TargetLibraryInfo pass for the module's triple.
TargetLibraryInfoImpl TLII(TheTriple);
// The -disable-simplify-libcalls flag actually disables all builtin optzns.
// TLII.disableAllFunctions();
PM.add(new TargetLibraryInfoWrapperPass(TLII));
// Add the target data from the target machine, if it exists, or the module./
mod->setDataLayout(Target->createDataLayout());
setFunctionAttributes(CPUStr, FeaturesStr, *mod);
auto FileType = (emitBRIG
? TargetMachine::CGFT_ObjectFile
: TargetMachine::CGFT_AssemblyFile);
{
// new scope
buffer_ostream BOS(os);
// Ask the target to add backend passes as necessary.
bool Verify = true;
if (Target->addPassesToEmitFile(PM, BOS, FileType, Verify))
{
errs() << "target does not support generation of this"
<< " file type!\n";
return 1;
}
PM.run(*mod);
}
return 0;
}
// --- END LLC section ---
} // end librocmlite namespace
extern "C" {
using namespace librocmlite;
typedef struct OpaqueModule* llvm_module_ptr;
// C entry point that forwards to Initialize().
void ROC_Initialize() { Initialize(); }
// C entry point that forwards to Finalize().
void ROC_Finalize() { Finalize(); }
// Returns a heap-allocated copy of the NUL-terminated string `str`.
// A null `str` yields a null result instead of invoking strdup on it.
// The copy is obtained from strdup and is released by ROC_DisposeString.
char* ROC_CreateString(const char *str)
{
    if (str == nullptr)
    {
        return nullptr;
    }
    return strdup(str);
}
// Releases a string with free().
void ROC_DisposeString(char *str) { free(str); }
// Hands the textual module `Asm` to ModuleRef::parseAssembly and returns its
// result unchanged.
ModuleRef* ROC_ParseModule(const char *Asm)
{
    ModuleRef *parsed = ModuleRef::parseAssembly(Asm);
    return parsed;
}
// Hands the `Len`-byte buffer at `Asm` to ModuleRef::parseBitcode and returns
// its result unchanged.
ModuleRef* ROC_ParseBitcode(const char *Asm, size_t Len)
{
    return ModuleRef::parseBitcode(Asm, Len);
}
// Stores in *output a ROC_CreateString copy of M->to_string().
void ROC_ModulePrint(ModuleRef *M, char **output)
{
    const auto text = M->to_string();
    char *copy = ROC_CreateString(text.c_str());
    *output = copy;
}
// Destroys the module behind handle M and then deletes the handle itself.
// A null handle is ignored so the call is safe after a failed parse.
void ROC_ModuleDestroy(ModuleRef *M)
{
    if (M == nullptr)
    {
        return;
    }
    M->destroy();
    delete M;
}
// Validates the requested levels and runs Optimize on the wrapped module.
// Returns 0 when OptLevel is outside 0..3 or SizeLevel is outside 0..2
// (nothing is run in that case), otherwise 1.
int ROC_ModuleOptimize(ModuleRef *M, int OptLevel, int SizeLevel,
                       int Verify, const char * Cpu)
{
    const bool optLevelOk = (OptLevel >= 0 && OptLevel <= 3);
    const bool sizeLevelOk = (SizeLevel >= 0 && SizeLevel <= 2);
    if (!(optLevelOk && sizeLevelOk))
    {
        return 0;
    }
    Optimize(M->getModule(), OptLevel, SizeLevel, Verify, Cpu);
    return 1;
}
// Links a clone of Src's module into Dst's module; Src itself is left intact.
// Both modules are verified first, and the clone is only made once both pass,
// so a rejected link request costs no module copy.
// Returns 1 on success, 0 if either module fails verification or the linker
// reports an error.
int ROC_ModuleLinkIn(ModuleRef * Dst, ModuleRef * Src)
{
    if(llvm::verifyModule(*Dst->getModule(), nullptr))
    {
        return 0;
    }
    if(llvm::verifyModule(*Src->getModule(), nullptr))
    {
        return 0;
    }
    const Module * ref = Src->getModule();
    std::unique_ptr<Module> sM = llvm::CloneModule (*ref);
    // linkModules reports failure with a non-zero result.
    int status = llvm::Linker::linkModules(*Dst->getModule(), std::move(sM), 0);
    return !status;
}
// Compiles a clone of M's module to assembly text (CompileModule with
// emitBRIG == false) and stores a ROC_CreateString copy of it in *output.
// Returns 1 on success. Returns 0 when OptLevel is outside 0..3, when
// compilation fails, or when the output string could not be allocated;
// *output is untouched for the first two.
int ROC_ModuleEmitHSAIL(ModuleRef *M, int OptLevel, const char * Cpu,
                        char **output)
{
    // Reject bad arguments before paying for a module clone.
    if (OptLevel < 0 || OptLevel > 3) return 0;
    const Module * ref = M->getModule();
    std::unique_ptr<Module> sM = llvm::CloneModule (*ref);
    // Compile
    std::string buf;
    raw_string_ostream os(buf);
    int status = CompileModule(std::move(sM), os, false, OptLevel, Cpu);
    if(status) return 0;
    // Write output
    os.flush();
    *output = ROC_CreateString(buf.c_str());
    if (*output == nullptr) return 0;
    return 1;
}
// Compiles a clone of M's module to an object file (CompileModule with
// emitBRIG == true) and stores a malloc'ed copy of the bytes in *output.
// Returns the number of bytes written. Returns 0 when OptLevel is outside
// 0..3, when compilation fails, or when the buffer could not be allocated;
// *output is untouched for the first two.
size_t ROC_ModuleEmitBRIG(ModuleRef *M, int OptLevel, const char * Cpu,
                          char **output)
{
    // Reject bad arguments before paying for a module clone.
    if (OptLevel < 0 || OptLevel > 3) return 0;
    const Module * ref = M->getModule();
    std::unique_ptr<Module> sM = llvm::CloneModule (*ref);
    // Compile
    std::string buf;
    raw_string_ostream os(buf);
    int status = CompileModule(std::move(sM), os, true, OptLevel, Cpu);
    if(status) return 0;
    // Write output
    os.flush();
    *output = (char*)malloc(buf.size());
    if (*output == nullptr)
    {
        // Never memcpy into a failed allocation.
        return 0;
    }
    memcpy(*output, buf.data(), buf.size());
    return buf.size();
}
// Passes argc/argv to llvm::cl::ParseCommandLineOptions.
void ROC_SetCommandLineOption(int argc, const char * const * argv)
{
    const char *overview = "Does things";
    llvm::cl::ParseCommandLineOptions(argc, argv, overview);
}
} // end extern "C"
#
# Copyright (c) 2016 , Continuum Analytics, Inc.
# All rights reserved.
#
# Look for a valgrind executable and report whether one was found.
find_program(VALGRIND valgrind)
if(VALGRIND)
  message("Found valgrind. ${VALGRIND}")
else()
  message("Did not find valgrind.")
endif()
# add_gtest(<TESTNAME>)
#
# Builds <TESTNAME>.cpp into an executable linked against rocmlite and gtest,
# and registers it as a CTest test named <TESTNAME>. When valgrind was found,
# a second test named <TESTNAME>_with_valgrind runs the same binary under it.
#
# The test binary is referenced through its target, so no deprecated LOCATION
# target property is read.
macro(add_gtest TESTNAME)
  add_executable(${TESTNAME} ${TESTNAME}.cpp)
  # set link dir
  link_directories(${librocmlite_BINARY_DIR})
  # include include/
  target_include_directories(${TESTNAME} PUBLIC ${CMAKE_SOURCE_DIR}/include ${BIN_INCLUDE_DIR})
  # set linkage
  target_link_libraries(${TESTNAME}
    rocmlite
    gtest
    gtest_main
    pthread
  )
  # A target name given as COMMAND is replaced by the built executable's path.
  add_test(NAME ${TESTNAME} COMMAND ${TESTNAME})
  if(VALGRIND)
    # Run the valgrind that find_program located rather than whatever happens
    # to be first on PATH when the tests are executed.
    add_test(NAME ${TESTNAME}_with_valgrind
      COMMAND ${VALGRIND} --error-exitcode=1 --leak-check=full --show-leak-kinds=all
              --suppressions=test_suppressions.supp
              $<TARGET_FILE:${TESTNAME}>)
  endif()
endmacro()
# Test sources (without the .cpp extension) to build and register.
set(TESTS test_rocmlite test_rocmlite_functions)
foreach(TEST IN LISTS TESTS)
  add_gtest(${TEST})
endforeach()
# Copy the contents of BITCODE_ROOT into the test binary directory.
# Uses the CMake executable that is running this build rather than whichever
# `cmake` is on PATH, and VERBATIM so that paths are escaped consistently.
add_custom_target(COPY_IN_COMPILATION_RESOURCES ALL
  COMMAND ${CMAKE_COMMAND} -E copy_directory
          ${BITCODE_ROOT}
          ${CMAKE_BINARY_DIR}/rocmlite/test/
  VERBATIM
  DEPENDS ${gtest})
# Copy the test resources directory into the test binary directory.
# Uses the CMake executable that is running this build rather than whichever
# `cmake` is on PATH, and VERBATIM so that paths are escaped consistently.
add_custom_target(COPY_IN_TEST_RESOURCES ALL
  COMMAND ${CMAKE_COMMAND} -E copy_directory
          ${CMAKE_SOURCE_DIR}/rocmlite/test/resources
          ${CMAKE_BINARY_DIR}/rocmlite/test/
  VERBATIM
  DEPENDS ${gtest})
; ModuleID = 'copy_kernel_1d'
target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n32"
target triple = "amdgcn--amdhsa"
define internal spir_func i32 @hsapy_devfn__5F__5F_main_5F__5F__2E_copy_5F_kernel_5F_1d_24_1_2E_array_28_float32_2C__20_1d_2C__20_C_29__2E_array_28_float32_2C__20_1d_2C__20_C_29_(i8** %.ret, i8* %arg.out.0, i8* %arg.out.1, i64 %arg.out.2, i64 %arg.out.3, float addrspace(4)* %arg.out.4, i64 %arg.out.5.0, i64 %arg.out.6.0, i8* %arg.inp.0, i8* %arg.inp.1, i64 %arg.inp.2, i64 %arg.inp.3, float addrspace(4)* %arg.inp.4, i64 %arg.inp.5.0, i64 %arg.inp.6.0) {
entry:
%inserted.meminfo = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } undef, i8* %arg.out.0, 0
%inserted.parent = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.meminfo, i8* %arg.out.1, 1
%inserted.nitems = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.parent, i64 %arg.out.2, 2
%inserted.itemsize = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.nitems, i64 %arg.out.3, 3
%inserted.data = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.itemsize, float addrspace(4)* %arg.out.4, 4
%.17 = insertvalue [1 x i64] undef, i64 %arg.out.5.0, 0
%inserted.shape = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.data, [1 x i64] %.17, 5
%.18 = insertvalue [1 x i64] undef, i64 %arg.out.6.0, 0
%inserted.strides = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.shape, [1 x i64] %.18, 6
%inserted.meminfo.1 = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } undef, i8* %arg.inp.0, 0
%inserted.parent.1 = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.meminfo.1, i8* %arg.inp.1, 1
%inserted.nitems.1 = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.parent.1, i64 %arg.inp.2, 2
%inserted.itemsize.1 = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.nitems.1, i64 %arg.inp.3, 3
%inserted.data.1 = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.itemsize.1, float addrspace(4)* %arg.inp.4, 4
%.19 = insertvalue [1 x i64] undef, i64 %arg.inp.5.0, 0
%inserted.shape.1 = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.data.1, [1 x i64] %.19, 5
%.20 = insertvalue [1 x i64] undef, i64 %arg.inp.6.0, 0
%inserted.strides.1 = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.shape.1, [1 x i64] %.20, 6
%out = alloca { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }
store { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } zeroinitializer, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %out
%inp = alloca { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }
store { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } zeroinitializer, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %inp
%"$0.1" = alloca i8*
store i8* null, i8** %"$0.1"
%"$0.2" = alloca i8*
store i8* null, i8** %"$0.2"
%"$const0.3" = alloca i64
store i64 0, i64* %"$const0.3"
%"$0.4" = alloca i64
store i64 0, i64* %"$0.4"
%i = alloca i64
store i64 0, i64* %i
%.56 = alloca { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }
store { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } zeroinitializer, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.56
%"$0.7" = alloca i64
store i64 0, i64* %"$0.7"
%"$0.8" = alloca i1
store i1 false, i1* %"$0.8"
%.78 = alloca { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }
store { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } zeroinitializer, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.78
%"$30.3" = alloca float
store float 0.000000e+00, float* %"$30.3"
%.114 = alloca { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }
store { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } zeroinitializer, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.114
%"$const47.1" = alloca i8*
store i8* null, i8** %"$const47.1"
%"$47.2" = alloca i8*
store i8* null, i8** %"$47.2"
br label %B0
B0: ; preds = %entry
%.22 = load { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %out
store { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %out
%.25 = load { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %inp
store { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides.1, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %inp
%.28 = load i8*, i8** %"$0.1"
store i8* null, i8** %"$0.1"
%.30 = load i8*, i8** %"$0.1"
%.32 = load i8*, i8** %"$0.2"
store i8* null, i8** %"$0.2"
%.34 = load i8*, i8** %"$0.1"
store i8* null, i8** %"$0.1"
%.37 = load i64, i64* %"$const0.3"
store i64 0, i64* %"$const0.3"
%.39 = load i64, i64* %"$const0.3"
%.40 = trunc i64 %.39 to i32
%.41 = call spir_func i64 @_Z13get_global_idj(i32 %.40)
%.43 = load i64, i64* %"$0.4"
store i64 %.41, i64* %"$0.4"
%.45 = load i64, i64* %"$const0.3"
store i64 0, i64* %"$const0.3"
%.47 = load i8*, i8** %"$0.2"
store i8* null, i8** %"$0.2"
%.49 = load i64, i64* %"$0.4"
%.51 = load i64, i64* %i
store i64 %.49, i64* %i
%.53 = load i64, i64* %"$0.4"
store i64 0, i64* %"$0.4"
%.55 = load { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %out
store { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %.55, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.56
%.59 = getelementptr inbounds { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.56, i32 0, i32 2
%.60 = load i64, i64* %.59
%.62 = load i64, i64* %"$0.7"
store i64 %.60, i64* %"$0.7"
%.64 = load i64, i64* %i
%.65 = load i64, i64* %"$0.7"
%.66 = icmp slt i64 %.64, %.65
%.68 = load i1, i1* %"$0.8"
store i1 %.66, i1* %"$0.8"
%.70 = load i64, i64* %"$0.7"
store i64 0, i64* %"$0.7"
%.72 = load i1, i1* %"$0.8"
br i1 %.72, label %B30, label %B47
B30: ; preds = %B0
%.74 = load i1, i1* %"$0.8"
store i1 false, i1* %"$0.8"
%.76 = load { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %inp
%.77 = load i64, i64* %i
store { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %.76, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.78
%.81 = getelementptr inbounds { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.78, i32 0, i32 5
%.82 = getelementptr inbounds [1 x i64], [1 x i64]* %.81, i32 0, i32 0
%.83 = load i64, i64* %.82, !range !7
%.84 = insertvalue [1 x i64] undef, i64 %.83, 0
%.85 = extractvalue [1 x i64] %.84, 0
%.86 = getelementptr inbounds { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.78, i32 0, i32 6
%.87 = load [1 x i64], [1 x i64]* %.86
%.88 = extractvalue [1 x i64] %.87, 0
%.89 = icmp slt i64 %.77, 0
%.90 = add i64 %.77, %.85
%.91 = select i1 %.89, i64 %.90, i64 %.77
%.92 = getelementptr inbounds { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.78, i32 0, i32 5
%.93 = getelementptr inbounds [1 x i64], [1 x i64]* %.92, i32 0, i32 0
%.94 = load i64, i64* %.93, !range !7
%.95 = insertvalue [1 x i64] undef, i64 %.94, 0
%.96 = extractvalue [1 x i64] %.95, 0
%.97 = getelementptr inbounds { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.78, i32 0, i32 6
%.98 = load [1 x i64], [1 x i64]* %.97
%.99 = extractvalue [1 x i64] %.98, 0
%.100 = getelementptr inbounds { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.78, i32 0, i32 4
%.101 = load float addrspace(4)*, float addrspace(4)** %.100
%.102 = mul i64 %.91, 1
%.103 = add i64 0, %.102
%.104 = getelementptr float, float addrspace(4)* %.101, i64 %.103
%.105 = load float, float addrspace(4)* %.104
%.107 = load float, float* %"$30.3"
store float %.105, float* %"$30.3"
%.109 = load { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %inp
store { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } zeroinitializer, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %inp
%.111 = load { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %out
%.112 = load float, float* %"$30.3"
%.113 = load i64, i64* %i
store { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %.111, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.114
%.117 = getelementptr inbounds { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.114, i32 0, i32 5
%.118 = getelementptr inbounds [1 x i64], [1 x i64]* %.117, i32 0, i32 0
%.119 = load i64, i64* %.118, !range !7
%.120 = insertvalue [1 x i64] undef, i64 %.119, 0
%.121 = extractvalue [1 x i64] %.120, 0
%.122 = getelementptr inbounds { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.114, i32 0, i32 6
%.123 = load [1 x i64], [1 x i64]* %.122
%.124 = extractvalue [1 x i64] %.123, 0
%.125 = icmp slt i64 %.113, 0
%.126 = add i64 %.113, %.121
%.127 = select i1 %.125, i64 %.126, i64 %.113
%.128 = getelementptr inbounds { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.114, i32 0, i32 5
%.129 = getelementptr inbounds [1 x i64], [1 x i64]* %.128, i32 0, i32 0
%.130 = load i64, i64* %.129, !range !7
%.131 = insertvalue [1 x i64] undef, i64 %.130, 0
%.132 = extractvalue [1 x i64] %.131, 0
%.133 = getelementptr inbounds { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.114, i32 0, i32 6
%.134 = load [1 x i64], [1 x i64]* %.133
%.135 = extractvalue [1 x i64] %.134, 0
%.136 = getelementptr inbounds { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %.114, i32 0, i32 4
%.137 = load float addrspace(4)*, float addrspace(4)** %.136
%.138 = mul i64 %.127, 1
%.139 = add i64 0, %.138
%.140 = getelementptr float, float addrspace(4)* %.137, i64 %.139
store float %.112, float addrspace(4)* %.140
%.142 = load { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %out
store { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } zeroinitializer, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %out
%.144 = load i64, i64* %i
store i64 0, i64* %i
%.146 = load float, float* %"$30.3"
store float 0.000000e+00, float* %"$30.3"
br label %B47
B47: ; preds = %B30, %B0
%.149 = load { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %out
store { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } zeroinitializer, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %out
%.151 = load { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %inp
store { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } zeroinitializer, { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] }* %inp
%.153 = load i64, i64* %i
store i64 0, i64* %i
%.155 = load i1, i1* %"$0.8"
store i1 false, i1* %"$0.8"
%.158 = load i8*, i8** %"$const47.1"
store i8* null, i8** %"$const47.1"
%.160 = load i8*, i8** %"$const47.1"
%.162 = load i8*, i8** %"$47.2"
store i8* %.160, i8** %"$47.2"
%.164 = load i8*, i8** %"$const47.1"
store i8* null, i8** %"$const47.1"
%.166 = load i8*, i8** %"$47.2"
store i8* %.166, i8** %.ret
ret i32 0
}
declare spir_func i64 @_Z13get_global_idj(i32)
define spir_kernel void @hsaPy_hsapy_devfn__5F__5F_main_5F__5F__2E_copy_5F_kernel_5F_1d_24_1_2E_array_28_float32_2C__20_1d_2C__20_C_29__2E_array_28_float32_2C__20_1d_2C__20_C_29_(i8 addrspace(1)* %.1, i8 addrspace(1)* %.2, i64 %.3, i64 %.4, float addrspace(1)* %.5, i64 %.6, i64 %.7, i8 addrspace(1)* %.8, i8 addrspace(1)* %.9, i64 %.10, i64 %.11, float addrspace(1)* %.12, i64 %.13, i64 %.14) {
.16:
%.17 = addrspacecast i8 addrspace(1)* %.1 to i8*
%.18 = addrspacecast i8 addrspace(1)* %.2 to i8*
%.19 = addrspacecast float addrspace(1)* %.5 to float addrspace(4)*
%.20 = addrspacecast i8 addrspace(1)* %.8 to i8*
%.21 = addrspacecast i8 addrspace(1)* %.9 to i8*
%.22 = addrspacecast float addrspace(1)* %.12 to float addrspace(4)*
%inserted.meminfo = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } undef, i8* %.17, 0
%inserted.parent = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.meminfo, i8* %.18, 1
%inserted.nitems = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.parent, i64 %.3, 2
%inserted.itemsize = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.nitems, i64 %.4, 3
%inserted.data = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.itemsize, float addrspace(4)* %.19, 4
%.23 = insertvalue [1 x i64] undef, i64 %.6, 0
%inserted.shape = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.data, [1 x i64] %.23, 5
%.24 = insertvalue [1 x i64] undef, i64 %.7, 0
%inserted.strides = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.shape, [1 x i64] %.24, 6
%inserted.meminfo.1 = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } undef, i8* %.20, 0
%inserted.parent.1 = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.meminfo.1, i8* %.21, 1
%inserted.nitems.1 = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.parent.1, i64 %.10, 2
%inserted.itemsize.1 = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.nitems.1, i64 %.11, 3
%inserted.data.1 = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.itemsize.1, float addrspace(4)* %.22, 4
%.25 = insertvalue [1 x i64] undef, i64 %.13, 0
%inserted.shape.1 = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.data.1, [1 x i64] %.25, 5
%.26 = insertvalue [1 x i64] undef, i64 %.14, 0
%inserted.strides.1 = insertvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.shape.1, [1 x i64] %.26, 6
%.27 = alloca i8*
store i8* null, i8** %.27
%extracted.meminfo = extractvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides, 0
%extracted.parent = extractvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides, 1
%extracted.nitems = extractvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides, 2
%extracted.itemsize = extractvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides, 3
%extracted.data = extractvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides, 4
%extracted.shape = extractvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides, 5
%.29 = extractvalue [1 x i64] %extracted.shape, 0
%extracted.strides = extractvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides, 6
%.30 = extractvalue [1 x i64] %extracted.strides, 0
%extracted.meminfo.1 = extractvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides.1, 0
%extracted.parent.1 = extractvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides.1, 1
%extracted.nitems.1 = extractvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides.1, 2
%extracted.itemsize.1 = extractvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides.1, 3
%extracted.data.1 = extractvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides.1, 4
%extracted.shape.1 = extractvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides.1, 5
%.31 = extractvalue [1 x i64] %extracted.shape.1, 0
%extracted.strides.1 = extractvalue { i8*, i8*, i64, i64, float addrspace(4)*, [1 x i64], [1 x i64] } %inserted.strides.1, 6
%.32 = extractvalue [1 x i64] %extracted.strides.1, 0
%.33 = call spir_func i32 @hsapy_devfn__5F__5F_main_5F__5F__2E_copy_5F_kernel_5F_1d_24_1_2E_array_28_float32_2C__20_1d_2C__20_C_29__2E_array_28_float32_2C__20_1d_2C__20_C_29_(i8** %.27, i8* %extracted.meminfo, i8* %extracted.parent, i64 %extracted.nitems, i64 %extracted.itemsize, float addrspace(4)* %extracted.data, i64 %.29, i64 %.30, i8* %extracted.meminfo.1, i8* %extracted.parent.1, i64 %extracted.nitems.1, i64 %extracted.itemsize.1, float addrspace(4)* %extracted.data.1, i64 %.31, i64 %.32)
%.34 = icmp eq i32 %.33, 0
%.35 = icmp eq i32 %.33, -2
%.36 = or i1 %.34, %.35
%.37 = xor i1 %.36, true
%.38 = icmp eq i32 %.33, -1
%.39 = icmp eq i32 %.33, -3
%.40 = icmp sge i32 %.33, 1
%.41 = load i8*, i8** %.27
ret void
}
!opencl.kernels = !{!0}
!opencl.ocl.version = !{!6}
!opencl.spir.version = !{!6}
!0 = !{void (i8 addrspace(1)*, i8 addrspace(1)*, i64, i64, float addrspace(1)*, i64, i64, i8 addrspace(1)*, i8 addrspace(1)*, i64, i64, float addrspace(1)*, i64, i64)* @hsaPy_hsapy_devfn__5F__5F_main_5F__5F__2E_copy_5F_kernel_5F_1d_24_1_2E_array_28_float32_2C__20_1d_2C__20_C_29__2E_array_28_float32_2C__20_1d_2C__20_C_29_, !1, !2, !3, !4, !5}
!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 1, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0}
!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none"}
!3 = !{!"kernel_arg_type", !"i8 addrspace(1)* ", !"i8 addrspace(1)* ", !"i64", !"i64", !"float addrspace(1)* ", !"i64", !"i64", !"i8 addrspace(1)* ", !"i8 addrspace(1)* ", !"i64", !"i64", !"float addrspace(1)* ", !"i64", !"i64"}
!4 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !"", !"", !"", !"", !"", !"", !"", !"", !"", !""}
!5 = !{!"kernel_arg_base_type", !"i8 addrspace(1)* ", !"i8 addrspace(1)* ", !"i64", !"i64", !"float addrspace(1)* ", !"i64", !"i64", !"i8 addrspace(1)* ", !"i8 addrspace(1)* ", !"i64", !"i64", !"float addrspace(1)* ", !"i64", !"i64"}
!6 = !{i32 2, i32 0}
!7 = !{i64 0, i64 9223372036854775807}
{
<Suppress_leak_in_stack_trace_handler>
Memcheck:Leak
match-leak-kinds: definite
fun:malloc
fun:_ZL17CreateSigAltStackv
fun:_ZL16RegisterHandlersv
fun:_ZN4llvm3sys16AddSignalHandlerEPFvPvES1_
fun:_ZN4llvm3sys28PrintStackTraceOnErrorSignalEb
}
{
<Suppress_leak_in_stack_trace_handler_with_debug_info>
Memcheck:Leak
match-leak-kinds: definite
fun:malloc
fun:CreateSigAltStack
fun:_ZL16RegisterHandlersv
fun:_ZN4llvm3sys16AddSignalHandlerEPFvPvES1_
}
{
<Suppress_leak_in_stack_trace_in_docker>
Memcheck:Leak
match-leak-kinds: definite
fun:malloc
fun:_ZL16RegisterHandlersv
}
#include "rocmlite.hh"
#include "gtest/gtest.h"
#define TEST_BASE C_Linkage
// Repeatedly initialise and finalise the library; four balanced rounds must
// complete cleanly (in effect a leak check).
TEST(TEST_BASE, Initialization)
{
    int rounds_left = 4;
    while (rounds_left > 0)
    {
        ROC_Initialize();
        ROC_Finalize();
        --rounds_left;
    }
}
#include "rocmlite.hh"
#include "gtest/gtest.h"
#include <string>
#include <fstream>
#include <streambuf>
#include <exception>
#include <regex>
#define TEST_BASE C_Linkage_Functions
using namespace std;
// Helper functions
// Reads the whole of `filename` in text mode and returns its contents.
// Throws std::runtime_error("Could not open file.") if it cannot be opened.
std::string read_ir_from_file(const char * filename)
{
    // Open for reading only: asking an ifstream for ios::out as well makes
    // the open require write access to the file.
    std::ifstream f(filename, std::ios::in);
    if(!f.is_open())
    {
        throw std::runtime_error("Could not open file.");
    }
    std::stringstream buf;
    buf << f.rdbuf();
    return buf.str();
}
// Reads the whole of `filename` in binary mode and returns its bytes.
// Throws std::runtime_error("Could not open file.") if it cannot be opened.
std::string read_bc_from_file(const char * filename)
{
    // Open for reading only: asking an ifstream for ios::out as well makes
    // the open require write access to the file.
    std::ifstream f(filename, std::ios::in | std::ios::binary);
    if(!f.is_open())
    {
        throw std::runtime_error("Could not open file.");
    }
    std::stringstream buf;
    buf << f.rdbuf();
    return buf.str();
}
// Bitcode files that the tests below link into the module, in this order.
std::vector<std::string> _bitcodes = {
    "opencl.amdgcn.bc",
    "ocml.amdgcn.bc",
    "ockl.amdgcn.bc",
    "oclc_correctly_rounded_sqrt_off.amdgcn.bc",
    "oclc_daz_opt_off.amdgcn.bc",
    "oclc_finite_only_off.amdgcn.bc",
    "oclc_isa_version_803.amdgcn.bc",
    "oclc_unsafe_math_off.amdgcn.bc",
    "irif.amdgcn.bc"
};
// Environment class to setup and teardown the LLVMContext.
// This is indicative of the use from python (calls to init to ensure the
// library is initialised and a single call to finalize when gc takes place).
class globalDSOLoadEnv: public ::testing::Environment
{
public:
virtual void SetUp()
{
ROC_Initialize();
}
virtual void TearDown()
{
ROC_Finalize();
}
};
::testing::Environment* const global_env =
::testing::AddGlobalTestEnvironment(new globalDSOLoadEnv);
// A string duplicated through the C API must compare equal to its source and
// be releasable through the matching dispose call.
TEST(TEST_BASE, String_Manipulation)
{
    const char string_orig[] = "Use numba for AMD GPUs!";
    char * const duplicate = ROC_CreateString(string_orig);
    ASSERT_TRUE(nullptr != duplicate);
    ASSERT_STREQ(string_orig, duplicate);
    ROC_DisposeString(duplicate);
}
// Check the module IR parse works cleanly and can then be destroyed.
TEST(TEST_BASE, test_parse_ir_module)
{
    std::string ir = read_ir_from_file("demo_ir.ll");
    ModuleRef* theRef = ROC_ParseModule(ir.c_str());
    // Fail here rather than inside ROC_ModuleDestroy if no handle came back.
    ASSERT_TRUE(theRef != nullptr);
    ROC_ModuleDestroy(theRef);
}
// Check the module BC parse works cleanly and can then be destroyed.
TEST(TEST_BASE, test_parse_bc_module)
{
    std::string bc = read_bc_from_file("opencl.amdgcn.bc");
    ModuleRef* theRef = ROC_ParseBitcode(bc.c_str(), bc.size());
    // Fail here rather than inside ROC_ModuleDestroy if no handle came back.
    ASSERT_TRUE(theRef != nullptr);
    ROC_ModuleDestroy(theRef);
}
// Check link-in works: every builtin bitcode file is linked into the demo
// module, several times over.
TEST(TEST_BASE, test_linkin_modules)
{
    std::string ir = read_ir_from_file("demo_ir.ll");
    ModuleRef* dst = ROC_ParseModule(ir.c_str());
    ASSERT_TRUE(dst != nullptr);
    for (auto& bitcode : _bitcodes)
    {
        std::string builtins_bc = read_bc_from_file(bitcode.c_str());
        ModuleRef* bc_src = ROC_ParseBitcode(builtins_bc.c_str(),
                                             builtins_bc.size());
        ASSERT_TRUE(bc_src != nullptr);
        // Link the builtins into the module, then pointlessly link the same
        // source in a few more times: there was a subtle corruption present in
        // previous versions of the linkin function.
        for(int attempt = 0; attempt < 4; attempt++)
        {
            int ret = ROC_ModuleLinkIn(dst, bc_src);
            EXPECT_TRUE(ret != 0);
        }
        ROC_ModuleDestroy(bc_src);
    }
    ROC_ModuleDestroy(dst);
}
// Test optimization call works on the demo module with all builtins linked in.
TEST(TEST_BASE, test_optimize_module)
{
    std::string ir = read_ir_from_file("demo_ir.ll");
    ModuleRef* dst = ROC_ParseModule(ir.c_str());
    ASSERT_TRUE(dst != nullptr);
    int ret;
    for (auto& bitcode : _bitcodes)
    {
        std::string builtins_bc = read_bc_from_file(bitcode.c_str());
        ModuleRef* bc_src = ROC_ParseBitcode(builtins_bc.c_str(),
                                             builtins_bc.size());
        ASSERT_TRUE(bc_src != nullptr);
        // link the builtins into the module
        ret = ROC_ModuleLinkIn(dst, bc_src);
        EXPECT_TRUE(ret != 0);
        ROC_ModuleDestroy(bc_src);
    }
    // run an optimisation pass over the module
    ret = ROC_ModuleOptimize(dst, 3, 0, 1, "fiji");
    EXPECT_TRUE(ret == 1);
    ROC_ModuleDestroy(dst);
}
// Test that a linked and optimised module can be emitted via
// ROC_ModuleEmitHSAIL and that the emitted text carries an HSA code object
// ISA directive.
TEST(TEST_BASE, test_compile_module_to_HSAIL)
{
    ROC_Initialize();
    std::string ir = read_ir_from_file("demo_ir.ll");
    ModuleRef* dst = ROC_ParseModule(ir.c_str());
    int ret;
    for (auto& bitcode : _bitcodes)
    {
        std::string builtins_bc = read_bc_from_file(bitcode.c_str());
        ModuleRef* bc_src = ROC_ParseBitcode(builtins_bc.c_str(),
                                             builtins_bc.size());
        // link the builtins into the module
        ret = ROC_ModuleLinkIn(dst, bc_src);
        EXPECT_TRUE(ret != 0);
        ROC_ModuleDestroy(bc_src);
    }
    const char * cpu = "fiji";
    // run an optimisation pass over the module
    ret = ROC_ModuleOptimize(dst, 3, 0, 1, cpu);
    ASSERT_TRUE(ret == 1);

    // Start from nullptr: EXPECT_TRUE is non-fatal, so after a failed emit the
    // code below would otherwise read and free an indeterminate pointer.
    char * output = nullptr;
    ret = ROC_ModuleEmitHSAIL(dst, 2, cpu, &output);
    EXPECT_TRUE(ret > 0);
    EXPECT_TRUE(output != nullptr);
    if (ret > 0 && output != nullptr)
    {
        // NOTE(review): building the string this way assumes the emitted
        // buffer is NUL-terminated -- confirm against ROC_ModuleEmitHSAIL.
        std::string hsail(output);
        // check this is an HSA code object, search the dump for an HSA ISA
        // string like '.hsa_code_object_isa 8,0,3,"AMD","AMDGPU"'
        std::regex regex(".*\\.hsa_code_object_isa.*\"AMD\",\"AMDGPU\".*");
        EXPECT_TRUE(std::regex_search(hsail, regex));
    }
    // free(nullptr) is a no-op, so cleanup is safe on the failure path too.
    free(output);
    ROC_ModuleDestroy(dst);
}
// Test that a linked and optimised module can be emitted via
// ROC_ModuleEmitBRIG and that the emitted buffer starts with the ELF magic
// bytes.
TEST(TEST_BASE, test_compile_module_to_BRIG)
{
    ROC_Initialize();
    std::string ir = read_ir_from_file("demo_ir.ll");
    ModuleRef* dst = ROC_ParseModule(ir.c_str());
    int ret;
    for (auto& bitcode : _bitcodes)
    {
        std::string builtins_bc = read_bc_from_file(bitcode.c_str());
        ModuleRef* bc_src = ROC_ParseBitcode(builtins_bc.c_str(),
                                             builtins_bc.size());
        // link the builtins into the module
        ret = ROC_ModuleLinkIn(dst, bc_src);
        EXPECT_TRUE(ret != 0);
        ROC_ModuleDestroy(bc_src);
    }
    const char * cpu = "fiji";
    // run an optimisation pass over the module
    ret = ROC_ModuleOptimize(dst, 3, 0, 1, cpu);
    ASSERT_TRUE(ret == 1);

    // Start from nullptr: EXPECT_TRUE is non-fatal, so after a failed emit the
    // code below would otherwise index and free an indeterminate pointer.
    char * output = nullptr;
    ret = ROC_ModuleEmitBRIG(dst, 2, cpu, &output);
    EXPECT_TRUE(ret > 0);
    EXPECT_TRUE(output != nullptr);
    if (ret > 0 && output != nullptr)
    {
        // "\x7f" followed by "ELF"
        const char elf_string[] = "\x7f\x45\x4c\x46";
        // check this is an ELF object
        // NOTE(review): reads four bytes; assumes a successful emit always
        // produces at least that many -- confirm against ROC_ModuleEmitBRIG.
        for (size_t i = 0; i < 4; i++)
        {
            EXPECT_TRUE(output[i]==elf_string[i]);
        }
    }
    // free(nullptr) is a no-op, so cleanup is safe on the failure path too.
    free(output);
    ROC_ModuleDestroy(dst);
}
// Test that the whole parse / link / optimise / emit-BRIG sequence can be run
// several times in a row, each time producing a buffer that starts with the
// ELF magic bytes.
TEST(TEST_BASE, test_many_compile_module_to_BRIG)
{
    const char * cpu = "fiji";
    int trials = 5;
    for(int k = 0; k < trials; k++)
    {
        ROC_Initialize();
        std::string ir = read_ir_from_file("demo_ir.ll");
        ModuleRef* dst = ROC_ParseModule(ir.c_str());
        int ret;
        for (auto& bitcode : _bitcodes)
        {
            std::string builtins_bc = read_bc_from_file(bitcode.c_str());
            ModuleRef* bc_src = ROC_ParseBitcode(builtins_bc.c_str(),
                                                 builtins_bc.size());
            // link the builtins into the module
            ret = ROC_ModuleLinkIn(dst, bc_src);
            EXPECT_TRUE(ret != 0);
            ROC_ModuleDestroy(bc_src);
        }
        // run an optimisation pass over the module
        ret = ROC_ModuleOptimize(dst, 3, 0, 1, cpu);
        ASSERT_TRUE(ret == 1);

        // Start from nullptr: EXPECT_TRUE is non-fatal, so after a failed emit
        // the code below would otherwise index and free an indeterminate
        // pointer.
        char * output = nullptr;
        ret = ROC_ModuleEmitBRIG(dst, 2, cpu, &output);
        EXPECT_TRUE(ret > 0);
        EXPECT_TRUE(output != nullptr);
        if (ret > 0 && output != nullptr)
        {
            // "\x7f" followed by "ELF"
            const char elf_string[] = "\x7f\x45\x4c\x46";
            // check this is an ELF object
            // NOTE(review): reads four bytes; assumes a successful emit always
            // produces at least that many -- confirm against
            // ROC_ModuleEmitBRIG.
            for (size_t i = 0; i < 4; i++)
            {
                EXPECT_TRUE(output[i]==elf_string[i]);
            }
        }
        // free(nullptr) is a no-op, so cleanup is safe on the failure path
        // too.
        free(output);
        ROC_ModuleDestroy(dst);
    }
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment