Update compiler.bak.py, compiler.py files

6e953c61 · zsccc · 6e953c61 · 6e953c61
Commit 6e953c61 authored Dec 05, 2025 by zsccc
Hide whitespace changes
Inline Side-by-side

Showing with 1195 additions and 0 deletions

compiler.bak.py compiler.bak.py +591 -0

compiler.py compiler.py +604 -0

No files found.
--- a/compiler.bak.py
+++ b/compiler.bak.py
+# /root/Documents/bigdata/Anaconda3/envs/pynx/lib/python3.10/site-packages/pycuda/compiler.py
+
+from pytools import memoize
+
+# don't import pycuda.driver here--you'll create an import loop
+import os
+
+import sys
+from tempfile import mkstemp
+from os import unlink
+
+from pytools.prefork import call_capture_output
+
+
+@memoize
+def get_nvcc_version(nvcc):
+    cmdline = [nvcc, "--version"]
+    result, stdout, stderr = call_capture_output(cmdline)
+
+    if result != 0 or not stdout:
+        from warnings import warn
+
+        warn("NVCC version could not be determined.")
+        stdout = "nvcc unknown version"
+
+    return stdout.decode("utf-8", "replace")
+
+
+def _new_md5():
+    try:
+        import hashlib
+
+        return hashlib.md5()
+    except ImportError:
+        # for Python << 2.5
+        import md5
+
+        return md5.new()
+
+
+def preprocess_source(source, options, nvcc):
+    handle, source_path = mkstemp(suffix=".cu")
+
+    outf = open(source_path, "w")
+    outf.write(source)
+    outf.close()
+    os.close(handle)
+
+    cmdline = [nvcc, "--preprocess"] + options + [source_path]
+    if "win32" in sys.platform:
+        cmdline.extend(["--compiler-options", "-EP"])
+    else:
+        cmdline.extend(["--compiler-options", "-P"])
+
+    result, stdout, stderr = call_capture_output(cmdline, error_on_nonzero=False)
+
+    if result != 0:
+        from pycuda.driver import CompileError
+
+        raise CompileError(
+            "nvcc preprocessing of %s failed" % source_path, cmdline, stderr=stderr
+        )
+
+    # sanity check
+    if len(stdout) < 0.5 * len(source):
+        from pycuda.driver import CompileError
+
+        raise CompileError(
+            "nvcc preprocessing of %s failed with ridiculously "
+            "small code output - likely unsupported compiler." % source_path,
+            cmdline,
+            stderr=stderr.decode("utf-8", "replace"),
+        )
+
+    unlink(source_path)
+
+    preprocessed_str = stdout.decode("utf-8", "replace")
+
+    # remove the temporary filename from the preprocessed source code to get reproducible hashes
+    return preprocessed_str.replace(os.path.basename(source_path), "")
+
+
+def compile_plain(source, options, keep, nvcc, cache_dir, target="cubin"):
+    from os.path import join
+
+    assert target in ["cubin", "ptx", "fatbin"]
+
+    if cache_dir:
+        checksum = _new_md5()
+
+        if "#include" in source:
+            checksum.update(preprocess_source(source, options, nvcc).encode("utf-8"))
+        else:
+            checksum.update(source.encode("utf-8"))
+
+        for option in options:
+            checksum.update(option.encode("utf-8"))
+        checksum.update(get_nvcc_version(nvcc).encode("utf-8"))
+        from pycuda.characterize import platform_bits
+
+        checksum.update(str(platform_bits()).encode("utf-8"))
+
+        cache_file = checksum.hexdigest()
+        cache_path = join(cache_dir, cache_file + "." + target)
+
+        try:
+            cache_file = open(cache_path, "rb")
+            try:
+                return cache_file.read()
+            finally:
+                cache_file.close()
+
+        except Exception:
+            pass
+
+    from tempfile import mkdtemp
+
+    file_dir = mkdtemp()
+    file_root = "kernel"
+
+    cu_file_name = file_root + ".cu"
+    cu_file_path = join(file_dir, cu_file_name)
+
+    outf = open(cu_file_path, "w")
+    outf.write(str(source))
+    outf.close()
+
+    if keep:
+        options = options[:]
+        options.append("--keep")
+
+        print("*** compiler output in %s" % file_dir)
+
+    cmdline = [nvcc, "--" + target] + options + [cu_file_name]
+    result, stdout, stderr = call_capture_output(
+        cmdline, cwd=file_dir, error_on_nonzero=False
+    )
+
+    try:
+        result_f = open(join(file_dir, file_root + "." + target), "rb")
+    except OSError:
+        no_output = True
+    else:
+        no_output = False
+
+    if result != 0 or (no_output and (stdout or stderr)):
+        if result == 0:
+            from warnings import warn
+
+            warn(
+                "PyCUDA: nvcc exited with status 0, but appears to have "
+                "encountered an error"
+            )
+        from pycuda.driver import CompileError
+
+        raise CompileError(
+            "nvcc compilation of %s failed" % cu_file_path,
+            cmdline,
+            stdout=stdout.decode("utf-8", "replace"),
+            stderr=stderr.decode("utf-8", "replace"),
+        )
+
+    if stdout or stderr:
+        lcase_err_text = (stdout + stderr).decode("utf-8", "replace").lower()
+        from warnings import warn
+
+        if "demoted" in lcase_err_text or "demoting" in lcase_err_text:
+            warn(
+                "nvcc said it demoted types in source code it "
+                "compiled--this is likely not what you want.",
+                stacklevel=4,
+            )
+        warn(
+            "The CUDA compiler succeeded, but said the following:\n"
+            + (stdout + stderr).decode("utf-8", "replace"),
+            stacklevel=4,
+        )
+
+    result_data = result_f.read()
+    result_f.close()
+
+    if cache_dir:
+        outf = open(cache_path, "wb")
+        outf.write(result_data)
+        outf.close()
+
+    if not keep:
+        from os import listdir, unlink, rmdir
+
+        for name in listdir(file_dir):
+            unlink(join(file_dir, name))
+        rmdir(file_dir)
+
+    return result_data
+
+
+def _get_per_user_string():
+    try:
+        from os import getuid
+    except ImportError:
+        checksum = _new_md5()
+        from os import environ
+
+        checksum.update(environ["USERNAME"].encode("utf-8"))
+        return checksum.hexdigest()
+    else:
+        return "uid%d" % getuid()
+
+
+def _find_pycuda_include_path():
+    import importlib.util
+    import os
+
+    return os.path.abspath(
+        os.path.join(importlib.util.find_spec("pycuda").origin,
+                     os.path.pardir, "cuda"))
+
+
+DEFAULT_NVCC_FLAGS = [
+    _flag.strip()
+    for _flag in os.environ.get("PYCUDA_DEFAULT_NVCC_FLAGS", "").split()
+    if _flag.strip()
+]
+
+
+def compile(
+    source,
+    nvcc="nvcc",
+    options=None,
+    keep=False,
+    no_extern_c=False,
+    arch=None,
+    code=None,
+    cache_dir=None,
+    include_dirs=[],
+    target="cubin",
+):
+
+    assert target in ["cubin", "ptx", "fatbin"]
+
+    if not no_extern_c:
+        source = 'extern "C" {\n%s\n}\n' % source
+
+    if options is None:
+        options = DEFAULT_NVCC_FLAGS
+
+    options = options[:]
+    if arch is None:
+        from pycuda.driver import Error
+
+        try:
+            from pycuda.driver import Context
+
+            arch = "sm_%d%d" % Context.get_device().compute_capability()
+        except Error:
+            pass
+
+    from pycuda.driver import CUDA_DEBUGGING
+
+    if CUDA_DEBUGGING:
+        cache_dir = False
+        keep = True
+        options.extend(["-g", "-G"])
+
+    if "PYCUDA_CACHE_DIR" in os.environ and cache_dir is None:
+        cache_dir = os.environ["PYCUDA_CACHE_DIR"]
+
+    if "PYCUDA_DISABLE_CACHE" in os.environ:
+        cache_dir = False
+
+    if cache_dir is None:
+        import platformdirs
+
+        cache_dir = os.path.join(
+            platformdirs.user_cache_dir("pycuda", "pycuda"), "compiler-cache-v1"
+        )
+
+        from os import makedirs
+        makedirs(cache_dir, exist_ok=True)
+
+    if arch is not None:
+        options.extend(["-arch", arch])
+
+    if code is not None:
+        options.extend(["-code", code])
+
+    if "darwin" in sys.platform and sys.maxsize == 9223372036854775807:
+        options.append("-m64")
+    elif "win32" in sys.platform and sys.maxsize == 9223372036854775807:
+        options.append("-m64")
+    elif "win32" in sys.platform and sys.maxsize == 2147483647:
+        options.append("-m32")
+
+    include_dirs = include_dirs + [_find_pycuda_include_path()]
+
+    for i in include_dirs:
+        options.append("-I" + i)
+
+    return compile_plain(source, options, keep, nvcc, cache_dir, target)
+
+
+class CudaModule:
+    def _check_arch(self, arch):
+        if arch is None:
+            return
+        try:
+            from pycuda.driver import Context
+
+            capability = Context.get_device().compute_capability()
+            if tuple(map(int, tuple(arch.split("_")[1]))) > capability:
+                from warnings import warn
+
+                warn(
+                    "trying to compile for a compute capability "
+                    "higher than selected GPU"
+                )
+        except Exception:
+            pass
+
+    def _bind_module(self):
+        self.get_global = self.module.get_global
+        self.get_texref = self.module.get_texref
+        if hasattr(self.module, "get_surfref"):
+            self.get_surfref = self.module.get_surfref
+
+    def get_function(self, name):
+        return self.module.get_function(name)
+
+
+class SourceModule(CudaModule):
+    """
+    Creates a Module from a single .cu source object linked against the
+    static CUDA runtime.
+    """
+
+    def __init__(
+        self,
+        source,
+        nvcc="nvcc",
+        options=None,
+        keep=False,
+        no_extern_c=False,
+        arch=None,
+        code=None,
+        cache_dir=None,
+        include_dirs=[],
+    ):
+        self._check_arch(arch)
+
+        cubin = compile(
+            source,
+            nvcc,
+            options,
+            keep,
+            no_extern_c,
+            arch,
+            code,
+            cache_dir,
+            include_dirs,
+        )
+
+        from pycuda.driver import module_from_buffer
+
+        self.module = module_from_buffer(cubin)
+
+        self._bind_module()
+
+
+def _search_on_path(filenames):
+    """Find file on system path."""
+    # http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52224
+
+    from os.path import exists, abspath, join
+    from os import pathsep, environ
+
+    search_path = environ["PATH"]
+
+    paths = search_path.split(pathsep)
+    for path in paths:
+        for filename in filenames:
+            if exists(join(path, filename)):
+                return abspath(join(path, filename))
+
+
+@memoize
+def _find_nvcc_on_path():
+    return _search_on_path(["nvcc", "nvcc.exe"])
+
+
+class DynamicModule(CudaModule):
+    """
+    Creates a Module from multiple .cu source, library file and/or data
+    objects linked against the static or dynamic CUDA runtime.
+    """
+
+    def __init__(
+        self,
+        nvcc="nvcc",
+        link_options=None,
+        keep=False,
+        no_extern_c=False,
+        arch=None,
+        code=None,
+        cache_dir=None,
+        include_dirs=[],
+        message_handler=None,
+        log_verbose=False,
+        cuda_libdir=None,
+    ):
+        from pycuda.driver import Context
+
+        compute_capability = Context.get_device().compute_capability()
+        if compute_capability < (3, 5):
+            raise Exception(
+                "Minimum compute capability for dynamic parallelism is 3.5 (found: %u.%u)!"
+                % (compute_capability[0], compute_capability[1])
+            )
+        else:
+            from pycuda.driver import Linker
+
+            self.linker = Linker(message_handler, link_options, log_verbose)
+        self._check_arch(arch)
+        self.nvcc = nvcc
+        self.keep = keep
+        self.no_extern_c = no_extern_c
+        self.arch = arch
+        self.code = code
+        self.cache_dir = cache_dir
+        self.include_dirs = include_dirs
+        self.cuda_libdir = cuda_libdir
+        self.libdir, self.libptn = None, None
+        self.module = None
+
+    def _locate_cuda_libdir(self):
+        """
+        Locate the "standard" CUDA SDK library directory in the local
+        file system. Supports 64-Bit Windows, Linux and Mac OS X.
+        In case the caller supplied cuda_libdir in the constructor
+        other than None that value is returned unchecked, else a
+        best-effort attempt is made.
+        Precedence:
+            Windows: cuda_libdir > %CUDA_PATH%
+            Linux:   cuda_libdir > $CUDA_ROOT > $LD_LIBRARY_PATH > '/usr/lib/x86_64-linux-gnu'
+        Returns a pair (libdir, libptn) where libdir is None in case
+            of failure or a string containing the absolute path of the
+            directory, and libptn is the %-format pattern to construct
+            library file names from library names on the local system.
+        Raises a RuntimeError in case of failure.
+        Links:
+        - Post-installation Actions
+          http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#post-installation-actions
+        TODO:
+        - Is $CUDA_ROOT/lib64 the correct path to assume for 64-Bit CUDA libraries on Linux?
+        - Mac OS X (Darwin) is currently treated like Linux, is that correct?
+        - Check CMake's FindCUDA module, it might contain some helpful clues in its sources
+          https://cmake.org/cmake/help/v3.0/module/FindCUDA.html
+          https://github.com/Kitware/CMake/blob/master/Modules/FindCUDA.cmake
+        - Verify all Linux code paths somehow
+        """
+        from os.path import isfile, join
+        from platform import system as platform_system
+
+        system = platform_system()
+        libdir, libptn = None, None
+        if system == "Windows":
+            if self.cuda_libdir is not None:
+                libdir = self.cuda_libdir
+            elif "CUDA_PATH" in os.environ and isfile(
+                join(os.environ["CUDA_PATH"], "lib\\x64\\cudadevrt.lib")
+            ):
+                libdir = join(os.environ["CUDA_PATH"], "lib\\x64")
+            libptn = "%s.lib"
+        elif system in ["Linux", "Darwin"]:
+            if self.cuda_libdir is not None:
+                libdir = self.cuda_libdir
+            elif "CUDA_ROOT" in os.environ and isfile(
+                join(os.environ["CUDA_ROOT"], "lib64/libcudadevrt.a")
+            ):
+                libdir = join(os.environ["CUDA_ROOT"], "lib64")
+            elif "LD_LIBRARY_PATH" in os.environ:
+                for ld_path in os.environ["LD_LIBRARY_PATH"].split(":"):
+                    if isfile(join(ld_path, "libcudadevrt.a")):
+                        libdir = ld_path
+                        break
+
+            if libdir is None and isfile("/usr/lib/x86_64-linux-gnu/libcudadevrt.a"):
+                libdir = "/usr/lib/x86_64-linux-gnu"
+
+            if libdir is None:
+                nvcc_path = _find_nvcc_on_path()
+                if nvcc_path is not None:
+                    libdir = join(os.path.dirname(nvcc_path), "..", "lib64")
+
+            libptn = "lib%s.a"
+        if libdir is None:
+            raise RuntimeError(
+                "Unable to locate the CUDA SDK installation "
+                "directory, set CUDA library path manually"
+            )
+        return libdir, libptn
+
+    def add_source(self, source, nvcc_options=None, name="kernel.ptx"):
+        ptx = compile(
+            source,
+            nvcc=self.nvcc,
+            options=nvcc_options,
+            keep=self.keep,
+            no_extern_c=self.no_extern_c,
+            arch=self.arch,
+            code=self.code,
+            cache_dir=self.cache_dir,
+            include_dirs=self.include_dirs,
+            target="ptx",
+        )
+        from pycuda.driver import jit_input_type
+
+        self.linker.add_data(ptx, jit_input_type.PTX, name)
+        return self
+
+    def add_data(self, data, input_type, name="unknown"):
+        self.linker.add_data(data, input_type, name)
+        return self
+
+    def add_file(self, filename, input_type):
+        self.linker.add_file(filename, input_type)
+        return self
+
+    def add_stdlib(self, libname):
+        if self.libdir is None:
+            self.libdir, self.libptn = self._locate_cuda_libdir()
+        from os.path import isfile, join
+
+        libpath = join(self.libdir, self.libptn % libname)
+        if not isfile(libpath):
+            raise OSError('CUDA SDK library file "%s" not found' % libpath)
+        from pycuda.driver import jit_input_type
+
+        self.linker.add_file(libpath, jit_input_type.LIBRARY)
+        return self
+
+    def link(self):
+        self.module = self.linker.link_module()
+        self.linker = None
+        self._bind_module()
+        return self
+
+
+class DynamicSourceModule(DynamicModule):
+    """
+    Creates a Module from a single .cu source object linked against the
+    dynamic CUDA runtime.
+    - compiler generates PTX relocatable device code (rdc) from source that
+      can be linked with other relocatable device code
+    - source is linked against the CUDA device runtime library cudadevrt
+    - library cudadevrt is statically linked into the generated Module
+    """
+
+    def __init__(
+        self,
+        source,
+        nvcc="nvcc",
+        options=None,
+        keep=False,
+        no_extern_c=False,
+        arch=None,
+        code=None,
+        cache_dir=None,
+        include_dirs=[],
+        cuda_libdir=None,
+    ):
+        super().__init__(
+            nvcc=nvcc,
+            link_options=None,
+            keep=keep,
+            no_extern_c=no_extern_c,
+            arch=arch,
+            code=code,
+            cache_dir=cache_dir,
+            include_dirs=include_dirs,
+            cuda_libdir=cuda_libdir,
+        )
+        if options is None:
+            options = DEFAULT_NVCC_FLAGS
+        options = options[:]
+        if "-rdc=true" not in options:
+            options.append("-rdc=true")
+        if "-lcudadevrt" not in options:
+            options.append("-lcudadevrt")
+        self.add_source(source, nvcc_options=options)
+        self.add_stdlib("cudadevrt")
+        self.link()
--- a/compiler.py
+++ b/compiler.py
+# /root/anaconda3/envs/pynx/lib/python3.10/site-packages/pycuda-2024.1.2-py3.10-linux-x86_64.egg/pycuda/compiler.py
+
+from pytools import memoize
+
+# don't import pycuda.driver here--you'll create an import loop
+import os
+
+import sys
+from tempfile import mkstemp
+from os import unlink
+
+from pytools.prefork import call_capture_output
+
+
+@memoize
+def get_nvcc_version(nvcc):
+    cmdline = [nvcc, "--version"]
+    result, stdout, stderr = call_capture_output(cmdline)
+
+    if result != 0 or not stdout:
+        from warnings import warn
+
+        warn("NVCC version could not be determined.")
+        stdout = b"nvcc unknown version"
+
+    return stdout.decode("utf-8", "replace")
+
+
+def _new_md5():
+    try:
+        import hashlib
+
+        return hashlib.md5()
+    except ImportError:
+        # for Python << 2.5
+        import md5
+
+        return md5.new()
+
+
+def preprocess_source(source, options, nvcc):
+    handle, source_path = mkstemp(suffix=".cu")
+
+    outf = open(source_path, "w")
+    outf.write(source)
+    outf.close()
+    os.close(handle)
+
+    cmdline = [nvcc, "--preprocess"] + options + [source_path]
+    if "win32" in sys.platform:
+        cmdline.extend(["--compiler-options", "-EP"])
+    else:
+        cmdline.extend(["--compiler-options", "-P"])
+
+    result, stdout, stderr = call_capture_output(cmdline, error_on_nonzero=False)
+
+    if result != 0:
+        from pycuda.driver import CompileError
+
+        raise CompileError(
+            "nvcc preprocessing of %s failed" % source_path, cmdline, stderr=stderr
+        )
+
+    # sanity check
+    if len(stdout) < 0.5 * len(source):
+        from pycuda.driver import CompileError
+
+        raise CompileError(
+            "nvcc preprocessing of %s failed with ridiculously "
+            "small code output - likely unsupported compiler." % source_path,
+            cmdline,
+            stderr=stderr.decode("utf-8", "replace"),
+        )
+
+    unlink(source_path)
+
+    preprocessed_str = stdout.decode("utf-8", "replace")
+
+    # remove the temporary filename from the preprocessed source code to get reproducible hashes
+    return preprocessed_str.replace(os.path.basename(source_path), "")
+
+
+def compile_plain(source, options, keep, nvcc, cache_dir, target="cubin"):
+    from os.path import join
+
+    assert target in ["cubin", "ptx", "fatbin"]
+
+    if cache_dir:
+        checksum = _new_md5()
+
+        if "#include" in source:
+            checksum.update(preprocess_source(source, options, nvcc).encode("utf-8"))
+        else:
+            checksum.update(source.encode("utf-8"))
+
+        for option in options:
+            checksum.update(option.encode("utf-8"))
+        checksum.update(get_nvcc_version(nvcc).encode("utf-8"))
+        from pycuda.characterize import platform_bits
+
+        checksum.update(str(platform_bits()).encode("utf-8"))
+
+        cache_file = checksum.hexdigest()
+        cache_path = join(cache_dir, cache_file + "." + target)
+
+        try:
+            cache_file = open(cache_path, "rb")
+            try:
+                return cache_file.read()
+            finally:
+                cache_file.close()
+
+        except Exception:
+            pass
+
+    from tempfile import mkdtemp
+
+    file_dir = mkdtemp()
+    file_root = "kernel"
+
+    cu_file_name = file_root + ".cu"
+    cu_file_path = join(file_dir, cu_file_name)
+
+    outf = open(cu_file_path, "w")
+    outf.write(str(source))
+    outf.close()
+
+    if keep:
+        options = options[:]
+        options.append("--keep")
+
+        print("*** compiler output in %s" % file_dir)
+
+    cmdline = [nvcc, "--" + target] + options + [cu_file_name]
+    result, stdout, stderr = call_capture_output(
+        cmdline, cwd=file_dir, error_on_nonzero=False
+    )
+
+    try:
+        # GPUfusion: kernel.cubin <=> kernel.cu-hip-amdgcn-amd-amdhsa.hipfb
+        if target == "cubin": 
+            result_f = open(join(file_dir, cu_file_name + "-hip-amdgcn-amd-amdhsa.hipfb"), "rb")
+            print("compile file")
+            print(join(file_dir, cu_file_name + "-hip-amdgcn-amd-amdhsa.hipfb"))
+        # GPUfusion: kernel.ptx <=> kernel-hip-amdgcn-amd-amdhsa-gfx906.bc
+        elif target == "ptx":
+            result_f = open(join(file_dir, file_root + "-hip-amdgcn-amd-amdhsa-gfx906.bc"), "rb")
+            print("compile file")
+            print(join(file_dir, file_root + "-hip-amdgcn-amd-amdhsa-gfx906.bc"))
+        else:
+            result_f = open(join(file_dir, file_root + "." + target), "rb")
+            print("compile file")
+            print(join(file_dir, file_root + "." + target))
+    except OSError:
+        no_output = True
+    else:
+        no_output = False
+
+    if result != 0 or (no_output and (stdout or stderr)):
+        if result == 0:
+            from warnings import warn
+
+            warn(
+                "PyCUDA: nvcc exited with status 0, but appears to have "
+                "encountered an error"
+            )
+        from pycuda.driver import CompileError
+
+        raise CompileError(
+            "nvcc compilation of %s failed" % cu_file_path,
+            cmdline,
+            stdout=stdout.decode("utf-8", "replace"),
+            stderr=stderr.decode("utf-8", "replace"),
+        )
+
+    if stdout or stderr:
+        lcase_err_text = (stdout + stderr).decode("utf-8", "replace").lower()
+        from warnings import warn
+
+        if "demoted" in lcase_err_text or "demoting" in lcase_err_text:
+            warn(
+                "nvcc said it demoted types in source code it "
+                "compiled--this is likely not what you want.",
+                stacklevel=4,
+            )
+        warn(
+            "The CUDA compiler succeeded, but said the following:\n"
+            + (stdout + stderr).decode("utf-8", "replace"),
+            stacklevel=4,
+        )
+
+    result_data = result_f.read()
+    result_f.close()
+
+    if cache_dir:
+        outf = open(cache_path, "wb")
+        outf.write(result_data)
+        outf.close()
+
+    if not keep:
+        from os import listdir, unlink, rmdir
+
+        for name in listdir(file_dir):
+            unlink(join(file_dir, name))
+        rmdir(file_dir)
+
+    return result_data
+
+
+def _get_per_user_string():
+    try:
+        from os import getuid
+    except ImportError:
+        checksum = _new_md5()
+        from os import environ
+
+        checksum.update(environ["USERNAME"].encode("utf-8"))
+        return checksum.hexdigest()
+    else:
+        return "uid%d" % getuid()
+
+
+def _find_pycuda_include_path():
+    import importlib.util
+    import os
+
+    return os.path.abspath(
+        os.path.join(importlib.util.find_spec("pycuda").origin,
+                     os.path.pardir, "cuda"))
+
+
+DEFAULT_NVCC_FLAGS = [
+    _flag.strip()
+    for _flag in os.environ.get("PYCUDA_DEFAULT_NVCC_FLAGS", "").split()
+    if _flag.strip()
+]
+
+
+def compile(
+    source,
+    nvcc="nvcc",
+    options=None,
+    keep=False,
+    no_extern_c=False,
+    arch=None,
+    code=None,
+    cache_dir=None,
+    include_dirs=[],
+    target="cubin",
+):
+
+    assert target in ["cubin", "ptx", "fatbin"]
+
+    if not no_extern_c:
+        source = 'extern "C" {\n%s\n}\n' % source
+
+    if options is None:
+        options = DEFAULT_NVCC_FLAGS
+
+    options = options[:]
+    if arch is None:
+        from pycuda.driver import Error
+
+        try:
+            from pycuda.driver import Context
+
+            arch = "sm_%d%d" % Context.get_device().compute_capability()
+        except Error:
+            pass
+
+    from pycuda.driver import CUDA_DEBUGGING
+
+    if CUDA_DEBUGGING:
+        cache_dir = False
+        keep = True
+        options.extend(["-g", "-G"])
+
+    if "PYCUDA_CACHE_DIR" in os.environ and cache_dir is None:
+        cache_dir = os.environ["PYCUDA_CACHE_DIR"]
+
+    if "PYCUDA_DISABLE_CACHE" in os.environ:
+        cache_dir = False
+
+    if cache_dir is None:
+        import platformdirs
+
+        cache_dir = os.path.join(
+            platformdirs.user_cache_dir("pycuda", "pycuda"), "compiler-cache-v1"
+        )
+
+        from os import makedirs
+        makedirs(cache_dir, exist_ok=True)
+
+    if arch is not None:
+        options.extend(["-arch", arch])
+
+    if code is not None:
+        options.extend(["-code", code])
+
+    if "darwin" in sys.platform and sys.maxsize == 9223372036854775807:
+        options.append("-m64")
+    elif "win32" in sys.platform and sys.maxsize == 9223372036854775807:
+        options.append("-m64")
+    elif "win32" in sys.platform and sys.maxsize == 2147483647:
+        options.append("-m32")
+
+    include_dirs = include_dirs + [_find_pycuda_include_path()]
+
+    for i in include_dirs:
+        options.append("-I" + i)
+
+    return compile_plain(source, options, keep, nvcc, cache_dir, target)
+
+
+class CudaModule:
+    def _check_arch(self, arch):
+        if arch is None:
+            return
+        try:
+            from pycuda.driver import Context
+
+            capability = Context.get_device().compute_capability()
+            if tuple(map(int, tuple(arch.split("_")[1]))) > capability:
+                from warnings import warn
+
+                warn(
+                    "trying to compile for a compute capability "
+                    "higher than selected GPU"
+                )
+        except Exception:
+            pass
+
+    def _bind_module(self):
+        self.get_global = self.module.get_global
+        self.get_texref = self.module.get_texref
+        if hasattr(self.module, "get_surfref"):
+            self.get_surfref = self.module.get_surfref
+
+    def get_function(self, name):
+        return self.module.get_function(name)
+
+
+class SourceModule(CudaModule):
+    """
+    Creates a Module from a single .cu source object linked against the
+    static CUDA runtime.
+    """
+
+    def __init__(
+        self,
+        source,
+        nvcc="nvcc",
+        options=None,
+        keep=False,
+        no_extern_c=False,
+        arch=None,
+        code=None,
+        cache_dir=None,
+        include_dirs=[],
+    ):
+        self._check_arch(arch)
+
+        cubin = compile(
+            source,
+            nvcc,
+            options,
+            keep,
+            no_extern_c,
+            arch,
+            code,
+            cache_dir,
+            include_dirs,
+        )
+
+        from pycuda.driver import module_from_buffer
+
+        self.module = module_from_buffer(cubin)
+
+        self._bind_module()
+
+
+def _search_on_path(filenames):
+    """Find file on system path."""
+    # http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52224
+
+    from os.path import exists, abspath, join
+    from os import pathsep, environ
+
+    search_path = environ["PATH"]
+
+    paths = search_path.split(pathsep)
+    for path in paths:
+        for filename in filenames:
+            if exists(join(path, filename)):
+                return abspath(join(path, filename))
+
+
+@memoize
+def _find_nvcc_on_path():
+    return _search_on_path(["nvcc", "nvcc.exe"])
+
+
+class DynamicModule(CudaModule):
+    """
+    Creates a Module from multiple .cu source, library file and/or data
+    objects linked against the static or dynamic CUDA runtime.
+    """
+
+    def __init__(
+        self,
+        nvcc="nvcc",
+        link_options=None,
+        keep=False,
+        no_extern_c=False,
+        arch=None,
+        code=None,
+        cache_dir=None,
+        include_dirs=[],
+        message_handler=None,
+        log_verbose=False,
+        cuda_libdir=None,
+    ):
+        from pycuda.driver import Context
+
+        compute_capability = Context.get_device().compute_capability()
+        if compute_capability < (3, 5):
+            raise Exception(
+                "Minimum compute capability for dynamic parallelism is 3.5 (found: %u.%u)!"
+                % (compute_capability[0], compute_capability[1])
+            )
+        else:
+            from pycuda.driver import Linker
+
+            self.linker = Linker(message_handler, link_options, log_verbose)
+        self._check_arch(arch)
+        self.nvcc = nvcc
+        self.keep = keep
+        self.no_extern_c = no_extern_c
+        self.arch = arch
+        self.code = code
+        self.cache_dir = cache_dir
+        self.include_dirs = include_dirs
+        self.cuda_libdir = cuda_libdir
+        self.libdir, self.libptn = None, None
+        self.module = None
+
+    def _locate_cuda_libdir(self):
+        """
+        Locate the "standard" CUDA SDK library directory in the local
+        file system. Supports 64-Bit Windows, Linux and Mac OS X.
+        In case the caller supplied cuda_libdir in the constructor
+        other than None that value is returned unchecked, else a
+        best-effort attempt is made.
+        Precedence:
+            Windows: cuda_libdir > %CUDA_PATH%
+            Linux:   cuda_libdir > $CUDA_ROOT > $LD_LIBRARY_PATH > '/usr/lib/x86_64-linux-gnu'
+        Returns a pair (libdir, libptn) where libdir is None in case
+            of failure or a string containing the absolute path of the
+            directory, and libptn is the %-format pattern to construct
+            library file names from library names on the local system.
+        Raises a RuntimeError in case of failure.
+        Links:
+        - Post-installation Actions
+          http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#post-installation-actions
+        TODO:
+        - Is $CUDA_ROOT/lib64 the correct path to assume for 64-Bit CUDA libraries on Linux?
+        - Mac OS X (Darwin) is currently treated like Linux, is that correct?
+        - Check CMake's FindCUDA module, it might contain some helpful clues in its sources
+          https://cmake.org/cmake/help/v3.0/module/FindCUDA.html
+          https://github.com/Kitware/CMake/blob/master/Modules/FindCUDA.cmake
+        - Verify all Linux code paths somehow
+        """
+        from os.path import isfile, join
+        from platform import system as platform_system
+
+        system = platform_system()
+        libdir, libptn = None, None
+        if system == "Windows":
+            if self.cuda_libdir is not None:
+                libdir = self.cuda_libdir
+            elif "CUDA_PATH" in os.environ and isfile(
+                join(os.environ["CUDA_PATH"], "lib\\x64\\cudadevrt.lib")
+            ):
+                libdir = join(os.environ["CUDA_PATH"], "lib\\x64")
+            libptn = "%s.lib"
+        elif system in ["Linux", "Darwin"]:
+            if self.cuda_libdir is not None:
+                libdir = self.cuda_libdir
+            elif "CUDA_ROOT" in os.environ and isfile(
+                join(os.environ["CUDA_ROOT"], "lib64/libcudadevrt.a")
+            ):
+                libdir = join(os.environ["CUDA_ROOT"], "lib64")
+            elif "LD_LIBRARY_PATH" in os.environ:
+                for ld_path in os.environ["LD_LIBRARY_PATH"].split(":"):
+                    if isfile(join(ld_path, "libcudadevrt.a")):
+                        libdir = ld_path
+                        break
+
+            if libdir is None and isfile("/usr/lib/x86_64-linux-gnu/libcudadevrt.a"):
+                libdir = "/usr/lib/x86_64-linux-gnu"
+
+            if libdir is None:
+                nvcc_path = _find_nvcc_on_path()
+                if nvcc_path is not None:
+                    libdir = join(os.path.dirname(nvcc_path), "..", "lib64")
+
+            libptn = "lib%s.a"
+        if libdir is None:
+            raise RuntimeError(
+                "Unable to locate the CUDA SDK installation "
+                "directory, set CUDA library path manually"
+            )
+        return libdir, libptn
+
+    def add_source(self, source, nvcc_options=None, name="kernel.ptx"):
+        ptx = compile(
+            source,
+            nvcc=self.nvcc,
+            options=nvcc_options,
+            keep=self.keep,
+            no_extern_c=self.no_extern_c,
+            arch=self.arch,
+            code=self.code,
+            cache_dir=self.cache_dir,
+            include_dirs=self.include_dirs,
+            target="ptx",
+        )
+        from pycuda.driver import jit_input_type
+
+        self.linker.add_data(ptx, jit_input_type.PTX, name)
+        return self
+
+    def add_data(self, data, input_type, name="unknown"):
+        self.linker.add_data(data, input_type, name)
+        return self
+
+    def add_file(self, filename, input_type):
+        self.linker.add_file(filename, input_type)
+        return self
+
+    def add_stdlib(self, libname):
+        if self.libdir is None:
+            self.libdir, self.libptn = self._locate_cuda_libdir()
+        from os.path import isfile, join
+
+        libpath = join(self.libdir, self.libptn % libname)
+        if not isfile(libpath):
+            raise OSError('CUDA SDK library file "%s" not found' % libpath)
+        from pycuda.driver import jit_input_type
+
+        self.linker.add_file(libpath, jit_input_type.LIBRARY)
+        return self
+
+    def link(self):
+        self.module = self.linker.link_module()
+        self.linker = None
+        self._bind_module()
+        return self
+
+
+class DynamicSourceModule(DynamicModule):
+    """
+    Creates a Module from a single .cu source object linked against the
+    dynamic CUDA runtime.
+    - compiler generates PTX relocatable device code (rdc) from source that
+      can be linked with other relocatable device code
+    - source is linked against the CUDA device runtime library cudadevrt
+    - library cudadevrt is statically linked into the generated Module
+    """
+
+    def __init__(
+        self,
+        source,
+        nvcc="nvcc",
+        options=None,
+        keep=False,
+        no_extern_c=False,
+        arch=None,
+        code=None,
+        cache_dir=None,
+        include_dirs=[],
+        cuda_libdir=None,
+    ):
+        super().__init__(
+            nvcc=nvcc,
+            link_options=None,
+            keep=keep,
+            no_extern_c=no_extern_c,
+            arch=arch,
+            code=code,
+            cache_dir=cache_dir,
+            include_dirs=include_dirs,
+            cuda_libdir=cuda_libdir,
+        )
+        if options is None:
+            options = DEFAULT_NVCC_FLAGS
+        options = options[:]
+        if "-rdc=true" not in options:
+            options.append("-rdc=true")
+        if "-lcudadevrt" not in options:
+            options.append("-lcudadevrt")
+        self.add_source(source, nvcc_options=options)
+        # self.add_stdlib("cudadevrt")
+        self.link()