#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Numba documentation build configuration file, created by
# sphinx-quickstart on Tue Dec 30 11:55:40 2014.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
import sys
import os
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
try:
    # Numba is installed
    import numba
except ImportError:
    # Numba is run from its source checkout
    sys.path.insert(0, os.path.abspath('../..'))
    import numba

on_rtd = os.environ.get('READTHEDOCS') == 'True'
if on_rtd:
    # The following is needed to fix RTD issue with numpydoc
    # https://github.com/readthedocs/sphinx_rtd_theme/issues/766
    from conda.cli.python_api import run_command as conda_cmd
    conda_cmd("install", "-c", "conda-forge", "sphinx_rtd_theme>=0.5.1", "-y")
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.intersphinx',
'sphinx.ext.todo',
#'sphinx.ext.mathjax',
'sphinx.ext.autodoc',
#'sphinx.ext.graphviz',
'numpydoc',
]
# Adding the github files extension
sys.path.append(os.path.abspath(os.path.join(".", "_ext")))
extensions.append('ghfiles')
todo_include_todos = True
# Add any paths that contain templates here, relative to this directory.
templates_path = ['../_templates']
# The suffix of source filenames.
source_suffix = '.rst'
# The encoding of source files.
#source_encoding = 'utf-8-sig'
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = u'Numba'
copyright = u'2012-2020, Anaconda, Inc. and others'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
version = '.'.join(numba.__version__.split('.')[:2])
# The full version, including alpha/beta/rc tags.
release = numba.__version__
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#language = None
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = []
# The reST default role (used for this markup: `text`) to use for all
# documents.
#default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []
# If true, keep warnings as "system message" paragraphs in the built documents.
#keep_warnings = False
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = 'sphinx_rtd_theme'
# All sphinx_rtd_theme options. Default values commented out; uncomment to
# change.
html_theme_options = {
'canonical_url': 'https://numba.readthedocs.io/en/stable/',
# 'logo_only': False,
# 'display_version': True,
# 'prev_next_buttons_location': 'bottom',
'style_external_links': True,
# 'vcs_pageview_mode': '',
'style_nav_header_background': '#00A3E0',
# Toc options
'collapse_navigation': False,
# 'sticky_navigation': True,
# 'navigation_depth': 4,
# 'includehidden': True,
# 'titles_only': False
}
# Add any paths that contain custom themes here, relative to this directory.
#html_theme_path = None
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
#html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
#html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
html_logo = "../_static/numba-white-icon-rgb.svg"
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
html_favicon = '../_static/numba-blue-icon-rgb.svg'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['../_static']
# Add any extra paths that contain custom files (such as robots.txt or
# .htaccess) here, relative to this directory. These files are copied
# directly to the root of the documentation.
#html_extra_path = []
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
#html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names to
# template names.
#html_additional_pages = {}
# If false, no module index is generated.
#html_domain_indices = True
# If false, no index is generated.
#html_use_index = True
# If true, the index is split into individual pages for each letter.
#html_split_index = False
# If true, links to the reST sources are added to the pages.
#html_show_sourcelink = True
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
#html_show_sphinx = True
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
#html_show_copyright = True
# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
#html_use_opensearch = ''
# This is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = None
# Output file base name for HTML help builder.
htmlhelp_basename = 'Numbadoc'
# -- Options for LaTeX output ---------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#'preamble': '',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
('index', 'numba.tex', u'Numba Documentation',
u'Anaconda', 'manual'),
]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
#latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#latex_use_parts = False
# If true, show page references after internal links.
#latex_show_pagerefs = False
# If true, show URL addresses after external links.
#latex_show_urls = False
# Documents to append as an appendix to all manuals.
#latex_appendices = []
# If false, no module index is generated.
#latex_domain_indices = True
# -- Options for manual page output ---------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
('index', 'numba', 'Numba Documentation',
['Anaconda'], 1)
]
# If true, show URL addresses after external links.
#man_show_urls = False
# -- Options for Texinfo output -------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
('index', 'Numba', 'Numba Documentation',
'Anaconda', 'Numba', 'One line description of project.',
'Miscellaneous'),
]
# Documents to append as an appendix to all manuals.
#texinfo_appendices = []
# If false, no module index is generated.
#texinfo_domain_indices = True
# How to display URL addresses: 'footnote', 'no', or 'inline'.
#texinfo_show_urls = 'footnote'
# If true, do not generate a @detailmenu in the "Top" node's menu.
#texinfo_no_detailmenu = False
# Configuration for intersphinx: refer to the Python standard library
# and the Numpy documentation.
intersphinx_mapping = {
'python': ('https://docs.python.org/3', None),
'numpy': ('https://numpy.org/doc/stable/', None),
'llvmlite': ('https://llvmlite.readthedocs.io/en/latest/', None),
}
# numpydoc options
# To silence "WARNING: toctree contains reference to nonexisting document"
numpydoc_show_class_members = False
# -- Custom autogeneration ------------------------------------------------
def _autogenerate():
    from numba.scripts.generate_lower_listing import gen_lower_listing
    from numba.misc.help.inspector import write_listings

    basedir = os.path.dirname(__file__)
    gen_lower_listing(os.path.join(basedir,
                                   'developer/autogen_lower_listing.rst'))
    # Run inspector on supported packages
    for package in ['builtins', 'math', 'cmath', 'numpy']:
        write_listings(
            package_name=package,
            filename=os.path.join(
                basedir, 'developer', 'autogen_{}_listing'.format(package),
            ),
            output_format='rst',
        )

_autogenerate()

def setup(app):
    app.add_css_file('rtd-overrides.css')
CUDA Host API
=============
Device Management
-----------------
Device detection and enquiry
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The following functions are available for querying the available hardware:
.. autofunction:: numba.cuda.is_available
.. autofunction:: numba.cuda.detect
Context management
~~~~~~~~~~~~~~~~~~
CUDA Python functions execute within a CUDA context. Each CUDA device in a
system has an associated CUDA context, and Numba presently allows only one context
per thread. For further details on CUDA Contexts, refer to the `CUDA Driver API
Documentation on Context Management
<http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html>`_ and the
`CUDA C Programming Guide Context Documentation
<http://docs.nvidia.com/cuda/cuda-c-programming-guide/#context>`_. CUDA Contexts
are instances of the :class:`~numba.cuda.cudadrv.driver.Context` class:
.. autoclass:: numba.cuda.cudadrv.driver.Context
:members: reset, get_memory_info, push, pop
The following functions can be used to get or select the context:
.. autofunction:: numba.cuda.current_context
.. autofunction:: numba.cuda.require_context
The following functions affect the current context:
.. autofunction:: numba.cuda.synchronize
.. autofunction:: numba.cuda.close
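The following is a minimal, illustrative sketch of how these functions fit
together, assuming a CUDA-capable device is present:
.. code-block:: python

    from numba import cuda

    if cuda.is_available():
        cuda.detect()                    # print a summary of the detected devices
        ctx = cuda.current_context()     # get (creating if needed) the context
        meminfo = ctx.get_memory_info()  # free and total device memory, in bytes
        print(meminfo.free, meminfo.total)
        cuda.synchronize()               # wait for all work in the current context
        cuda.close()                     # tear down the current thread's context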
Device management
~~~~~~~~~~~~~~~~~
Numba maintains a list of supported CUDA-capable devices:
.. attribute:: numba.cuda.gpus
An indexable list of supported CUDA devices. This list is indexed by integer
device ID.
Alternatively, the current device can be obtained:
.. function:: numba.cuda.gpus.current
Return the currently-selected device.
Getting a device through :attr:`numba.cuda.gpus` always provides an instance of
:class:`numba.cuda.cudadrv.devices._DeviceContextManager`, which acts as a
context manager for the selected device:
.. autoclass:: numba.cuda.cudadrv.devices._DeviceContextManager
One may also select a context and device or get the current device using the
following three functions:
.. autofunction:: numba.cuda.select_device
.. autofunction:: numba.cuda.get_current_device
.. autofunction:: numba.cuda.list_devices
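As an illustrative sketch (device ``0`` is assumed to exist), a device can be
selected either with the context manager obtained by indexing
:attr:`numba.cuda.gpus`, or with :func:`numba.cuda.select_device`:
.. code-block:: python

    from numba import cuda

    # Temporarily make device 0 the active device; the previously selected
    # device is restored when the block exits.
    with cuda.gpus[0]:
        dev = cuda.get_current_device()
        print(dev.id, dev.name, dev.compute_capability)

    # Alternatively, select a device for the remainder of the thread's work,
    # and list all supported devices.
    cuda.select_device(0)
    for d in cuda.list_devices():
        print(d.id, d.name)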
The :class:`numba.cuda.cudadrv.driver.Device` class can be used to enquire about
the functionality of the selected device:
.. class:: numba.cuda.cudadrv.driver.Device
The device associated with a particular context.
.. attribute:: compute_capability
A tuple, *(major, minor)* indicating the supported compute capability.
.. attribute:: id
The integer ID of the device.
.. attribute:: name
The name of the device (e.g. "GeForce GTX 970").
.. attribute:: uuid
The UUID of the device (e.g. "GPU-e6489c45-5b68-3b03-bab7-0e7c8e809643").
.. method:: reset
Delete the context for the device. This will destroy all memory
allocations, events, and streams created within the context.
.. attribute:: supports_float16
Return ``True`` if the device supports float16 operations, ``False``
otherwise.
Compilation
-----------
Numba provides an entry point for compiling a Python function to PTX without
invoking any of the driver API. This can be useful for:
- Generating PTX that is to be inlined into other PTX code (e.g. from outside
the Numba / Python ecosystem).
- Generating code when there is no device present.
- Generating code prior to a fork without initializing CUDA.
.. note:: It is the user's responsibility to manage any ABI issues arising from
the use of compilation to PTX.
.. autofunction:: numba.cuda.compile_ptx
The environment variable ``NUMBA_CUDA_DEFAULT_PTX_CC`` can be set to control
the default compute capability targeted by ``compile_ptx`` - see
:ref:`numba-envvars-gpu-support`. If PTX for the compute capability of the
current device is required, the ``compile_ptx_for_current_device`` function can
be used:
.. autofunction:: numba.cuda.compile_ptx_for_current_device
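For example, the following sketch compiles a simple elementwise function to
PTX for an assumed compute capability of 7.0, without requiring a device to be
present (the function and its signature are illustrative):
.. code-block:: python

    from numba import cuda, float32

    def axpy(r, a, x, y):
        i = cuda.grid(1)
        if i < r.size:
            r[i] = a * x[i] + y[i]

    # The second argument is the tuple of kernel argument types.
    ptx, resty = cuda.compile_ptx(
        axpy, (float32[:], float32, float32[:], float32[:]), cc=(7, 0))
    print(ptx[:200])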
Measurement
-----------
.. _cuda-profiling:
Profiling
~~~~~~~~~
The NVidia Visual Profiler can be used directly on executing CUDA Python code -
it is not a requirement to insert calls to these functions into user code.
However, these functions can be used to allow profiling to be performed
selectively on specific portions of the code. For further information on
profiling, see the `NVidia Profiler User's Guide
<https://docs.nvidia.com/cuda/profiler-users-guide/>`_.
.. autofunction:: numba.cuda.profile_start
.. autofunction:: numba.cuda.profile_stop
.. autofunction:: numba.cuda.profiling
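A sketch of selectively profiling a single kernel launch (the kernel and
launch configuration are illustrative):
.. code-block:: python

    import numpy as np
    from numba import cuda

    @cuda.jit
    def increment(x):
        i = cuda.grid(1)
        if i < x.size:
            x[i] += 1

    arr = np.zeros(1024, dtype=np.float32)

    # Only work enqueued inside the context manager is captured by the profiler.
    with cuda.profiling():
        increment[4, 256](arr)
    cuda.synchronize()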
.. _events:
Events
~~~~~~
Events can be used to monitor the progress of execution and to record the
timestamps of specific points being reached. Event creation returns immediately,
and the created event can be queried to determine if it has been reached. For
further information, see the `CUDA C Programming Guide Events section
<http://docs.nvidia.com/cuda/cuda-c-programming-guide/#events>`_.
The following functions are used for creating and measuring the time between
events:
.. autofunction:: numba.cuda.event
.. autofunction:: numba.cuda.event_elapsed_time
Events are instances of the :class:`numba.cuda.cudadrv.driver.Event` class:
.. autoclass:: numba.cuda.cudadrv.driver.Event
:members: query, record, synchronize, wait
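As an illustrative sketch, events can be used to time a kernel launch (the
kernel itself is a placeholder):
.. code-block:: python

    import numpy as np
    from numba import cuda

    @cuda.jit
    def scale(x, factor):
        i = cuda.grid(1)
        if i < x.size:
            x[i] *= factor

    arr = cuda.to_device(np.ones(1 << 20, dtype=np.float32))

    start = cuda.event()
    end = cuda.event()

    start.record()
    scale[4096, 256](arr, 2.0)
    end.record()
    end.synchronize()
    print("elapsed (ms):", cuda.event_elapsed_time(start, end))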
.. _streams:
Stream Management
-----------------
Streams allow concurrency of execution on a single device within a given
context. Queued work items in the same stream execute sequentially, but work
items in different streams may execute concurrently. Most operations involving a
CUDA device can be performed asynchronously using streams, including data
transfers and kernel execution. For further details on streams, see the `CUDA C
Programming Guide Streams section
<http://docs.nvidia.com/cuda/cuda-c-programming-guide/#streams>`_.
Numba defaults to using the legacy default stream as the default stream. The
per-thread default stream can be made the default stream by setting the
environment variable ``NUMBA_CUDA_PER_THREAD_DEFAULT_STREAM`` to ``1`` (see the
:ref:`CUDA Environment Variables section <numba-envvars-gpu-support>`).
Regardless of this setting, the objects representing the legacy and per-thread
default streams can be constructed using the functions below.
Streams are instances of :class:`numba.cuda.cudadrv.driver.Stream`:
.. autoclass:: numba.cuda.cudadrv.driver.Stream
:members: synchronize, auto_synchronize, add_callback, async_done
To create a new stream:
.. autofunction:: numba.cuda.stream
To get the default stream:
.. autofunction:: numba.cuda.default_stream
To get the default stream with an explicit choice of whether it is the legacy
or per-thread default stream:
.. autofunction:: numba.cuda.legacy_default_stream
.. autofunction:: numba.cuda.per_thread_default_stream
To construct a Numba ``Stream`` object using a stream allocated elsewhere, the
``external_stream`` function is provided. Note that the lifetime of external
streams must be managed by the user - Numba will not deallocate an external
stream, and the stream must remain valid whilst the Numba ``Stream`` object is
in use.
.. autofunction:: numba.cuda.external_stream
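The following sketch uses a stream to make the transfers and kernel launch
asynchronous with respect to the host, and ``auto_synchronize()`` to wait for
them on exit (the kernel is illustrative):
.. code-block:: python

    import numpy as np
    from numba import cuda

    @cuda.jit
    def double(x):
        i = cuda.grid(1)
        if i < x.size:
            x[i] *= 2

    stream = cuda.stream()
    host = np.arange(1024, dtype=np.float32)

    # All work queued on the stream has completed when the block exits.
    with stream.auto_synchronize():
        dev = cuda.to_device(host, stream=stream)
        double[4, 256, stream](dev)
        dev.copy_to_host(host, stream=stream)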
Runtime
-------
Numba generally uses the Driver API, but it provides a simple wrapper to the
Runtime API so that the version of the runtime in use can be queried. This is
accessed through ``cuda.runtime``, which is an instance of the
:class:`numba.cuda.cudadrv.runtime.Runtime` class:
.. autoclass:: numba.cuda.cudadrv.runtime.Runtime
:members: get_version, is_supported_version, supported_versions
Whether the current runtime is officially supported and tested with the current
version of Numba can also be queried:
.. autofunction:: numba.cuda.is_supported_version
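For example (the printed values are illustrative):
.. code-block:: python

    from numba import cuda

    print(cuda.runtime.get_version())    # e.g. (11, 2)
    print(cuda.is_supported_version())   # True if this runtime is tested with Numba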
CUDA Python Reference
=====================
.. toctree::
host.rst
kernel.rst
types.rst
memory.rst
libdevice.rst
CUDA Kernel API
===============
Kernel declaration
------------------
The ``@cuda.jit`` decorator is used to create a CUDA dispatcher object that can
be configured and launched:
.. autofunction:: numba.cuda.jit
Dispatcher objects
------------------
The usual syntax for configuring a Dispatcher with a launch configuration uses
subscripting, with the arguments being as in the following:
.. code-block:: python
# func is some function decorated with @cuda.jit
func[griddim, blockdim, stream, sharedmem]
The ``griddim`` and ``blockdim`` arguments specify the size of the grid and
thread blocks, and may be either integers or tuples of length up to 3. The
``stream`` parameter is an optional stream on which the kernel will be launched,
and the ``sharedmem`` parameter specifies the size of dynamic shared memory in
bytes.
Subscripting the Dispatcher returns a configuration object that can be called
with the kernel arguments:
.. code-block:: python
configured = func[griddim, blockdim, stream, sharedmem]
configured(x, y, z)
However, it is more idiomatic to configure and call the kernel within a single
statement:
.. code-block:: python
func[griddim, blockdim, stream, sharedmem](x, y, z)
This is similar to launch configuration in CUDA C/C++:
.. code-block:: cuda
func<<<griddim, blockdim, sharedmem, stream>>>(x, y, z)
.. note:: The order of ``stream`` and ``sharedmem`` are reversed in Numba
compared to in CUDA C/C++.
Dispatcher objects also provide several utility methods for inspection and
creating a specialized instance:
.. autoclass:: numba.cuda.dispatcher.CUDADispatcher
:members: inspect_asm, inspect_llvm, inspect_sass, inspect_types,
get_regs_per_thread, specialize, specialized, extensions, forall,
get_shared_mem_per_block, get_max_threads_per_block,
get_const_mem_size, get_local_mem_per_thread
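As a sketch of typical dispatcher usage (the kernel and data are illustrative),
``forall()`` can be used to pick a launch configuration automatically for a 1D
workload, and the ``inspect_*`` methods show the generated code:
.. code-block:: python

    import numpy as np
    from numba import cuda

    @cuda.jit
    def add(r, x, y):
        i = cuda.grid(1)
        if i < r.size:
            r[i] = x[i] + y[i]

    x = np.arange(1024, dtype=np.float32)
    y = 2 * x
    r = np.zeros_like(x)

    # forall() chooses the grid and block dimensions for a 1D task count.
    add.forall(r.size)(r, x, y)

    # Show the Numba typing of the compiled specializations.
    add.inspect_types()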
Intrinsic Attributes and Functions
----------------------------------
The remainder of the attributes and functions in this section may only be called
from within a CUDA Kernel.
Thread Indexing
~~~~~~~~~~~~~~~
.. attribute:: numba.cuda.threadIdx
The thread indices in the current thread block, accessed through the
attributes ``x``, ``y``, and ``z``. Each index is an integer spanning the
range from 0 inclusive to the corresponding value of the attribute in
:attr:`numba.cuda.blockDim` exclusive.
.. attribute:: numba.cuda.blockIdx
The block indices in the grid of thread blocks, accessed through the
attributes ``x``, ``y``, and ``z``. Each index is an integer spanning the
range from 0 inclusive to the corresponding value of the attribute in
:attr:`numba.cuda.gridDim` exclusive.
.. attribute:: numba.cuda.blockDim
The shape of a block of threads, as declared when instantiating the
kernel. This value is the same for all threads in a given kernel, even
if they belong to different blocks (i.e. each block is "full").
.. attribute:: numba.cuda.gridDim
The shape of the grid of blocks, accessed through the attributes ``x``,
``y``, and ``z``.
.. attribute:: numba.cuda.laneid
The thread index in the current warp, as an integer spanning the range
from 0 inclusive to the :attr:`numba.cuda.warpsize` exclusive.
.. attribute:: numba.cuda.warpsize
The size in threads of a warp on the GPU. Currently this is always 32.
.. function:: numba.cuda.grid(ndim)
Return the absolute position of the current thread in the entire
grid of blocks. *ndim* should correspond to the number of dimensions
declared when instantiating the kernel. If *ndim* is 1, a single integer
is returned. If *ndim* is 2 or 3, a tuple of the given number of
integers is returned.
Computation of the first integer is as follows::
cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
and is similar for the other two indices, but using the ``y`` and ``z``
attributes.
.. function:: numba.cuda.gridsize(ndim)
Return the absolute size (or shape) in threads of the entire grid of
blocks. *ndim* should correspond to the number of dimensions declared when
instantiating the kernel.
Computation of the first integer is as follows::
cuda.blockDim.x * cuda.gridDim.x
and is similar for the other two indices, but using the ``y`` and ``z``
attributes.
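Putting these together, a typical pattern (shown here as an illustrative
sketch) is a grid-stride loop that covers an array of arbitrary size:
.. code-block:: python

    import numpy as np
    from numba import cuda

    @cuda.jit
    def copy_elements(dst, src):
        # Same as cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
        i = cuda.grid(1)
        # Total number of threads in the grid
        stride = cuda.gridsize(1)
        while i < src.size:
            dst[i] = src[i]
            i += stride

    src = np.arange(10000, dtype=np.float32)
    dst = np.zeros_like(src)
    copy_elements[32, 128](dst, src)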
Memory Management
~~~~~~~~~~~~~~~~~
.. function:: numba.cuda.shared.array(shape, dtype)
Creates an array in the shared memory space of the CUDA kernel with
the given ``shape`` and ``dtype``.
Returns an array with its content uninitialized.
.. note:: All threads in the same thread block see the same array.
.. function:: numba.cuda.local.array(shape, dtype)
Creates an array in the local memory space of the CUDA kernel with the
given ``shape`` and ``dtype``.
Returns an array with its content uninitialized.
.. note:: Each thread sees a unique array.
.. function:: numba.cuda.const.array_like(ary)
Copies the ``ary`` into constant memory space on the CUDA kernel at compile
time.
Returns an array like the ``ary`` argument.
.. note:: All threads and blocks see the same array.
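For example, a per-block sum using a shared array might look like the
following sketch (the block size and input data are illustrative):
.. code-block:: python

    import numpy as np
    from numba import cuda, float32

    TPB = 16  # threads per block; must be a compile-time constant

    @cuda.jit
    def block_sums(out, data):
        # One shared array per thread block, visible to all of its threads.
        tmp = cuda.shared.array(shape=TPB, dtype=float32)
        tx = cuda.threadIdx.x
        i = cuda.grid(1)
        tmp[tx] = data[i] if i < data.size else 0.0
        cuda.syncthreads()
        if tx == 0:
            s = 0.0
            for j in range(TPB):
                s += tmp[j]
            out[cuda.blockIdx.x] = s

    data = np.ones(256, dtype=np.float32)
    out = np.zeros(256 // TPB, dtype=np.float32)
    block_sums[256 // TPB, TPB](out, data)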
Synchronization and Atomic Operations
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. function:: numba.cuda.atomic.add(array, idx, value)
Perform ``array[idx] += value``. Supports int32, int64, float32 and
float64 only. The ``idx`` argument can be an integer or a tuple of integer
indices for indexing into multi-dimensional arrays. The number of elements
in ``idx`` must match the number of dimensions of ``array``.
Returns the value of ``array[idx]`` before storing the new value.
Behaves like an atomic load.
.. function:: numba.cuda.atomic.sub(array, idx, value)
Perform ``array[idx] -= value``. Supports int32, int64, float32 and
float64 only. The ``idx`` argument can be an integer or a tuple of integer
indices for indexing into multi-dimensional arrays. The number of elements
in ``idx`` must match the number of dimensions of ``array``.
Returns the value of ``array[idx]`` before storing the new value.
Behaves like an atomic load.
.. function:: numba.cuda.atomic.and_(array, idx, value)
Perform ``array[idx] &= value``. Supports int32, uint32, int64,
and uint64 only. The ``idx`` argument can be an integer or a tuple of
integer indices for indexing into multi-dimensional arrays. The number
of elements in ``idx`` must match the number of dimensions of ``array``.
Returns the value of ``array[idx]`` before storing the new value.
Behaves like an atomic load.
.. function:: numba.cuda.atomic.or_(array, idx, value)
Perform ``array[idx] |= value``. Supports int32, uint32, int64,
and uint64 only. The ``idx`` argument can be an integer or a tuple of
integer indices for indexing into multi-dimensional arrays. The number
of elements in ``idx`` must match the number of dimensions of ``array``.
Returns the value of ``array[idx]`` before storing the new value.
Behaves like an atomic load.
.. function:: numba.cuda.atomic.xor(array, idx, value)
Perform ``array[idx] ^= value``. Supports int32, uint32, int64,
and uint64 only. The ``idx`` argument can be an integer or a tuple of
integer indices for indexing into multi-dimensional arrays. The number
of elements in ``idx`` must match the number of dimensions of ``array``.
Returns the value of ``array[idx]`` before storing the new value.
Behaves like an atomic load.
.. function:: numba.cuda.atomic.exch(array, idx, value)
Perform ``array[idx] = value``. Supports int32, uint32, int64,
and uint64 only. The ``idx`` argument can be an integer or a tuple of
integer indices for indexing into multi-dimensional arrays. The number
of elements in ``idx`` must match the number of dimensions of ``array``.
Returns the value of ``array[idx]`` before storing the new value.
Behaves like an atomic load.
.. function:: numba.cuda.atomic.inc(array, idx, value)
Perform ``array[idx] = (0 if array[idx] >= value else array[idx] + 1)``.
Supports uint32, and uint64 only. The ``idx`` argument can be an integer
or a tuple of integer indices for indexing into multi-dimensional arrays.
The number of elements in ``idx`` must match the number of dimensions of
``array``.
Returns the value of ``array[idx]`` before storing the new value.
Behaves like an atomic load.
.. function:: numba.cuda.atomic.dec(array, idx, value)
Perform ``array[idx] =
(value if (array[idx] == 0) or (array[idx] > value) else array[idx] - 1)``.
Supports uint32, and uint64 only. The ``idx`` argument can be an integer
or a tuple of integer indices for indexing into multi-dimensional arrays.
The number of elements in ``idx`` must match the number of dimensions of
``array``.
Returns the value of ``array[idx]`` before storing the new value.
Behaves like an atomic load.
.. function:: numba.cuda.atomic.max(array, idx, value)
Perform ``array[idx] = max(array[idx], value)``. Supports int32, int64,
float32 and float64 only. The ``idx`` argument can be an integer or a
tuple of integer indices for indexing into multi-dimensional arrays.
The number of elements in ``idx`` must match the number of dimensions of
``array``.
Returns the value of ``array[idx]`` before storing the new value.
Behaves like an atomic load.
.. function:: numba.cuda.atomic.cas(array, idx, old, value)
Perform ``if array[idx] == old: array[idx] = value``. Supports int32,
int64, uint32, uint64 indexes only. The ``idx`` argument can be an integer
or a tuple of integer indices for indexing into multi-dimensional arrays.
The number of elements in ``idx`` must match the number of dimensions of
``array``.
Returns the value of ``array[idx]`` before storing the new value.
Behaves like an atomic compare and swap.
.. function:: numba.cuda.syncthreads
Synchronize all threads in the same thread block. This function implements
the same pattern as barriers in traditional multi-threaded programming: this
function waits until all threads in the block call it, at which point it
returns control to all its callers.
.. function:: numba.cuda.syncthreads_count(predicate)
An extension to :attr:`numba.cuda.syncthreads` where the return value is a count
of the threads where ``predicate`` is true.
.. function:: numba.cuda.syncthreads_and(predicate)
An extension to :attr:`numba.cuda.syncthreads` where 1 is returned if ``predicate`` is
true for all threads or 0 otherwise.
.. function:: numba.cuda.syncthreads_or(predicate)
An extension to :attr:`numba.cuda.syncthreads` where 1 is returned if ``predicate`` is
true for any thread or 0 otherwise.
.. warning:: All syncthreads functions must be called by every thread in the
thread-block. Failing to do so may result in undefined behavior.
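The sketch below uses :func:`numba.cuda.atomic.add` to build a histogram, a
typical use of atomics where many threads update the same counters (the bin
count and input data are illustrative):
.. code-block:: python

    import numpy as np
    from numba import cuda

    @cuda.jit
    def histogram(bins, data, nbins):
        i = cuda.grid(1)
        if i < data.size:
            b = int(data[i] * nbins)
            if b >= nbins:
                b = nbins - 1
            # Many threads may hit the same bin; the update must be atomic.
            cuda.atomic.add(bins, b, 1)

    data = np.random.random(10000).astype(np.float32)
    bins = np.zeros(16, dtype=np.int32)
    histogram[40, 256](bins, data, 16)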
Cooperative Groups
~~~~~~~~~~~~~~~~~~
.. function:: numba.cuda.cg.this_grid()
Get the current grid group.
:return: The current grid group
:rtype: numba.cuda.cg.GridGroup
.. class:: numba.cuda.cg.GridGroup
A grid group. Users should not construct a GridGroup directly - instead, get
the current grid group using :func:`cg.this_grid() <numba.cuda.cg.this_grid>`.
.. method:: sync()
Synchronize the current grid group.
Memory Fences
~~~~~~~~~~~~~
The memory fences are used to guarantee that the effects of memory operations
are visible to other threads within the same thread-block, the same GPU device,
and the same system (across GPUs on global memory). Memory loads and stores
are guaranteed not to move across the memory fences by optimization passes.
.. warning:: The memory fences are considered to be an advanced API and most
use cases should use the thread barrier (e.g. ``syncthreads()``).
.. function:: numba.cuda.threadfence
A memory fence at device level (within the GPU).
.. function:: numba.cuda.threadfence_block
A memory fence at thread block level.
.. function:: numba.cuda.threadfence_system
A memory fence at system level (across GPUs).
Warp Intrinsics
~~~~~~~~~~~~~~~
The argument ``membermask`` is a 32 bit integer mask with each bit
corresponding to a thread in the warp, with 1 meaning the thread is in the
subset of threads within the function call. The ``membermask`` must be all 1 if
the GPU compute capability is below 7.x.
.. function:: numba.cuda.syncwarp(membermask)
Synchronize a masked subset of the threads in a warp.
.. function:: numba.cuda.all_sync(membermask, predicate)
If the ``predicate`` is true for all threads in the masked warp, then
a non-zero value is returned, otherwise 0 is returned.
.. function:: numba.cuda.any_sync(membermask, predicate)
If the ``predicate`` is true for any thread in the masked warp, then
a non-zero value is returned, otherwise 0 is returned.
.. function:: numba.cuda.eq_sync(membermask, predicate)
If the boolean ``predicate`` is the same for all threads in the masked warp,
then a non-zero value is returned, otherwise 0 is returned.
.. function:: numba.cuda.ballot_sync(membermask, predicate)
Returns a mask of all threads in the warp whose ``predicate`` is true,
and are within the given mask.
.. function:: numba.cuda.shfl_sync(membermask, value, src_lane)
Shuffles ``value`` across the masked warp and returns the ``value``
from ``src_lane``. If this is outside the warp, then the
given ``value`` is returned.
.. function:: numba.cuda.shfl_up_sync(membermask, value, delta)
Shuffles ``value`` across the masked warp and returns the ``value``
from ``laneid - delta``. If this is outside the warp, then the
given ``value`` is returned.
.. function:: numba.cuda.shfl_down_sync(membermask, value, delta)
Shuffles ``value`` across the masked warp and returns the ``value``
from ``laneid + delta``. If this is outside the warp, then the
given ``value`` is returned.
.. function:: numba.cuda.shfl_xor_sync(membermask, value, lane_mask)
Shuffles ``value`` across the masked warp and returns the ``value``
from ``laneid ^ lane_mask``.
.. function:: numba.cuda.match_any_sync(membermask, value, lane_mask)
Returns a mask of the threads in the masked warp that have the same
``value`` as the given ``value``.
.. function:: numba.cuda.match_all_sync(membermask, value, lane_mask)
Returns a tuple of ``(mask, pred)``, where ``mask`` is a mask of the threads
in the masked warp that have the same ``value`` as the given ``value`` if
they all have the same value, and 0 otherwise; ``pred`` is a boolean
indicating whether all threads in the masked warp have the same ``value``.
.. function:: numba.cuda.activemask()
Returns a 32-bit integer mask of all currently active threads in the
calling warp. The Nth bit is set if the Nth lane in the warp is active when
activemask() is called. Inactive threads are represented by 0 bits in the
returned mask. Threads which have exited the kernel are always marked as
inactive.
.. function:: numba.cuda.lanemask_lt()
Returns a 32-bit integer mask of all lanes (including inactive ones) with
ID less than the current lane.
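An illustrative sketch of a warp-level broadcast using ``shfl_sync``, where
every thread reads the value held by lane 0 of its warp (the mask assumes all
32 lanes participate):
.. code-block:: python

    import numpy as np
    from numba import cuda

    @cuda.jit
    def warp_broadcast(out, data):
        i = cuda.grid(1)
        if i < data.size:
            # Every participating lane receives the value from lane 0.
            out[i] = cuda.shfl_sync(0xFFFFFFFF, data[i], 0)

    data = np.arange(64, dtype=np.int32)
    out = np.zeros_like(data)
    warp_broadcast[2, 32](out, data)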
Integer Intrinsics
~~~~~~~~~~~~~~~~~~
A subset of the CUDA Math API's integer intrinsics are available. For further
documentation, including semantics, please refer to the `CUDA Toolkit
documentation
<https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__INT.html>`_.
.. function:: numba.cuda.popc(x)
Returns the number of bits set in ``x``.
.. function:: numba.cuda.brev(x)
Returns the reverse of the bit pattern of ``x``. For example, ``0b10110110``
becomes ``0b01101101``.
.. function:: numba.cuda.clz(x)
Returns the number of leading zeros in ``x``.
.. function:: numba.cuda.ffs(x)
Returns the position of the first (least significant) bit set to 1 in ``x``,
where the least significant bit position is 1. ``ffs(0)`` returns 0.
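For example (an illustrative sketch):
.. code-block:: python

    import numpy as np
    from numba import cuda

    @cuda.jit
    def bit_stats(popcounts, leading_zeros, values):
        i = cuda.grid(1)
        if i < values.size:
            popcounts[i] = cuda.popc(values[i])      # number of set bits
            leading_zeros[i] = cuda.clz(values[i])   # leading zero count

    values = np.array([1, 2, 255, 65536], dtype=np.uint32)
    popcounts = np.zeros_like(values)
    leading_zeros = np.zeros_like(values)
    bit_stats[1, 32](popcounts, leading_zeros, values)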
Floating Point Intrinsics
~~~~~~~~~~~~~~~~~~~~~~~~~
A subset of the CUDA Math API's floating point intrinsics are available. For further
documentation, including semantics, please refer to the `single
<https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__SINGLE.html>`_ and
`double <https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__DOUBLE.html>`_
precision parts of the CUDA Toolkit documentation.
.. function:: numba.cuda.fma
Perform the fused multiply-add operation. Named after the ``fma`` and ``fmaf`` in
the C api, but maps to the ``fma.rn.f32`` and ``fma.rn.f64`` (round-to-nearest-even)
PTX instructions.
.. function:: numba.cuda.cbrt (x)
Perform the cube root operation, x ** (1/3). Named after the functions
``cbrt`` and ``cbrtf`` in the C api. Supports float32, and float64 arguments
only.
16-bit Floating Point Intrinsics
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The functions in the ``cuda.fp16`` module are used to operate on 16-bit
floating point operands. These functions return a 16-bit floating point result.
To determine whether Numba supports compiling code that uses the ``float16``
type in the current configuration, use:
.. function:: numba.cuda.is_float16_supported ()
Return ``True`` if 16-bit floats are supported, ``False`` otherwise.
To check whether a device supports ``float16``, use its
:attr:`supports_float16 <numba.cuda.cudadrv.driver.Device.supports_float16>`
attribute.
.. function:: numba.cuda.fp16.hfma (a, b, c)
Perform the fused multiply-add operation ``(a * b) + c`` on 16-bit
floating point arguments in round to nearest mode. Maps to the ``fma.rn.f16``
PTX instruction.
Returns the 16-bit floating point result of the fused multiply-add.
.. function:: numba.cuda.fp16.hadd (a, b)
Perform the add operation ``a + b`` on 16-bit floating point arguments in
round to nearest mode. Maps to the ``add.f16`` PTX instruction.
Returns the 16-bit floating point result of the addition.
.. function:: numba.cuda.fp16.hsub (a, b)
Perform the subtract operation ``a - b`` on 16-bit floating point arguments in
round to nearest mode. Maps to the ``sub.f16`` PTX instruction.
Returns the 16-bit floating point result of the subtraction.
.. function:: numba.cuda.fp16.hmul (a, b)
Perform the multiply operation ``a * b`` on 16-bit floating point arguments in
round to nearest mode. Maps to the ``mul.f16`` PTX instruction.
Returns the 16-bit floating point result of the multiplication.
.. function:: numba.cuda.fp16.hdiv (a, b)
Perform the divide operation ``a / b`` on 16-bit floating point arguments in
round to nearest mode.
Returns the 16-bit floating point result of the division.
.. function:: numba.cuda.fp16.hneg (a)
Perform the negation operation ``-a`` on the 16-bit floating point argument.
Maps to the ``neg.f16`` PTX instruction.
Returns the 16-bit floating point result of the negation.
.. function:: numba.cuda.fp16.habs (a)
Perform the absolute value operation ``|a|`` on the 16-bit floating point argument.
Returns the 16-bit floating point result of the absolute value operation.
.. function:: numba.cuda.fp16.hsin (a)
Calculates the trigonometry sine function of the 16-bit floating point argument.
Returns the 16-bit floating point result of the sine operation.
.. function:: numba.cuda.fp16.hcos (a)
Calculates the trigonometry cosine function of the 16-bit floating point argument.
Returns the 16-bit floating point result of the cosine operation.
.. function:: numba.cuda.fp16.hlog (a)
Calculates the natural logarithm of the 16-bit floating point argument.
Returns the 16-bit floating point result of the natural log operation.
.. function:: numba.cuda.fp16.hlog10 (a)
Calculates the base 10 logarithm of the 16-bit floating point argument.
Returns the 16-bit floating point result of the log base 10 operation.
.. function:: numba.cuda.fp16.hlog2 (a)
Calculates the base 2 logarithm on the 16-bit floating point argument.
Returns the 16-bit floating point result of the log base 2 operation.
.. function:: numba.cuda.fp16.hexp (a)
Calculates the natural exponential operation of the 16-bit floating point argument.
Returns the 16-bit floating point result of the exponential operation.
.. function:: numba.cuda.fp16.hexp10 (a)
Calculates the base 10 exponential of the 16-bit floating point argument.
Returns the 16-bit floating point result of the exponential operation.
.. function:: numba.cuda.fp16.hexp2 (a)
Calculates the base 2 exponential of the 16-bit floating point argument.
Returns the 16-bit floating point result of the exponential operation.
.. function:: numba.cuda.fp16.hfloor (a)
Calculates the floor operation, the largest integer less than or equal to ``a``,
on the 16-bit floating point argument.
Returns the 16-bit floating point result of the floor operation.
.. function:: numba.cuda.fp16.hceil (a)
Calculates the ceiling operation, the smallest integer greater than or equal to ``a``,
on the 16-bit floating point argument.
Returns the 16-bit floating point result of the ceil operation.
.. function:: numba.cuda.fp16.hsqrt (a)
Calculates the square root operation of the 16-bit floating point argument.
Returns the 16-bit floating point result of the square root operation.
.. function:: numba.cuda.fp16.hrsqrt (a)
Calculates the reciprocal of the square root of the 16-bit floating point argument.
Returns the 16-bit floating point result of the reciprocal square root operation.
.. function:: numba.cuda.fp16.hrcp (a)
Calculates the reciprocal of the 16-bit floating point argument.
Returns the 16-bit floating point result of the reciprocal.
.. function:: numba.cuda.fp16.hrint (a)
Round the input 16-bit floating point argument to nearest integer value.
Returns the 16-bit floating point result of the rounding.
.. function:: numba.cuda.fp16.htrunc (a)
Truncate the input 16-bit floating point argument to the nearest integer
that does not exceed the input argument in magnitude.
Returns the 16-bit floating point result of the truncation.
.. function:: numba.cuda.fp16.heq (a, b)
Perform the comparison operation ``a == b`` on 16-bit floating point arguments.
Returns a boolean.
.. function:: numba.cuda.fp16.hne (a, b)
Perform the comparison operation ``a != b`` on 16-bit floating point arguments.
Returns a boolean.
.. function:: numba.cuda.fp16.hgt (a, b)
Perform the comparison operation ``a > b`` on 16-bit floating point arguments.
Returns a boolean.
.. function:: numba.cuda.fp16.hge (a, b)
Perform the comparison operation ``a >= b`` on 16-bit floating point arguments.
Returns a boolean.
.. function:: numba.cuda.fp16.hlt (a, b)
Perform the comparison operation ``a < b`` on 16-bit floating point arguments.
Returns a boolean.
.. function:: numba.cuda.fp16.hle (a, b)
Perform the comparison operation ``a <= b`` on 16-bit floating point arguments.
Returns a boolean.
.. function:: numba.cuda.fp16.hmax (a, b)
Perform the operation ``a if a > b else b.``
Returns a 16-bit floating point value.
.. function:: numba.cuda.fp16.hmin (a, b)
Perform the operation ``a if a < b else b.``
Returns a 16-bit floating point value.
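As an illustrative sketch, assuming a device and toolkit that support
``float16`` (checked here with ``cuda.is_float16_supported()``):
.. code-block:: python

    import numpy as np
    from numba import cuda

    @cuda.jit
    def halfadd(r, x, y):
        i = cuda.grid(1)
        if i < r.size:
            # Half-precision addition; maps to the add.f16 PTX instruction.
            r[i] = cuda.fp16.hadd(x[i], y[i])

    if cuda.is_float16_supported():
        x = np.ones(64, dtype=np.float16)
        y = np.ones(64, dtype=np.float16)
        r = np.zeros(64, dtype=np.float16)
        halfadd[1, 64](r, x, y)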
Control Flow Instructions
~~~~~~~~~~~~~~~~~~~~~~~~~
A subset of CUDA's control flow instructions is directly available as
intrinsics. Avoiding branches is a key way to improve CUDA performance, and
using these intrinsics means you don't have to rely on the ``nvcc`` optimizer
identifying and removing branches. For further documentation, including
semantics, please refer to the `relevant CUDA Toolkit documentation
<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions>`_.
.. function:: numba.cuda.selp
Select between two expressions, depending on the value of the first
argument. Similar to LLVM's ``select`` instruction.
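For example, clamping negative values without a branch (an illustrative
sketch):
.. code-block:: python

    import numpy as np
    from numba import cuda

    @cuda.jit
    def clamp_negative(out, data):
        i = cuda.grid(1)
        if i < data.size:
            # Branch-free select: keep the value if positive, else 0.0.
            out[i] = cuda.selp(data[i] > 0.0, data[i], 0.0)

    data = np.array([-1.0, 2.0, -3.0, 4.0])
    out = np.zeros_like(data)
    clamp_negative[1, 32](out, data)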
Timer Intrinsics
~~~~~~~~~~~~~~~~
.. function:: numba.cuda.nanosleep(ns)
Suspends the thread for a sleep duration of approximately ``ns``
nanoseconds.
Libdevice functions
===================
All wrapped libdevice functions are listed in this section. All functions in
libdevice are wrapped, with the exception of ``__nv_nan`` and ``__nv_nanf``.
These functions return a representation of a quiet NaN, but the argument they
take (a pointer to an object specifying the representation) is undocumented, and
follows an unusual form compared to the rest of libdevice - it is not an output
like every other pointer argument. If a NaN is required, one can be obtained in
CUDA Python by other means, e.g. ``math.nan``.
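As an illustrative sketch, a wrapped libdevice function such as
``__nv_fast_sinf`` (exposed as ``libdevice.fast_sinf``) can be called from a
kernel like any other device function:
.. code-block:: python

    import numpy as np
    from numba import cuda
    from numba.cuda import libdevice

    @cuda.jit
    def fast_trig(out, x):
        i = cuda.grid(1)
        if i < x.size:
            out[i] = libdevice.fast_sinf(x[i])

    x = np.linspace(0, 3.14, 64).astype(np.float32)
    out = np.zeros_like(x)
    fast_trig[1, 64](out, x)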
Wrapped functions
-----------------
.. automodule:: numba.cuda.libdevice
:members:
Memory Management
=================
.. autofunction:: numba.cuda.to_device
.. autofunction:: numba.cuda.device_array
.. autofunction:: numba.cuda.device_array_like
.. autofunction:: numba.cuda.pinned_array
.. autofunction:: numba.cuda.pinned_array_like
.. autofunction:: numba.cuda.mapped_array
.. autofunction:: numba.cuda.mapped_array_like
.. autofunction:: numba.cuda.managed_array
.. autofunction:: numba.cuda.pinned
.. autofunction:: numba.cuda.mapped
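A short sketch of how these functions are commonly combined (array contents
and sizes are illustrative):
.. code-block:: python

    import numpy as np
    from numba import cuda

    host = np.arange(10, dtype=np.float32)

    # Explicit copies to the device and back.
    dev = cuda.to_device(host)
    result = dev.copy_to_host()

    # An uninitialized device array with the same shape and dtype as host.
    scratch = cuda.device_array_like(host)

    # Temporarily pin (page-lock) host memory for faster asynchronous transfers.
    with cuda.pinned(host):
        stream = cuda.stream()
        dev2 = cuda.to_device(host, stream=stream)
        stream.synchronize()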
Device Objects
--------------
.. autoclass:: numba.cuda.cudadrv.devicearray.DeviceNDArray
:members: copy_to_device, copy_to_host, is_c_contiguous, is_f_contiguous,
ravel, reshape, split
.. autoclass:: numba.cuda.cudadrv.devicearray.DeviceRecord
:members: copy_to_device, copy_to_host
.. autoclass:: numba.cuda.cudadrv.devicearray.MappedNDArray
:members: copy_to_device, copy_to_host, split
CUDA-Specific Types
====================
.. note::
This page is about types specific to CUDA targets. Many other types are also
available in the CUDA target - see :ref:`cuda-built-in-types`.
Vector Types
~~~~~~~~~~~~
`CUDA Vector Types <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#built-in-vector-types>`_
are usable in kernels. There are two important distinctions from vector types in CUDA C/C++:
First, the recommended names for vector types in Numba CUDA are formatted as ``<base_type>x<N>``,
where ``base_type`` is the base type of the vector, and ``N`` is the number of elements in the vector.
Examples include ``int64x3``, ``uint16x4``, ``float32x4``, etc. For new Numba CUDA kernels,
this is the recommended way to instantiate vector types.
For convenience, users adapting existing kernels from CUDA C/C++ to Python may use
aliases consistent with the C/C++ naming. For example, ``float3`` aliases ``float32x3``,
``long3`` aliases ``int32x3`` or ``int64x3`` (depending on the platform), etc.
Second, unlike CUDA C/C++ where factory functions are used, vector types are constructed directly
with their constructor. For example, to construct a ``float32x3``:
.. code-block:: python3
from numba.cuda import float32x3
# In kernel
f3 = float32x3(0.0, -1.0, 1.0)
Additionally, vector types can be constructed from a combination of vector and
primitive types, as long as the total number of components matches the result
vector type. For example, all of the following constructions are valid:
.. code-block:: python3
zero = uint32(0)
u2 = uint32x2(1, 2)
# Construct a 3-component vector with primitive type and a 2-component vector
u3 = uint32x3(zero, u2)
# Construct a 4-component vector with 2 2-component vectors
u4 = uint32x4(u2, u2)
The 1st, 2nd, 3rd and 4th component of the vector type can be accessed through fields
``x``, ``y``, ``z``, and ``w`` respectively. The components are immutable after
construction in the present version of Numba; it is expected that support for
mutating vector components will be added in a future release.
.. code-block:: python3
v1 = float32x2(1.0, 1.0)
v2 = float32x2(1.0, -1.0)
dotprod = v1.x * v2.x + v1.y * v2.y
CUDA Bindings
=============
Numba supports two bindings to the CUDA Driver APIs: its own internal bindings
based on ctypes, and the official `NVIDIA CUDA Python bindings
<https://nvidia.github.io/cuda-python/>`_. Functionality is equivalent between
the two bindings.
The internal bindings are used by default. If the NVIDIA bindings are installed,
then they can be used by setting the environment variable
``NUMBA_CUDA_USE_NVIDIA_BINDING`` to ``1`` prior to the import of Numba. Once
Numba has been imported, the selected binding cannot be changed.
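For example, a sketch of opting in to the NVIDIA bindings from Python (setting
the variable in the shell before starting the interpreter works equally well):
.. code-block:: python

    import os

    # Must be set before numba is imported.
    os.environ["NUMBA_CUDA_USE_NVIDIA_BINDING"] = "1"

    from numba import cuda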
Per-Thread Default Streams
--------------------------
Responsibility for handling Per-Thread Default Streams (PTDS) is delegated to
the NVIDIA bindings when they are in use. To use PTDS with the NVIDIA bindings,
set the environment variable ``CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM`` to
``1`` instead of Numba's environment variable
:envvar:`NUMBA_CUDA_PER_THREAD_DEFAULT_STREAM`.
.. seealso::
The `Default Stream section
<https://nvidia.github.io/cuda-python/release/11.6.0-notes.html#default-stream>`_
in the NVIDIA Bindings documentation.
Roadmap
-------
In Numba 0.56, the NVIDIA Bindings will be used by default, if they are
installed.
In future versions of Numba:
- The internal bindings will be deprecated.
- The internal bindings will be removed.
At present, no specific release is planned for the deprecation or removal of
the internal bindings.
On-disk Kernel Caching
======================
When the ``cache`` keyword argument of the :func:`@cuda.jit <numba.cuda.jit>`
decorator is ``True``, a file-based cache is enabled. This shortens compilation
times when the function was already compiled in a previous invocation.
The cache is maintained in the ``__pycache__`` subdirectory of the directory
containing the source file; if the current user is not allowed to write to it,
the cache implementation falls back to a platform-specific user-wide cache
directory (such as ``$HOME/.cache/numba`` on Unix platforms).
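For example (an illustrative sketch):
.. code-block:: python

    from numba import cuda

    # The compiled code is cached on disk and reused by later processes.
    @cuda.jit(cache=True)
    def add_one(x):
        i = cuda.grid(1)
        if i < x.size:
            x[i] += 1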
Compute capability considerations
---------------------------------
Separate cache files are maintained for each compute capability. When a cached
kernel is loaded, the compute capability of the device the kernel is first
launched on in the current run is used to determine which version to load.
Therefore, on systems that have multiple GPUs with differing compute
capabilities, the cached versions of kernels are only used for one compute
capability, and recompilation will occur for other compute capabilities.
For example: if a system has two GPUs, one of compute capability 7.5 and one of
8.0, then:
* If a cached kernel is first launched on the CC 7.5 device, then the cached
version for CC 7.5 is used. If it is subsequently launched on the CC 8.0
device, a recompilation will occur.
* If in a subsequent run the cached kernel is first launched on the CC 8.0
device, then the cached version for CC 8.0 is used. A subsequent launch on
the CC 7.5 device will require a recompilation.
This limitation is not expected to present issues in most practical scenarios,
as multi-GPU production systems tend to have identical GPUs within each node.
==================
Cooperative Groups
==================
Supported features
------------------
Numba's Cooperative Groups support presently provides grid groups and grid
synchronization, along with cooperative kernel launches.
Cooperative groups are supported on Linux, and Windows for devices in `TCC
mode
<https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#tesla-compute-cluster-mode-for-windows>`_.
Using Grid Groups
-----------------
To get the current grid group, use the :meth:`cg.this_grid()
<numba.cuda.cg.this_grid>` function:
.. code-block:: python
g = cuda.cg.this_grid()
Synchronizing the grid is done with the :meth:`sync()
<numba.cuda.cg.GridGroup.sync>` method of the grid group:
.. code-block:: python
g.sync()
Cooperative Launches
--------------------
Unlike the CUDA C/C++ API, a cooperative launch is invoked using the same syntax
as a normal kernel launch - Numba automatically determines whether a cooperative
launch is required based on whether a grid group is synchronized in the kernel.
The grid size limit for a cooperative launch is more restrictive than for a
normal launch - the grid must be no larger than the maximum number of active
blocks on the device on which it is launched. To get the maximum grid size for
a cooperative launch of a kernel with a given block size and dynamic shared
memory requirement, use the ``max_cooperative_grid_blocks()`` method of kernel
overloads:
.. automethod:: numba.cuda.dispatcher._Kernel.max_cooperative_grid_blocks
This can be used to ensure that the kernel is launched with no more than the
maximum number of blocks. Exceeding the maximum number of blocks for the
cooperative launch will result in a ``CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE``
error.
Applications and Example
------------------------
Grid group synchronization can be used to implement a global barrier across all
threads in the grid - applications of this include a global reduction to a
single value, or looping over rows of a large matrix sequentially using the
entire grid to operate on column elements in parallel.
In the following example, rows are written sequentially by the grid. Each thread
in the grid reads a value from the previous row written by its *opposite*
thread. A grid sync is needed to ensure that threads in the grid don't run ahead
of threads in other blocks, or fail to see updates from their opposite thread.
First we'll define our kernel:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_cg.py
:language: python
:caption: from ``test_grid_sync`` of ``numba/cuda/tests/doc_examples/test_cg.py``
:start-after: magictoken.ex_grid_sync_kernel.begin
:end-before: magictoken.ex_grid_sync_kernel.end
:dedent: 8
:linenos:
Then create some empty input data and determine the grid and block sizes:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_cg.py
:language: python
:caption: from ``test_grid_sync`` of ``numba/cuda/tests/doc_examples/test_cg.py``
:start-after: magictoken.ex_grid_sync_data.begin
:end-before: magictoken.ex_grid_sync_data.end
:dedent: 8
:linenos:
Finally we launch the kernel and print the result:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_cg.py
:language: python
:caption: from ``test_grid_sync`` of ``numba/cuda/tests/doc_examples/test_cg.py``
:start-after: magictoken.ex_grid_sync_launch.begin
:end-before: magictoken.ex_grid_sync_launch.end
:dedent: 8
:linenos:
The maximum grid size for ``sequential_rows`` can be enquired using:
.. code-block:: python

    overload = sequential_rows.overloads[(int32[:,::1],)]
    max_blocks = overload.max_cooperative_grid_blocks(blockdim)
    print(max_blocks)
    # 1152 (e.g. on Quadro RTX 8000 with Numba 0.52.1 and CUDA 11.0)
.. _cuda-array-interface:
================================
CUDA Array Interface (Version 3)
================================
The *CUDA Array Interface* (or CAI) is created for interoperability between
different implementations of CUDA array-like objects in various projects. The
idea is borrowed from the `NumPy array interface`_.
.. note::
Currently, we only define the Python-side interface. In the future, we may
add a C-side interface for efficient exchange of the information in
compiled code.
Python Interface Specification
==============================
.. note:: Experimental feature. Specification may change.
The ``__cuda_array_interface__`` attribute returns a dictionary (``dict``)
that must contain the following entries:
- **shape**: ``(integer, ...)``
A tuple of ``int`` (or ``long``) representing the size of each dimension.
- **typestr**: ``str``
The type string. This has the same definition as ``typestr`` in the
`NumPy array interface`_.
- **data**: ``(integer, boolean)``
The **data** is a 2-tuple. The first element is the data pointer
as a Python ``int`` (or ``long``). The data must be device-accessible.
For zero-size arrays, use ``0`` here.
The second element is the read-only flag as a Python ``bool``.
Because the user of the interface may or may not be in the same context,
the most common case is to use ``cuPointerGetAttribute`` with
``CU_POINTER_ATTRIBUTE_DEVICE_POINTER`` in the CUDA driver API (or the
equivalent CUDA Runtime API) to retrieve a device pointer that
is usable in the currently active context.
- **version**: ``integer``
An integer for the version of the interface being exported.
The current version is *3*.
The following are optional entries:
- **strides**: ``None`` or ``(integer, ...)``
If **strides** is not given, or it is ``None``, the array is in
C-contiguous layout. Otherwise, a tuple of ``int`` (or ``long``) is explicitly
given for representing the number of bytes to skip to access the next
element at each dimension.
- **descr**
This is for describing more complicated types. This follows the same
specification as in the `NumPy array interface`_.
- **mask**: ``None`` or object exposing the ``__cuda_array_interface__``
If ``None`` then all values in **data** are valid. All elements of the mask
array should be interpreted only as true or not true indicating which
elements of this array are valid. This has the same definition as ``mask``
in the `NumPy array interface`_.
.. note:: Numba does not currently support working with masked CUDA arrays
and will raise a ``NotImplementedError`` exception if one is passed
to a GPU function.
- **stream**: ``None`` or ``integer``
An optional stream upon which synchronization must take place at the point of
consumption, either by synchronizing on the stream or enqueuing operations on
the data on the given stream. Integer values in this entry are as follows:
- ``0``: This is disallowed as it would be ambiguous between ``None`` and the
default stream, and also between the legacy and per-thread default streams.
Any use case where ``0`` might be given should either use ``None``, ``1``,
or ``2`` instead for clarity.
- ``1``: The legacy default stream.
- ``2``: The per-thread default stream.
- Any other integer: a ``cudaStream_t`` represented as a Python integer.
When ``None``, no synchronization is required. See the
:ref:`cuda-array-interface-synchronization` section below for further details.
In a future revision of the interface, this entry may be expanded (or another
entry added) so that an event to synchronize on can be specified instead of a
stream.
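As an illustrative sketch, a minimal Producer might export the interface as
follows (the class and pointer source are hypothetical; a real Producer must
supply a device-accessible pointer):
.. code-block:: python

    class MyDeviceArray:
        # A hypothetical object wrapping a device allocation of float32 values.
        def __init__(self, device_ptr, nitems):
            self._ptr = device_ptr      # device pointer as a Python int
            self._nitems = nitems

        @property
        def __cuda_array_interface__(self):
            return {
                'shape': (self._nitems,),
                'typestr': '<f4',        # little-endian float32
                'data': (self._ptr, False),
                'version': 3,
                'strides': None,         # C-contiguous
                'stream': 1,             # work is enqueued on the legacy default stream
            }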
.. _cuda-array-interface-synchronization:
Synchronization
---------------
Definitions
~~~~~~~~~~~
When discussing synchronization, the following definitions are used:
- *Producer*: The library / object on which ``__cuda_array_interface__`` is
accessed.
- *Consumer*: The library / function that accesses the
``__cuda_array_interface__`` of the Producer.
- *User Code*: Code that induces a Producer and Consumer to share data through
the CAI.
- *User*: The person writing or maintaining the User Code. The User may
implement User Code without knowledge of the CAI, since the CAI accesses can
be hidden from their view.
In the following example:
.. code-block:: python
import cupy
from numba import cuda
@cuda.jit
def add(x, y, out):
start = cuda.grid(1)
stride = cuda.gridsize(1)
for i in range(start, x.shape[0], stride):
out[i] = x[i] + y[i]
a = cupy.arange(10)
b = a * 2
out = cupy.zeros_like(a)
add[1, 32](a, b, out)
When the ``add`` kernel is launched:
- ``a``, ``b``, ``out`` are Producers.
- The ``add`` kernel is the Consumer.
- The User Code is specifically ``add[1, 32](a, b, out)``.
- The author of the code is the User.
Design Motivations
~~~~~~~~~~~~~~~~~~
Elements of the CAI design related to synchronization seek to fulfill these
requirements:
1. Producers and Consumers that exchange data through the CAI must be able to do
so without data races.
2. Requirement 1 should be met without requiring the user to be
aware of any particulars of the CAI - in other words, exchanging data between
Producers and Consumers that operate on data asynchronously should be correct
by default.
- An exception to this requirement is made for Producers and Consumers that
explicitly document that the User is required to take additional steps to
ensure correctness with respect to synchronization. In this case, Users
are required to understand the details of the CUDA Array Interface, and
the Producer/Consumer library documentation must specify the steps that
Users are required to take.
Use of this exception should be avoided where possible, as it is provided
for libraries that cannot implement the synchronization semantics without
the involvement of the User - for example, those interfacing with
third-party libraries oblivious to the CUDA Array Interface.
3. Where the User is aware of the particulars of the CAI and implementation
details of the Producer and Consumer, they should be able to, at their
discretion, override some of the synchronization semantics of the interface
to reduce the synchronization overhead. Overriding synchronization semantics
implies that:
- The CAI design, and the design and implementation of the Producer and
Consumer do not specify or guarantee correctness with respect to data
races.
- Instead, the User is responsible for ensuring correctness with respect to
data races.
Interface Requirements
~~~~~~~~~~~~~~~~~~~~~~
The ``stream`` entry enables Producers and Consumers to avoid hazards when
exchanging data. Expected behaviour of the Consumer is as follows:
* When ``stream`` is not present or is ``None``:
- No synchronization is required on the part of the Consumer.
- The Consumer may enqueue operations on the underlying data immediately on
any stream.
* When ``stream`` is an integer, its value indicates the stream on which the
Producer may have in-progress operations on the data, and which the Consumer
is expected to either:
- Synchronize on before accessing the data, or
- Enqueue operations in when accessing the data.
The Consumer can choose which mechanism to use, with the following
considerations:
- If the Consumer synchronizes on the provided stream prior to accessing the
data, then it must ensure that no computation can take place in the provided
stream until its operations in its own choice of stream have taken place.
This could be achieved by either:
- Placing a wait on an event in the provided stream that occurs once all
of the Consumer's operations on the data are completed, or
- Avoiding returning control to the user code until after its operations
on its own stream have completed.
- If the consumer chooses to only enqueue operations on the data in the
provided stream, then it may return control to the User code immediately
after enqueueing its work, as the work will all be serialized on the
exported array's stream. This is sufficient to ensure correctness even if
the User code were to induce the Producer to subsequently start enqueueing
more work on the same stream.
* If the User has set the Consumer to ignore CAI synchronization semantics, the
Consumer may assume it can operate on the data immediately in any stream with
no further synchronization, even if the ``stream`` member has an integer
value.
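As a rough sketch of the first mechanism, a Consumer implemented using Numba
might synchronize on the Producer's stream before touching the data. Here
``producer_object`` stands for any object exporting the interface; this is an
illustration, not a prescribed implementation:

.. code-block:: python

   from numba import cuda

   def consume(producer_object):
       cai = producer_object.__cuda_array_interface__
       stream = cai.get('stream')
       if stream is not None:
           # Wrap the raw stream handle and wait for the Producer's pending
           # work to complete before accessing the data.
           cuda.external_stream(stream).synchronize()
       # Synchronization has already been performed, so it can be skipped when
       # constructing the device array.
       return cuda.as_cuda_array(producer_object, sync=False)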
When exporting an array through the CAI, Producers must ensure that:
* If there is work on the data enqueued in one or more streams, then
synchronization on the provided ``stream`` is sufficient to ensure
synchronization with all pending work.
- If the Producer has no enqueued work, or work only enqueued on the stream
identified by ``stream``, then this condition is met.
- If the Producer has enqueued work on the data on multiple streams, then it
must enqueue events on those streams that follow the enqueued work, and
then wait on those events in the provided ``stream``. For example:
1. Work is enqueued by the Producer on streams ``7``, ``9``, and ``15``.
2. Events are then enqueued on each of streams ``7``, ``9``, and ``15``.
3. Producer then tells stream ``3`` to wait on the events from Step 2, and
the ``stream`` entry is set to ``3``.
* If there is no work enqueued on the data, then the ``stream`` entry may be
either ``None``, or not provided.
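To illustrate the multi-stream case in the example above with Numba
primitives: assuming ``stream7``, ``stream9``, ``stream15``, and ``stream3``
are Numba stream objects and that work on the data has already been enqueued
on the first three, a Producer could prepare the export roughly as follows:

.. code-block:: python

   from numba import cuda

   # Sketch only: make stream3 wait for all pending work before it is exported
   # as the 'stream' entry of the interface.
   for s in (stream7, stream9, stream15):
       evt = cuda.event()
       evt.record(s)        # the event follows the work already enqueued on s
       evt.wait(stream3)    # stream3 will not proceed until the event has occurred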
Optionally, to facilitate the User relaxing conformance to synchronization
semantics:
* Producers may provide a configuration option to always set ``stream`` to
``None``.
* Consumers may provide a configuration option to ignore the value of ``stream``
and act as if it were ``None`` or not provided. This elides synchronization
on the Producer-provided streams, and allows enqueuing work on streams other
than that provided by the Producer.
These options should not be set by default in either a Producer or a Consumer.
The CAI specification does not prescribe the exact mechanism by which these
options are set, or related options that Producers or Consumers might provide
to allow the user further control over synchronization behavior.
Synchronization in Numba
~~~~~~~~~~~~~~~~~~~~~~~~
Numba is neither strictly a Producer nor a Consumer - it may be used to
implement either by a User. In order to facilitate the correct implementation of
synchronization semantics, Numba exhibits the following behaviors related to
synchronization of the interface:
- When Numba acts as a Consumer (for example when an array-like object is passed
to a kernel launch): If ``stream`` is an integer, then Numba will immediately
synchronize on the provided ``stream``. A Numba :class:`Device Array
<numba.cuda.cudadrv.devicearray.DeviceNDArray>` created from an array-like
object has its *default stream* set to the provided stream.
- When Numba acts as a Producer (when the ``__cuda_array_interface__`` property
of a Numba CUDA Array is accessed): If the exported CUDA Array has a
*default stream*, then it is given as the ``stream`` entry. Otherwise,
``stream`` is set to ``None``.
.. note:: In Numba's terminology, an array's *default stream* is a property
specifying the stream that Numba will enqueue asynchronous
transfers in if no other stream is provided as an argument to the
function invoking the transfer. It is not the same as the `Default
Stream
<https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#default-stream>`_
in normal CUDA terminology.
Numba's synchronization behavior results in the following intended
consequences:
- Exchanging data either as a Producer or a Consumer will be correct without
the need for any further action from the User, provided that the other side
of the interaction also follows the CAI synchronization semantics.
- The User is expected to either:
- Avoid launching kernels or other operations on streams that
are not the default stream for their parameters, or
- When launching operations on a stream that is not the default stream for
a given parameter, they should then insert an event into the stream that
they are operating in, and wait on that event in the default stream for
the parameter. For an example of this, :ref:`see below
<example-multi-streams>`.
The User may override Numba's synchronization behavior by setting the
environment variable ``NUMBA_CUDA_ARRAY_INTERFACE_SYNC`` or the config variable
``CUDA_ARRAY_INTERFACE_SYNC`` to ``0`` (see :ref:`GPU Support Environment
Variables <numba-envvars-gpu-support>`). When set, Numba will not synchronize
on the streams of imported arrays, and it is the responsibility of the user to
ensure correctness with respect to stream synchronization. Synchronization when
creating a Numba CUDA Array from an object exporting the CUDA Array Interface
may also be elided by passing ``sync=False`` when creating the Numba CUDA
Array with :func:`numba.cuda.as_cuda_array` or
:func:`numba.cuda.from_cuda_array_interface`.
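For example, a minimal sketch of eliding synchronization when wrapping an
object exporting the interface (``producer_object`` here is assumed to be such
an object):

.. code-block:: python

   from numba import cuda

   # Correctness with respect to stream synchronization is now the caller's
   # responsibility.
   d_arr = cuda.as_cuda_array(producer_object, sync=False)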
There is scope for Numba's synchronization implementation to be optimized in
the future, by eliding synchronizations when a kernel or driver API operation
(e.g. a memcopy or memset) is launched on the same stream as an imported
array.
.. _example-multi-streams:
An example launching on an array's non-default stream
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This example shows how to ensure that a Consumer can safely consume an array
with a default stream when it is passed to a kernel launched in a different
stream.
First we need to import Numba and a consumer library (a fictitious library named
``other_cai_library`` for this example):
.. code-block:: python
from numba import cuda, int32, void
import other_cai_library
Now we'll define a kernel - this initializes the elements of the array, setting
each entry to its index:
.. code-block:: python
@cuda.jit(void, int32[::1])
def initialize_array(x):
i = cuda.grid(1)
if i < len(x):
x[i] = i
Next we will create two streams:
.. code-block:: python
array_stream = cuda.stream()
kernel_stream = cuda.stream()
Then create an array with one of the streams as its default stream:
.. code-block:: python
N = 16384
x = cuda.device_array(N, stream=array_stream)
Now we launch the kernel in the other stream:
.. code-block:: python
nthreads = 256
nblocks = N // nthreads
initialize_array[nthreads, nblocks, kernel_stream](x)
If we were to pass ``x`` to a Consumer now, there is a risk that it may operate on
it in ``array_stream`` whilst the kernel is still running in ``kernel_stream``.
To prevent operations in ``array_stream`` starting before the kernel launch is
finished, we create an event and wait on it:
.. code-block:: python
# Create event
evt = cuda.event()
# Record the event after the kernel launch in kernel_stream
evt.record(kernel_stream)
# Wait for the event in array_stream
evt.wait(array_stream)
It is now safe for ``other_cai_library`` to consume ``x``:
.. code-block:: python
other_cai_library.consume(x)
Lifetime management
-------------------
Data
~~~~
Obtaining the value of the ``__cuda_array_interface__`` property of any object
has no effect on the lifetime of the object from which it was created. In
particular, note that the interface has no slot for the owner of the data.
The User code must preserve the lifetime of the object owning the data for as
long as the Consumer might use it.
Streams
~~~~~~~
Like data, CUDA streams also have a finite lifetime. It is therefore required
that a Producer exporting data on the interface with an associated stream
ensures that the exported stream's lifetime is equal to or surpasses the
lifetime of the object from which the interface was exported.
Lifetime management in Numba
----------------------------
Producing Arrays
~~~~~~~~~~~~~~~~
Numba takes no steps to maintain the lifetime of an object from which the
interface is exported - it is the user's responsibility to ensure that the
underlying object is kept alive for the duration that the exported interface
might be used.
The lifetime of any Numba-managed stream exported on the interface is guaranteed
to equal or surpass the lifetime of the underlying object, because the
underlying object holds a reference to the stream.
.. note:: Numba-managed streams are those created with
``cuda.default_stream()``, ``cuda.legacy_default_stream()``, or
``cuda.per_thread_default_stream()``. Streams not managed by Numba
are created from an external stream with ``cuda.external_stream()``.
Consuming Arrays
~~~~~~~~~~~~~~~~
Numba provides two mechanisms for creating device arrays from objects exporting
the CUDA Array Interface. Which to use depends on whether the created device
array should maintain the lifetime of the object from which it is created:
- ``as_cuda_array``: This creates a device array that holds a reference to the
owning object. As long as a reference to the device array is held, its
underlying data will also be kept alive, even if all other references to the
original owning object have been dropped.
- ``from_cuda_array_interface``: This creates a device array with no reference
  to the owning object by default. The owning object, or some other object to
  be considered the owner, can be passed in the ``owner`` parameter.
The interfaces of these functions are:
.. automethod:: numba.cuda.as_cuda_array
.. automethod:: numba.cuda.from_cuda_array_interface
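As a rough usage sketch, assuming ``exporter`` is any object exposing
``__cuda_array_interface__`` (for example a CuPy array):

.. code-block:: python

   from numba import cuda

   # Keeps a reference to the exporting object for the lifetime of d1.
   d1 = cuda.as_cuda_array(exporter)

   # Keeps no reference by default; pass an owner explicitly if one is needed.
   desc = exporter.__cuda_array_interface__
   d2 = cuda.from_cuda_array_interface(desc, owner=exporter)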
Pointer Attributes
------------------
Additional information about the data pointer can be retrieved using
``cuPointerGetAttribute`` or ``cudaPointerGetAttributes``. Such information
includes:
- the CUDA context that owns the pointer;
- whether the pointer is accessible from the host;
- whether the pointer refers to managed memory.
.. _NumPy array interface: https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.interface.html#__array_interface__
Differences with CUDA Array Interface (Version 0)
-------------------------------------------------
Version 0 of the CUDA Array Interface did not have the optional **mask**
attribute to support masked arrays.
Differences with CUDA Array Interface (Version 1)
-------------------------------------------------
Versions 0 and 1 of the CUDA Array Interface neither clarified the
**strides** attribute for C-contiguous arrays nor specified the treatment for
zero-size arrays.
Differences with CUDA Array Interface (Version 2)
-------------------------------------------------
Prior versions of the CUDA Array Interface made no statement about
synchronization.
Interoperability
----------------
The following Python libraries have adopted the CUDA Array Interface:
- Numba
- `CuPy <https://docs-cupy.chainer.org/en/stable/reference/interoperability.html>`_
- `PyTorch <https://pytorch.org>`_
- `PyArrow <https://arrow.apache.org/docs/python/generated/pyarrow.cuda.Context.html#pyarrow.cuda.Context.buffer_from_object>`_
- `mpi4py <https://mpi4py.readthedocs.io/en/latest/overview.html#support-for-cuda-aware-mpi>`_
- `ArrayViews <https://github.com/xnd-project/arrayviews>`_
- `JAX <https://jax.readthedocs.io/en/latest/index.html>`_
- `PyCUDA <https://documen.tician.de/pycuda/tutorial.html#interoperability-with-other-libraries-using-the-cuda-array-interface>`_
- `DALI: the NVIDIA Data Loading Library <https://github.com/NVIDIA/DALI>`_ :
- `TensorGPU objects
<https://docs.nvidia.com/deeplearning/dali/user-guide/docs/data_types.html#nvidia.dali.backend.TensorGPU>`_
expose the CUDA Array Interface.
- `The External Source operator
<https://docs.nvidia.com/deeplearning/dali/user-guide/docs/supported_ops.html#nvidia.dali.fn.external_source>`_
consumes objects exporting the CUDA Array Interface.
- The RAPIDS stack:
- `cuDF <https://rapidsai.github.io/projects/cudf/en/0.11.0/10min-cudf-cupy.html>`_
- `cuML <https://docs.rapids.ai/api/cuml/nightly/>`_
- `cuSignal <https://github.com/rapidsai/cusignal>`_
- `RMM <https://docs.rapids.ai/api/rmm/stable/>`_
If your project is not on this list, please feel free to report it on the `Numba issue tracker <https://github.com/numba/numba/issues>`_.
.. _cuda_ffi:
Calling foreign functions from Python kernels
=============================================
Python kernels can call device functions written in other languages. CUDA C/C++,
PTX, and binary objects (cubins, fat binaries, etc.) are directly supported;
sources in other languages must be compiled to PTX first. The constituent parts
of a Python kernel call to a foreign device function are:
- The device function implementation in a foreign language (e.g. CUDA C).
- A declaration of the device function in Python.
- A kernel that links with and calls the foreign function.
Device function ABI
-------------------
Numba's ABI for calling device functions defines the following prototype in
C/C++:
.. code:: C
extern "C"
__device__ int
function(
T* return_value,
...
);
Components of the prototype are as follows:
- ``extern "C"`` is used to prevent name-mangling so that it is easy to declare
the function in Python. It can be removed, but then the mangled name must be
used in the declaration of the function in Python.
- ``__device__`` is required to define the function as a device function.
- The return value is always of type ``int``, and is used to signal whether a
Python exception occurred. Since Python exceptions don't occur in foreign
functions, this should always be set to 0 by the callee.
- The first argument is a pointer to the return value of type ``T``, which is
allocated in the local address space [#f1]_ and passed in by the caller. If
the function returns a value, the pointee should be set by the callee to
store the return value.
- Subsequent arguments should match the types and order of arguments passed to
the function from the Python kernel.
Functions written in other languages must compile to PTX that conforms to this
prototype specification.
A function that accepts two floats and returns a float would have the following
prototype:
.. code:: C
extern "C"
__device__ int
mul_f32_f32(
float* return_value,
float x,
float y
);
.. rubric:: Notes
.. [#f1] Care must be taken to ensure that any operations on the return value
are applicable to data in the local address space. Some operations,
such as atomics, cannot be performed on data in the local address
space.
Declaration in Python
---------------------
To declare a foreign device function in Python, use :func:`declare_device()
<numba.cuda.declare_device>`:
.. autofunction:: numba.cuda.declare_device
The returned descriptor name need not match the name of the foreign function.
For example, when:
.. code::
mul = cuda.declare_device('mul_f32_f32', 'float32(float32, float32)')
is declared, calling ``mul(a, b)`` inside a kernel will translate into a call to
``mul_f32_f32(a, b)`` in the compiled code.
Passing pointers
----------------
Numba's calling convention requires multiple values to be passed for array
arguments. These include the data pointer along with shape, stride, and other
information. This is incompatible with the expectations of most C/C++ functions,
which generally only expect a pointer to the data. To align the calling
conventions between C device code and Python kernels it is necessary to declare
array arguments using C pointer types.
For example, a function with the following prototype:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/ffi/functions.cu
:language: C
:caption: ``numba/cuda/tests/doc_examples/ffi/functions.cu``
:start-after: magictoken.ex_sum_reduce_proto.begin
:end-before: magictoken.ex_sum_reduce_proto.end
:linenos:
would be declared as follows:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_ffi.py
:language: python
:caption: from ``test_ex_from_buffer`` in ``numba/cuda/tests/doc_examples/test_ffi.py``
:start-after: magictoken.ex_from_buffer_decl.begin
:end-before: magictoken.ex_from_buffer_decl.end
:dedent: 8
:linenos:
To obtain a pointer to array data for passing to foreign functions, use the
``from_buffer()`` method of a ``cffi.FFI`` instance. For example, a kernel using
the ``sum_reduce`` function could be defined as:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_ffi.py
:language: python
:caption: from ``test_ex_from_buffer`` in ``numba/cuda/tests/doc_examples/test_ffi.py``
:start-after: magictoken.ex_from_buffer_kernel.begin
:end-before: magictoken.ex_from_buffer_kernel.end
:dedent: 8
:linenos:
where ``result`` and ``array`` are both arrays of ``float32`` data.
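In outline, and only as a sketch of the pattern in the included example
(assuming a foreign ``sum_reduce(float*, int)`` device function defined in
``functions.cu``, and that ``result`` is a one-element ``float32`` array):

.. code-block:: python

   import cffi
   from numba import cuda, types

   ffi = cffi.FFI()

   # Declaration matching the foreign sum_reduce(float*, int) prototype.
   sum_reduce = cuda.declare_device(
       'sum_reduce', types.float32(types.CPointer(types.float32), types.int32)
   )

   @cuda.jit(link=['functions.cu'])
   def reduction_caller(result, array):
       # from_buffer() yields a pointer to the array data matching the float*
       # parameter of the C prototype.
       array_ptr = ffi.from_buffer(array)
       result[0] = sum_reduce(array_ptr, len(array))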
Linking and Calling functions
-----------------------------
The ``link`` keyword argument of the :func:`@cuda.jit <numba.cuda.jit>`
decorator accepts a list of file names specified by absolute path or a path
relative to the current working directory. Files whose name ends in ``.cu``
will be compiled with the `NVIDIA Runtime Compiler (NVRTC)
<https://docs.nvidia.com/cuda/nvrtc/index.html>`_ and linked into the kernel as
PTX; other files will be passed directly to the CUDA Linker.
For example, the following kernel calls the ``mul()`` function declared above
with the implementation ``mul_f32_f32()`` in a file called ``functions.cu``:
.. code::
@cuda.jit(link=['functions.cu'])
def multiply_vectors(r, x, y):
i = cuda.grid(1)
if i < len(r):
r[i] = mul(x[i], y[i])
C/C++ Support
-------------
Support for compiling and linking of CUDA C/C++ code is provided through the use
of NVRTC subject to the following considerations:
- It is only available when using the NVIDIA Bindings. See
:envvar:`NUMBA_CUDA_USE_NVIDIA_BINDING`.
- A suitable version of the NVRTC library for the installed version of the
NVIDIA CUDA Bindings must be available.
- The CUDA include path is assumed by default to be ``/usr/local/cuda/include``
on Linux and ``$env:CUDA_PATH\include`` on Windows. It can be modified using
the environment variable :envvar:`NUMBA_CUDA_INCLUDE_PATH`.
- The CUDA include directory will be made available to NVRTC on the include
path; additional includes are not supported.
Complete Example
----------------
This example demonstrates calling a foreign function written in CUDA C to
multiply pairs of numbers from two arrays.
The foreign function is written as follows:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/ffi/functions.cu
:language: C
:caption: ``numba/cuda/tests/doc_examples/ffi/functions.cu``
:start-after: magictoken.ex_mul_f32_f32.begin
:end-before: magictoken.ex_mul_f32_f32.end
:linenos:
The Python code and kernel are:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_ffi.py
:language: python
:caption: from ``test_ex_linking_cu`` in ``numba/cuda/tests/doc_examples/test_ffi.py``
:start-after: magictoken.ex_linking_cu.begin
:end-before: magictoken.ex_linking_cu.end
:dedent: 8
:linenos:
.. note::
The example above is minimal in order to illustrate a foreign function call -
it would not be expected to be particularly performant due to the small grid
and light workload of the foreign function.
========================================
Supported Python features in CUDA Python
========================================
This page lists the Python features supported in CUDA Python. This includes
all kernel and device functions compiled with ``@cuda.jit`` and other
higher-level Numba decorators that target the CUDA GPU.
Language
========
Execution Model
---------------
CUDA Python maps directly to the *single-instruction multiple-thread*
(SIMT) execution model of CUDA. Each instruction is implicitly
executed by multiple threads in parallel. With this execution model, array
expressions are less useful because we don't want multiple threads to perform
the same task. Instead, we want threads to perform a task in a cooperative
fashion.
For details please consult the
`CUDA Programming Guide
<http://docs.nvidia.com/cuda/cuda-c-programming-guide/#programming-model>`_.
Floating Point Error Model
--------------------------
By default, CUDA Python kernels execute with the NumPy error model. In this
model, division by zero raises no exception and instead produces a result of
``inf``, ``-inf`` or ``nan``. This differs from the normal Python error model,
in which division by zero raises a ``ZeroDivisionError``.
When debug is enabled (by passing ``debug=True`` to the
:func:`@cuda.jit <numba.cuda.jit>` decorator), the Python error model is used.
This allows division-by-zero errors during kernel execution to be identified.
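For example, the following sketch (using hypothetical input arrays) produces
``inf`` values rather than raising an exception:

.. code-block:: python

   from numba import cuda
   import numpy as np

   @cuda.jit
   def divide(x, y, out):
       i = cuda.grid(1)
       if i < out.size:
           out[i] = x[i] / y[i]   # yields inf when y[i] == 0 under the NumPy model

   x = np.ones(4)
   y = np.zeros(4)
   out = np.zeros(4)
   divide[1, 4](x, y, out)        # out is now [inf, inf, inf, inf]

Compiling the same kernel with ``debug=True`` would instead allow the division
by zero to be identified as an error.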
Constructs
----------
The following Python constructs are not supported:
* Exception handling (``try .. except``, ``try .. finally``)
* Context management (the ``with`` statement)
* Comprehensions (either list, dict, set or generator comprehensions)
* Generator (any ``yield`` statements)
The ``raise`` and ``assert`` statements are supported, with the following
constraints:
- They can only be used in kernels, not in device functions.
- They only have an effect when ``debug=True`` is passed to the
:func:`@cuda.jit <numba.cuda.jit>` decorator. This is similar to the behavior
of the ``assert`` keyword in CUDA C/C++, which is ignored unless compiling
with device debug turned on.
Printing of strings, integers, and floats is supported, but printing is an
asynchronous operation - in order to ensure that all output is printed after a
kernel launch, it is necessary to call :func:`numba.cuda.synchronize`. Eliding
the call to ``synchronize`` is acceptable, but output from a kernel may appear
during other later driver operations (e.g. subsequent kernel launches, memory
transfers, etc.), or fail to appear before the program execution completes. Up
to 32 arguments may be passed to the ``print`` function - if more are passed
then a format string will be emitted instead and a warning will be produced.
This is due to a general limitation in CUDA printing, as outlined in the
`section on limitations in printing
<https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#limitations>`_
in the CUDA C++ Programming Guide.
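For example, a minimal sketch of printing from a kernel and synchronizing to
ensure the output appears:

.. code-block:: python

   from numba import cuda

   @cuda.jit
   def say_hello():
       print("Hello from thread", cuda.grid(1))

   say_hello[1, 4]()
   cuda.synchronize()   # ensure all device-side output is printed before continuing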
Recursion
---------
Self-recursive device functions are supported, with the constraint that
recursive calls must have the same argument types as the initial call to
the function. For example, the following form of recursion is supported:
.. code:: python
@cuda.jit("int64(int64)", device=True)
def fib(n):
if n < 2:
return n
return fib(n - 1) + fib(n - 2)
(the ``fib`` function always has an ``int64`` argument), whereas the following
is unsupported:
.. code:: python
# Called with x := int64, y := float64
@cuda.jit
def type_change_self(x, y):
if x > 1 and y > 0:
return x + type_change_self(x - y, y)
else:
return y
The outer call to ``type_change_self`` provides ``(int64, float64)`` arguments,
but the inner call uses ``(float64, float64)`` arguments (because ``x - y`` /
``int64 - float64`` results in a ``float64`` type). Therefore, this function is
unsupported.
Mutual recursion between functions (e.g. where a function ``func1()`` calls
``func2()`` which again calls ``func1()``) is unsupported.
.. note::
The call stack in CUDA is typically quite limited in size, so it is easier
to overflow it with recursive calls on CUDA devices than it is on CPUs.
Stack overflow will result in an Unspecified Launch Failure (ULF) during
kernel execution. In order to identify whether a ULF is due to stack
overflow, programs can be run under `Compute Sanitizer
<https://docs.nvidia.com/compute-sanitizer/ComputeSanitizer/index.html>`_,
which explicitly states when stack overflow has occurred.
.. _cuda-built-in-types:
Built-in types
===============
Support for the following built-in types is inherited from CPU nopython mode.
* int
* float
* complex
* bool
* None
* tuple
* Enum, IntEnum
See :ref:`nopython built-in types <pysupported-builtin-types>`.
There is also some very limited support for character sequences (bytes and
unicode strings) used in NumPy arrays. Note that this support can only be used
with CUDA 11.2 onwards.
Built-in functions
==================
The following built-in functions are supported:
* :func:`abs`
* :class:`bool`
* :class:`complex`
* :func:`enumerate`
* :class:`float`
* :class:`int`: only the one-argument form
* :func:`len`
* :func:`min`: only the multiple-argument form
* :func:`max`: only the multiple-argument form
* :func:`pow`
* :class:`range`
* :func:`round`
* :func:`zip`
Standard library modules
========================
``cmath``
---------
The following functions from the :mod:`cmath` module are supported:
* :func:`cmath.acos`
* :func:`cmath.acosh`
* :func:`cmath.asin`
* :func:`cmath.asinh`
* :func:`cmath.atan`
* :func:`cmath.atanh`
* :func:`cmath.cos`
* :func:`cmath.cosh`
* :func:`cmath.exp`
* :func:`cmath.isfinite`
* :func:`cmath.isinf`
* :func:`cmath.isnan`
* :func:`cmath.log`
* :func:`cmath.log10`
* :func:`cmath.phase`
* :func:`cmath.polar`
* :func:`cmath.rect`
* :func:`cmath.sin`
* :func:`cmath.sinh`
* :func:`cmath.sqrt`
* :func:`cmath.tan`
* :func:`cmath.tanh`
``math``
--------
The following functions from the :mod:`math` module are supported:
* :func:`math.acos`
* :func:`math.asin`
* :func:`math.atan`
* :func:`math.acosh`
* :func:`math.asinh`
* :func:`math.atanh`
* :func:`math.cos`
* :func:`math.sin`
* :func:`math.tan`
* :func:`math.hypot`
* :func:`math.cosh`
* :func:`math.sinh`
* :func:`math.tanh`
* :func:`math.atan2`
* :func:`math.erf`
* :func:`math.erfc`
* :func:`math.exp`
* :func:`math.expm1`
* :func:`math.fabs`
* :func:`math.frexp`
* :func:`math.ldexp`
* :func:`math.gamma`
* :func:`math.lgamma`
* :func:`math.log`
* :func:`math.log2`
* :func:`math.log10`
* :func:`math.log1p`
* :func:`math.sqrt`
* :func:`math.remainder`
* :func:`math.pow`
* :func:`math.ceil`
* :func:`math.floor`
* :func:`math.copysign`
* :func:`math.fmod`
* :func:`math.modf`
* :func:`math.isnan`
* :func:`math.isinf`
* :func:`math.isfinite`
``operator``
------------
The following functions from the :mod:`operator` module are supported:
* :func:`operator.add`
* :func:`operator.and_`
* :func:`operator.eq`
* :func:`operator.floordiv`
* :func:`operator.ge`
* :func:`operator.gt`
* :func:`operator.iadd`
* :func:`operator.iand`
* :func:`operator.ifloordiv`
* :func:`operator.ilshift`
* :func:`operator.imod`
* :func:`operator.imul`
* :func:`operator.invert`
* :func:`operator.ior`
* :func:`operator.ipow`
* :func:`operator.irshift`
* :func:`operator.isub`
* :func:`operator.itruediv`
* :func:`operator.ixor`
* :func:`operator.le`
* :func:`operator.lshift`
* :func:`operator.lt`
* :func:`operator.mod`
* :func:`operator.mul`
* :func:`operator.ne`
* :func:`operator.neg`
* :func:`operator.not_`
* :func:`operator.or_`
* :func:`operator.pos`
* :func:`operator.pow`
* :func:`operator.rshift`
* :func:`operator.sub`
* :func:`operator.truediv`
* :func:`operator.xor`
.. _cuda_numpy_support:
NumPy support
=============
Due to the CUDA programming model, dynamic memory allocation inside a kernel is
inefficient and is often not needed. Numba disallows any memory-allocating features.
This disables a large number of NumPy APIs. For best performance, users should write
code such that each thread deals with a single element at a time.
Supported NumPy features:
* accessing `ndarray` attributes `.shape`, `.strides`, `.ndim`, `.size`, etc.
* indexing and slicing work.
* A subset of ufuncs are supported, but the output array must be passed in as a
positional argument (see :ref:`cuda_ufunc_call_example`). Note that ufuncs
execute sequentially in each thread - there is no automatic parallelisation
of ufuncs across threads over the elements of an input array.
The following ufuncs are supported:
* :func:`numpy.sin`
* :func:`numpy.cos`
* :func:`numpy.tan`
* :func:`numpy.arcsin`
* :func:`numpy.arccos`
* :func:`numpy.arctan`
* :func:`numpy.arctan2`
* :func:`numpy.hypot`
* :func:`numpy.sinh`
* :func:`numpy.cosh`
* :func:`numpy.tanh`
* :func:`numpy.arcsinh`
* :func:`numpy.arccosh`
* :func:`numpy.arctanh`
* :func:`numpy.deg2rad`
* :func:`numpy.radians`
* :func:`numpy.rad2deg`
* :func:`numpy.degrees`
* :func:`numpy.greater`
* :func:`numpy.greater_equal`
* :func:`numpy.less`
* :func:`numpy.less_equal`
* :func:`numpy.not_equal`
* :func:`numpy.equal`
* :func:`numpy.logical_and`
* :func:`numpy.logical_or`
* :func:`numpy.logical_xor`
* :func:`numpy.logical_not`
* :func:`numpy.maximum`
* :func:`numpy.minimum`
* :func:`numpy.fmax`
* :func:`numpy.fmin`
* :func:`numpy.bitwise_and`
* :func:`numpy.bitwise_or`
* :func:`numpy.bitwise_xor`
* :func:`numpy.invert`
* :func:`numpy.bitwise_not`
* :func:`numpy.left_shift`
* :func:`numpy.right_shift`
Unsupported NumPy features:
* array creation APIs.
* array methods.
* functions that return a new array.
CFFI support
============
The ``from_buffer()`` method of ``cffi.FFI`` objects is supported. This is
useful for obtaining a pointer that can be passed to external C / C++ / PTX
functions (see the :ref:`CUDA FFI documentation <cuda_ffi>`).
Writing Device Functions
========================
CUDA device functions can only be invoked from within the device (by a kernel
or another device function). To define a device function::
from numba import cuda
@cuda.jit(device=True)
def a_device_function(a, b):
return a + b
Unlike a kernel function, a device function can return a value like normal
functions.
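For example, the device function above might be called from a kernel as in the
following sketch::

    @cuda.jit
    def a_kernel_function(x, y, out):
        i = cuda.grid(1)
        if i < out.size:
            # Device functions are called like ordinary Python functions and
            # may return a value.
            out[i] = a_device_function(x[i], y[i])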
Device management
=================
For multi-GPU machines, users may want to select which GPU to use.
By default the CUDA driver selects the fastest GPU as device 0,
which is the default device used by Numba.
The features introduced on this page are generally not of interest
unless working with systems hosting more than one CUDA-capable GPU.
Device Selection
----------------
If at all required, device selection must be done before any CUDA feature is
used.
::
from numba import cuda
cuda.select_device(0)
The device can be closed by:
::
cuda.close()
Users can then create a new context with another device.
::
cuda.select_device(1) # assuming we have 2 GPUs
.. function:: numba.cuda.select_device(device_id)
:noindex:
Create a new CUDA context for the selected *device_id*. *device_id*
should be the number of the device (starting from 0; the device order
is determined by the CUDA libraries). The context is associated with
the current thread. Numba currently allows only one context per thread.
If successful, this function returns a device instance.
.. XXX document device instances?
.. function:: numba.cuda.close
:noindex:
Explicitly close all contexts in the current thread.
.. note::
Compiled functions are associated with the CUDA context.
This makes it not very useful to close and create new devices, though it
is certainly useful for choosing which device to use when the machine
has multiple GPUs.
The Device List
===============
The Device List is a list of all the GPUs in the system, and can be indexed to
obtain a context manager that ensures execution on the selected GPU.
.. attribute:: numba.cuda.gpus
:noindex:
.. attribute:: numba.cuda.cudadrv.devices.gpus
:py:data:`numba.cuda.gpus` is an instance of the ``_DeviceList`` class, from
which the current GPU context can also be retrieved:
.. autoclass:: numba.cuda.cudadrv.devices._DeviceList
:members: current
:noindex:
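For example, indexing the device list yields a context manager that runs the
enclosed operations on the chosen device (a minimal sketch, assuming at least
one GPU is present):

.. code-block:: python

   import numpy as np
   from numba import cuda

   with cuda.gpus[0]:
       # Allocations and kernel launches in this block target device 0.
       d_arr = cuda.to_device(np.arange(10))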
Device UUIDs
============
The UUID of a device (equal to that returned by ``nvidia-smi -L``) is available
in the :attr:`uuid <numba.cuda.cudadrv.driver.Device.uuid>` attribute of a CUDA
device object.
For example, to obtain the UUID of the current device:
.. code-block:: python
dev = cuda.current_context().device
# prints e.g. "GPU-e6489c45-5b68-3b03-bab7-0e7c8e809643"
print(dev.uuid)
========
Examples
========
.. _cuda-vecadd:
Vector Addition
===============
This example uses Numba to create on-device arrays and a vector addition kernel;
it is a warmup for learning how to write GPU kernels using Numba. We'll begin
with some required imports:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_vecadd.py
:language: python
:caption: from ``test_ex_vecadd`` in ``numba/cuda/tests/doc_examples/test_vecadd.py``
:start-after: ex_vecadd.import.begin
:end-before: ex_vecadd.import.end
:dedent: 8
:linenos:
The following function is the kernel. Note that it is defined in terms of Python
variables with unspecified types. When the kernel is launched, Numba will
examine the types of the arguments that are passed at runtime and generate a
CUDA kernel specialized for them.
Note that Numba kernels do not return values and must write any output into
arrays passed in as parameters (this is similar to the requirement that CUDA
C/C++ kernels have ``void`` return type). Here we pass in ``c`` for the results
to be written into.
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_vecadd.py
:language: python
:caption: from ``test_ex_vecadd`` in ``numba/cuda/tests/doc_examples/test_vecadd.py``
:start-after: ex_vecadd.kernel.begin
:end-before: ex_vecadd.kernel.end
:dedent: 8
:linenos:
:func:`cuda.to_device() <numba.cuda.to_device>` can be used to create device-side
copies of arrays. :func:`cuda.device_array_like()
<numba.cuda.device_array_like>` creates an uninitialized array of the same shape
and type as an existing array. Here we transfer two vectors and create an empty
vector to hold our results:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_vecadd.py
:language: python
:caption: from ``test_ex_vecadd`` in ``numba/cuda/tests/doc_examples/test_vecadd.py``
:start-after: ex_vecadd.allocate.begin
:end-before: ex_vecadd.allocate.end
:dedent: 8
:linenos:
A call to :meth:`forall() <numba.cuda.dispatcher.Dispatcher.forall>` generates
an appropriate launch configuration with a 1D grid (see
:ref:`cuda-kernel-invocation`) for a given data size and is often the simplest
way of launching a kernel:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_vecadd.py
:language: python
:caption: from ``test_ex_vecadd`` in ``numba/cuda/tests/doc_examples/test_vecadd.py``
:start-after: ex_vecadd.forall.begin
:end-before: ex_vecadd.forall.end
:dedent: 8
:linenos:
This prints:
.. code-block:: none
[0.73548323 1.32061059 0.12582968 ... 1.25925809 1.49335059 1.59315414]
One can also configure the grid manually using the subscripting syntax. The
following example launches a grid with sufficient threads to operate on every
vector element:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_vecadd.py
:language: python
:caption: from ``test_ex_vecadd`` in ``numba/cuda/tests/doc_examples/test_vecadd.py``
:start-after: ex_vecadd.launch.begin
:end-before: ex_vecadd.launch.end
:dedent: 8
:linenos:
This also prints:
.. code-block:: none
[0.73548323 1.32061059 0.12582968 ... 1.25925809 1.49335059 1.59315414]
.. _cuda-laplace:
1D Heat Equation
=====================
This example solves Laplace's equation in one dimension for a certain set of initial
conditions and boundary conditions. A full discussion of Laplace's equation is out of
scope for this documentation, but it will suffice to say that it describes how heat
propagates through an object over time. It works by discretizing the problem in two ways:
1. The domain is partitioned into a mesh of points that each have an individual temperature.
2. Time is partitioned into discrete intervals that are advanced forward sequentially.
Then, the following assumption is applied: The temperature of a point after some interval
has passed is some weighted average of the temperature of the points that are directly
adjacent to it. Intuitively, if all the points in the domain are very hot
and a single point in the middle is very cold, as time passes, the hot points will cause
the cold one to heat up and the cold point will cause the surrounding hot pieces to cool
slightly. Simply put, the heat spreads throughout the object.
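As a host-side illustration of this update rule (not the CUDA kernel used in
the example below, and with the plain average of the two neighbours as one
simple choice of weights), a single time step could be written as:

.. code-block:: python

   def step(old):
       # Each interior point becomes the average of its two neighbours; the
       # end points are held fixed by the boundary conditions.
       new = list(old)
       for i in range(1, len(old) - 1):
           new[i] = (old[i - 1] + old[i + 1]) / 2
       return new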
We can implement this simulation using a Numba kernel. Let's start simple by assuming
we have a one dimensional object which we'll represent with an array of values. The position
of the element in the array is the position of a point within the object, and the value
of the element represents the temperature.
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_laplace.py
:language: python
:caption: from ``test_ex_laplace`` in ``numba/cuda/tests/doc_examples/test_laplace.py``
:start-after: ex_laplace.import.begin
:end-before: ex_laplace.import.end
:dedent: 8
:linenos:
Some initial setup here. Let's make one point in the center of the object very hot.
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_laplace.py
:language: python
:caption: from ``test_ex_laplace`` in ``numba/cuda/tests/doc_examples/test_laplace.py``
:start-after: ex_laplace.allocate.begin
:end-before: ex_laplace.allocate.end
:dedent: 8
:linenos:
The initial state of the problem can be visualized as:
.. image:: laplace_initial.svg
In our kernel each thread will be responsible for managing the temperature update for a single element
in a loop over the desired number of timesteps. The kernel is below. Note the use of cooperative group
synchronization and the use of two buffers swapped at each iteration to avoid race conditions. See
:func:`numba.cuda.cg.this_grid() <numba.cuda.cg.this_grid>` for details.
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_laplace.py
:language: python
:caption: from ``test_ex_laplace`` in ``numba/cuda/tests/doc_examples/test_laplace.py``
:start-after: ex_laplace.kernel.begin
:end-before: ex_laplace.kernel.end
:dedent: 8
:linenos:
Calling the kernel:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_laplace.py
:language: python
:caption: from ``test_ex_laplace`` in ``numba/cuda/tests/doc_examples/test_laplace.py``
:start-after: ex_laplace.launch.begin
:end-before: ex_laplace.launch.end
:dedent: 8
:linenos:
Plotting the final data shows an arc that is highest where
the object was hot initially and gradually sloping down to zero towards the
edges where the temperature is fixed at zero. In the limit of infinite time,
the arc will flatten out completely.
.. image:: laplace_final.svg
.. _cuda_reduction_shared:
Shared Memory Reduction
=======================
Numba exposes many CUDA features, including :ref:`shared memory
<cuda-shared-memory>`. To demonstrate shared memory, let's reimplement a
famous CUDA solution for summing a vector which works by "folding" the data up
using a successively smaller number of threads.
Note that this is a fairly naive implementation, and there are more efficient ways of implementing reductions
using Numba - see :ref:`cuda_montecarlo` for an example.
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_reduction.py
:language: python
:caption: from ``test_ex_reduction`` in ``numba/cuda/tests/doc_examples/test_reduction.py``
:start-after: ex_reduction.import.begin
:end-before: ex_reduction.import.end
:dedent: 8
:linenos:
Let's create some one dimensional data that we'll use to demonstrate the
kernel itself:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_reduction.py
:language: python
:caption: from ``test_ex_reduction`` in ``numba/cuda/tests/doc_examples/test_reduction.py``
:start-after: ex_reduction.allocate.begin
:end-before: ex_reduction.allocate.end
:dedent: 8
:linenos:
Here is a version of the kernel implemented using Numba:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_reduction.py
:language: python
:caption: from ``test_ex_reduction`` in ``numba/cuda/tests/doc_examples/test_reduction.py``
:start-after: ex_reduction.kernel.begin
:end-before: ex_reduction.kernel.end
:dedent: 8
:linenos:
We can run the kernel and verify that the same result is obtained by
summing the data on the host as follows:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_reduction.py
:language: python
:caption: from ``test_ex_reduction`` in ``numba/cuda/tests/doc_examples/test_reduction.py``
:start-after: ex_reduction.launch.begin
:end-before: ex_reduction.launch.end
:dedent: 8
:linenos:
This algorithm can be greatly improved upon by redesigning the inner loop
to use sequential memory accesses, and even further by using strategies that
keep more threads active and working, since in this example most threads quickly
become idle.
.. _cuda_sessionization:
Dividing Click Data into Sessions
=================================
A common problem in business analytics is that of grouping the activity of users of an online platform into
sessions, called "sessionization". The idea is that users generally traverse through a website and perform
various actions (clicking something, filling out a form, etc.) in discrete groups. Perhaps a customer spends
some time shopping for an item in the morning and then again at night - often the business is interested in
treating these periods as separate interactions with their service, and this creates the problem of
programmatically splitting up activity in some agreed-upon way.
Here we'll illustrate how to write a Numba kernel to solve this problem. We'll start with data
containing two fields: let ``user_id`` represent a unique ID corresponding to an individual customer, and let
``action_time`` be a time that some unknown action was taken on the service. Right now, we'll assume there's
only one type of action, so all there is to know is when it happened.
Our goal will be to create a new column called ``session_id``, which contains a label corresponding to a unique
session. We'll define the boundary between sessions as when there has been at least one hour between clicks.
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_sessionize.py
:language: python
:caption: from ``test_ex_sessionize`` in ``numba/cuda/tests/doc_examples/test_sessionize.py``
:start-after: ex_sessionize.import.begin
:end-before: ex_sessionize.import.end
:dedent: 8
:linenos:
Here is a solution using Numba:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_sessionize.py
:language: python
:caption: from ``test_ex_sessionize`` in ``numba/cuda/tests/doc_examples/test_sessionize.py``
:start-after: ex_sessionize.kernel.begin
:end-before: ex_sessionize.kernel.end
:dedent: 8
:linenos:
Let's generate some data and try out the kernel:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_sessionize.py
:language: python
:caption: from ``test_ex_sessionize`` in ``numba/cuda/tests/doc_examples/test_sessionize.py``
:start-after: ex_sessionize.allocate.begin
:end-before: ex_sessionize.allocate.end
:dedent: 8
:linenos:
As can be seen above, the kernel successfully divided the first three datapoints from the second three for the first user ID,
and a similar pattern is seen throughout.
.. _cuda_reuse_function:
JIT Function CPU-GPU Compatibility
==================================
This example demonstrates how ``numba.jit`` can be used to JIT compile a function for the CPU, while at the same time making
it available for use inside CUDA kernels. This can be very useful for users who are migrating workflows from CPU to GPU, as
they can directly reuse existing business logic with fewer code changes.
Take the following example function:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py
:language: python
:caption: from ``test_ex_cpu_gpu_compat`` in ``numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py``
:start-after: ex_cpu_gpu_compat.define.begin
:end-before: ex_cpu_gpu_compat.define.end
:dedent: 8
:linenos:
The function ``business_logic`` can be run standalone in compiled form on the CPU:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py
:language: python
:caption: from ``test_ex_cpu_gpu_compat`` in ``numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py``
:start-after: ex_cpu_gpu_compat.cpurun.begin
:end-before: ex_cpu_gpu_compat.cpurun.end
:dedent: 8
:linenos:
It can also be directly reused threadwise inside a GPU kernel. For example one may
generate some vectors to represent ``x``, ``y``, and ``z``:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py
:language: python
:caption: from ``test_ex_cpu_gpu_compat`` in ``numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py``
:start-after: ex_cpu_gpu_compat.allocate.begin
:end-before: ex_cpu_gpu_compat.allocate.end
:dedent: 8
:linenos:
And a Numba kernel referencing the decorated function:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py
:language: python
:caption: from ``test_ex_cpu_gpu_compat`` in ``numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py``
:start-after: ex_cpu_gpu_compat.usegpu.begin
:end-before: ex_cpu_gpu_compat.usegpu.end
:dedent: 8
:linenos:
This kernel can be invoked in the normal way:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py
:language: python
:caption: from ``test_ex_cpu_gpu_compat`` in ``numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py``
:start-after: ex_cpu_gpu_compat.launch.begin
:end-before: ex_cpu_gpu_compat.launch.end
:dedent: 8
:linenos:
.. _cuda_montecarlo:
Monte Carlo Integration
=======================
This example shows how to use Numba to approximate the value of a definite integral by rapidly generating
random numbers on the GPU. A detailed description of the mathematical mechanics of Monte Carlo integration
is out of the scope of this example, but it can briefly be described as an averaging process where the area
under the curve is approximated by taking the average of many rectangles formed by its function values.
In addition, this example shows how to perform reductions in numba using the
:func:`cuda.reduce() <numba.cuda.Reduce>` API.
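The reduction API referred to here turns a binary operation into a full
reduction over an array. As a standalone sketch, separate from the example code
included below:

.. code-block:: python

   import numpy as np
   from numba import cuda

   @cuda.reduce
   def sum_reduce(a, b):
       return a + b

   values = np.arange(1024, dtype=np.float64)
   total = sum_reduce(values)   # performs the summation on the GPU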
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_montecarlo.py
:language: python
:caption: from ``test_ex_montecarlo`` in ``numba/cuda/tests/doc_examples/test_montecarlo.py``
:start-after: ex_montecarlo.import.begin
:end-before: ex_montecarlo.import.end
:dedent: 8
:linenos:
Let's create a variable to control the number of samples drawn:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_montecarlo.py
:language: python
:caption: from ``test_ex_montecarlo`` in ``numba/cuda/tests/doc_examples/test_montecarlo.py``
:start-after: ex_montecarlo.define.begin
:end-before: ex_montecarlo.define.end
:dedent: 8
:linenos:
The following kernel implements the main integration routine:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_montecarlo.py
:language: python
:caption: from ``test_ex_montecarlo`` in ``numba/cuda/tests/doc_examples/test_montecarlo.py``
:start-after: ex_montecarlo.kernel.begin
:end-before: ex_montecarlo.kernel.end
:dedent: 8
:linenos:
This convenience function calls the kernel and performs some
preprocessing and postprocessing steps. Note the use of Numba's reduction API to
take the sum of the array and compute the final result:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_montecarlo.py
:language: python
:caption: from ``test_ex_montecarlo`` in ``numba/cuda/tests/doc_examples/test_montecarlo.py``
:start-after: ex_montecarlo.callfunc.begin
:end-before: ex_montecarlo.callfunc.end
:dedent: 8
:linenos:
We can now use ``mc_integrate`` to compute the definite integral of this function between
two limits:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_montecarlo.py
:language: python
:caption: from ``test_ex_montecarlo`` in ``numba/cuda/tests/doc_examples/test_montecarlo.py``
:start-after: ex_montecarlo.launch.begin
:end-before: ex_montecarlo.launch.end
:dedent: 8
:linenos:
.. _cuda-matmul:
Matrix multiplication
=====================
First, import the modules needed for this example:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_matmul.py
:language: python
:caption: from ``test_ex_matmul`` in ``numba/cuda/tests/doc_examples/test_matmul.py``
:start-after: magictoken.ex_import.begin
:end-before: magictoken.ex_import.end
:dedent: 8
:linenos:
Here is a naïve implementation of matrix multiplication using a CUDA kernel:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_matmul.py
:language: python
:caption: from ``test_ex_matmul`` in ``numba/cuda/tests/doc_examples/test_matmul.py``
:start-after: magictoken.ex_matmul.begin
:end-before: magictoken.ex_matmul.end
:dedent: 8
:linenos:
An example usage of this function is as follows:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_matmul.py
:language: python
:caption: from ``test_ex_matmul`` in ``numba/cuda/tests/doc_examples/test_matmul.py``
:start-after: magictoken.ex_run_matmul.begin
:end-before: magictoken.ex_run_matmul.end
:dedent: 8
:linenos:
This implementation is straightforward and intuitive but performs poorly,
because the same matrix elements will be loaded multiple times from device
memory, which is slow (some devices may have transparent data caches, but
they may not be large enough to hold the entire inputs at once).
It will be faster if we use a blocked algorithm to reduce accesses to the
device memory. CUDA provides a fast :ref:`shared memory <cuda-shared-memory>`
for threads in a block to cooperatively compute on a task. The following
implements a faster version of the square matrix multiplication using shared
memory:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_matmul.py
:language: python
:caption: from ``test_ex_matmul`` in ``numba/cuda/tests/doc_examples/test_matmul.py``
:start-after: magictoken.ex_fast_matmul.begin
:end-before: magictoken.ex_fast_matmul.end
:dedent: 8
:linenos:
Because the shared memory is a limited resource, the code preloads a small
block at a time from the input arrays. Then, it calls
:func:`~numba.cuda.syncthreads` to wait until all threads have finished
preloading before doing the computation on the shared memory.
It synchronizes again after the computation to ensure all threads
have finished with the data in shared memory before overwriting it
in the next loop iteration.
An example usage of the ``fast_matmul`` function is as follows:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_matmul.py
:language: python
:caption: from ``test_ex_matmul`` in ``numba/cuda/tests/doc_examples/test_matmul.py``
:start-after: magictoken.ex_run_fast_matmul.begin
:end-before: magictoken.ex_run_fast_matmul.end
:dedent: 8
:linenos:
This passes a :ref:`CUDA memory check test <debugging-cuda-python-code>`, which
can help with debugging. Running the code above produces the following output:
.. code-block:: none
$ python fast_matmul.py
[[ 6. 6. 6. 6.]
[22. 22. 22. 22.]
[38. 38. 38. 38.]
[54. 54. 54. 54.]]
[[ 6. 6. 6. 6.]
[22. 22. 22. 22.]
[38. 38. 38. 38.]
[54. 54. 54. 54.]]
.. note:: For high performance matrix multiplication in CUDA, see also the `CuPy implementation <https://docs.cupy.dev/en/stable/reference/generated/cupy.matmul.html>`_.
The approach outlined here generalizes to non-square matrix multiplication by
adjusting the ``blockspergrid`` variable accordingly.
Again, here is an example usage:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_matmul.py
:language: python
:caption: from ``test_ex_matmul`` in ``numba/cuda/tests/doc_examples/test_matmul.py``
:start-after: magictoken.ex_run_nonsquare.begin
:end-before: magictoken.ex_run_nonsquare.end
:dedent: 8
:linenos:
and the corresponding output:
.. code-block:: none
$ python nonsquare_matmul.py
[[ 253. 253. 253. 253. 253. 253. 253.]
[ 782. 782. 782. 782. 782. 782. 782.]
[1311. 1311. 1311. 1311. 1311. 1311. 1311.]
[1840. 1840. 1840. 1840. 1840. 1840. 1840.]
[2369. 2369. 2369. 2369. 2369. 2369. 2369.]]
[[ 253. 253. 253. 253. 253. 253. 253.]
[ 782. 782. 782. 782. 782. 782. 782.]
[1311. 1311. 1311. 1311. 1311. 1311. 1311.]
[1840. 1840. 1840. 1840. 1840. 1840. 1840.]
[2369. 2369. 2369. 2369. 2369. 2369. 2369.]]
.. _cuda_ufunc_call_example:
Calling a NumPy UFunc
=====================
UFuncs supported in the CUDA target (see :ref:`cuda_numpy_support`) can be
called inside kernels, but the output array must be passed in as a positional
argument. The following example demonstrates a call to :func:`np.sin` inside a
kernel following this pattern:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_ufunc.py
:language: python
:caption: from ``test_ex_cuda_ufunc_call`` in ``numba/cuda/tests/doc_examples/test_ufunc.py``
:start-after: ex_cuda_ufunc.begin
:end-before: ex_cuda_ufunc.end
:dedent: 8
:linenos:
.. _cuda-emm-plugin:
=================================================
External Memory Management (EMM) Plugin interface
=================================================
The :ref:`CUDA Array Interface <cuda-array-interface>` enables sharing of data
between different Python libraries that access CUDA devices. However, each
library manages its own memory distinctly from the others. For example:
- By default, Numba allocates memory on CUDA devices by interacting with the
CUDA driver API to call functions such as ``cuMemAlloc`` and ``cuMemFree``,
which is suitable for many use cases.
- The RAPIDS libraries (cuDF, cuML, etc.) use the `RAPIDS Memory Manager (RMM)
<https://github.com/rapidsai/rmm>`_ for allocating device memory.
- `CuPy <https://cupy.chainer.org/>`_ includes a `memory pool implementation
<https://docs-cupy.chainer.org/en/stable/reference/memory.html>`_ for both
device and pinned memory.
When multiple CUDA-aware libraries are used together, it may be preferable for
Numba to defer to another library for memory management. The EMM Plugin
interface facilitates this, by enabling Numba to use another CUDA-aware library
for all allocations and deallocations.
An EMM Plugin is used to facilitate the use of an external library for memory
management. An EMM Plugin can be a part of an external library, or could be
implemented as a separate library.
Overview of External Memory Management
======================================
When an EMM Plugin is in use (see :ref:`setting-emm-plugin`), Numba will make
memory allocations and deallocations through the Plugin. It will never directly call
functions such as ``cuMemAlloc``, ``cuMemFree``, etc.
EMM Plugins always take responsibility for the management of device memory.
However, not all CUDA-aware libraries also support managing host memory, so a
facility for Numba to continue the management of host memory whilst ceding
control of device memory to the EMM is provided (see
:ref:`host-only-cuda-memory-manager`).
Effects on Deallocation Strategies
----------------------------------
Numba's internal :ref:`deallocation-behavior` is designed to increase efficiency
by deferring deallocations until a significant quantity are pending. It also
provides a mechanism for preventing deallocations entirely during critical
sections, using the :func:`~numba.cuda.defer_cleanup` context manager.
When an EMM Plugin is in use, the deallocation strategy is implemented by the
EMM, and Numba's internal deallocation mechanism is not used. The EMM
Plugin could implement:
- A similar strategy to the Numba deallocation behaviour, or
- Something more appropriate to the plugin - for example, deallocated memory
might immediately be returned to a memory pool.
The ``defer_cleanup`` context manager may behave differently when an EMM Plugin
is in use, so an EMM Plugin should be accompanied by documentation of the
behaviour of ``defer_cleanup`` whilst the plugin is active. For example, a pool
allocator could always immediately return memory to a pool even when the
context manager is in use, but could choose not to free empty pools until
``defer_cleanup`` is no longer in use.
Management of other objects
---------------------------
In addition to memory, Numba manages the allocation and deallocation of
:ref:`events <events>`, :ref:`streams <streams>`, and modules (a module is a
compiled object, which is generated from ``@cuda.jit``\ -ted functions). The
management of events, streams, and modules is unchanged by the use of an EMM
Plugin.
Asynchronous allocation and deallocation
----------------------------------------
The present EMM Plugin interface does not provide support for asynchronous
allocation and deallocation. This may be added to a future version of the
interface.
Implementing an EMM Plugin
==========================
An EMM Plugin is implemented by deriving from
:class:`~numba.cuda.BaseCUDAMemoryManager`. A summary of considerations for the
implementation follows, with a minimal sketch after the list:
- Numba instantiates one instance of the EMM Plugin class per context. The
context that owns an EMM Plugin object is accessible through ``self.context``,
if required.
- The EMM Plugin is transparent to any code that uses Numba - all its methods
are invoked by Numba, and never need to be called by code that uses Numba.
- The allocation methods ``memalloc``, ``memhostalloc``, and ``mempin`` should
use the underlying library to allocate and/or pin device or host memory, and
construct an instance of a :ref:`memory pointer <memory-pointers>`
representing the memory to return back to Numba. These methods are always
called when the current CUDA context is the context that owns the EMM Plugin
instance.
- The ``initialize`` method is called by Numba prior to the first use of the EMM
Plugin object for a context. This method should do anything required to
prepare the underlying library for allocations in the current context. This
method may be called multiple times, and must not invalidate previous state
when it is called.
- The ``reset`` method is called when all allocations in the context are to be
cleaned up. It may be called even prior to ``initialize``, and an EMM Plugin
implementation needs to guard against this.
- To support inter-GPU communication, the ``get_ipc_handle`` method should
provide an :class:`~numba.cuda.IpcHandle` for a given
:class:`~numba.cuda.MemoryPointer` instance. This method is part of the EMM
interface (rather than being handled within Numba) because the base address of
the allocation is only known by the underlying library. Closing an IPC handle
is handled internally within Numba.
- Providing memory info from the ``get_memory_info`` method is optional. This
method returns a count of the total and free memory on the device for the
context. It is preferable to implement the method, but this may not be
practical for all allocators. If memory info is not provided, this method
should raise a :class:`RuntimeError`.
- The ``defer_cleanup`` method should return a context manager that ensures that
expensive cleanup operations are avoided whilst it is active. The nuances of
this will vary between plugins, so the plugin documentation should include an
explanation of how deferring cleanup affects deallocations, and performance in
general.
- The ``interface_version`` property is used to ensure that the plugin version
matches the interface provided by the version of Numba. At present, this
should always be 1.
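To illustrate these points, a minimal sketch of a plugin follows.
``my_external_lib`` is a hypothetical allocator library standing in for
whichever library the plugin wraps; the calls made on it are placeholders
rather than a real API:

.. code:: python

    import ctypes
    from contextlib import contextmanager

    from numba import cuda

    import my_external_lib   # hypothetical external allocator library


    class MyEMMPlugin(cuda.BaseCUDAMemoryManager):
        def initialize(self):
            # Prepare the external allocator for the current context. This may
            # be called more than once and must not discard earlier state.
            my_external_lib.ensure_initialized()

        def memalloc(self, size):
            # Allocate device memory with the external library and wrap the
            # raw pointer in a MemoryPointer. The finalizer returns the
            # allocation to the external library when it is no longer needed.
            ptr = my_external_lib.device_alloc(size)

            def finalizer():
                my_external_lib.device_free(ptr)

            return cuda.MemoryPointer(self.context, ctypes.c_uint64(ptr),
                                      size, finalizer=finalizer)

        def memhostalloc(self, size, mapped=False, portable=False, wc=False):
            # Host memory is not handled in this sketch; see the Host-Only
            # CUDA Memory Manager below for a way to avoid implementing it.
            raise NotImplementedError

        def mempin(self, owner, pointer, size, mapped=False):
            raise NotImplementedError

        def get_ipc_handle(self, memory):
            # See the IPC section below for guidance on constructing a handle.
            raise NotImplementedError

        def get_memory_info(self):
            free, total = my_external_lib.mem_get_info()
            return cuda.MemoryInfo(free=free, total=total)

        def reset(self):
            # May be called before initialize(), so guard against that.
            if my_external_lib.is_initialized():
                my_external_lib.release_all()

        @contextmanager
        def defer_cleanup(self):
            # Avoid expensive cleanup in the external library while active.
            with my_external_lib.defer_frees():
                yield

        @property
        def interface_version(self):
            return 1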
Full documentation for the base class follows:
.. autoclass:: numba.cuda.BaseCUDAMemoryManager
:members: memalloc, memhostalloc, mempin, initialize, get_ipc_handle,
get_memory_info, reset, defer_cleanup, interface_version
:member-order: bysource
.. _host-only-cuda-memory-manager:
The Host-Only CUDA Memory Manager
---------------------------------
Some external memory managers will support management of on-device memory but
not host memory. For implementing EMM Plugins using one of these memory
managers, a partial implementation of a plugin that implements host-side
allocation and pinning is provided. To use it, derive from
:class:`~numba.cuda.HostOnlyCUDAMemoryManager` instead of
:class:`~numba.cuda.BaseCUDAMemoryManager`. Guidelines for using this class
are:
- The host-only memory manager implements ``memhostalloc`` and ``mempin`` - the
EMM Plugin should still implement ``memalloc``.
- If ``reset`` is overridden, it must also call ``super().reset()`` to allow the
host allocations to be cleaned up.
- If ``defer_cleanup`` is overridden, it must hold an active context manager
from ``super().defer_cleanup()`` to ensure that host-side cleanup is also
deferred.
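A partial sketch following these guidelines is shown below. ``my_pool`` is a
hypothetical device memory pool, and the remaining required methods
(``get_ipc_handle`` and ``get_memory_info``) are omitted for brevity:

.. code:: python

    import ctypes
    from contextlib import contextmanager

    from numba import cuda

    import my_pool   # hypothetical device memory pool


    class MyHostAwarePlugin(cuda.HostOnlyCUDAMemoryManager):
        # memhostalloc and mempin are inherited; only device allocation
        # needs to be provided here.

        def memalloc(self, size):
            ptr = my_pool.allocate(size)
            return cuda.MemoryPointer(self.context, ctypes.c_uint64(ptr),
                                      size,
                                      finalizer=lambda: my_pool.free(ptr))

        def reset(self):
            # Call super().reset() so host allocations are also cleaned up.
            super().reset()
            my_pool.release_all()

        @contextmanager
        def defer_cleanup(self):
            # Hold the superclass context manager so that host-side cleanup
            # is also deferred whilst this one is active.
            with super().defer_cleanup():
                with my_pool.defer_frees():
                    yield

        @property
        def interface_version(self):
            return 1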
Documentation for the methods of :class:`~numba.cuda.HostOnlyCUDAMemoryManager`
follows:
.. autoclass:: numba.cuda.HostOnlyCUDAMemoryManager
:members: memhostalloc, mempin, reset, defer_cleanup
:member-order: bysource
The IPC Handle Mixin
--------------------
An implementation of the ``get_ipc_handle()`` function is provided in the
``GetIpcHandleMixin`` class. This uses the driver API to determine the base
address of an allocation for opening an IPC handle. If this implementation is
appropriate for an EMM plugin, it can be added by mixing in the
``GetIpcHandleMixin`` class:
.. autoclass:: numba.cuda.GetIpcHandleMixin
:members: get_ipc_handle
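For example, a hypothetical plugin class can gain ``get_ipc_handle`` by listing
the mixin before the memory manager base class:

.. code:: python

    from numba import cuda

    class MyPlugin(cuda.GetIpcHandleMixin, cuda.HostOnlyCUDAMemoryManager):
        # get_ipc_handle() is supplied by the mixin; the remaining methods
        # (memalloc, get_memory_info, etc.) are implemented as sketched above.
        ...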
Classes and structures of returned objects
==========================================
This section provides an overview of the classes and structures that need to be
constructed by an EMM Plugin.
.. _memory-pointers:
Memory Pointers
---------------
EMM Plugins should construct memory pointer instances that represent their
allocations, for return to Numba. The appropriate memory pointer class to use in
each method is:
- :class:`~numba.cuda.MemoryPointer`: returned from ``memalloc``
- :class:`~numba.cuda.MappedMemory`: returned from ``memhostalloc`` or
``mempin`` when the host memory is mapped into the device memory space.
- :class:`~numba.cuda.PinnedMemory`: returned from ``memhostalloc`` or ``mempin``
when the host memory is not mapped into the device memory space.
Memory pointers can take a finalizer, which is a function that is called when
the buffer is no longer needed. Usually the finalizer will make a call to the
memory management library (either internal to Numba, or external if allocated
by an EMM Plugin) to inform it that the memory is no longer required, and that
it could potentially be freed and/or unpinned. The memory manager may choose to
defer actually cleaning up the memory to any later time after the finalizer
runs - it is not required to free the buffer immediately.
Documentation for the memory pointer classes follows.
.. autoclass:: numba.cuda.MemoryPointer
The ``AutoFreePointer`` class need not be used directly, but is documented here
as it is subclassed by :class:`numba.cuda.MappedMemory`:
.. autoclass:: numba.cuda.cudadrv.driver.AutoFreePointer
.. autoclass:: numba.cuda.MappedMemory
.. autoclass:: numba.cuda.PinnedMemory
Memory Info
-----------
If an implementation of
:meth:`~numba.cuda.BaseCUDAMemoryManager.get_memory_info` is to provide a
result, then it should return an instance of the ``MemoryInfo`` named tuple:
.. autoclass:: numba.cuda.MemoryInfo
IPC
---
An instance of ``IpcHandle`` is required to be returned from an implementation
of :meth:`~numba.cuda.BaseCUDAMemoryManager.get_ipc_handle`:
.. autoclass:: numba.cuda.IpcHandle
Guidance for constructing an IPC handle in the context of implementing an EMM
Plugin follows, with a sketch after the list:
- The ``memory`` parameter passed to the ``get_ipc_handle`` method of an EMM
Plugin can be passed as the ``base`` parameter.
- A suitable type for the ``handle`` can be constructed as ``ctypes.c_byte *
64``. The data for ``handle`` must be populated using a method for obtaining a
CUDA IPC handle appropriate to the underlying library.
- ``size`` should match the size of the original allocation, which can be
obtained with ``memory.size`` in ``get_ipc_handle``.
- An appropriate value for ``source_info`` can be created by calling
``self.context.device.get_device_identity()``.
- If the underlying memory does not point to the base of an allocation returned
by the CUDA driver or runtime API (e.g. if a pool allocator is in use) then
the ``offset`` from the base must be provided.
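A sketch following this guidance is shown below. ``my_external_lib.export_ipc``
is a hypothetical call standing in for whatever mechanism the underlying
library provides for obtaining the raw IPC handle and the offset of the
allocation from its base:

.. code:: python

    import ctypes

    from numba import cuda

    import my_external_lib   # hypothetical external allocator library


    class MyPlugin(cuda.HostOnlyCUDAMemoryManager):
        def get_ipc_handle(self, memory):
            # Hypothetical: obtain the raw 64-byte CUDA IPC handle for the
            # allocation containing ``memory``, and its offset from the base.
            raw_handle, offset = my_external_lib.export_ipc(memory)
            handle = (ctypes.c_byte * 64)(*raw_handle)
            source_info = self.context.device.get_device_identity()
            return cuda.IpcHandle(memory, handle, memory.size,
                                  source_info=source_info, offset=offset)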
.. _setting-emm-plugin:
Setting the EMM Plugin
======================
By default, Numba uses its internal memory management - if an EMM Plugin is to
be used, it must be configured. There are two mechanisms for configuring the use
of an EMM Plugin: an environment variable, and a function.
Environment variable
--------------------
A module name can be provided in the environment variable,
``NUMBA_CUDA_MEMORY_MANAGER``. If this environment variable is set, Numba will
attempt to import the module and use its ``_numba_memory_manager`` global
variable as the memory manager class. This is primarily useful for running the
Numba test suite with an EMM Plugin, e.g.:
.. code::
$ NUMBA_CUDA_MEMORY_MANAGER=rmm python -m numba.runtests numba.cuda.tests
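For example, a hypothetical module ``my_emm_plugin`` that could be selected
with ``NUMBA_CUDA_MEMORY_MANAGER=my_emm_plugin`` might look like:

.. code:: python

    # my_emm_plugin.py
    from numba import cuda

    class MyEMMPlugin(cuda.HostOnlyCUDAMemoryManager):
        ...   # implemented as sketched in the preceding sections

    # Numba looks up this module-level name when the environment variable
    # is set.
    _numba_memory_manager = MyEMMPlugin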
Function
--------
The :func:`~numba.cuda.set_memory_manager` function can be used to set the
memory manager at runtime. This should be called prior to the initialization of
any contexts, as EMM Plugin instances are instantiated along with contexts.
.. autofunction:: numba.cuda.set_memory_manager
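For example, with ``MyEMMPlugin`` being a hypothetical plugin class such as
those sketched above:

.. code:: python

    from numba import cuda

    # Must run before any CUDA context has been created.
    cuda.set_memory_manager(MyEMMPlugin)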
Resetting the memory manager
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
It is recommended that the memory manager is set once prior to using any CUDA
functionality, and left unchanged for the remainder of execution. It is possible
to set the memory manager multiple times, noting the following:
* At the time of their creation, contexts are bound to an instance of a memory
manager for their lifetime.
* Changing the memory manager will have no effect on existing contexts - only
contexts created after the memory manager was updated will use instances of
the new memory manager.
* :func:`numba.cuda.close` can be used to destroy contexts after setting the
memory manager so that they get re-created with the new memory manager.
- This will invalidate any arrays, streams, events, and modules owned by the
context.
- Attempting to use invalid arrays, streams, or events will likely fail with
an exception being raised due to a ``CUDA_ERROR_INVALID_CONTEXT`` or
``CUDA_ERROR_CONTEXT_IS_DESTROYED`` return code from a Driver API function.
- Attempting to use an invalid module will result in similar, or in some
cases a segmentation fault / access violation.
.. note:: The invalidation of modules means that all functions compiled with
``@cuda.jit`` prior to context destruction will need to be
redefined, as the code underlying them will also have been unloaded
from the GPU.
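Putting these points together, a hypothetical sequence for switching between
two plugin classes (``MyPluginA`` and ``MyPluginB``) might look like:

.. code:: python

    from numba import cuda

    cuda.set_memory_manager(MyPluginA)
    # ... use CUDA; contexts created here are bound to MyPluginA ...

    cuda.close()                        # destroy existing contexts
    cuda.set_memory_manager(MyPluginB)
    # Contexts created from now on use MyPluginB. Kernels compiled with
    # @cuda.jit before close() must be re-defined before they are used again.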
.. _cudafaq:
=================================================
CUDA Frequently Asked Questions
=================================================
nvprof reports "No kernels were profiled"
-----------------------------------------
When using the ``nvprof`` tool to profile Numba jitted code for the CUDA
target, the output contains ``No kernels were profiled`` even though kernels
are clearly running. What is going on?
This is quite likely due to the profiling data not being flushed on program
exit, see the `NVIDIA CUDA documentation
<http://docs.nvidia.com/cuda/profiler-users-guide/#flush-profile-data>`_ for
details. To fix this simply add a call to ``numba.cuda.profile_stop()`` prior
to the exit point in your program (or wherever you want to stop profiling).
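For example:

.. code:: python

    from numba import cuda

    # ... launch and run kernels ...

    # Flush profiling data so that nvprof sees the kernel launches.
    cuda.profile_stop()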
For more on CUDA profiling support in Numba, see :ref:`cuda-profiling`.
.. _cuda-fast-math:
CUDA Fast Math
==============
As noted in :ref:`fast-math`, for certain classes of applications that utilize
floating point, strict IEEE-754 conformance is not required. For this subset of
applications, performance speedups may be possible.
The CUDA target implements :ref:`fast-math` behavior with two differences.
* First, the ``fastmath`` argument to the :func:`@jit decorator
<numba.cuda.jit>` is limited to the values ``True`` and ``False``.
When ``True``, the following optimizations are enabled:
- Flushing of denormals to zero.
- Use of a fast approximation to the square root function.
- Use of a fast approximation to the division operation.
- Contraction of multiply and add operations into single fused multiply-add
operations.
See the `documentation for nvvmCompileProgram <https://docs.nvidia.com/cuda/libnvvm-api/group__compilation.html#group__compilation_1g76ac1e23f5d0e2240e78be0e63450346>`_ for more details of these optimizations.
* Secondly, calls to a subset of math module functions on ``float32`` operands
will be implemented using fast approximate implementations from the libdevice
library.
- :func:`math.cos`: Implemented using `__nv_fast_cosf <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_cosf.html>`_.
- :func:`math.sin`: Implemented using `__nv_fast_sinf <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_sinf.html>`_.
- :func:`math.tan`: Implemented using `__nv_fast_tanf <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_tanf.html>`_.
- :func:`math.exp`: Implemented using `__nv_fast_expf <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_expf.html>`_.
- :func:`math.log2`: Implemented using `__nv_fast_log2f <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_log2f.html>`_.
- :func:`math.log10`: Implemented using `__nv_fast_log10f <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_log10f.html>`_.
- :func:`math.log`: Implemented using `__nv_fast_logf <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_logf.html>`_.
- :func:`math.pow`: Implemented using `__nv_fast_powf <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_powf.html>`_.
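For example, the following sketch enables fast-math for a kernel so that the
``math.sin`` call on ``float32`` operands uses the fast approximate
implementation:

.. code:: python

    import math
    import numpy as np
    from numba import cuda

    @cuda.jit(fastmath=True)
    def fast_sine(x, out):
        i = cuda.grid(1)
        if i < x.size:
            out[i] = math.sin(x[i])

    x = np.linspace(0, 2 * np.pi, 256, dtype=np.float32)
    out = np.zeros_like(x)
    fast_sine[1, 256](x, out)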
.. _cuda-index:
Numba for CUDA GPUs
===================
.. toctree::
overview.rst
kernels.rst
memory.rst
device-functions.rst
cudapysupported.rst
fastmath.rst
intrinsics.rst
cooperative_groups.rst
random.rst
device-management.rst
examples.rst
simulator.rst
reduction.rst
ufunc.rst
ipc.rst
cuda_array_interface.rst
external-memory.rst
bindings.rst
cuda_ffi.rst
caching.rst
minor_version_compatibility.rst
faq.rst