#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Numba documentation build configuration file, created by
# sphinx-quickstart on Tue Dec 30 11:55:40 2014.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
import sys
import os
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
try:
    # Numba is installed
    import numba
except ImportError:
    # Numba is run from its source checkout
    sys.path.insert(0, os.path.abspath('../..'))
    import numba

on_rtd = os.environ.get('READTHEDOCS') == 'True'
if on_rtd:
    # The following is needed to fix RTD issue with numpydoc
    # https://github.com/readthedocs/sphinx_rtd_theme/issues/766
    from conda.cli.python_api import run_command as conda_cmd
    conda_cmd("install", "-c", "conda-forge", "sphinx_rtd_theme>=0.5.1", "-y")
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.intersphinx',
    'sphinx.ext.todo',
    #'sphinx.ext.mathjax',
    'sphinx.ext.autodoc',
    #'sphinx.ext.graphviz',
    'numpydoc',
]
# Adding the github files extension
sys.path.append(os.path.abspath(os.path.join(".", "_ext")))
extensions.append('ghfiles')
todo_include_todos = True
# Add any paths that contain templates here, relative to this directory.
templates_path = ['../_templates']
# The suffix of source filenames.
source_suffix = '.rst'
# The encoding of source files.
#source_encoding = 'utf-8-sig'
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = u'Numba'
copyright = u'2012-2020, Anaconda, Inc. and others'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
version = '.'.join(numba.__version__.split('.')[:2])
# The full version, including alpha/beta/rc tags.
release = numba.__version__
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#language = None
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = []
# The reST default role (used for this markup: `text`) to use for all
# documents.
#default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []
# If true, keep warnings as "system message" paragraphs in the built documents.
#keep_warnings = False
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = 'sphinx_rtd_theme'
# All sphinx_rtd_theme options. Default values commented out; uncomment to
# change.
html_theme_options = {
    'canonical_url': 'https://numba.readthedocs.io/en/stable/',
    # 'logo_only': False,
    # 'display_version': True,
    # 'prev_next_buttons_location': 'bottom',
    'style_external_links': True,
    # 'vcs_pageview_mode': '',
    'style_nav_header_background': '#00A3E0',
    # Toc options
    'collapse_navigation': False,
    # 'sticky_navigation': True,
    # 'navigation_depth': 4,
    # 'includehidden': True,
    # 'titles_only': False
}
# Add any paths that contain custom themes here, relative to this directory.
#html_theme_path = None
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
#html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
#html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
html_logo = "../_static/numba-white-icon-rgb.svg"
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
html_favicon = '../_static/numba-blue-icon-rgb.svg'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['../_static']
# Add any extra paths that contain custom files (such as robots.txt or
# .htaccess) here, relative to this directory. These files are copied
# directly to the root of the documentation.
#html_extra_path = []
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
#html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names to
# template names.
#html_additional_pages = {}
# If false, no module index is generated.
#html_domain_indices = True
# If false, no index is generated.
#html_use_index = True
# If true, the index is split into individual pages for each letter.
#html_split_index = False
# If true, links to the reST sources are added to the pages.
#html_show_sourcelink = True
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
#html_show_sphinx = True
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
#html_show_copyright = True
# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
#html_use_opensearch = ''
# This is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = None
# Output file base name for HTML help builder.
htmlhelp_basename = 'Numbadoc'
# -- Options for LaTeX output ---------------------------------------------
latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #'papersize': 'letterpaper',

    # The font size ('10pt', '11pt' or '12pt').
    #'pointsize': '10pt',

    # Additional stuff for the LaTeX preamble.
    #'preamble': '',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
    ('index', 'numba.tex', u'Numba Documentation',
     u'Anaconda', 'manual'),
]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
#latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#latex_use_parts = False
# If true, show page references after internal links.
#latex_show_pagerefs = False
# If true, show URL addresses after external links.
#latex_show_urls = False
# Documents to append as an appendix to all manuals.
#latex_appendices = []
# If false, no module index is generated.
#latex_domain_indices = True
# -- Options for manual page output ---------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    ('index', 'numba', 'Numba Documentation',
     ['Anaconda'], 1)
]
# If true, show URL addresses after external links.
#man_show_urls = False
# -- Options for Texinfo output -------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
    ('index', 'Numba', 'Numba Documentation',
     'Anaconda', 'Numba', 'One line description of project.',
     'Miscellaneous'),
]
# Documents to append as an appendix to all manuals.
#texinfo_appendices = []
# If false, no module index is generated.
#texinfo_domain_indices = True
# How to display URL addresses: 'footnote', 'no', or 'inline'.
#texinfo_show_urls = 'footnote'
# If true, do not generate a @detailmenu in the "Top" node's menu.
#texinfo_no_detailmenu = False
# Configuration for intersphinx: refer to the Python standard library
# and the Numpy documentation.
intersphinx_mapping = {
    'python': ('https://docs.python.org/3', None),
    'numpy': ('https://numpy.org/doc/stable/', None),
    'llvmlite': ('https://llvmlite.readthedocs.io/en/latest/', None),
}
# numpydoc options
# To silence "WARNING: toctree contains reference to nonexisting document"
numpydoc_show_class_members = False
# -- Custom autogeneration ------------------------------------------------
def _autogenerate():
    from numba.scripts.generate_lower_listing import gen_lower_listing
    from numba.misc.help.inspector import write_listings

    basedir = os.path.dirname(__file__)
    gen_lower_listing(os.path.join(basedir,
                                   'developer/autogen_lower_listing.rst'))
    # Run inspector on supported packages
    for package in ['builtins', 'math', 'cmath', 'numpy']:
        write_listings(
            package_name=package,
            filename=os.path.join(
                basedir, 'developer', 'autogen_{}_listing'.format(package),
            ),
            output_format='rst',
        )

_autogenerate()


def setup(app):
    app.add_css_file('rtd-overrides.css')
CUDA Host API
=============
Device Management
-----------------
Device detection and enquiry
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The following functions are available for querying the available hardware:
.. autofunction:: numba.cuda.is_available
.. autofunction:: numba.cuda.detect
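
As a minimal sketch, a script can guard GPU-specific setup with a quick
availability check:

.. code-block:: python

   from numba import cuda

   if cuda.is_available():
       cuda.detect()  # print a summary of the supported devices found
   else:
       print("No CUDA-capable device is available")
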
Context management
~~~~~~~~~~~~~~~~~~
CUDA Python functions execute within a CUDA context. Each CUDA device in a
system has an associated CUDA context, and Numba presently allows only one context
per thread. For further details on CUDA Contexts, refer to the `CUDA Driver API
Documentation on Context Management
<http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html>`_ and the
`CUDA C Programming Guide Context Documentation
<http://docs.nvidia.com/cuda/cuda-c-programming-guide/#context>`_. CUDA Contexts
are instances of the :class:`~numba.cuda.cudadrv.driver.Context` class:
.. autoclass:: numba.cuda.cudadrv.driver.Context
   :members: reset, get_memory_info, push, pop
The following functions can be used to get or select the context:
.. autofunction:: numba.cuda.current_context
.. autofunction:: numba.cuda.require_context
The following functions affect the current context:
.. autofunction:: numba.cuda.synchronize
.. autofunction:: numba.cuda.close
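
As a brief sketch, a typical use is to query the context for free memory and
then wait for outstanding work to complete:

.. code-block:: python

   from numba import cuda

   ctx = cuda.current_context()         # get (or create) this thread's context
   free, total = ctx.get_memory_info()  # free and total device memory, in bytes
   print(free, total)
   cuda.synchronize()                   # block until all queued work completes
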
Device management
~~~~~~~~~~~~~~~~~
Numba maintains a list of supported CUDA-capable devices:
.. attribute:: numba.cuda.gpus

   An indexable list of supported CUDA devices. This list is indexed by integer
   device ID.

Alternatively, the current device can be obtained:

.. function:: numba.cuda.gpus.current

   Return the currently-selected device.
Getting a device through :attr:`numba.cuda.gpus` always provides an instance of
:class:`numba.cuda.cudadrv.devices._DeviceContextManager`, which acts as a
context manager for the selected device:
.. autoclass:: numba.cuda.cudadrv.devices._DeviceContextManager
One may also select a context and device or get the current device using the
following three functions:
.. autofunction:: numba.cuda.select_device
.. autofunction:: numba.cuda.get_current_device
.. autofunction:: numba.cuda.list_devices
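
As an illustrative sketch, a device can be selected explicitly, or the
selection can be scoped to a block using the device list:

.. code-block:: python

   from numba import cuda

   cuda.select_device(0)  # create/activate a context on device 0
   print(cuda.get_current_device().name)

   # Alternatively, scope the selection using the device list:
   with cuda.gpus[0]:
       print(cuda.get_current_device().name)
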
The :class:`numba.cuda.cudadrv.driver.Device` class can be used to enquire about
the functionality of the selected device:
.. class:: numba.cuda.cudadrv.driver.Device

   The device associated with a particular context.

   .. attribute:: compute_capability

      A tuple, *(major, minor)* indicating the supported compute capability.

   .. attribute:: id

      The integer ID of the device.

   .. attribute:: name

      The name of the device (e.g. "GeForce GTX 970").

   .. attribute:: uuid

      The UUID of the device (e.g. "GPU-e6489c45-5b68-3b03-bab7-0e7c8e809643").

   .. method:: reset

      Delete the context for the device. This will destroy all memory
      allocations, events, and streams created within the context.

   .. attribute:: supports_float16

      Return ``True`` if the device supports float16 operations, ``False``
      otherwise.
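
For example, a short sketch querying the attributes above:

.. code-block:: python

   from numba import cuda

   dev = cuda.get_current_device()
   print(dev.id, dev.name)
   print(dev.compute_capability)  # e.g. (7, 5)
   print(dev.supports_float16)
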
Compilation
-----------
Numba provides an entry point for compiling a Python function to PTX without
invoking any of the driver API. This can be useful for:
- Generating PTX that is to be inlined into other PTX code (e.g. from outside
  the Numba / Python ecosystem).
- Generating code when there is no device present.
- Generating code prior to a fork without initializing CUDA.
.. note:: It is the user's responsibility to manage any ABI issues arising from
   the use of compilation to PTX.
.. autofunction:: numba.cuda.compile_ptx
The environment variable ``NUMBA_CUDA_DEFAULT_PTX_CC`` can be set to control
the default compute capability targeted by ``compile_ptx`` - see
:ref:`numba-envvars-gpu-support`. If PTX for the compute capability of the
current device is required, the ``compile_ptx_for_current_device`` function can
be used:
.. autofunction:: numba.cuda.compile_ptx_for_current_device
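
As a hedged sketch (the ``add`` function and the chosen types and compute
capability here are purely illustrative), compiling a device function to PTX
might look like:

.. code-block:: python

   from numba import cuda, float32

   def add(x, y):
       return x + y

   # Compile as a device function targeting compute capability 7.5
   ptx, resty = cuda.compile_ptx(add, (float32, float32), device=True,
                                 cc=(7, 5))
   print(resty)  # the inferred return type
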
Measurement
-----------
.. _cuda-profiling:
Profiling
~~~~~~~~~
The NVIDIA Visual Profiler can be used directly on executing CUDA Python code -
it is not a requirement to insert calls to these functions into user code.
However, these functions can be used to allow profiling to be performed
selectively on specific portions of the code. For further information on
profiling, see the `NVIDIA Profiler User's Guide
<https://docs.nvidia.com/cuda/profiler-users-guide/>`_.
.. autofunction:: numba.cuda.profile_start
.. autofunction:: numba.cuda.profile_stop
.. autofunction:: numba.cuda.profiling
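
A minimal sketch of selective profiling, using either the explicit calls or
the ``profiling`` context manager:

.. code-block:: python

   from numba import cuda

   cuda.profile_start()
   # ... launch the kernels of interest here ...
   cuda.profile_stop()

   # Equivalently, as a context manager:
   with cuda.profiling():
       # ... launch the kernels of interest here ...
       pass
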
.. _events:
Events
~~~~~~
Events can be used to monitor the progress of execution and to record the
timestamps of specific points being reached. Event creation returns immediately,
and the created event can be queried to determine if it has been reached. For
further information, see the `CUDA C Programming Guide Events section
<http://docs.nvidia.com/cuda/cuda-c-programming-guide/#events>`_.
The following functions are used for creating and measuring the time between
events:
.. autofunction:: numba.cuda.event
.. autofunction:: numba.cuda.event_elapsed_time
Events are instances of the :class:`numba.cuda.cudadrv.driver.Event` class:
.. autoclass:: numba.cuda.cudadrv.driver.Event
   :members: query, record, synchronize, wait
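
As a short sketch, timing a region of device work with a pair of events:

.. code-block:: python

   from numba import cuda

   start = cuda.event()
   end = cuda.event()

   start.record()
   # ... launch the kernel to be timed here ...
   end.record()
   end.synchronize()
   print(cuda.event_elapsed_time(start, end))  # elapsed time in milliseconds
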
.. _streams:
Stream Management
-----------------
Streams allow concurrency of execution on a single device within a given
context. Queued work items in the same stream execute sequentially, but work
items in different streams may execute concurrently. Most operations involving a
CUDA device can be performed asynchronously using streams, including data
transfers and kernel execution. For further details on streams, see the `CUDA C
Programming Guide Streams section
<http://docs.nvidia.com/cuda/cuda-c-programming-guide/#streams>`_.
Numba defaults to using the legacy default stream as the default stream. The
per-thread default stream can be made the default stream by setting the
environment variable ``NUMBA_CUDA_PER_THREAD_DEFAULT_STREAM`` to ``1`` (see the
:ref:`CUDA Environment Variables section <numba-envvars-gpu-support>`).
Regardless of this setting, the objects representing the legacy and per-thread
default streams can be constructed using the functions below.
Streams are instances of :class:`numba.cuda.cudadrv.driver.Stream`:
.. autoclass:: numba.cuda.cudadrv.driver.Stream
   :members: synchronize, auto_synchronize, add_callback, async_done
To create a new stream:
.. autofunction:: numba.cuda.stream
To get the default stream:
.. autofunction:: numba.cuda.default_stream
To get the default stream with an explicit choice of whether it is the legacy
or per-thread default stream:
.. autofunction:: numba.cuda.legacy_default_stream
.. autofunction:: numba.cuda.per_thread_default_stream
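
As a brief sketch, an asynchronous transfer can be queued on a new stream and
waited on with ``auto_synchronize``:

.. code-block:: python

   import numpy as np
   from numba import cuda

   arr = np.arange(1000)
   stream = cuda.stream()

   # On exit, the context manager waits for all work queued on the stream
   with stream.auto_synchronize():
       d_arr = cuda.to_device(arr, stream=stream)
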
To construct a Numba ``Stream`` object using a stream allocated elsewhere, the
``external_stream`` function is provided. Note that the lifetime of external
streams must be managed by the user - Numba will not deallocate an external
stream, and the stream must remain valid whilst the Numba ``Stream`` object is
in use.
.. autofunction:: numba.cuda.external_stream
Runtime
-------
Numba generally uses the Driver API, but it provides a simple wrapper to the
Runtime API so that the version of the runtime in use can be queried. This is
accessed through ``cuda.runtime``, which is an instance of the
:class:`numba.cuda.cudadrv.runtime.Runtime` class:
.. autoclass:: numba.cuda.cudadrv.runtime.Runtime
   :members: get_version, is_supported_version, supported_versions
Whether the current runtime is officially supported and tested with the current
version of Numba can also be queried:
.. autofunction:: numba.cuda.is_supported_version
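
For example, a short sketch of querying the runtime version:

.. code-block:: python

   from numba import cuda

   print(cuda.runtime.get_version())   # e.g. (11, 0)
   print(cuda.is_supported_version())  # True if this runtime is supported
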
CUDA Python Reference
=====================
.. toctree::

   host.rst
   kernel.rst
   types.rst
   memory.rst
   libdevice.rst
Libdevice functions
===================
All wrapped libdevice functions are listed in this section. All functions in
libdevice are wrapped, with the exception of ``__nv_nan`` and ``__nv_nanf``.
These functions return a representation of a quiet NaN, but the argument they
take (a pointer to an object specifying the representation) is undocumented, and
follows an unusual form compared to the rest of libdevice - it is not an output
like every other pointer argument. If a NaN is required, one can be obtained in
CUDA Python by other means, e.g. ``math.nan``.
Wrapped functions
-----------------
.. automodule:: numba.cuda.libdevice
   :members:
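
As an illustrative sketch (assuming ``float32`` input and output arrays), a
wrapped libdevice function is called from a kernel like any other function:

.. code-block:: python

   from numba import cuda
   from numba.cuda import libdevice

   @cuda.jit
   def fast_sines(x, out):
       i = cuda.grid(1)
       if i < x.size:
           # fast_sinf wraps __nv_fast_sinf, a float32 approximation of sine
           out[i] = libdevice.fast_sinf(x[i])
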
Memory Management
=================
.. autofunction:: numba.cuda.to_device
.. autofunction:: numba.cuda.device_array
.. autofunction:: numba.cuda.device_array_like
.. autofunction:: numba.cuda.pinned_array
.. autofunction:: numba.cuda.pinned_array_like
.. autofunction:: numba.cuda.mapped_array
.. autofunction:: numba.cuda.mapped_array_like
.. autofunction:: numba.cuda.managed_array
.. autofunction:: numba.cuda.pinned
.. autofunction:: numba.cuda.mapped
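
A minimal sketch of a round trip using the functions above:

.. code-block:: python

   import numpy as np
   from numba import cuda

   arr = np.arange(10, dtype=np.float32)
   d_arr = cuda.to_device(arr)            # copy the host array to the device
   d_out = cuda.device_array_like(d_arr)  # uninitialized device array
   # ... launch kernels reading d_arr and writing d_out ...
   result = d_out.copy_to_host()          # copy the result back to the host
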
Device Objects
--------------
.. autoclass:: numba.cuda.cudadrv.devicearray.DeviceNDArray
   :members: copy_to_device, copy_to_host, is_c_contiguous, is_f_contiguous,
             ravel, reshape, split

.. autoclass:: numba.cuda.cudadrv.devicearray.DeviceRecord
   :members: copy_to_device, copy_to_host

.. autoclass:: numba.cuda.cudadrv.devicearray.MappedNDArray
   :members: copy_to_device, copy_to_host, split
CUDA-Specific Types
====================
.. note::

   This page is about types specific to CUDA targets. Many other types are also
   available in the CUDA target - see :ref:`cuda-built-in-types`.
Vector Types
~~~~~~~~~~~~
`CUDA Vector Types <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#built-in-vector-types>`_
are usable in kernels. There are two important distinctions from vector types in CUDA C/C++:
First, the recommended names for vector types in Numba CUDA are formatted as ``<base_type>x<N>``,
where ``base_type`` is the base type of the vector, and ``N`` is the number of elements in the vector.
Examples include ``int64x3``, ``uint16x4``, ``float32x4``, etc. For new Numba CUDA kernels,
this is the recommended way to instantiate vector types.
For convenience, users adapting existing kernels from CUDA C/C++ to Python may use
aliases consistent with the C/C++ namings. For example, ``float3`` aliases ``float32x3``,
``long3`` aliases ``int32x3`` or ``int64x3`` (depending on the platform), etc.
Second, unlike CUDA C/C++ where factory functions are used, vector types are constructed directly
with their constructor. For example, to construct a ``float32x3``:
.. code-block:: python3

   from numba.cuda import float32x3

   # In kernel
   f3 = float32x3(0.0, -1.0, 1.0)
Additionally, vector types can be constructed from a combination of vector and
primitive types, as long as the total number of components matches the result
vector type. For example, all of the following constructions are valid:
.. code-block:: python3

   from numba import uint32
   from numba.cuda import uint32x2, uint32x3, uint32x4

   # In kernel
   zero = uint32(0)
   u2 = uint32x2(1, 2)

   # Construct a 3-component vector with a primitive type and a 2-component
   # vector
   u3 = uint32x3(zero, u2)

   # Construct a 4-component vector with two 2-component vectors
   u4 = uint32x4(u2, u2)
The first, second, third, and fourth components of a vector type can be
accessed through the fields ``x``, ``y``, ``z``, and ``w`` respectively. The
components are immutable after construction in the present version of Numba; it
is expected that support for mutating vector components will be added in a
future release.
.. code-block:: python3

   from numba.cuda import float32x2

   v1 = float32x2(1.0, 1.0)
   v2 = float32x2(1.0, -1.0)
   dotprod = v1.x * v2.x + v1.y * v2.y
CUDA Bindings
=============
Numba supports two bindings to the CUDA Driver APIs: its own internal bindings
based on ctypes, and the official `NVIDIA CUDA Python bindings
<https://nvidia.github.io/cuda-python/>`_. Functionality is equivalent between
the two bindings.
The internal bindings are used by default. If the NVIDIA bindings are installed,
then they can be used by setting the environment variable
``NUMBA_CUDA_USE_NVIDIA_BINDING`` to ``1`` prior to the import of Numba. Once
Numba has been imported, the selected binding cannot be changed.
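
One way to ensure the variable is set before Numba is imported, sketched here,
is to set it at the top of the entry-point script (setting it in the shell
environment works equally well):

.. code-block:: python

   import os
   os.environ['NUMBA_CUDA_USE_NVIDIA_BINDING'] = '1'

   # The import must come after the variable is set
   from numba import cuda
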
Per-Thread Default Streams
--------------------------
Responsibility for handling Per-Thread Default Streams (PTDS) is delegated to
the NVIDIA bindings when they are in use. To use PTDS with the NVIDIA bindings,
set the environment variable ``CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM`` to
``1`` instead of Numba's environment variable
:envvar:`NUMBA_CUDA_PER_THREAD_DEFAULT_STREAM`.
.. seealso::

   The `Default Stream section
   <https://nvidia.github.io/cuda-python/release/11.6.0-notes.html#default-stream>`_
   in the NVIDIA Bindings documentation.
Roadmap
-------
In Numba 0.56, the NVIDIA Bindings will be used by default, if they are
installed.
In future versions of Numba:
- The internal bindings will be deprecated.
- The internal bindings will be removed.
At present, no specific release is planned for the deprecation or removal of
the internal bindings.
On-disk Kernel Caching
======================
When the ``cache`` keyword argument of the :func:`@cuda.jit <numba.cuda.jit>`
decorator is ``True``, a file-based cache is enabled. This shortens compilation
times when the function was already compiled in a previous invocation.
The cache is maintained in the ``__pycache__`` subdirectory of the directory
containing the source file; if the current user is not allowed to write to it,
the cache implementation falls back to a platform-specific user-wide cache
directory (such as ``$HOME/.cache/numba`` on Unix platforms).
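
For example, a minimal sketch of a cached kernel:

.. code-block:: python

   from numba import cuda

   @cuda.jit(cache=True)  # compiled versions are saved to, and loaded from, disk
   def increment(x):
       i = cuda.grid(1)
       if i < x.size:
           x[i] += 1
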
Compute capability considerations
---------------------------------
Separate cache files are maintained for each compute capability. When a cached
kernel is loaded, the compute capability of the device the kernel is first
launched on in the current run is used to determine which version to load.
Therefore, on systems that have multiple GPUs with differing compute
capabilities, the cached versions of kernels are only used for one compute
capability, and recompilation will occur for other compute capabilities.
For example: if a system has two GPUs, one of compute capability 7.5 and one of
8.0, then:
* If a cached kernel is first launched on the CC 7.5 device, then the cached
  version for CC 7.5 is used. If it is subsequently launched on the CC 8.0
  device, a recompilation will occur.

* If in a subsequent run the cached kernel is first launched on the CC 8.0
  device, then the cached version for CC 8.0 is used. A subsequent launch on
  the CC 7.5 device will require a recompilation.
This limitation is not expected to present issues in most practical scenarios,
as multi-GPU production systems tend to have identical GPUs within each node.
==================
Cooperative Groups
==================
Supported features
------------------
Numba's Cooperative Groups support presently provides grid groups and grid
synchronization, along with cooperative kernel launches.
Cooperative groups are supported on Linux, and on Windows for devices in `TCC
mode
<https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#tesla-compute-cluster-mode-for-windows>`_.
Using Grid Groups
-----------------
To get the current grid group, use the :meth:`cg.this_grid()
<numba.cuda.cg.this_grid>` function:
.. code-block:: python

   g = cuda.cg.this_grid()
Synchronizing the grid is done with the :meth:`sync()
<numba.cuda.cg.GridGroup.sync>` method of the grid group:
.. code-block:: python

   g.sync()
Cooperative Launches
--------------------
Unlike the CUDA C/C++ API, a cooperative launch is invoked using the same syntax
as a normal kernel launch - Numba automatically determines whether a cooperative
launch is required based on whether a grid group is synchronized in the kernel.
The grid size limit for a cooperative launch is more restrictive than for a
normal launch - the grid must be no larger than the maximum number of active
blocks on the device on which it is launched. To get maximum grid size for a
cooperative launch of a kernel with a given block size and dynamic shared
memory requirement, use the ``max_cooperative_grid_blocks()`` method of kernel
overloads:
.. automethod:: numba.cuda.dispatcher._Kernel.max_cooperative_grid_blocks
This can be used to ensure that the kernel is launched with no more than the
maximum number of blocks. Exceeding the maximum number of blocks for the
cooperative launch will result in a ``CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE``
error.
Applications and Example
------------------------
Grid group synchronization can be used to implement a global barrier across all
threads in the grid - applications of this include a global reduction to a
single value, or looping over rows of a large matrix sequentially using the
entire grid to operate on column elements in parallel.
In the following example, rows are written sequentially by the grid. Each thread
in the grid reads a value from the previous row written by its *opposite*
thread. A grid sync is needed to ensure that threads in the grid don't run ahead
of threads in other blocks, or fail to see updates from their opposite thread.
First we'll define our kernel:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_cg.py
   :language: python
   :caption: from ``test_grid_sync`` of ``numba/cuda/tests/doc_examples/test_cg.py``
   :start-after: magictoken.ex_grid_sync_kernel.begin
   :end-before: magictoken.ex_grid_sync_kernel.end
   :dedent: 8
   :linenos:
Then create some empty input data and determine the grid and block sizes:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_cg.py
   :language: python
   :caption: from ``test_grid_sync`` of ``numba/cuda/tests/doc_examples/test_cg.py``
   :start-after: magictoken.ex_grid_sync_data.begin
   :end-before: magictoken.ex_grid_sync_data.end
   :dedent: 8
   :linenos:
Finally we launch the kernel and print the result:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_cg.py
   :language: python
   :caption: from ``test_grid_sync`` of ``numba/cuda/tests/doc_examples/test_cg.py``
   :start-after: magictoken.ex_grid_sync_launch.begin
   :end-before: magictoken.ex_grid_sync_launch.end
   :dedent: 8
   :linenos:
The maximum grid size for ``sequential_rows`` can be enquired using:
.. code-block:: python

   overload = sequential_rows.overloads[(int32[:, ::1],)]
   max_blocks = overload.max_cooperative_grid_blocks(blockdim)
   print(max_blocks)
   # 1152 (e.g. on Quadro RTX 8000 with Numba 0.52.1 and CUDA 11.0)
Writing Device Functions
========================
CUDA device functions can only be invoked from within the device (by a kernel
or another device function). To define a device function::

    from numba import cuda

    @cuda.jit(device=True)
    def a_device_function(a, b):
        return a + b
Unlike a kernel function, a device function can return a value, just like a
normal Python function.
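
For example, a kernel might call the device function above as follows (a
minimal sketch)::

    from numba import cuda

    @cuda.jit
    def a_kernel(arr):
        i = cuda.grid(1)
        if i < arr.size:
            arr[i] = a_device_function(arr[i], 1.0)
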
Device management
=================
For multi-GPU machines, users may want to select which GPU to use. By default
the CUDA driver selects the fastest GPU as device 0, which is the device Numba
uses by default.

The features introduced on this page are generally not of interest unless
working with systems that have more than one CUDA-capable GPU.
Device Selection
----------------
If device selection is required at all, it must be done before any CUDA
feature is used::

    from numba import cuda
    cuda.select_device(0)

The device can be closed by::

    cuda.close()

Users can then create a new context with another device::

    cuda.select_device(1)  # assuming we have 2 GPUs
.. function:: numba.cuda.select_device(device_id)
   :noindex:

   Create a new CUDA context for the selected *device_id*. *device_id*
   should be the number of the device (starting from 0; the device order
   is determined by the CUDA libraries). The context is associated with
   the current thread. Numba currently allows only one context per thread.

   If successful, this function returns a device instance.

.. XXX document device instances?

.. function:: numba.cuda.close
   :noindex:

   Explicitly close all contexts in the current thread.

   .. note::
      Compiled functions are associated with the CUDA context.
      This makes it not very useful to close and create new devices, though it
      is certainly useful for choosing which device to use when the machine
      has multiple GPUs.
The Device List
===============
The Device List is a list of all the GPUs in the system, and can be indexed to
obtain a context manager that ensures execution on the selected GPU.
.. attribute:: numba.cuda.gpus
   :noindex:

.. attribute:: numba.cuda.cudadrv.devices.gpus

:py:data:`numba.cuda.gpus` is an instance of the ``_DeviceList`` class, from
which the current GPU context can also be retrieved:

.. autoclass:: numba.cuda.cudadrv.devices._DeviceList
   :members: current
   :noindex:
Device UUIDs
============
The UUID of a device (equal to that returned by ``nvidia-smi -L``) is available
in the :attr:`uuid <numba.cuda.cudadrv.driver.Device.uuid>` attribute of a CUDA
device object.
For example, to obtain the UUID of the current device:
.. code-block:: python

   dev = cuda.current_context().device
   # prints e.g. "GPU-e6489c45-5b68-3b03-bab7-0e7c8e809643"
   print(dev.uuid)
.. _cudafaq:
=================================================
CUDA Frequently Asked Questions
=================================================
nvprof reports "No kernels were profiled"
-----------------------------------------
When using the ``nvprof`` tool to profile Numba jitted code for the CUDA
target, the output contains ``No kernels were profiled`` even though kernels
are clearly running. What is going on?
This is quite likely due to the profiling data not being flushed on program
exit, see the `NVIDIA CUDA documentation
<http://docs.nvidia.com/cuda/profiler-users-guide/#flush-profile-data>`_ for
details. To fix this, simply add a call to ``numba.cuda.profile_stop()`` prior
to the exit point in your program (or wherever you want to stop profiling).
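
For example (a minimal sketch):

.. code-block:: python

   from numba import cuda

   # ... application code that launches kernels ...

   cuda.profile_stop()  # flush profiling data before the program exits
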
For more on CUDA profiling support in Numba, see :ref:`cuda-profiling`.
.. _cuda-fast-math:
CUDA Fast Math
==============
As noted in :ref:`fast-math`, for certain classes of applications that utilize
floating point, strict IEEE-754 conformance is not required. For this subset of
applications, performance speedups may be possible.
The CUDA target implements :ref:`fast-math` behavior with two differences.
* First, the ``fastmath`` argument to the :func:`@jit decorator
  <numba.cuda.jit>` is limited to the values ``True`` and ``False``.
  When ``True``, the following optimizations are enabled:

  - Flushing of denormals to zero.
  - Use of a fast approximation to the square root function.
  - Use of a fast approximation to the division operation.
  - Contraction of multiply and add operations into single fused
    multiply-add operations.

  See the `documentation for nvvmCompileProgram
  <https://docs.nvidia.com/cuda/libnvvm-api/group__compilation.html#group__compilation_1g76ac1e23f5d0e2240e78be0e63450346>`_
  for more details of these optimizations.
* Secondly, calls to a subset of math module functions on ``float32`` operands
  will be implemented using fast approximate implementations from the libdevice
  library (a sketch of a kernel using this follows the list below).

  - :func:`math.cos`: Implemented using `__nv_fast_cosf <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_cosf.html>`_.
  - :func:`math.sin`: Implemented using `__nv_fast_sinf <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_sinf.html>`_.
  - :func:`math.tan`: Implemented using `__nv_fast_tanf <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_tanf.html>`_.
  - :func:`math.exp`: Implemented using `__nv_fast_expf <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_expf.html>`_.
  - :func:`math.log2`: Implemented using `__nv_fast_log2f <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_log2f.html>`_.
  - :func:`math.log10`: Implemented using `__nv_fast_log10f <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_log10f.html>`_.
  - :func:`math.log`: Implemented using `__nv_fast_logf <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_logf.html>`_.
  - :func:`math.pow`: Implemented using `__nv_fast_powf <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_powf.html>`_.
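
As a minimal sketch (assuming ``x`` and ``out`` are ``float32`` arrays), a
kernel opting in to these behaviors looks like:

.. code-block:: python

   import math

   from numba import cuda

   @cuda.jit(fastmath=True)
   def fast_kernel(x, out):
       i = cuda.grid(1)
       if i < x.size:
           # With fastmath, math.sin on float32 lowers to __nv_fast_sinf
           out[i] = math.sin(x[i])
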