#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Numba documentation build configuration file, created by
# sphinx-quickstart on Tue Dec 30 11:55:40 2014.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
import sys
import os
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
try:
    # Numba is installed
    import numba
except ImportError:
    # Numba is run from its source checkout
    sys.path.insert(0, os.path.abspath('../..'))
    import numba

on_rtd = os.environ.get('READTHEDOCS') == 'True'
if on_rtd:
    # The following is needed to fix RTD issue with numpydoc
    # https://github.com/readthedocs/sphinx_rtd_theme/issues/766
    from conda.cli.python_api import run_command as conda_cmd
    conda_cmd("install", "-c", "conda-forge", "sphinx_rtd_theme>=0.5.1", "-y")
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.intersphinx',
'sphinx.ext.todo',
#'sphinx.ext.mathjax',
'sphinx.ext.autodoc',
#'sphinx.ext.graphviz',
'numpydoc',
]
# Adding the github files extension
sys.path.append(os.path.abspath(os.path.join(".", "_ext")))
extensions.append('ghfiles')
todo_include_todos = True
# Add any paths that contain templates here, relative to this directory.
templates_path = ['../_templates']
# The suffix of source filenames.
source_suffix = '.rst'
# The encoding of source files.
#source_encoding = 'utf-8-sig'
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = u'Numba'
copyright = u'2012-2020, Anaconda, Inc. and others'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
version = '.'.join(numba.__version__.split('.')[:2])
# The full version, including alpha/beta/rc tags.
release = numba.__version__
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#language = None
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = []
# The reST default role (used for this markup: `text`) to use for all
# documents.
#default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []
# If true, keep warnings as "system message" paragraphs in the built documents.
#keep_warnings = False
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = 'sphinx_rtd_theme'
# All sphinx_rtd_theme options. Default values commented out; uncomment to
# change.
html_theme_options = {
'canonical_url': 'https://numba.readthedocs.io/en/stable/',
# 'logo_only': False,
# 'display_version': True,
# 'prev_next_buttons_location': 'bottom',
'style_external_links': True,
# 'vcs_pageview_mode': '',
'style_nav_header_background': '#00A3E0',
# Toc options
'collapse_navigation': False,
# 'sticky_navigation': True,
# 'navigation_depth': 4,
# 'includehidden': True,
# 'titles_only': False
}
# Add any paths that contain custom themes here, relative to this directory.
#html_theme_path = None
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
#html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
#html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
html_logo = "../_static/numba-white-icon-rgb.svg"
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
html_favicon = '../_static/numba-blue-icon-rgb.svg'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['../_static']
# Add any extra paths that contain custom files (such as robots.txt or
# .htaccess) here, relative to this directory. These files are copied
# directly to the root of the documentation.
#html_extra_path = []
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
#html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names to
# template names.
#html_additional_pages = {}
# If false, no module index is generated.
#html_domain_indices = True
# If false, no index is generated.
#html_use_index = True
# If true, the index is split into individual pages for each letter.
#html_split_index = False
# If true, links to the reST sources are added to the pages.
#html_show_sourcelink = True
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
#html_show_sphinx = True
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
#html_show_copyright = True
# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
#html_use_opensearch = ''
# This is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = None
# Output file base name for HTML help builder.
htmlhelp_basename = 'Numbadoc'
# -- Options for LaTeX output ---------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#'preamble': '',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
('index', 'numba.tex', u'Numba Documentation',
u'Anaconda', 'manual'),
]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
#latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#latex_use_parts = False
# If true, show page references after internal links.
#latex_show_pagerefs = False
# If true, show URL addresses after external links.
#latex_show_urls = False
# Documents to append as an appendix to all manuals.
#latex_appendices = []
# If false, no module index is generated.
#latex_domain_indices = True
# -- Options for manual page output ---------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
('index', 'numba', 'Numba Documentation',
['Anaconda'], 1)
]
# If true, show URL addresses after external links.
#man_show_urls = False
# -- Options for Texinfo output -------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
('index', 'Numba', 'Numba Documentation',
'Anaconda', 'Numba', 'One line description of project.',
'Miscellaneous'),
]
# Documents to append as an appendix to all manuals.
#texinfo_appendices = []
# If false, no module index is generated.
#texinfo_domain_indices = True
# How to display URL addresses: 'footnote', 'no', or 'inline'.
#texinfo_show_urls = 'footnote'
# If true, do not generate a @detailmenu in the "Top" node's menu.
#texinfo_no_detailmenu = False
# Configuration for intersphinx: refer to the Python standard library
# and the Numpy documentation.
intersphinx_mapping = {
'python': ('https://docs.python.org/3', None),
'numpy': ('https://numpy.org/doc/stable/', None),
'llvmlite': ('https://llvmlite.readthedocs.io/en/latest/', None),
}
# numpydoc options
# To silence "WARNING: toctree contains reference to nonexisting document"
numpydoc_show_class_members = False
# -- Custom autogeneration ------------------------------------------------
def _autogenerate():
    from numba.scripts.generate_lower_listing import gen_lower_listing
    from numba.misc.help.inspector import write_listings

    basedir = os.path.dirname(__file__)
    gen_lower_listing(os.path.join(basedir,
                                   'developer/autogen_lower_listing.rst'))
    # Run inspector on supported packages
    for package in ['builtins', 'math', 'cmath', 'numpy']:
        write_listings(
            package_name=package,
            filename=os.path.join(
                basedir, 'developer', 'autogen_{}_listing'.format(package),
            ),
            output_format='rst',
        )

_autogenerate()

def setup(app):
    app.add_css_file('rtd-overrides.css')
CUDA Host API
=============
Device Management
-----------------
Device detection and enquiry
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The following functions are available for querying the available hardware:
.. autofunction:: numba.cuda.is_available
.. autofunction:: numba.cuda.detect
Context management
~~~~~~~~~~~~~~~~~~
CUDA Python functions execute within a CUDA context. Each CUDA device in a
system has an associated CUDA context, and Numba presently allows only one context
per thread. For further details on CUDA Contexts, refer to the `CUDA Driver API
Documentation on Context Management
<http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html>`_ and the
`CUDA C Programming Guide Context Documentation
<http://docs.nvidia.com/cuda/cuda-c-programming-guide/#context>`_. CUDA Contexts
are instances of the :class:`~numba.cuda.cudadrv.driver.Context` class:
.. autoclass:: numba.cuda.cudadrv.driver.Context
:members: reset, get_memory_info, push, pop
The following functions can be used to get or select the context:
.. autofunction:: numba.cuda.current_context
.. autofunction:: numba.cuda.require_context
The following functions affect the current context:
.. autofunction:: numba.cuda.synchronize
.. autofunction:: numba.cuda.close
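The following is a minimal, illustrative sketch of how these functions fit
together, assuming a CUDA-capable device is present:
.. code-block:: python

    from numba import cuda

    if cuda.is_available():
        cuda.detect()                    # print a summary of the detected devices
        ctx = cuda.current_context()     # get (creating if needed) the context
        meminfo = ctx.get_memory_info()  # free and total device memory, in bytes
        print(meminfo.free, meminfo.total)
        cuda.synchronize()               # wait for all work in the current context
        cuda.close()                     # tear down the current thread's context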
Device management
~~~~~~~~~~~~~~~~~
Numba maintains a list of supported CUDA-capable devices:
.. attribute:: numba.cuda.gpus
An indexable list of supported CUDA devices. This list is indexed by integer
device ID.
Alternatively, the current device can be obtained:
.. function:: numba.cuda.gpus.current
Return the currently-selected device.
Getting a device through :attr:`numba.cuda.gpus` always provides an instance of
:class:`numba.cuda.cudadrv.devices._DeviceContextManager`, which acts as a
context manager for the selected device:
.. autoclass:: numba.cuda.cudadrv.devices._DeviceContextManager
One may also select a context and device or get the current device using the
following three functions:
.. autofunction:: numba.cuda.select_device
.. autofunction:: numba.cuda.get_current_device
.. autofunction:: numba.cuda.list_devices
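As an illustrative sketch (device ``0`` is assumed to exist), a device can be
selected either with the context manager obtained by indexing
:attr:`numba.cuda.gpus`, or with :func:`numba.cuda.select_device`:
.. code-block:: python

    from numba import cuda

    # Temporarily make device 0 the active device; the previously selected
    # device is restored when the block exits.
    with cuda.gpus[0]:
        dev = cuda.get_current_device()
        print(dev.id, dev.name, dev.compute_capability)

    # Alternatively, select a device for the remainder of the thread's work,
    # and list all supported devices.
    cuda.select_device(0)
    for d in cuda.list_devices():
        print(d.id, d.name)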
The :class:`numba.cuda.cudadrv.driver.Device` class can be used to enquire about
the functionality of the selected device:
.. class:: numba.cuda.cudadrv.driver.Device
The device associated with a particular context.
.. attribute:: compute_capability
A tuple, *(major, minor)* indicating the supported compute capability.
.. attribute:: id
The integer ID of the device.
.. attribute:: name
The name of the device (e.g. "GeForce GTX 970").
.. attribute:: uuid
The UUID of the device (e.g. "GPU-e6489c45-5b68-3b03-bab7-0e7c8e809643").
.. method:: reset
Delete the context for the device. This will destroy all memory
allocations, events, and streams created within the context.
.. attribute:: supports_float16
Return ``True`` if the device supports float16 operations, ``False``
otherwise.
Compilation
-----------
Numba provides an entry point for compiling a Python function to PTX without
invoking any of the driver API. This can be useful for:
- Generating PTX that is to be inlined into other PTX code (e.g. from outside
the Numba / Python ecosystem).
- Generating code when there is no device present.
- Generating code prior to a fork without initializing CUDA.
.. note:: It is the user's responsibility to manage any ABI issues arising from
the use of compilation to PTX.
.. autofunction:: numba.cuda.compile_ptx
The environment variable ``NUMBA_CUDA_DEFAULT_PTX_CC`` can be set to control
the default compute capability targeted by ``compile_ptx`` - see
:ref:`numba-envvars-gpu-support`. If PTX for the compute capability of the
current device is required, the ``compile_ptx_for_current_device`` function can
be used:
.. autofunction:: numba.cuda.compile_ptx_for_current_device
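For example, the following sketch compiles a simple elementwise function to
PTX for an assumed compute capability of 7.0, without requiring a device to be
present (the function and its signature are illustrative):
.. code-block:: python

    from numba import cuda, float32

    def axpy(r, a, x, y):
        i = cuda.grid(1)
        if i < r.size:
            r[i] = a * x[i] + y[i]

    # The second argument is the tuple of kernel argument types.
    ptx, resty = cuda.compile_ptx(
        axpy, (float32[:], float32, float32[:], float32[:]), cc=(7, 0))
    print(ptx[:200])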
Measurement
-----------
.. _cuda-profiling:
Profiling
~~~~~~~~~
The NVidia Visual Profiler can be used directly on executing CUDA Python code -
it is not a requirement to insert calls to these functions into user code.
However, these functions can be used to allow profiling to be performed
selectively on specific portions of the code. For further information on
profiling, see the `NVidia Profiler User's Guide
<https://docs.nvidia.com/cuda/profiler-users-guide/>`_.
.. autofunction:: numba.cuda.profile_start
.. autofunction:: numba.cuda.profile_stop
.. autofunction:: numba.cuda.profiling
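A sketch of selectively profiling a single kernel launch (the kernel and
launch configuration are illustrative):
.. code-block:: python

    import numpy as np
    from numba import cuda

    @cuda.jit
    def increment(x):
        i = cuda.grid(1)
        if i < x.size:
            x[i] += 1

    arr = np.zeros(1024, dtype=np.float32)

    # Only work enqueued inside the context manager is captured by the profiler.
    with cuda.profiling():
        increment[4, 256](arr)
    cuda.synchronize()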
.. _events:
Events
~~~~~~
Events can be used to monitor the progress of execution and to record the
timestamps of specific points being reached. Event creation returns immediately,
and the created event can be queried to determine if it has been reached. For
further information, see the `CUDA C Programming Guide Events section
<http://docs.nvidia.com/cuda/cuda-c-programming-guide/#events>`_.
The following functions are used for creating and measuring the time between
events:
.. autofunction:: numba.cuda.event
.. autofunction:: numba.cuda.event_elapsed_time
Events are instances of the :class:`numba.cuda.cudadrv.driver.Event` class:
.. autoclass:: numba.cuda.cudadrv.driver.Event
:members: query, record, synchronize, wait
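As an illustrative sketch, events can be used to time a kernel launch (the
kernel itself is a placeholder):
.. code-block:: python

    import numpy as np
    from numba import cuda

    @cuda.jit
    def scale(x, factor):
        i = cuda.grid(1)
        if i < x.size:
            x[i] *= factor

    arr = cuda.to_device(np.ones(1 << 20, dtype=np.float32))

    start = cuda.event()
    end = cuda.event()

    start.record()
    scale[4096, 256](arr, 2.0)
    end.record()
    end.synchronize()
    print("elapsed (ms):", cuda.event_elapsed_time(start, end))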
.. _streams:
Stream Management
-----------------
Streams allow concurrency of execution on a single device within a given
context. Queued work items in the same stream execute sequentially, but work
items in different streams may execute concurrently. Most operations involving a
CUDA device can be performed asynchronously using streams, including data
transfers and kernel execution. For further details on streams, see the `CUDA C
Programming Guide Streams section
<http://docs.nvidia.com/cuda/cuda-c-programming-guide/#streams>`_.
Numba defaults to using the legacy default stream as the default stream. The
per-thread default stream can be made the default stream by setting the
environment variable ``NUMBA_CUDA_PER_THREAD_DEFAULT_STREAM`` to ``1`` (see the
:ref:`CUDA Environment Variables section <numba-envvars-gpu-support>`).
Regardless of this setting, the objects representing the legacy and per-thread
default streams can be constructed using the functions below.
Streams are instances of :class:`numba.cuda.cudadrv.driver.Stream`:
.. autoclass:: numba.cuda.cudadrv.driver.Stream
:members: synchronize, auto_synchronize, add_callback, async_done
To create a new stream:
.. autofunction:: numba.cuda.stream
To get the default stream:
.. autofunction:: numba.cuda.default_stream
To get the default stream with an explicit choice of whether it is the legacy
or per-thread default stream:
.. autofunction:: numba.cuda.legacy_default_stream
.. autofunction:: numba.cuda.per_thread_default_stream
To construct a Numba ``Stream`` object using a stream allocated elsewhere, the
``external_stream`` function is provided. Note that the lifetime of external
streams must be managed by the user - Numba will not deallocate an external
stream, and the stream must remain valid whilst the Numba ``Stream`` object is
in use.
.. autofunction:: numba.cuda.external_stream
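The following sketch uses a stream to make the transfers and kernel launch
asynchronous with respect to the host, and ``auto_synchronize()`` to wait for
them on exit (the kernel is illustrative):
.. code-block:: python

    import numpy as np
    from numba import cuda

    @cuda.jit
    def double(x):
        i = cuda.grid(1)
        if i < x.size:
            x[i] *= 2

    stream = cuda.stream()
    host = np.arange(1024, dtype=np.float32)

    # All work queued on the stream has completed when the block exits.
    with stream.auto_synchronize():
        dev = cuda.to_device(host, stream=stream)
        double[4, 256, stream](dev)
        dev.copy_to_host(host, stream=stream)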
Runtime
-------
Numba generally uses the Driver API, but it provides a simple wrapper to the
Runtime API so that the version of the runtime in use can be queried. This is
accessed through ``cuda.runtime``, which is an instance of the
:class:`numba.cuda.cudadrv.runtime.Runtime` class:
.. autoclass:: numba.cuda.cudadrv.runtime.Runtime
:members: get_version, is_supported_version, supported_versions
Whether the current runtime is officially supported and tested with the current
version of Numba can also be queried:
.. autofunction:: numba.cuda.is_supported_version
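For example (the printed values are illustrative):
.. code-block:: python

    from numba import cuda

    print(cuda.runtime.get_version())    # e.g. (11, 2)
    print(cuda.is_supported_version())   # True if this runtime is tested with Numba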
CUDA Python Reference
=====================
.. toctree::
host.rst
kernel.rst
types.rst
memory.rst
libdevice.rst
CUDA Kernel API
===============
Kernel declaration
------------------
The ``@cuda.jit`` decorator is used to create a CUDA dispatcher object that can
be configured and launched:
.. autofunction:: numba.cuda.jit
Dispatcher objects
------------------
The usual syntax for configuring a Dispatcher with a launch configuration uses
subscripting, with the arguments being as in the following:
.. code-block:: python
# func is some function decorated with @cuda.jit
func[griddim, blockdim, stream, sharedmem]
The ``griddim`` and ``blockdim`` arguments specify the size of the grid and
thread blocks, and may be either integers or tuples of length up to 3. The
``stream`` parameter is an optional stream on which the kernel will be launched,
and the ``sharedmem`` parameter specifies the size of dynamic shared memory in
bytes.
Subscripting the Dispatcher returns a configuration object that can be called
with the kernel arguments:
.. code-block:: python
configured = func[griddim, blockdim, stream, sharedmem]
configured(x, y, z)
However, it is more idiomatic to configure and call the kernel within a single
statement:
.. code-block:: python
func[griddim, blockdim, stream, sharedmem](x, y, z)
This is similar to launch configuration in CUDA C/C++:
.. code-block:: cuda
func<<<griddim, blockdim, sharedmem, stream>>>(x, y, z)
.. note:: The order of ``stream`` and ``sharedmem`` are reversed in Numba
compared to in CUDA C/C++.
Dispatcher objects also provide several utility methods for inspection and
creating a specialized instance:
.. autoclass:: numba.cuda.dispatcher.CUDADispatcher
:members: inspect_asm, inspect_llvm, inspect_sass, inspect_types,
get_regs_per_thread, specialize, specialized, extensions, forall,
get_shared_mem_per_block, get_max_threads_per_block,
get_const_mem_size, get_local_mem_per_thread
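As a sketch of typical dispatcher usage (the kernel and data are illustrative),
``forall()`` can be used to pick a launch configuration automatically for a 1D
workload, and the ``inspect_*`` methods show the generated code:
.. code-block:: python

    import numpy as np
    from numba import cuda

    @cuda.jit
    def add(r, x, y):
        i = cuda.grid(1)
        if i < r.size:
            r[i] = x[i] + y[i]

    x = np.arange(1024, dtype=np.float32)
    y = 2 * x
    r = np.zeros_like(x)

    # forall() chooses the grid and block dimensions for a 1D task count.
    add.forall(r.size)(r, x, y)

    # Show the Numba typing of the compiled specializations.
    add.inspect_types()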
Intrinsic Attributes and Functions
----------------------------------
The remainder of the attributes and functions in this section may only be called
from within a CUDA Kernel.
Thread Indexing
~~~~~~~~~~~~~~~
.. attribute:: numba.cuda.threadIdx
The thread indices in the current thread block, accessed through the
attributes ``x``, ``y``, and ``z``. Each index is an integer spanning the
range from 0 inclusive to the corresponding value of the attribute in
:attr:`numba.cuda.blockDim` exclusive.
.. attribute:: numba.cuda.blockIdx
The block indices in the grid of thread blocks, accessed through the
attributes ``x``, ``y``, and ``z``. Each index is an integer spanning the
range from 0 inclusive to the corresponding value of the attribute in
:attr:`numba.cuda.gridDim` exclusive.
.. attribute:: numba.cuda.blockDim
The shape of a block of threads, as declared when instantiating the
kernel. This value is the same for all threads in a given kernel, even
if they belong to different blocks (i.e. each block is "full").
.. attribute:: numba.cuda.gridDim
The shape of the grid of blocks, accessed through the attributes ``x``,
``y``, and ``z``.
.. attribute:: numba.cuda.laneid
The thread index in the current warp, as an integer spanning the range
from 0 inclusive to the :attr:`numba.cuda.warpsize` exclusive.
.. attribute:: numba.cuda.warpsize
The size in threads of a warp on the GPU. Currently this is always 32.
.. function:: numba.cuda.grid(ndim)
Return the absolute position of the current thread in the entire
grid of blocks. *ndim* should correspond to the number of dimensions
declared when instantiating the kernel. If *ndim* is 1, a single integer
is returned. If *ndim* is 2 or 3, a tuple of the given number of
integers is returned.
Computation of the first integer is as follows::
cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
and is similar for the other two indices, but using the ``y`` and ``z``
attributes.
.. function:: numba.cuda.gridsize(ndim)
Return the absolute size (or shape) in threads of the entire grid of
blocks. *ndim* should correspond to the number of dimensions declared when
instantiating the kernel.
Computation of the first integer is as follows::
cuda.blockDim.x * cuda.gridDim.x
and is similar for the other two indices, but using the ``y`` and ``z``
attributes.
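Putting these together, a typical pattern (shown here as an illustrative
sketch) is a grid-stride loop that covers an array of arbitrary size:
.. code-block:: python

    import numpy as np
    from numba import cuda

    @cuda.jit
    def copy_elements(dst, src):
        # Same as cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
        i = cuda.grid(1)
        # Total number of threads in the grid
        stride = cuda.gridsize(1)
        while i < src.size:
            dst[i] = src[i]
            i += stride

    src = np.arange(10000, dtype=np.float32)
    dst = np.zeros_like(src)
    copy_elements[32, 128](dst, src)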
Memory Management
~~~~~~~~~~~~~~~~~
.. function:: numba.cuda.shared.array(shape, dtype)
Creates an array in the shared memory space of the CUDA kernel with
the given ``shape`` and ``dtype``.
Returns an array with its content uninitialized.
.. note:: All threads in the same thread block see the same array.
.. function:: numba.cuda.local.array(shape, dtype)
Creates an array in the local memory space of the CUDA kernel with the
given ``shape`` and ``dtype``.
Returns an array with its content uninitialized.
.. note:: Each thread sees a unique array.
.. function:: numba.cuda.const.array_like(ary)
Copies the ``ary`` into constant memory space on the CUDA kernel at compile
time.
Returns an array like the ``ary`` argument.
.. note:: All threads and blocks see the same array.
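For example, a per-block sum using a shared array might look like the
following sketch (the block size and input data are illustrative):
.. code-block:: python

    import numpy as np
    from numba import cuda, float32

    TPB = 16  # threads per block; must be a compile-time constant

    @cuda.jit
    def block_sums(out, data):
        # One shared array per thread block, visible to all of its threads.
        tmp = cuda.shared.array(shape=TPB, dtype=float32)
        tx = cuda.threadIdx.x
        i = cuda.grid(1)
        tmp[tx] = data[i] if i < data.size else 0.0
        cuda.syncthreads()
        if tx == 0:
            s = 0.0
            for j in range(TPB):
                s += tmp[j]
            out[cuda.blockIdx.x] = s

    data = np.ones(256, dtype=np.float32)
    out = np.zeros(256 // TPB, dtype=np.float32)
    block_sums[256 // TPB, TPB](out, data)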
Synchronization and Atomic Operations
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. function:: numba.cuda.atomic.add(array, idx, value)
Perform ``array[idx] += value``. Supports int32, int64, float32 and
float64 only. The ``idx`` argument can be an integer or a tuple of integer
indices for indexing into multi-dimensional arrays. The number of elements
in ``idx`` must match the number of dimensions of ``array``.
Returns the value of ``array[idx]`` before storing the new value.
Behaves like an atomic load.
.. function:: numba.cuda.atomic.sub(array, idx, value)
Perform ``array[idx] -= value``. Supports int32, int64, float32 and
float64 only. The ``idx`` argument can be an integer or a tuple of integer
indices for indexing into multi-dimensional arrays. The number of elements
in ``idx`` must match the number of dimensions of ``array``.
Returns the value of ``array[idx]`` before storing the new value.
Behaves like an atomic load.
.. function:: numba.cuda.atomic.and_(array, idx, value)
Perform ``array[idx] &= value``. Supports int32, uint32, int64,
and uint64 only. The ``idx`` argument can be an integer or a tuple of
integer indices for indexing into multi-dimensional arrays. The number
of elements in ``idx`` must match the number of dimensions of ``array``.
Returns the value of ``array[idx]`` before storing the new value.
Behaves like an atomic load.
.. function:: numba.cuda.atomic.or_(array, idx, value)
Perform ``array[idx] |= value``. Supports int32, uint32, int64,
and uint64 only. The ``idx`` argument can be an integer or a tuple of
integer indices for indexing into multi-dimensional arrays. The number
of elements in ``idx`` must match the number of dimensions of ``array``.
Returns the value of ``array[idx]`` before storing the new value.
Behaves like an atomic load.
.. function:: numba.cuda.atomic.xor(array, idx, value)
Perform ``array[idx] ^= value``. Supports int32, uint32, int64,
and uint64 only. The ``idx`` argument can be an integer or a tuple of
integer indices for indexing into multi-dimensional arrays. The number
of elements in ``idx`` must match the number of dimensions of ``array``.
Returns the value of ``array[idx]`` before storing the new value.
Behaves like an atomic load.
.. function:: numba.cuda.atomic.exch(array, idx, value)
Perform ``array[idx] = value``. Supports int32, uint32, int64,
and uint64 only. The ``idx`` argument can be an integer or a tuple of
integer indices for indexing into multi-dimensional arrays. The number
of elements in ``idx`` must match the number of dimensions of ``array``.
Returns the value of ``array[idx]`` before storing the new value.
Behaves like an atomic load.
.. function:: numba.cuda.atomic.inc(array, idx, value)
Perform ``array[idx] = (0 if array[idx] >= value else array[idx] + 1)``.
Supports uint32, and uint64 only. The ``idx`` argument can be an integer
or a tuple of integer indices for indexing into multi-dimensional arrays.
The number of elements in ``idx`` must match the number of dimensions of
``array``.
Returns the value of ``array[idx]`` before storing the new value.
Behaves like an atomic load.
.. function:: numba.cuda.atomic.dec(array, idx, value)
Perform ``array[idx] =
(value if (array[idx] == 0) or (array[idx] > value) else array[idx] - 1)``.
Supports uint32, and uint64 only. The ``idx`` argument can be an integer
or a tuple of integer indices for indexing into multi-dimensional arrays.
The number of elements in ``idx`` must match the number of dimensions of
``array``.
Returns the value of ``array[idx]`` before storing the new value.
Behaves like an atomic load.
.. function:: numba.cuda.atomic.max(array, idx, value)
Perform ``array[idx] = max(array[idx], value)``. Supports int32, int64,
float32 and float64 only. The ``idx`` argument can be an integer or a
tuple of integer indices for indexing into multi-dimensional arrays.
The number of elements in ``idx`` must match the number of dimensions of
``array``.
Returns the value of ``array[idx]`` before storing the new value.
Behaves like an atomic load.
.. function:: numba.cuda.atomic.cas(array, idx, old, value)
Perform ``if array[idx] == old: array[idx] = value``. Supports int32,
int64, uint32, uint64 indexes only. The ``idx`` argument can be an integer
or a tuple of integer indices for indexing into multi-dimensional arrays.
The number of elements in ``idx`` must match the number of dimensions of
``array``.
Returns the value of ``array[idx]`` before storing the new value.
Behaves like an atomic compare and swap.
.. function:: numba.cuda.syncthreads
Synchronize all threads in the same thread block. This function implements
the same pattern as barriers in traditional multi-threaded programming: this
function waits until all threads in the block call it, at which point it
returns control to all its callers.
.. function:: numba.cuda.syncthreads_count(predicate)
An extension to :attr:`numba.cuda.syncthreads` where the return value is a count
of the threads where ``predicate`` is true.
.. function:: numba.cuda.syncthreads_and(predicate)
An extension to :attr:`numba.cuda.syncthreads` where 1 is returned if ``predicate`` is
true for all threads or 0 otherwise.
.. function:: numba.cuda.syncthreads_or(predicate)
An extension to :attr:`numba.cuda.syncthreads` where 1 is returned if ``predicate`` is
true for any thread or 0 otherwise.
.. warning:: All syncthreads functions must be called by every thread in the
thread-block. Failing to do so may result in undefined behavior.
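The sketch below uses :func:`numba.cuda.atomic.add` to build a histogram, a
typical use of atomics where many threads update the same counters (the bin
count and input data are illustrative):
.. code-block:: python

    import numpy as np
    from numba import cuda

    @cuda.jit
    def histogram(bins, data, nbins):
        i = cuda.grid(1)
        if i < data.size:
            b = int(data[i] * nbins)
            if b >= nbins:
                b = nbins - 1
            # Many threads may hit the same bin; the update must be atomic.
            cuda.atomic.add(bins, b, 1)

    data = np.random.random(10000).astype(np.float32)
    bins = np.zeros(16, dtype=np.int32)
    histogram[40, 256](bins, data, 16)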
Cooperative Groups
~~~~~~~~~~~~~~~~~~
.. function:: numba.cuda.cg.this_grid()
Get the current grid group.
:return: The current grid group
:rtype: numba.cuda.cg.GridGroup
.. class:: numba.cuda.cg.GridGroup
A grid group. Users should not construct a GridGroup directly - instead, get
the current grid group using :func:`cg.this_grid() <numba.cuda.cg.this_grid>`.
.. method:: sync()
Synchronize the current grid group.
Memory Fences
~~~~~~~~~~~~~
The memory fences are used to guarantee that the effects of memory operations
are visible to other threads within the same thread-block, the same GPU device,
and the same system (across GPUs on global memory). Memory loads and stores
are guaranteed not to move across the memory fences by optimization passes.
.. warning:: The memory fences are considered to be an advanced API and most
use cases should use the thread barrier (e.g. ``syncthreads()``).
.. function:: numba.cuda.threadfence
A memory fence at device level (within the GPU).
.. function:: numba.cuda.threadfence_block
A memory fence at thread block level.
.. function:: numba.cuda.threadfence_system
A memory fence at system level (across GPUs).
Warp Intrinsics
~~~~~~~~~~~~~~~
The argument ``membermask`` is a 32 bit integer mask with each bit
corresponding to a thread in the warp, with 1 meaning the thread is in the
subset of threads within the function call. The ``membermask`` must be all 1 if
the GPU compute capability is below 7.x.
.. function:: numba.cuda.syncwarp(membermask)
Synchronize a masked subset of the threads in a warp.
.. function:: numba.cuda.all_sync(membermask, predicate)
If the ``predicate`` is true for all threads in the masked warp, then
a non-zero value is returned, otherwise 0 is returned.
.. function:: numba.cuda.any_sync(membermask, predicate)
If the ``predicate`` is true for any thread in the masked warp, then
a non-zero value is returned, otherwise 0 is returned.
.. function:: numba.cuda.eq_sync(membermask, predicate)
If the boolean ``predicate`` is the same for all threads in the masked warp,
then a non-zero value is returned, otherwise 0 is returned.
.. function:: numba.cuda.ballot_sync(membermask, predicate)
Returns a mask of all threads in the warp whose ``predicate`` is true,
and are within the given mask.
.. function:: numba.cuda.shfl_sync(membermask, value, src_lane)
Shuffles ``value`` across the masked warp and returns the ``value``
from ``src_lane``. If this is outside the warp, then the
given ``value`` is returned.
.. function:: numba.cuda.shfl_up_sync(membermask, value, delta)
Shuffles ``value`` across the masked warp and returns the ``value``
from ``laneid - delta``. If this is outside the warp, then the
given ``value`` is returned.
.. function:: numba.cuda.shfl_down_sync(membermask, value, delta)
Shuffles ``value`` across the masked warp and returns the ``value``
from ``laneid + delta``. If this is outside the warp, then the
given ``value`` is returned.
.. function:: numba.cuda.shfl_xor_sync(membermask, value, lane_mask)
Shuffles ``value`` across the masked warp and returns the ``value``
from ``laneid ^ lane_mask``.
.. function:: numba.cuda.match_any_sync(membermask, value, lane_mask)
Returns a mask of the threads in the masked warp that have the same
``value`` as the given ``value``.
.. function:: numba.cuda.match_all_sync(membermask, value, lane_mask)
Returns a tuple of ``(mask, pred)``, where ``mask`` is a mask of the threads
in the masked warp that have the same ``value`` as the given ``value`` if
they all have the same value, and 0 otherwise; ``pred`` is a boolean
indicating whether all threads in the masked warp have the same ``value``.
.. function:: numba.cuda.activemask()
Returns a 32-bit integer mask of all currently active threads in the
calling warp. The Nth bit is set if the Nth lane in the warp is active when
activemask() is called. Inactive threads are represented by 0 bits in the
returned mask. Threads which have exited the kernel are always marked as
inactive.
.. function:: numba.cuda.lanemask_lt()
Returns a 32-bit integer mask of all lanes (including inactive ones) with
ID less than the current lane.
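An illustrative sketch of a warp-level broadcast using ``shfl_sync``, where
every thread reads the value held by lane 0 of its warp (the mask assumes all
32 lanes participate):
.. code-block:: python

    import numpy as np
    from numba import cuda

    @cuda.jit
    def warp_broadcast(out, data):
        i = cuda.grid(1)
        if i < data.size:
            # Every participating lane receives the value from lane 0.
            out[i] = cuda.shfl_sync(0xFFFFFFFF, data[i], 0)

    data = np.arange(64, dtype=np.int32)
    out = np.zeros_like(data)
    warp_broadcast[2, 32](out, data)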
Integer Intrinsics
~~~~~~~~~~~~~~~~~~
A subset of the CUDA Math API's integer intrinsics are available. For further
documentation, including semantics, please refer to the `CUDA Toolkit
documentation
<https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__INT.html>`_.
.. function:: numba.cuda.popc(x)
Returns the number of bits set in ``x``.
.. function:: numba.cuda.brev(x)
Returns the reverse of the bit pattern of ``x``. For example, ``0b10110110``
becomes ``0b01101101``.
.. function:: numba.cuda.clz(x)
Returns the number of leading zeros in ``x``.
.. function:: numba.cuda.ffs(x)
Returns the position of the first (least significant) bit set to 1 in ``x``,
where the least significant bit position is 1. ``ffs(0)`` returns 0.
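For example (an illustrative sketch):
.. code-block:: python

    import numpy as np
    from numba import cuda

    @cuda.jit
    def bit_stats(popcounts, leading_zeros, values):
        i = cuda.grid(1)
        if i < values.size:
            popcounts[i] = cuda.popc(values[i])      # number of set bits
            leading_zeros[i] = cuda.clz(values[i])   # leading zero count

    values = np.array([1, 2, 255, 65536], dtype=np.uint32)
    popcounts = np.zeros_like(values)
    leading_zeros = np.zeros_like(values)
    bit_stats[1, 32](popcounts, leading_zeros, values)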
Floating Point Intrinsics
~~~~~~~~~~~~~~~~~~~~~~~~~
A subset of the CUDA Math API's floating point intrinsics are available. For further
documentation, including semantics, please refer to the `single
<https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__SINGLE.html>`_ and
`double <https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__DOUBLE.html>`_
precision parts of the CUDA Toolkit documentation.
.. function:: numba.cuda.fma
Perform the fused multiply-add operation. Named after the ``fma`` and ``fmaf`` in
the C api, but maps to the ``fma.rn.f32`` and ``fma.rn.f64`` (round-to-nearest-even)
PTX instructions.
.. function:: numba.cuda.cbrt (x)
Perform the cube root operation, x ** (1/3). Named after the functions
``cbrt`` and ``cbrtf`` in the C api. Supports float32, and float64 arguments
only.
16-bit Floating Point Intrinsics
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The functions in the ``cuda.fp16`` module are used to operate on 16-bit
floating point operands. These functions return a 16-bit floating point result.
To determine whether Numba supports compiling code that uses the ``float16``
type in the current configuration, use:
.. function:: numba.cuda.is_float16_supported ()
Return ``True`` if 16-bit floats are supported, ``False`` otherwise.
To check whether a device supports ``float16``, use its
:attr:`supports_float16 <numba.cuda.cudadrv.driver.Device.supports_float16>`
attribute.
.. function:: numba.cuda.fp16.hfma (a, b, c)
Perform the fused multiply-add operation ``(a * b) + c`` on 16-bit
floating point arguments in round to nearest mode. Maps to the ``fma.rn.f16``
PTX instruction.
Returns the 16-bit floating point result of the fused multiply-add.
.. function:: numba.cuda.fp16.hadd (a, b)
Perform the add operation ``a + b`` on 16-bit floating point arguments in
round to nearest mode. Maps to the ``add.f16`` PTX instruction.
Returns the 16-bit floating point result of the addition.
.. function:: numba.cuda.fp16.hsub (a, b)
Perform the subtract operation ``a - b`` on 16-bit floating point arguments in
round to nearest mode. Maps to the ``sub.f16`` PTX instruction.
Returns the 16-bit floating point result of the subtraction.
.. function:: numba.cuda.fp16.hmul (a, b)
Perform the multiply operation ``a * b`` on 16-bit floating point arguments in
round to nearest mode. Maps to the ``mul.f16`` PTX instruction.
Returns the 16-bit floating point result of the multiplication.
.. function:: numba.cuda.fp16.hdiv (a, b)
Perform the divide operation ``a / b`` on 16-bit floating point arguments in
round to nearest mode.
Returns the 16-bit floating point result of the division.
.. function:: numba.cuda.fp16.hneg (a)
Perform the negation operation ``-a`` on the 16-bit floating point argument.
Maps to the ``neg.f16`` PTX instruction.
Returns the 16-bit floating point result of the negation.
.. function:: numba.cuda.fp16.habs (a)
Perform the absolute value operation ``|a|`` on the 16-bit floating point argument.
Returns the 16-bit floating point result of the absolute value operation.
.. function:: numba.cuda.fp16.hsin (a)
Calculates the trigonometry sine function of the 16-bit floating point argument.
Returns the 16-bit floating point result of the sine operation.
.. function:: numba.cuda.fp16.hcos (a)
Calculates the trigonometry cosine function of the 16-bit floating point argument.
Returns the 16-bit floating point result of the cosine operation.
.. function:: numba.cuda.fp16.hlog (a)
Calculates the natural logarithm of the 16-bit floating point argument.
Returns the 16-bit floating point result of the natural log operation.
.. function:: numba.cuda.fp16.hlog10 (a)
Calculates the base 10 logarithm of the 16-bit floating point argument.
Returns the 16-bit floating point result of the log base 10 operation.
.. function:: numba.cuda.fp16.hlog2 (a)
Calculates the base 2 logarithm on the 16-bit floating point argument.
Returns the 16-bit floating point result of the log base 2 operation.
.. function:: numba.cuda.fp16.hexp (a)
Calculates the natural exponential operation of the 16-bit floating point argument.
Returns the 16-bit floating point result of the exponential operation.
.. function:: numba.cuda.fp16.hexp10 (a)
Calculates the base 10 exponential of the 16-bit floating point argument.
Returns the 16-bit floating point result of the exponential operation.
.. function:: numba.cuda.fp16.hexp2 (a)
Calculates the base 2 exponential of the 16-bit floating point argument.
Returns the 16-bit floating point result of the exponential operation.
.. function:: numba.cuda.fp16.hfloor (a)
Calculates the floor operation, the largest integer less than or equal to ``a``,
on the 16-bit floating point argument.
Returns the 16-bit floating point result of the floor operation.
.. function:: numba.cuda.fp16.hceil (a)
Calculates the ceiling operation, the smallest integer greater than or equal to ``a``,
on the 16-bit floating point argument.
Returns the 16-bit floating point result of the ceil operation.
.. function:: numba.cuda.fp16.hsqrt (a)
Calculates the square root operation of the 16-bit floating point argument.
Returns the 16-bit floating point result of the square root operation.
.. function:: numba.cuda.fp16.hrsqrt (a)
Calculates the reciprocal of the square root of the 16-bit floating point argument.
Returns the 16-bit floating point result of the reciprocal square root operation.
.. function:: numba.cuda.fp16.hrcp (a)
Calculates the reciprocal of the 16-bit floating point argument.
Returns the 16-bit floating point result of the reciprocal.
.. function:: numba.cuda.fp16.hrint (a)
Round the input 16-bit floating point argument to nearest integer value.
Returns the 16-bit floating point result of the rounding.
.. function:: numba.cuda.fp16.htrunc (a)
Truncate the input 16-bit floating point argument to the nearest integer
that does not exceed the input argument in magnitude.
Returns the 16-bit floating point result of the truncation.
.. function:: numba.cuda.fp16.heq (a, b)
Perform the comparison operation ``a == b`` on 16-bit floating point arguments.
Returns a boolean.
.. function:: numba.cuda.fp16.hne (a, b)
Perform the comparison operation ``a != b`` on 16-bit floating point arguments.
Returns a boolean.
.. function:: numba.cuda.fp16.hgt (a, b)
Perform the comparison operation ``a > b`` on 16-bit floating point arguments.
Returns a boolean.
.. function:: numba.cuda.fp16.hge (a, b)
Perform the comparison operation ``a >= b`` on 16-bit floating point arguments.
Returns a boolean.
.. function:: numba.cuda.fp16.hlt (a, b)
Perform the comparison operation ``a < b`` on 16-bit floating point arguments.
Returns a boolean.
.. function:: numba.cuda.fp16.hle (a, b)
Perform the comparison operation ``a <= b`` on 16-bit floating point arguments.
Returns a boolean.
.. function:: numba.cuda.fp16.hmax (a, b)
Perform the operation ``a if a > b else b.``
Returns a 16-bit floating point value.
.. function:: numba.cuda.fp16.hmin (a, b)
Perform the operation ``a if a < b else b.``
Returns a 16-bit floating point value.
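As an illustrative sketch, assuming a device and toolkit that support
``float16`` (checked here with ``cuda.is_float16_supported()``):
.. code-block:: python

    import numpy as np
    from numba import cuda

    @cuda.jit
    def halfadd(r, x, y):
        i = cuda.grid(1)
        if i < r.size:
            # Half-precision addition; maps to the add.f16 PTX instruction.
            r[i] = cuda.fp16.hadd(x[i], y[i])

    if cuda.is_float16_supported():
        x = np.ones(64, dtype=np.float16)
        y = np.ones(64, dtype=np.float16)
        r = np.zeros(64, dtype=np.float16)
        halfadd[1, 64](r, x, y)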
Control Flow Instructions
~~~~~~~~~~~~~~~~~~~~~~~~~
A subset of CUDA's control flow instructions is directly available as
intrinsics. Avoiding branches is a key way to improve CUDA performance, and
using these intrinsics means you don't have to rely on the ``nvcc`` optimizer
identifying and removing branches. For further documentation, including
semantics, please refer to the `relevant CUDA Toolkit documentation
<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions>`_.
.. function:: numba.cuda.selp
Select between two expressions, depending on the value of the first
argument. Similar to LLVM's ``select`` instruction.
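For example, clamping negative values without a branch (an illustrative
sketch):
.. code-block:: python

    import numpy as np
    from numba import cuda

    @cuda.jit
    def clamp_negative(out, data):
        i = cuda.grid(1)
        if i < data.size:
            # Branch-free select: keep the value if positive, else 0.0.
            out[i] = cuda.selp(data[i] > 0.0, data[i], 0.0)

    data = np.array([-1.0, 2.0, -3.0, 4.0])
    out = np.zeros_like(data)
    clamp_negative[1, 32](out, data)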
Timer Intrinsics
~~~~~~~~~~~~~~~~
.. function:: numba.cuda.nanosleep(ns)
Suspends the thread for a sleep duration of approximately ``ns``
nanoseconds.
Libdevice functions
===================
All wrapped libdevice functions are listed in this section. All functions in
libdevice are wrapped, with the exception of ``__nv_nan`` and ``__nv_nanf``.
These functions return a representation of a quiet NaN, but the argument they
take (a pointer to an object specifying the representation) is undocumented, and
follows an unusual form compared to the rest of libdevice - it is not an output
like every other pointer argument. If a NaN is required, one can be obtained in
CUDA Python by other means, e.g. ``math.nan``.
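As an illustrative sketch, a wrapped libdevice function such as
``__nv_fast_sinf`` (exposed as ``libdevice.fast_sinf``) can be called from a
kernel like any other device function:
.. code-block:: python

    import numpy as np
    from numba import cuda
    from numba.cuda import libdevice

    @cuda.jit
    def fast_trig(out, x):
        i = cuda.grid(1)
        if i < x.size:
            out[i] = libdevice.fast_sinf(x[i])

    x = np.linspace(0, 3.14, 64).astype(np.float32)
    out = np.zeros_like(x)
    fast_trig[1, 64](out, x)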
Wrapped functions
-----------------
.. automodule:: numba.cuda.libdevice
:members:
Memory Management
=================
.. autofunction:: numba.cuda.to_device
.. autofunction:: numba.cuda.device_array
.. autofunction:: numba.cuda.device_array_like
.. autofunction:: numba.cuda.pinned_array
.. autofunction:: numba.cuda.pinned_array_like
.. autofunction:: numba.cuda.mapped_array
.. autofunction:: numba.cuda.mapped_array_like
.. autofunction:: numba.cuda.managed_array
.. autofunction:: numba.cuda.pinned
.. autofunction:: numba.cuda.mapped
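A short sketch of how these functions are commonly combined (array contents
and sizes are illustrative):
.. code-block:: python

    import numpy as np
    from numba import cuda

    host = np.arange(10, dtype=np.float32)

    # Explicit copies to the device and back.
    dev = cuda.to_device(host)
    result = dev.copy_to_host()

    # An uninitialized device array with the same shape and dtype as host.
    scratch = cuda.device_array_like(host)

    # Temporarily pin (page-lock) host memory for faster asynchronous transfers.
    with cuda.pinned(host):
        stream = cuda.stream()
        dev2 = cuda.to_device(host, stream=stream)
        stream.synchronize()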
Device Objects
--------------
.. autoclass:: numba.cuda.cudadrv.devicearray.DeviceNDArray
:members: copy_to_device, copy_to_host, is_c_contiguous, is_f_contiguous,
ravel, reshape, split
.. autoclass:: numba.cuda.cudadrv.devicearray.DeviceRecord
:members: copy_to_device, copy_to_host
.. autoclass:: numba.cuda.cudadrv.devicearray.MappedNDArray
:members: copy_to_device, copy_to_host, split
CUDA-Specific Types
====================
.. note::
This page is about types specific to CUDA targets. Many other types are also
available in the CUDA target - see :ref:`cuda-built-in-types`.
Vector Types
~~~~~~~~~~~~
`CUDA Vector Types <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#built-in-vector-types>`_
are usable in kernels. There are two important distinctions from vector types in CUDA C/C++:
First, the recommended names for vector types in Numba CUDA are formatted as ``<base_type>x<N>``,
where ``base_type`` is the base type of the vector, and ``N`` is the number of elements in the vector.
Examples include ``int64x3``, ``uint16x4``, ``float32x4``, etc. For new Numba CUDA kernels,
this is the recommended way to instantiate vector types.
For convenience, users adapting existing kernels from CUDA C/C++ to Python may use
aliases consistent with the C/C++ naming. For example, ``float3`` aliases ``float32x3``,
``long3`` aliases ``int32x3`` or ``int64x3`` (depending on the platform), etc.
Second, unlike CUDA C/C++ where factory functions are used, vector types are constructed directly
with their constructor. For example, to construct a ``float32x3``:
.. code-block:: python3
from numba.cuda import float32x3
# In kernel
f3 = float32x3(0.0, -1.0, 1.0)
Additionally, vector types can be constructed from a combination of vector and
primitive types, as long as the total number of components matches the result
vector type. For example, all of the following constructions are valid:
.. code-block:: python3
zero = uint32(0)
u2 = uint32x2(1, 2)
# Construct a 3-component vector with primitive type and a 2-component vector
u3 = uint32x3(zero, u2)
# Construct a 4-component vector with 2 2-component vectors
u4 = uint32x4(u2, u2)
The 1st, 2nd, 3rd and 4th component of the vector type can be accessed through fields
``x``, ``y``, ``z``, and ``w`` respectively. The components are immutable after
construction in the present version of Numba; it is expected that support for
mutating vector components will be added in a future release.
.. code-block:: python3
v1 = float32x2(1.0, 1.0)
v2 = float32x2(1.0, -1.0)
dotprod = v1.x * v2.x + v1.y * v2.y
CUDA Bindings
=============
Numba supports two bindings to the CUDA Driver APIs: its own internal bindings
based on ctypes, and the official `NVIDIA CUDA Python bindings
<https://nvidia.github.io/cuda-python/>`_. Functionality is equivalent between
the two bindings.
The internal bindings are used by default. If the NVIDIA bindings are installed,
then they can be used by setting the environment variable
``NUMBA_CUDA_USE_NVIDIA_BINDING`` to ``1`` prior to the import of Numba. Once
Numba has been imported, the selected binding cannot be changed.
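For example, a sketch of opting in to the NVIDIA bindings from Python (setting
the variable in the shell before starting the interpreter works equally well):
.. code-block:: python

    import os

    # Must be set before numba is imported.
    os.environ["NUMBA_CUDA_USE_NVIDIA_BINDING"] = "1"

    from numba import cuda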
Per-Thread Default Streams
--------------------------
Responsibility for handling Per-Thread Default Streams (PTDS) is delegated to
the NVIDIA bindings when they are in use. To use PTDS with the NVIDIA bindings,
set the environment variable ``CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM`` to
``1`` instead of Numba's environment variable
:envvar:`NUMBA_CUDA_PER_THREAD_DEFAULT_STREAM`.
.. seealso::
The `Default Stream section
<https://nvidia.github.io/cuda-python/release/11.6.0-notes.html#default-stream>`_
in the NVIDIA Bindings documentation.
Roadmap
-------
In Numba 0.56, the NVIDIA Bindings will be used by default, if they are
installed.
In future versions of Numba:
- The internal bindings will be deprecated.
- The internal bindings will be removed.
At present, no specific release is planned for the deprecation or removal of
the internal bindings.
On-disk Kernel Caching
======================
When the ``cache`` keyword argument of the :func:`@cuda.jit <numba.cuda.jit>`
decorator is ``True``, a file-based cache is enabled. This shortens compilation
times when the function was already compiled in a previous invocation.
The cache is maintained in the ``__pycache__`` subdirectory of the directory
containing the source file; if the current user is not allowed to write to it,
the cache implementation falls back to a platform-specific user-wide cache
directory (such as ``$HOME/.cache/numba`` on Unix platforms).
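For example (an illustrative sketch):
.. code-block:: python

    from numba import cuda

    # The compiled code is cached on disk and reused by later processes.
    @cuda.jit(cache=True)
    def add_one(x):
        i = cuda.grid(1)
        if i < x.size:
            x[i] += 1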
Compute capability considerations
---------------------------------
Separate cache files are maintained for each compute capability. When a cached
kernel is loaded, the compute capability of the device the kernel is first
launched on in the current run is used to determine which version to load.
Therefore, on systems that have multiple GPUs with differing compute
capabilities, the cached versions of kernels are only used for one compute
capability, and recompilation will occur for other compute capabilities.
For example: if a system has two GPUs, one of compute capability 7.5 and one of
8.0, then:
* If a cached kernel is first launched on the CC 7.5 device, then the cached
version for CC 7.5 is used. If it is subsequently launched on the CC 8.0
device, a recompilation will occur.
* If in a subsequent run the cached kernel is first launched on the CC 8.0
device, then the cached version for CC 8.0 is used. A subsequent launch on
the CC 7.5 device will require a recompilation.
This limitation is not expected to present issues in most practical scenarios,
as multi-GPU production systems tend to have identical GPUs within each node.
==================
Cooperative Groups
==================
Supported features
------------------
Numba's Cooperative Groups support presently provides grid groups and grid
synchronization, along with cooperative kernel launches.
Cooperative groups are supported on Linux, and Windows for devices in `TCC
mode
<https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#tesla-compute-cluster-mode-for-windows>`_.
Using Grid Groups
-----------------
To get the current grid group, use the :meth:`cg.this_grid()
<numba.cuda.cg.this_grid>` function:
.. code-block:: python
g = cuda.cg.this_grid()
Synchronizing the grid is done with the :meth:`sync()
<numba.cuda.cg.GridGroup.sync>` method of the grid group:
.. code-block:: python
g.sync()
Cooperative Launches
--------------------
Unlike the CUDA C/C++ API, a cooperative launch is invoked using the same syntax
as a normal kernel launch - Numba automatically determines whether a cooperative
launch is required based on whether a grid group is synchronized in the kernel.
The grid size limit for a cooperative launch is more restrictive than for a
normal launch - the grid must be no larger than the maximum number of active
blocks on the device on which it is launched. To get the maximum grid size for
a cooperative launch of a kernel with a given block size and dynamic shared
memory requirement, use the ``max_cooperative_grid_blocks()`` method of kernel
overloads:
.. automethod:: numba.cuda.dispatcher._Kernel.max_cooperative_grid_blocks
This can be used to ensure that the kernel is launched with no more than the
maximum number of blocks. Exceeding the maximum number of blocks for the
cooperative launch will result in a ``CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE``
error.
Applications and Example
------------------------
Grid group synchronization can be used to implement a global barrier across all
threads in the grid - applications of this include a global reduction to a
single value, or looping over rows of a large matrix sequentially using the
entire grid to operate on column elements in parallel.
In the following example, rows are written sequentially by the grid. Each thread
in the grid reads a value from the previous row written by its *opposite*
thread. A grid sync is needed to ensure that threads in the grid don't run ahead
of threads in other blocks, or fail to see updates from their opposite thread.
First we'll define our kernel:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_cg.py
:language: python
:caption: from ``test_grid_sync`` of ``numba/cuda/tests/doc_examples/test_cg.py``
:start-after: magictoken.ex_grid_sync_kernel.begin
:end-before: magictoken.ex_grid_sync_kernel.end
:dedent: 8
:linenos:
Then create some empty input data and determine the grid and block sizes:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_cg.py
:language: python
:caption: from ``test_grid_sync`` of ``numba/cuda/tests/doc_examples/test_cg.py``
:start-after: magictoken.ex_grid_sync_data.begin
:end-before: magictoken.ex_grid_sync_data.end
:dedent: 8
:linenos:
Finally we launch the kernel and print the result:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_cg.py
:language: python
:caption: from ``test_grid_sync`` of ``numba/cuda/tests/doc_examples/test_cg.py``
:start-after: magictoken.ex_grid_sync_launch.begin
:end-before: magictoken.ex_grid_sync_launch.end
:dedent: 8
:linenos:
The maximum grid size for ``sequential_rows`` can be enquired using:
.. code-block:: python

    overload = sequential_rows.overloads[(int32[:,::1],)]
    max_blocks = overload.max_cooperative_grid_blocks(blockdim)
    print(max_blocks)
    # 1152 (e.g. on Quadro RTX 8000 with Numba 0.52.1 and CUDA 11.0)
.. _cuda-array-interface:
================================
CUDA Array Interface (Version 3)
================================
The *CUDA Array Interface* (or CAI) is created for interoperability between
different implementations of CUDA array-like objects in various projects. The
idea is borrowed from the `NumPy array interface`_.
.. note::
Currently, we only define the Python-side interface. In the future, we may
add a C-side interface for efficient exchange of the information in
compiled code.
Python Interface Specification
==============================
.. note:: Experimental feature. Specification may change.
The ``__cuda_array_interface__`` attribute returns a dictionary (``dict``)
that must contain the following entries:
- **shape**: ``(integer, ...)``
A tuple of ``int`` (or ``long``) representing the size of each dimension.
- **typestr**: ``str``
The type string. This has the same definition as ``typestr`` in the
`NumPy array interface`_.
- **data**: ``(integer, boolean)``
The **data** is a 2-tuple. The first element is the data pointer
as a Python ``int`` (or ``long``). The data must be device-accessible.
For zero-size arrays, use ``0`` here.
The second element is the read-only flag as a Python ``bool``.
Because the user of the interface may or may not be in the same context,
the most common case is to use ``cuPointerGetAttribute`` with
``CU_POINTER_ATTRIBUTE_DEVICE_POINTER`` in the CUDA driver API (or the
equivalent CUDA Runtime API) to retrieve a device pointer that
is usable in the currently active context.
- **version**: ``integer``
An integer for the version of the interface being exported.
The current version is *3*.
The following are optional entries:
- **strides**: ``None`` or ``(integer, ...)``
If **strides** is not given, or it is ``None``, the array is in
C-contiguous layout. Otherwise, a tuple of ``int`` (or ``long``) is explicitly
given for representing the number of bytes to skip to access the next
element at each dimension.
- **descr**
This is for describing more complicated types. This follows the same
specification as in the `NumPy array interface`_.
- **mask**: ``None`` or object exposing the ``__cuda_array_interface__``
If ``None`` then all values in **data** are valid. All elements of the mask
array should be interpreted only as true or not true indicating which
elements of this array are valid. This has the same definition as ``mask``
in the `NumPy array interface`_.
.. note:: Numba does not currently support working with masked CUDA arrays
and will raise a ``NotImplementedError`` exception if one is passed
to a GPU function.
- **stream**: ``None`` or ``integer``
An optional stream upon which synchronization must take place at the point of
consumption, either by synchronizing on the stream or enqueuing operations on
the data on the given stream. Integer values in this entry are as follows:
- ``0``: This is disallowed as it would be ambiguous between ``None`` and the
default stream, and also between the legacy and per-thread default streams.
Any use case where ``0`` might be given should either use ``None``, ``1``,
or ``2`` instead for clarity.
- ``1``: The legacy default stream.
- ``2``: The per-thread default stream.
- Any other integer: a ``cudaStream_t`` represented as a Python integer.
When ``None``, no synchronization is required. See the
:ref:`cuda-array-interface-synchronization` section below for further details.
In a future revision of the interface, this entry may be expanded (or another
entry added) so that an event to synchronize on can be specified instead of a
stream.
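As an illustrative sketch, a minimal Producer might export the interface as
follows (the class and pointer source are hypothetical; a real Producer must
supply a device-accessible pointer):
.. code-block:: python

    class MyDeviceArray:
        # A hypothetical object wrapping a device allocation of float32 values.
        def __init__(self, device_ptr, nitems):
            self._ptr = device_ptr      # device pointer as a Python int
            self._nitems = nitems

        @property
        def __cuda_array_interface__(self):
            return {
                'shape': (self._nitems,),
                'typestr': '<f4',        # little-endian float32
                'data': (self._ptr, False),
                'version': 3,
                'strides': None,         # C-contiguous
                'stream': 1,             # work is enqueued on the legacy default stream
            }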
.. _cuda-array-interface-synchronization:
Synchronization
---------------
Definitions
~~~~~~~~~~~
When discussing synchronization, the following definitions are used:
- *Producer*: The library / object on which ``__cuda_array_interface__`` is
accessed.
- *Consumer*: The library / function that accesses the
``__cuda_array_interface__`` of the Producer.
- *User Code*: Code that induces a Producer and Consumer to share data through
the CAI.
- *User*: The person writing or maintaining the User Code. The User may
implement User Code without knowledge of the CAI, since the CAI accesses can
be hidden from their view.
In the following example:
.. code-block:: python
import cupy
from numba import cuda
@cuda.jit
def add(x, y, out):
start = cuda.grid(1)
stride = cuda.gridsize(1)
for i in range(start, x.shape[0], stride):
out[i] = x[i] + y[i]
a = cupy.arange(10)
b = a * 2
out = cupy.zeros_like(a)
add[1, 32](a, b, out)
When the ``add`` kernel is launched:
- ``a``, ``b``, ``out`` are Producers.
- The ``add`` kernel is the Consumer.
- The User Code is specifically ``add[1, 32](a, b, out)``.
- The author of the code is the User.
Design Motivations
~~~~~~~~~~~~~~~~~~
Elements of the CAI design related to synchronization seek to fulfill these
requirements:
1. Producers and Consumers that exchange data through the CAI must be able to do
so without data races.
2. Requirement 1 should be met without requiring the user to be
aware of any particulars of the CAI - in other words, exchanging data between
Producers and Consumers that operate on data asynchronously should be correct
by default.
- An exception to this requirement is made for Producers and Consumers that
explicitly document that the User is required to take additional steps to
ensure correctness with respect to synchronization. In this case, Users
are required to understand the details of the CUDA Array Interface, and
the Producer/Consumer library documentation must specify the steps that
Users are required to take.
Use of this exception should be avoided where possible, as it is provided
for libraries that cannot implement the synchronization semantics without
the involvement of the User - for example, those interfacing with
third-party libraries oblivious to the CUDA Array Interface.
3. Where the User is aware of the particulars of the CAI and implementation
details of the Producer and Consumer, they should be able to, at their
discretion, override some of the synchronization semantics of the interface
to reduce the synchronization overhead. Overriding synchronization semantics
implies that:
- The CAI design, and the design and implementation of the Producer and
Consumer do not specify or guarantee correctness with respect to data
races.
- Instead, the User is responsible for ensuring correctness with respect to
data races.
Interface Requirements
~~~~~~~~~~~~~~~~~~~~~~
The ``stream`` entry enables Producers and Consumers to avoid hazards when
exchanging data. Expected behaviour of the Consumer is as follows:
* When ``stream`` is not present or is ``None``:
- No synchronization is required on the part of the Consumer.
- The Consumer may enqueue operations on the underlying data immediately on
any stream.
* When ``stream`` is an integer, its value indicates the stream on which the
Producer may have in-progress operations on the data, and which the Consumer
is expected to either:
- Synchronize on before accessing the data, or
- Enqueue operations in when accessing the data.
The Consumer can choose which mechanism to use, with the following
considerations:
- If the Consumer synchronizes on the provided stream prior to accessing the
data, then it must ensure that no computation can take place in the provided
stream until its operations in its own choice of stream have taken place.
This could be achieved by either:
- Placing a wait on an event in the provided stream that occurs once all
of the Consumer's operations on the data are completed, or
- Avoiding returning control to the user code until after its operations
on its own stream have completed.
- If the consumer chooses to only enqueue operations on the data in the
provided stream, then it may return control to the User code immediately
after enqueueing its work, as the work will all be serialized on the
exported array's stream. This is sufficient to ensure correctness even if
the User code were to induce the Producer to subsequently start enqueueing
more work on the same stream.
* If the User has set the Consumer to ignore CAI synchronization semantics, the
Consumer may assume it can operate on the data immediately in any stream with
no further synchronization, even if the ``stream`` member has an integer
value.
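As a rough sketch of the first mechanism, a Consumer implemented using Numba
might synchronize on the Producer's stream before touching the data. Here
``producer_object`` stands for any object exporting the interface; this is an
illustration, not a prescribed implementation:

.. code-block:: python

   from numba import cuda

   def consume(producer_object):
       cai = producer_object.__cuda_array_interface__
       stream = cai.get('stream')
       if stream is not None:
           # Wrap the raw stream handle and wait for the Producer's pending
           # work to complete before accessing the data.
           cuda.external_stream(stream).synchronize()
       # Synchronization has already been performed, so it can be skipped when
       # constructing the device array.
       return cuda.as_cuda_array(producer_object, sync=False)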
When exporting an array through the CAI, Producers must ensure that:
* If there is work on the data enqueued in one or more streams, then
synchronization on the provided ``stream`` is sufficient to ensure
synchronization with all pending work.
- If the Producer has no enqueued work, or work only enqueued on the stream
identified by ``stream``, then this condition is met.
- If the Producer has enqueued work on the data on multiple streams, then it
must enqueue events on those streams that follow the enqueued work, and
then wait on those events in the provided ``stream``. For example:
1. Work is enqueued by the Producer on streams ``7``, ``9``, and ``15``.
2. Events are then enqueued on each of streams ``7``, ``9``, and ``15``.
3. Producer then tells stream ``3`` to wait on the events from Step 2, and
the ``stream`` entry is set to ``3``.
* If there is no work enqueued on the data, then the ``stream`` entry may be
either ``None``, or not provided.
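To illustrate the multi-stream case in the example above with Numba
primitives: assuming ``stream7``, ``stream9``, ``stream15``, and ``stream3``
are Numba stream objects and that work on the data has already been enqueued
on the first three, a Producer could prepare the export roughly as follows:

.. code-block:: python

   from numba import cuda

   # Sketch only: make stream3 wait for all pending work before it is exported
   # as the 'stream' entry of the interface.
   for s in (stream7, stream9, stream15):
       evt = cuda.event()
       evt.record(s)        # the event follows the work already enqueued on s
       evt.wait(stream3)    # stream3 will not proceed until the event has occurred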
Optionally, to facilitate the User relaxing conformance to synchronization
semantics:
* Producers may provide a configuration option to always set ``stream`` to
``None``.
* Consumers may provide a configuration option to ignore the value of ``stream``
and act as if it were ``None`` or not provided. This elides synchronization
on the Producer-provided streams, and allows enqueuing work on streams other
than that provided by the Producer.
These options should not be set by default in either a Producer or a Consumer.
The CAI specification does not prescribe the exact mechanism by which these
options are set, or related options that Producers or Consumers might provide
to allow the user further control over synchronization behavior.
Synchronization in Numba
~~~~~~~~~~~~~~~~~~~~~~~~
Numba is neither strictly a Producer nor a Consumer - it may be used to
implement either by a User. In order to facilitate the correct implementation of
synchronization semantics, Numba exhibits the following behaviors related to
synchronization of the interface:
- When Numba acts as a Consumer (for example when an array-like object is passed
to a kernel launch): If ``stream`` is an integer, then Numba will immediately
synchronize on the provided ``stream``. A Numba :class:`Device Array
<numba.cuda.cudadrv.devicearray.DeviceNDArray>` created from an array-like
object has its *default stream* set to the provided stream.
- When Numba acts as a Producer (when the ``__cuda_array_interface__`` property
of a Numba CUDA Array is accessed): If the exported CUDA Array has a
*default stream*, then it is given as the ``stream`` entry. Otherwise,
``stream`` is set to ``None``.
.. note:: In Numba's terminology, an array's *default stream* is a property
specifying the stream that Numba will enqueue asynchronous
transfers in if no other stream is provided as an argument to the
function invoking the transfer. It is not the same as the `Default
Stream
<https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#default-stream>`_
in normal CUDA terminology.
Numba's synchronization behavior results in the following intended
consequences:
- Exchanging data either as a Producer or a Consumer will be correct without
the need for any further action from the User, provided that the other side
of the interaction also follows the CAI synchronization semantics.
- The User is expected to either:
- Avoid launching kernels or other operations on streams that
are not the default stream for their parameters, or
- When launching operations on a stream that is not the default stream for
a given parameter, they should then insert an event into the stream that
they are operating in, and wait on that event in the default stream for
the parameter. For an example of this, :ref:`see below
<example-multi-streams>`.
The User may override Numba's synchronization behavior by setting the
environment variable ``NUMBA_CUDA_ARRAY_INTERFACE_SYNC`` or the config variable
``CUDA_ARRAY_INTERFACE_SYNC`` to ``0`` (see :ref:`GPU Support Environment
Variables <numba-envvars-gpu-support>`). When set, Numba will not synchronize
on the streams of imported arrays, and it is the responsibility of the user to
ensure correctness with respect to stream synchronization. Synchronization when
creating a Numba CUDA Array from an object exporting the CUDA Array Interface
may also be elided by passing ``sync=False`` when creating the Numba CUDA
Array with :func:`numba.cuda.as_cuda_array` or
:func:`numba.cuda.from_cuda_array_interface`.
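For example, a minimal sketch of eliding synchronization when wrapping an
object exporting the interface (``producer_object`` here is assumed to be such
an object):

.. code-block:: python

   from numba import cuda

   # Correctness with respect to stream synchronization is now the caller's
   # responsibility.
   d_arr = cuda.as_cuda_array(producer_object, sync=False)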
There is scope for Numba's synchronization implementation to be optimized in
the future, by eliding synchronizations when a kernel or driver API operation
(e.g. a memcopy or memset) is launched on the same stream as an imported
array.
.. _example-multi-streams:
An example launching on an array's non-default stream
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This example shows how to ensure that a Consumer can safely consume an array
with a default stream when it is passed to a kernel launched in a different
stream.
First we need to import Numba and a consumer library (a fictitious library named
``other_cai_library`` for this example):
.. code-block:: python
from numba import cuda, int32, void
import other_cai_library
Now we'll define a kernel - this initializes the elements of the array, setting
each entry to its index:
.. code-block:: python
@cuda.jit(void, int32[::1])
def initialize_array(x):
i = cuda.grid(1)
if i < len(x):
x[i] = i
Next we will create two streams:
.. code-block:: python
array_stream = cuda.stream()
kernel_stream = cuda.stream()
Then create an array with one of the streams as its default stream:
.. code-block:: python
N = 16384
x = cuda.device_array(N, stream=array_stream)
Now we launch the kernel in the other stream:
.. code-block:: python
nthreads = 256
nblocks = N // nthreads
initialize_array[nthreads, nblocks, kernel_stream](x)
If we were to pass ``x`` to a Consumer now, there is a risk that it may operate on
it in ``array_stream`` whilst the kernel is still running in ``kernel_stream``.
To prevent operations in ``array_stream`` starting before the kernel launch is
finished, we create an event and wait on it:
.. code-block:: python
# Create event
evt = cuda.event()
# Record the event after the kernel launch in kernel_stream
evt.record(kernel_stream)
# Wait for the event in array_stream
evt.wait(array_stream)
It is now safe for ``other_cai_library`` to consume ``x``:
.. code-block:: python
other_cai_library.consume(x)
Lifetime management
-------------------
Data
~~~~
Obtaining the value of the ``__cuda_array_interface__`` property of any object
has no effect on the lifetime of the object from which it was created. In
particular, note that the interface has no slot for the owner of the data.
The User code must preserve the lifetime of the object owning the data for as
long as the Consumer might use it.
Streams
~~~~~~~
Like data, CUDA streams also have a finite lifetime. It is therefore required
that a Producer exporting data on the interface with an associated stream
ensures that the exported stream's lifetime is equal to or surpasses the
lifetime of the object from which the interface was exported.
Lifetime management in Numba
----------------------------
Producing Arrays
~~~~~~~~~~~~~~~~
Numba takes no steps to maintain the lifetime of an object from which the
interface is exported - it is the user's responsibility to ensure that the
underlying object is kept alive for the duration that the exported interface
might be used.
The lifetime of any Numba-managed stream exported on the interface is guaranteed
to equal or surpass the lifetime of the underlying object, because the
underlying object holds a reference to the stream.
.. note:: Numba-managed streams are those created with
``cuda.default_stream()``, ``cuda.legacy_default_stream()``, or
``cuda.per_thread_default_stream()``. Streams not managed by Numba
are created from an external stream with ``cuda.external_stream()``.
Consuming Arrays
~~~~~~~~~~~~~~~~
Numba provides two mechanisms for creating device arrays from objects exporting
the CUDA Array Interface. Which to use depends on whether the created device
array should maintain the lifetime of the object from which it is created:
- ``as_cuda_array``: This creates a device array that holds a reference to the
owning object. As long as a reference to the device array is held, its
underlying data will also be kept alive, even if all other references to the
original owning object have been dropped.
- ``from_cuda_array_interface``: This creates a device array with no reference
  to the owning object by default. The owning object, or some other object to
  be considered the owner, can be passed in the ``owner`` parameter.
The interfaces of these functions are:
.. automethod:: numba.cuda.as_cuda_array
.. automethod:: numba.cuda.from_cuda_array_interface
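As a rough usage sketch, assuming ``exporter`` is any object exposing
``__cuda_array_interface__`` (for example a CuPy array):

.. code-block:: python

   from numba import cuda

   # Keeps a reference to the exporting object for the lifetime of d1.
   d1 = cuda.as_cuda_array(exporter)

   # Keeps no reference by default; pass an owner explicitly if one is needed.
   desc = exporter.__cuda_array_interface__
   d2 = cuda.from_cuda_array_interface(desc, owner=exporter)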
Pointer Attributes
------------------
Additional information about the data pointer can be retrieved using
``cuPointerGetAttribute`` or ``cudaPointerGetAttributes``. Such information
includes:
- the CUDA context that owns the pointer;
- whether the pointer is accessible from the host;
- whether the pointer refers to managed memory.
.. _NumPy array interface: https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.interface.html#__array_interface__
Differences with CUDA Array Interface (Version 0)
-------------------------------------------------
Version 0 of the CUDA Array Interface did not have the optional **mask**
attribute to support masked arrays.
Differences with CUDA Array Interface (Version 1)
-------------------------------------------------
Versions 0 and 1 of the CUDA Array Interface neither clarified the
**strides** attribute for C-contiguous arrays nor specified the treatment for
zero-size arrays.
Differences with CUDA Array Interface (Version 2)
-------------------------------------------------
Prior versions of the CUDA Array Interface made no statement about
synchronization.
Interoperability
----------------
The following Python libraries have adopted the CUDA Array Interface:
- Numba
- `CuPy <https://docs-cupy.chainer.org/en/stable/reference/interoperability.html>`_
- `PyTorch <https://pytorch.org>`_
- `PyArrow <https://arrow.apache.org/docs/python/generated/pyarrow.cuda.Context.html#pyarrow.cuda.Context.buffer_from_object>`_
- `mpi4py <https://mpi4py.readthedocs.io/en/latest/overview.html#support-for-cuda-aware-mpi>`_
- `ArrayViews <https://github.com/xnd-project/arrayviews>`_
- `JAX <https://jax.readthedocs.io/en/latest/index.html>`_
- `PyCUDA <https://documen.tician.de/pycuda/tutorial.html#interoperability-with-other-libraries-using-the-cuda-array-interface>`_
- `DALI: the NVIDIA Data Loading Library <https://github.com/NVIDIA/DALI>`_ :
- `TensorGPU objects
<https://docs.nvidia.com/deeplearning/dali/user-guide/docs/data_types.html#nvidia.dali.backend.TensorGPU>`_
expose the CUDA Array Interface.
- `The External Source operator
<https://docs.nvidia.com/deeplearning/dali/user-guide/docs/supported_ops.html#nvidia.dali.fn.external_source>`_
consumes objects exporting the CUDA Array Interface.
- The RAPIDS stack:
- `cuDF <https://rapidsai.github.io/projects/cudf/en/0.11.0/10min-cudf-cupy.html>`_
- `cuML <https://docs.rapids.ai/api/cuml/nightly/>`_
- `cuSignal <https://github.com/rapidsai/cusignal>`_
- `RMM <https://docs.rapids.ai/api/rmm/stable/>`_
If your project is not on this list, please feel free to report it on the `Numba issue tracker <https://github.com/numba/numba/issues>`_.
.. _cuda_ffi:
Calling foreign functions from Python kernels
=============================================
Python kernels can call device functions written in other languages. CUDA C/C++,
PTX, and binary objects (cubins, fat binaries, etc.) are directly supported;
sources in other languages must be compiled to PTX first. The constituent parts
of a Python kernel call to a foreign device function are:
- The device function implementation in a foreign language (e.g. CUDA C).
- A declaration of the device function in Python.
- A kernel that links with and calls the foreign function.
Device function ABI
-------------------
Numba's ABI for calling device functions defines the following prototype in
C/C++:
.. code:: C
extern "C"
__device__ int
function(
T* return_value,
...
);
Components of the prototype are as follows:
- ``extern "C"`` is used to prevent name-mangling so that it is easy to declare
the function in Python. It can be removed, but then the mangled name must be
used in the declaration of the function in Python.
- ``__device__`` is required to define the function as a device function.
- The return value is always of type ``int``, and is used to signal whether a
Python exception occurred. Since Python exceptions don't occur in foreign
functions, this should always be set to 0 by the callee.
- The first argument is a pointer to the return value of type ``T``, which is
allocated in the local address space [#f1]_ and passed in by the caller. If
the function returns a value, the pointee should be set by the callee to
store the return value.
- Subsequent arguments should match the types and order of arguments passed to
the function from the Python kernel.
Functions written in other languages must compile to PTX that conforms to this
prototype specification.
A function that accepts two floats and returns a float would have the following
prototype:
.. code:: C
extern "C"
__device__ int
mul_f32_f32(
float* return_value,
float x,
float y
);
.. rubric:: Notes
.. [#f1] Care must be taken to ensure that any operations on the return value
are applicable to data in the local address space. Some operations,
such as atomics, cannot be performed on data in the local address
space.
Declaration in Python
---------------------
To declare a foreign device function in Python, use :func:`declare_device()
<numba.cuda.declare_device>`:
.. autofunction:: numba.cuda.declare_device
The returned descriptor name need not match the name of the foreign function.
For example, when:
.. code::
mul = cuda.declare_device('mul_f32_f32', 'float32(float32, float32)')
is declared, calling ``mul(a, b)`` inside a kernel will translate into a call to
``mul_f32_f32(a, b)`` in the compiled code.
Passing pointers
----------------
Numba's calling convention requires multiple values to be passed for array
arguments. These include the data pointer along with shape, stride, and other
information. This is incompatible with the expectations of most C/C++ functions,
which generally only expect a pointer to the data. To align the calling
conventions between C device code and Python kernels it is necessary to declare
array arguments using C pointer types.
For example, a function with the following prototype:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/ffi/functions.cu
:language: C
:caption: ``numba/cuda/tests/doc_examples/ffi/functions.cu``
:start-after: magictoken.ex_sum_reduce_proto.begin
:end-before: magictoken.ex_sum_reduce_proto.end
:linenos:
would be declared as follows:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_ffi.py
:language: python
:caption: from ``test_ex_from_buffer`` in ``numba/cuda/tests/doc_examples/test_ffi.py``
:start-after: magictoken.ex_from_buffer_decl.begin
:end-before: magictoken.ex_from_buffer_decl.end
:dedent: 8
:linenos:
To obtain a pointer to array data for passing to foreign functions, use the
``from_buffer()`` method of a ``cffi.FFI`` instance. For example, a kernel using
the ``sum_reduce`` function could be defined as:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_ffi.py
:language: python
:caption: from ``test_ex_from_buffer`` in ``numba/cuda/tests/doc_examples/test_ffi.py``
:start-after: magictoken.ex_from_buffer_kernel.begin
:end-before: magictoken.ex_from_buffer_kernel.end
:dedent: 8
:linenos:
where ``result`` and ``array`` are both arrays of ``float32`` data.
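In outline, and only as a sketch of the pattern in the included example
(assuming a foreign ``sum_reduce(float*, int)`` device function defined in
``functions.cu``, and that ``result`` is a one-element ``float32`` array):

.. code-block:: python

   import cffi
   from numba import cuda, types

   ffi = cffi.FFI()

   # Declaration matching the foreign sum_reduce(float*, int) prototype.
   sum_reduce = cuda.declare_device(
       'sum_reduce', types.float32(types.CPointer(types.float32), types.int32)
   )

   @cuda.jit(link=['functions.cu'])
   def reduction_caller(result, array):
       # from_buffer() yields a pointer to the array data matching the float*
       # parameter of the C prototype.
       array_ptr = ffi.from_buffer(array)
       result[0] = sum_reduce(array_ptr, len(array))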
Linking and Calling functions
-----------------------------
The ``link`` keyword argument of the :func:`@cuda.jit <numba.cuda.jit>`
decorator accepts a list of file names specified by absolute path or a path
relative to the current working directory. Files whose name ends in ``.cu``
will be compiled with the `NVIDIA Runtime Compiler (NVRTC)
<https://docs.nvidia.com/cuda/nvrtc/index.html>`_ and linked into the kernel as
PTX; other files will be passed directly to the CUDA Linker.
For example, the following kernel calls the ``mul()`` function declared above
with the implementation ``mul_f32_f32()`` in a file called ``functions.cu``:
.. code::
@cuda.jit(link=['functions.cu'])
def multiply_vectors(r, x, y):
i = cuda.grid(1)
if i < len(r):
r[i] = mul(x[i], y[i])
C/C++ Support
-------------
Support for compiling and linking of CUDA C/C++ code is provided through the use
of NVRTC subject to the following considerations:
- It is only available when using the NVIDIA Bindings. See
:envvar:`NUMBA_CUDA_USE_NVIDIA_BINDING`.
- A suitable version of the NVRTC library for the installed version of the
NVIDIA CUDA Bindings must be available.
- The CUDA include path is assumed by default to be ``/usr/local/cuda/include``
on Linux and ``$env:CUDA_PATH\include`` on Windows. It can be modified using
the environment variable :envvar:`NUMBA_CUDA_INCLUDE_PATH`.
- The CUDA include directory will be made available to NVRTC on the include
path; additional includes are not supported.
Complete Example
----------------
This example demonstrates calling a foreign function written in CUDA C to
multiply pairs of numbers from two arrays.
The foreign function is written as follows:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/ffi/functions.cu
:language: C
:caption: ``numba/cuda/tests/doc_examples/ffi/functions.cu``
:start-after: magictoken.ex_mul_f32_f32.begin
:end-before: magictoken.ex_mul_f32_f32.end
:linenos:
The Python code and kernel are:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_ffi.py
:language: python
:caption: from ``test_ex_linking_cu`` in ``numba/cuda/tests/doc_examples/test_ffi.py``
:start-after: magictoken.ex_linking_cu.begin
:end-before: magictoken.ex_linking_cu.end
:dedent: 8
:linenos:
.. note::
The example above is minimal in order to illustrate a foreign function call -
it would not be expected to be particularly performant due to the small grid
and light workload of the foreign function.
========================================
Supported Python features in CUDA Python
========================================
This page lists the Python features supported in CUDA Python. This includes
all kernel and device functions compiled with ``@cuda.jit`` and other
higher-level Numba decorators that target the CUDA GPU.
Language
========
Execution Model
---------------
CUDA Python maps directly to the *single-instruction multiple-thread*
(SIMT) execution model of CUDA. Each instruction is implicitly
executed by multiple threads in parallel. With this execution model, array
expressions are less useful because we don't want multiple threads to perform
the same task. Instead, we want threads to perform a task in a cooperative
fashion.
For details please consult the
`CUDA Programming Guide
<http://docs.nvidia.com/cuda/cuda-c-programming-guide/#programming-model>`_.
Floating Point Error Model
--------------------------
By default, CUDA Python kernels execute with the NumPy error model. In this
model, division by zero raises no exception and instead produces a result of
``inf``, ``-inf`` or ``nan``. This differs from the normal Python error model,
in which division by zero raises a ``ZeroDivisionError``.
When debug is enabled (by passing ``debug=True`` to the
:func:`@cuda.jit <numba.cuda.jit>` decorator), the Python error model is used.
This allows division-by-zero errors during kernel execution to be identified.
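For example, the following sketch (using hypothetical input arrays) produces
``inf`` values rather than raising an exception:

.. code-block:: python

   from numba import cuda
   import numpy as np

   @cuda.jit
   def divide(x, y, out):
       i = cuda.grid(1)
       if i < out.size:
           out[i] = x[i] / y[i]   # yields inf when y[i] == 0 under the NumPy model

   x = np.ones(4)
   y = np.zeros(4)
   out = np.zeros(4)
   divide[1, 4](x, y, out)        # out is now [inf, inf, inf, inf]

Compiling the same kernel with ``debug=True`` would instead allow the division
by zero to be identified as an error.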
Constructs
----------
The following Python constructs are not supported:
* Exception handling (``try .. except``, ``try .. finally``)
* Context management (the ``with`` statement)
* Comprehensions (either list, dict, set or generator comprehensions)
* Generator (any ``yield`` statements)
The ``raise`` and ``assert`` statements are supported, with the following
constraints:
- They can only be used in kernels, not in device functions.
- They only have an effect when ``debug=True`` is passed to the
:func:`@cuda.jit <numba.cuda.jit>` decorator. This is similar to the behavior
of the ``assert`` keyword in CUDA C/C++, which is ignored unless compiling
with device debug turned on.
Printing of strings, integers, and floats is supported, but printing is an
asynchronous operation - in order to ensure that all output is printed after a
kernel launch, it is necessary to call :func:`numba.cuda.synchronize`. Eliding
the call to ``synchronize`` is acceptable, but output from a kernel may appear
during other later driver operations (e.g. subsequent kernel launches, memory
transfers, etc.), or fail to appear before the program execution completes. Up
to 32 arguments may be passed to the ``print`` function - if more are passed
then a format string will be emitted instead and a warning will be produced.
This is due to a general limitation in CUDA printing, as outlined in the
`section on limitations in printing
<https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#limitations>`_
in the CUDA C++ Programming Guide.
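For example, a minimal sketch of printing from a kernel and synchronizing to
ensure the output appears:

.. code-block:: python

   from numba import cuda

   @cuda.jit
   def say_hello():
       print("Hello from thread", cuda.grid(1))

   say_hello[1, 4]()
   cuda.synchronize()   # ensure all device-side output is printed before continuing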
Recursion
---------
Self-recursive device functions are supported, with the constraint that
recursive calls must have the same argument types as the initial call to
the function. For example, the following form of recursion is supported:
.. code:: python
@cuda.jit("int64(int64)", device=True)
def fib(n):
if n < 2:
return n
return fib(n - 1) + fib(n - 2)
(the ``fib`` function always has an ``int64`` argument), whereas the following
is unsupported:
.. code:: python
# Called with x := int64, y := float64
@cuda.jit
def type_change_self(x, y):
if x > 1 and y > 0:
return x + type_change_self(x - y, y)
else:
return y
The outer call to ``type_change_self`` provides ``(int64, float64)`` arguments,
but the inner call uses ``(float64, float64)`` arguments (because ``x - y`` /
``int64 - float64`` results in a ``float64`` type). Therefore, this function is
unsupported.
Mutual recursion between functions (e.g. where a function ``func1()`` calls
``func2()`` which again calls ``func1()``) is unsupported.
.. note::
The call stack in CUDA is typically quite limited in size, so it is easier
to overflow it with recursive calls on CUDA devices than it is on CPUs.
Stack overflow will result in an Unspecified Launch Failure (ULF) during
kernel execution. In order to identify whether a ULF is due to stack
overflow, programs can be run under `Compute Sanitizer
<https://docs.nvidia.com/compute-sanitizer/ComputeSanitizer/index.html>`_,
which explicitly states when stack overflow has occurred.
.. _cuda-built-in-types:
Built-in types
===============
Support for the following built-in types is inherited from CPU nopython mode.
* int
* float
* complex
* bool
* None
* tuple
* Enum, IntEnum
See :ref:`nopython built-in types <pysupported-builtin-types>`.
There is also some very limited support for character sequences (bytes and
unicode strings) used in NumPy arrays. Note that this support can only be used
with CUDA 11.2 onwards.
Built-in functions
==================
The following built-in functions are supported:
* :func:`abs`
* :class:`bool`
* :class:`complex`
* :func:`enumerate`
* :class:`float`
* :class:`int`: only the one-argument form
* :func:`len`
* :func:`min`: only the multiple-argument form
* :func:`max`: only the multiple-argument form
* :func:`pow`
* :class:`range`
* :func:`round`
* :func:`zip`
Standard library modules
========================
``cmath``
---------
The following functions from the :mod:`cmath` module are supported:
* :func:`cmath.acos`
* :func:`cmath.acosh`
* :func:`cmath.asin`
* :func:`cmath.asinh`
* :func:`cmath.atan`
* :func:`cmath.atanh`
* :func:`cmath.cos`
* :func:`cmath.cosh`
* :func:`cmath.exp`
* :func:`cmath.isfinite`
* :func:`cmath.isinf`
* :func:`cmath.isnan`
* :func:`cmath.log`
* :func:`cmath.log10`
* :func:`cmath.phase`
* :func:`cmath.polar`
* :func:`cmath.rect`
* :func:`cmath.sin`
* :func:`cmath.sinh`
* :func:`cmath.sqrt`
* :func:`cmath.tan`
* :func:`cmath.tanh`
``math``
--------
The following functions from the :mod:`math` module are supported:
* :func:`math.acos`
* :func:`math.asin`
* :func:`math.atan`
* :func:`math.acosh`
* :func:`math.asinh`
* :func:`math.atanh`
* :func:`math.cos`
* :func:`math.sin`
* :func:`math.tan`
* :func:`math.hypot`
* :func:`math.cosh`
* :func:`math.sinh`
* :func:`math.tanh`
* :func:`math.atan2`
* :func:`math.erf`
* :func:`math.erfc`
* :func:`math.exp`
* :func:`math.expm1`
* :func:`math.fabs`
* :func:`math.frexp`
* :func:`math.ldexp`
* :func:`math.gamma`
* :func:`math.lgamma`
* :func:`math.log`
* :func:`math.log2`
* :func:`math.log10`
* :func:`math.log1p`
* :func:`math.sqrt`
* :func:`math.remainder`
* :func:`math.pow`
* :func:`math.ceil`
* :func:`math.floor`
* :func:`math.copysign`
* :func:`math.fmod`
* :func:`math.modf`
* :func:`math.isnan`
* :func:`math.isinf`
* :func:`math.isfinite`
``operator``
------------
The following functions from the :mod:`operator` module are supported:
* :func:`operator.add`
* :func:`operator.and_`
* :func:`operator.eq`
* :func:`operator.floordiv`
* :func:`operator.ge`
* :func:`operator.gt`
* :func:`operator.iadd`
* :func:`operator.iand`
* :func:`operator.ifloordiv`
* :func:`operator.ilshift`
* :func:`operator.imod`
* :func:`operator.imul`
* :func:`operator.invert`
* :func:`operator.ior`
* :func:`operator.ipow`
* :func:`operator.irshift`
* :func:`operator.isub`
* :func:`operator.itruediv`
* :func:`operator.ixor`
* :func:`operator.le`
* :func:`operator.lshift`
* :func:`operator.lt`
* :func:`operator.mod`
* :func:`operator.mul`
* :func:`operator.ne`
* :func:`operator.neg`
* :func:`operator.not_`
* :func:`operator.or_`
* :func:`operator.pos`
* :func:`operator.pow`
* :func:`operator.rshift`
* :func:`operator.sub`
* :func:`operator.truediv`
* :func:`operator.xor`
.. _cuda_numpy_support:
NumPy support
=============
Due to the CUDA programming model, dynamic memory allocation inside a kernel is
inefficient and is often not needed. Numba disallows any memory-allocating features.
This disables a large number of NumPy APIs. For best performance, users should write
code such that each thread deals with a single element at a time.
Supported NumPy features:
* accessing `ndarray` attributes `.shape`, `.strides`, `.ndim`, `.size`, etc.
* indexing and slicing work.
* A subset of ufuncs are supported, but the output array must be passed in as a
positional argument (see :ref:`cuda_ufunc_call_example`). Note that ufuncs
execute sequentially in each thread - there is no automatic parallelisation
of ufuncs across threads over the elements of an input array.
The following ufuncs are supported:
* :func:`numpy.sin`
* :func:`numpy.cos`
* :func:`numpy.tan`
* :func:`numpy.arcsin`
* :func:`numpy.arccos`
* :func:`numpy.arctan`
* :func:`numpy.arctan2`
* :func:`numpy.hypot`
* :func:`numpy.sinh`
* :func:`numpy.cosh`
* :func:`numpy.tanh`
* :func:`numpy.arcsinh`
* :func:`numpy.arccosh`
* :func:`numpy.arctanh`
* :func:`numpy.deg2rad`
* :func:`numpy.radians`
* :func:`numpy.rad2deg`
* :func:`numpy.degrees`
* :func:`numpy.greater`
* :func:`numpy.greater_equal`
* :func:`numpy.less`
* :func:`numpy.less_equal`
* :func:`numpy.not_equal`
* :func:`numpy.equal`
* :func:`numpy.logical_and`
* :func:`numpy.logical_or`
* :func:`numpy.logical_xor`
* :func:`numpy.logical_not`
* :func:`numpy.maximum`
* :func:`numpy.minimum`
* :func:`numpy.fmax`
* :func:`numpy.fmin`
* :func:`numpy.bitwise_and`
* :func:`numpy.bitwise_or`
* :func:`numpy.bitwise_xor`
* :func:`numpy.invert`
* :func:`numpy.bitwise_not`
* :func:`numpy.left_shift`
* :func:`numpy.right_shift`
Unsupported NumPy features:
* array creation APIs.
* array methods.
* functions that return a new array.
CFFI support
============
The ``from_buffer()`` method of ``cffi.FFI`` objects is supported. This is
useful for obtaining a pointer that can be passed to external C / C++ / PTX
functions (see the :ref:`CUDA FFI documentation <cuda_ffi>`).
Writing Device Functions
========================
CUDA device functions can only be invoked from within the device (by a kernel
or another device function). To define a device function::
from numba import cuda
@cuda.jit(device=True)
def a_device_function(a, b):
return a + b
Unlike a kernel function, a device function can return a value like normal
functions.
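For example, the device function above might be called from a kernel as in the
following sketch::

    @cuda.jit
    def a_kernel_function(x, y, out):
        i = cuda.grid(1)
        if i < out.size:
            # Device functions are called like ordinary Python functions and
            # may return a value.
            out[i] = a_device_function(x[i], y[i])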
Device management
=================
For multi-GPU machines, users may want to select which GPU to use.
By default the CUDA driver selects the fastest GPU as device 0,
which is the default device used by Numba.
The features introduced on this page are generally not of interest
unless working with systems hosting more than one CUDA-capable GPU.
Device Selection
----------------
If at all required, device selection must be done before any CUDA feature is
used.
::
from numba import cuda
cuda.select_device(0)
The device can be closed by:
::
cuda.close()
Users can then create a new context with another device.
::
cuda.select_device(1) # assuming we have 2 GPUs
.. function:: numba.cuda.select_device(device_id)
:noindex:
Create a new CUDA context for the selected *device_id*. *device_id*
should be the number of the device (starting from 0; the device order
is determined by the CUDA libraries). The context is associated with
the current thread. Numba currently allows only one context per thread.
If successful, this function returns a device instance.
.. XXX document device instances?
.. function:: numba.cuda.close
:noindex:
Explicitly close all contexts in the current thread.
.. note::
Compiled functions are associated with the CUDA context.
This makes it not very useful to close and create new devices, though it
is certainly useful for choosing which device to use when the machine
has multiple GPUs.
The Device List
===============
The Device List is a list of all the GPUs in the system, and can be indexed to
obtain a context manager that ensures execution on the selected GPU.
.. attribute:: numba.cuda.gpus
:noindex:
.. attribute:: numba.cuda.cudadrv.devices.gpus
:py:data:`numba.cuda.gpus` is an instance of the ``_DeviceList`` class, from
which the current GPU context can also be retrieved:
.. autoclass:: numba.cuda.cudadrv.devices._DeviceList
:members: current
:noindex:
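For example, indexing the device list yields a context manager that runs the
enclosed operations on the chosen device (a minimal sketch, assuming at least
one GPU is present):

.. code-block:: python

   import numpy as np
   from numba import cuda

   with cuda.gpus[0]:
       # Allocations and kernel launches in this block target device 0.
       d_arr = cuda.to_device(np.arange(10))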
Device UUIDs
============
The UUID of a device (equal to that returned by ``nvidia-smi -L``) is available
in the :attr:`uuid <numba.cuda.cudadrv.driver.Device.uuid>` attribute of a CUDA
device object.
For example, to obtain the UUID of the current device:
.. code-block:: python
dev = cuda.current_context().device
# prints e.g. "GPU-e6489c45-5b68-3b03-bab7-0e7c8e809643"
print(dev.uuid)
========
Examples
========
.. _cuda-vecadd:
Vector Addition
===============
This example uses Numba to create on-device arrays and a vector addition kernel;
it is a warmup for learning how to write GPU kernels using Numba. We'll begin
with some required imports:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_vecadd.py
:language: python
:caption: from ``test_ex_vecadd`` in ``numba/cuda/tests/doc_examples/test_vecadd.py``
:start-after: ex_vecadd.import.begin
:end-before: ex_vecadd.import.end
:dedent: 8
:linenos:
The following function is the kernel. Note that it is defined in terms of Python
variables with unspecified types. When the kernel is launched, Numba will
examine the types of the arguments that are passed at runtime and generate a
CUDA kernel specialized for them.
Note that Numba kernels do not return values and must write any output into
arrays passed in as parameters (this is similar to the requirement that CUDA
C/C++ kernels have ``void`` return type). Here we pass in ``c`` for the results
to be written into.
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_vecadd.py
:language: python
:caption: from ``test_ex_vecadd`` in ``numba/cuda/tests/doc_examples/test_vecadd.py``
:start-after: ex_vecadd.kernel.begin
:end-before: ex_vecadd.kernel.end
:dedent: 8
:linenos:
:func:`cuda.to_device() <numba.cuda.to_device>` can be used to create device-side
copies of arrays. :func:`cuda.device_array_like()
<numba.cuda.device_array_like>` creates an uninitialized array of the same shape
and type as an existing array. Here we transfer two vectors and create an empty
vector to hold our results:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_vecadd.py
:language: python
:caption: from ``test_ex_vecadd`` in ``numba/cuda/tests/doc_examples/test_vecadd.py``
:start-after: ex_vecadd.allocate.begin
:end-before: ex_vecadd.allocate.end
:dedent: 8
:linenos:
A call to :meth:`forall() <numba.cuda.dispatcher.Dispatcher.forall>` generates
an appropriate launch configuration with a 1D grid (see
:ref:`cuda-kernel-invocation`) for a given data size and is often the simplest
way of launching a kernel:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_vecadd.py
:language: python
:caption: from ``test_ex_vecadd`` in ``numba/cuda/tests/doc_examples/test_vecadd.py``
:start-after: ex_vecadd.forall.begin
:end-before: ex_vecadd.forall.end
:dedent: 8
:linenos:
This prints:
.. code-block:: none
[0.73548323 1.32061059 0.12582968 ... 1.25925809 1.49335059 1.59315414]
One can also configure the grid manually using the subscripting syntax. The
following example launches a grid with sufficient threads to operate on every
vector element:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_vecadd.py
:language: python
:caption: from ``test_ex_vecadd`` in ``numba/cuda/tests/doc_examples/test_vecadd.py``
:start-after: ex_vecadd.launch.begin
:end-before: ex_vecadd.launch.end
:dedent: 8
:linenos:
This also prints:
.. code-block:: none
[0.73548323 1.32061059 0.12582968 ... 1.25925809 1.49335059 1.59315414]
.. _cuda-laplace:
1D Heat Equation
=====================
This example solves Laplace's equation in one dimension for a certain set of initial
conditions and boundary conditions. A full discussion of Laplace's equation is out of
scope for this documentation, but it will suffice to say that it describes how heat
propagates through an object over time. It works by discretizing the problem in two ways:
1. The domain is partitioned into a mesh of points that each have an individual temperature.
2. Time is partitioned into discrete intervals that are advanced forward sequentially.
Then, the following assumption is applied: The temperature of a point after some interval
has passed is some weighted average of the temperature of the points that are directly
adjacent to it. Intuitively, if all the points in the domain are very hot
and a single point in the middle is very cold, as time passes, the hot points will cause
the cold one to heat up and the cold point will cause the surrounding hot pieces to cool
slightly. Simply put, the heat spreads throughout the object.
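As a host-side illustration of this update rule (not the CUDA kernel used in
the example below, and with the plain average of the two neighbours as one
simple choice of weights), a single time step could be written as:

.. code-block:: python

   def step(old):
       # Each interior point becomes the average of its two neighbours; the
       # end points are held fixed by the boundary conditions.
       new = list(old)
       for i in range(1, len(old) - 1):
           new[i] = (old[i - 1] + old[i + 1]) / 2
       return new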
We can implement this simulation using a Numba kernel. Let's start simple by assuming
we have a one dimensional object which we'll represent with an array of values. The position
of the element in the array is the position of a point within the object, and the value
of the element represents the temperature.
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_laplace.py
:language: python
:caption: from ``test_ex_laplace`` in ``numba/cuda/tests/doc_examples/test_laplace.py``
:start-after: ex_laplace.import.begin
:end-before: ex_laplace.import.end
:dedent: 8
:linenos:
Some initial setup here. Let's make one point in the center of the object very hot.
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_laplace.py
:language: python
:caption: from ``test_ex_laplace`` in ``numba/cuda/tests/doc_examples/test_laplace.py``
:start-after: ex_laplace.allocate.begin
:end-before: ex_laplace.allocate.end
:dedent: 8
:linenos:
The initial state of the problem can be visualized as:
.. image:: laplace_initial.svg
In our kernel each thread will be responsible for managing the temperature update for a single element
in a loop over the desired number of timesteps. The kernel is below. Note the use of cooperative group
synchronization and the use of two buffers swapped at each iteration to avoid race conditions. See
:func:`numba.cuda.cg.this_grid() <numba.cuda.cg.this_grid>` for details.
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_laplace.py
:language: python
:caption: from ``test_ex_laplace`` in ``numba/cuda/tests/doc_examples/test_laplace.py``
:start-after: ex_laplace.kernel.begin
:end-before: ex_laplace.kernel.end
:dedent: 8
:linenos:
Calling the kernel:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_laplace.py
:language: python
:caption: from ``test_ex_laplace`` in ``numba/cuda/tests/doc_examples/test_laplace.py``
:start-after: ex_laplace.launch.begin
:end-before: ex_laplace.launch.end
:dedent: 8
:linenos:
Plotting the final data shows an arc that is highest where
the object was hot initially and gradually sloping down to zero towards the
edges where the temperature is fixed at zero. In the limit of infinite time,
the arc will flatten out completely.
.. image:: laplace_final.svg
.. _cuda_reduction_shared:
Shared Memory Reduction
=======================
Numba exposes many CUDA features, including :ref:`shared memory
<cuda-shared-memory>`. To demonstrate shared memory, let's reimplement a
famous CUDA solution for summing a vector which works by "folding" the data up
using a successively smaller number of threads.
Note that this is a fairly naive implementation, and there are more efficient ways of implementing reductions
using Numba - see :ref:`cuda_montecarlo` for an example.
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_reduction.py
:language: python
:caption: from ``test_ex_reduction`` in ``numba/cuda/tests/doc_examples/test_reduction.py``
:start-after: ex_reduction.import.begin
:end-before: ex_reduction.import.end
:dedent: 8
:linenos:
Let's create some one dimensional data that we'll use to demonstrate the
kernel itself:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_reduction.py
:language: python
:caption: from ``test_ex_reduction`` in ``numba/cuda/tests/doc_examples/test_reduction.py``
:start-after: ex_reduction.allocate.begin
:end-before: ex_reduction.allocate.end
:dedent: 8
:linenos:
Here is a version of the kernel implemented using Numba:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_reduction.py
:language: python
:caption: from ``test_ex_reduction`` in ``numba/cuda/tests/doc_examples/test_reduction.py``
:start-after: ex_reduction.kernel.begin
:end-before: ex_reduction.kernel.end
:dedent: 8
:linenos:
We can run the kernel and verify that the same result is obtained by
summing the data on the host as follows:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_reduction.py
:language: python
:caption: from ``test_ex_reduction`` in ``numba/cuda/tests/doc_examples/test_reduction.py``
:start-after: ex_reduction.launch.begin
:end-before: ex_reduction.launch.end
:dedent: 8
:linenos:
This algorithm can be greatly improved upon by redesigning the inner loop
to use sequential memory accesses, and even further by using strategies that
keep more threads active and working, since in this example most threads quickly
become idle.
.. _cuda_sessionization:
Dividing Click Data into Sessions
=================================
A common problem in business analytics is that of grouping the activity of users of an online platform into
sessions, called "sessionization". The idea is that users generally traverse through a website and perform
various actions (clicking something, filling out a form, etc.) in discrete groups. Perhaps a customer spends
some time shopping for an item in the morning and then again at night - often the business is interested in
treating these periods as separate interactions with their service, and this creates the problem of
programmatically splitting up activity in some agreed-upon way.
Here we'll illustrate how to write a Numba kernel to solve this problem. We'll start with data
containing two fields: let ``user_id`` represent a unique ID corresponding to an individual customer, and let
``action_time`` be a time that some unknown action was taken on the service. Right now, we'll assume there's
only one type of action, so all there is to know is when it happened.
Our goal will be to create a new column called ``session_id``, which contains a label corresponding to a unique
session. We'll define the boundary between sessions as when there has been at least one hour between clicks.
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_sessionize.py
:language: python
:caption: from ``test_ex_sessionize`` in ``numba/cuda/tests/doc_examples/test_sessionize.py``
:start-after: ex_sessionize.import.begin
:end-before: ex_sessionize.import.end
:dedent: 8
:linenos:
Here is a solution using Numba:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_sessionize.py
:language: python
:caption: from ``test_ex_sessionize`` in ``numba/cuda/tests/doc_examples/test_sessionize.py``
:start-after: ex_sessionize.kernel.begin
:end-before: ex_sessionize.kernel.end
:dedent: 8
:linenos:
Let's generate some data and try out the kernel:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_sessionize.py
:language: python
:caption: from ``test_ex_sessionize`` in ``numba/cuda/tests/doc_examples/test_sessionize.py``
:start-after: ex_sessionize.allocate.begin
:end-before: ex_sessionize.allocate.end
:dedent: 8
:linenos:
As can be seen above, the kernel successfully divided the first three datapoints from the second three for the first user ID,
and a similar pattern is seen throughout.
.. _cuda_reuse_function:
JIT Function CPU-GPU Compatibility
==================================
This example demonstrates how ``numba.jit`` can be used to JIT compile a function for the CPU, while at the same time making
it available for use inside CUDA kernels. This can be very useful for users who are migrating workflows from CPU to GPU, as
they can directly reuse existing business logic with fewer code changes.
Take the following example function:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py
:language: python
:caption: from ``test_ex_cpu_gpu_compat`` in ``numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py``
:start-after: ex_cpu_gpu_compat.define.begin
:end-before: ex_cpu_gpu_compat.define.end
:dedent: 8
:linenos:
The function ``business_logic`` can be run standalone in compiled form on the CPU:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py
:language: python
:caption: from ``test_ex_cpu_gpu_compat`` in ``numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py``
:start-after: ex_cpu_gpu_compat.cpurun.begin
:end-before: ex_cpu_gpu_compat.cpurun.end
:dedent: 8
:linenos:
It can also be directly reused threadwise inside a GPU kernel. For example one may
generate some vectors to represent ``x``, ``y``, and ``z``:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py
:language: python
:caption: from ``test_ex_cpu_gpu_compat`` in ``numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py``
:start-after: ex_cpu_gpu_compat.allocate.begin
:end-before: ex_cpu_gpu_compat.allocate.end
:dedent: 8
:linenos:
And a Numba kernel referencing the decorated function:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py
:language: python
:caption: from ``test_ex_cpu_gpu_compat`` in ``numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py``
:start-after: ex_cpu_gpu_compat.usegpu.begin
:end-before: ex_cpu_gpu_compat.usegpu.end
:dedent: 8
:linenos:
This kernel can be invoked in the normal way:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py
:language: python
:caption: from ``test_ex_cpu_gpu_compat`` in ``numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py``
:start-after: ex_cpu_gpu_compat.launch.begin
:end-before: ex_cpu_gpu_compat.launch.end
:dedent: 8
:linenos:
.. _cuda_montecarlo:
Monte Carlo Integration
=======================
This example shows how to use Numba to approximate the value of a definite integral by rapidly generating
random numbers on the GPU. A detailed description of the mathematical mechanics of Monte Carlo integration
is out of the scope of this example, but it can briefly be described as an averaging process where the area
under the curve is approximated by taking the average of many rectangles formed by its function values.
In addition, this example shows how to perform reductions in numba using the
:func:`cuda.reduce() <numba.cuda.Reduce>` API.
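The reduction API referred to here turns a binary operation into a full
reduction over an array. As a standalone sketch, separate from the example code
included below:

.. code-block:: python

   import numpy as np
   from numba import cuda

   @cuda.reduce
   def sum_reduce(a, b):
       return a + b

   values = np.arange(1024, dtype=np.float64)
   total = sum_reduce(values)   # performs the summation on the GPU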
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_montecarlo.py
:language: python
:caption: from ``test_ex_montecarlo`` in ``numba/cuda/tests/doc_examples/test_montecarlo.py``
:start-after: ex_montecarlo.import.begin
:end-before: ex_montecarlo.import.end
:dedent: 8
:linenos:
Let's create a variable to control the number of samples drawn:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_montecarlo.py
:language: python
:caption: from ``test_ex_montecarlo`` in ``numba/cuda/tests/doc_examples/test_montecarlo.py``
:start-after: ex_montecarlo.define.begin
:end-before: ex_montecarlo.define.end
:dedent: 8
:linenos:
The following kernel implements the main integration routine:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_montecarlo.py
:language: python
:caption: from ``test_ex_montecarlo`` in ``numba/cuda/tests/doc_examples/test_montecarlo.py``
:start-after: ex_montecarlo.kernel.begin
:end-before: ex_montecarlo.kernel.end
:dedent: 8
:linenos:
This convenience function calls the kernel and performs some
preprocessing and postprocessing steps. Note the use of Numba's reduction API to
take the sum of the array and compute the final result:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_montecarlo.py
:language: python
:caption: from ``test_ex_montecarlo`` in ``numba/cuda/tests/doc_examples/test_montecarlo.py``
:start-after: ex_montecarlo.callfunc.begin
:end-before: ex_montecarlo.callfunc.end
:dedent: 8
:linenos:
We can now use ``mc_integrate`` to compute the definite integral of this function between
two limits:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_montecarlo.py
:language: python
:caption: from ``test_ex_montecarlo`` in ``numba/cuda/tests/doc_examples/test_montecarlo.py``
:start-after: ex_montecarlo.launch.begin
:end-before: ex_montecarlo.launch.end
:dedent: 8
:linenos:
.. _cuda-matmul:
Matrix multiplication
=====================
First, import the modules needed for this example:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_matmul.py
:language: python
:caption: from ``test_ex_matmul`` in ``numba/cuda/tests/doc_examples/test_matmul.py``
:start-after: magictoken.ex_import.begin
:end-before: magictoken.ex_import.end
:dedent: 8
:linenos:
Here is a naïve implementation of matrix multiplication using a CUDA kernel:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_matmul.py
:language: python
:caption: from ``test_ex_matmul`` in ``numba/cuda/tests/doc_examples/test_matmul.py``
:start-after: magictoken.ex_matmul.begin
:end-before: magictoken.ex_matmul.end
:dedent: 8
:linenos:
An example usage of this function is as follows:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_matmul.py
:language: python
:caption: from ``test_ex_matmul`` in ``numba/cuda/tests/doc_examples/test_matmul.py``
:start-after: magictoken.ex_run_matmul.begin
:end-before: magictoken.ex_run_matmul.end
:dedent: 8
:linenos:
This implementation is straightforward and intuitive but performs poorly,
because the same matrix elements will be loaded multiple times from device
memory, which is slow (some devices may have transparent data caches, but
they may not be large enough to hold the entire inputs at once).
It will be faster if we use a blocked algorithm to reduce accesses to the
device memory. CUDA provides a fast :ref:`shared memory <cuda-shared-memory>`
for threads in a block to cooperatively compute on a task. The following
implements a faster version of the square matrix multiplication using shared
memory:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_matmul.py
:language: python
:caption: from ``test_ex_matmul`` in ``numba/cuda/tests/doc_examples/test_matmul.py``
:start-after: magictoken.ex_fast_matmul.begin
:end-before: magictoken.ex_fast_matmul.end
:dedent: 8
:linenos:
Because the shared memory is a limited resource, the code preloads a small
block at a time from the input arrays. Then, it calls
:func:`~numba.cuda.syncthreads` to wait until all threads have finished
preloading before doing the computation on the shared memory.
It synchronizes again after the computation to ensure all threads
have finished with the data in shared memory before overwriting it
in the next loop iteration.
An example usage of the ``fast_matmul`` function is as follows:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_matmul.py
:language: python
:caption: from ``test_ex_matmul`` in ``numba/cuda/tests/doc_examples/test_matmul.py``
:start-after: magictoken.ex_run_fast_matmul.begin
:end-before: magictoken.ex_run_fast_matmul.end
:dedent: 8
:linenos:
This passes a :ref:`CUDA memory check test <debugging-cuda-python-code>`, which
can help with debugging. Running the code above produces the following output:
.. code-block:: none
$ python fast_matmul.py
[[ 6. 6. 6. 6.]
[22. 22. 22. 22.]
[38. 38. 38. 38.]
[54. 54. 54. 54.]]
[[ 6. 6. 6. 6.]
[22. 22. 22. 22.]
[38. 38. 38. 38.]
[54. 54. 54. 54.]]
.. note:: For high performance matrix multiplication in CUDA, see also the `CuPy implementation <https://docs.cupy.dev/en/stable/reference/generated/cupy.matmul.html>`_.
The approach outlined here generalizes to non-square matrix multiplication by
adjusting the ``blockspergrid`` variable accordingly.
Again, here is an example usage:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_matmul.py
:language: python
:caption: from ``test_ex_matmul`` in ``numba/cuda/tests/doc_examples/test_matmul.py``
:start-after: magictoken.ex_run_nonsquare.begin
:end-before: magictoken.ex_run_nonsquare.end
:dedent: 8
:linenos:
and the corresponding output:
.. code-block:: none
$ python nonsquare_matmul.py
[[ 253. 253. 253. 253. 253. 253. 253.]
[ 782. 782. 782. 782. 782. 782. 782.]
[1311. 1311. 1311. 1311. 1311. 1311. 1311.]
[1840. 1840. 1840. 1840. 1840. 1840. 1840.]
[2369. 2369. 2369. 2369. 2369. 2369. 2369.]]
[[ 253. 253. 253. 253. 253. 253. 253.]
[ 782. 782. 782. 782. 782. 782. 782.]
[1311. 1311. 1311. 1311. 1311. 1311. 1311.]
[1840. 1840. 1840. 1840. 1840. 1840. 1840.]
[2369. 2369. 2369. 2369. 2369. 2369. 2369.]]
.. _cuda_ufunc_call_example:
Calling a NumPy UFunc
=====================
UFuncs supported in the CUDA target (see :ref:`cuda_numpy_support`) can be
called inside kernels, but the output array must be passed in as a positional
argument. The following example demonstrates a call to :func:`np.sin` inside a
kernel following this pattern:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_ufunc.py
:language: python
:caption: from ``test_ex_cuda_ufunc_call`` in ``numba/cuda/tests/doc_examples/test_ufunc.py``
:start-after: ex_cuda_ufunc.begin
:end-before: ex_cuda_ufunc.end
:dedent: 8
:linenos:
.. _cuda-emm-plugin:
=================================================
External Memory Management (EMM) Plugin interface
=================================================
The :ref:`CUDA Array Interface <cuda-array-interface>` enables sharing of data
between different Python libraries that access CUDA devices. However, each
library manages its own memory distinctly from the others. For example:
- By default, Numba allocates memory on CUDA devices by interacting with the
CUDA driver API to call functions such as ``cuMemAlloc`` and ``cuMemFree``,
which is suitable for many use cases.
- The RAPIDS libraries (cuDF, cuML, etc.) use the `RAPIDS Memory Manager (RMM)
<https://github.com/rapidsai/rmm>`_ for allocating device memory.
- `CuPy <https://cupy.chainer.org/>`_ includes a `memory pool implementation
<https://docs-cupy.chainer.org/en/stable/reference/memory.html>`_ for both
device and pinned memory.
When multiple CUDA-aware libraries are used together, it may be preferable for
Numba to defer to another library for memory management. The EMM Plugin
interface facilitates this, by enabling Numba to use another CUDA-aware library
for all allocations and deallocations.
An EMM Plugin is used to facilitate the use of an external library for memory
management. An EMM Plugin can be a part of an external library, or could be
implemented as a separate library.
Overview of External Memory Management
======================================
When an EMM Plugin is in use (see :ref:`setting-emm-plugin`), Numba will make
memory allocations and deallocations through the Plugin. It will never directly call
functions such as ``cuMemAlloc``, ``cuMemFree``, etc.
EMM Plugins always take responsibility for the management of device memory.
However, not all CUDA-aware libraries also support managing host memory, so a
facility for Numba to continue the management of host memory whilst ceding
control of device memory to the EMM is provided (see
:ref:`host-only-cuda-memory-manager`).
Effects on Deallocation Strategies
----------------------------------
Numba's internal :ref:`deallocation-behavior` is designed to increase efficiency
by deferring deallocations until a significant quantity are pending. It also
provides a mechanism for preventing deallocations entirely during critical
sections, using the :func:`~numba.cuda.defer_cleanup` context manager.
When an EMM Plugin is in use, the deallocation strategy is implemented by the
EMM, and Numba's internal deallocation mechanism is not used. The EMM
Plugin could implement:
- A similar strategy to the Numba deallocation behaviour, or
- Something more appropriate to the plugin - for example, deallocated memory
might immediately be returned to a memory pool.
The ``defer_cleanup`` context manager may behave differently when an EMM Plugin
is in use, so an EMM Plugin should be accompanied by documentation of the
behaviour of ``defer_cleanup`` whilst the plugin is active. For example, a pool
allocator could always immediately return memory to a pool even when the
context manager is in use, but could choose not to free empty pools until
``defer_cleanup`` is no longer in use.
Management of other objects
---------------------------
In addition to memory, Numba manages the allocation and deallocation of
:ref:`events <events>`, :ref:`streams <streams>`, and modules (a module is a
compiled object, which is generated from ``@cuda.jit``\ -ted functions). The
management of events, streams, and modules is unchanged by the use of an EMM
Plugin.
Asynchronous allocation and deallocation
----------------------------------------
The present EMM Plugin interface does not provide support for asynchronous
allocation and deallocation. This may be added to a future version of the
interface.
Implementing an EMM Plugin
==========================
An EMM Plugin is implemented by deriving from
:class:`~numba.cuda.BaseCUDAMemoryManager`. A summary of considerations for the
implementation follows, with a minimal sketch after the list:
- Numba instantiates one instance of the EMM Plugin class per context. The
context that owns an EMM Plugin object is accessible through ``self.context``,
if required.
- The EMM Plugin is transparent to any code that uses Numba - all its methods
are invoked by Numba, and never need to be called by code that uses Numba.
- The allocation methods ``memalloc``, ``memhostalloc``, and ``mempin`` should
use the underlying library to allocate and/or pin device or host memory, and
construct an instance of a :ref:`memory pointer <memory-pointers>`
representing the memory to return back to Numba. These methods are always
called when the current CUDA context is the context that owns the EMM Plugin
instance.
- The ``initialize`` method is called by Numba prior to the first use of the EMM
Plugin object for a context. This method should do anything required to
prepare the underlying library for allocations in the current context. This
method may be called multiple times, and must not invalidate previous state
when it is called.
- The ``reset`` method is called when all allocations in the context are to be
cleaned up. It may be called even prior to ``initialize``, and an EMM Plugin
implementation needs to guard against this.
- To support inter-GPU communication, the ``get_ipc_handle`` method should
provide an :class:`~numba.cuda.IpcHandle` for a given
:class:`~numba.cuda.MemoryPointer` instance. This method is part of the EMM
interface (rather than being handled within Numba) because the base address of
the allocation is only known by the underlying library. Closing an IPC handle
is handled internally within Numba.
- Providing memory info from the ``get_memory_info`` method is optional. This
method returns a count of the total and free memory on the device for the
context. It is preferable to implement the method, but this may not be
practical for all allocators. If memory info is not provided, this method
should raise a :class:`RuntimeError`.
- The ``defer_cleanup`` method should return a context manager that ensures that
expensive cleanup operations are avoided whilst it is active. The nuances of
this will vary between plugins, so the plugin documentation should include an
explanation of how deferring cleanup affects deallocations, and performance in
general.
- The ``interface_version`` property is used to ensure that the plugin version
matches the interface provided by the version of Numba. At present, this
should always be 1.
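To illustrate these points, a minimal sketch of a plugin follows.
``my_external_lib`` is a hypothetical allocator library standing in for
whichever library the plugin wraps; the calls made on it are placeholders
rather than a real API:

.. code:: python

    import ctypes
    from contextlib import contextmanager

    from numba import cuda

    import my_external_lib   # hypothetical external allocator library


    class MyEMMPlugin(cuda.BaseCUDAMemoryManager):
        def initialize(self):
            # Prepare the external allocator for the current context. This may
            # be called more than once and must not discard earlier state.
            my_external_lib.ensure_initialized()

        def memalloc(self, size):
            # Allocate device memory with the external library and wrap the
            # raw pointer in a MemoryPointer. The finalizer returns the
            # allocation to the external library when it is no longer needed.
            ptr = my_external_lib.device_alloc(size)

            def finalizer():
                my_external_lib.device_free(ptr)

            return cuda.MemoryPointer(self.context, ctypes.c_uint64(ptr),
                                      size, finalizer=finalizer)

        def memhostalloc(self, size, mapped=False, portable=False, wc=False):
            # Host memory is not handled in this sketch; see the Host-Only
            # CUDA Memory Manager below for a way to avoid implementing it.
            raise NotImplementedError

        def mempin(self, owner, pointer, size, mapped=False):
            raise NotImplementedError

        def get_ipc_handle(self, memory):
            # See the IPC section below for guidance on constructing a handle.
            raise NotImplementedError

        def get_memory_info(self):
            free, total = my_external_lib.mem_get_info()
            return cuda.MemoryInfo(free=free, total=total)

        def reset(self):
            # May be called before initialize(), so guard against that.
            if my_external_lib.is_initialized():
                my_external_lib.release_all()

        @contextmanager
        def defer_cleanup(self):
            # Avoid expensive cleanup in the external library while active.
            with my_external_lib.defer_frees():
                yield

        @property
        def interface_version(self):
            return 1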
Full documentation for the base class follows:
.. autoclass:: numba.cuda.BaseCUDAMemoryManager
:members: memalloc, memhostalloc, mempin, initialize, get_ipc_handle,
get_memory_info, reset, defer_cleanup, interface_version
:member-order: bysource
.. _host-only-cuda-memory-manager:
The Host-Only CUDA Memory Manager
---------------------------------
Some external memory managers will support management of on-device memory but
not host memory. For implementing EMM Plugins using one of these memory
managers, a partial implementation of a plugin that implements host-side
allocation and pinning is provided. To use it, derive from
:class:`~numba.cuda.HostOnlyCUDAMemoryManager` instead of
:class:`~numba.cuda.BaseCUDAMemoryManager`. Guidelines for using this class
are:
- The host-only memory manager implements ``memhostalloc`` and ``mempin`` - the
EMM Plugin should still implement ``memalloc``.
- If ``reset`` is overridden, it must also call ``super().reset()`` to allow the
host allocations to be cleaned up.
- If ``defer_cleanup`` is overridden, it must hold an active context manager
from ``super().defer_cleanup()`` to ensure that host-side cleanup is also
deferred.
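A partial sketch following these guidelines is shown below. ``my_pool`` is a
hypothetical device memory pool, and the remaining required methods
(``get_ipc_handle`` and ``get_memory_info``) are omitted for brevity:

.. code:: python

    import ctypes
    from contextlib import contextmanager

    from numba import cuda

    import my_pool   # hypothetical device memory pool


    class MyHostAwarePlugin(cuda.HostOnlyCUDAMemoryManager):
        # memhostalloc and mempin are inherited; only device allocation
        # needs to be provided here.

        def memalloc(self, size):
            ptr = my_pool.allocate(size)
            return cuda.MemoryPointer(self.context, ctypes.c_uint64(ptr),
                                      size,
                                      finalizer=lambda: my_pool.free(ptr))

        def reset(self):
            # Call super().reset() so host allocations are also cleaned up.
            super().reset()
            my_pool.release_all()

        @contextmanager
        def defer_cleanup(self):
            # Hold the superclass context manager so that host-side cleanup
            # is also deferred whilst this one is active.
            with super().defer_cleanup():
                with my_pool.defer_frees():
                    yield

        @property
        def interface_version(self):
            return 1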
Documentation for the methods of :class:`~numba.cuda.HostOnlyCUDAMemoryManager`
follows:
.. autoclass:: numba.cuda.HostOnlyCUDAMemoryManager
:members: memhostalloc, mempin, reset, defer_cleanup
:member-order: bysource
The IPC Handle Mixin
--------------------
An implementation of the ``get_ipc_handle()`` function is provided in the
``GetIpcHandleMixin`` class. This uses the driver API to determine the base
address of an allocation for opening an IPC handle. If this implementation is
appropriate for an EMM plugin, it can be added by mixing in the
``GetIpcHandleMixin`` class:
.. autoclass:: numba.cuda.GetIpcHandleMixin
:members: get_ipc_handle
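For example, a hypothetical plugin class can gain ``get_ipc_handle`` by listing
the mixin before the memory manager base class:

.. code:: python

    from numba import cuda

    class MyPlugin(cuda.GetIpcHandleMixin, cuda.HostOnlyCUDAMemoryManager):
        # get_ipc_handle() is supplied by the mixin; the remaining methods
        # (memalloc, get_memory_info, etc.) are implemented as sketched above.
        ...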
Classes and structures of returned objects
==========================================
This section provides an overview of the classes and structures that need to be
constructed by an EMM Plugin.
.. _memory-pointers:
Memory Pointers
---------------
EMM Plugins should construct memory pointer instances that represent their
allocations, for return to Numba. The appropriate memory pointer class to use in
each method is:
- :class:`~numba.cuda.MemoryPointer`: returned from ``memalloc``
- :class:`~numba.cuda.MappedMemory`: returned from ``memhostalloc`` or
``mempin`` when the host memory is mapped into the device memory space.
- :class:`~numba.cuda.PinnedMemory`: returned from ``memhostalloc`` or ``mempin``
when the host memory is not mapped into the device memory space.
Memory pointers can take a finalizer, which is a function that is called when
the buffer is no longer needed. Usually the finalizer will make a call to the
memory management library (either internal to Numba, or external if allocated
by an EMM Plugin) to inform it that the memory is no longer required, and that
it could potentially be freed and/or unpinned. The memory manager may choose to
defer actually cleaning up the memory to any later time after the finalizer
runs - it is not required to free the buffer immediately.
Documentation for the memory pointer classes follows.
.. autoclass:: numba.cuda.MemoryPointer
The ``AutoFreePointer`` class need not be used directly, but is documented here
as it is subclassed by :class:`numba.cuda.MappedMemory`:
.. autoclass:: numba.cuda.cudadrv.driver.AutoFreePointer
.. autoclass:: numba.cuda.MappedMemory
.. autoclass:: numba.cuda.PinnedMemory
Memory Info
-----------
If an implementation of
:meth:`~numba.cuda.BaseCUDAMemoryManager.get_memory_info` is to provide a
result, then it should return an instance of the ``MemoryInfo`` named tuple:
.. autoclass:: numba.cuda.MemoryInfo
IPC
---
An instance of ``IpcHandle`` is required to be returned from an implementation
of :meth:`~numba.cuda.BaseCUDAMemoryManager.get_ipc_handle`:
.. autoclass:: numba.cuda.IpcHandle
Guidance for constructing an IPC handle in the context of implementing an EMM
Plugin follows, with a sketch after the list:
- The ``memory`` parameter passed to the ``get_ipc_handle`` method of an EMM
Plugin can be passed as the ``base`` parameter.
- A suitable type for the ``handle`` can be constructed as ``ctypes.c_byte *
64``. The data for ``handle`` must be populated using a method for obtaining a
CUDA IPC handle appropriate to the underlying library.
- ``size`` should match the size of the original allocation, which can be
obtained with ``memory.size`` in ``get_ipc_handle``.
- An appropriate value for ``source_info`` can be created by calling
``self.context.device.get_device_identity()``.
- If the underlying memory does not point to the base of an allocation returned
by the CUDA driver or runtime API (e.g. if a pool allocator is in use) then
the ``offset`` from the base must be provided.
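A sketch following this guidance is shown below. ``my_external_lib.export_ipc``
is a hypothetical call standing in for whatever mechanism the underlying
library provides for obtaining the raw IPC handle and the offset of the
allocation from its base:

.. code:: python

    import ctypes

    from numba import cuda

    import my_external_lib   # hypothetical external allocator library


    class MyPlugin(cuda.HostOnlyCUDAMemoryManager):
        def get_ipc_handle(self, memory):
            # Hypothetical: obtain the raw 64-byte CUDA IPC handle for the
            # allocation containing ``memory``, and its offset from the base.
            raw_handle, offset = my_external_lib.export_ipc(memory)
            handle = (ctypes.c_byte * 64)(*raw_handle)
            source_info = self.context.device.get_device_identity()
            return cuda.IpcHandle(memory, handle, memory.size,
                                  source_info=source_info, offset=offset)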
.. _setting-emm-plugin:
Setting the EMM Plugin
======================
By default, Numba uses its internal memory management - if an EMM Plugin is to
be used, it must be configured. There are two mechanisms for configuring the use
of an EMM Plugin: an environment variable, and a function.
Environment variable
--------------------
A module name can be provided in the environment variable,
``NUMBA_CUDA_MEMORY_MANAGER``. If this environment variable is set, Numba will
attempt to import the module and use its ``_numba_memory_manager`` global
variable as the memory manager class. This is primarily useful for running the
Numba test suite with an EMM Plugin, e.g.:
.. code::
$ NUMBA_CUDA_MEMORY_MANAGER=rmm python -m numba.runtests numba.cuda.tests
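For example, a hypothetical module ``my_emm_plugin`` that could be selected
with ``NUMBA_CUDA_MEMORY_MANAGER=my_emm_plugin`` might look like:

.. code:: python

    # my_emm_plugin.py
    from numba import cuda

    class MyEMMPlugin(cuda.HostOnlyCUDAMemoryManager):
        ...   # implemented as sketched in the preceding sections

    # Numba looks up this module-level name when the environment variable
    # is set.
    _numba_memory_manager = MyEMMPlugin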
Function
--------
The :func:`~numba.cuda.set_memory_manager` function can be used to set the
memory manager at runtime. This should be called prior to the initialization of
any contexts, as EMM Plugin instances are instantiated along with contexts.
.. autofunction:: numba.cuda.set_memory_manager
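For example, with ``MyEMMPlugin`` being a hypothetical plugin class such as
those sketched above:

.. code:: python

    from numba import cuda

    # Must run before any CUDA context has been created.
    cuda.set_memory_manager(MyEMMPlugin)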
Resetting the memory manager
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
It is recommended that the memory manager is set once prior to using any CUDA
functionality, and left unchanged for the remainder of execution. It is possible
to set the memory manager multiple times, noting the following:
* At the time of their creation, contexts are bound to an instance of a memory
manager for their lifetime.
* Changing the memory manager will have no effect on existing contexts - only
contexts created after the memory manager was updated will use instances of
the new memory manager.
* :func:`numba.cuda.close` can be used to destroy contexts after setting the
memory manager so that they get re-created with the new memory manager.
- This will invalidate any arrays, streams, events, and modules owned by the
context.
- Attempting to use invalid arrays, streams, or events will likely fail with
an exception being raised due to a ``CUDA_ERROR_INVALID_CONTEXT`` or
``CUDA_ERROR_CONTEXT_IS_DESTROYED`` return code from a Driver API function.
- Attempting to use an invalid module will result in similar, or in some
cases a segmentation fault / access violation.
.. note:: The invalidation of modules means that all functions compiled with
``@cuda.jit`` prior to context destruction will need to be
redefined, as the code underlying them will also have been unloaded
from the GPU.
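Putting these points together, a hypothetical sequence for switching between
two plugin classes (``MyPluginA`` and ``MyPluginB``) might look like:

.. code:: python

    from numba import cuda

    cuda.set_memory_manager(MyPluginA)
    # ... use CUDA; contexts created here are bound to MyPluginA ...

    cuda.close()                        # destroy existing contexts
    cuda.set_memory_manager(MyPluginB)
    # Contexts created from now on use MyPluginB. Kernels compiled with
    # @cuda.jit before close() must be re-defined before they are used again.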
.. _cudafaq:
=================================================
CUDA Frequently Asked Questions
=================================================
nvprof reports "No kernels were profiled"
-----------------------------------------
When using the ``nvprof`` tool to profile Numba jitted code for the CUDA
target, the output contains ``No kernels were profiled`` even though kernels
are clearly running. What is going on?
This is quite likely due to the profiling data not being flushed on program
exit, see the `NVIDIA CUDA documentation
<http://docs.nvidia.com/cuda/profiler-users-guide/#flush-profile-data>`_ for
details. To fix this simply add a call to ``numba.cuda.profile_stop()`` prior
to the exit point in your program (or wherever you want to stop profiling).
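For example:

.. code:: python

    from numba import cuda

    # ... launch and run kernels ...

    # Flush profiling data so that nvprof sees the kernel launches.
    cuda.profile_stop()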
For more on CUDA profiling support in Numba, see :ref:`cuda-profiling`.
.. _cuda-fast-math:
CUDA Fast Math
==============
As noted in :ref:`fast-math`, for certain classes of applications that utilize
floating point, strict IEEE-754 conformance is not required. For this subset of
applications, performance speedups may be possible.
The CUDA target implements :ref:`fast-math` behavior with two differences.
* First, the ``fastmath`` argument to the :func:`@jit decorator
<numba.cuda.jit>` is limited to the values ``True`` and ``False``.
When ``True``, the following optimizations are enabled:
- Flushing of denormals to zero.
- Use of a fast approximation to the square root function.
- Use of a fast approximation to the division operation.
- Contraction of multiply and add operations into single fused multiply-add
operations.
See the `documentation for nvvmCompileProgram <https://docs.nvidia.com/cuda/libnvvm-api/group__compilation.html#group__compilation_1g76ac1e23f5d0e2240e78be0e63450346>`_ for more details of these optimizations.
* Secondly, calls to a subset of math module functions on ``float32`` operands
will be implemented using fast approximate implementations from the libdevice
library.
- :func:`math.cos`: Implemented using `__nv_fast_cosf <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_cosf.html>`_.
- :func:`math.sin`: Implemented using `__nv_fast_sinf <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_sinf.html>`_.
- :func:`math.tan`: Implemented using `__nv_fast_tanf <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_tanf.html>`_.
- :func:`math.exp`: Implemented using `__nv_fast_expf <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_expf.html>`_.
- :func:`math.log2`: Implemented using `__nv_fast_log2f <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_log2f.html>`_.
- :func:`math.log10`: Implemented using `__nv_fast_log10f <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_log10f.html>`_.
- :func:`math.log`: Implemented using `__nv_fast_logf <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_logf.html>`_.
- :func:`math.pow`: Implemented using `__nv_fast_powf <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_powf.html>`_.
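For example, the following sketch enables fast-math for a kernel so that the
``math.sin`` call on ``float32`` operands uses the fast approximate
implementation:

.. code:: python

    import math
    import numpy as np
    from numba import cuda

    @cuda.jit(fastmath=True)
    def fast_sine(x, out):
        i = cuda.grid(1)
        if i < x.size:
            out[i] = math.sin(x[i])

    x = np.linspace(0, 2 * np.pi, 256, dtype=np.float32)
    out = np.zeros_like(x)
    fast_sine[1, 256](x, out)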
.. _cuda-index:
Numba for CUDA GPUs
===================
.. toctree::
overview.rst
kernels.rst
memory.rst
device-functions.rst
cudapysupported.rst
fastmath.rst
intrinsics.rst
cooperative_groups.rst
random.rst
device-management.rst
examples.rst
simulator.rst
reduction.rst
ufunc.rst
ipc.rst
cuda_array_interface.rst
external-memory.rst
bindings.rst
cuda_ffi.rst
caching.rst
minor_version_compatibility.rst
faq.rst